(segmentation) Pick best segmentation using |s|^|s|-style normalization

This is better than applying every possible segmentation at the same time.
This commit is contained in:
Viktor Lofgren 2024-04-12 17:44:14 +02:00
parent a0d9e66ff7
commit c96da0ce1e
2 changed files with 78 additions and 11 deletions

View File

@ -9,8 +9,7 @@ import nu.marginalia.segmentation.NgramLexicon;
import nu.marginalia.term_frequency_dict.TermFrequencyDict; import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import java.util.ArrayList; import java.util.*;
import java.util.List;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.IntStream; import java.util.stream.IntStream;
@ -110,9 +109,22 @@ public class QueryExpansion {
String[] words = nodes.stream().map(QWord::stemmed).toArray(String[]::new); String[] words = nodes.stream().map(QWord::stemmed).toArray(String[]::new);
// Look for known segments within the query // Grab all segments
List<NgramLexicon.SentenceSegment> allSegments = new ArrayList<>();
for (int length = 2; length < Math.min(10, words.length); length++) { for (int length = 2; length < Math.min(10, words.length); length++) {
for (var segment : lexicon.findSegmentOffsets(length, words)) { allSegments.addAll(lexicon.findSegmentOffsets(length, words));
}
allSegments.sort(Comparator.comparing(NgramLexicon.SentenceSegment::start));
if (allSegments.isEmpty()) {
return;
}
Set<NgramLexicon.SentenceSegment> bestSegmentation =
findBestSegmentation(allSegments);
for (var segment : bestSegmentation) {
int start = segment.start(); int start = segment.start();
int end = segment.start() + segment.length(); int end = segment.start() + segment.length();
@ -122,9 +134,47 @@ public class QueryExpansion {
.map(QWord::word) .map(QWord::word)
.collect(Collectors.joining("_")); .collect(Collectors.joining("_"));
System.out.println(word);
graph.addVariantForSpan(nodes.get(start), nodes.get(end - 1), word); graph.addVariantForSpan(nodes.get(start), nodes.get(end - 1), word);
} }
} }
private Set<NgramLexicon.SentenceSegment> findBestSegmentation(List<NgramLexicon.SentenceSegment> allSegments) {
Set<NgramLexicon.SentenceSegment> bestSet = Set.of();
double bestScore = Double.MIN_VALUE;
for (int i = 0; i < allSegments.size(); i++) {
Set<NgramLexicon.SentenceSegment> parts = new HashSet<>();
parts.add(allSegments.get(i));
outer:
for (int j = i+1; j < allSegments.size(); j++) {
var candidate = allSegments.get(j);
for (var part : parts) {
if (part.overlaps(candidate)) {
continue outer;
}
}
parts.add(candidate);
}
double score = 0.;
for (var part : parts) {
// |s|^|s|-normalization per M Hagen et al
double normFactor = Math.pow(part.count(), part.count());
score += normFactor * part.count();
}
if (bestScore < score) {
bestScore = score;
bestSet = parts;
}
}
return bestSet;
} }
public interface ExpansionStrategy { public interface ExpansionStrategy {

View File

@ -16,6 +16,8 @@ import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import java.io.IOException; import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List; import java.util.List;
import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertEquals;
@ -52,6 +54,21 @@ public class QueryFactoryTest {
ResultRankingParameters.TemporalBias.NONE)).specs; ResultRankingParameters.TemporalBias.NONE)).specs;
} }
/**
 * Manual smoke test: parses up to the first 1000 queries of the Webis-QSeC-10
 * training set and prints the compiled query for each.
 *
 * <p>The query file lives at a developer-local path, so the test does nothing
 * (apart from logging a notice) on machines where the file is not readable.
 * Input lines are tab-separated; only lines with exactly two fields are used,
 * and the second field is the query text.
 */
@Test
void qsec10() {
    Path queriesFile = Path.of("/home/vlofgren/Exports/qsec10/webis-qsec-10-training-set/webis-qsec-10-training-set-queries.txt");

    // The corpus is not part of the repository; skip instead of failing the
    // build everywhere except on the author's machine.
    if (!Files.isReadable(queriesFile)) {
        System.err.println("Skipping qsec10: " + queriesFile + " is not readable");
        return;
    }

    try (var lines = Files.lines(queriesFile)) {
        lines.limit(1000)
                .map(line -> line.split("\t"))
                .filter(parts -> parts.length == 2)
                .forEach(parts -> System.out.println(parseAndGetSpecs(parts[1]).getQuery().compiledQuery));
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
@Test @Test
public void testParseNoSpecials() { public void testParseNoSpecials() {
var year = parseAndGetSpecs("in the year 2000").year; var year = parseAndGetSpecs("in the year 2000").year;