(segmentation) Pick best segmentation using |s|^|s|-style normalization
This is better than applying all possible segmentations at the same time.
This commit is contained in:
parent a0d9e66ff7
commit c96da0ce1e
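
For context: the |s|^|s| weighting named in the message comes from the query segmentation work of Hagen et al., where a segmentation S of a query is scored as score(S) = Σ |s|^|s| · freq(s) over its segments s of two or more words, so that longer attested segments are strongly preferred over collections of short ones. Below is a minimal standalone sketch of that scoring; the Segment record and the n-gram frequency map are hypothetical stand-ins for the NgramLexicon machinery this commit actually uses. (Note that in Hagen et al. the base and exponent are the segment length |s|, whereas the committed code further down exponentiates part.count().)

import java.util.List;
import java.util.Map;

// Sketch of Hagen et al.'s |s|^|s| "naive normalization" for scoring a
// query segmentation.  Segment and ngramFreq are hypothetical stand-ins
// for this commit's NgramLexicon types, not part of the actual change.
class SegmentationScore {
    record Segment(List<String> words) {}

    // score(S) = sum over segments s with |s| >= 2 of |s|^|s| * freq(s)
    static double score(List<Segment> segmentation, Map<List<String>, Long> ngramFreq) {
        double score = 0;
        for (Segment s : segmentation) {
            int len = s.words().size();
            if (len < 2)
                continue; // single-word segments contribute nothing
            long freq = ngramFreq.getOrDefault(s.words(), 0L);
            score += Math.pow(len, len) * freq;
        }
        return score;
    }
}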
QueryExpansion.java:

@@ -9,8 +9,7 @@ import nu.marginalia.segmentation.NgramLexicon;
 import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import org.apache.commons.lang3.StringUtils;
 
-import java.util.ArrayList;
-import java.util.List;
+import java.util.*;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
@@ -110,9 +109,22 @@ public class QueryExpansion {
 
         String[] words = nodes.stream().map(QWord::stemmed).toArray(String[]::new);
 
-        // Look for known segments within the query
+        // Grab all segments
+
+        List<NgramLexicon.SentenceSegment> allSegments = new ArrayList<>();
         for (int length = 2; length < Math.min(10, words.length); length++) {
-            for (var segment : lexicon.findSegmentOffsets(length, words)) {
+            allSegments.addAll(lexicon.findSegmentOffsets(length, words));
+        }
+        allSegments.sort(Comparator.comparing(NgramLexicon.SentenceSegment::start));
+
+        if (allSegments.isEmpty()) {
+            return;
+        }
+
+        Set<NgramLexicon.SentenceSegment> bestSegmentation =
+                findBestSegmentation(allSegments);
+
+        for (var segment : bestSegmentation) {
 
             int start = segment.start();
             int end = segment.start() + segment.length();
@@ -122,9 +134,47 @@ public class QueryExpansion {
                     .map(QWord::word)
                     .collect(Collectors.joining("_"));
 
+            System.out.println(word);
+
             graph.addVariantForSpan(nodes.get(start), nodes.get(end - 1), word);
         }
+
     }
+
+    private Set<NgramLexicon.SentenceSegment> findBestSegmentation(List<NgramLexicon.SentenceSegment> allSegments) {
+        Set<NgramLexicon.SentenceSegment> bestSet = Set.of();
+        double bestScore = Double.MIN_VALUE;
+
+        for (int i = 0; i < allSegments.size(); i++) {
+            Set<NgramLexicon.SentenceSegment> parts = new HashSet<>();
+            parts.add(allSegments.get(i));
+
+            outer:
+            for (int j = i+1; j < allSegments.size(); j++) {
+                var candidate = allSegments.get(j);
+                for (var part : parts) {
+                    if (part.overlaps(candidate)) {
+                        continue outer;
+                    }
+                }
+                parts.add(candidate);
+            }
+
+            double score = 0.;
+            for (var part : parts) {
+                // |s|^|s|-normalization per M Hagen et al
+                double normFactor = Math.pow(part.count(), part.count());
+
+                score += normFactor * part.count();
+            }
+
+            if (bestScore < score) {
+                bestScore = score;
+                bestSet = parts;
+            }
+        }
+
+        return bestSet;
     }
 
     public interface ExpansionStrategy {
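
The selection in findBestSegmentation above is a greedy sweep rather than an exhaustive search: each segment in turn seeds a candidate set, the remaining segments are scanned in start order, and any segment that overlaps one already chosen is skipped; the highest-scoring set overall wins. Below is a toy, self-contained rendition of that sweep; the Span record, its overlaps() semantics, and the example counts are all assumptions standing in for NgramLexicon.SentenceSegment.

import java.util.*;

// Toy demonstration of the greedy non-overlap sweep in findBestSegmentation;
// Span is a hypothetical stand-in for NgramLexicon.SentenceSegment.
public class SegmentationDemo {
    record Span(String text, int start, int length, long count) {
        // Half-open interval overlap: [start, start+length)
        boolean overlaps(Span o) {
            return start < o.start + o.length && o.start < start + length;
        }
    }

    public static void main(String[] args) {
        // Candidate segments for "new york city hall", sorted by start
        List<Span> spans = List.of(
                new Span("new_york", 0, 2, 1000),
                new Span("new_york_city", 0, 3, 500),
                new Span("york_city", 1, 2, 200),
                new Span("city_hall", 2, 2, 800));

        // Seeding at "new_york" admits the non-overlapping "city_hall";
        // seeding at "new_york_city" excludes everything except itself.
        for (int i = 0; i < spans.size(); i++) {
            Set<Span> parts = new HashSet<>();
            parts.add(spans.get(i));

            outer:
            for (int j = i + 1; j < spans.size(); j++) {
                var candidate = spans.get(j);
                for (var part : parts)
                    if (part.overlaps(candidate))
                        continue outer;
                parts.add(candidate);
            }

            System.out.println(spans.get(i).text() + " -> "
                    + parts.stream().map(Span::text).toList());
        }
    }
}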
QueryFactoryTest.java:

@@ -16,6 +16,8 @@ import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;
 
 import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
 import java.util.List;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
@@ -52,6 +54,21 @@ public class QueryFactoryTest {
                 ResultRankingParameters.TemporalBias.NONE)).specs;
     }
 
+
+    @Test
+    void qsec10() {
+        try (var lines = Files.lines(Path.of("/home/vlofgren/Exports/qsec10/webis-qsec-10-training-set/webis-qsec-10-training-set-queries.txt"))) {
+            lines.limit(1000).forEach(line -> {
+                String[] parts = line.split("\t");
+                if (parts.length == 2) {
+                    System.out.println(parseAndGetSpecs(parts[1]).getQuery().compiledQuery);
+                }
+            });
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
     @Test
     public void testParseNoSpecials() {
         var year = parseAndGetSpecs("in the year 2000").year;
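
The qsec10 test above reads what appears to be the Webis-QSEC-10 query segmentation corpus (from the same Hagen et al. line of work as the |s|^|s| normalization) and prints the compiled query for the first thousand training queries; it is a manual smoke test with no assertions.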