From c96da0ce1e50b09e629cb13c517e550ebd5337cb Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Fri, 12 Apr 2024 17:44:14 +0200
Subject: [PATCH] (segmentation) Pick best segmentation using |s|^|s|-style normalization

This is better than doing all segmentations possible at the same time.
---
Changes relative to the first posting of this patch:
 - restore the NgramLexicon.SentenceSegment type arguments on the segment
   List/Set declarations (raw types do not compile with the var loops)
 - score each segment as length^length * count; the first posting raised
   count to its own power, which overflows to Infinity for large counts
 - start bestScore at Double.NEGATIVE_INFINITY instead of Double.MIN_VALUE
 - drop the leftover System.out.println(word) from the expansion loop
 - qsec10 test: skip via assumeTrue when the local data file is missing and
   declare IOException instead of wrapping it in a RuntimeException
 - indentation was re-created by hand and the index line still names the
   blobs of the first posting; if a hunk does not match, apply with
   git apply --ignore-whitespace

 .../query_parser/QueryExpansion.java | 85 ++++++++++++++++---
 .../query/svc/QueryFactoryTest.java | 18 ++++
 2 files changed, 92 insertions(+), 11 deletions(-)

diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java
index 9c9d81fa..80d8c8f3 100644
--- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java
+++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java
@@ -9,8 +9,7 @@ import nu.marginalia.segmentation.NgramLexicon;
 import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import org.apache.commons.lang3.StringUtils;
 
-import java.util.ArrayList;
-import java.util.List;
+import java.util.*;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
@@ -110,21 +109,85 @@ public class QueryExpansion {
 
         String[] words = nodes.stream().map(QWord::stemmed).toArray(String[]::new);
 
-        // Look for known segments within the query
+        // Grab all segments
+
+        List<NgramLexicon.SentenceSegment> allSegments = new ArrayList<>();
         for (int length = 2; length < Math.min(10, words.length); length++) {
-            for (var segment : lexicon.findSegmentOffsets(length, words)) {
+            allSegments.addAll(lexicon.findSegmentOffsets(length, words));
+        }
+        allSegments.sort(Comparator.comparing(NgramLexicon.SentenceSegment::start));
 
-                int start = segment.start();
-                int end = segment.start() + segment.length();
+        if (allSegments.isEmpty()) {
+            return;
+        }
 
-                var word = IntStream.range(start, end)
-                        .mapToObj(nodes::get)
-                        .map(QWord::word)
-                        .collect(Collectors.joining("_"));
+        Set<NgramLexicon.SentenceSegment> bestSegmentation =
+                findBestSegmentation(allSegments);
 
-                graph.addVariantForSpan(nodes.get(start), nodes.get(end - 1), word);
+        for (var segment : bestSegmentation) {
+
+            int start = segment.start();
+            int end = segment.start() + segment.length();
+
+            var word = IntStream.range(start, end)
+                    .mapToObj(nodes::get)
+                    .map(QWord::word)
+                    .collect(Collectors.joining("_"));
+
+            graph.addVariantForSpan(nodes.get(start), nodes.get(end - 1), word);
+        }
+
+    }
+
+    /**
+     * Selects the highest-scoring set of mutually non-overlapping segments.
+     * <p>
+     * Each segment in turn seeds a candidate set, which is greedily extended with
+     * every later segment that overlaps nothing already in the set.  A candidate
+     * set is scored as the sum of |s|^|s| * count(s) over its segments, where |s|
+     * is the segment length.
+     *
+     * @param allSegments candidate segments; the caller sorts them by start offset
+     * @return the best-scoring candidate set, or an empty set if there are no segments
+     */
+    private Set<NgramLexicon.SentenceSegment> findBestSegmentation(List<NgramLexicon.SentenceSegment> allSegments) {
+        Set<NgramLexicon.SentenceSegment> bestSet = Set.of();
+        // Double.MIN_VALUE is the smallest positive double, not the most negative one
+        double bestScore = Double.NEGATIVE_INFINITY;
+
+        for (int i = 0; i < allSegments.size(); i++) {
+            Set<NgramLexicon.SentenceSegment> parts = new HashSet<>();
+            parts.add(allSegments.get(i));
+
+            // Greedily add each later segment that overlaps nothing chosen so far
+            outer:
+            for (int j = i+1; j < allSegments.size(); j++) {
+                var candidate = allSegments.get(j);
+                for (var part : parts) {
+                    if (part.overlaps(candidate)) {
+                        continue outer;
+                    }
+                }
+                parts.add(candidate);
+            }
+
+            double score = 0.;
+            for (var part : parts) {
+                // |s|^|s|-normalization per M Hagen et al, where |s| is the
+                // length of the segment; count() is assumed to be its lexicon
+                // frequency -- TODO confirm
+                double normFactor = Math.pow(part.length(), part.length());
+
+                score += normFactor * part.count();
+            }
+
+            if (bestScore < score) {
+                bestScore = score;
+                bestSet = parts;
             }
         }
+
+        return bestSet;
     }
 
     public interface ExpansionStrategy {
diff --git a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java
index 132944c4..622130b7 100644
--- a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java
+++ b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java
@@ -16,6 +16,8 @@ import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;
 
 import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
 import java.util.List;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
@@ -52,6 +54,24 @@ public class QueryFactoryTest {
                         ResultRankingParameters.TemporalBias.NONE)).specs;
     }
 
+    @Test
+    void qsec10() throws IOException {
+        // Exploratory run over a local copy of the qsec10 training queries; the
+        // file is not part of the repository, so skip instead of failing without it.
+        Path queries = Path.of("/home/vlofgren/Exports/qsec10/webis-qsec-10-training-set/webis-qsec-10-training-set-queries.txt");
+        org.junit.jupiter.api.Assumptions.assumeTrue(Files.isRegularFile(queries),
+                "qsec10 query file not available: " + queries);
+
+        try (var lines = Files.lines(queries)) {
+            lines.limit(1000).forEach(line -> {
+                String[] parts = line.split("\t");
+                if (parts.length == 2) {
+                    System.out.println(parseAndGetSpecs(parts[1]).getQuery().compiledQuery);
+                }
+            });
+        }
+    }
+
     @Test
     public void testParseNoSpecials() {
         var year = parseAndGetSpecs("in the year 2000").year;