From c583a538b15f5a4185df4cc92c03105b004666ca Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 17 Apr 2024 14:03:35 +0200 Subject: [PATCH] (search) Add implicit coherence constraints based on segmentation --- .../query_parser/QueryExpansion.java | 33 ++++++++++++++----- .../searchquery/svc/QueryFactory.java | 7 ++-- .../query/svc/QueryFactoryTest.java | 11 +++++++ 3 files changed, 39 insertions(+), 12 deletions(-) diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java index d4e324fa..0c9fa453 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java @@ -11,7 +11,6 @@ import org.apache.commons.lang3.StringUtils; import java.util.*; import java.util.regex.Pattern; -import java.util.stream.Collectors; import java.util.stream.IntStream; /** Responsible for expanding a query, that is creating alternative branches of query execution @@ -25,8 +24,7 @@ public class QueryExpansion { private final List expansionStrategies = List.of( this::joinDashes, this::splitWordNum, - this::joinTerms, - this::createSegments + this::joinTerms ); @Inject @@ -37,7 +35,7 @@ public class QueryExpansion { this.lexicon = lexicon; } - public String expandQuery(List words) { + public Expansion expandQuery(List words) { QWordGraph graph = new QWordGraph(words); @@ -45,7 +43,11 @@ public class QueryExpansion { strategy.expand(graph); } - return QWordPathsRenderer.render(graph); + List> coherences = createSegments(graph); + + var compiled = QWordPathsRenderer.render(graph); + + return new Expansion(compiled, coherences); } private static final Pattern dashPattern = Pattern.compile("-"); @@ -99,8 +101,12 @@ public class QueryExpansion { /** Create an alternative interpretation of the query that replaces a sequence of words * with a word n-gram. This makes it so that when possible, the order of words in the document * matches the order of the words in the query. + * + * The function modifies the graph in place, adding new variants to the graph; but also + * returns a list of the new groupings that were added. */ - public void createSegments(QWordGraph graph) { + public List> createSegments(QWordGraph graph) + { List nodes = new ArrayList<>(); for (var qw : graph) { @@ -118,25 +124,32 @@ public class QueryExpansion { allSegments.sort(Comparator.comparing(NgramLexicon.SentenceSegment::start)); if (allSegments.isEmpty()) { - return; + return List.of(); } Set bestSegmentation = findBestSegmentation(allSegments); + List> coherences = new ArrayList<>(); + for (var segment : bestSegmentation) { int start = segment.start(); int end = segment.start() + segment.length(); - var word = IntStream.range(start, end) + List components =IntStream.range(start, end) .mapToObj(nodes::get) .map(QWord::word) - .collect(Collectors.joining("_")); + .toList(); + coherences.add(components); + + String word = String.join("_", components); graph.addVariantForSpan(nodes.get(start), nodes.get(end - 1), word); } + return coherences; + } private Set findBestSegmentation(List allSegments) { @@ -178,4 +191,6 @@ public class QueryExpansion { public interface ExpansionStrategy { void expand(QWordGraph graph); } + + public record Expansion(String compiledQuery, List> extraCoherences) {} } diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java index 15596d5c..382f62a8 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java @@ -137,10 +137,11 @@ public class QueryFactory { limits = limits.forSingleDomain(); } + var expansion = queryExpansion.expandQuery(searchTermsInclude); + searchTermCoherences.addAll(expansion.extraCoherences()); + var searchQuery = new SearchQuery( - queryExpansion.expandQuery( - searchTermsInclude - ), + expansion.compiledQuery(), searchTermsInclude, searchTermsExclude, searchTermsAdvice, diff --git a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java index 622130b7..d07e2d80 100644 --- a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java +++ b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java @@ -178,4 +178,15 @@ public class QueryFactoryTest { System.out.println(subquery.compiledQuery); } + + @Test + public void testExpansion2() { + + long start = System.currentTimeMillis(); + var subquery = parseAndGetSpecs("need for speed").query; + System.out.println("Time: " + (System.currentTimeMillis() - start)); + System.out.println(subquery); + + } + } \ No newline at end of file