(index) Implement working optional TermCoherences

2025-02-23 13:09:00 +00:00 · 2024-06-26 12:22:06 +02:00 · 2024-06-26 12:22:06 +02:00 · 95b9af92a0
commit 95b9af92a0
parent 8ee64c0771
8 changed files with 91 additions and 29 deletions
--- a/code/functions/search-query/api/build.gradle
+++ b/code/functions/search-query/api/build.gradle
@ -23,6 +23,7 @@ dependencies {
    implementation project(':code:common:config')
    implementation project(':code:common:service')
    implementation project(':code:index:query')
+    implementation project(':code:libraries:language-processing')

    implementation libs.bundles.slf4j

--- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java
+++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java
@ -86,7 +86,8 @@ public class IndexProtobufCodec {
        for (var coherences : searchQuery.searchTermCoherences) {
            subqueryBuilder.addCoherencesBuilder()
                    .addAllCoherences(coherences.terms())
-                    .setType(coherences.mandatory() ? RpcCoherences.TYPE.MANDATORY : RpcCoherences.TYPE.OPTIONAL);
+                    .setType(coherences.mandatory() ? RpcCoherences.TYPE.MANDATORY : RpcCoherences.TYPE.OPTIONAL)
+                    .build();
        }

        return subqueryBuilder.build();
--- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchCoherenceConstraint.java
+++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchCoherenceConstraint.java
@ -1,23 +1,71 @@
 package nu.marginalia.api.searchquery.model.query;

+import nu.marginalia.language.WordPatterns;
+
+import java.util.ArrayList;
 import java.util.List;

 public record SearchCoherenceConstraint(boolean mandatory, List<String> terms) {
-    public static SearchCoherenceConstraint mandatory(String... terms) {
-        return new SearchCoherenceConstraint(true, List.of(terms));
-    }
-    public static SearchCoherenceConstraint mandatory(List<String> terms) {
-        return new SearchCoherenceConstraint(true, List.copyOf(terms));
-    }
-
-    public static SearchCoherenceConstraint optional(String... terms) {
-        return new SearchCoherenceConstraint(false, List.of(terms));
-    }
-    public static SearchCoherenceConstraint optional(List<String> terms) {
-        return new SearchCoherenceConstraint(false, List.copyOf(terms));
-    }

    public int size() {
        return terms.size();
    }
+
+    /** Create a mandatory SearchCoherenceConstraint (mandatory = true) from the given terms.
+     * Stop words are replaced with empty strings; empty entries at either end are dropped.
+     */
+    public static SearchCoherenceConstraint mandatory(String... terms) {
+        return new SearchCoherenceConstraint(true, trimStopWords(terms));
+    }
+    /** Create a mandatory SearchCoherenceConstraint (mandatory = true) from a list of terms.
+     * Stop words are replaced with empty strings.
+     */
+    public static SearchCoherenceConstraint mandatory(List<String> terms) {
+        return new SearchCoherenceConstraint(true, trimStopWords(terms));
+    }
+    /** Create an optional SearchCoherenceConstraint (mandatory = false) from the given terms.
+     * Stop words are replaced with empty strings; empty entries at either end are dropped.
+     */
+    public static SearchCoherenceConstraint optional(String... terms) {
+        return new SearchCoherenceConstraint(false, trimStopWords(terms));
+    }
+    /** Create an optional SearchCoherenceConstraint (mandatory = false) from a list of terms.
+     * Stop words are replaced with empty strings.
+     */
+    public static SearchCoherenceConstraint optional(List<String> terms) {
+        return new SearchCoherenceConstraint(false, trimStopWords(terms));
+    }
+
+    /** Replace stop words with empty strings and strip empty entries from both
+     * ends of the list.  Delegates to the varargs overload so that constraints
+     * built from a List are normalized exactly like those built from varargs
+     * (a second copy of the loop would let the two overloads drift apart).
+     */
+    private static List<String> trimStopWords(List<String> terms) {
+        return trimStopWords(terms.toArray(String[]::new));
+    }
+
+    /** Replace each stop word (per WordPatterns.isStopWord) with an empty string,
+     * then remove empty entries from the head and tail of the list.  Stop words
+     * between two kept terms remain as empty placeholders.
+     *
+     * @param terms the terms of the constraint, in order
+     * @return an unmodifiable list; may be empty if every term is a stop word
+     */
+    private static List<String> trimStopWords(String... terms) {
+        List<String> ret = new ArrayList<>(terms.length);
+        for (var term : terms) {
+            ret.add(WordPatterns.isStopWord(term) ? "" : term);
+        }
+
+        while (!ret.isEmpty() && "".equals(ret.getFirst())) {
+            ret.removeFirst();
+        }
+        while (!ret.isEmpty() && "".equals(ret.getLast())) {
+            ret.removeLast();
+        }
+
+        return List.copyOf(ret);
+    }
+
 }
--- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java
+++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java
@ -166,6 +166,11 @@ public class QueryExpansion {
            graph.addVariantForSpan(nodes.get(start), nodes.get(end - 1), word);
        }

+        // also create a segmentation that is just the entire query
+        coherences.add(nodes.stream()
+                .map(QWord::word)
+                .collect(Collectors.toList()));
+
        return coherences;
    }

--- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java
+++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java
@ -75,23 +75,18 @@ public class QueryFactory {

                    String[] parts = StringUtils.split(str, '_');

-                    // Checking for stop words here is a bit of a stop-gap to fix the issue of stop words being
-                    // required in the query (which is a problem because they are not indexed). How to do this
-                    // in a clean way is a bit of an open problem that may not get resolved until query-parsing is
-                    // improved.
-
-                    if (parts.length > 1 && !anyPartIsStopWord(parts)) {
-                        // Prefer that the actual n-gram is present
-                        searchTermsAdvice.add(str);
-
-                        // Require that the terms appear in the same sentence
+                    if (parts.length > 1) {
+                        // Require that the terms appear in sequence
                        searchTermCoherences.add(SearchCoherenceConstraint.mandatory(parts));

-                        // Require that each term exists in the document
-                        // (needed for ranking)
+                        // Construct a regular query from the parts in the quoted string
                        searchTermsInclude.addAll(Arrays.asList(parts));
+
+                        // Prefer that the actual n-gram is present
+                        searchTermsPriority.add(str);
                    }
                    else {
+                        // If the quoted word is a single word, we don't need to do more than include it in the search
                        searchTermsInclude.add(str);
                    }
                }
--- a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java
+++ b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java
@ -229,4 +229,12 @@ public class QueryFactoryTest {
        System.out.println("Time: " + (System.currentTimeMillis() - start));
        System.out.println(subquery);
    }
+
+    @Test
+    public void testExpansion8() { // smoke test: prints timing and the parsed result, asserts nothing
+        long start = System.currentTimeMillis();
+        var subquery = parseAndGetSpecs("success often consists of");
+        System.out.println("Time: " + (System.currentTimeMillis() - start));
+        System.out.println(subquery);
+    }
 }
--- a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java
+++ b/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java
@ -74,6 +74,8 @@ public class IndexResultValuationContext {
        int htmlFeatures = index.getHtmlFeatures(docId);
        int docSize = index.getDocumentSize(docId);

+        int bestCoherence = searchTerms.coherences.testOptional(positions);
+
        double score = searchResultValuator.calculateSearchResultValue(
                wordFlagsQuery,
                positionsCountQuery,
@ -81,8 +83,8 @@ public class IndexResultValuationContext {
                docMetadata,
                htmlFeatures,
                docSize,
-                rankingContext,
-                null);
+                bestCoherence,
+                rankingContext, null);

        SearchResultItem searchResult = new SearchResultItem(docId,
                docMetadata,
--- a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java
+++ b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java
@ -40,6 +40,7 @@ public class ResultValuator {
                                             CompiledQueryInt positionsCountQuery, CompiledQuery<GammaCodedSequence> positionsQuery, long documentMetadata,
                                             int features,
                                             int length,
+                                             int bestCoherence,
                                             ResultRankingContext ctx,
                                             @Nullable Consumer<ResultRankingDetails> detailsConsumer
                                             )
@ -83,7 +84,8 @@ public class ResultValuator {
                           + rankingBonus
                           + topologyBonus
                           + temporalBias
-                           + flagsPenalty;
+                           + flagsPenalty
+                           + bestCoherence;

        // FIXME: need a weighting factor here
        double tcfAvgDist = 25. / termCoherenceFactor.calculateAvgMinDistance(positionsQuery, ctx);