(index) Implement working optional TermCoherences

2025-02-23 21:18:58 +00:00 · 2024-06-26 12:22:06 +02:00 · 2024-06-26 12:22:06 +02:00 · 95b9af92a0
commit 95b9af92a0
parent 8ee64c0771
8 changed files with 91 additions and 29 deletions
--- a/code/functions/search-query/api/build.gradle
+++ b/code/functions/search-query/api/build.gradle
@ -23,6 +23,7 @@ dependencies {
    implementation project(':code:common:config')
    implementation project(':code:common:service')
    implementation project(':code:index:query')
    implementation project(':code:libraries:language-processing')
    implementation libs.bundles.slf4j
--- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java
+++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java
@ -86,7 +86,8 @@ public class IndexProtobufCodec {
        for (var coherences : searchQuery.searchTermCoherences) {
            subqueryBuilder.addCoherencesBuilder()
                    .addAllCoherences(coherences.terms())
-                    .setType(coherences.mandatory() ? RpcCoherences.TYPE.MANDATORY : RpcCoherences.TYPE.OPTIONAL);
+                    .setType(coherences.mandatory() ? RpcCoherences.TYPE.MANDATORY : RpcCoherences.TYPE.OPTIONAL)
                    .build();
        }
        return subqueryBuilder.build();
--- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchCoherenceConstraint.java
+++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchCoherenceConstraint.java
@ -1,23 +1,71 @@
 package nu.marginalia.api.searchquery.model.query;
 import nu.marginalia.language.WordPatterns;
 import java.util.ArrayList;
 import java.util.List;
 public record SearchCoherenceConstraint(boolean mandatory, List<String> terms) {
    public static SearchCoherenceConstraint mandatory(String... terms) {
        return new SearchCoherenceConstraint(true, List.of(terms));
    }
    public static SearchCoherenceConstraint mandatory(List<String> terms) {
        return new SearchCoherenceConstraint(true, List.copyOf(terms));
    }
    public static SearchCoherenceConstraint optional(String... terms) {
        return new SearchCoherenceConstraint(false, List.of(terms));
    }
    public static SearchCoherenceConstraint optional(List<String> terms) {
        return new SearchCoherenceConstraint(false, List.copyOf(terms));
    }
    /** Returns the number of entries in this constraint's term list. */
    public int size() {
        return terms().size();
    }
    /** Builds a constraint with the mandatory flag set from the given terms.
     * Each stop word is replaced with an empty string, and empty entries at
     * either end of the resulting sequence are dropped.
     */
    public static SearchCoherenceConstraint mandatory(String... terms) {
        List<String> cleaned = trimStopWords(terms);
        return new SearchCoherenceConstraint(true, cleaned);
    }
    /** Builds a constraint with the mandatory flag set from the given list of terms.
     * Each stop word among the terms is replaced with an empty string.
     */
    public static SearchCoherenceConstraint mandatory(List<String> terms) {
        List<String> cleaned = trimStopWords(terms);
        return new SearchCoherenceConstraint(true, cleaned);
    }
    /** Builds a constraint with the mandatory flag cleared from the given terms.
     * Each stop word is replaced with an empty string, and empty entries at
     * either end of the resulting sequence are dropped.
     */
    public static SearchCoherenceConstraint optional(String... terms) {
        List<String> cleaned = trimStopWords(terms);
        return new SearchCoherenceConstraint(false, cleaned);
    }
    /** Builds a constraint with the mandatory flag cleared from the given list of terms.
     * Each stop word among the terms is replaced with an empty string.
     */
    public static SearchCoherenceConstraint optional(List<String> terms) {
        List<String> cleaned = trimStopWords(terms);
        return new SearchCoherenceConstraint(false, cleaned);
    }
    /** Replaces every stop word in {@code terms} with an empty-string placeholder and
     * then drops placeholders from both ends of the sequence, so that this overload
     * produces the same result as the varargs overload for the same terms.
     * Placeholders between two non-stop-word terms are kept.
     *
     * @param terms the terms of the constraint, in order
     * @return an unmodifiable list holding the processed terms
     */
    private static List<String> trimStopWords(List<String> terms) {
        List<String> ret = new ArrayList<>(terms.size());
        for (var term : terms) {
            ret.add(WordPatterns.isStopWord(term) ? "" : term);
        }

        // Shrink the window [start, end) until neither edge is a placeholder;
        // a sequence made up only of stop words collapses to an empty list.
        int start = 0;
        int end = ret.size();
        while (start < end && "".equals(ret.get(start))) {
            start++;
        }
        while (end > start && "".equals(ret.get(end - 1))) {
            end--;
        }

        return List.copyOf(ret.subList(start, end));
    }
    /** Replaces every stop word in {@code terms} with an empty-string placeholder,
     * then removes placeholders from the front and the back of the sequence.
     * Placeholders between two non-stop-word terms are kept.
     *
     * @param terms the terms of the constraint, in order
     * @return an unmodifiable list holding the processed terms
     */
    private static List<String> trimStopWords(String... terms) {
        List<String> cleaned = new ArrayList<>(terms.length);
        for (String candidate : terms) {
            cleaned.add(WordPatterns.isStopWord(candidate) ? "" : candidate);
        }

        // Narrow the window [from, to) until neither edge is a placeholder
        int from = 0;
        int to = cleaned.size();
        while (from < to && "".equals(cleaned.get(from))) {
            from++;
        }
        while (to > from && "".equals(cleaned.get(to - 1))) {
            to--;
        }

        return List.copyOf(cleaned.subList(from, to));
    }
 }
--- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java
+++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java
@ -166,6 +166,11 @@ public class QueryExpansion {
            graph.addVariantForSpan(nodes.get(start), nodes.get(end - 1), word);
        }
        // also create a segmentation that is just the entire query
        coherences.add(nodes.stream()
                .map(QWord::word)
                .collect(Collectors.toList()));
        return coherences;
    }
--- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java
+++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java
@ -75,23 +75,18 @@ public class QueryFactory {
                    String[] parts = StringUtils.split(str, '_');
-                    // Checking for stop words here is a bit of a stop-gap to fix the issue of stop words being
+                    if (parts.length > 1) {
-                    // required in the query (which is a problem because they are not indexed). How to do this
+                        // Require that the terms appear in sequence
                    // in a clean way is a bit of an open problem that may not get resolved until query-parsing is
                    // improved.
                    if (parts.length > 1 && !anyPartIsStopWord(parts)) {
                        // Prefer that the actual n-gram is present
                        searchTermsAdvice.add(str);
                        // Require that the terms appear in the same sentence
                        searchTermCoherences.add(SearchCoherenceConstraint.mandatory(parts));
-                        // Require that each term exists in the document
+                        // Construct a regular query from the parts in the quoted string
                        // (needed for ranking)
                        searchTermsInclude.addAll(Arrays.asList(parts));
                        // Prefer that the actual n-gram is present
                        searchTermsPriority.add(str);
                    }
                    else {
                        // If the quoted word is a single word, we don't need to do more than include it in the search
                        searchTermsInclude.add(str);
                    }
                }
--- a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java
+++ b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java
@ -229,4 +229,12 @@ public class QueryFactoryTest {
        System.out.println("Time: " + (System.currentTimeMillis() - start));
        System.out.println(subquery);
    }
    /** Parses a query that ends in a stop word and prints the elapsed time and resulting specs. */
    @Test
    public void testExpansion8() {
        final long startedAt = System.currentTimeMillis();
        var specs = parseAndGetSpecs("success often consists of");
        final long elapsedMillis = System.currentTimeMillis() - startedAt;
        System.out.println("Time: " + elapsedMillis);
        System.out.println(specs);
    }
 }
--- a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java
+++ b/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java
@ -74,6 +74,8 @@ public class IndexResultValuationContext {
        int htmlFeatures = index.getHtmlFeatures(docId);
        int docSize = index.getDocumentSize(docId);
        int bestCoherence = searchTerms.coherences.testOptional(positions);
        double score = searchResultValuator.calculateSearchResultValue(
                wordFlagsQuery,
                positionsCountQuery,
@ -81,8 +83,8 @@ public class IndexResultValuationContext {
                docMetadata,
                htmlFeatures,
                docSize,
-                rankingContext,
+                bestCoherence,
-                null);
+                rankingContext, null);
        SearchResultItem searchResult = new SearchResultItem(docId,
                docMetadata,
--- a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java
+++ b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java
@ -40,6 +40,7 @@ public class ResultValuator {
                                             CompiledQueryInt positionsCountQuery, CompiledQuery<GammaCodedSequence> positionsQuery, long documentMetadata,
                                             int features,
                                             int length,
                                             int bestCoherence,
                                             ResultRankingContext ctx,
                                             @Nullable Consumer<ResultRankingDetails> detailsConsumer
                                             )
@ -83,7 +84,8 @@ public class ResultValuator {
                           + rankingBonus
                           + topologyBonus
                           + temporalBias
-                           + flagsPenalty;
+                           + flagsPenalty
                           + bestCoherence;
        // FIXME: need a weighting factor here
        double tcfAvgDist = 25. / termCoherenceFactor.calculateAvgMinDistance(positionsQuery, ctx);