Better handling of quote terms, fix bug in handling of longer queries.

... where some terms may previously have been ignored. The latter bug was due to the handling of QueryHeads with AnyOf-style predicates interacting poorly with alreadyConsideredTerms in SearchIndex.java
2025-02-23 13:09:00 +00:00 · 2023-04-10 13:11:40 +02:00 · 2023-04-10 13:11:40 +02:00 · fe419b12b4
commit fe419b12b4
parent 810515c08d
12 changed files with 146 additions and 72 deletions
--- a/code/api/index-api/src/main/java/nu/marginalia/index/client/model/query/SearchSubquery.java
+++ b/code/api/index-api/src/main/java/nu/marginalia/index/client/model/query/SearchSubquery.java
@ -10,7 +10,7 @@ import java.util.stream.Collectors;
@AllArgsConstructor
 public class SearchSubquery {

-    /** These terms must be present in the document */
+    /** These terms must be present in the document and are used in ranking*/
    public final List<String> searchTermsInclude;

    /** These terms must be absent from the document */
@ -21,18 +21,22 @@ public class SearchSubquery {

    /** If these optional terms are present in the document, rank it highly */
    public final List<String> searchTermsPriority;
-    
+
+    /** Terms that we require to be in the same sentence */
+    public final List<List<String>> searchTermCoherences;
+
    private double value = 0;

    public SearchSubquery(List<String> searchTermsInclude,
                          List<String> searchTermsExclude,
                          List<String> searchTermsAdvice,
-                          List<String> searchTermsPriority
-                              ) {
+                          List<String> searchTermsPriority,
+                          List<List<String>> searchTermCoherences) {
        this.searchTermsInclude = searchTermsInclude;
        this.searchTermsExclude = searchTermsExclude;
        this.searchTermsAdvice = searchTermsAdvice;
        this.searchTermsPriority = searchTermsPriority;
+        this.searchTermCoherences = searchTermCoherences;
    }

    public SearchSubquery setValue(double value) {
@ -51,6 +55,7 @@ public class SearchSubquery {
        if (!searchTermsExclude.isEmpty()) sb.append("exclude=").append(searchTermsExclude.stream().collect(Collectors.joining(",", "[", "] ")));
        if (!searchTermsAdvice.isEmpty()) sb.append("advice=").append(searchTermsAdvice.stream().collect(Collectors.joining(",", "[", "] ")));
        if (!searchTermsPriority.isEmpty()) sb.append("priority=").append(searchTermsPriority.stream().collect(Collectors.joining(",", "[", "] ")));
+        if (!searchTermCoherences.isEmpty()) sb.append("coherences=").append(searchTermCoherences.stream().map(coh->coh.stream().collect(Collectors.joining(",", "[", "] "))).collect(Collectors.joining(", ")));

        return sb.toString();
    }
--- a/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultPreliminaryScore.java
+++ b/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultPreliminaryScore.java
@ -6,9 +6,7 @@ import static java.lang.Boolean.compare;
 import static java.lang.Double.compare;

 public record SearchResultPreliminaryScore(
-        boolean anyAllSynthetic,
-        int minNumberOfFlagsSet,
-        int minPositionsSet,
+        boolean disqualified,
        boolean hasPriorityTerm,
        double searchRankingScore)
        implements Comparable<SearchResultPreliminaryScore>
@ -27,16 +25,7 @@ public record SearchResultPreliminaryScore(
        return PREFER_LOW * compare(searchRankingScore, other.searchRankingScore);
    }

-    public boolean isEmpty() {
-        if (minNumberOfFlagsSet > 0)
-            return false;
-
-        if (anyAllSynthetic)
-            return false;
-
-        if (minPositionsSet > 0)
-            return false;
-
-        return true;
+    public boolean isDisqualified() {
+        return disqualified;
    }
 }
--- a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java
+++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java
@ -110,11 +110,12 @@ public class SearchIndex {
        // that contain pairs of two search terms
        if (orderedIncludes.length > 1) {
            for (int i = 0; i + 1 < orderedIncludes.length; i++) {
-                var remainingWords = Arrays.copyOfRange(orderedIncludes, i+1, orderedIncludes.length);
-                var entrySource = indexReader
-                        .findPriorityWord(orderedIncludes[i])
-                        .alsoPrioAnyOf(remainingWords);
-                queryHeads.add(entrySource);
+                for (int j = i + 1; j < orderedIncludes.length; j++) {
+                    var entrySource = indexReader
+                            .findPriorityWord(orderedIncludes[i])
+                            .alsoPrio(orderedIncludes[j]);
+                    queryHeads.add(entrySource);
+                }
            }
        }

--- a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexSearchTerms.java
+++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexSearchTerms.java
@ -5,9 +5,18 @@ import it.unimi.dsi.fastutil.ints.IntComparator;
 import it.unimi.dsi.fastutil.ints.IntList;
 import it.unimi.dsi.fastutil.ints.IntOpenHashSet;

-public record SearchIndexSearchTerms(IntList includes, IntList excludes, IntList priority) {
+import java.util.Collections;
+import java.util.List;
+
+public record SearchIndexSearchTerms(
+        IntList includes,
+        IntList excludes,
+        IntList priority,
+        List<IntList> coherences
+        )
+{
    public SearchIndexSearchTerms() {
-        this(IntList.of(), IntList.of(), IntList.of());
+        this(IntList.of(), IntList.of(), IntList.of(), Collections.emptyList());
    }

    public boolean isEmpty() {
--- a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexMetadataService.java
+++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexMetadataService.java
@ -8,8 +8,10 @@ import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
 import nu.marginalia.index.client.model.query.SearchSubquery;
 import nu.marginalia.index.index.SearchIndex;
 import nu.marginalia.index.svc.SearchTermsService;
+import nu.marginalia.model.idx.WordMetadata;
 import nu.marginalia.ranking.ResultValuator;

+import java.util.ArrayList;
 import java.util.List;
 import java.util.OptionalInt;

@ -74,7 +76,27 @@ public class IndexMetadataService {
            }
        }

-        return new QuerySearchTerms(termToId, termIdsList.toIntArray());
+
+        return new QuerySearchTerms(termToId,
+                termIdsList.toIntArray(),
+                getTermCoherences(searchTermVariants));
+    }
+
+
+    private TermCoherences getTermCoherences(List<SearchSubquery> searchTermVariants) {
+        List<int[]> coherences = new ArrayList<>();
+
+        for (var subquery : searchTermVariants) {
+            for (var coh : subquery.searchTermCoherences) {
+                int[] ids = coh.stream().map(searchTermsService::lookUpWord).filter(OptionalInt::isPresent).mapToInt(OptionalInt::getAsInt).toArray();
+                coherences.add(ids);
+            }
+
+            // It's assumed each subquery has identical coherences
+            break;
+        }
+
+        return new TermCoherences(coherences);
    }

    public TLongHashSet getResultsWithPriorityTerms(List<SearchSubquery> subqueries, long[] resultsArray) {
@ -116,15 +138,32 @@ public class IndexMetadataService {
            return termdocToMeta.getOrDefault(termdocKey(termId, docId), 0);
        }

+        public boolean testCoherence(long docId, TermCoherences coherences) {
+
+            for (var coherenceSet : coherences.words()) {
+                long overlap = 0xFF_FFFF_FFFF_FFFFL;
+                for (var word : coherenceSet) {
+                    overlap &= WordMetadata.decodePositions(getTermMetadata(word, docId));
+                }
+                if (overlap == 0L) {
+                    return false;
+                }
+            }
+
+            return true;
+        }
    }

    public static class QuerySearchTerms {
        private final TObjectIntHashMap<String> termToId;
        public final int[] termIdsAll;

-        public QuerySearchTerms(TObjectIntHashMap<String> termToId, int[] termIdsAll) {
+        public final TermCoherences coherences;
+
+        public QuerySearchTerms(TObjectIntHashMap<String> termToId, int[] termIdsAll, TermCoherences coherences) {
            this.termToId = termToId;
            this.termIdsAll = termIdsAll;
+            this.coherences = coherences;
        }

        public int get(String searchTerm) {
@ -132,6 +171,8 @@ public class IndexMetadataService {
        }
    }

+    public record TermCoherences(List<int[]> words) {}
+
    private static long termdocKey(int termId, long docId) {
        return (docId << 32) | termId;
    }
--- a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java
+++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java
@ -117,10 +117,15 @@ public class IndexResultValuator {

        double score = searchResultValuator.calculateSearchResultValue(searchResult.keywordScores, 5000, rankingContext);

+        boolean disqualified = false;
+
+        if (!termMetadata.testCoherence(urlIdInt, searchTerms.coherences))
+            disqualified = true;
+        else if (maxFlagsCount == 0 && !anyAllSynthetic && maxPositionsSet == 0)
+            disqualified = true;
+
        searchResult.setScore(new SearchResultPreliminaryScore(
-                anyAllSynthetic,
-                maxFlagsCount,
-                maxPositionsSet,
+                disqualified,
                hasPriorityTerm,
                score
        ));
@ -140,6 +145,7 @@ public class IndexResultValuator {
                return false;
            }
        }
+
        return true;
    }

--- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java
+++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java
@ -155,6 +155,7 @@ public class IndexQueryService {
        outer:
        // These queries are various term combinations
        for (var subquery : params.subqueries) {
+
            final SearchIndexSearchTerms searchTerms = searchTermsSvc.getSearchTerms(subquery);

            if (searchTerms.isEmpty()) {
@ -195,16 +196,20 @@ public class IndexQueryService {
        }

        var includes = subquery.searchTermsInclude;
+        var advice = subquery.searchTermsAdvice;
        var excludes = subquery.searchTermsExclude;
        var priority = subquery.searchTermsPriority;

-        for (int i = 0; i < subquery.searchTermsInclude.size(); i++) {
+        for (int i = 0; i < includes.size(); i++) {
            logger.info(queryMarker, "{} -> {} I", includes.get(i), searchTerms.includes().getInt(i));
        }
-        for (int i = 0; i < subquery.searchTermsExclude.size(); i++) {
+        for (int i = 0; i < advice.size(); i++) {
+            logger.info(queryMarker, "{} -> {} A", advice.get(i), searchTerms.includes().getInt(includes.size() + i));
+        }
+        for (int i = 0; i < excludes.size(); i++) {
            logger.info(queryMarker, "{} -> {} E", excludes.get(i), searchTerms.excludes().getInt(i));
        }
-        for (int i = 0; i < subquery.searchTermsPriority.size(); i++) {
+        for (int i = 0; i < priority.size(); i++) {
            logger.info(queryMarker, "{} -> {} P", priority.get(i), searchTerms.priority().getInt(i));
        }
    }
@ -247,7 +252,7 @@ public class IndexQueryService {
        return Arrays.stream(resultIds.toArray())
                .parallel()
                .mapToObj(evaluator::calculatePreliminaryScore)
-                .filter(score -> !score.getScore().isEmpty())
+                .filter(score -> !score.getScore().isDisqualified())
                .collect(Collectors.toList());
    }

--- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchTermsService.java
+++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchTermsService.java
@ -11,10 +11,7 @@ import nu.marginalia.lexicon.KeywordLexiconReadOnlyView;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.OptionalInt;
+import java.util.*;

@Singleton
 public class SearchTermsService {
@ -30,34 +27,49 @@ public class SearchTermsService {
        final IntList excludes = new IntArrayList();
        final IntList includes = new IntArrayList();
        final IntList priority = new IntArrayList();
+        final List<IntList> coherences = new ArrayList<>();

-        for (var include : request.searchTermsInclude) {
-            var word = lookUpWord(include);
-            if (word.isEmpty()) {
-                logger.debug("Unknown search term: " + include);
+        if (!addEachTerm(includes, request.searchTermsInclude)) {
+            return new SearchIndexSearchTerms();
+        }
+
+        //                  This looks like a bug, but it's not
+        //                    v---                ----v
+        if (!addEachTerm(includes, request.searchTermsAdvice)) {
+            return new SearchIndexSearchTerms();
+        }
+
+        for (var coherence : request.searchTermCoherences) {
+            IntList parts = new IntArrayList(coherence.size());
+
+            if (!addEachTerm(parts, coherence)) {
                return new SearchIndexSearchTerms();
            }
-            includes.add(word.getAsInt());
+
+            coherences.add(parts);
        }

+        // we don't care if we can't find these:
+        addEachTerm(excludes, request.searchTermsExclude);
+        addEachTerm(priority, request.searchTermsPriority);

-        for (var advice : request.searchTermsAdvice) {
-            var word = lookUpWord(advice);
-            if (word.isEmpty()) {
-                logger.debug("Unknown search term: " + advice);
-                return new SearchIndexSearchTerms();
+        return new SearchIndexSearchTerms(includes, excludes, priority, coherences);
+    }
+
+    private boolean addEachTerm(IntList ret, List<String> words) {
+        boolean success = true;
+
+        for (var exclude : words) {
+            var word = lookUpWord(exclude);
+
+            if (word.isPresent()) {
+                lookUpWord(exclude).ifPresent(ret::add);
+            }
+            else {
+                success = false;
            }
-            includes.add(word.getAsInt());
        }
-
-        for (var exclude : request.searchTermsExclude) {
-            lookUpWord(exclude).ifPresent(excludes::add);
-        }
-        for (var exclude : request.searchTermsPriority) {
-            lookUpWord(exclude).ifPresent(priority::add);
-        }
-
-        return new SearchIndexSearchTerms(includes, excludes, priority);
+        return success;
    }


--- a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java
+++ b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java
@ -28,11 +28,9 @@ import spark.Spark;

 import java.io.IOException;
 import java.util.*;
-import java.util.stream.Collectors;
 import java.util.stream.IntStream;

 import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertTrue;
 import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD;

@Execution(SAME_THREAD)
@ -91,8 +89,8 @@ public class IndexQueryServiceIntegrationTest {
                        .domains(new ArrayList<>())
                        .searchSetIdentifier(SearchSetIdentifier.NONE)
                        .subqueries(List.of(new SearchSubquery(
-                                List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList()
-                        ))).build());
+                                List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(),
+                                Collections.emptyList()))).build());

        Assertions.assertArrayEquals(
                new int[] { 30, 90, 150, 210, 270, 330, 390, 450, 510 },
@ -123,8 +121,8 @@ public class IndexQueryServiceIntegrationTest {
                        .queryStrategy(QueryStrategy.SENTENCE)
                        .domains(List.of(2))
                        .subqueries(List.of(new SearchSubquery(
-                                List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList()
-                        ))).build());
+                                List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(),
+                                Collections.emptyList()))).build());
        Assertions.assertArrayEquals(
                new int[] { 210, 270 },
                rsp.results.stream().mapToInt(SearchResultItem::getUrlIdInt).toArray());
@ -149,8 +147,8 @@ public class IndexQueryServiceIntegrationTest {
                        .searchSetIdentifier(SearchSetIdentifier.NONE)
                        .rankingParams(ResultRankingParameters.sensibleDefaults())
                        .subqueries(List.of(new SearchSubquery(
-                                List.of("4"), Collections.emptyList(), Collections.emptyList(), Collections.emptyList()
-                        ))
+                                List.of("4"), Collections.emptyList(), Collections.emptyList(), Collections.emptyList(),
+                                Collections.emptyList()))
                        ).build());


@ -167,8 +165,6 @@ public class IndexQueryServiceIntegrationTest {

    }

-
-
    public void loadData(int id) {
        int[] factors = IntStream
                .rangeClosed(1, id)
--- a/code/services-core/search-service/src/main/java/nu/marginalia/search/query/QueryFactory.java
+++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/query/QueryFactory.java
@ -83,8 +83,8 @@ public class QueryFactory {
                Arrays.asList(termsInclude),
                Collections.emptyList(),
                Collections.emptyList(),
-                Collections.emptyList()
-        ));
+                Collections.emptyList(),
+                Collections.emptyList()));

        var specs = SearchSpecification.builder()
                .subqueries(sqs)
--- a/code/services-core/search-service/src/main/java/nu/marginalia/search/query/QuerySearchTermsAccumulator.java
+++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/query/QuerySearchTermsAccumulator.java
@ -9,17 +9,19 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;

+/** @see SearchSubquery */
 public class QuerySearchTermsAccumulator implements TokenVisitor {
    public List<String> searchTermsExclude = new ArrayList<>();
    public List<String> searchTermsInclude = new ArrayList<>();
    public List<String> searchTermsAdvice = new ArrayList<>();
    public List<String> searchTermsPriority = new ArrayList<>();
+    public List<List<String>> searchTermCoherences = new ArrayList<>();

    public String near;
    public String domain;

    public SearchSubquery createSubquery() {
-        return new SearchSubquery(searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority);
+        return new SearchSubquery(searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority, searchTermCoherences);
    }

    public QuerySearchTermsAccumulator(SearchProfile profile, List<Token> parts) {
@ -45,11 +47,19 @@ public class QuerySearchTermsAccumulator implements TokenVisitor {
    public void onQuotTerm(Token token) {
        String[] parts = token.str.split("_");
        if (parts.length > 1) {
+            // Prefer that the actual n-gram is present
            searchTermsAdvice.add(token.str);
+
+            // Require that the terms appear in the same sentence
+            searchTermCoherences.add(Arrays.asList(parts));
+
+            // Require that each term exists in the document
+            // (needed for ranking)
            searchTermsInclude.addAll(Arrays.asList(parts));
        }
        else {
            searchTermsInclude.add(token.str);
+
        }
    }

--- a/code/tools/load-test/src/main/java/nu/marginalia/load_test/LoadTestMain.java
+++ b/code/tools/load-test/src/main/java/nu/marginalia/load_test/LoadTestMain.java
@ -29,7 +29,7 @@ public class LoadTestMain {

        for (int i = 0; i < 10000; i++) {
            String uri = "http://127.0.0.1:8080/search?query=%s&profile=corpo".formatted(
-                    Strings.join(pickNCommonWords(2), '+')
+                    Strings.join(pickNCommonWords(4), '+')
            );

            HttpRequest req = HttpRequest.newBuilder(new URI(uri))