From fe419b12b46d8ef317fb8941ebc64b6c3c9eec26 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 10 Apr 2023 13:11:40 +0200 Subject: [PATCH] Better handling of quote terms, fix bug in handling of longer queries. ... where some terms may previously have been ignored. The latter bug was due to the handling of QueryHeads with AnyOf-style predicates interacting poorly with alreadyConsideredTerms in SearchIndex.java --- .../client/model/query/SearchSubquery.java | 13 ++-- .../results/SearchResultPreliminaryScore.java | 17 +----- .../marginalia/index/index/SearchIndex.java | 11 ++-- .../index/index/SearchIndexSearchTerms.java | 13 +++- .../index/results/IndexMetadataService.java | 45 +++++++++++++- .../index/results/IndexResultValuator.java | 12 +++- .../index/svc/IndexQueryService.java | 13 ++-- .../index/svc/SearchTermsService.java | 60 +++++++++++-------- .../svc/IndexQueryServiceIntegrationTest.java | 16 ++--- .../marginalia/search/query/QueryFactory.java | 4 +- .../query/QuerySearchTermsAccumulator.java | 12 +++- .../nu/marginalia/load_test/LoadTestMain.java | 2 +- 12 files changed, 146 insertions(+), 72 deletions(-) diff --git a/code/api/index-api/src/main/java/nu/marginalia/index/client/model/query/SearchSubquery.java b/code/api/index-api/src/main/java/nu/marginalia/index/client/model/query/SearchSubquery.java index 83422fab..3c2c4bbc 100644 --- a/code/api/index-api/src/main/java/nu/marginalia/index/client/model/query/SearchSubquery.java +++ b/code/api/index-api/src/main/java/nu/marginalia/index/client/model/query/SearchSubquery.java @@ -10,7 +10,7 @@ import java.util.stream.Collectors; @AllArgsConstructor public class SearchSubquery { - /** These terms must be present in the document */ + /** These terms must be present in the document and are used in ranking*/ public final List searchTermsInclude; /** These terms must be absent from the document */ @@ -21,18 +21,22 @@ public class SearchSubquery { /** If these optional terms are present in the document, rank it highly */ public final List searchTermsPriority; - + + /** Terms that we require to be in the same sentence */ + public final List> searchTermCoherences; + private double value = 0; public SearchSubquery(List searchTermsInclude, List searchTermsExclude, List searchTermsAdvice, - List searchTermsPriority - ) { + List searchTermsPriority, + List> searchTermCoherences) { this.searchTermsInclude = searchTermsInclude; this.searchTermsExclude = searchTermsExclude; this.searchTermsAdvice = searchTermsAdvice; this.searchTermsPriority = searchTermsPriority; + this.searchTermCoherences = searchTermCoherences; } public SearchSubquery setValue(double value) { @@ -51,6 +55,7 @@ public class SearchSubquery { if (!searchTermsExclude.isEmpty()) sb.append("exclude=").append(searchTermsExclude.stream().collect(Collectors.joining(",", "[", "] "))); if (!searchTermsAdvice.isEmpty()) sb.append("advice=").append(searchTermsAdvice.stream().collect(Collectors.joining(",", "[", "] "))); if (!searchTermsPriority.isEmpty()) sb.append("priority=").append(searchTermsPriority.stream().collect(Collectors.joining(",", "[", "] "))); + if (!searchTermCoherences.isEmpty()) sb.append("coherences=").append(searchTermCoherences.stream().map(coh->coh.stream().collect(Collectors.joining(",", "[", "] "))).collect(Collectors.joining(", "))); return sb.toString(); } diff --git a/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultPreliminaryScore.java b/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultPreliminaryScore.java index e2a38e52..b8696a4c 100644 --- a/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultPreliminaryScore.java +++ b/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultPreliminaryScore.java @@ -6,9 +6,7 @@ import static java.lang.Boolean.compare; import static java.lang.Double.compare; public record SearchResultPreliminaryScore( - boolean anyAllSynthetic, - int minNumberOfFlagsSet, - int minPositionsSet, + boolean disqualified, boolean hasPriorityTerm, double searchRankingScore) implements Comparable @@ -27,16 +25,7 @@ public record SearchResultPreliminaryScore( return PREFER_LOW * compare(searchRankingScore, other.searchRankingScore); } - public boolean isEmpty() { - if (minNumberOfFlagsSet > 0) - return false; - - if (anyAllSynthetic) - return false; - - if (minPositionsSet > 0) - return false; - - return true; + public boolean isDisqualified() { + return disqualified; } } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java index 7f4a7fc0..2656263a 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java @@ -110,11 +110,12 @@ public class SearchIndex { // that contain pairs of two search terms if (orderedIncludes.length > 1) { for (int i = 0; i + 1 < orderedIncludes.length; i++) { - var remainingWords = Arrays.copyOfRange(orderedIncludes, i+1, orderedIncludes.length); - var entrySource = indexReader - .findPriorityWord(orderedIncludes[i]) - .alsoPrioAnyOf(remainingWords); - queryHeads.add(entrySource); + for (int j = i + 1; j < orderedIncludes.length; j++) { + var entrySource = indexReader + .findPriorityWord(orderedIncludes[i]) + .alsoPrio(orderedIncludes[j]); + queryHeads.add(entrySource); + } } } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexSearchTerms.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexSearchTerms.java index fdd5f2ac..aedd7da3 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexSearchTerms.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexSearchTerms.java @@ -5,9 +5,18 @@ import it.unimi.dsi.fastutil.ints.IntComparator; import it.unimi.dsi.fastutil.ints.IntList; import it.unimi.dsi.fastutil.ints.IntOpenHashSet; -public record SearchIndexSearchTerms(IntList includes, IntList excludes, IntList priority) { +import java.util.Collections; +import java.util.List; + +public record SearchIndexSearchTerms( + IntList includes, + IntList excludes, + IntList priority, + List coherences + ) +{ public SearchIndexSearchTerms() { - this(IntList.of(), IntList.of(), IntList.of()); + this(IntList.of(), IntList.of(), IntList.of(), Collections.emptyList()); } public boolean isEmpty() { diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexMetadataService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexMetadataService.java index 1bbca926..105c6f04 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexMetadataService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexMetadataService.java @@ -8,8 +8,10 @@ import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap; import nu.marginalia.index.client.model.query.SearchSubquery; import nu.marginalia.index.index.SearchIndex; import nu.marginalia.index.svc.SearchTermsService; +import nu.marginalia.model.idx.WordMetadata; import nu.marginalia.ranking.ResultValuator; +import java.util.ArrayList; import java.util.List; import java.util.OptionalInt; @@ -74,7 +76,27 @@ public class IndexMetadataService { } } - return new QuerySearchTerms(termToId, termIdsList.toIntArray()); + + return new QuerySearchTerms(termToId, + termIdsList.toIntArray(), + getTermCoherences(searchTermVariants)); + } + + + private TermCoherences getTermCoherences(List searchTermVariants) { + List coherences = new ArrayList<>(); + + for (var subquery : searchTermVariants) { + for (var coh : subquery.searchTermCoherences) { + int[] ids = coh.stream().map(searchTermsService::lookUpWord).filter(OptionalInt::isPresent).mapToInt(OptionalInt::getAsInt).toArray(); + coherences.add(ids); + } + + // It's assumed each subquery has identical coherences + break; + } + + return new TermCoherences(coherences); } public TLongHashSet getResultsWithPriorityTerms(List subqueries, long[] resultsArray) { @@ -116,15 +138,32 @@ public class IndexMetadataService { return termdocToMeta.getOrDefault(termdocKey(termId, docId), 0); } + public boolean testCoherence(long docId, TermCoherences coherences) { + + for (var coherenceSet : coherences.words()) { + long overlap = 0xFF_FFFF_FFFF_FFFFL; + for (var word : coherenceSet) { + overlap &= WordMetadata.decodePositions(getTermMetadata(word, docId)); + } + if (overlap == 0L) { + return false; + } + } + + return true; + } } public static class QuerySearchTerms { private final TObjectIntHashMap termToId; public final int[] termIdsAll; - public QuerySearchTerms(TObjectIntHashMap termToId, int[] termIdsAll) { + public final TermCoherences coherences; + + public QuerySearchTerms(TObjectIntHashMap termToId, int[] termIdsAll, TermCoherences coherences) { this.termToId = termToId; this.termIdsAll = termIdsAll; + this.coherences = coherences; } public int get(String searchTerm) { @@ -132,6 +171,8 @@ public class IndexMetadataService { } } + public record TermCoherences(List words) {} + private static long termdocKey(int termId, long docId) { return (docId << 32) | termId; } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java index 4cdd6949..dd13e942 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java @@ -117,10 +117,15 @@ public class IndexResultValuator { double score = searchResultValuator.calculateSearchResultValue(searchResult.keywordScores, 5000, rankingContext); + boolean disqualified = false; + + if (!termMetadata.testCoherence(urlIdInt, searchTerms.coherences)) + disqualified = true; + else if (maxFlagsCount == 0 && !anyAllSynthetic && maxPositionsSet == 0) + disqualified = true; + searchResult.setScore(new SearchResultPreliminaryScore( - anyAllSynthetic, - maxFlagsCount, - maxPositionsSet, + disqualified, hasPriorityTerm, score )); @@ -140,6 +145,7 @@ public class IndexResultValuator { return false; } } + return true; } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java index d1269749..13acaaf2 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java @@ -155,6 +155,7 @@ public class IndexQueryService { outer: // These queries are various term combinations for (var subquery : params.subqueries) { + final SearchIndexSearchTerms searchTerms = searchTermsSvc.getSearchTerms(subquery); if (searchTerms.isEmpty()) { @@ -195,16 +196,20 @@ public class IndexQueryService { } var includes = subquery.searchTermsInclude; + var advice = subquery.searchTermsAdvice; var excludes = subquery.searchTermsExclude; var priority = subquery.searchTermsPriority; - for (int i = 0; i < subquery.searchTermsInclude.size(); i++) { + for (int i = 0; i < includes.size(); i++) { logger.info(queryMarker, "{} -> {} I", includes.get(i), searchTerms.includes().getInt(i)); } - for (int i = 0; i < subquery.searchTermsExclude.size(); i++) { + for (int i = 0; i < advice.size(); i++) { + logger.info(queryMarker, "{} -> {} A", advice.get(i), searchTerms.includes().getInt(includes.size() + i)); + } + for (int i = 0; i < excludes.size(); i++) { logger.info(queryMarker, "{} -> {} E", excludes.get(i), searchTerms.excludes().getInt(i)); } - for (int i = 0; i < subquery.searchTermsPriority.size(); i++) { + for (int i = 0; i < priority.size(); i++) { logger.info(queryMarker, "{} -> {} P", priority.get(i), searchTerms.priority().getInt(i)); } } @@ -247,7 +252,7 @@ public class IndexQueryService { return Arrays.stream(resultIds.toArray()) .parallel() .mapToObj(evaluator::calculatePreliminaryScore) - .filter(score -> !score.getScore().isEmpty()) + .filter(score -> !score.getScore().isDisqualified()) .collect(Collectors.toList()); } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchTermsService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchTermsService.java index 81d561b1..7e5cad50 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchTermsService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchTermsService.java @@ -11,10 +11,7 @@ import nu.marginalia.lexicon.KeywordLexiconReadOnlyView; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.OptionalInt; +import java.util.*; @Singleton public class SearchTermsService { @@ -30,34 +27,49 @@ public class SearchTermsService { final IntList excludes = new IntArrayList(); final IntList includes = new IntArrayList(); final IntList priority = new IntArrayList(); + final List coherences = new ArrayList<>(); - for (var include : request.searchTermsInclude) { - var word = lookUpWord(include); - if (word.isEmpty()) { - logger.debug("Unknown search term: " + include); + if (!addEachTerm(includes, request.searchTermsInclude)) { + return new SearchIndexSearchTerms(); + } + + // This looks like a bug, but it's not + // v--- ----v + if (!addEachTerm(includes, request.searchTermsAdvice)) { + return new SearchIndexSearchTerms(); + } + + for (var coherence : request.searchTermCoherences) { + IntList parts = new IntArrayList(coherence.size()); + + if (!addEachTerm(parts, coherence)) { return new SearchIndexSearchTerms(); } - includes.add(word.getAsInt()); + + coherences.add(parts); } + // we don't care if we can't find these: + addEachTerm(excludes, request.searchTermsExclude); + addEachTerm(priority, request.searchTermsPriority); - for (var advice : request.searchTermsAdvice) { - var word = lookUpWord(advice); - if (word.isEmpty()) { - logger.debug("Unknown search term: " + advice); - return new SearchIndexSearchTerms(); + return new SearchIndexSearchTerms(includes, excludes, priority, coherences); + } + + private boolean addEachTerm(IntList ret, List words) { + boolean success = true; + + for (var exclude : words) { + var word = lookUpWord(exclude); + + if (word.isPresent()) { + lookUpWord(exclude).ifPresent(ret::add); + } + else { + success = false; } - includes.add(word.getAsInt()); } - - for (var exclude : request.searchTermsExclude) { - lookUpWord(exclude).ifPresent(excludes::add); - } - for (var exclude : request.searchTermsPriority) { - lookUpWord(exclude).ifPresent(priority::add); - } - - return new SearchIndexSearchTerms(includes, excludes, priority); + return success; } diff --git a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java index d4a75f80..fd92354b 100644 --- a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java +++ b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java @@ -28,11 +28,9 @@ import spark.Spark; import java.io.IOException; import java.util.*; -import java.util.stream.Collectors; import java.util.stream.IntStream; import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD; @Execution(SAME_THREAD) @@ -91,8 +89,8 @@ public class IndexQueryServiceIntegrationTest { .domains(new ArrayList<>()) .searchSetIdentifier(SearchSetIdentifier.NONE) .subqueries(List.of(new SearchSubquery( - List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList() - ))).build()); + List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(), + Collections.emptyList()))).build()); Assertions.assertArrayEquals( new int[] { 30, 90, 150, 210, 270, 330, 390, 450, 510 }, @@ -123,8 +121,8 @@ public class IndexQueryServiceIntegrationTest { .queryStrategy(QueryStrategy.SENTENCE) .domains(List.of(2)) .subqueries(List.of(new SearchSubquery( - List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList() - ))).build()); + List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(), + Collections.emptyList()))).build()); Assertions.assertArrayEquals( new int[] { 210, 270 }, rsp.results.stream().mapToInt(SearchResultItem::getUrlIdInt).toArray()); @@ -149,8 +147,8 @@ public class IndexQueryServiceIntegrationTest { .searchSetIdentifier(SearchSetIdentifier.NONE) .rankingParams(ResultRankingParameters.sensibleDefaults()) .subqueries(List.of(new SearchSubquery( - List.of("4"), Collections.emptyList(), Collections.emptyList(), Collections.emptyList() - )) + List.of("4"), Collections.emptyList(), Collections.emptyList(), Collections.emptyList(), + Collections.emptyList())) ).build()); @@ -167,8 +165,6 @@ public class IndexQueryServiceIntegrationTest { } - - public void loadData(int id) { int[] factors = IntStream .rangeClosed(1, id) diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/query/QueryFactory.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/query/QueryFactory.java index d61e5681..cb77ca32 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/query/QueryFactory.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/query/QueryFactory.java @@ -83,8 +83,8 @@ public class QueryFactory { Arrays.asList(termsInclude), Collections.emptyList(), Collections.emptyList(), - Collections.emptyList() - )); + Collections.emptyList(), + Collections.emptyList())); var specs = SearchSpecification.builder() .subqueries(sqs) diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/query/QuerySearchTermsAccumulator.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/query/QuerySearchTermsAccumulator.java index 730b8f99..6fcb62c2 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/query/QuerySearchTermsAccumulator.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/query/QuerySearchTermsAccumulator.java @@ -9,17 +9,19 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; +/** @see SearchSubquery */ public class QuerySearchTermsAccumulator implements TokenVisitor { public List searchTermsExclude = new ArrayList<>(); public List searchTermsInclude = new ArrayList<>(); public List searchTermsAdvice = new ArrayList<>(); public List searchTermsPriority = new ArrayList<>(); + public List> searchTermCoherences = new ArrayList<>(); public String near; public String domain; public SearchSubquery createSubquery() { - return new SearchSubquery(searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority); + return new SearchSubquery(searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority, searchTermCoherences); } public QuerySearchTermsAccumulator(SearchProfile profile, List parts) { @@ -45,11 +47,19 @@ public class QuerySearchTermsAccumulator implements TokenVisitor { public void onQuotTerm(Token token) { String[] parts = token.str.split("_"); if (parts.length > 1) { + // Prefer that the actual n-gram is present searchTermsAdvice.add(token.str); + + // Require that the terms appear in the same sentence + searchTermCoherences.add(Arrays.asList(parts)); + + // Require that each term exists in the document + // (needed for ranking) searchTermsInclude.addAll(Arrays.asList(parts)); } else { searchTermsInclude.add(token.str); + } } diff --git a/code/tools/load-test/src/main/java/nu/marginalia/load_test/LoadTestMain.java b/code/tools/load-test/src/main/java/nu/marginalia/load_test/LoadTestMain.java index 8dcde846..cfbaeec6 100644 --- a/code/tools/load-test/src/main/java/nu/marginalia/load_test/LoadTestMain.java +++ b/code/tools/load-test/src/main/java/nu/marginalia/load_test/LoadTestMain.java @@ -29,7 +29,7 @@ public class LoadTestMain { for (int i = 0; i < 10000; i++) { String uri = "http://127.0.0.1:8080/search?query=%s&profile=corpo".formatted( - Strings.join(pickNCommonWords(2), '+') + Strings.join(pickNCommonWords(4), '+') ); HttpRequest req = HttpRequest.newBuilder(new URI(uri))