From 1097fe6e25a45b13ce49a4449fe2d07d23b2b6f6 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 29 Mar 2023 15:17:55 +0200 Subject: [PATCH] Fix bugs related to search result selection in the case with multiple search terms. * A deduplication filter step ran too early, and removed many good results on the basis that they partially, but did not fully fit another set of search terms. * Altered the query creation process to prefer documents where multiple terms appear in the priority index. --- .../index/query/IndexQueryBuilder.java | 14 +++- .../priority/ReverseIndexPriorityReader.java | 14 ++++ .../marginalia/index/index/SearchIndex.java | 72 ++++++++++++------- .../index/index/SearchIndexQueryBuilder.java | 37 +++++++++- .../index/index/SearchIndexReader.java | 38 ++-------- .../index/svc/IndexQueryService.java | 35 +++++---- .../index/svc/SearchParameters.java | 4 +- .../index/svc/SearchTermsService.java | 4 +- 8 files changed, 140 insertions(+), 78 deletions(-) diff --git a/code/features-index/index-query/src/main/java/nu/marginalia/index/query/IndexQueryBuilder.java b/code/features-index/index-query/src/main/java/nu/marginalia/index/query/IndexQueryBuilder.java index dddc1e3b..adcd1861 100644 --- a/code/features-index/index-query/src/main/java/nu/marginalia/index/query/IndexQueryBuilder.java +++ b/code/features-index/index-query/src/main/java/nu/marginalia/index/query/IndexQueryBuilder.java @@ -3,9 +3,19 @@ package nu.marginalia.index.query; import nu.marginalia.index.query.filter.QueryFilterStepIf; public interface IndexQueryBuilder { - IndexQueryBuilder also(int termId); + /** Filters documents that also contain termId, within the full index. + */ + IndexQueryBuilder alsoFull(int termId); + + /** + * Filters documents that also contain any of the provided termIds, within the priority index. + */ + IndexQueryBuilder alsoPrioAnyOf(int... termIds); + + /** Excludes documents that contain termId, within the full index + */ + IndexQueryBuilder notFull(int termId); - IndexQueryBuilder not(int termId); IndexQueryBuilder addInclusionFilter(QueryFilterStepIf filterStep); IndexQuery build(); diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityReader.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityReader.java index b4b5d75a..bccce6c0 100644 --- a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityReader.java +++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityReader.java @@ -5,6 +5,9 @@ import nu.marginalia.index.query.EntrySource; import nu.marginalia.array.LongArray; import nu.marginalia.btree.BTreeReader; import nu.marginalia.index.query.EmptyEntrySource; +import nu.marginalia.index.query.ReverseIndexRetainFilter; +import nu.marginalia.index.query.filter.QueryFilterNoPass; +import nu.marginalia.index.query.filter.QueryFilterStepIf; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -49,4 +52,15 @@ public class ReverseIndexPriorityReader { private BTreeReader createReaderNew(long offset) { return new BTreeReader(documents, ReverseIndexPriorityParameters.bTreeContext, offset); } + + public QueryFilterStepIf also(int wordId) { + if (wordId < 0) return new QueryFilterNoPass(); + + long offset = words.get(wordId); + + if (offset < 0) return new QueryFilterNoPass(); + + return new ReverseIndexRetainFilter(createReaderNew(offset)); + } + } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java index 14e31fff..af26b2c7 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java @@ -5,8 +5,8 @@ import com.google.inject.Singleton; import nu.marginalia.index.IndexServicesFactory; import nu.marginalia.index.query.IndexQuery; import nu.marginalia.index.query.IndexQueryBuilder; -import nu.marginalia.index.results.IndexResultDomainDeduplicator; import nu.marginalia.index.query.IndexQueryParams; +import nu.marginalia.index.query.ReverseIndexEntrySourceBehavior; import nu.marginalia.index.query.filter.QueryFilterStepFromPredicate; import nu.marginalia.index.svc.IndexSearchSetsService; import org.jetbrains.annotations.NotNull; @@ -14,7 +14,10 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; +import java.util.List; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; @@ -90,42 +93,63 @@ public class SearchIndex { return indexReader != null; } - public IndexQuery createQuery(SearchIndexSearchTerms terms, IndexQueryParams params, LongPredicate includePred) { - if (null == indexReader) { + public List createQueries(SearchIndexSearchTerms terms, IndexQueryParams params, LongPredicate includePred) { + + if (!isAvailable()) { logger.warn("Index reader not ready"); - return new IndexQuery(Collections.emptyList()); + return Collections.emptyList(); } final int[] orderedIncludes = terms.sortedDistinctIncludes(this::compareKeywords); - IndexQueryBuilder query = - switch(params.queryStrategy()) { - case SENTENCE -> indexReader.findWordAsSentence(orderedIncludes); - case TOPIC, REQUIRE_FIELD_SITE, REQUIRE_FIELD_TITLE, REQUIRE_FIELD_SUBJECT, REQUIRE_FIELD_DOMAIN, REQUIRE_FIELD_URL - -> indexReader.findWordAsTopic(orderedIncludes); - case AUTO -> indexReader.findWordTopicDynamicMode(orderedIncludes); - }; + List queryHeads = new ArrayList<>(10); + List queries = new ArrayList<>(10); - if (query == null) { - return new IndexQuery(Collections.emptyList()); + // To ensure that good results are processed first, create query heads for the priority index that filter for terms + // that contain pairs of two search terms + if (orderedIncludes.length > 1) { + for (int i = 0; i + 1 < orderedIncludes.length; i++) { + var remainingWords = Arrays.copyOfRange(orderedIncludes, i+1, orderedIncludes.length); + var entrySource = indexReader + .findPriorityWord(orderedIncludes[i]) + .alsoPrioAnyOf(remainingWords); + queryHeads.add(entrySource); + } } - query = query.addInclusionFilter(new QueryFilterStepFromPredicate(includePred)); - - for (int i = 0; i < orderedIncludes.length; i++) { - query = query.also(orderedIncludes[i]); + // Next consider entries that appear only once in the priority index + for (var wordId : orderedIncludes) { + queryHeads.add(indexReader.findPriorityWord(wordId)); } - for (int term : terms.excludes()) { - query = query.not(term); + // Finally consider terms in the full index + queryHeads.add(indexReader.findFullWord(orderedIncludes[0], ReverseIndexEntrySourceBehavior.DO_PREFER)); + + for (var query : queryHeads) { + if (query == null) { + return Collections.emptyList(); + } + + + for (int orderedInclude : orderedIncludes) { + query = query.alsoFull(orderedInclude); + } + + for (int term : terms.excludes()) { + query = query.notFull(term); + } + + // This filtering step needs to happen only on terms that have passed all term-based filtering steps, + // it's essentially a memoization of the params filtering job which is relatively expensive + query = query.addInclusionFilter(new QueryFilterStepFromPredicate(includePred)); + + // Run these last, as they'll worst-case cause as many page faults as there are + // items in the buffer + queries.add(query.addInclusionFilter(indexReader.filterForParams(params)).build()); } - // Run these last, as they'll worst-case cause as many page faults as there are - // items in the buffer - return query - .addInclusionFilter(indexReader.filterForParams(params)) - .build(); + return queries; } private int compareKeywords(int a, int b) { diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexQueryBuilder.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexQueryBuilder.java index 80196a36..6de051f1 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexQueryBuilder.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexQueryBuilder.java @@ -1,27 +1,58 @@ package nu.marginalia.index.index; +import nu.marginalia.index.priority.ReverseIndexPriorityReader; import nu.marginalia.index.query.IndexQuery; import nu.marginalia.index.query.IndexQueryBuilder; import nu.marginalia.index.query.filter.QueryFilterStepIf; import nu.marginalia.index.full.ReverseIndexFullReader; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + public class SearchIndexQueryBuilder implements IndexQueryBuilder { private final IndexQuery query; private final ReverseIndexFullReader reverseIndexFullReader; + private final ReverseIndexPriorityReader reverseIndexPrioReader; - SearchIndexQueryBuilder(ReverseIndexFullReader reverseIndexFullReader, IndexQuery query) { + SearchIndexQueryBuilder(ReverseIndexFullReader reverseIndexFullReader, + ReverseIndexPriorityReader reverseIndexPrioReader, + IndexQuery query) + { this.query = query; this.reverseIndexFullReader = reverseIndexFullReader; + this.reverseIndexPrioReader = reverseIndexPrioReader; } - public IndexQueryBuilder also(int termId) { + public IndexQueryBuilder alsoFull(int termId) { query.addInclusionFilter(reverseIndexFullReader.also(termId)); return this; } - public IndexQueryBuilder not(int termId) { + public IndexQueryBuilder alsoPrioAnyOf(int... termIds) { + + QueryFilterStepIf step; + + if (termIds.length == 0) { + step = QueryFilterStepIf.noPass(); + } + else if (termIds.length == 1) { + step = reverseIndexPrioReader.also(termIds[0]); + } + else { + var steps = IntStream.of(termIds) + .mapToObj(reverseIndexPrioReader::also) + .collect(Collectors.toList()); + step = QueryFilterStepIf.anyOf(steps); + } + + query.addInclusionFilter(step); + + return this; + } + + public IndexQueryBuilder notFull(int termId) { query.addInclusionFilter(reverseIndexFullReader.not(termId)); diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexReader.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexReader.java index fe134f6e..d6dd854e 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexReader.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexReader.java @@ -13,12 +13,12 @@ import nu.marginalia.index.query.ReverseIndexEntrySourceBehavior; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.ArrayList; import java.util.List; public class SearchIndexReader { private final Logger logger = LoggerFactory.getLogger(getClass()); + private final ForwardIndexReader forwardIndexReader; private final ReverseIndexFullReader reverseIndexFullReader; private final ReverseIndexPriorityReader reverseIndexPriorityReader; @@ -31,38 +31,14 @@ public class SearchIndexReader { this.reverseIndexPriorityReader = reverseIndexPriorityReader; } - public IndexQueryBuilder findWordAsSentence(int[] wordIdsByFrequency) { - List entrySources = new ArrayList<>(1); - - entrySources.add(reverseIndexFullReader.documents(wordIdsByFrequency[0], ReverseIndexEntrySourceBehavior.DO_PREFER)); - - return new SearchIndexQueryBuilder(reverseIndexFullReader, new IndexQuery(entrySources)); + public IndexQueryBuilder findPriorityWord(int wordId) { + return new SearchIndexQueryBuilder(reverseIndexFullReader, reverseIndexPriorityReader, new IndexQuery( + List.of(reverseIndexPriorityReader.priorityDocuments(wordId)))); } - public IndexQueryBuilder findWordAsTopic(int[] wordIdsByFrequency) { - List entrySources = new ArrayList<>(wordIdsByFrequency.length); - - for (int wordId : wordIdsByFrequency) { - entrySources.add(reverseIndexPriorityReader.priorityDocuments(wordId)); - } - - return new SearchIndexQueryBuilder(reverseIndexFullReader, new IndexQuery(entrySources)); - } - - public IndexQueryBuilder findWordTopicDynamicMode(int[] wordIdsByFrequency) { - if (wordIdsByFrequency.length > 3) { - return findWordAsSentence(wordIdsByFrequency); - } - - List entrySources = new ArrayList<>(wordIdsByFrequency.length + 1); - - for (int wordId : wordIdsByFrequency) { - entrySources.add(reverseIndexPriorityReader.priorityDocuments(wordId)); - } - - entrySources.add(reverseIndexFullReader.documents(wordIdsByFrequency[0], ReverseIndexEntrySourceBehavior.DO_NOT_PREFER)); - - return new SearchIndexQueryBuilder(reverseIndexFullReader, new IndexQuery(entrySources)); + public IndexQueryBuilder findFullWord(int wordId, ReverseIndexEntrySourceBehavior behavior) { + return new SearchIndexQueryBuilder(reverseIndexFullReader, reverseIndexPriorityReader, new IndexQuery( + List.of(reverseIndexFullReader.documents(wordId, behavior)))); } QueryFilterStepIf filterForParams(IndexQueryParams params) { diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java index 8e3e5ae5..0b3b06fb 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java @@ -69,6 +69,10 @@ public class IndexQueryService { String json = request.body(); SearchSpecification specsSet = gson.fromJson(json, SearchSpecification.class); + if (!index.isAvailable()) { + Spark.halt(503, "Index is not loaded"); + } + try { return wmsa_edge_index_query_time.time(() -> { var params = new SearchParameters(specsSet, getSearchSet(specsSet)); @@ -135,36 +139,39 @@ public class IndexQueryService { final TLongList results = new TLongArrayList(params.fetchSize); logger.info(queryMarker, "{}", params.queryParams); - for (var sq : params.subqueries) { - final SearchIndexSearchTerms searchTerms = searchTermsSvc.getSearchTerms(sq); + + outer: + // These queries are various term combinations + for (var subquery : params.subqueries) { + final SearchIndexSearchTerms searchTerms = searchTermsSvc.getSearchTerms(subquery); if (searchTerms.isEmpty()) { continue; } - var resultsForSq = executeSubquery(searchTerms, params); + // These queries are different indices for one subquery + List queries = params.createIndexQueries(index, searchTerms); + for (var query : queries) { + var resultsForSq = executeQuery(query, params); + logger.info(queryMarker, "{} from {}", resultsForSq.size(), subquery); + results.addAll(resultsForSq); - logger.info(queryMarker, "{} from {}", resultsForSq.size(), sq); - - results.addAll(resultsForSq); - - if (!params.hasTimeLeft()) { - logger.info("Query timed out {}, ({}), -{}", - sq.searchTermsInclude, sq.searchTermsAdvice, sq.searchTermsExclude); - break; + if (!params.hasTimeLeft()) { + logger.info("Query timed out {}, ({}), -{}", + subquery.searchTermsInclude, subquery.searchTermsAdvice, subquery.searchTermsExclude); + break outer; + } } } return results; } - private TLongArrayList executeSubquery(SearchIndexSearchTerms terms, SearchParameters params) + private TLongArrayList executeQuery(IndexQuery query, SearchParameters params) { final TLongArrayList results = new TLongArrayList(params.fetchSize); final LongQueryBuffer buffer = new LongQueryBuffer(params.fetchSize); - IndexQuery query = params.createIndexQuery(index, terms); - while (query.hasMore() && results.size() < params.fetchSize && params.budget.hasTimeLeft()) diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchParameters.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchParameters.java index 846f52c0..313f3694 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchParameters.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchParameters.java @@ -58,8 +58,8 @@ public class SearchParameters { specsSet.queryStrategy); } - IndexQuery createIndexQuery(SearchIndex index, SearchIndexSearchTerms terms) { - return index.createQuery(terms, queryParams, consideredUrlIds::add); + List createIndexQueries(SearchIndex index, SearchIndexSearchTerms terms) { + return index.createQueries(terms, queryParams, consideredUrlIds::add); } boolean hasTimeLeft() { diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchTermsService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchTermsService.java index d82a8c42..81d561b1 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchTermsService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchTermsService.java @@ -34,7 +34,7 @@ public class SearchTermsService { for (var include : request.searchTermsInclude) { var word = lookUpWord(include); if (word.isEmpty()) { - logger.info("Unknown search term: " + include); + logger.debug("Unknown search term: " + include); return new SearchIndexSearchTerms(); } includes.add(word.getAsInt()); @@ -44,7 +44,7 @@ public class SearchTermsService { for (var advice : request.searchTermsAdvice) { var word = lookUpWord(advice); if (word.isEmpty()) { - logger.info("Unknown search term: " + advice); + logger.debug("Unknown search term: " + advice); return new SearchIndexSearchTerms(); } includes.add(word.getAsInt());