Fix bugs related to search result selection in the case with multiple search terms.

* A deduplication filter step ran too early, and removed many good results on the basis that they partially, but did not fully fit another set of search terms.

* Altered the query creation process to prefer documents where multiple terms appear in the priority index.
This commit is contained in:
Viktor Lofgren 2023-03-29 15:17:55 +02:00
parent 30584887f9
commit 1097fe6e25
8 changed files with 140 additions and 78 deletions

View File

@ -3,9 +3,19 @@ package nu.marginalia.index.query;
import nu.marginalia.index.query.filter.QueryFilterStepIf; import nu.marginalia.index.query.filter.QueryFilterStepIf;
public interface IndexQueryBuilder { public interface IndexQueryBuilder {
IndexQueryBuilder also(int termId); /** Filters documents that also contain termId, within the full index.
*/
IndexQueryBuilder alsoFull(int termId);
/**
* Filters documents that also contain <i>any of the provided termIds</i>, within the priority index.
*/
IndexQueryBuilder alsoPrioAnyOf(int... termIds);
/** Excludes documents that contain termId, within the full index
*/
IndexQueryBuilder notFull(int termId);
IndexQueryBuilder not(int termId);
IndexQueryBuilder addInclusionFilter(QueryFilterStepIf filterStep); IndexQueryBuilder addInclusionFilter(QueryFilterStepIf filterStep);
IndexQuery build(); IndexQuery build();

View File

@ -5,6 +5,9 @@ import nu.marginalia.index.query.EntrySource;
import nu.marginalia.array.LongArray; import nu.marginalia.array.LongArray;
import nu.marginalia.btree.BTreeReader; import nu.marginalia.btree.BTreeReader;
import nu.marginalia.index.query.EmptyEntrySource; import nu.marginalia.index.query.EmptyEntrySource;
import nu.marginalia.index.query.ReverseIndexRetainFilter;
import nu.marginalia.index.query.filter.QueryFilterNoPass;
import nu.marginalia.index.query.filter.QueryFilterStepIf;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -49,4 +52,15 @@ public class ReverseIndexPriorityReader {
private BTreeReader createReaderNew(long offset) { private BTreeReader createReaderNew(long offset) {
return new BTreeReader(documents, ReverseIndexPriorityParameters.bTreeContext, offset); return new BTreeReader(documents, ReverseIndexPriorityParameters.bTreeContext, offset);
} }
public QueryFilterStepIf also(int wordId) {
if (wordId < 0) return new QueryFilterNoPass();
long offset = words.get(wordId);
if (offset < 0) return new QueryFilterNoPass();
return new ReverseIndexRetainFilter(createReaderNew(offset));
}
} }

View File

@ -5,8 +5,8 @@ import com.google.inject.Singleton;
import nu.marginalia.index.IndexServicesFactory; import nu.marginalia.index.IndexServicesFactory;
import nu.marginalia.index.query.IndexQuery; import nu.marginalia.index.query.IndexQuery;
import nu.marginalia.index.query.IndexQueryBuilder; import nu.marginalia.index.query.IndexQueryBuilder;
import nu.marginalia.index.results.IndexResultDomainDeduplicator;
import nu.marginalia.index.query.IndexQueryParams; import nu.marginalia.index.query.IndexQueryParams;
import nu.marginalia.index.query.ReverseIndexEntrySourceBehavior;
import nu.marginalia.index.query.filter.QueryFilterStepFromPredicate; import nu.marginalia.index.query.filter.QueryFilterStepFromPredicate;
import nu.marginalia.index.svc.IndexSearchSetsService; import nu.marginalia.index.svc.IndexSearchSetsService;
import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.NotNull;
@ -14,7 +14,10 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections; import java.util.Collections;
import java.util.List;
import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock;
@ -90,42 +93,63 @@ public class SearchIndex {
return indexReader != null; return indexReader != null;
} }
public IndexQuery createQuery(SearchIndexSearchTerms terms, IndexQueryParams params, LongPredicate includePred) {
if (null == indexReader) { public List<IndexQuery> createQueries(SearchIndexSearchTerms terms, IndexQueryParams params, LongPredicate includePred) {
if (!isAvailable()) {
logger.warn("Index reader not ready"); logger.warn("Index reader not ready");
return new IndexQuery(Collections.emptyList()); return Collections.emptyList();
} }
final int[] orderedIncludes = terms.sortedDistinctIncludes(this::compareKeywords); final int[] orderedIncludes = terms.sortedDistinctIncludes(this::compareKeywords);
IndexQueryBuilder query = List<IndexQueryBuilder> queryHeads = new ArrayList<>(10);
switch(params.queryStrategy()) { List<IndexQuery> queries = new ArrayList<>(10);
case SENTENCE -> indexReader.findWordAsSentence(orderedIncludes);
case TOPIC, REQUIRE_FIELD_SITE, REQUIRE_FIELD_TITLE, REQUIRE_FIELD_SUBJECT, REQUIRE_FIELD_DOMAIN, REQUIRE_FIELD_URL
-> indexReader.findWordAsTopic(orderedIncludes);
case AUTO -> indexReader.findWordTopicDynamicMode(orderedIncludes);
};
if (query == null) { // To ensure that good results are processed first, create query heads for the priority index that filter for terms
return new IndexQuery(Collections.emptyList()); // that contain pairs of two search terms
if (orderedIncludes.length > 1) {
for (int i = 0; i + 1 < orderedIncludes.length; i++) {
var remainingWords = Arrays.copyOfRange(orderedIncludes, i+1, orderedIncludes.length);
var entrySource = indexReader
.findPriorityWord(orderedIncludes[i])
.alsoPrioAnyOf(remainingWords);
queryHeads.add(entrySource);
}
} }
query = query.addInclusionFilter(new QueryFilterStepFromPredicate(includePred)); // Next consider entries that appear only once in the priority index
for (var wordId : orderedIncludes) {
for (int i = 0; i < orderedIncludes.length; i++) { queryHeads.add(indexReader.findPriorityWord(wordId));
query = query.also(orderedIncludes[i]);
} }
for (int term : terms.excludes()) { // Finally consider terms in the full index
query = query.not(term); queryHeads.add(indexReader.findFullWord(orderedIncludes[0], ReverseIndexEntrySourceBehavior.DO_PREFER));
for (var query : queryHeads) {
if (query == null) {
return Collections.emptyList();
}
for (int orderedInclude : orderedIncludes) {
query = query.alsoFull(orderedInclude);
}
for (int term : terms.excludes()) {
query = query.notFull(term);
}
// This filtering step needs to happen only on terms that have passed all term-based filtering steps,
// it's essentially a memoization of the params filtering job which is relatively expensive
query = query.addInclusionFilter(new QueryFilterStepFromPredicate(includePred));
// Run these last, as they'll worst-case cause as many page faults as there are
// items in the buffer
queries.add(query.addInclusionFilter(indexReader.filterForParams(params)).build());
} }
// Run these last, as they'll worst-case cause as many page faults as there are return queries;
// items in the buffer
return query
.addInclusionFilter(indexReader.filterForParams(params))
.build();
} }
private int compareKeywords(int a, int b) { private int compareKeywords(int a, int b) {

View File

@ -1,27 +1,58 @@
package nu.marginalia.index.index; package nu.marginalia.index.index;
import nu.marginalia.index.priority.ReverseIndexPriorityReader;
import nu.marginalia.index.query.IndexQuery; import nu.marginalia.index.query.IndexQuery;
import nu.marginalia.index.query.IndexQueryBuilder; import nu.marginalia.index.query.IndexQueryBuilder;
import nu.marginalia.index.query.filter.QueryFilterStepIf; import nu.marginalia.index.query.filter.QueryFilterStepIf;
import nu.marginalia.index.full.ReverseIndexFullReader; import nu.marginalia.index.full.ReverseIndexFullReader;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
public class SearchIndexQueryBuilder implements IndexQueryBuilder { public class SearchIndexQueryBuilder implements IndexQueryBuilder {
private final IndexQuery query; private final IndexQuery query;
private final ReverseIndexFullReader reverseIndexFullReader; private final ReverseIndexFullReader reverseIndexFullReader;
private final ReverseIndexPriorityReader reverseIndexPrioReader;
SearchIndexQueryBuilder(ReverseIndexFullReader reverseIndexFullReader, IndexQuery query) { SearchIndexQueryBuilder(ReverseIndexFullReader reverseIndexFullReader,
ReverseIndexPriorityReader reverseIndexPrioReader,
IndexQuery query)
{
this.query = query; this.query = query;
this.reverseIndexFullReader = reverseIndexFullReader; this.reverseIndexFullReader = reverseIndexFullReader;
this.reverseIndexPrioReader = reverseIndexPrioReader;
} }
public IndexQueryBuilder also(int termId) { public IndexQueryBuilder alsoFull(int termId) {
query.addInclusionFilter(reverseIndexFullReader.also(termId)); query.addInclusionFilter(reverseIndexFullReader.also(termId));
return this; return this;
} }
public IndexQueryBuilder not(int termId) { public IndexQueryBuilder alsoPrioAnyOf(int... termIds) {
QueryFilterStepIf step;
if (termIds.length == 0) {
step = QueryFilterStepIf.noPass();
}
else if (termIds.length == 1) {
step = reverseIndexPrioReader.also(termIds[0]);
}
else {
var steps = IntStream.of(termIds)
.mapToObj(reverseIndexPrioReader::also)
.collect(Collectors.toList());
step = QueryFilterStepIf.anyOf(steps);
}
query.addInclusionFilter(step);
return this;
}
public IndexQueryBuilder notFull(int termId) {
query.addInclusionFilter(reverseIndexFullReader.not(termId)); query.addInclusionFilter(reverseIndexFullReader.not(termId));

View File

@ -13,12 +13,12 @@ import nu.marginalia.index.query.ReverseIndexEntrySourceBehavior;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.List; import java.util.List;
public class SearchIndexReader { public class SearchIndexReader {
private final Logger logger = LoggerFactory.getLogger(getClass()); private final Logger logger = LoggerFactory.getLogger(getClass());
private final ForwardIndexReader forwardIndexReader; private final ForwardIndexReader forwardIndexReader;
private final ReverseIndexFullReader reverseIndexFullReader; private final ReverseIndexFullReader reverseIndexFullReader;
private final ReverseIndexPriorityReader reverseIndexPriorityReader; private final ReverseIndexPriorityReader reverseIndexPriorityReader;
@ -31,38 +31,14 @@ public class SearchIndexReader {
this.reverseIndexPriorityReader = reverseIndexPriorityReader; this.reverseIndexPriorityReader = reverseIndexPriorityReader;
} }
public IndexQueryBuilder findWordAsSentence(int[] wordIdsByFrequency) { public IndexQueryBuilder findPriorityWord(int wordId) {
List<EntrySource> entrySources = new ArrayList<>(1); return new SearchIndexQueryBuilder(reverseIndexFullReader, reverseIndexPriorityReader, new IndexQuery(
List.of(reverseIndexPriorityReader.priorityDocuments(wordId))));
entrySources.add(reverseIndexFullReader.documents(wordIdsByFrequency[0], ReverseIndexEntrySourceBehavior.DO_PREFER));
return new SearchIndexQueryBuilder(reverseIndexFullReader, new IndexQuery(entrySources));
} }
public IndexQueryBuilder findWordAsTopic(int[] wordIdsByFrequency) { public IndexQueryBuilder findFullWord(int wordId, ReverseIndexEntrySourceBehavior behavior) {
List<EntrySource> entrySources = new ArrayList<>(wordIdsByFrequency.length); return new SearchIndexQueryBuilder(reverseIndexFullReader, reverseIndexPriorityReader, new IndexQuery(
List.of(reverseIndexFullReader.documents(wordId, behavior))));
for (int wordId : wordIdsByFrequency) {
entrySources.add(reverseIndexPriorityReader.priorityDocuments(wordId));
}
return new SearchIndexQueryBuilder(reverseIndexFullReader, new IndexQuery(entrySources));
}
public IndexQueryBuilder findWordTopicDynamicMode(int[] wordIdsByFrequency) {
if (wordIdsByFrequency.length > 3) {
return findWordAsSentence(wordIdsByFrequency);
}
List<EntrySource> entrySources = new ArrayList<>(wordIdsByFrequency.length + 1);
for (int wordId : wordIdsByFrequency) {
entrySources.add(reverseIndexPriorityReader.priorityDocuments(wordId));
}
entrySources.add(reverseIndexFullReader.documents(wordIdsByFrequency[0], ReverseIndexEntrySourceBehavior.DO_NOT_PREFER));
return new SearchIndexQueryBuilder(reverseIndexFullReader, new IndexQuery(entrySources));
} }
QueryFilterStepIf filterForParams(IndexQueryParams params) { QueryFilterStepIf filterForParams(IndexQueryParams params) {

View File

@ -69,6 +69,10 @@ public class IndexQueryService {
String json = request.body(); String json = request.body();
SearchSpecification specsSet = gson.fromJson(json, SearchSpecification.class); SearchSpecification specsSet = gson.fromJson(json, SearchSpecification.class);
if (!index.isAvailable()) {
Spark.halt(503, "Index is not loaded");
}
try { try {
return wmsa_edge_index_query_time.time(() -> { return wmsa_edge_index_query_time.time(() -> {
var params = new SearchParameters(specsSet, getSearchSet(specsSet)); var params = new SearchParameters(specsSet, getSearchSet(specsSet));
@ -135,36 +139,39 @@ public class IndexQueryService {
final TLongList results = new TLongArrayList(params.fetchSize); final TLongList results = new TLongArrayList(params.fetchSize);
logger.info(queryMarker, "{}", params.queryParams); logger.info(queryMarker, "{}", params.queryParams);
for (var sq : params.subqueries) {
final SearchIndexSearchTerms searchTerms = searchTermsSvc.getSearchTerms(sq); outer:
// These queries are various term combinations
for (var subquery : params.subqueries) {
final SearchIndexSearchTerms searchTerms = searchTermsSvc.getSearchTerms(subquery);
if (searchTerms.isEmpty()) { if (searchTerms.isEmpty()) {
continue; continue;
} }
var resultsForSq = executeSubquery(searchTerms, params); // These queries are different indices for one subquery
List<IndexQuery> queries = params.createIndexQueries(index, searchTerms);
for (var query : queries) {
var resultsForSq = executeQuery(query, params);
logger.info(queryMarker, "{} from {}", resultsForSq.size(), subquery);
results.addAll(resultsForSq);
logger.info(queryMarker, "{} from {}", resultsForSq.size(), sq); if (!params.hasTimeLeft()) {
logger.info("Query timed out {}, ({}), -{}",
results.addAll(resultsForSq); subquery.searchTermsInclude, subquery.searchTermsAdvice, subquery.searchTermsExclude);
break outer;
if (!params.hasTimeLeft()) { }
logger.info("Query timed out {}, ({}), -{}",
sq.searchTermsInclude, sq.searchTermsAdvice, sq.searchTermsExclude);
break;
} }
} }
return results; return results;
} }
private TLongArrayList executeSubquery(SearchIndexSearchTerms terms, SearchParameters params) private TLongArrayList executeQuery(IndexQuery query, SearchParameters params)
{ {
final TLongArrayList results = new TLongArrayList(params.fetchSize); final TLongArrayList results = new TLongArrayList(params.fetchSize);
final LongQueryBuffer buffer = new LongQueryBuffer(params.fetchSize); final LongQueryBuffer buffer = new LongQueryBuffer(params.fetchSize);
IndexQuery query = params.createIndexQuery(index, terms);
while (query.hasMore() while (query.hasMore()
&& results.size() < params.fetchSize && results.size() < params.fetchSize
&& params.budget.hasTimeLeft()) && params.budget.hasTimeLeft())

View File

@ -58,8 +58,8 @@ public class SearchParameters {
specsSet.queryStrategy); specsSet.queryStrategy);
} }
IndexQuery createIndexQuery(SearchIndex index, SearchIndexSearchTerms terms) { List<IndexQuery> createIndexQueries(SearchIndex index, SearchIndexSearchTerms terms) {
return index.createQuery(terms, queryParams, consideredUrlIds::add); return index.createQueries(terms, queryParams, consideredUrlIds::add);
} }
boolean hasTimeLeft() { boolean hasTimeLeft() {

View File

@ -34,7 +34,7 @@ public class SearchTermsService {
for (var include : request.searchTermsInclude) { for (var include : request.searchTermsInclude) {
var word = lookUpWord(include); var word = lookUpWord(include);
if (word.isEmpty()) { if (word.isEmpty()) {
logger.info("Unknown search term: " + include); logger.debug("Unknown search term: " + include);
return new SearchIndexSearchTerms(); return new SearchIndexSearchTerms();
} }
includes.add(word.getAsInt()); includes.add(word.getAsInt());
@ -44,7 +44,7 @@ public class SearchTermsService {
for (var advice : request.searchTermsAdvice) { for (var advice : request.searchTermsAdvice) {
var word = lookUpWord(advice); var word = lookUpWord(advice);
if (word.isEmpty()) { if (word.isEmpty()) {
logger.info("Unknown search term: " + advice); logger.debug("Unknown search term: " + advice);
return new SearchIndexSearchTerms(); return new SearchIndexSearchTerms();
} }
includes.add(word.getAsInt()); includes.add(word.getAsInt());