Fix bugs related to search result selection in the case with multiple search terms.

* A deduplication filter step ran too early, and removed many good results on the basis that they partially, but did not fully fit another set of search terms.

* Altered the query creation process to prefer documents where multiple terms appear in the priority index.
This commit is contained in:
Viktor Lofgren 2023-03-29 15:17:55 +02:00
parent 30584887f9
commit 1097fe6e25
8 changed files with 140 additions and 78 deletions

View File

@ -3,9 +3,19 @@ package nu.marginalia.index.query;
import nu.marginalia.index.query.filter.QueryFilterStepIf;
public interface IndexQueryBuilder {
IndexQueryBuilder also(int termId);
/** Filters documents that also contain termId, within the full index.
*/
IndexQueryBuilder alsoFull(int termId);
/**
* Filters documents that also contain <i>any of the provided termIds</i>, within the priority index.
*/
IndexQueryBuilder alsoPrioAnyOf(int... termIds);
/** Excludes documents that contain termId, within the full index
*/
IndexQueryBuilder notFull(int termId);
IndexQueryBuilder not(int termId);
IndexQueryBuilder addInclusionFilter(QueryFilterStepIf filterStep);
IndexQuery build();

View File

@ -5,6 +5,9 @@ import nu.marginalia.index.query.EntrySource;
import nu.marginalia.array.LongArray;
import nu.marginalia.btree.BTreeReader;
import nu.marginalia.index.query.EmptyEntrySource;
import nu.marginalia.index.query.ReverseIndexRetainFilter;
import nu.marginalia.index.query.filter.QueryFilterNoPass;
import nu.marginalia.index.query.filter.QueryFilterStepIf;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -49,4 +52,15 @@ public class ReverseIndexPriorityReader {
private BTreeReader createReaderNew(long offset) {
return new BTreeReader(documents, ReverseIndexPriorityParameters.bTreeContext, offset);
}
public QueryFilterStepIf also(int wordId) {
if (wordId < 0) return new QueryFilterNoPass();
long offset = words.get(wordId);
if (offset < 0) return new QueryFilterNoPass();
return new ReverseIndexRetainFilter(createReaderNew(offset));
}
}

View File

@ -5,8 +5,8 @@ import com.google.inject.Singleton;
import nu.marginalia.index.IndexServicesFactory;
import nu.marginalia.index.query.IndexQuery;
import nu.marginalia.index.query.IndexQueryBuilder;
import nu.marginalia.index.results.IndexResultDomainDeduplicator;
import nu.marginalia.index.query.IndexQueryParams;
import nu.marginalia.index.query.ReverseIndexEntrySourceBehavior;
import nu.marginalia.index.query.filter.QueryFilterStepFromPredicate;
import nu.marginalia.index.svc.IndexSearchSetsService;
import org.jetbrains.annotations.NotNull;
@ -14,7 +14,10 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
@ -90,42 +93,63 @@ public class SearchIndex {
return indexReader != null;
}
public IndexQuery createQuery(SearchIndexSearchTerms terms, IndexQueryParams params, LongPredicate includePred) {
if (null == indexReader) {
public List<IndexQuery> createQueries(SearchIndexSearchTerms terms, IndexQueryParams params, LongPredicate includePred) {
if (!isAvailable()) {
logger.warn("Index reader not ready");
return new IndexQuery(Collections.emptyList());
return Collections.emptyList();
}
final int[] orderedIncludes = terms.sortedDistinctIncludes(this::compareKeywords);
IndexQueryBuilder query =
switch(params.queryStrategy()) {
case SENTENCE -> indexReader.findWordAsSentence(orderedIncludes);
case TOPIC, REQUIRE_FIELD_SITE, REQUIRE_FIELD_TITLE, REQUIRE_FIELD_SUBJECT, REQUIRE_FIELD_DOMAIN, REQUIRE_FIELD_URL
-> indexReader.findWordAsTopic(orderedIncludes);
case AUTO -> indexReader.findWordTopicDynamicMode(orderedIncludes);
};
List<IndexQueryBuilder> queryHeads = new ArrayList<>(10);
List<IndexQuery> queries = new ArrayList<>(10);
if (query == null) {
return new IndexQuery(Collections.emptyList());
// To ensure that good results are processed first, create query heads for the priority index that filter for terms
// that contain pairs of two search terms
if (orderedIncludes.length > 1) {
for (int i = 0; i + 1 < orderedIncludes.length; i++) {
var remainingWords = Arrays.copyOfRange(orderedIncludes, i+1, orderedIncludes.length);
var entrySource = indexReader
.findPriorityWord(orderedIncludes[i])
.alsoPrioAnyOf(remainingWords);
queryHeads.add(entrySource);
}
}
query = query.addInclusionFilter(new QueryFilterStepFromPredicate(includePred));
for (int i = 0; i < orderedIncludes.length; i++) {
query = query.also(orderedIncludes[i]);
// Next consider entries that appear only once in the priority index
for (var wordId : orderedIncludes) {
queryHeads.add(indexReader.findPriorityWord(wordId));
}
for (int term : terms.excludes()) {
query = query.not(term);
// Finally consider terms in the full index
queryHeads.add(indexReader.findFullWord(orderedIncludes[0], ReverseIndexEntrySourceBehavior.DO_PREFER));
for (var query : queryHeads) {
if (query == null) {
return Collections.emptyList();
}
for (int orderedInclude : orderedIncludes) {
query = query.alsoFull(orderedInclude);
}
for (int term : terms.excludes()) {
query = query.notFull(term);
}
// This filtering step needs to happen only on terms that have passed all term-based filtering steps,
// it's essentially a memoization of the params filtering job which is relatively expensive
query = query.addInclusionFilter(new QueryFilterStepFromPredicate(includePred));
// Run these last, as they'll worst-case cause as many page faults as there are
// items in the buffer
queries.add(query.addInclusionFilter(indexReader.filterForParams(params)).build());
}
// Run these last, as they'll worst-case cause as many page faults as there are
// items in the buffer
return query
.addInclusionFilter(indexReader.filterForParams(params))
.build();
return queries;
}
private int compareKeywords(int a, int b) {

View File

@ -1,27 +1,58 @@
package nu.marginalia.index.index;
import nu.marginalia.index.priority.ReverseIndexPriorityReader;
import nu.marginalia.index.query.IndexQuery;
import nu.marginalia.index.query.IndexQueryBuilder;
import nu.marginalia.index.query.filter.QueryFilterStepIf;
import nu.marginalia.index.full.ReverseIndexFullReader;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
public class SearchIndexQueryBuilder implements IndexQueryBuilder {
private final IndexQuery query;
private final ReverseIndexFullReader reverseIndexFullReader;
private final ReverseIndexPriorityReader reverseIndexPrioReader;
SearchIndexQueryBuilder(ReverseIndexFullReader reverseIndexFullReader, IndexQuery query) {
SearchIndexQueryBuilder(ReverseIndexFullReader reverseIndexFullReader,
ReverseIndexPriorityReader reverseIndexPrioReader,
IndexQuery query)
{
this.query = query;
this.reverseIndexFullReader = reverseIndexFullReader;
this.reverseIndexPrioReader = reverseIndexPrioReader;
}
public IndexQueryBuilder also(int termId) {
public IndexQueryBuilder alsoFull(int termId) {
query.addInclusionFilter(reverseIndexFullReader.also(termId));
return this;
}
public IndexQueryBuilder not(int termId) {
public IndexQueryBuilder alsoPrioAnyOf(int... termIds) {
QueryFilterStepIf step;
if (termIds.length == 0) {
step = QueryFilterStepIf.noPass();
}
else if (termIds.length == 1) {
step = reverseIndexPrioReader.also(termIds[0]);
}
else {
var steps = IntStream.of(termIds)
.mapToObj(reverseIndexPrioReader::also)
.collect(Collectors.toList());
step = QueryFilterStepIf.anyOf(steps);
}
query.addInclusionFilter(step);
return this;
}
public IndexQueryBuilder notFull(int termId) {
query.addInclusionFilter(reverseIndexFullReader.not(termId));

View File

@ -13,12 +13,12 @@ import nu.marginalia.index.query.ReverseIndexEntrySourceBehavior;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.List;
public class SearchIndexReader {
private final Logger logger = LoggerFactory.getLogger(getClass());
private final ForwardIndexReader forwardIndexReader;
private final ReverseIndexFullReader reverseIndexFullReader;
private final ReverseIndexPriorityReader reverseIndexPriorityReader;
@ -31,38 +31,14 @@ public class SearchIndexReader {
this.reverseIndexPriorityReader = reverseIndexPriorityReader;
}
public IndexQueryBuilder findWordAsSentence(int[] wordIdsByFrequency) {
List<EntrySource> entrySources = new ArrayList<>(1);
entrySources.add(reverseIndexFullReader.documents(wordIdsByFrequency[0], ReverseIndexEntrySourceBehavior.DO_PREFER));
return new SearchIndexQueryBuilder(reverseIndexFullReader, new IndexQuery(entrySources));
public IndexQueryBuilder findPriorityWord(int wordId) {
return new SearchIndexQueryBuilder(reverseIndexFullReader, reverseIndexPriorityReader, new IndexQuery(
List.of(reverseIndexPriorityReader.priorityDocuments(wordId))));
}
public IndexQueryBuilder findWordAsTopic(int[] wordIdsByFrequency) {
List<EntrySource> entrySources = new ArrayList<>(wordIdsByFrequency.length);
for (int wordId : wordIdsByFrequency) {
entrySources.add(reverseIndexPriorityReader.priorityDocuments(wordId));
}
return new SearchIndexQueryBuilder(reverseIndexFullReader, new IndexQuery(entrySources));
}
public IndexQueryBuilder findWordTopicDynamicMode(int[] wordIdsByFrequency) {
if (wordIdsByFrequency.length > 3) {
return findWordAsSentence(wordIdsByFrequency);
}
List<EntrySource> entrySources = new ArrayList<>(wordIdsByFrequency.length + 1);
for (int wordId : wordIdsByFrequency) {
entrySources.add(reverseIndexPriorityReader.priorityDocuments(wordId));
}
entrySources.add(reverseIndexFullReader.documents(wordIdsByFrequency[0], ReverseIndexEntrySourceBehavior.DO_NOT_PREFER));
return new SearchIndexQueryBuilder(reverseIndexFullReader, new IndexQuery(entrySources));
public IndexQueryBuilder findFullWord(int wordId, ReverseIndexEntrySourceBehavior behavior) {
return new SearchIndexQueryBuilder(reverseIndexFullReader, reverseIndexPriorityReader, new IndexQuery(
List.of(reverseIndexFullReader.documents(wordId, behavior))));
}
QueryFilterStepIf filterForParams(IndexQueryParams params) {

View File

@ -69,6 +69,10 @@ public class IndexQueryService {
String json = request.body();
SearchSpecification specsSet = gson.fromJson(json, SearchSpecification.class);
if (!index.isAvailable()) {
Spark.halt(503, "Index is not loaded");
}
try {
return wmsa_edge_index_query_time.time(() -> {
var params = new SearchParameters(specsSet, getSearchSet(specsSet));
@ -135,36 +139,39 @@ public class IndexQueryService {
final TLongList results = new TLongArrayList(params.fetchSize);
logger.info(queryMarker, "{}", params.queryParams);
for (var sq : params.subqueries) {
final SearchIndexSearchTerms searchTerms = searchTermsSvc.getSearchTerms(sq);
outer:
// These queries are various term combinations
for (var subquery : params.subqueries) {
final SearchIndexSearchTerms searchTerms = searchTermsSvc.getSearchTerms(subquery);
if (searchTerms.isEmpty()) {
continue;
}
var resultsForSq = executeSubquery(searchTerms, params);
// These queries are different indices for one subquery
List<IndexQuery> queries = params.createIndexQueries(index, searchTerms);
for (var query : queries) {
var resultsForSq = executeQuery(query, params);
logger.info(queryMarker, "{} from {}", resultsForSq.size(), subquery);
results.addAll(resultsForSq);
logger.info(queryMarker, "{} from {}", resultsForSq.size(), sq);
results.addAll(resultsForSq);
if (!params.hasTimeLeft()) {
logger.info("Query timed out {}, ({}), -{}",
sq.searchTermsInclude, sq.searchTermsAdvice, sq.searchTermsExclude);
break;
if (!params.hasTimeLeft()) {
logger.info("Query timed out {}, ({}), -{}",
subquery.searchTermsInclude, subquery.searchTermsAdvice, subquery.searchTermsExclude);
break outer;
}
}
}
return results;
}
private TLongArrayList executeSubquery(SearchIndexSearchTerms terms, SearchParameters params)
private TLongArrayList executeQuery(IndexQuery query, SearchParameters params)
{
final TLongArrayList results = new TLongArrayList(params.fetchSize);
final LongQueryBuffer buffer = new LongQueryBuffer(params.fetchSize);
IndexQuery query = params.createIndexQuery(index, terms);
while (query.hasMore()
&& results.size() < params.fetchSize
&& params.budget.hasTimeLeft())

View File

@ -58,8 +58,8 @@ public class SearchParameters {
specsSet.queryStrategy);
}
IndexQuery createIndexQuery(SearchIndex index, SearchIndexSearchTerms terms) {
return index.createQuery(terms, queryParams, consideredUrlIds::add);
List<IndexQuery> createIndexQueries(SearchIndex index, SearchIndexSearchTerms terms) {
return index.createQueries(terms, queryParams, consideredUrlIds::add);
}
boolean hasTimeLeft() {

View File

@ -34,7 +34,7 @@ public class SearchTermsService {
for (var include : request.searchTermsInclude) {
var word = lookUpWord(include);
if (word.isEmpty()) {
logger.info("Unknown search term: " + include);
logger.debug("Unknown search term: " + include);
return new SearchIndexSearchTerms();
}
includes.add(word.getAsInt());
@ -44,7 +44,7 @@ public class SearchTermsService {
for (var advice : request.searchTermsAdvice) {
var word = lookUpWord(advice);
if (word.isEmpty()) {
logger.info("Unknown search term: " + advice);
logger.debug("Unknown search term: " + advice);
return new SearchIndexSearchTerms();
}
includes.add(word.getAsInt());