mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-22 20:48:59 +00:00
Fix bugs related to search result selection in the case with multiple search terms.
* A deduplication filter step ran too early, and removed many good results on the basis that they partially, but did not fully fit another set of search terms. * Altered the query creation process to prefer documents where multiple terms appear in the priority index.
This commit is contained in:
parent
30584887f9
commit
1097fe6e25
@ -3,9 +3,19 @@ package nu.marginalia.index.query;
|
||||
import nu.marginalia.index.query.filter.QueryFilterStepIf;
|
||||
|
||||
public interface IndexQueryBuilder {
|
||||
IndexQueryBuilder also(int termId);
|
||||
/** Filters documents that also contain termId, within the full index.
|
||||
*/
|
||||
IndexQueryBuilder alsoFull(int termId);
|
||||
|
||||
/**
|
||||
* Filters documents that also contain <i>any of the provided termIds</i>, within the priority index.
|
||||
*/
|
||||
IndexQueryBuilder alsoPrioAnyOf(int... termIds);
|
||||
|
||||
/** Excludes documents that contain termId, within the full index
|
||||
*/
|
||||
IndexQueryBuilder notFull(int termId);
|
||||
|
||||
IndexQueryBuilder not(int termId);
|
||||
IndexQueryBuilder addInclusionFilter(QueryFilterStepIf filterStep);
|
||||
|
||||
IndexQuery build();
|
||||
|
@ -5,6 +5,9 @@ import nu.marginalia.index.query.EntrySource;
|
||||
import nu.marginalia.array.LongArray;
|
||||
import nu.marginalia.btree.BTreeReader;
|
||||
import nu.marginalia.index.query.EmptyEntrySource;
|
||||
import nu.marginalia.index.query.ReverseIndexRetainFilter;
|
||||
import nu.marginalia.index.query.filter.QueryFilterNoPass;
|
||||
import nu.marginalia.index.query.filter.QueryFilterStepIf;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@ -49,4 +52,15 @@ public class ReverseIndexPriorityReader {
|
||||
private BTreeReader createReaderNew(long offset) {
|
||||
return new BTreeReader(documents, ReverseIndexPriorityParameters.bTreeContext, offset);
|
||||
}
|
||||
|
||||
public QueryFilterStepIf also(int wordId) {
|
||||
if (wordId < 0) return new QueryFilterNoPass();
|
||||
|
||||
long offset = words.get(wordId);
|
||||
|
||||
if (offset < 0) return new QueryFilterNoPass();
|
||||
|
||||
return new ReverseIndexRetainFilter(createReaderNew(offset));
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -5,8 +5,8 @@ import com.google.inject.Singleton;
|
||||
import nu.marginalia.index.IndexServicesFactory;
|
||||
import nu.marginalia.index.query.IndexQuery;
|
||||
import nu.marginalia.index.query.IndexQueryBuilder;
|
||||
import nu.marginalia.index.results.IndexResultDomainDeduplicator;
|
||||
import nu.marginalia.index.query.IndexQueryParams;
|
||||
import nu.marginalia.index.query.ReverseIndexEntrySourceBehavior;
|
||||
import nu.marginalia.index.query.filter.QueryFilterStepFromPredicate;
|
||||
import nu.marginalia.index.svc.IndexSearchSetsService;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
@ -14,7 +14,10 @@ import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.locks.Lock;
|
||||
import java.util.concurrent.locks.ReadWriteLock;
|
||||
import java.util.concurrent.locks.ReentrantReadWriteLock;
|
||||
@ -90,42 +93,63 @@ public class SearchIndex {
|
||||
return indexReader != null;
|
||||
}
|
||||
|
||||
public IndexQuery createQuery(SearchIndexSearchTerms terms, IndexQueryParams params, LongPredicate includePred) {
|
||||
|
||||
if (null == indexReader) {
|
||||
public List<IndexQuery> createQueries(SearchIndexSearchTerms terms, IndexQueryParams params, LongPredicate includePred) {
|
||||
|
||||
if (!isAvailable()) {
|
||||
logger.warn("Index reader not ready");
|
||||
return new IndexQuery(Collections.emptyList());
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
final int[] orderedIncludes = terms.sortedDistinctIncludes(this::compareKeywords);
|
||||
|
||||
IndexQueryBuilder query =
|
||||
switch(params.queryStrategy()) {
|
||||
case SENTENCE -> indexReader.findWordAsSentence(orderedIncludes);
|
||||
case TOPIC, REQUIRE_FIELD_SITE, REQUIRE_FIELD_TITLE, REQUIRE_FIELD_SUBJECT, REQUIRE_FIELD_DOMAIN, REQUIRE_FIELD_URL
|
||||
-> indexReader.findWordAsTopic(orderedIncludes);
|
||||
case AUTO -> indexReader.findWordTopicDynamicMode(orderedIncludes);
|
||||
};
|
||||
List<IndexQueryBuilder> queryHeads = new ArrayList<>(10);
|
||||
List<IndexQuery> queries = new ArrayList<>(10);
|
||||
|
||||
if (query == null) {
|
||||
return new IndexQuery(Collections.emptyList());
|
||||
// To ensure that good results are processed first, create query heads for the priority index that filter for terms
|
||||
// that contain pairs of two search terms
|
||||
if (orderedIncludes.length > 1) {
|
||||
for (int i = 0; i + 1 < orderedIncludes.length; i++) {
|
||||
var remainingWords = Arrays.copyOfRange(orderedIncludes, i+1, orderedIncludes.length);
|
||||
var entrySource = indexReader
|
||||
.findPriorityWord(orderedIncludes[i])
|
||||
.alsoPrioAnyOf(remainingWords);
|
||||
queryHeads.add(entrySource);
|
||||
}
|
||||
}
|
||||
|
||||
query = query.addInclusionFilter(new QueryFilterStepFromPredicate(includePred));
|
||||
|
||||
for (int i = 0; i < orderedIncludes.length; i++) {
|
||||
query = query.also(orderedIncludes[i]);
|
||||
// Next consider entries that appear only once in the priority index
|
||||
for (var wordId : orderedIncludes) {
|
||||
queryHeads.add(indexReader.findPriorityWord(wordId));
|
||||
}
|
||||
|
||||
for (int term : terms.excludes()) {
|
||||
query = query.not(term);
|
||||
// Finally consider terms in the full index
|
||||
queryHeads.add(indexReader.findFullWord(orderedIncludes[0], ReverseIndexEntrySourceBehavior.DO_PREFER));
|
||||
|
||||
for (var query : queryHeads) {
|
||||
if (query == null) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
|
||||
for (int orderedInclude : orderedIncludes) {
|
||||
query = query.alsoFull(orderedInclude);
|
||||
}
|
||||
|
||||
for (int term : terms.excludes()) {
|
||||
query = query.notFull(term);
|
||||
}
|
||||
|
||||
// This filtering step needs to happen only on terms that have passed all term-based filtering steps,
|
||||
// it's essentially a memoization of the params filtering job which is relatively expensive
|
||||
query = query.addInclusionFilter(new QueryFilterStepFromPredicate(includePred));
|
||||
|
||||
// Run these last, as they'll worst-case cause as many page faults as there are
|
||||
// items in the buffer
|
||||
queries.add(query.addInclusionFilter(indexReader.filterForParams(params)).build());
|
||||
}
|
||||
|
||||
// Run these last, as they'll worst-case cause as many page faults as there are
|
||||
// items in the buffer
|
||||
return query
|
||||
.addInclusionFilter(indexReader.filterForParams(params))
|
||||
.build();
|
||||
return queries;
|
||||
}
|
||||
|
||||
private int compareKeywords(int a, int b) {
|
||||
|
@ -1,27 +1,58 @@
|
||||
package nu.marginalia.index.index;
|
||||
|
||||
import nu.marginalia.index.priority.ReverseIndexPriorityReader;
|
||||
import nu.marginalia.index.query.IndexQuery;
|
||||
import nu.marginalia.index.query.IndexQueryBuilder;
|
||||
import nu.marginalia.index.query.filter.QueryFilterStepIf;
|
||||
import nu.marginalia.index.full.ReverseIndexFullReader;
|
||||
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
public class SearchIndexQueryBuilder implements IndexQueryBuilder {
|
||||
private final IndexQuery query;
|
||||
private final ReverseIndexFullReader reverseIndexFullReader;
|
||||
private final ReverseIndexPriorityReader reverseIndexPrioReader;
|
||||
|
||||
SearchIndexQueryBuilder(ReverseIndexFullReader reverseIndexFullReader, IndexQuery query) {
|
||||
SearchIndexQueryBuilder(ReverseIndexFullReader reverseIndexFullReader,
|
||||
ReverseIndexPriorityReader reverseIndexPrioReader,
|
||||
IndexQuery query)
|
||||
{
|
||||
this.query = query;
|
||||
this.reverseIndexFullReader = reverseIndexFullReader;
|
||||
this.reverseIndexPrioReader = reverseIndexPrioReader;
|
||||
}
|
||||
|
||||
public IndexQueryBuilder also(int termId) {
|
||||
public IndexQueryBuilder alsoFull(int termId) {
|
||||
|
||||
query.addInclusionFilter(reverseIndexFullReader.also(termId));
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
public IndexQueryBuilder not(int termId) {
|
||||
public IndexQueryBuilder alsoPrioAnyOf(int... termIds) {
|
||||
|
||||
QueryFilterStepIf step;
|
||||
|
||||
if (termIds.length == 0) {
|
||||
step = QueryFilterStepIf.noPass();
|
||||
}
|
||||
else if (termIds.length == 1) {
|
||||
step = reverseIndexPrioReader.also(termIds[0]);
|
||||
}
|
||||
else {
|
||||
var steps = IntStream.of(termIds)
|
||||
.mapToObj(reverseIndexPrioReader::also)
|
||||
.collect(Collectors.toList());
|
||||
step = QueryFilterStepIf.anyOf(steps);
|
||||
}
|
||||
|
||||
query.addInclusionFilter(step);
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
public IndexQueryBuilder notFull(int termId) {
|
||||
|
||||
query.addInclusionFilter(reverseIndexFullReader.not(termId));
|
||||
|
||||
|
@ -13,12 +13,12 @@ import nu.marginalia.index.query.ReverseIndexEntrySourceBehavior;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
public class SearchIndexReader {
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
private final ForwardIndexReader forwardIndexReader;
|
||||
private final ReverseIndexFullReader reverseIndexFullReader;
|
||||
private final ReverseIndexPriorityReader reverseIndexPriorityReader;
|
||||
@ -31,38 +31,14 @@ public class SearchIndexReader {
|
||||
this.reverseIndexPriorityReader = reverseIndexPriorityReader;
|
||||
}
|
||||
|
||||
public IndexQueryBuilder findWordAsSentence(int[] wordIdsByFrequency) {
|
||||
List<EntrySource> entrySources = new ArrayList<>(1);
|
||||
|
||||
entrySources.add(reverseIndexFullReader.documents(wordIdsByFrequency[0], ReverseIndexEntrySourceBehavior.DO_PREFER));
|
||||
|
||||
return new SearchIndexQueryBuilder(reverseIndexFullReader, new IndexQuery(entrySources));
|
||||
public IndexQueryBuilder findPriorityWord(int wordId) {
|
||||
return new SearchIndexQueryBuilder(reverseIndexFullReader, reverseIndexPriorityReader, new IndexQuery(
|
||||
List.of(reverseIndexPriorityReader.priorityDocuments(wordId))));
|
||||
}
|
||||
|
||||
public IndexQueryBuilder findWordAsTopic(int[] wordIdsByFrequency) {
|
||||
List<EntrySource> entrySources = new ArrayList<>(wordIdsByFrequency.length);
|
||||
|
||||
for (int wordId : wordIdsByFrequency) {
|
||||
entrySources.add(reverseIndexPriorityReader.priorityDocuments(wordId));
|
||||
}
|
||||
|
||||
return new SearchIndexQueryBuilder(reverseIndexFullReader, new IndexQuery(entrySources));
|
||||
}
|
||||
|
||||
public IndexQueryBuilder findWordTopicDynamicMode(int[] wordIdsByFrequency) {
|
||||
if (wordIdsByFrequency.length > 3) {
|
||||
return findWordAsSentence(wordIdsByFrequency);
|
||||
}
|
||||
|
||||
List<EntrySource> entrySources = new ArrayList<>(wordIdsByFrequency.length + 1);
|
||||
|
||||
for (int wordId : wordIdsByFrequency) {
|
||||
entrySources.add(reverseIndexPriorityReader.priorityDocuments(wordId));
|
||||
}
|
||||
|
||||
entrySources.add(reverseIndexFullReader.documents(wordIdsByFrequency[0], ReverseIndexEntrySourceBehavior.DO_NOT_PREFER));
|
||||
|
||||
return new SearchIndexQueryBuilder(reverseIndexFullReader, new IndexQuery(entrySources));
|
||||
public IndexQueryBuilder findFullWord(int wordId, ReverseIndexEntrySourceBehavior behavior) {
|
||||
return new SearchIndexQueryBuilder(reverseIndexFullReader, reverseIndexPriorityReader, new IndexQuery(
|
||||
List.of(reverseIndexFullReader.documents(wordId, behavior))));
|
||||
}
|
||||
|
||||
QueryFilterStepIf filterForParams(IndexQueryParams params) {
|
||||
|
@ -69,6 +69,10 @@ public class IndexQueryService {
|
||||
String json = request.body();
|
||||
SearchSpecification specsSet = gson.fromJson(json, SearchSpecification.class);
|
||||
|
||||
if (!index.isAvailable()) {
|
||||
Spark.halt(503, "Index is not loaded");
|
||||
}
|
||||
|
||||
try {
|
||||
return wmsa_edge_index_query_time.time(() -> {
|
||||
var params = new SearchParameters(specsSet, getSearchSet(specsSet));
|
||||
@ -135,36 +139,39 @@ public class IndexQueryService {
|
||||
final TLongList results = new TLongArrayList(params.fetchSize);
|
||||
|
||||
logger.info(queryMarker, "{}", params.queryParams);
|
||||
for (var sq : params.subqueries) {
|
||||
final SearchIndexSearchTerms searchTerms = searchTermsSvc.getSearchTerms(sq);
|
||||
|
||||
outer:
|
||||
// These queries are various term combinations
|
||||
for (var subquery : params.subqueries) {
|
||||
final SearchIndexSearchTerms searchTerms = searchTermsSvc.getSearchTerms(subquery);
|
||||
|
||||
if (searchTerms.isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
var resultsForSq = executeSubquery(searchTerms, params);
|
||||
// These queries are different indices for one subquery
|
||||
List<IndexQuery> queries = params.createIndexQueries(index, searchTerms);
|
||||
for (var query : queries) {
|
||||
var resultsForSq = executeQuery(query, params);
|
||||
logger.info(queryMarker, "{} from {}", resultsForSq.size(), subquery);
|
||||
results.addAll(resultsForSq);
|
||||
|
||||
logger.info(queryMarker, "{} from {}", resultsForSq.size(), sq);
|
||||
|
||||
results.addAll(resultsForSq);
|
||||
|
||||
if (!params.hasTimeLeft()) {
|
||||
logger.info("Query timed out {}, ({}), -{}",
|
||||
sq.searchTermsInclude, sq.searchTermsAdvice, sq.searchTermsExclude);
|
||||
break;
|
||||
if (!params.hasTimeLeft()) {
|
||||
logger.info("Query timed out {}, ({}), -{}",
|
||||
subquery.searchTermsInclude, subquery.searchTermsAdvice, subquery.searchTermsExclude);
|
||||
break outer;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
private TLongArrayList executeSubquery(SearchIndexSearchTerms terms, SearchParameters params)
|
||||
private TLongArrayList executeQuery(IndexQuery query, SearchParameters params)
|
||||
{
|
||||
final TLongArrayList results = new TLongArrayList(params.fetchSize);
|
||||
final LongQueryBuffer buffer = new LongQueryBuffer(params.fetchSize);
|
||||
|
||||
IndexQuery query = params.createIndexQuery(index, terms);
|
||||
|
||||
while (query.hasMore()
|
||||
&& results.size() < params.fetchSize
|
||||
&& params.budget.hasTimeLeft())
|
||||
|
@ -58,8 +58,8 @@ public class SearchParameters {
|
||||
specsSet.queryStrategy);
|
||||
}
|
||||
|
||||
IndexQuery createIndexQuery(SearchIndex index, SearchIndexSearchTerms terms) {
|
||||
return index.createQuery(terms, queryParams, consideredUrlIds::add);
|
||||
List<IndexQuery> createIndexQueries(SearchIndex index, SearchIndexSearchTerms terms) {
|
||||
return index.createQueries(terms, queryParams, consideredUrlIds::add);
|
||||
}
|
||||
|
||||
boolean hasTimeLeft() {
|
||||
|
@ -34,7 +34,7 @@ public class SearchTermsService {
|
||||
for (var include : request.searchTermsInclude) {
|
||||
var word = lookUpWord(include);
|
||||
if (word.isEmpty()) {
|
||||
logger.info("Unknown search term: " + include);
|
||||
logger.debug("Unknown search term: " + include);
|
||||
return new SearchIndexSearchTerms();
|
||||
}
|
||||
includes.add(word.getAsInt());
|
||||
@ -44,7 +44,7 @@ public class SearchTermsService {
|
||||
for (var advice : request.searchTermsAdvice) {
|
||||
var word = lookUpWord(advice);
|
||||
if (word.isEmpty()) {
|
||||
logger.info("Unknown search term: " + advice);
|
||||
logger.debug("Unknown search term: " + advice);
|
||||
return new SearchIndexSearchTerms();
|
||||
}
|
||||
includes.add(word.getAsInt());
|
||||
|
Loading…
Reference in New Issue
Block a user