mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
Fix bugs related to search result selection in the case with multiple search terms.
* A deduplication filter step ran too early and removed many good results on the basis that they partially, but not fully, matched another set of search terms.
* Altered the query creation process to prefer documents where multiple terms appear in the priority index.
This commit is contained in:
parent
30584887f9
commit
1097fe6e25
@ -3,9 +3,19 @@ package nu.marginalia.index.query;
|
|||||||
import nu.marginalia.index.query.filter.QueryFilterStepIf;
|
import nu.marginalia.index.query.filter.QueryFilterStepIf;
|
||||||
|
|
||||||
public interface IndexQueryBuilder {
|
public interface IndexQueryBuilder {
|
||||||
IndexQueryBuilder also(int termId);
|
/** Filters documents that also contain termId, within the full index.
|
||||||
|
*/
|
||||||
|
IndexQueryBuilder alsoFull(int termId);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Filters documents that also contain <i>any of the provided termIds</i>, within the priority index.
|
||||||
|
*/
|
||||||
|
IndexQueryBuilder alsoPrioAnyOf(int... termIds);
|
||||||
|
|
||||||
|
/** Excludes documents that contain termId, within the full index
|
||||||
|
*/
|
||||||
|
IndexQueryBuilder notFull(int termId);
|
||||||
|
|
||||||
IndexQueryBuilder not(int termId);
|
|
||||||
IndexQueryBuilder addInclusionFilter(QueryFilterStepIf filterStep);
|
IndexQueryBuilder addInclusionFilter(QueryFilterStepIf filterStep);
|
||||||
|
|
||||||
IndexQuery build();
|
IndexQuery build();
|
||||||
|
@ -5,6 +5,9 @@ import nu.marginalia.index.query.EntrySource;
|
|||||||
import nu.marginalia.array.LongArray;
|
import nu.marginalia.array.LongArray;
|
||||||
import nu.marginalia.btree.BTreeReader;
|
import nu.marginalia.btree.BTreeReader;
|
||||||
import nu.marginalia.index.query.EmptyEntrySource;
|
import nu.marginalia.index.query.EmptyEntrySource;
|
||||||
|
import nu.marginalia.index.query.ReverseIndexRetainFilter;
|
||||||
|
import nu.marginalia.index.query.filter.QueryFilterNoPass;
|
||||||
|
import nu.marginalia.index.query.filter.QueryFilterStepIf;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
@ -49,4 +52,15 @@ public class ReverseIndexPriorityReader {
|
|||||||
private BTreeReader createReaderNew(long offset) {
|
private BTreeReader createReaderNew(long offset) {
|
||||||
return new BTreeReader(documents, ReverseIndexPriorityParameters.bTreeContext, offset);
|
return new BTreeReader(documents, ReverseIndexPriorityParameters.bTreeContext, offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public QueryFilterStepIf also(int wordId) {
|
||||||
|
if (wordId < 0) return new QueryFilterNoPass();
|
||||||
|
|
||||||
|
long offset = words.get(wordId);
|
||||||
|
|
||||||
|
if (offset < 0) return new QueryFilterNoPass();
|
||||||
|
|
||||||
|
return new ReverseIndexRetainFilter(createReaderNew(offset));
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -5,8 +5,8 @@ import com.google.inject.Singleton;
|
|||||||
import nu.marginalia.index.IndexServicesFactory;
|
import nu.marginalia.index.IndexServicesFactory;
|
||||||
import nu.marginalia.index.query.IndexQuery;
|
import nu.marginalia.index.query.IndexQuery;
|
||||||
import nu.marginalia.index.query.IndexQueryBuilder;
|
import nu.marginalia.index.query.IndexQueryBuilder;
|
||||||
import nu.marginalia.index.results.IndexResultDomainDeduplicator;
|
|
||||||
import nu.marginalia.index.query.IndexQueryParams;
|
import nu.marginalia.index.query.IndexQueryParams;
|
||||||
|
import nu.marginalia.index.query.ReverseIndexEntrySourceBehavior;
|
||||||
import nu.marginalia.index.query.filter.QueryFilterStepFromPredicate;
|
import nu.marginalia.index.query.filter.QueryFilterStepFromPredicate;
|
||||||
import nu.marginalia.index.svc.IndexSearchSetsService;
|
import nu.marginalia.index.svc.IndexSearchSetsService;
|
||||||
import org.jetbrains.annotations.NotNull;
|
import org.jetbrains.annotations.NotNull;
|
||||||
@ -14,7 +14,10 @@ import org.slf4j.Logger;
|
|||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
|
import java.util.List;
|
||||||
import java.util.concurrent.locks.Lock;
|
import java.util.concurrent.locks.Lock;
|
||||||
import java.util.concurrent.locks.ReadWriteLock;
|
import java.util.concurrent.locks.ReadWriteLock;
|
||||||
import java.util.concurrent.locks.ReentrantReadWriteLock;
|
import java.util.concurrent.locks.ReentrantReadWriteLock;
|
||||||
@ -90,42 +93,63 @@ public class SearchIndex {
|
|||||||
return indexReader != null;
|
return indexReader != null;
|
||||||
}
|
}
|
||||||
|
|
||||||
public IndexQuery createQuery(SearchIndexSearchTerms terms, IndexQueryParams params, LongPredicate includePred) {
|
|
||||||
|
|
||||||
if (null == indexReader) {
|
public List<IndexQuery> createQueries(SearchIndexSearchTerms terms, IndexQueryParams params, LongPredicate includePred) {
|
||||||
|
|
||||||
|
if (!isAvailable()) {
|
||||||
logger.warn("Index reader not ready");
|
logger.warn("Index reader not ready");
|
||||||
return new IndexQuery(Collections.emptyList());
|
return Collections.emptyList();
|
||||||
}
|
}
|
||||||
|
|
||||||
final int[] orderedIncludes = terms.sortedDistinctIncludes(this::compareKeywords);
|
final int[] orderedIncludes = terms.sortedDistinctIncludes(this::compareKeywords);
|
||||||
|
|
||||||
IndexQueryBuilder query =
|
List<IndexQueryBuilder> queryHeads = new ArrayList<>(10);
|
||||||
switch(params.queryStrategy()) {
|
List<IndexQuery> queries = new ArrayList<>(10);
|
||||||
case SENTENCE -> indexReader.findWordAsSentence(orderedIncludes);
|
|
||||||
case TOPIC, REQUIRE_FIELD_SITE, REQUIRE_FIELD_TITLE, REQUIRE_FIELD_SUBJECT, REQUIRE_FIELD_DOMAIN, REQUIRE_FIELD_URL
|
|
||||||
-> indexReader.findWordAsTopic(orderedIncludes);
|
|
||||||
case AUTO -> indexReader.findWordTopicDynamicMode(orderedIncludes);
|
|
||||||
};
|
|
||||||
|
|
||||||
if (query == null) {
|
// To ensure that good results are processed first, create query heads for the priority index that filter for documents
|
||||||
return new IndexQuery(Collections.emptyList());
|
// that contain a pair of search terms
|
||||||
|
if (orderedIncludes.length > 1) {
|
||||||
|
for (int i = 0; i + 1 < orderedIncludes.length; i++) {
|
||||||
|
var remainingWords = Arrays.copyOfRange(orderedIncludes, i+1, orderedIncludes.length);
|
||||||
|
var entrySource = indexReader
|
||||||
|
.findPriorityWord(orderedIncludes[i])
|
||||||
|
.alsoPrioAnyOf(remainingWords);
|
||||||
|
queryHeads.add(entrySource);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
query = query.addInclusionFilter(new QueryFilterStepFromPredicate(includePred));
|
// Next consider entries that appear only once in the priority index
|
||||||
|
for (var wordId : orderedIncludes) {
|
||||||
|
queryHeads.add(indexReader.findPriorityWord(wordId));
|
||||||
|
}
|
||||||
|
|
||||||
for (int i = 0; i < orderedIncludes.length; i++) {
|
// Finally consider terms in the full index
|
||||||
query = query.also(orderedIncludes[i]);
|
queryHeads.add(indexReader.findFullWord(orderedIncludes[0], ReverseIndexEntrySourceBehavior.DO_PREFER));
|
||||||
|
|
||||||
|
for (var query : queryHeads) {
|
||||||
|
if (query == null) {
|
||||||
|
return Collections.emptyList();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
for (int orderedInclude : orderedIncludes) {
|
||||||
|
query = query.alsoFull(orderedInclude);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int term : terms.excludes()) {
|
for (int term : terms.excludes()) {
|
||||||
query = query.not(term);
|
query = query.notFull(term);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// This filtering step needs to happen only on documents that have passed all term-based filtering steps,
|
||||||
|
// it's essentially a memoization of the params filtering job which is relatively expensive
|
||||||
|
query = query.addInclusionFilter(new QueryFilterStepFromPredicate(includePred));
|
||||||
|
|
||||||
// Run these last, as they'll worst-case cause as many page faults as there are
|
// Run these last, as they'll worst-case cause as many page faults as there are
|
||||||
// items in the buffer
|
// items in the buffer
|
||||||
return query
|
queries.add(query.addInclusionFilter(indexReader.filterForParams(params)).build());
|
||||||
.addInclusionFilter(indexReader.filterForParams(params))
|
}
|
||||||
.build();
|
|
||||||
|
return queries;
|
||||||
}
|
}
|
||||||
|
|
||||||
private int compareKeywords(int a, int b) {
|
private int compareKeywords(int a, int b) {
|
||||||
|
@ -1,27 +1,58 @@
|
|||||||
package nu.marginalia.index.index;
|
package nu.marginalia.index.index;
|
||||||
|
|
||||||
|
import nu.marginalia.index.priority.ReverseIndexPriorityReader;
|
||||||
import nu.marginalia.index.query.IndexQuery;
|
import nu.marginalia.index.query.IndexQuery;
|
||||||
import nu.marginalia.index.query.IndexQueryBuilder;
|
import nu.marginalia.index.query.IndexQueryBuilder;
|
||||||
import nu.marginalia.index.query.filter.QueryFilterStepIf;
|
import nu.marginalia.index.query.filter.QueryFilterStepIf;
|
||||||
import nu.marginalia.index.full.ReverseIndexFullReader;
|
import nu.marginalia.index.full.ReverseIndexFullReader;
|
||||||
|
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
import java.util.stream.IntStream;
|
||||||
|
|
||||||
public class SearchIndexQueryBuilder implements IndexQueryBuilder {
|
public class SearchIndexQueryBuilder implements IndexQueryBuilder {
|
||||||
private final IndexQuery query;
|
private final IndexQuery query;
|
||||||
private final ReverseIndexFullReader reverseIndexFullReader;
|
private final ReverseIndexFullReader reverseIndexFullReader;
|
||||||
|
private final ReverseIndexPriorityReader reverseIndexPrioReader;
|
||||||
|
|
||||||
SearchIndexQueryBuilder(ReverseIndexFullReader reverseIndexFullReader, IndexQuery query) {
|
SearchIndexQueryBuilder(ReverseIndexFullReader reverseIndexFullReader,
|
||||||
|
ReverseIndexPriorityReader reverseIndexPrioReader,
|
||||||
|
IndexQuery query)
|
||||||
|
{
|
||||||
this.query = query;
|
this.query = query;
|
||||||
this.reverseIndexFullReader = reverseIndexFullReader;
|
this.reverseIndexFullReader = reverseIndexFullReader;
|
||||||
|
this.reverseIndexPrioReader = reverseIndexPrioReader;
|
||||||
}
|
}
|
||||||
|
|
||||||
public IndexQueryBuilder also(int termId) {
|
public IndexQueryBuilder alsoFull(int termId) {
|
||||||
|
|
||||||
query.addInclusionFilter(reverseIndexFullReader.also(termId));
|
query.addInclusionFilter(reverseIndexFullReader.also(termId));
|
||||||
|
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
public IndexQueryBuilder not(int termId) {
|
public IndexQueryBuilder alsoPrioAnyOf(int... termIds) {
|
||||||
|
|
||||||
|
QueryFilterStepIf step;
|
||||||
|
|
||||||
|
if (termIds.length == 0) {
|
||||||
|
step = QueryFilterStepIf.noPass();
|
||||||
|
}
|
||||||
|
else if (termIds.length == 1) {
|
||||||
|
step = reverseIndexPrioReader.also(termIds[0]);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
var steps = IntStream.of(termIds)
|
||||||
|
.mapToObj(reverseIndexPrioReader::also)
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
step = QueryFilterStepIf.anyOf(steps);
|
||||||
|
}
|
||||||
|
|
||||||
|
query.addInclusionFilter(step);
|
||||||
|
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
public IndexQueryBuilder notFull(int termId) {
|
||||||
|
|
||||||
query.addInclusionFilter(reverseIndexFullReader.not(termId));
|
query.addInclusionFilter(reverseIndexFullReader.not(termId));
|
||||||
|
|
||||||
|
@ -13,12 +13,12 @@ import nu.marginalia.index.query.ReverseIndexEntrySourceBehavior;
|
|||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
public class SearchIndexReader {
|
public class SearchIndexReader {
|
||||||
|
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
|
||||||
private final ForwardIndexReader forwardIndexReader;
|
private final ForwardIndexReader forwardIndexReader;
|
||||||
private final ReverseIndexFullReader reverseIndexFullReader;
|
private final ReverseIndexFullReader reverseIndexFullReader;
|
||||||
private final ReverseIndexPriorityReader reverseIndexPriorityReader;
|
private final ReverseIndexPriorityReader reverseIndexPriorityReader;
|
||||||
@ -31,38 +31,14 @@ public class SearchIndexReader {
|
|||||||
this.reverseIndexPriorityReader = reverseIndexPriorityReader;
|
this.reverseIndexPriorityReader = reverseIndexPriorityReader;
|
||||||
}
|
}
|
||||||
|
|
||||||
public IndexQueryBuilder findWordAsSentence(int[] wordIdsByFrequency) {
|
public IndexQueryBuilder findPriorityWord(int wordId) {
|
||||||
List<EntrySource> entrySources = new ArrayList<>(1);
|
return new SearchIndexQueryBuilder(reverseIndexFullReader, reverseIndexPriorityReader, new IndexQuery(
|
||||||
|
List.of(reverseIndexPriorityReader.priorityDocuments(wordId))));
|
||||||
entrySources.add(reverseIndexFullReader.documents(wordIdsByFrequency[0], ReverseIndexEntrySourceBehavior.DO_PREFER));
|
|
||||||
|
|
||||||
return new SearchIndexQueryBuilder(reverseIndexFullReader, new IndexQuery(entrySources));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public IndexQueryBuilder findWordAsTopic(int[] wordIdsByFrequency) {
|
public IndexQueryBuilder findFullWord(int wordId, ReverseIndexEntrySourceBehavior behavior) {
|
||||||
List<EntrySource> entrySources = new ArrayList<>(wordIdsByFrequency.length);
|
return new SearchIndexQueryBuilder(reverseIndexFullReader, reverseIndexPriorityReader, new IndexQuery(
|
||||||
|
List.of(reverseIndexFullReader.documents(wordId, behavior))));
|
||||||
for (int wordId : wordIdsByFrequency) {
|
|
||||||
entrySources.add(reverseIndexPriorityReader.priorityDocuments(wordId));
|
|
||||||
}
|
|
||||||
|
|
||||||
return new SearchIndexQueryBuilder(reverseIndexFullReader, new IndexQuery(entrySources));
|
|
||||||
}
|
|
||||||
|
|
||||||
public IndexQueryBuilder findWordTopicDynamicMode(int[] wordIdsByFrequency) {
|
|
||||||
if (wordIdsByFrequency.length > 3) {
|
|
||||||
return findWordAsSentence(wordIdsByFrequency);
|
|
||||||
}
|
|
||||||
|
|
||||||
List<EntrySource> entrySources = new ArrayList<>(wordIdsByFrequency.length + 1);
|
|
||||||
|
|
||||||
for (int wordId : wordIdsByFrequency) {
|
|
||||||
entrySources.add(reverseIndexPriorityReader.priorityDocuments(wordId));
|
|
||||||
}
|
|
||||||
|
|
||||||
entrySources.add(reverseIndexFullReader.documents(wordIdsByFrequency[0], ReverseIndexEntrySourceBehavior.DO_NOT_PREFER));
|
|
||||||
|
|
||||||
return new SearchIndexQueryBuilder(reverseIndexFullReader, new IndexQuery(entrySources));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
QueryFilterStepIf filterForParams(IndexQueryParams params) {
|
QueryFilterStepIf filterForParams(IndexQueryParams params) {
|
||||||
|
@ -69,6 +69,10 @@ public class IndexQueryService {
|
|||||||
String json = request.body();
|
String json = request.body();
|
||||||
SearchSpecification specsSet = gson.fromJson(json, SearchSpecification.class);
|
SearchSpecification specsSet = gson.fromJson(json, SearchSpecification.class);
|
||||||
|
|
||||||
|
if (!index.isAvailable()) {
|
||||||
|
Spark.halt(503, "Index is not loaded");
|
||||||
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
return wmsa_edge_index_query_time.time(() -> {
|
return wmsa_edge_index_query_time.time(() -> {
|
||||||
var params = new SearchParameters(specsSet, getSearchSet(specsSet));
|
var params = new SearchParameters(specsSet, getSearchSet(specsSet));
|
||||||
@ -135,36 +139,39 @@ public class IndexQueryService {
|
|||||||
final TLongList results = new TLongArrayList(params.fetchSize);
|
final TLongList results = new TLongArrayList(params.fetchSize);
|
||||||
|
|
||||||
logger.info(queryMarker, "{}", params.queryParams);
|
logger.info(queryMarker, "{}", params.queryParams);
|
||||||
for (var sq : params.subqueries) {
|
|
||||||
final SearchIndexSearchTerms searchTerms = searchTermsSvc.getSearchTerms(sq);
|
outer:
|
||||||
|
// These queries are various term combinations
|
||||||
|
for (var subquery : params.subqueries) {
|
||||||
|
final SearchIndexSearchTerms searchTerms = searchTermsSvc.getSearchTerms(subquery);
|
||||||
|
|
||||||
if (searchTerms.isEmpty()) {
|
if (searchTerms.isEmpty()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
var resultsForSq = executeSubquery(searchTerms, params);
|
// These queries are different indices for one subquery
|
||||||
|
List<IndexQuery> queries = params.createIndexQueries(index, searchTerms);
|
||||||
logger.info(queryMarker, "{} from {}", resultsForSq.size(), sq);
|
for (var query : queries) {
|
||||||
|
var resultsForSq = executeQuery(query, params);
|
||||||
|
logger.info(queryMarker, "{} from {}", resultsForSq.size(), subquery);
|
||||||
results.addAll(resultsForSq);
|
results.addAll(resultsForSq);
|
||||||
|
|
||||||
if (!params.hasTimeLeft()) {
|
if (!params.hasTimeLeft()) {
|
||||||
logger.info("Query timed out {}, ({}), -{}",
|
logger.info("Query timed out {}, ({}), -{}",
|
||||||
sq.searchTermsInclude, sq.searchTermsAdvice, sq.searchTermsExclude);
|
subquery.searchTermsInclude, subquery.searchTermsAdvice, subquery.searchTermsExclude);
|
||||||
break;
|
break outer;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return results;
|
return results;
|
||||||
}
|
}
|
||||||
|
|
||||||
private TLongArrayList executeSubquery(SearchIndexSearchTerms terms, SearchParameters params)
|
private TLongArrayList executeQuery(IndexQuery query, SearchParameters params)
|
||||||
{
|
{
|
||||||
final TLongArrayList results = new TLongArrayList(params.fetchSize);
|
final TLongArrayList results = new TLongArrayList(params.fetchSize);
|
||||||
final LongQueryBuffer buffer = new LongQueryBuffer(params.fetchSize);
|
final LongQueryBuffer buffer = new LongQueryBuffer(params.fetchSize);
|
||||||
|
|
||||||
IndexQuery query = params.createIndexQuery(index, terms);
|
|
||||||
|
|
||||||
while (query.hasMore()
|
while (query.hasMore()
|
||||||
&& results.size() < params.fetchSize
|
&& results.size() < params.fetchSize
|
||||||
&& params.budget.hasTimeLeft())
|
&& params.budget.hasTimeLeft())
|
||||||
|
@ -58,8 +58,8 @@ public class SearchParameters {
|
|||||||
specsSet.queryStrategy);
|
specsSet.queryStrategy);
|
||||||
}
|
}
|
||||||
|
|
||||||
IndexQuery createIndexQuery(SearchIndex index, SearchIndexSearchTerms terms) {
|
List<IndexQuery> createIndexQueries(SearchIndex index, SearchIndexSearchTerms terms) {
|
||||||
return index.createQuery(terms, queryParams, consideredUrlIds::add);
|
return index.createQueries(terms, queryParams, consideredUrlIds::add);
|
||||||
}
|
}
|
||||||
|
|
||||||
boolean hasTimeLeft() {
|
boolean hasTimeLeft() {
|
||||||
|
@ -34,7 +34,7 @@ public class SearchTermsService {
|
|||||||
for (var include : request.searchTermsInclude) {
|
for (var include : request.searchTermsInclude) {
|
||||||
var word = lookUpWord(include);
|
var word = lookUpWord(include);
|
||||||
if (word.isEmpty()) {
|
if (word.isEmpty()) {
|
||||||
logger.info("Unknown search term: " + include);
|
logger.debug("Unknown search term: " + include);
|
||||||
return new SearchIndexSearchTerms();
|
return new SearchIndexSearchTerms();
|
||||||
}
|
}
|
||||||
includes.add(word.getAsInt());
|
includes.add(word.getAsInt());
|
||||||
@ -44,7 +44,7 @@ public class SearchTermsService {
|
|||||||
for (var advice : request.searchTermsAdvice) {
|
for (var advice : request.searchTermsAdvice) {
|
||||||
var word = lookUpWord(advice);
|
var word = lookUpWord(advice);
|
||||||
if (word.isEmpty()) {
|
if (word.isEmpty()) {
|
||||||
logger.info("Unknown search term: " + advice);
|
logger.debug("Unknown search term: " + advice);
|
||||||
return new SearchIndexSearchTerms();
|
return new SearchIndexSearchTerms();
|
||||||
}
|
}
|
||||||
includes.add(word.getAsInt());
|
includes.add(word.getAsInt());
|
||||||
|
Loading…
Reference in New Issue
Block a user