diff --git a/code/features-index/index-query/src/main/java/nu/marginalia/index/query/IndexQueryBuilder.java b/code/features-index/index-query/src/main/java/nu/marginalia/index/query/IndexQueryBuilder.java
index dddc1e3b..adcd1861 100644
--- a/code/features-index/index-query/src/main/java/nu/marginalia/index/query/IndexQueryBuilder.java
+++ b/code/features-index/index-query/src/main/java/nu/marginalia/index/query/IndexQueryBuilder.java
@@ -3,9 +3,19 @@ package nu.marginalia.index.query;
import nu.marginalia.index.query.filter.QueryFilterStepIf;
public interface IndexQueryBuilder {
- IndexQueryBuilder also(int termId);
+ /** Filters documents that also contain termId, within the full index.
+ */
+ IndexQueryBuilder alsoFull(int termId);
+
+ /**
+ * Filters documents that also contain any of the provided termIds, within the priority index.
+ */
+ IndexQueryBuilder alsoPrioAnyOf(int... termIds);
+
+ /** Excludes documents that contain termId, within the full index
+ */
+ IndexQueryBuilder notFull(int termId);
- IndexQueryBuilder not(int termId);
IndexQueryBuilder addInclusionFilter(QueryFilterStepIf filterStep);
IndexQuery build();
diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityReader.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityReader.java
index b4b5d75a..bccce6c0 100644
--- a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityReader.java
+++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityReader.java
@@ -5,6 +5,9 @@ import nu.marginalia.index.query.EntrySource;
import nu.marginalia.array.LongArray;
import nu.marginalia.btree.BTreeReader;
import nu.marginalia.index.query.EmptyEntrySource;
+import nu.marginalia.index.query.ReverseIndexRetainFilter;
+import nu.marginalia.index.query.filter.QueryFilterNoPass;
+import nu.marginalia.index.query.filter.QueryFilterStepIf;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -49,4 +52,15 @@ public class ReverseIndexPriorityReader {
private BTreeReader createReaderNew(long offset) {
return new BTreeReader(documents, ReverseIndexPriorityParameters.bTreeContext, offset);
}
+
+ public QueryFilterStepIf also(int wordId) {
+ if (wordId < 0) return new QueryFilterNoPass();
+
+ long offset = words.get(wordId);
+
+ if (offset < 0) return new QueryFilterNoPass();
+
+ return new ReverseIndexRetainFilter(createReaderNew(offset));
+ }
+
}
diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java
index 14e31fff..af26b2c7 100644
--- a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java
+++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java
@@ -5,8 +5,8 @@ import com.google.inject.Singleton;
import nu.marginalia.index.IndexServicesFactory;
import nu.marginalia.index.query.IndexQuery;
import nu.marginalia.index.query.IndexQueryBuilder;
-import nu.marginalia.index.results.IndexResultDomainDeduplicator;
import nu.marginalia.index.query.IndexQueryParams;
+import nu.marginalia.index.query.ReverseIndexEntrySourceBehavior;
import nu.marginalia.index.query.filter.QueryFilterStepFromPredicate;
import nu.marginalia.index.svc.IndexSearchSetsService;
import org.jetbrains.annotations.NotNull;
@@ -14,7 +14,10 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
import java.util.Collections;
+import java.util.List;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
@@ -90,42 +93,63 @@ public class SearchIndex {
return indexReader != null;
}
- public IndexQuery createQuery(SearchIndexSearchTerms terms, IndexQueryParams params, LongPredicate includePred) {
- if (null == indexReader) {
+ public List createQueries(SearchIndexSearchTerms terms, IndexQueryParams params, LongPredicate includePred) {
+
+ if (!isAvailable()) {
logger.warn("Index reader not ready");
- return new IndexQuery(Collections.emptyList());
+ return Collections.emptyList();
}
final int[] orderedIncludes = terms.sortedDistinctIncludes(this::compareKeywords);
- IndexQueryBuilder query =
- switch(params.queryStrategy()) {
- case SENTENCE -> indexReader.findWordAsSentence(orderedIncludes);
- case TOPIC, REQUIRE_FIELD_SITE, REQUIRE_FIELD_TITLE, REQUIRE_FIELD_SUBJECT, REQUIRE_FIELD_DOMAIN, REQUIRE_FIELD_URL
- -> indexReader.findWordAsTopic(orderedIncludes);
- case AUTO -> indexReader.findWordTopicDynamicMode(orderedIncludes);
- };
+ List queryHeads = new ArrayList<>(10);
+ List queries = new ArrayList<>(10);
- if (query == null) {
- return new IndexQuery(Collections.emptyList());
+ // To ensure that good results are processed first, create query heads for the priority index that filter for terms
+ // that contain pairs of two search terms
+ if (orderedIncludes.length > 1) {
+ for (int i = 0; i + 1 < orderedIncludes.length; i++) {
+ var remainingWords = Arrays.copyOfRange(orderedIncludes, i+1, orderedIncludes.length);
+ var entrySource = indexReader
+ .findPriorityWord(orderedIncludes[i])
+ .alsoPrioAnyOf(remainingWords);
+ queryHeads.add(entrySource);
+ }
}
- query = query.addInclusionFilter(new QueryFilterStepFromPredicate(includePred));
-
- for (int i = 0; i < orderedIncludes.length; i++) {
- query = query.also(orderedIncludes[i]);
+ // Next consider entries that appear only once in the priority index
+ for (var wordId : orderedIncludes) {
+ queryHeads.add(indexReader.findPriorityWord(wordId));
}
- for (int term : terms.excludes()) {
- query = query.not(term);
+ // Finally consider terms in the full index
+ queryHeads.add(indexReader.findFullWord(orderedIncludes[0], ReverseIndexEntrySourceBehavior.DO_PREFER));
+
+ for (var query : queryHeads) {
+ if (query == null) {
+ return Collections.emptyList();
+ }
+
+
+ for (int orderedInclude : orderedIncludes) {
+ query = query.alsoFull(orderedInclude);
+ }
+
+ for (int term : terms.excludes()) {
+ query = query.notFull(term);
+ }
+
+ // This filtering step needs to happen only on terms that have passed all term-based filtering steps,
+ // it's essentially a memoization of the params filtering job which is relatively expensive
+ query = query.addInclusionFilter(new QueryFilterStepFromPredicate(includePred));
+
+ // Run these last, as they'll worst-case cause as many page faults as there are
+ // items in the buffer
+ queries.add(query.addInclusionFilter(indexReader.filterForParams(params)).build());
}
- // Run these last, as they'll worst-case cause as many page faults as there are
- // items in the buffer
- return query
- .addInclusionFilter(indexReader.filterForParams(params))
- .build();
+ return queries;
}
private int compareKeywords(int a, int b) {
diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexQueryBuilder.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexQueryBuilder.java
index 80196a36..6de051f1 100644
--- a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexQueryBuilder.java
+++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexQueryBuilder.java
@@ -1,27 +1,58 @@
package nu.marginalia.index.index;
+import nu.marginalia.index.priority.ReverseIndexPriorityReader;
import nu.marginalia.index.query.IndexQuery;
import nu.marginalia.index.query.IndexQueryBuilder;
import nu.marginalia.index.query.filter.QueryFilterStepIf;
import nu.marginalia.index.full.ReverseIndexFullReader;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
public class SearchIndexQueryBuilder implements IndexQueryBuilder {
private final IndexQuery query;
private final ReverseIndexFullReader reverseIndexFullReader;
+ private final ReverseIndexPriorityReader reverseIndexPrioReader;
- SearchIndexQueryBuilder(ReverseIndexFullReader reverseIndexFullReader, IndexQuery query) {
+ SearchIndexQueryBuilder(ReverseIndexFullReader reverseIndexFullReader,
+ ReverseIndexPriorityReader reverseIndexPrioReader,
+ IndexQuery query)
+ {
this.query = query;
this.reverseIndexFullReader = reverseIndexFullReader;
+ this.reverseIndexPrioReader = reverseIndexPrioReader;
}
- public IndexQueryBuilder also(int termId) {
+ public IndexQueryBuilder alsoFull(int termId) {
query.addInclusionFilter(reverseIndexFullReader.also(termId));
return this;
}
- public IndexQueryBuilder not(int termId) {
+ public IndexQueryBuilder alsoPrioAnyOf(int... termIds) {
+
+ QueryFilterStepIf step;
+
+ if (termIds.length == 0) {
+ step = QueryFilterStepIf.noPass();
+ }
+ else if (termIds.length == 1) {
+ step = reverseIndexPrioReader.also(termIds[0]);
+ }
+ else {
+ var steps = IntStream.of(termIds)
+ .mapToObj(reverseIndexPrioReader::also)
+ .collect(Collectors.toList());
+ step = QueryFilterStepIf.anyOf(steps);
+ }
+
+ query.addInclusionFilter(step);
+
+ return this;
+ }
+
+ public IndexQueryBuilder notFull(int termId) {
query.addInclusionFilter(reverseIndexFullReader.not(termId));
diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexReader.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexReader.java
index fe134f6e..d6dd854e 100644
--- a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexReader.java
+++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexReader.java
@@ -13,12 +13,12 @@ import nu.marginalia.index.query.ReverseIndexEntrySourceBehavior;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import java.util.ArrayList;
import java.util.List;
public class SearchIndexReader {
private final Logger logger = LoggerFactory.getLogger(getClass());
+
private final ForwardIndexReader forwardIndexReader;
private final ReverseIndexFullReader reverseIndexFullReader;
private final ReverseIndexPriorityReader reverseIndexPriorityReader;
@@ -31,38 +31,14 @@ public class SearchIndexReader {
this.reverseIndexPriorityReader = reverseIndexPriorityReader;
}
- public IndexQueryBuilder findWordAsSentence(int[] wordIdsByFrequency) {
- List entrySources = new ArrayList<>(1);
-
- entrySources.add(reverseIndexFullReader.documents(wordIdsByFrequency[0], ReverseIndexEntrySourceBehavior.DO_PREFER));
-
- return new SearchIndexQueryBuilder(reverseIndexFullReader, new IndexQuery(entrySources));
+ public IndexQueryBuilder findPriorityWord(int wordId) {
+ return new SearchIndexQueryBuilder(reverseIndexFullReader, reverseIndexPriorityReader, new IndexQuery(
+ List.of(reverseIndexPriorityReader.priorityDocuments(wordId))));
}
- public IndexQueryBuilder findWordAsTopic(int[] wordIdsByFrequency) {
- List entrySources = new ArrayList<>(wordIdsByFrequency.length);
-
- for (int wordId : wordIdsByFrequency) {
- entrySources.add(reverseIndexPriorityReader.priorityDocuments(wordId));
- }
-
- return new SearchIndexQueryBuilder(reverseIndexFullReader, new IndexQuery(entrySources));
- }
-
- public IndexQueryBuilder findWordTopicDynamicMode(int[] wordIdsByFrequency) {
- if (wordIdsByFrequency.length > 3) {
- return findWordAsSentence(wordIdsByFrequency);
- }
-
- List entrySources = new ArrayList<>(wordIdsByFrequency.length + 1);
-
- for (int wordId : wordIdsByFrequency) {
- entrySources.add(reverseIndexPriorityReader.priorityDocuments(wordId));
- }
-
- entrySources.add(reverseIndexFullReader.documents(wordIdsByFrequency[0], ReverseIndexEntrySourceBehavior.DO_NOT_PREFER));
-
- return new SearchIndexQueryBuilder(reverseIndexFullReader, new IndexQuery(entrySources));
+ public IndexQueryBuilder findFullWord(int wordId, ReverseIndexEntrySourceBehavior behavior) {
+ return new SearchIndexQueryBuilder(reverseIndexFullReader, reverseIndexPriorityReader, new IndexQuery(
+ List.of(reverseIndexFullReader.documents(wordId, behavior))));
}
QueryFilterStepIf filterForParams(IndexQueryParams params) {
diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java
index 8e3e5ae5..0b3b06fb 100644
--- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java
+++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java
@@ -69,6 +69,10 @@ public class IndexQueryService {
String json = request.body();
SearchSpecification specsSet = gson.fromJson(json, SearchSpecification.class);
+ if (!index.isAvailable()) {
+ Spark.halt(503, "Index is not loaded");
+ }
+
try {
return wmsa_edge_index_query_time.time(() -> {
var params = new SearchParameters(specsSet, getSearchSet(specsSet));
@@ -135,36 +139,39 @@ public class IndexQueryService {
final TLongList results = new TLongArrayList(params.fetchSize);
logger.info(queryMarker, "{}", params.queryParams);
- for (var sq : params.subqueries) {
- final SearchIndexSearchTerms searchTerms = searchTermsSvc.getSearchTerms(sq);
+
+ outer:
+ // These queries are various term combinations
+ for (var subquery : params.subqueries) {
+ final SearchIndexSearchTerms searchTerms = searchTermsSvc.getSearchTerms(subquery);
if (searchTerms.isEmpty()) {
continue;
}
- var resultsForSq = executeSubquery(searchTerms, params);
+ // These queries are different indices for one subquery
+ List queries = params.createIndexQueries(index, searchTerms);
+ for (var query : queries) {
+ var resultsForSq = executeQuery(query, params);
+ logger.info(queryMarker, "{} from {}", resultsForSq.size(), subquery);
+ results.addAll(resultsForSq);
- logger.info(queryMarker, "{} from {}", resultsForSq.size(), sq);
-
- results.addAll(resultsForSq);
-
- if (!params.hasTimeLeft()) {
- logger.info("Query timed out {}, ({}), -{}",
- sq.searchTermsInclude, sq.searchTermsAdvice, sq.searchTermsExclude);
- break;
+ if (!params.hasTimeLeft()) {
+ logger.info("Query timed out {}, ({}), -{}",
+ subquery.searchTermsInclude, subquery.searchTermsAdvice, subquery.searchTermsExclude);
+ break outer;
+ }
}
}
return results;
}
- private TLongArrayList executeSubquery(SearchIndexSearchTerms terms, SearchParameters params)
+ private TLongArrayList executeQuery(IndexQuery query, SearchParameters params)
{
final TLongArrayList results = new TLongArrayList(params.fetchSize);
final LongQueryBuffer buffer = new LongQueryBuffer(params.fetchSize);
- IndexQuery query = params.createIndexQuery(index, terms);
-
while (query.hasMore()
&& results.size() < params.fetchSize
&& params.budget.hasTimeLeft())
diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchParameters.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchParameters.java
index 846f52c0..313f3694 100644
--- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchParameters.java
+++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchParameters.java
@@ -58,8 +58,8 @@ public class SearchParameters {
specsSet.queryStrategy);
}
- IndexQuery createIndexQuery(SearchIndex index, SearchIndexSearchTerms terms) {
- return index.createQuery(terms, queryParams, consideredUrlIds::add);
+ List createIndexQueries(SearchIndex index, SearchIndexSearchTerms terms) {
+ return index.createQueries(terms, queryParams, consideredUrlIds::add);
}
boolean hasTimeLeft() {
diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchTermsService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchTermsService.java
index d82a8c42..81d561b1 100644
--- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchTermsService.java
+++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchTermsService.java
@@ -34,7 +34,7 @@ public class SearchTermsService {
for (var include : request.searchTermsInclude) {
var word = lookUpWord(include);
if (word.isEmpty()) {
- logger.info("Unknown search term: " + include);
+ logger.debug("Unknown search term: " + include);
return new SearchIndexSearchTerms();
}
includes.add(word.getAsInt());
@@ -44,7 +44,7 @@ public class SearchTermsService {
for (var advice : request.searchTermsAdvice) {
var word = lookUpWord(advice);
if (word.isEmpty()) {
- logger.info("Unknown search term: " + advice);
+ logger.debug("Unknown search term: " + advice);
return new SearchIndexSearchTerms();
}
includes.add(word.getAsInt());