diff --git a/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexFinalizeTest.java b/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexFinalizeTest.java
index 3f742897..1ef2df4e 100644
--- a/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexFinalizeTest.java
+++ b/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexFinalizeTest.java
@@ -69,8 +69,8 @@ class ReversePreindexFinalizeTest {
var docsArray = LongArrayFactory.mmapForReadingConfined(docsFile);
var wordsArray = LongArrayFactory.mmapForReadingConfined(wordsFile);
- var docsHeader = BTreeReader.readHeader(docsArray, 0);
- var wordsHeader = BTreeReader.readHeader(wordsArray, 0);
+ var docsHeader = new BTreeHeader(docsArray, 0);
+ var wordsHeader = new BTreeHeader(wordsArray, 0);
assertEquals(1, docsHeader.numEntries());
assertEquals(1, wordsHeader.numEntries());
@@ -107,7 +107,7 @@ class ReversePreindexFinalizeTest {
var wordsArray = LongArrayFactory.mmapForReadingConfined(wordsFile);
- var wordsHeader = BTreeReader.readHeader(wordsArray, 0);
+ var wordsHeader = new BTreeHeader(wordsArray, 0);
System.out.println(wordsHeader);
@@ -123,14 +123,14 @@ class ReversePreindexFinalizeTest {
BTreeHeader docsHeader;
- docsHeader = BTreeReader.readHeader(docsArray, offset1);
+ docsHeader = new BTreeHeader(docsArray, offset1);
System.out.println(docsHeader);
assertEquals(1, docsHeader.numEntries());
assertEquals(100, docsArray.get(docsHeader.dataOffsetLongs() + 0));
assertEquals(51, docsArray.get(docsHeader.dataOffsetLongs() + 1));
- docsHeader = BTreeReader.readHeader(docsArray, offset2);
+ docsHeader = new BTreeHeader(docsArray, offset2);
System.out.println(docsHeader);
assertEquals(1, docsHeader.numEntries());
diff --git a/code/index/java/nu/marginalia/index/IndexGrpcService.java b/code/index/java/nu/marginalia/index/IndexGrpcService.java
index 07199f9e..1c430014 100644
--- a/code/index/java/nu/marginalia/index/IndexGrpcService.java
+++ b/code/index/java/nu/marginalia/index/IndexGrpcService.java
@@ -78,7 +78,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
.labelNames("node")
.register();
- private final StatefulIndex index;
+ private final StatefulIndex statefulIndex;
private final SearchSetsService searchSetsService;
private final IndexResultValuatorService resultValuator;
@@ -89,13 +89,13 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
@Inject
public IndexGrpcService(ServiceConfiguration serviceConfiguration,
- StatefulIndex index,
+ StatefulIndex statefulIndex,
SearchSetsService searchSetsService,
IndexResultValuatorService resultValuator)
{
var nodeId = serviceConfiguration.node();
this.nodeName = Integer.toString(nodeId);
- this.index = index;
+ this.statefulIndex = statefulIndex;
this.searchSetsService = searchSetsService;
this.resultValuator = resultValuator;
}
@@ -207,7 +207,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
private SearchResultSet executeSearch(SearchParameters params) throws SQLException, InterruptedException {
- if (!index.isLoaded()) {
+ if (!statefulIndex.isLoaded()) {
// Short-circuit if the index is not loaded, as we trivially know that there can be no results
return new SearchResultSet(List.of());
}
@@ -268,7 +268,8 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
var terms = new SearchTerms(parameters.query, parameters.compiledQueryIds);
- for (var indexQuery : index.createQueries(terms, parameters.queryParams)) {
+ var currentIndex = statefulIndex.get();
+ for (var indexQuery : currentIndex.createQueries(terms, parameters.queryParams)) {
workerPool.execute(new IndexLookup(indexQuery, parameters.budget));
}
@@ -435,10 +436,12 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
BitSet ngramsMask = new BitSet(compiledQuery.size());
BitSet regularMask = new BitSet(compiledQuery.size());
+ var currentIndex = statefulIndex.get();
+
for (int idx = 0; idx < compiledQueryIds.size(); idx++) {
long id = compiledQueryIds.at(idx);
- full[idx] = index.getTermFrequency(id);
- prio[idx] = index.getTermFrequencyPrio(id);
+ full[idx] = currentIndex.numHits(id);
+ prio[idx] = currentIndex.numHitsPrio(id);
if (compiledQuery.at(idx).contains("_")) {
ngramsMask.set(idx);
@@ -448,7 +451,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
}
}
- return new ResultRankingContext(index.getTotalDocCount(),
+ return new ResultRankingContext(currentIndex.totalDocCount(),
rankingParams,
ngramsMask,
regularMask,
diff --git a/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java b/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java
index 27a631f5..afc52094 100644
--- a/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java
+++ b/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java
@@ -1,8 +1,14 @@
package nu.marginalia.index.index;
+import it.unimi.dsi.fastutil.longs.LongArrayList;
+import it.unimi.dsi.fastutil.longs.LongList;
+import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
+import it.unimi.dsi.fastutil.longs.LongSet;
+import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
import nu.marginalia.index.ReverseIndexReader;
import nu.marginalia.index.forward.ForwardIndexReader;
import nu.marginalia.index.model.QueryParams;
+import nu.marginalia.index.model.SearchTerms;
import nu.marginalia.index.query.IndexQuery;
import nu.marginalia.index.query.IndexQueryBuilder;
import nu.marginalia.index.query.filter.QueryFilterStepIf;
@@ -15,9 +21,17 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.time.Duration;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
import java.util.concurrent.TimeUnit;
+import java.util.function.Predicate;
-/** A reader for the combined forward and reverse indexes */
+/** A reader for the combined forward and reverse indexes.
+ *
+ * This class does not deal with the lifecycle of the indexes;
+ * that is the responsibility of {@link StatefulIndex}.
+ * */
public class CombinedIndexReader {
private final Logger logger = LoggerFactory.getLogger(getClass());
@@ -66,12 +80,91 @@ public class CombinedIndexReader {
}
/** Returns the number of occurrences of the word in the full index */
- public long numHits(long word) {
+ public int numHits(long word) {
return reverseIndexFullReader.numDocuments(word);
}
+ public List createQueries(SearchTerms terms, QueryParams params) {
+
+ if (!isLoaded()) {
+ logger.warn("Index reader not ready");
+ return Collections.emptyList();
+ }
+
+ List queryHeads = new ArrayList<>(10);
+
+ final long[] termPriority = terms.sortedDistinctIncludes(this::compareKeywords);
+ List paths = CompiledQueryAggregates.queriesAggregate(terms.compiledQuery());
+
+ // Remove any paths that do not contain all prioritized terms, as this means
+ // the term is missing from the index and can never be found
+ paths.removeIf(containsAll(termPriority).negate());
+
+ for (var path : paths) {
+ LongList elements = new LongArrayList(path);
+
+ elements.sort((a, b) -> {
+ for (int i = 0; i < termPriority.length; i++) {
+ if (termPriority[i] == a)
+ return -1;
+ if (termPriority[i] == b)
+ return 1;
+ }
+ return 0;
+ });
+
+ var head = findFullWord(elements.getLong(0));
+ for (int i = 1; i < elements.size(); i++) {
+ head.addInclusionFilter(hasWordFull(elements.getLong(i)));
+ }
+ queryHeads.add(head);
+
+ // If there are few paths, we can afford to check the priority index as well
+ if (paths.size() < 4) {
+ var prioHead = findPriorityWord(elements.getLong(0));
+ for (int i = 1; i < elements.size(); i++) {
+ prioHead.addInclusionFilter(hasWordPrio(elements.getLong(i)));
+ }
+ queryHeads.add(prioHead);
+ }
+ }
+
+ // Add additional conditions to the query heads
+ for (var query : queryHeads) {
+
+ // Advice terms are a special case, mandatory but not ranked, and exempt from re-writing
+ for (long term : terms.advice()) {
+ query = query.also(term);
+ }
+
+ for (long term : terms.excludes()) {
+ query = query.not(term);
+ }
+
+ // Run these filter steps last, as they'll worst-case cause as many page faults as there are
+ // items in the buffer
+ query.addInclusionFilter(filterForParams(params));
+ }
+
+ return queryHeads
+ .stream()
+ .map(IndexQueryBuilder::build)
+ .toList();
+ }
+
+ private Predicate containsAll(long[] permitted) {
+ LongSet permittedTerms = new LongOpenHashSet(permitted);
+ return permittedTerms::containsAll;
+ }
+
+ private int compareKeywords(long a, long b) {
+ return Long.compare(
+ numHits(a),
+ numHits(b)
+ );
+ }
/** Returns the number of occurrences of the word in the priority index */
- public long numHitsPrio(long word) {
+ public int numHitsPrio(long word) {
return reverseIndexPriorityReader.numDocuments(word);
}
diff --git a/code/index/java/nu/marginalia/index/index/StatefulIndex.java b/code/index/java/nu/marginalia/index/index/StatefulIndex.java
index 74ca220f..7da5f74b 100644
--- a/code/index/java/nu/marginalia/index/index/StatefulIndex.java
+++ b/code/index/java/nu/marginalia/index/index/StatefulIndex.java
@@ -2,32 +2,19 @@ package nu.marginalia.index.index;
import com.google.inject.Inject;
import com.google.inject.Singleton;
-import it.unimi.dsi.fastutil.longs.*;
-import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
-import nu.marginalia.index.query.filter.QueryFilterAllOf;
-import nu.marginalia.index.query.filter.QueryFilterAnyOf;
-import nu.marginalia.index.query.filter.QueryFilterStepIf;
-import nu.marginalia.index.results.model.ids.CombinedDocIdList;
-import nu.marginalia.index.results.model.ids.DocMetadataList;
-import nu.marginalia.index.model.QueryParams;
import nu.marginalia.index.IndexFactory;
-import nu.marginalia.index.model.SearchTerms;
-import nu.marginalia.index.query.*;
import nu.marginalia.service.control.ServiceEventLog;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
-import java.util.*;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
-import java.util.function.Predicate;
-/** This class delegates SearchIndexReader and deals with the stateful nature of the index,
+/** This class holds {@link CombinedIndexReader} and deals with the stateful nature of the index,
* i.e. it may be possible to reconstruct the index and load a new set of data.
- *
*/
@Singleton
public class StatefulIndex {
@@ -108,109 +95,11 @@ public class StatefulIndex {
return combinedIndexReader != null && combinedIndexReader.isLoaded();
}
- public List createQueries(SearchTerms terms, QueryParams params) {
-
- if (!isLoaded()) {
- logger.warn("Index reader not ready");
- return Collections.emptyList();
- }
-
- List queryHeads = new ArrayList<>(10);
-
- final long[] termPriority = terms.sortedDistinctIncludes(this::compareKeywords);
- List paths = CompiledQueryAggregates.queriesAggregate(terms.compiledQuery());
-
- // Remove any paths that do not contain all prioritized terms, as this means
- // the term is missing from the index and can never be found
- paths.removeIf(containsAll(termPriority).negate());
-
- for (var path : paths) {
- LongList elements = new LongArrayList(path);
-
- elements.sort((a, b) -> {
- for (int i = 0; i < termPriority.length; i++) {
- if (termPriority[i] == a)
- return -1;
- if (termPriority[i] == b)
- return 1;
- }
- return 0;
- });
-
- var head = combinedIndexReader.findFullWord(elements.getLong(0));
- for (int i = 1; i < elements.size(); i++) {
- head.addInclusionFilter(combinedIndexReader.hasWordFull(elements.getLong(i)));
- }
- queryHeads.add(head);
-
- // If there are few paths, we can afford to check the priority index as well
- if (paths.size() < 4) {
- var prioHead = combinedIndexReader.findPriorityWord(elements.getLong(0));
- for (int i = 1; i < elements.size(); i++) {
- prioHead.addInclusionFilter(combinedIndexReader.hasWordPrio(elements.getLong(i)));
- }
- queryHeads.add(prioHead);
- }
- }
-
- // Add additional conditions to the query heads
- for (var query : queryHeads) {
-
- // Advice terms are a special case, mandatory but not ranked, and exempt from re-writing
- for (long term : terms.advice()) {
- query = query.also(term);
- }
-
- for (long term : terms.excludes()) {
- query = query.not(term);
- }
-
- // Run these filter steps last, as they'll worst-case cause as many page faults as there are
- // items in the buffer
- query.addInclusionFilter(combinedIndexReader.filterForParams(params));
- }
-
- return queryHeads
- .stream()
- .map(IndexQueryBuilder::build)
- .toList();
- }
-
- private Predicate containsAll(long[] permitted) {
- LongSet permittedTerms = new LongOpenHashSet(permitted);
- return permittedTerms::containsAll;
- }
-
- private int compareKeywords(long a, long b) {
- return Long.compare(
- combinedIndexReader.numHits(a),
- combinedIndexReader.numHits(b)
- );
- }
-
- /** Return an array of encoded document metadata longs corresponding to the
- * document identifiers provided; with metadata for termId. The input array
- * docs[] *must* be sorted.
+ /** Returns the current index reader. It is acceptable to hold the returned value for the duration of the query,
+ * but not to share it between queries.
*/
- public DocMetadataList getTermMetadata(long termId, CombinedDocIdList docs) {
- return combinedIndexReader.getMetadata(termId, docs);
- }
- public long getDocumentMetadata(long docId) {
- return combinedIndexReader.getDocumentMetadata(docId);
+ public CombinedIndexReader get() {
+ return combinedIndexReader;
}
- public int getHtmlFeatures(long docId) {
- return combinedIndexReader.getHtmlFeatures(docId);
- }
-
- public int getTotalDocCount() {
- return combinedIndexReader.totalDocCount();
- }
- public int getTermFrequency(long id) {
- return (int) combinedIndexReader.numHits(id);
- }
-
- public int getTermFrequencyPrio(long id) {
- return (int) combinedIndexReader.numHitsPrio(id);
- }
}
diff --git a/code/index/java/nu/marginalia/index/results/IndexMetadataService.java b/code/index/java/nu/marginalia/index/results/IndexMetadataService.java
index a43f9436..d068c0f4 100644
--- a/code/index/java/nu/marginalia/index/results/IndexMetadataService.java
+++ b/code/index/java/nu/marginalia/index/results/IndexMetadataService.java
@@ -18,21 +18,24 @@ import static nu.marginalia.index.results.model.TermCoherenceGroupList.TermCoher
import static nu.marginalia.index.results.model.TermMetadataForCombinedDocumentIds.DocumentsWithMetadata;
public class IndexMetadataService {
- private final StatefulIndex index;
+ private final StatefulIndex statefulIndex;
@Inject
public IndexMetadataService(StatefulIndex index) {
- this.index = index;
+ this.statefulIndex = index;
}
public TermMetadataForCombinedDocumentIds getTermMetadataForDocuments(CombinedDocIdList combinedIdsAll,
TermIdList termIdsList)
{
+ var currentIndex = statefulIndex.get();
+
Long2ObjectArrayMap termdocToMeta =
new Long2ObjectArrayMap<>(termIdsList.size());
for (long termId : termIdsList.array()) {
- var metadata = index.getTermMetadata(termId, combinedIdsAll);
+ var metadata = currentIndex.getMetadata(termId, combinedIdsAll);
+
termdocToMeta.put(termId,
new DocumentsWithMetadata(combinedIdsAll, metadata));
}
diff --git a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java b/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java
index f642dfc0..0fc4bdc1 100644
--- a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java
+++ b/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java
@@ -5,6 +5,7 @@ import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggre
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
+import nu.marginalia.index.index.CombinedIndexReader;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.model.SearchParameters;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
@@ -24,7 +25,7 @@ import java.util.List;
* It holds the data required to perform the scoring, as there is strong
* reasons to cache this data, and performs the calculations */
public class IndexResultValuationContext {
- private final StatefulIndex statefulIndex;
+ private final CombinedIndexReader index;
private final QueryParams queryParams;
private final TermMetadataForCombinedDocumentIds termMetadataForCombinedDocumentIds;
@@ -42,7 +43,7 @@ public class IndexResultValuationContext {
ResultRankingContext rankingContext,
SearchParameters params
) {
- this.statefulIndex = statefulIndex;
+ this.index = statefulIndex.get();
this.rankingContext = rankingContext;
this.searchResultValuator = searchResultValuator;
@@ -67,8 +68,8 @@ public class IndexResultValuationContext {
if (!searchTerms.coherences.test(termMetadataForCombinedDocumentIds, combinedId))
return null;
- long docMetadata = statefulIndex.getDocumentMetadata(docId);
- int htmlFeatures = statefulIndex.getHtmlFeatures(docId);
+ long docMetadata = index.getDocumentMetadata(docId);
+ int htmlFeatures = index.getHtmlFeatures(docId);
SearchResultItem searchResult = new SearchResultItem(docId,
docMetadata,
diff --git a/code/libraries/btree/java/nu/marginalia/btree/BTreeReader.java b/code/libraries/btree/java/nu/marginalia/btree/BTreeReader.java
index 1663f1c2..f74d2370 100644
--- a/code/libraries/btree/java/nu/marginalia/btree/BTreeReader.java
+++ b/code/libraries/btree/java/nu/marginalia/btree/BTreeReader.java
@@ -19,7 +19,7 @@ public class BTreeReader {
public BTreeReader(LongArray file, BTreeContext ctx, long offset) {
this.ctx = ctx;
- this.header = readHeader(file, offset);
+ this.header = new BTreeHeader(file, offset);
dataBlockEnd = (long) ctx.entrySize * header.numEntries();
index = file.range(header.indexOffsetLongs(), header.dataOffsetLongs());
@@ -35,10 +35,6 @@ public class BTreeReader {
return index;
}
- public static BTreeHeader readHeader(LongArray file, long fileOffset) {
- return new BTreeHeader(file, fileOffset);
- }
-
public BTreeHeader getHeader() {
return header;
}
@@ -153,7 +149,6 @@ public class BTreeReader {
pointer.walkToData(keys[i]);
long dataAddress = pointer.findData(keys[i]);
-
if (dataAddress >= 0) {
ret[i] = data.get(dataAddress + offset);
}