diff --git a/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexFinalizeTest.java b/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexFinalizeTest.java index 3f742897..1ef2df4e 100644 --- a/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexFinalizeTest.java +++ b/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexFinalizeTest.java @@ -69,8 +69,8 @@ class ReversePreindexFinalizeTest { var docsArray = LongArrayFactory.mmapForReadingConfined(docsFile); var wordsArray = LongArrayFactory.mmapForReadingConfined(wordsFile); - var docsHeader = BTreeReader.readHeader(docsArray, 0); - var wordsHeader = BTreeReader.readHeader(wordsArray, 0); + var docsHeader = new BTreeHeader(docsArray, 0); + var wordsHeader = new BTreeHeader(wordsArray, 0); assertEquals(1, docsHeader.numEntries()); assertEquals(1, wordsHeader.numEntries()); @@ -107,7 +107,7 @@ class ReversePreindexFinalizeTest { var wordsArray = LongArrayFactory.mmapForReadingConfined(wordsFile); - var wordsHeader = BTreeReader.readHeader(wordsArray, 0); + var wordsHeader = new BTreeHeader(wordsArray, 0); System.out.println(wordsHeader); @@ -123,14 +123,14 @@ class ReversePreindexFinalizeTest { BTreeHeader docsHeader; - docsHeader = BTreeReader.readHeader(docsArray, offset1); + docsHeader = new BTreeHeader(docsArray, offset1); System.out.println(docsHeader); assertEquals(1, docsHeader.numEntries()); assertEquals(100, docsArray.get(docsHeader.dataOffsetLongs() + 0)); assertEquals(51, docsArray.get(docsHeader.dataOffsetLongs() + 1)); - docsHeader = BTreeReader.readHeader(docsArray, offset2); + docsHeader = new BTreeHeader(docsArray, offset2); System.out.println(docsHeader); assertEquals(1, docsHeader.numEntries()); diff --git a/code/index/java/nu/marginalia/index/IndexGrpcService.java b/code/index/java/nu/marginalia/index/IndexGrpcService.java index 07199f9e..1c430014 100644 --- a/code/index/java/nu/marginalia/index/IndexGrpcService.java +++ b/code/index/java/nu/marginalia/index/IndexGrpcService.java @@ -78,7 +78,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { .labelNames("node") .register(); - private final StatefulIndex index; + private final StatefulIndex statefulIndex; private final SearchSetsService searchSetsService; private final IndexResultValuatorService resultValuator; @@ -89,13 +89,13 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { @Inject public IndexGrpcService(ServiceConfiguration serviceConfiguration, - StatefulIndex index, + StatefulIndex statefulIndex, SearchSetsService searchSetsService, IndexResultValuatorService resultValuator) { var nodeId = serviceConfiguration.node(); this.nodeName = Integer.toString(nodeId); - this.index = index; + this.statefulIndex = statefulIndex; this.searchSetsService = searchSetsService; this.resultValuator = resultValuator; } @@ -207,7 +207,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { private SearchResultSet executeSearch(SearchParameters params) throws SQLException, InterruptedException { - if (!index.isLoaded()) { + if (!statefulIndex.isLoaded()) { // Short-circuit if the index is not loaded, as we trivially know that there can be no results return new SearchResultSet(List.of()); } @@ -268,7 +268,8 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { var terms = new SearchTerms(parameters.query, parameters.compiledQueryIds); - for (var indexQuery : index.createQueries(terms, parameters.queryParams)) { + var currentIndex = statefulIndex.get(); + for (var indexQuery : currentIndex.createQueries(terms, parameters.queryParams)) { workerPool.execute(new IndexLookup(indexQuery, parameters.budget)); } @@ -435,10 +436,12 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { BitSet ngramsMask = new BitSet(compiledQuery.size()); BitSet regularMask = new BitSet(compiledQuery.size()); + var currentIndex = statefulIndex.get(); + for (int idx = 0; idx < compiledQueryIds.size(); idx++) { long id = compiledQueryIds.at(idx); - full[idx] = index.getTermFrequency(id); - prio[idx] = index.getTermFrequencyPrio(id); + full[idx] = currentIndex.numHits(id); + prio[idx] = currentIndex.numHitsPrio(id); if (compiledQuery.at(idx).contains("_")) { ngramsMask.set(idx); @@ -448,7 +451,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { } } - return new ResultRankingContext(index.getTotalDocCount(), + return new ResultRankingContext(currentIndex.totalDocCount(), rankingParams, ngramsMask, regularMask, diff --git a/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java b/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java index 27a631f5..afc52094 100644 --- a/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java +++ b/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java @@ -1,8 +1,14 @@ package nu.marginalia.index.index; +import it.unimi.dsi.fastutil.longs.LongArrayList; +import it.unimi.dsi.fastutil.longs.LongList; +import it.unimi.dsi.fastutil.longs.LongOpenHashSet; +import it.unimi.dsi.fastutil.longs.LongSet; +import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; import nu.marginalia.index.ReverseIndexReader; import nu.marginalia.index.forward.ForwardIndexReader; import nu.marginalia.index.model.QueryParams; +import nu.marginalia.index.model.SearchTerms; import nu.marginalia.index.query.IndexQuery; import nu.marginalia.index.query.IndexQueryBuilder; import nu.marginalia.index.query.filter.QueryFilterStepIf; @@ -15,9 +21,17 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.time.Duration; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; import java.util.concurrent.TimeUnit; +import java.util.function.Predicate; -/** A reader for the combined forward and reverse indexes */ +/** A reader for the combined forward and reverse indexes. + *

+ * This class does not deal with the lifecycle of the indexes, + * that is the responsibility of {@link StatefulIndex}. + * */ public class CombinedIndexReader { private final Logger logger = LoggerFactory.getLogger(getClass()); @@ -66,12 +80,91 @@ public class CombinedIndexReader { } /** Returns the number of occurrences of the word in the full index */ - public long numHits(long word) { + public int numHits(long word) { return reverseIndexFullReader.numDocuments(word); } + public List createQueries(SearchTerms terms, QueryParams params) { + + if (!isLoaded()) { + logger.warn("Index reader not ready"); + return Collections.emptyList(); + } + + List queryHeads = new ArrayList<>(10); + + final long[] termPriority = terms.sortedDistinctIncludes(this::compareKeywords); + List paths = CompiledQueryAggregates.queriesAggregate(terms.compiledQuery()); + + // Remove any paths that do not contain all prioritized terms, as this means + // the term is missing from the index and can never be found + paths.removeIf(containsAll(termPriority).negate()); + + for (var path : paths) { + LongList elements = new LongArrayList(path); + + elements.sort((a, b) -> { + for (int i = 0; i < termPriority.length; i++) { + if (termPriority[i] == a) + return -1; + if (termPriority[i] == b) + return 1; + } + return 0; + }); + + var head = findFullWord(elements.getLong(0)); + for (int i = 1; i < elements.size(); i++) { + head.addInclusionFilter(hasWordFull(elements.getLong(i))); + } + queryHeads.add(head); + + // If there are few paths, we can afford to check the priority index as well + if (paths.size() < 4) { + var prioHead = findPriorityWord(elements.getLong(0)); + for (int i = 1; i < elements.size(); i++) { + prioHead.addInclusionFilter(hasWordPrio(elements.getLong(i))); + } + queryHeads.add(prioHead); + } + } + + // Add additional conditions to the query heads + for (var query : queryHeads) { + + // Advice terms are a special case, mandatory but not ranked, and exempt from re-writing + for (long term : terms.advice()) { + query = query.also(term); + } + + for (long term : terms.excludes()) { + query = query.not(term); + } + + // Run these filter steps last, as they'll worst-case cause as many page faults as there are + // items in the buffer + query.addInclusionFilter(filterForParams(params)); + } + + return queryHeads + .stream() + .map(IndexQueryBuilder::build) + .toList(); + } + + private Predicate containsAll(long[] permitted) { + LongSet permittedTerms = new LongOpenHashSet(permitted); + return permittedTerms::containsAll; + } + + private int compareKeywords(long a, long b) { + return Long.compare( + numHits(a), + numHits(b) + ); + } /** Returns the number of occurrences of the word in the priority index */ - public long numHitsPrio(long word) { + public int numHitsPrio(long word) { return reverseIndexPriorityReader.numDocuments(word); } diff --git a/code/index/java/nu/marginalia/index/index/StatefulIndex.java b/code/index/java/nu/marginalia/index/index/StatefulIndex.java index 74ca220f..7da5f74b 100644 --- a/code/index/java/nu/marginalia/index/index/StatefulIndex.java +++ b/code/index/java/nu/marginalia/index/index/StatefulIndex.java @@ -2,32 +2,19 @@ package nu.marginalia.index.index; import com.google.inject.Inject; import com.google.inject.Singleton; -import it.unimi.dsi.fastutil.longs.*; -import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; -import nu.marginalia.index.query.filter.QueryFilterAllOf; -import nu.marginalia.index.query.filter.QueryFilterAnyOf; -import nu.marginalia.index.query.filter.QueryFilterStepIf; -import nu.marginalia.index.results.model.ids.CombinedDocIdList; -import nu.marginalia.index.results.model.ids.DocMetadataList; -import nu.marginalia.index.model.QueryParams; import nu.marginalia.index.IndexFactory; -import nu.marginalia.index.model.SearchTerms; -import nu.marginalia.index.query.*; import nu.marginalia.service.control.ServiceEventLog; import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; -import java.util.*; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; -import java.util.function.Predicate; -/** This class delegates SearchIndexReader and deals with the stateful nature of the index, +/** This class holds {@link CombinedIndexReader} and deals with the stateful nature of the index, * i.e. it may be possible to reconstruct the index and load a new set of data. - * */ @Singleton public class StatefulIndex { @@ -108,109 +95,11 @@ public class StatefulIndex { return combinedIndexReader != null && combinedIndexReader.isLoaded(); } - public List createQueries(SearchTerms terms, QueryParams params) { - - if (!isLoaded()) { - logger.warn("Index reader not ready"); - return Collections.emptyList(); - } - - List queryHeads = new ArrayList<>(10); - - final long[] termPriority = terms.sortedDistinctIncludes(this::compareKeywords); - List paths = CompiledQueryAggregates.queriesAggregate(terms.compiledQuery()); - - // Remove any paths that do not contain all prioritized terms, as this means - // the term is missing from the index and can never be found - paths.removeIf(containsAll(termPriority).negate()); - - for (var path : paths) { - LongList elements = new LongArrayList(path); - - elements.sort((a, b) -> { - for (int i = 0; i < termPriority.length; i++) { - if (termPriority[i] == a) - return -1; - if (termPriority[i] == b) - return 1; - } - return 0; - }); - - var head = combinedIndexReader.findFullWord(elements.getLong(0)); - for (int i = 1; i < elements.size(); i++) { - head.addInclusionFilter(combinedIndexReader.hasWordFull(elements.getLong(i))); - } - queryHeads.add(head); - - // If there are few paths, we can afford to check the priority index as well - if (paths.size() < 4) { - var prioHead = combinedIndexReader.findPriorityWord(elements.getLong(0)); - for (int i = 1; i < elements.size(); i++) { - prioHead.addInclusionFilter(combinedIndexReader.hasWordPrio(elements.getLong(i))); - } - queryHeads.add(prioHead); - } - } - - // Add additional conditions to the query heads - for (var query : queryHeads) { - - // Advice terms are a special case, mandatory but not ranked, and exempt from re-writing - for (long term : terms.advice()) { - query = query.also(term); - } - - for (long term : terms.excludes()) { - query = query.not(term); - } - - // Run these filter steps last, as they'll worst-case cause as many page faults as there are - // items in the buffer - query.addInclusionFilter(combinedIndexReader.filterForParams(params)); - } - - return queryHeads - .stream() - .map(IndexQueryBuilder::build) - .toList(); - } - - private Predicate containsAll(long[] permitted) { - LongSet permittedTerms = new LongOpenHashSet(permitted); - return permittedTerms::containsAll; - } - - private int compareKeywords(long a, long b) { - return Long.compare( - combinedIndexReader.numHits(a), - combinedIndexReader.numHits(b) - ); - } - - /** Return an array of encoded document metadata longs corresponding to the - * document identifiers provided; with metadata for termId. The input array - * docs[] *must* be sorted. + /** Returns the current index reader. It is acceptable to hold the returned value for the duration of the query, + * but not share it between queries */ - public DocMetadataList getTermMetadata(long termId, CombinedDocIdList docs) { - return combinedIndexReader.getMetadata(termId, docs); - } - public long getDocumentMetadata(long docId) { - return combinedIndexReader.getDocumentMetadata(docId); + public CombinedIndexReader get() { + return combinedIndexReader; } - public int getHtmlFeatures(long docId) { - return combinedIndexReader.getHtmlFeatures(docId); - } - - public int getTotalDocCount() { - return combinedIndexReader.totalDocCount(); - } - public int getTermFrequency(long id) { - return (int) combinedIndexReader.numHits(id); - } - - public int getTermFrequencyPrio(long id) { - return (int) combinedIndexReader.numHitsPrio(id); - } } diff --git a/code/index/java/nu/marginalia/index/results/IndexMetadataService.java b/code/index/java/nu/marginalia/index/results/IndexMetadataService.java index a43f9436..d068c0f4 100644 --- a/code/index/java/nu/marginalia/index/results/IndexMetadataService.java +++ b/code/index/java/nu/marginalia/index/results/IndexMetadataService.java @@ -18,21 +18,24 @@ import static nu.marginalia.index.results.model.TermCoherenceGroupList.TermCoher import static nu.marginalia.index.results.model.TermMetadataForCombinedDocumentIds.DocumentsWithMetadata; public class IndexMetadataService { - private final StatefulIndex index; + private final StatefulIndex statefulIndex; @Inject public IndexMetadataService(StatefulIndex index) { - this.index = index; + this.statefulIndex = index; } public TermMetadataForCombinedDocumentIds getTermMetadataForDocuments(CombinedDocIdList combinedIdsAll, TermIdList termIdsList) { + var currentIndex = statefulIndex.get(); + Long2ObjectArrayMap termdocToMeta = new Long2ObjectArrayMap<>(termIdsList.size()); for (long termId : termIdsList.array()) { - var metadata = index.getTermMetadata(termId, combinedIdsAll); + var metadata = currentIndex.getMetadata(termId, combinedIdsAll); + termdocToMeta.put(termId, new DocumentsWithMetadata(combinedIdsAll, metadata)); } diff --git a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java b/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java index f642dfc0..0fc4bdc1 100644 --- a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java +++ b/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java @@ -5,6 +5,7 @@ import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggre import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.SearchResultItem; import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; +import nu.marginalia.index.index.CombinedIndexReader; import nu.marginalia.index.index.StatefulIndex; import nu.marginalia.index.model.SearchParameters; import nu.marginalia.index.results.model.ids.CombinedDocIdList; @@ -24,7 +25,7 @@ import java.util.List; * It holds the data required to perform the scoring, as there is strong * reasons to cache this data, and performs the calculations */ public class IndexResultValuationContext { - private final StatefulIndex statefulIndex; + private final CombinedIndexReader index; private final QueryParams queryParams; private final TermMetadataForCombinedDocumentIds termMetadataForCombinedDocumentIds; @@ -42,7 +43,7 @@ public class IndexResultValuationContext { ResultRankingContext rankingContext, SearchParameters params ) { - this.statefulIndex = statefulIndex; + this.index = statefulIndex.get(); this.rankingContext = rankingContext; this.searchResultValuator = searchResultValuator; @@ -67,8 +68,8 @@ public class IndexResultValuationContext { if (!searchTerms.coherences.test(termMetadataForCombinedDocumentIds, combinedId)) return null; - long docMetadata = statefulIndex.getDocumentMetadata(docId); - int htmlFeatures = statefulIndex.getHtmlFeatures(docId); + long docMetadata = index.getDocumentMetadata(docId); + int htmlFeatures = index.getHtmlFeatures(docId); SearchResultItem searchResult = new SearchResultItem(docId, docMetadata, diff --git a/code/libraries/btree/java/nu/marginalia/btree/BTreeReader.java b/code/libraries/btree/java/nu/marginalia/btree/BTreeReader.java index 1663f1c2..f74d2370 100644 --- a/code/libraries/btree/java/nu/marginalia/btree/BTreeReader.java +++ b/code/libraries/btree/java/nu/marginalia/btree/BTreeReader.java @@ -19,7 +19,7 @@ public class BTreeReader { public BTreeReader(LongArray file, BTreeContext ctx, long offset) { this.ctx = ctx; - this.header = readHeader(file, offset); + this.header = new BTreeHeader(file, offset); dataBlockEnd = (long) ctx.entrySize * header.numEntries(); index = file.range(header.indexOffsetLongs(), header.dataOffsetLongs()); @@ -35,10 +35,6 @@ public class BTreeReader { return index; } - public static BTreeHeader readHeader(LongArray file, long fileOffset) { - return new BTreeHeader(file, fileOffset); - } - public BTreeHeader getHeader() { return header; } @@ -153,7 +149,6 @@ public class BTreeReader { pointer.walkToData(keys[i]); long dataAddress = pointer.findData(keys[i]); - if (dataAddress >= 0) { ret[i] = data.get(dataAddress + offset); }