(index) Refactor to reduce the level of indirection

Viktor Lofgren 2024-05-19 12:40:33 +02:00
parent daf2a8df54
commit 4fcd4a8197
7 changed files with 129 additions and 145 deletions
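The heart of the change: StatefulIndex previously re-exported the underlying CombinedIndexReader's functionality through delegate methods (getTermFrequency, getDocumentMetadata, createQueries, and so on). After this commit, callers fetch the reader once per query via StatefulIndex.get() and call it directly. A minimal before/after sketch of a call site, using only names that appear in the diffs below:

    // Before: every read passed through a StatefulIndex delegate method
    full[idx] = index.getTermFrequency(id);
    prio[idx] = index.getTermFrequencyPrio(id);

    // After: fetch the reader once, then use it directly for the rest of the query
    var currentIndex = statefulIndex.get();
    full[idx] = currentIndex.numHits(id);
    prio[idx] = currentIndex.numHitsPrio(id);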

ReversePreindexFinalizeTest.java

@@ -69,8 +69,8 @@ class ReversePreindexFinalizeTest {
         var docsArray = LongArrayFactory.mmapForReadingConfined(docsFile);
         var wordsArray = LongArrayFactory.mmapForReadingConfined(wordsFile);

-        var docsHeader = BTreeReader.readHeader(docsArray, 0);
-        var wordsHeader = BTreeReader.readHeader(wordsArray, 0);
+        var docsHeader = new BTreeHeader(docsArray, 0);
+        var wordsHeader = new BTreeHeader(wordsArray, 0);

         assertEquals(1, docsHeader.numEntries());
         assertEquals(1, wordsHeader.numEntries());
@@ -107,7 +107,7 @@ class ReversePreindexFinalizeTest {
         var wordsArray = LongArrayFactory.mmapForReadingConfined(wordsFile);

-        var wordsHeader = BTreeReader.readHeader(wordsArray, 0);
+        var wordsHeader = new BTreeHeader(wordsArray, 0);

         System.out.println(wordsHeader);
@@ -123,14 +123,14 @@ class ReversePreindexFinalizeTest {
         BTreeHeader docsHeader;

-        docsHeader = BTreeReader.readHeader(docsArray, offset1);
+        docsHeader = new BTreeHeader(docsArray, offset1);
         System.out.println(docsHeader);
         assertEquals(1, docsHeader.numEntries());

         assertEquals(100, docsArray.get(docsHeader.dataOffsetLongs() + 0));
         assertEquals(51, docsArray.get(docsHeader.dataOffsetLongs() + 1));

-        docsHeader = BTreeReader.readHeader(docsArray, offset2);
+        docsHeader = new BTreeHeader(docsArray, offset2);
         System.out.println(docsHeader);
         assertEquals(1, docsHeader.numEntries());

IndexGrpcService.java

@@ -78,7 +78,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
             .labelNames("node")
             .register();

-    private final StatefulIndex index;
+    private final StatefulIndex statefulIndex;
     private final SearchSetsService searchSetsService;
     private final IndexResultValuatorService resultValuator;
@@ -89,13 +89,13 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
     @Inject
     public IndexGrpcService(ServiceConfiguration serviceConfiguration,
-                            StatefulIndex index,
+                            StatefulIndex statefulIndex,
                             SearchSetsService searchSetsService,
                             IndexResultValuatorService resultValuator)
     {
         var nodeId = serviceConfiguration.node();

         this.nodeName = Integer.toString(nodeId);
-        this.index = index;
+        this.statefulIndex = statefulIndex;
         this.searchSetsService = searchSetsService;
         this.resultValuator = resultValuator;
     }
@@ -207,7 +207,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
     private SearchResultSet executeSearch(SearchParameters params) throws SQLException, InterruptedException {

-        if (!index.isLoaded()) {
+        if (!statefulIndex.isLoaded()) {
             // Short-circuit if the index is not loaded, as we trivially know that there can be no results
             return new SearchResultSet(List.of());
         }
@@ -268,7 +268,8 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
         var terms = new SearchTerms(parameters.query, parameters.compiledQueryIds);

-        for (var indexQuery : index.createQueries(terms, parameters.queryParams)) {
+        var currentIndex = statefulIndex.get();
+        for (var indexQuery : currentIndex.createQueries(terms, parameters.queryParams)) {
             workerPool.execute(new IndexLookup(indexQuery, parameters.budget));
         }
@@ -435,10 +436,12 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
         BitSet ngramsMask = new BitSet(compiledQuery.size());
         BitSet regularMask = new BitSet(compiledQuery.size());

+        var currentIndex = statefulIndex.get();
+
         for (int idx = 0; idx < compiledQueryIds.size(); idx++) {
             long id = compiledQueryIds.at(idx);
-            full[idx] = index.getTermFrequency(id);
-            prio[idx] = index.getTermFrequencyPrio(id);
+            full[idx] = currentIndex.numHits(id);
+            prio[idx] = currentIndex.numHitsPrio(id);

             if (compiledQuery.at(idx).contains("_")) {
                 ngramsMask.set(idx);
@@ -448,7 +451,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
             }
         }

-        return new ResultRankingContext(index.getTotalDocCount(),
+        return new ResultRankingContext(currentIndex.totalDocCount(),
                 rankingParams,
                 ngramsMask,
                 regularMask,

CombinedIndexReader.java

@@ -1,8 +1,14 @@
 package nu.marginalia.index.index;

+import it.unimi.dsi.fastutil.longs.LongArrayList;
+import it.unimi.dsi.fastutil.longs.LongList;
+import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
+import it.unimi.dsi.fastutil.longs.LongSet;
+import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
 import nu.marginalia.index.ReverseIndexReader;
 import nu.marginalia.index.forward.ForwardIndexReader;
 import nu.marginalia.index.model.QueryParams;
+import nu.marginalia.index.model.SearchTerms;
 import nu.marginalia.index.query.IndexQuery;
 import nu.marginalia.index.query.IndexQueryBuilder;
 import nu.marginalia.index.query.filter.QueryFilterStepIf;
@@ -15,9 +21,17 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import java.time.Duration;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
 import java.util.concurrent.TimeUnit;
+import java.util.function.Predicate;

-/** A reader for the combined forward and reverse indexes */
+/** A reader for the combined forward and reverse indexes.
+ * <p></p>
+ * This class does not deal with the lifecycle of the indexes,
+ * that is the responsibility of {@link StatefulIndex}.
+ * */
 public class CombinedIndexReader {

     private final Logger logger = LoggerFactory.getLogger(getClass());
@@ -66,12 +80,91 @@ public class CombinedIndexReader {
     }

     /** Returns the number of occurrences of the word in the full index */
-    public long numHits(long word) {
+    public int numHits(long word) {
         return reverseIndexFullReader.numDocuments(word);
     }

+    public List<IndexQuery> createQueries(SearchTerms terms, QueryParams params) {
+
+        if (!isLoaded()) {
+            logger.warn("Index reader not ready");
+            return Collections.emptyList();
+        }
+
+        List<IndexQueryBuilder> queryHeads = new ArrayList<>(10);
+
+        final long[] termPriority = terms.sortedDistinctIncludes(this::compareKeywords);
+        List<LongSet> paths = CompiledQueryAggregates.queriesAggregate(terms.compiledQuery());
+
+        // Remove any paths that do not contain all prioritized terms, as this means
+        // the term is missing from the index and can never be found
+        paths.removeIf(containsAll(termPriority).negate());
+
+        for (var path : paths) {
+            LongList elements = new LongArrayList(path);
+
+            elements.sort((a, b) -> {
+                for (int i = 0; i < termPriority.length; i++) {
+                    if (termPriority[i] == a)
+                        return -1;
+                    if (termPriority[i] == b)
+                        return 1;
+                }
+                return 0;
+            });
+
+            var head = findFullWord(elements.getLong(0));
+            for (int i = 1; i < elements.size(); i++) {
+                head.addInclusionFilter(hasWordFull(elements.getLong(i)));
+            }
+            queryHeads.add(head);
+
+            // If there are few paths, we can afford to check the priority index as well
+            if (paths.size() < 4) {
+                var prioHead = findPriorityWord(elements.getLong(0));
+                for (int i = 1; i < elements.size(); i++) {
+                    prioHead.addInclusionFilter(hasWordPrio(elements.getLong(i)));
+                }
+                queryHeads.add(prioHead);
+            }
+        }
+
+        // Add additional conditions to the query heads
+        for (var query : queryHeads) {
+
+            // Advice terms are a special case, mandatory but not ranked, and exempt from re-writing
+            for (long term : terms.advice()) {
+                query = query.also(term);
+            }
+
+            for (long term : terms.excludes()) {
+                query = query.not(term);
+            }
+
+            // Run these filter steps last, as they'll worst-case cause as many page faults as there are
+            // items in the buffer
+            query.addInclusionFilter(filterForParams(params));
+        }
+
+        return queryHeads
+                .stream()
+                .map(IndexQueryBuilder::build)
+                .toList();
+    }
+
+    private Predicate<LongSet> containsAll(long[] permitted) {
+        LongSet permittedTerms = new LongOpenHashSet(permitted);

+        return permittedTerms::containsAll;
+    }
+
+    private int compareKeywords(long a, long b) {
+        return Long.compare(
+                numHits(a),
+                numHits(b)
+        );
+    }
+
     /** Returns the number of occurrences of the word in the priority index */
-    public long numHitsPrio(long word) {
+    public int numHitsPrio(long word) {
         return reverseIndexPriorityReader.numDocuments(word);
     }

StatefulIndex.java

@@ -2,32 +2,19 @@ package nu.marginalia.index.index;
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
-import it.unimi.dsi.fastutil.longs.*;
-import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
-import nu.marginalia.index.query.filter.QueryFilterAllOf;
-import nu.marginalia.index.query.filter.QueryFilterAnyOf;
-import nu.marginalia.index.query.filter.QueryFilterStepIf;
-import nu.marginalia.index.results.model.ids.CombinedDocIdList;
-import nu.marginalia.index.results.model.ids.DocMetadataList;
-import nu.marginalia.index.model.QueryParams;
 import nu.marginalia.index.IndexFactory;
-import nu.marginalia.index.model.SearchTerms;
-import nu.marginalia.index.query.*;
 import nu.marginalia.service.control.ServiceEventLog;
 import org.jetbrains.annotations.NotNull;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import java.io.IOException;
-import java.util.*;
 import java.util.concurrent.locks.Lock;
 import java.util.concurrent.locks.ReadWriteLock;
 import java.util.concurrent.locks.ReentrantReadWriteLock;
-import java.util.function.Predicate;

-/** This class delegates SearchIndexReader and deals with the stateful nature of the index,
+/** This class holds {@link CombinedIndexReader} and deals with the stateful nature of the index,
  * i.e. it may be possible to reconstruct the index and load a new set of data.
- *
  */
 @Singleton
 public class StatefulIndex {
@@ -108,109 +95,11 @@ public class StatefulIndex {
         return combinedIndexReader != null && combinedIndexReader.isLoaded();
     }

-    public List<IndexQuery> createQueries(SearchTerms terms, QueryParams params) {
-
-        if (!isLoaded()) {
-            logger.warn("Index reader not ready");
-            return Collections.emptyList();
-        }
-
-        List<IndexQueryBuilder> queryHeads = new ArrayList<>(10);
-
-        final long[] termPriority = terms.sortedDistinctIncludes(this::compareKeywords);
-        List<LongSet> paths = CompiledQueryAggregates.queriesAggregate(terms.compiledQuery());
-
-        // Remove any paths that do not contain all prioritized terms, as this means
-        // the term is missing from the index and can never be found
-        paths.removeIf(containsAll(termPriority).negate());
-
-        for (var path : paths) {
-            LongList elements = new LongArrayList(path);
-
-            elements.sort((a, b) -> {
-                for (int i = 0; i < termPriority.length; i++) {
-                    if (termPriority[i] == a)
-                        return -1;
-                    if (termPriority[i] == b)
-                        return 1;
-                }
-                return 0;
-            });
-
-            var head = combinedIndexReader.findFullWord(elements.getLong(0));
-            for (int i = 1; i < elements.size(); i++) {
-                head.addInclusionFilter(combinedIndexReader.hasWordFull(elements.getLong(i)));
-            }
-            queryHeads.add(head);
-
-            // If there are few paths, we can afford to check the priority index as well
-            if (paths.size() < 4) {
-                var prioHead = combinedIndexReader.findPriorityWord(elements.getLong(0));
-                for (int i = 1; i < elements.size(); i++) {
-                    prioHead.addInclusionFilter(combinedIndexReader.hasWordPrio(elements.getLong(i)));
-                }
-                queryHeads.add(prioHead);
-            }
-        }
-
-        // Add additional conditions to the query heads
-        for (var query : queryHeads) {
-
-            // Advice terms are a special case, mandatory but not ranked, and exempt from re-writing
-            for (long term : terms.advice()) {
-                query = query.also(term);
-            }
-
-            for (long term : terms.excludes()) {
-                query = query.not(term);
-            }
-
-            // Run these filter steps last, as they'll worst-case cause as many page faults as there are
-            // items in the buffer
-            query.addInclusionFilter(combinedIndexReader.filterForParams(params));
-        }
-
-        return queryHeads
-                .stream()
-                .map(IndexQueryBuilder::build)
-                .toList();
-    }
-
-    private Predicate<LongSet> containsAll(long[] permitted) {
-        LongSet permittedTerms = new LongOpenHashSet(permitted);
-
-        return permittedTerms::containsAll;
-    }
-
-    private int compareKeywords(long a, long b) {
-        return Long.compare(
-                combinedIndexReader.numHits(a),
-                combinedIndexReader.numHits(b)
-        );
-    }
-
-    /** Return an array of encoded document metadata longs corresponding to the
-     * document identifiers provided; with metadata for termId. The input array
-     * docs[] *must* be sorted.
+    /** Returns the current index reader. It is acceptable to hold the returned value for the duration of the query,
+     * but not share it between queries
      */
-    public DocMetadataList getTermMetadata(long termId, CombinedDocIdList docs) {
-        return combinedIndexReader.getMetadata(termId, docs);
-    }
-
-    public long getDocumentMetadata(long docId) {
-        return combinedIndexReader.getDocumentMetadata(docId);
-    }
-
-    public int getHtmlFeatures(long docId) {
-        return combinedIndexReader.getHtmlFeatures(docId);
-    }
-
-    public int getTotalDocCount() {
-        return combinedIndexReader.totalDocCount();
-    }
-
-    public int getTermFrequency(long id) {
-        return (int) combinedIndexReader.numHits(id);
-    }
-
-    public int getTermFrequencyPrio(long id) {
-        return (int) combinedIndexReader.numHitsPrio(id);
+    public CombinedIndexReader get() {
+        return combinedIndexReader;
     }
 }
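The javadoc on get() carries the contract that makes the refactor safe: a reader fetched at the start of a query remains consistent for that query, but caching it any longer would pin a stale reader across an index reload. A sketch of the intended usage, with a hypothetical handleQuery() for illustration:

    // OK: one get() per query, so every read sees the same index snapshot
    void handleQuery(SearchTerms terms, QueryParams params) {
        CombinedIndexReader reader = statefulIndex.get();
        if (reader == null || !reader.isLoaded())
            return; // index not loaded yet, trivially no results

        for (IndexQuery query : reader.createQueries(terms, params)) {
            // ... evaluate against the same 'reader' object ...
        }
    }

    // Not OK: stashing the result of get() in a long-lived field,
    // which would outlive the next index reload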

IndexMetadataService.java

@@ -18,21 +18,24 @@ import static nu.marginalia.index.results.model.TermCoherenceGroupList.TermCoher
 import static nu.marginalia.index.results.model.TermMetadataForCombinedDocumentIds.DocumentsWithMetadata;

 public class IndexMetadataService {
-    private final StatefulIndex index;
+    private final StatefulIndex statefulIndex;

     @Inject
     public IndexMetadataService(StatefulIndex index) {
-        this.index = index;
+        this.statefulIndex = index;
     }

     public TermMetadataForCombinedDocumentIds getTermMetadataForDocuments(CombinedDocIdList combinedIdsAll,
                                                                           TermIdList termIdsList)
     {
+        var currentIndex = statefulIndex.get();
+
         Long2ObjectArrayMap<DocumentsWithMetadata> termdocToMeta =
                 new Long2ObjectArrayMap<>(termIdsList.size());

         for (long termId : termIdsList.array()) {
-            var metadata = index.getTermMetadata(termId, combinedIdsAll);
+            var metadata = currentIndex.getMetadata(termId, combinedIdsAll);
+
             termdocToMeta.put(termId,
                     new DocumentsWithMetadata(combinedIdsAll, metadata));
         }

IndexResultValuationContext.java

@@ -5,6 +5,7 @@ import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggre
 import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
 import nu.marginalia.api.searchquery.model.results.SearchResultItem;
 import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
+import nu.marginalia.index.index.CombinedIndexReader;
 import nu.marginalia.index.index.StatefulIndex;
 import nu.marginalia.index.model.SearchParameters;
 import nu.marginalia.index.results.model.ids.CombinedDocIdList;
@@ -24,7 +25,7 @@ import java.util.List;
  * It holds the data required to perform the scoring, as there is strong
  * reasons to cache this data, and performs the calculations */
 public class IndexResultValuationContext {
-    private final StatefulIndex statefulIndex;
+    private final CombinedIndexReader index;
     private final QueryParams queryParams;
     private final TermMetadataForCombinedDocumentIds termMetadataForCombinedDocumentIds;
@@ -42,7 +43,7 @@ public class IndexResultValuationContext {
                                        ResultRankingContext rankingContext,
                                        SearchParameters params
                                        ) {
-        this.statefulIndex = statefulIndex;
+        this.index = statefulIndex.get();
         this.rankingContext = rankingContext;
         this.searchResultValuator = searchResultValuator;
@@ -67,8 +68,8 @@ public class IndexResultValuationContext {
         if (!searchTerms.coherences.test(termMetadataForCombinedDocumentIds, combinedId))
             return null;

-        long docMetadata = statefulIndex.getDocumentMetadata(docId);
-        int htmlFeatures = statefulIndex.getHtmlFeatures(docId);
+        long docMetadata = index.getDocumentMetadata(docId);
+        int htmlFeatures = index.getHtmlFeatures(docId);

         SearchResultItem searchResult = new SearchResultItem(docId,
                 docMetadata,

BTreeReader.java

@@ -19,7 +19,7 @@ public class BTreeReader {
     public BTreeReader(LongArray file, BTreeContext ctx, long offset) {
         this.ctx = ctx;
-        this.header = readHeader(file, offset);
+        this.header = new BTreeHeader(file, offset);

         dataBlockEnd = (long) ctx.entrySize * header.numEntries();
         index = file.range(header.indexOffsetLongs(), header.dataOffsetLongs());
@@ -35,10 +35,6 @@ public class BTreeReader {
         return index;
     }

-    public static BTreeHeader readHeader(LongArray file, long fileOffset) {
-        return new BTreeHeader(file, fileOffset);
-    }
-
     public BTreeHeader getHeader() {
         return header;
     }
@@ -153,7 +149,6 @@ public class BTreeReader {
             pointer.walkToData(keys[i]);

             long dataAddress = pointer.findData(keys[i]);
-
             if (dataAddress >= 0) {
                 ret[i] = data.get(dataAddress + offset);
             }
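The deleted readHeader() was a one-line static wrapper around the BTreeHeader constructor, so callers, like the test at the top of this commit, now construct the header directly. A minimal sketch, assuming a mapped LongArray and a valid tree offset are at hand:

    // Before: BTreeHeader header = BTreeReader.readHeader(array, offset);
    // After: the constructor reads the header fields straight out of the array
    BTreeHeader header = new BTreeHeader(array, offset);

    long entries = header.numEntries();        // number of entries in the data block
    long dataStart = header.dataOffsetLongs(); // offset of the data block, in longs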