mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(index) Clean up the code
This commit is contained in:
parent
7dd8c78c6b
commit
55f627ed4c
@ -68,8 +68,12 @@ public class ReverseIndexReader {
|
||||
}
|
||||
|
||||
|
||||
long wordOffset(long wordId) {
|
||||
long idx = wordsBTreeReader.findEntry(wordId);
|
||||
/** Calculate the offset of the word in the documents.
|
||||
* If the return-value is negative, the term does not exist
|
||||
* in the index.
|
||||
*/
|
||||
long wordOffset(long termId) {
|
||||
long idx = wordsBTreeReader.findEntry(termId);
|
||||
|
||||
if (idx < 0)
|
||||
return -1L;
|
||||
@ -77,37 +81,43 @@ public class ReverseIndexReader {
|
||||
return words.get(wordsDataOffset + idx + 1);
|
||||
}
|
||||
|
||||
public EntrySource documents(long wordId) {
|
||||
public EntrySource documents(long termId) {
|
||||
if (null == words) {
|
||||
logger.warn("Reverse index is not ready, dropping query");
|
||||
return new EmptyEntrySource();
|
||||
}
|
||||
|
||||
long offset = wordOffset(wordId);
|
||||
long offset = wordOffset(termId);
|
||||
|
||||
if (offset < 0) return new EmptyEntrySource();
|
||||
if (offset < 0) // No documents
|
||||
return new EmptyEntrySource();
|
||||
|
||||
return new ReverseIndexEntrySource(name, createReaderNew(offset), 2, wordId);
|
||||
return new ReverseIndexEntrySource(name, createReaderNew(offset), 2, termId);
|
||||
}
|
||||
|
||||
public QueryFilterStepIf also(long wordId) {
|
||||
long offset = wordOffset(wordId);
|
||||
/** Create a filter step requiring the specified termId to exist in the documents */
|
||||
public QueryFilterStepIf also(long termId) {
|
||||
long offset = wordOffset(termId);
|
||||
|
||||
if (offset < 0) return new QueryFilterNoPass();
|
||||
if (offset < 0) // No documents
|
||||
return new QueryFilterNoPass();
|
||||
|
||||
return new ReverseIndexRetainFilter(createReaderNew(offset), name, wordId);
|
||||
return new ReverseIndexRetainFilter(createReaderNew(offset), name, termId);
|
||||
}
|
||||
|
||||
public QueryFilterStepIf not(long wordId) {
|
||||
long offset = wordOffset(wordId);
|
||||
/** Create a filter step requiring the specified termId to be absent from the documents */
|
||||
public QueryFilterStepIf not(long termId) {
|
||||
long offset = wordOffset(termId);
|
||||
|
||||
if (offset < 0) return new QueryFilterLetThrough();
|
||||
if (offset < 0) // No documents
|
||||
return new QueryFilterLetThrough();
|
||||
|
||||
return new ReverseIndexRejectFilter(createReaderNew(offset));
|
||||
}
|
||||
|
||||
public int numDocuments(long wordId) {
|
||||
long offset = wordOffset(wordId);
|
||||
/** Return the number of documents with the termId in the index */
|
||||
public int numDocuments(long termId) {
|
||||
long offset = wordOffset(termId);
|
||||
|
||||
if (offset < 0)
|
||||
return 0;
|
||||
@ -115,15 +125,20 @@ public class ReverseIndexReader {
|
||||
return createReaderNew(offset).numEntries();
|
||||
}
|
||||
|
||||
/** Create a BTreeReader for the document offset associated with a termId */
|
||||
private BTreeReader createReaderNew(long offset) {
|
||||
return new BTreeReader(documents, ReverseIndexParameters.docsBTreeContext, offset);
|
||||
return new BTreeReader(
|
||||
documents,
|
||||
ReverseIndexParameters.docsBTreeContext,
|
||||
offset);
|
||||
}
|
||||
|
||||
public long[] getTermMeta(long wordId, long[] docIds) {
|
||||
long offset = wordOffset(wordId);
|
||||
public long[] getTermMeta(long termId, long[] docIds) {
|
||||
long offset = wordOffset(termId);
|
||||
|
||||
if (offset < 0) {
|
||||
logger.debug("Missing offset for word {}", wordId);
|
||||
// This is likely a bug in the code, but we can't throw an exception here
|
||||
logger.debug("Missing offset for word {}", termId);
|
||||
return new long[docIds.length];
|
||||
}
|
||||
|
||||
@ -136,10 +151,9 @@ public class ReverseIndexReader {
|
||||
private boolean isUniqueAndSorted(long[] ids) {
|
||||
if (ids.length == 0)
|
||||
return true;
|
||||
long prev = ids[0];
|
||||
|
||||
for (int i = 1; i < ids.length; i++) {
|
||||
if(ids[i] <= prev)
|
||||
if(ids[i] <= ids[i-1])
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -41,6 +41,7 @@ public class CombinedIndexReader {
|
||||
public QueryFilterStepIf hasWordFull(long termId) {
|
||||
return reverseIndexFullReader.also(termId);
|
||||
}
|
||||
|
||||
public QueryFilterStepIf hasWordPrio(long termId) {
|
||||
return reverseIndexPriorityReader.also(termId);
|
||||
}
|
||||
|
@ -36,7 +36,7 @@ public class IndexQueryBuilderImpl implements IndexQueryBuilder {
|
||||
return this;
|
||||
}
|
||||
|
||||
public IndexQueryBuilder alsoFull(long termId) {
|
||||
public IndexQueryBuilder also(long termId) {
|
||||
|
||||
if (alreadyConsideredTerms.add(termId)) {
|
||||
query.addInclusionFilter(reverseIndexFullReader.also(termId));
|
||||
@ -45,16 +45,7 @@ public class IndexQueryBuilderImpl implements IndexQueryBuilder {
|
||||
return this;
|
||||
}
|
||||
|
||||
public IndexQueryBuilder alsoPrio(long termId) {
|
||||
|
||||
if (alreadyConsideredTerms.add(termId)) {
|
||||
query.addInclusionFilter(reverseIndexPrioReader.also(termId));
|
||||
}
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
public IndexQueryBuilder notFull(long termId) {
|
||||
public IndexQueryBuilder not(long termId) {
|
||||
|
||||
query.addInclusionFilter(reverseIndexFullReader.not(termId));
|
||||
|
||||
|
@ -75,7 +75,7 @@ public class QueryBranchWalker {
|
||||
// in practice only when an index doesn't contain all the search terms, so we can just
|
||||
// skip those paths
|
||||
if (!remainingPaths.isEmpty()) {
|
||||
logger.info("Dropping: {}", remainingPaths);
|
||||
logger.debug("Dropping: {}", remainingPaths);
|
||||
}
|
||||
|
||||
return ret;
|
||||
|
@ -125,59 +125,65 @@ public class StatefulIndex {
|
||||
// the term is missing from the index and can never be found
|
||||
paths.removeIf(containsAll(termPriority).negate());
|
||||
|
||||
List<QueryBranchWalker> helpers = QueryBranchWalker.create(termPriority, paths);
|
||||
List<QueryBranchWalker> walkers = QueryBranchWalker.create(termPriority, paths);
|
||||
|
||||
for (var helper : helpers) {
|
||||
for (var walker : walkers) {
|
||||
for (var builder : List.of(
|
||||
combinedIndexReader.findPriorityWord(helper.termId),
|
||||
combinedIndexReader.findFullWord(helper.termId)
|
||||
combinedIndexReader.findPriorityWord(walker.termId),
|
||||
combinedIndexReader.findFullWord(walker.termId)
|
||||
))
|
||||
{
|
||||
queryHeads.add(builder);
|
||||
|
||||
if (helper.atEnd())
|
||||
continue;
|
||||
if (walker.atEnd())
|
||||
continue; // Single term search query
|
||||
|
||||
// Add filter steps for the remaining combinations of terms
|
||||
List<QueryFilterStepIf> filterSteps = new ArrayList<>();
|
||||
for (var step : helper.next()) {
|
||||
for (var step : walker.next()) {
|
||||
filterSteps.add(createFilter(step, 0));
|
||||
}
|
||||
builder.addInclusionFilterAny(filterSteps);
|
||||
}
|
||||
}
|
||||
|
||||
List<IndexQuery> ret = new ArrayList<>(10);
|
||||
|
||||
// Add additional conditions to the query heads
|
||||
for (var query : queryHeads) {
|
||||
|
||||
// Advice terms are a special case, mandatory but not ranked, and exempt from re-writing
|
||||
for (long term : terms.advice()) {
|
||||
query = query.alsoFull(term);
|
||||
query = query.also(term);
|
||||
}
|
||||
|
||||
for (long term : terms.excludes()) {
|
||||
query = query.notFull(term);
|
||||
query = query.not(term);
|
||||
}
|
||||
|
||||
// Run these filter steps last, as they'll worst-case cause as many page faults as there are
|
||||
// items in the buffer
|
||||
ret.add(query.addInclusionFilter(combinedIndexReader.filterForParams(params)).build());
|
||||
query.addInclusionFilter(combinedIndexReader.filterForParams(params));
|
||||
}
|
||||
|
||||
|
||||
return ret;
|
||||
return queryHeads
|
||||
.stream()
|
||||
.map(IndexQueryBuilder::build)
|
||||
.toList();
|
||||
}
|
||||
|
||||
/** Recursively create a filter step based on the QBW and its children */
|
||||
private QueryFilterStepIf createFilter(QueryBranchWalker walker, int depth) {
|
||||
|
||||
// Create a filter for the current termId
|
||||
final QueryFilterStepIf ownFilterCondition = ownFilterCondition(walker, depth);
|
||||
|
||||
var childSteps = walker.next();
|
||||
|
||||
if (childSteps.isEmpty())
|
||||
if (childSteps.isEmpty()) // no children, and so we're satisfied with just a single filter condition
|
||||
return ownFilterCondition;
|
||||
|
||||
// If there are children, we append the filter conditions for each child as an anyOf condition
|
||||
// to the current filter condition
|
||||
|
||||
List<QueryFilterStepIf> combinedFilters = new ArrayList<>();
|
||||
|
||||
for (var step : childSteps) {
|
||||
@ -186,6 +192,7 @@ public class StatefulIndex {
|
||||
combinedFilters.add(new QueryFilterAllOf(ownFilterCondition, childFilter));
|
||||
}
|
||||
|
||||
// Flatten the filter conditions if there's only one branch
|
||||
if (combinedFilters.size() == 1)
|
||||
return combinedFilters.getFirst();
|
||||
else
|
||||
@ -196,7 +203,7 @@ public class StatefulIndex {
|
||||
private QueryFilterStepIf ownFilterCondition(QueryBranchWalker walker, int depth) {
|
||||
if (depth < 2) {
|
||||
// At shallow depths we prioritize terms that appear in the priority index,
|
||||
// to increase the odds we find "good" results before the sand runs out
|
||||
// to increase the odds we find "good" results before the execution timer runs out
|
||||
return new QueryFilterAnyOf(
|
||||
combinedIndexReader.hasWordPrio(walker.termId),
|
||||
combinedIndexReader.hasWordFull(walker.termId)
|
||||
|
@ -11,16 +11,11 @@ import java.util.List;
|
||||
public interface IndexQueryBuilder {
|
||||
/** Filters documents that also contain termId, within the full index.
|
||||
*/
|
||||
IndexQueryBuilder alsoFull(long termId);
|
||||
|
||||
/**
|
||||
* Filters documents that also contain the termId, within the priority index.
|
||||
*/
|
||||
IndexQueryBuilder alsoPrio(long termIds);
|
||||
IndexQueryBuilder also(long termId);
|
||||
|
||||
/** Excludes documents that contain termId, within the full index
|
||||
*/
|
||||
IndexQueryBuilder notFull(long termId);
|
||||
IndexQueryBuilder not(long termId);
|
||||
|
||||
IndexQueryBuilder addInclusionFilter(QueryFilterStepIf filterStep);
|
||||
IndexQueryBuilder addInclusionFilterAny(List<QueryFilterStepIf> filterStep);
|
||||
|
Loading…
Reference in New Issue
Block a user