From 55f627ed4c24331e2a400dd57a3abfa4e0e6e4db Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 11 Apr 2024 18:50:21 +0200 Subject: [PATCH] (index) Clean up the code --- .../marginalia/index/ReverseIndexReader.java | 56 ++++++++++++------- .../index/index/CombinedIndexReader.java | 1 + .../index/index/IndexQueryBuilderImpl.java | 13 +---- .../index/index/QueryBranchWalker.java | 2 +- .../marginalia/index/index/StatefulIndex.java | 39 +++++++------ .../index/query/IndexQueryBuilder.java | 9 +-- 6 files changed, 64 insertions(+), 56 deletions(-) diff --git a/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexReader.java b/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexReader.java index e37de80d..72feb7fd 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexReader.java +++ b/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexReader.java @@ -68,8 +68,12 @@ public class ReverseIndexReader { } - long wordOffset(long wordId) { - long idx = wordsBTreeReader.findEntry(wordId); + /** Calculate the offset of the word in the documents. + * If the return-value is negative, the term does not exist + * in the index. + */ + long wordOffset(long termId) { + long idx = wordsBTreeReader.findEntry(termId); if (idx < 0) return -1L; @@ -77,37 +81,43 @@ public class ReverseIndexReader { return words.get(wordsDataOffset + idx + 1); } - public EntrySource documents(long wordId) { + public EntrySource documents(long termId) { if (null == words) { logger.warn("Reverse index is not ready, dropping query"); return new EmptyEntrySource(); } - long offset = wordOffset(wordId); + long offset = wordOffset(termId); - if (offset < 0) return new EmptyEntrySource(); + if (offset < 0) // No documents + return new EmptyEntrySource(); - return new ReverseIndexEntrySource(name, createReaderNew(offset), 2, wordId); + return new ReverseIndexEntrySource(name, createReaderNew(offset), 2, termId); } - public QueryFilterStepIf also(long wordId) { - long offset = wordOffset(wordId); + /** Create a filter step requiring the specified termId to exist in the documents */ + public QueryFilterStepIf also(long termId) { + long offset = wordOffset(termId); - if (offset < 0) return new QueryFilterNoPass(); + if (offset < 0) // No documents + return new QueryFilterNoPass(); - return new ReverseIndexRetainFilter(createReaderNew(offset), name, wordId); + return new ReverseIndexRetainFilter(createReaderNew(offset), name, termId); } - public QueryFilterStepIf not(long wordId) { - long offset = wordOffset(wordId); + /** Create a filter step requiring the specified termId to be absent from the documents */ + public QueryFilterStepIf not(long termId) { + long offset = wordOffset(termId); - if (offset < 0) return new QueryFilterLetThrough(); + if (offset < 0) // No documents + return new QueryFilterLetThrough(); return new ReverseIndexRejectFilter(createReaderNew(offset)); } - public int numDocuments(long wordId) { - long offset = wordOffset(wordId); + /** Return the number of documents with the termId in the index */ + public int numDocuments(long termId) { + long offset = wordOffset(termId); if (offset < 0) return 0; @@ -115,15 +125,20 @@ public class ReverseIndexReader { return createReaderNew(offset).numEntries(); } + /** Create a BTreeReader for the document offset associated with a termId */ private BTreeReader createReaderNew(long offset) { - return new BTreeReader(documents, ReverseIndexParameters.docsBTreeContext, offset); + return new BTreeReader( + documents, + ReverseIndexParameters.docsBTreeContext, + offset); } - public long[] getTermMeta(long wordId, long[] docIds) { - long offset = wordOffset(wordId); + public long[] getTermMeta(long termId, long[] docIds) { + long offset = wordOffset(termId); if (offset < 0) { - logger.debug("Missing offset for word {}", wordId); + // This is likely a bug in the code, but we can't throw an exception here + logger.debug("Missing offset for word {}", termId); return new long[docIds.length]; } @@ -136,10 +151,9 @@ public class ReverseIndexReader { private boolean isUniqueAndSorted(long[] ids) { if (ids.length == 0) return true; - long prev = ids[0]; for (int i = 1; i < ids.length; i++) { - if(ids[i] <= prev) + if(ids[i] <= ids[i-1]) return false; } diff --git a/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java b/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java index 3846bad8..27a631f5 100644 --- a/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java +++ b/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java @@ -41,6 +41,7 @@ public class CombinedIndexReader { public QueryFilterStepIf hasWordFull(long termId) { return reverseIndexFullReader.also(termId); } + public QueryFilterStepIf hasWordPrio(long termId) { return reverseIndexPriorityReader.also(termId); } diff --git a/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java b/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java index 33ca033e..0f63fdbc 100644 --- a/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java +++ b/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java @@ -36,7 +36,7 @@ public class IndexQueryBuilderImpl implements IndexQueryBuilder { return this; } - public IndexQueryBuilder alsoFull(long termId) { + public IndexQueryBuilder also(long termId) { if (alreadyConsideredTerms.add(termId)) { query.addInclusionFilter(reverseIndexFullReader.also(termId)); @@ -45,16 +45,7 @@ public class IndexQueryBuilderImpl implements IndexQueryBuilder { return this; } - public IndexQueryBuilder alsoPrio(long termId) { - - if (alreadyConsideredTerms.add(termId)) { - query.addInclusionFilter(reverseIndexPrioReader.also(termId)); - } - - return this; - } - - public IndexQueryBuilder notFull(long termId) { + public IndexQueryBuilder not(long termId) { query.addInclusionFilter(reverseIndexFullReader.not(termId)); diff --git a/code/index/java/nu/marginalia/index/index/QueryBranchWalker.java b/code/index/java/nu/marginalia/index/index/QueryBranchWalker.java index 34b04f0a..ffaa5176 100644 --- a/code/index/java/nu/marginalia/index/index/QueryBranchWalker.java +++ b/code/index/java/nu/marginalia/index/index/QueryBranchWalker.java @@ -75,7 +75,7 @@ public class QueryBranchWalker { // in practice only when an index doesn't contain all the search terms, so we can just // skip those paths if (!remainingPaths.isEmpty()) { - logger.info("Dropping: {}", remainingPaths); + logger.debug("Dropping: {}", remainingPaths); } return ret; diff --git a/code/index/java/nu/marginalia/index/index/StatefulIndex.java b/code/index/java/nu/marginalia/index/index/StatefulIndex.java index 273da2d0..ae7b1353 100644 --- a/code/index/java/nu/marginalia/index/index/StatefulIndex.java +++ b/code/index/java/nu/marginalia/index/index/StatefulIndex.java @@ -125,59 +125,65 @@ public class StatefulIndex { // the term is missing from the index and can never be found paths.removeIf(containsAll(termPriority).negate()); - List helpers = QueryBranchWalker.create(termPriority, paths); + List walkers = QueryBranchWalker.create(termPriority, paths); - for (var helper : helpers) { + for (var walker : walkers) { for (var builder : List.of( - combinedIndexReader.findPriorityWord(helper.termId), - combinedIndexReader.findFullWord(helper.termId) + combinedIndexReader.findPriorityWord(walker.termId), + combinedIndexReader.findFullWord(walker.termId) )) { queryHeads.add(builder); - if (helper.atEnd()) - continue; + if (walker.atEnd()) + continue; // Single term search query + // Add filter steps for the remaining combinations of terms List filterSteps = new ArrayList<>(); - for (var step : helper.next()) { + for (var step : walker.next()) { filterSteps.add(createFilter(step, 0)); } builder.addInclusionFilterAny(filterSteps); } } - List ret = new ArrayList<>(10); // Add additional conditions to the query heads for (var query : queryHeads) { // Advice terms are a special case, mandatory but not ranked, and exempt from re-writing for (long term : terms.advice()) { - query = query.alsoFull(term); + query = query.also(term); } for (long term : terms.excludes()) { - query = query.notFull(term); + query = query.not(term); } // Run these filter steps last, as they'll worst-case cause as many page faults as there are // items in the buffer - ret.add(query.addInclusionFilter(combinedIndexReader.filterForParams(params)).build()); + query.addInclusionFilter(combinedIndexReader.filterForParams(params)); } - - return ret; + return queryHeads + .stream() + .map(IndexQueryBuilder::build) + .toList(); } /** Recursively create a filter step based on the QBW and its children */ private QueryFilterStepIf createFilter(QueryBranchWalker walker, int depth) { + + // Create a filter for the current termId final QueryFilterStepIf ownFilterCondition = ownFilterCondition(walker, depth); var childSteps = walker.next(); - - if (childSteps.isEmpty()) + if (childSteps.isEmpty()) // no children, and so we're satisfied with just a single filter condition return ownFilterCondition; + // If there are children, we append the filter conditions for each child as an anyOf condition + // to the current filter condition + List combinedFilters = new ArrayList<>(); for (var step : childSteps) { @@ -186,6 +192,7 @@ public class StatefulIndex { combinedFilters.add(new QueryFilterAllOf(ownFilterCondition, childFilter)); } + // Flatten the filter conditions if there's only one branch if (combinedFilters.size() == 1) return combinedFilters.getFirst(); else @@ -196,7 +203,7 @@ public class StatefulIndex { private QueryFilterStepIf ownFilterCondition(QueryBranchWalker walker, int depth) { if (depth < 2) { // At shallow depths we prioritize terms that appear in the priority index, - // to increase the odds we find "good" results before the sand runs out + // to increase the odds we find "good" results before the execution timer runs out return new QueryFilterAnyOf( combinedIndexReader.hasWordPrio(walker.termId), combinedIndexReader.hasWordFull(walker.termId) diff --git a/code/index/query/java/nu/marginalia/index/query/IndexQueryBuilder.java b/code/index/query/java/nu/marginalia/index/query/IndexQueryBuilder.java index 74ebdea1..855309fa 100644 --- a/code/index/query/java/nu/marginalia/index/query/IndexQueryBuilder.java +++ b/code/index/query/java/nu/marginalia/index/query/IndexQueryBuilder.java @@ -11,16 +11,11 @@ import java.util.List; public interface IndexQueryBuilder { /** Filters documents that also contain termId, within the full index. */ - IndexQueryBuilder alsoFull(long termId); - - /** - * Filters documents that also contain the termId, within the priority index. - */ - IndexQueryBuilder alsoPrio(long termIds); + IndexQueryBuilder also(long termId); /** Excludes documents that contain termId, within the full index */ - IndexQueryBuilder notFull(long termId); + IndexQueryBuilder not(long termId); IndexQueryBuilder addInclusionFilter(QueryFilterStepIf filterStep); IndexQueryBuilder addInclusionFilterAny(List filterStep);