(index) Clean up the code

2025-02-24 05:18:58 +00:00 · 2024-04-11 18:50:21 +02:00 · 2024-04-11 18:50:21 +02:00 · 55f627ed4c
commit 55f627ed4c
parent 7dd8c78c6b
6 changed files with 64 additions and 56 deletions
--- a/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexReader.java
+++ b/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexReader.java
@ -68,8 +68,12 @@ public class ReverseIndexReader {
    }


-    long wordOffset(long wordId) {
-        long idx = wordsBTreeReader.findEntry(wordId);
+    /** Calculate the offset of the word in the documents.
+     * If the return-value is negative, the term does not exist
+     * in the index.
+     */
+    long wordOffset(long termId) {
+        long idx = wordsBTreeReader.findEntry(termId);

        if (idx < 0)
            return -1L;
@ -77,37 +81,43 @@ public class ReverseIndexReader {
        return words.get(wordsDataOffset + idx + 1);
    }

-    public EntrySource documents(long wordId) {
+    public EntrySource documents(long termId) {
        if (null == words) {
            logger.warn("Reverse index is not ready, dropping query");
            return new EmptyEntrySource();
        }

-        long offset = wordOffset(wordId);
+        long offset = wordOffset(termId);

-        if (offset < 0) return new EmptyEntrySource();
+        if (offset < 0) // No documents
+            return new EmptyEntrySource();

-        return new ReverseIndexEntrySource(name, createReaderNew(offset), 2, wordId);
+        return new ReverseIndexEntrySource(name, createReaderNew(offset), 2, termId);
    }

-    public QueryFilterStepIf also(long wordId) {
-        long offset = wordOffset(wordId);
+    /** Create a filter step requiring the specified termId to exist in the documents */
+    public QueryFilterStepIf also(long termId) {
+        long offset = wordOffset(termId);

-        if (offset < 0) return new QueryFilterNoPass();
+        if (offset < 0) // No documents
+            return new QueryFilterNoPass();

-        return new ReverseIndexRetainFilter(createReaderNew(offset), name, wordId);
+        return new ReverseIndexRetainFilter(createReaderNew(offset), name, termId);
    }

-    public QueryFilterStepIf not(long wordId) {
-        long offset = wordOffset(wordId);
+    /** Create a filter step requiring the specified termId to be absent from the documents */
+    public QueryFilterStepIf not(long termId) {
+        long offset = wordOffset(termId);

-        if (offset < 0) return new QueryFilterLetThrough();
+        if (offset < 0) // No documents
+            return new QueryFilterLetThrough();

        return new ReverseIndexRejectFilter(createReaderNew(offset));
    }

-    public int numDocuments(long wordId) {
-        long offset = wordOffset(wordId);
+    /** Return the number of documents with the termId in the index */
+    public int numDocuments(long termId) {
+        long offset = wordOffset(termId);

        if (offset < 0)
            return 0;
@ -115,15 +125,20 @@ public class ReverseIndexReader {
        return createReaderNew(offset).numEntries();
    }

+    /** Create a BTreeReader for the document offset associated with a termId */
    private BTreeReader createReaderNew(long offset) {
-        return new BTreeReader(documents, ReverseIndexParameters.docsBTreeContext, offset);
+        return new BTreeReader(
+                documents,
+                ReverseIndexParameters.docsBTreeContext,
+                offset);
    }

-    public long[] getTermMeta(long wordId, long[] docIds) {
-        long offset = wordOffset(wordId);
+    public long[] getTermMeta(long termId, long[] docIds) {
+        long offset = wordOffset(termId);

        if (offset < 0) {
-            logger.debug("Missing offset for word {}", wordId);
+            // This is likely a bug in the code, but we can't throw an exception here
+            logger.debug("Missing offset for word {}", termId);
            return new long[docIds.length];
        }

@ -136,10 +151,9 @@ public class ReverseIndexReader {
    private boolean isUniqueAndSorted(long[] ids) {
        if (ids.length == 0)
            return true;
-        long prev = ids[0];

        for (int i = 1; i < ids.length; i++) {
-            if(ids[i] <= prev)
+            if(ids[i] <= ids[i-1])
                return false;
        }

--- a/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java
+++ b/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java
@ -41,6 +41,7 @@ public class CombinedIndexReader {
    public QueryFilterStepIf hasWordFull(long termId) {
        return reverseIndexFullReader.also(termId);
    }
+
    public QueryFilterStepIf hasWordPrio(long termId) {
        return reverseIndexPriorityReader.also(termId);
    }
--- a/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java
+++ b/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java
@ -36,7 +36,7 @@ public class IndexQueryBuilderImpl implements IndexQueryBuilder  {
        return this;
    }

-    public IndexQueryBuilder alsoFull(long termId) {
+    public IndexQueryBuilder also(long termId) {

        if (alreadyConsideredTerms.add(termId)) {
            query.addInclusionFilter(reverseIndexFullReader.also(termId));
@ -45,16 +45,7 @@ public class IndexQueryBuilderImpl implements IndexQueryBuilder  {
        return this;
    }

-    public IndexQueryBuilder alsoPrio(long termId) {
-
-        if (alreadyConsideredTerms.add(termId)) {
-            query.addInclusionFilter(reverseIndexPrioReader.also(termId));
-        }
-
-        return this;
-    }
-
-    public IndexQueryBuilder notFull(long termId) {
+    public IndexQueryBuilder not(long termId) {

        query.addInclusionFilter(reverseIndexFullReader.not(termId));

--- a/code/index/java/nu/marginalia/index/index/QueryBranchWalker.java
+++ b/code/index/java/nu/marginalia/index/index/QueryBranchWalker.java
@ -75,7 +75,7 @@ public class QueryBranchWalker {
        // in practice only when an index doesn't contain all the search terms, so we can just
        // skip those paths
        if (!remainingPaths.isEmpty()) {
-            logger.info("Dropping: {}", remainingPaths);
+            logger.debug("Dropping: {}", remainingPaths);
        }

        return ret;
--- a/code/index/java/nu/marginalia/index/index/StatefulIndex.java
+++ b/code/index/java/nu/marginalia/index/index/StatefulIndex.java
@ -125,59 +125,65 @@ public class StatefulIndex {
        // the term is missing from the index and can never be found
        paths.removeIf(containsAll(termPriority).negate());

-        List<QueryBranchWalker> helpers = QueryBranchWalker.create(termPriority, paths);
+        List<QueryBranchWalker> walkers = QueryBranchWalker.create(termPriority, paths);

-        for (var helper : helpers) {
+        for (var walker : walkers) {
            for (var builder : List.of(
-                    combinedIndexReader.findPriorityWord(helper.termId),
-                    combinedIndexReader.findFullWord(helper.termId)
+                    combinedIndexReader.findPriorityWord(walker.termId),
+                    combinedIndexReader.findFullWord(walker.termId)
            ))
            {
                queryHeads.add(builder);

-                if (helper.atEnd())
-                    continue;
+                if (walker.atEnd())
+                    continue; // Single term search query

+                // Add filter steps for the remaining combinations of terms
                List<QueryFilterStepIf> filterSteps = new ArrayList<>();
-                for (var step : helper.next()) {
+                for (var step : walker.next()) {
                    filterSteps.add(createFilter(step, 0));
                }
                builder.addInclusionFilterAny(filterSteps);
            }
        }

-        List<IndexQuery> ret = new ArrayList<>(10);

        // Add additional conditions to the query heads
        for (var query : queryHeads) {

            // Advice terms are a special case, mandatory but not ranked, and exempt from re-writing
            for (long term : terms.advice()) {
-                query = query.alsoFull(term);
+                query = query.also(term);
            }

            for (long term : terms.excludes()) {
-                query = query.notFull(term);
+                query = query.not(term);
            }

            // Run these filter steps last, as they'll worst-case cause as many page faults as there are
            // items in the buffer
-            ret.add(query.addInclusionFilter(combinedIndexReader.filterForParams(params)).build());
+            query.addInclusionFilter(combinedIndexReader.filterForParams(params));
        }

-
-        return ret;
+        return queryHeads
+                .stream()
+                .map(IndexQueryBuilder::build)
+                .toList();
    }

    /** Recursively create a filter step based on the QBW and its children */
    private QueryFilterStepIf createFilter(QueryBranchWalker walker, int depth) {
+
+        // Create a filter for the current termId
        final QueryFilterStepIf ownFilterCondition = ownFilterCondition(walker, depth);

        var childSteps = walker.next();
-
-        if (childSteps.isEmpty())
+        if (childSteps.isEmpty()) // no children, and so we're satisfied with just a single filter condition
            return ownFilterCondition;

+        // If there are children, we append the filter conditions for each child as an anyOf condition
+        // to the current filter condition
+
        List<QueryFilterStepIf> combinedFilters = new ArrayList<>();

        for (var step : childSteps) {
@ -186,6 +192,7 @@ public class StatefulIndex {
            combinedFilters.add(new QueryFilterAllOf(ownFilterCondition, childFilter));
        }

+        // Flatten the filter conditions if there's only one branch
        if (combinedFilters.size() == 1)
            return combinedFilters.getFirst();
        else
@ -196,7 +203,7 @@ public class StatefulIndex {
    private QueryFilterStepIf ownFilterCondition(QueryBranchWalker walker, int depth) {
        if (depth < 2) {
            // At shallow depths we prioritize terms that appear in the priority index,
-            // to increase the odds we find "good" results before the sand runs out
+            // to increase the odds we find "good" results before the execution timer runs out
            return new QueryFilterAnyOf(
                    combinedIndexReader.hasWordPrio(walker.termId),
                    combinedIndexReader.hasWordFull(walker.termId)
--- a/code/index/query/java/nu/marginalia/index/query/IndexQueryBuilder.java
+++ b/code/index/query/java/nu/marginalia/index/query/IndexQueryBuilder.java
@ -11,16 +11,11 @@ import java.util.List;
 public interface IndexQueryBuilder {
    /** Filters documents that also contain termId, within the full index.
     */
-    IndexQueryBuilder alsoFull(long termId);
-
-    /**
-     * Filters documents that also contain the termId, within the priority index.
-     */
-    IndexQueryBuilder alsoPrio(long termIds);
+    IndexQueryBuilder also(long termId);

    /** Excludes documents that contain termId, within the full index
     */
-    IndexQueryBuilder notFull(long termId);
+    IndexQueryBuilder not(long termId);

    IndexQueryBuilder addInclusionFilter(QueryFilterStepIf filterStep);
    IndexQueryBuilder addInclusionFilterAny(List<QueryFilterStepIf> filterStep);