Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-23 21:18:58 +00:00)
(index) Clean up new index query code

commit ae7c760772
parent 81815f3e0a

@@ -39,4 +39,12 @@ public class CompiledQueryLong implements Iterable<Long> {
     public Iterator<Long> iterator() {
         return stream().iterator();
     }
+
+    public long[] copyData() {
+        return data.copyData();
+    }
+
+    public boolean isEmpty() {
+        return data.size() == 0;
+    }
 }

@@ -24,4 +24,8 @@ public class CqDataLong {
     public int size() {
         return data.length;
     }
+
+    public long[] copyData() {
+        return Arrays.copyOf(data, data.length);
+    }
 }

@@ -8,6 +8,7 @@ import java.util.ArrayList;
 import java.util.List;
 import java.util.function.*;
 
+/** Contains methods for aggregating across a CompiledQuery or CompiledQueryLong */
 public class CompiledQueryAggregates {
     /** Compiled query aggregate that for a single boolean that treats or-branches as logical OR,
      * and and-branches as logical AND operations. Will return true if there exists a path through

@@ -1,13 +1,18 @@
 package nu.marginalia.index.index;
 
 import it.unimi.dsi.fastutil.longs.LongArrayList;
+import it.unimi.dsi.fastutil.longs.LongArraySet;
 import it.unimi.dsi.fastutil.longs.LongSet;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.util.ArrayList;
 import java.util.LinkedList;
 import java.util.List;
 
-class QueryBranchWalker {
+/** Helper class for index query construction */
+public class QueryBranchWalker {
+    private static final Logger logger = LoggerFactory.getLogger(QueryBranchWalker.class);
     public final long[] priorityOrder;
     public final List<LongSet> paths;
     public final long termId;

@@ -22,56 +27,81 @@ class QueryBranchWalker {
         return priorityOrder.length == 0;
     }
 
+    /** Group the provided paths by the lowest termId they contain per the provided priorityOrder,
+     * into a list of QueryBranchWalkers. This can be performed iteratively on the resultant QBW:s
+     * to traverse the tree via the next() method.
+     * <p></p>
+     * The paths can be extracted through the {@link nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates CompiledQueryAggregates}
+     * queriesAggregate method.
+     */
     public static List<QueryBranchWalker> create(long[] priorityOrder, List<LongSet> paths) {
+        if (paths.isEmpty())
+            return List.of();
+
         List<QueryBranchWalker> ret = new ArrayList<>();
         List<LongSet> remainingPaths = new LinkedList<>(paths);
 
         remainingPaths.removeIf(LongSet::isEmpty);
 
+        List<LongSet> pathsForPrio = new ArrayList<>();
+
         for (int i = 0; i < priorityOrder.length; i++) {
-            long prio = priorityOrder[i];
+            long termId = priorityOrder[i];
 
             var it = remainingPaths.iterator();
-            List<LongSet> pathsForPrio = new ArrayList<>();
 
             while (it.hasNext()) {
                 var path = it.next();
 
-                if (path.contains(prio)) {
-                    path.remove(prio);
+                if (path.contains(termId)) {
+                    // Remove the current termId from the path
+                    path.remove(termId);
 
+                    // Add it to the set of paths associated with the termId
                     pathsForPrio.add(path);
 
+                    // Remove it from consideration
                     it.remove();
                 }
             }
 
             if (!pathsForPrio.isEmpty()) {
-                LongArrayList remainingPrios = new LongArrayList(pathsForPrio.size());
-
-                for (var p : priorityOrder) {
-                    for (var path : pathsForPrio) {
-                        if (path.contains(p)) {
-                            remainingPrios.add(p);
-                            break;
-                        }
-                    }
-                }
-
-                ret.add(new QueryBranchWalker(remainingPrios.elements(), pathsForPrio, prio));
+                long[] newPrios = keepRelevantPriorities(priorityOrder, pathsForPrio);
+                ret.add(new QueryBranchWalker(newPrios, new ArrayList<>(pathsForPrio), termId));
+                pathsForPrio.clear();
             }
         }
 
+        // This happens if the priorityOrder array doesn't contain all items in the paths,
+        // in practice only when an index doesn't contain all the search terms, so we can just
+        // skip those paths
         if (!remainingPaths.isEmpty()) {
-            System.out.println("Dropping: " + remainingPaths);
+            logger.info("Dropping: {}", remainingPaths);
         }
 
         return ret;
     }
 
-    public List<QueryBranchWalker> next() {
-        if (atEnd())
-            return List.of();
-
+    /** From the provided priorityOrder array, keep the elements that are present in any set in paths */
+    private static long[] keepRelevantPriorities(long[] priorityOrder, List<LongSet> paths) {
+        LongArrayList remainingPrios = new LongArrayList(paths.size());
+
+        // these sets are typically very small so array set is a good choice
+        LongSet allElements = new LongArraySet(priorityOrder.length);
+        for (var path : paths) {
+            allElements.addAll(path);
+        }
+
+        for (var p : priorityOrder) {
+            if (allElements.contains(p))
+                remainingPrios.add(p);
+        }
+
+        return remainingPrios.elements();
+    }
+
+    /** Convenience method that applies the create() method
+     * to the priority order and paths associated with this instance */
+    public List<QueryBranchWalker> next() {
         return create(priorityOrder, paths);
     }
 

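The new javadoc on create() describes grouping the provided paths by the first prioritized termId they contain, then walking the rest of the tree through next(). Below is a minimal usage sketch, not part of the commit: the term ids and paths are made up, and it assumes QueryBranchWalker, its public fields, atEnd() and next() are accessible from the caller's package.

import it.unimi.dsi.fastutil.longs.LongArraySet;
import it.unimi.dsi.fastutil.longs.LongSet;
import nu.marginalia.index.index.QueryBranchWalker;

import java.util.List;

class QueryBranchWalkerSketch {
    public static void main(String[] args) {
        // Hypothetical term ids; in the real code these come from SearchTerms
        long[] priorityOrder = { 10, 20, 30 };
        List<LongSet> paths = List.of(
                new LongArraySet(new long[] { 10, 20 }),
                new LongArraySet(new long[] { 10, 30 }),
                new LongArraySet(new long[] { 20, 30 })
        );

        // Each walker groups the paths that share its termId; next() recurses
        // over the remaining terms of those paths.
        for (var walker : QueryBranchWalker.create(priorityOrder, paths)) {
            print(walker, 0);
        }
    }

    static void print(QueryBranchWalker walker, int depth) {
        System.out.println("  ".repeat(depth) + "termId=" + walker.termId);
        if (!walker.atEnd()) {
            walker.next().forEach(child -> print(child, depth + 1));
        }
    }
}
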
@@ -4,7 +4,6 @@ import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
 import it.unimi.dsi.fastutil.longs.LongSet;
-import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
 import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
 import nu.marginalia.index.query.filter.QueryFilterAllOf;
 import nu.marginalia.index.query.filter.QueryFilterAnyOf;

@@ -25,9 +24,7 @@ import java.util.*;
 import java.util.concurrent.locks.Lock;
 import java.util.concurrent.locks.ReadWriteLock;
 import java.util.concurrent.locks.ReentrantReadWriteLock;
-import java.util.function.LongFunction;
 import java.util.function.Predicate;
-import java.util.stream.Collectors;
 
 /** This class delegates SearchIndexReader and deals with the stateful nature of the index,
  * i.e. it may be possible to reconstruct the index and load a new set of data.

@@ -95,7 +92,6 @@ public class StatefulIndex {
             logger.error("Uncaught exception", ex);
         }
         finally {
-
             lock.unlock();
         }
 

@@ -113,62 +109,6 @@ public class StatefulIndex {
         return combinedIndexReader != null && combinedIndexReader.isLoaded();
     }
 
-    private Predicate<LongSet> containsOnly(long[] permitted) {
-        LongSet permittedTerms = new LongOpenHashSet(permitted);
-        return permittedTerms::containsAll;
-    }
-
-    private List<IndexQueryBuilder> createBuilders(CompiledQueryLong query,
-                                                   LongFunction<IndexQueryBuilder> builderFactory,
-                                                   long[] termPriority) {
-        List<LongSet> paths = CompiledQueryAggregates.queriesAggregate(query);
-
-        // Remove any paths that do not contain all prioritized terms, as this means
-        // the term is missing from the index and can never be found
-        paths.removeIf(containsOnly(termPriority).negate());
-
-        List<QueryBranchWalker> helpers = QueryBranchWalker.create(termPriority, paths);
-        List<IndexQueryBuilder> builders = new ArrayList<>();
-
-        for (var helper : helpers) {
-            var builder = builderFactory.apply(helper.termId);
-
-            builders.add(builder);
-
-            if (helper.atEnd())
-                continue;
-
-            var filters = helper.next().stream()
-                    .map(this::createFilter)
-                    .toList();
-
-            builder.addInclusionFilterAny(filters);
-        }
-
-        return builders;
-    }
-
-    private QueryFilterStepIf createFilter(QueryBranchWalker helper) {
-        var selfCondition = combinedIndexReader.hasWordFull(helper.termId);
-        if (helper.atEnd())
-            return selfCondition;
-
-        var nextSteps = helper.next();
-        var nextFilters = nextSteps.stream()
-                .map(this::createFilter)
-                .map(filter -> new QueryFilterAllOf(List.of(selfCondition, filter)))
-                .collect(Collectors.toList());
-
-        if (nextFilters.isEmpty())
-            return selfCondition;
-
-        if (nextFilters.size() == 1)
-            return nextFilters.getFirst();
-
-        return new QueryFilterAnyOf(nextFilters);
-    }
-
     public List<IndexQuery> createQueries(SearchTerms terms, QueryParams params) {
 
         if (!isLoaded()) {

@@ -176,29 +116,99 @@ public class StatefulIndex {
             return Collections.emptyList();
         }
 
-        final long[] orderedIncludes = terms.sortedDistinctIncludes(this::compareKeywords);
-        final long[] orderedIncludesPrio = terms.sortedDistinctIncludes(this::compareKeywordsPrio);
-
         List<IndexQueryBuilder> queryHeads = new ArrayList<>(10);
 
-        queryHeads.addAll(createBuilders(terms.compiledQuery(), combinedIndexReader::findFullWord, orderedIncludes));
-        queryHeads.addAll(createBuilders(terms.compiledQuery(), combinedIndexReader::findPriorityWord, orderedIncludesPrio));
+        final long[] termPriority = terms.sortedDistinctIncludes(this::compareKeywords);
+        List<LongSet> paths = CompiledQueryAggregates.queriesAggregate(terms.compiledQuery());
 
-        List<IndexQuery> queries = new ArrayList<>(10);
+        // Remove any paths that do not contain all prioritized terms, as this means
+        // the term is missing from the index and can never be found
+        paths.removeIf(containsAll(termPriority).negate());
+
+        List<QueryBranchWalker> helpers = QueryBranchWalker.create(termPriority, paths);
+
+        for (var helper : helpers) {
+            for (var builder : List.of(
+                    combinedIndexReader.findPriorityWord(helper.termId),
+                    combinedIndexReader.findFullWord(helper.termId)
+            ))
+            {
+                queryHeads.add(builder);
+
+                if (helper.atEnd())
+                    continue;
+
+                List<QueryFilterStepIf> filterSteps = new ArrayList<>();
+                for (var step : helper.next()) {
+                    filterSteps.add(createFilter(step, 0));
+                }
+                builder.addInclusionFilterAny(filterSteps);
+            }
+        }
+
+        List<IndexQuery> ret = new ArrayList<>(10);
 
+        // Add additional conditions to the query heads
         for (var query : queryHeads) {
 
+            // Advice terms are a special case, mandatory but not ranked, and exempt from re-writing
+            for (long term : terms.advice()) {
+                query = query.alsoFull(term);
+            }
+
             for (long term : terms.excludes()) {
                 query = query.notFull(term);
             }
 
             // Run these filter steps last, as they'll worst-case cause as many page faults as there are
             // items in the buffer
-            queries.add(query.addInclusionFilter(combinedIndexReader.filterForParams(params)).build());
+            ret.add(query.addInclusionFilter(combinedIndexReader.filterForParams(params)).build());
         }
 
-        return queries;
+        return ret;
+    }
+
+    /** Recursively create a filter step based on the QBW and its children */
+    private QueryFilterStepIf createFilter(QueryBranchWalker walker, int depth) {
+        final QueryFilterStepIf ownFilterCondition = ownFilterCondition(walker, depth);
+
+        var childSteps = walker.next();
+
+        if (childSteps.isEmpty())
+            return ownFilterCondition;
+
+        List<QueryFilterStepIf> combinedFilters = new ArrayList<>();
+
+        for (var step : childSteps) {
+            // Recursion will be limited to a fairly shallow stack depth due to how the queries are constructed.
+            var childFilter = createFilter(step, depth+1);
+            combinedFilters.add(new QueryFilterAllOf(ownFilterCondition, childFilter));
+        }
+
+        if (combinedFilters.size() == 1)
+            return combinedFilters.getFirst();
+        else
+            return new QueryFilterAnyOf(combinedFilters);
+    }
+
+    /** Create a filter condition based on the termId associated with the QBW */
+    private QueryFilterStepIf ownFilterCondition(QueryBranchWalker walker, int depth) {
+        if (depth < 2) {
+            // At shallow depths we prioritize terms that appear in the priority index,
+            // to increase the odds we find "good" results before the sand runs out
+            return new QueryFilterAnyOf(
+                    combinedIndexReader.hasWordPrio(walker.termId),
+                    combinedIndexReader.hasWordFull(walker.termId)
+            );
+        } else {
+            return combinedIndexReader.hasWordFull(walker.termId);
+        }
+    }
+
+    private Predicate<LongSet> containsAll(long[] permitted) {
+        LongSet permittedTerms = new LongOpenHashSet(permitted);
+        return permittedTerms::containsAll;
     }
 
     private int compareKeywords(long a, long b) {

@@ -208,13 +218,6 @@ public class StatefulIndex {
         );
     }
 
-    private int compareKeywordsPrio(long a, long b) {
-        return Long.compare(
-                combinedIndexReader.numHitsPrio(a),
-                combinedIndexReader.numHitsPrio(b)
-        );
-    }
-
     /** Return an array of encoded document metadata longs corresponding to the
      * document identifiers provided; with metadata for termId. The input array
      * docs[] *must* be sorted.

@@ -3,54 +3,35 @@ package nu.marginalia.index.model;
 import it.unimi.dsi.fastutil.longs.LongArrayList;
 import it.unimi.dsi.fastutil.longs.LongComparator;
 import it.unimi.dsi.fastutil.longs.LongList;
-import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
 import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
 import nu.marginalia.api.searchquery.model.query.SearchQuery;
 
 import java.util.ArrayList;
 import java.util.List;
-import java.util.Objects;
 
 import static nu.marginalia.index.model.SearchTermsUtil.getWordId;
 
 public final class SearchTerms {
-    private final LongList includes;
+    private final LongList advice;
     private final LongList excludes;
     private final LongList priority;
     private final List<LongList> coherences;
 
     private final CompiledQueryLong compiledQueryIds;
 
-    public SearchTerms(
-            LongList includes,
-            LongList excludes,
-            LongList priority,
-            List<LongList> coherences,
-            CompiledQueryLong compiledQueryIds
-    ) {
-        this.includes = includes;
-        this.excludes = excludes;
-        this.priority = priority;
-        this.coherences = coherences;
-        this.compiledQueryIds = compiledQueryIds;
-    }
-
-    public SearchTerms(SearchQuery query, CompiledQueryLong compiledQueryIds) {
-        this(new LongArrayList(),
-                new LongArrayList(),
-                new LongArrayList(),
-                new ArrayList<>(),
-                compiledQueryIds);
-
-        for (var word : query.searchTermsInclude) {
-            includes.add(getWordId(word));
-        }
+    public SearchTerms(SearchQuery query,
+                       CompiledQueryLong compiledQueryIds)
+    {
+        this.excludes = new LongArrayList();
+        this.priority = new LongArrayList();
+        this.coherences = new ArrayList<>();
+        this.advice = new LongArrayList();
+
+        this.compiledQueryIds = compiledQueryIds;
+
         for (var word : query.searchTermsAdvice) {
-            // This looks like a bug, but it's not
-            includes.add(getWordId(word));
+            advice.add(getWordId(word));
         }
 
         for (var coherence : query.searchTermCoherences) {
             LongList parts = new LongArrayList(coherence.size());

@@ -64,36 +45,29 @@ public final class SearchTerms {
         for (var word : query.searchTermsExclude) {
             excludes.add(getWordId(word));
         }
 
         for (var word : query.searchTermsPriority) {
             priority.add(getWordId(word));
         }
     }
 
     public boolean isEmpty() {
-        return includes.isEmpty();
+        return compiledQueryIds.isEmpty();
     }
 
     public long[] sortedDistinctIncludes(LongComparator comparator) {
-        if (includes.isEmpty())
-            return includes.toLongArray();
-
-        LongList list = new LongArrayList(new LongOpenHashSet(includes));
+        LongList list = new LongArrayList(compiledQueryIds.copyData());
         list.sort(comparator);
         return list.toLongArray();
     }
 
-    public int size() {
-        return includes.size() + excludes.size() + priority.size();
-    }
-
-    public LongList includes() {
-        return includes;
-    }
-
     public LongList excludes() {
         return excludes;
     }
 
+    public LongList advice() {
+        return advice;
+    }
+
     public LongList priority() {
         return priority;
     }

@@ -104,29 +78,4 @@ public final class SearchTerms {
 
     public CompiledQueryLong compiledQuery() { return compiledQueryIds; }
 
-    @Override
-    public boolean equals(Object obj) {
-        if (obj == this) return true;
-        if (obj == null || obj.getClass() != this.getClass()) return false;
-        var that = (SearchTerms) obj;
-        return Objects.equals(this.includes, that.includes) &&
-                Objects.equals(this.excludes, that.excludes) &&
-                Objects.equals(this.priority, that.priority) &&
-                Objects.equals(this.coherences, that.coherences);
-    }
-
-    @Override
-    public int hashCode() {
-        return Objects.hash(includes, excludes, priority, coherences);
-    }
-
-    @Override
-    public String toString() {
-        return "SearchTerms[" +
-                "includes=" + includes + ", " +
-                "excludes=" + excludes + ", " +
-                "priority=" + priority + ", " +
-                "coherences=" + coherences + ']';
-    }
-
 }

@@ -2,14 +2,28 @@ package nu.marginalia.index.query.filter;
 
 import nu.marginalia.array.buffer.LongQueryBuffer;
 
+import java.util.ArrayList;
 import java.util.List;
 import java.util.StringJoiner;
 
 public class QueryFilterAllOf implements QueryFilterStepIf {
-    private final List<? extends QueryFilterStepIf> steps;
+    private final List<QueryFilterStepIf> steps;
 
     public QueryFilterAllOf(List<? extends QueryFilterStepIf> steps) {
-        this.steps = steps;
+        this.steps = new ArrayList<>(steps.size());
+
+        for (var step : steps) {
+            if (step instanceof QueryFilterAllOf allOf) {
+                this.steps.addAll(allOf.steps);
+            }
+            else {
+                this.steps.add(step);
+            }
+        }
+    }
+
+    public QueryFilterAllOf(QueryFilterStepIf... steps) {
+        this(List.of(steps));
     }
 
     public double cost() {

@@ -2,14 +2,27 @@ package nu.marginalia.index.query.filter;
 
 import nu.marginalia.array.buffer.LongQueryBuffer;
 
+import java.util.ArrayList;
 import java.util.List;
 import java.util.StringJoiner;
 
 public class QueryFilterAnyOf implements QueryFilterStepIf {
-    private final List<? extends QueryFilterStepIf> steps;
+    private final List<QueryFilterStepIf> steps;
 
     public QueryFilterAnyOf(List<? extends QueryFilterStepIf> steps) {
-        this.steps = steps;
+        this.steps = new ArrayList<>(steps.size());
+
+        for (var step : steps) {
+            if (step instanceof QueryFilterAnyOf anyOf) {
+                this.steps.addAll(anyOf.steps);
+            } else {
+                this.steps.add(step);
+            }
+        }
+    }
+
+    public QueryFilterAnyOf(QueryFilterStepIf... steps) {
+        this(List.of(steps));
     }
 
     public double cost() {

@@ -30,23 +43,37 @@ public class QueryFilterAnyOf implements QueryFilterStepIf {
         if (steps.isEmpty())
             return;
 
+        if (steps.size() == 1) {
+            steps.getFirst().apply(buffer);
+            return;
+        }
+
         int start = 0;
-        int end = buffer.end;
+        final int endOfValidData = buffer.end; // End of valid data range
 
+        // The filters act as a partitioning function, where anything before buffer.end
+        // is "in", and is guaranteed to be sorted; and anything after buffer.end is "out"
+        // but no sorting guaranteed is provided.
+
+        // To provide a conditional filter, we re-sort the "out" range, slice it and apply filtering to the slice
+
         for (var step : steps)
         {
-            var slice = buffer.slice(start, end);
+            var slice = buffer.slice(start, endOfValidData);
             slice.data.quickSort(0, slice.size());
 
             step.apply(slice);
             start += slice.end;
         }
 
-        buffer.data.quickSort(0, start);
-
-        // Special finalization
+        // After we're done, read and write pointers should be 0 and "end" should be the length of valid data,
+        // normally done through buffer.finalizeFiltering(); but that won't work here
         buffer.reset();
         buffer.end = start;
+
+        // After all filters have been applied, we must re-sort all the retained data
+        // to uphold the sortedness contract
+        buffer.data.quickSort(0, buffer.end);
     }
 
     public String describe() {

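The rewritten apply() relies on the partitioning invariant spelled out in the new comments: each step only sees the items the previous steps rejected, retained counts accumulate at the front of the buffer, and a final sort restores the sortedness contract. The following self-contained sketch illustrates that invariant with plain arrays and predicates instead of the real LongQueryBuffer and QueryFilterStepIf types; every name in it is illustrative.

import java.util.Arrays;
import java.util.List;
import java.util.function.LongPredicate;

class AnyOfPartitionSketch {
    // Keep a value if ANY of the steps accepts it, mimicking the slice-and-advance loop.
    static long[] anyOf(long[] values, List<LongPredicate> steps) {
        long[] buffer = Arrays.copyOf(values, values.length);
        int start = 0;                          // number of items retained so far
        final int endOfValidData = buffer.length;

        for (LongPredicate step : steps) {
            // Work only on the not-yet-retained tail, in sorted order
            long[] slice = Arrays.copyOfRange(buffer, start, endOfValidData);
            Arrays.sort(slice);

            int kept = 0;
            long[] rejected = new long[slice.length];
            int nRejected = 0;
            for (long v : slice) {
                if (step.test(v)) slice[kept++] = v;
                else rejected[nRejected++] = v;
            }

            // Retained items move to the front; rejected ones stay behind for later steps
            System.arraycopy(slice, 0, buffer, start, kept);
            System.arraycopy(rejected, 0, buffer, start + kept, nRejected);
            start += kept;
        }

        long[] result = Arrays.copyOf(buffer, start);
        Arrays.sort(result);                    // re-sort everything any step accepted
        return result;
    }

    public static void main(String[] args) {
        long[] docs = { 5, 3, 8, 1, 9 };
        long[] result = anyOf(docs, List.of(v -> v % 2 == 0, v -> v > 6));
        System.out.println(Arrays.toString(result)); // [8, 9]
    }
}
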
@@ -133,12 +133,6 @@ public class LongQueryBuffer {
         write = 0;
     }
 
-    public void finalizeFiltering(int pos) {
-        end = write;
-        read = pos;
-        write = pos;
-    }
-
     /** Retain only unique values in the buffer, and update the end pointer to the new length.
      * <p></p>
      * The buffer is assumed to be sorted up until the end pointer.
|