(index) Remove dead code

Since the performance fix in 3359f72239 had a huge positive impact without reducing result quality, it's possible to remove the QueryBranchWalker and associated code.
This commit is contained in:
Viktor Lofgren 2024-04-16 19:59:27 +02:00
parent 3359f72239
commit 7fa3e86e64
5 changed files with 1 additions and 227 deletions

View File

@ -35,24 +35,13 @@ public class CombinedIndexReader {
}
public IndexQueryBuilderImpl newQueryBuilder(IndexQuery query) {
return new IndexQueryBuilderImpl(reverseIndexFullReader, reverseIndexPriorityReader, query);
return new IndexQueryBuilderImpl(reverseIndexFullReader, query);
}
public QueryFilterStepIf hasWordFull(long termId) {
return reverseIndexFullReader.also(termId);
}
public QueryFilterStepIf hasWordPrio(long termId) {
return reverseIndexPriorityReader.also(termId);
}
/** Creates a query builder for terms in the priority index */
public IndexQueryBuilder findPriorityWord(long wordId) {
return newQueryBuilder(new IndexQuery(reverseIndexPriorityReader.documents(wordId)))
.withSourceTerms(wordId);
}
/** Creates a query builder for terms in the full index */
public IndexQueryBuilder findFullWord(long wordId) {
return newQueryBuilder(

View File

@ -11,7 +11,6 @@ import nu.marginalia.index.query.filter.QueryFilterStepIf;
public class IndexQueryBuilderImpl implements IndexQueryBuilder {
private final IndexQuery query;
private final ReverseIndexReader reverseIndexFullReader;
private final ReverseIndexReader reverseIndexPrioReader;
/* Keep track of already added include terms to avoid redundant checks.
*
@ -22,12 +21,10 @@ public class IndexQueryBuilderImpl implements IndexQueryBuilder {
private final TLongHashSet alreadyConsideredTerms = new TLongHashSet();
IndexQueryBuilderImpl(ReverseIndexReader reverseIndexFullReader,
ReverseIndexReader reverseIndexPrioReader,
IndexQuery query)
{
this.query = query;
this.reverseIndexFullReader = reverseIndexFullReader;
this.reverseIndexPrioReader = reverseIndexPrioReader;
}
public IndexQueryBuilder withSourceTerms(long... sourceTerms) {

View File

@ -1,108 +0,0 @@
package nu.marginalia.index.index;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import it.unimi.dsi.fastutil.longs.LongArraySet;
import it.unimi.dsi.fastutil.longs.LongSet;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
/** Helper class for index query construction */
public class QueryBranchWalker {
private static final Logger logger = LoggerFactory.getLogger(QueryBranchWalker.class);
public final long[] priorityOrder;
public final List<LongSet> paths;
public final long termId;
private QueryBranchWalker(long[] priorityOrder, List<LongSet> paths, long termId) {
this.priorityOrder = priorityOrder;
this.paths = paths;
this.termId = termId;
}
public boolean atEnd() {
return priorityOrder.length == 0;
}
/** Group the provided paths by the lowest termId they contain per the provided priorityOrder,
* into a list of QueryBranchWalkers. This can be performed iteratively on the resultant QBW:s
* to traverse the tree via the next() method.
* <p></p>
* The paths can be extracted through the {@link nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates CompiledQueryAggregates}
* queriesAggregate method.
*/
public static List<QueryBranchWalker> create(long[] priorityOrder, List<LongSet> paths) {
if (paths.isEmpty())
return List.of();
List<QueryBranchWalker> ret = new ArrayList<>();
List<LongSet> remainingPaths = new LinkedList<>(paths);
remainingPaths.removeIf(LongSet::isEmpty);
List<LongSet> pathsForPrio = new ArrayList<>();
for (int i = 0; i < priorityOrder.length; i++) {
long termId = priorityOrder[i];
var it = remainingPaths.iterator();
while (it.hasNext()) {
var path = it.next();
if (path.contains(termId)) {
// Remove the current termId from the path
path.remove(termId);
// Add it to the set of paths associated with the termId
pathsForPrio.add(path);
// Remove it from consideration
it.remove();
}
}
if (!pathsForPrio.isEmpty()) {
long[] newPrios = keepRelevantPriorities(priorityOrder, pathsForPrio);
ret.add(new QueryBranchWalker(newPrios, new ArrayList<>(pathsForPrio), termId));
pathsForPrio.clear();
}
}
// This happens if the priorityOrder array doesn't contain all items in the paths,
// in practice only when an index doesn't contain all the search terms, so we can just
// skip those paths
if (!remainingPaths.isEmpty()) {
logger.debug("Dropping: {}", remainingPaths);
}
return ret;
}
/** From the provided priorityOrder array, keep the elements that are present in any set in paths */
private static long[] keepRelevantPriorities(long[] priorityOrder, List<LongSet> paths) {
LongArrayList remainingPrios = new LongArrayList(paths.size());
// these sets are typically very small so array set is a good choice
LongSet allElements = new LongArraySet(priorityOrder.length);
for (var path : paths) {
allElements.addAll(path);
}
for (var p : priorityOrder) {
if (allElements.contains(p))
remainingPrios.add(p);
}
return remainingPrios.elements();
}
/** Convenience method that applies the create() method
* to the priority order and paths associated with this instance */
public List<QueryBranchWalker> next() {
return create(priorityOrder, paths);
}
}

View File

@ -4,9 +4,6 @@ import com.google.inject.Inject;
import com.google.inject.Singleton;
import it.unimi.dsi.fastutil.longs.*;
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
import nu.marginalia.index.query.filter.QueryFilterAllOf;
import nu.marginalia.index.query.filter.QueryFilterAnyOf;
import nu.marginalia.index.query.filter.QueryFilterStepIf;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.index.results.model.ids.DocMetadataList;
import nu.marginalia.index.model.QueryParams;
@ -168,48 +165,6 @@ public class StatefulIndex {
.toList();
}
/** Recursively create a filter step based on the QBW and its children */
private QueryFilterStepIf createFilter(QueryBranchWalker walker, int depth) {
// Create a filter for the current termId
final QueryFilterStepIf ownFilterCondition = ownFilterCondition(walker, depth);
var childSteps = walker.next();
if (childSteps.isEmpty()) // no children, and so we're satisfied with just a single filter condition
return ownFilterCondition;
// If there are children, we append the filter conditions for each child as an anyOf condition
// to the current filter condition
List<QueryFilterStepIf> combinedFilters = new ArrayList<>();
for (var step : childSteps) {
// Recursion will be limited to a fairly shallow stack depth due to how the queries are constructed.
var childFilter = createFilter(step, depth+1);
combinedFilters.add(new QueryFilterAllOf(ownFilterCondition, childFilter));
}
// Flatten the filter conditions if there's only one branch
if (combinedFilters.size() == 1)
return combinedFilters.getFirst();
else
return new QueryFilterAnyOf(combinedFilters);
}
/** Create a filter condition based on the termId associated with the QBW */
private QueryFilterStepIf ownFilterCondition(QueryBranchWalker walker, int depth) {
if (depth < 2) {
// At shallow depths we prioritize terms that appear in the priority index,
// to increase the odds we find "good" results before the execution timer runs out
return new QueryFilterAnyOf(
combinedIndexReader.hasWordPrio(walker.termId),
combinedIndexReader.hasWordFull(walker.termId)
);
} else {
return combinedIndexReader.hasWordFull(walker.termId);
}
}
private Predicate<LongSet> containsAll(long[] permitted) {
LongSet permittedTerms = new LongOpenHashSet(permitted);
return permittedTerms::containsAll;

View File

@ -1,59 +0,0 @@
package nu.marginalia.index.index;
import it.unimi.dsi.fastutil.longs.LongArraySet;
import it.unimi.dsi.fastutil.longs.LongSet;
import org.junit.jupiter.api.Test;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import static org.junit.jupiter.api.Assertions.*;
class QueryBranchWalkerTest {
@Test
public void testNoOverlap() {
var paths = QueryBranchWalker.create(
new long[] { 1, 2 },
List.of(set(1), set(2))
);
assertEquals(2, paths.size());
assertEquals(Set.of(1L, 2L), paths.stream().map(path -> path.termId).collect(Collectors.toSet()));
}
@Test
public void testCond() {
var paths = QueryBranchWalker.create(
new long[] { 1, 2, 3, 4 },
List.of(set(1,2,3), set(1,4,3))
);
assertEquals(1, paths.size());
assertEquals(Set.of(1L), paths.stream().map(path -> path.termId).collect(Collectors.toSet()));
System.out.println(Arrays.toString(paths.getFirst().priorityOrder));
assertArrayEquals(new long[] { 2, 3, 4 }, paths.getFirst().priorityOrder);
var next = paths.getFirst().next();
assertEquals(2, next.size());
assertEquals(Set.of(2L, 3L), next.stream().map(path -> path.termId).collect(Collectors.toSet()));
Map<Long, QueryBranchWalker> byId = next.stream().collect(Collectors.toMap(w -> w.termId, w->w));
assertArrayEquals(new long[] { 3L }, byId.get(2L).priorityOrder );
assertArrayEquals(new long[] { 4L }, byId.get(3L).priorityOrder );
}
@Test
public void testNoOverlapFirst() {
var paths = QueryBranchWalker.create(
new long[] { 1, 2, 3 },
List.of(set(1, 2), set(1, 3))
);
assertEquals(1, paths.size());
assertArrayEquals(new long[] { 2, 3 }, paths.getFirst().priorityOrder);
assertEquals(Set.of(1L), paths.stream().map(path -> path.termId).collect(Collectors.toSet()));
}
LongSet set(long... args) {
return new LongArraySet(args);
}
}