diff --git a/code/functions/search-query/api/build.gradle b/code/functions/search-query/api/build.gradle index 727b5b86..1a8d55d2 100644 --- a/code/functions/search-query/api/build.gradle +++ b/code/functions/search-query/api/build.gradle @@ -30,6 +30,7 @@ dependencies { implementation libs.notnull implementation libs.guice implementation libs.gson + implementation libs.commons.lang3 implementation libs.bundles.protobuf implementation libs.bundles.grpc implementation libs.fastutil diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java index 4b2f0032..4d2cf7a6 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java @@ -1,7 +1,6 @@ package nu.marginalia.api.searchquery; -import nu.marginalia.api.searchquery.*; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.api.searchquery.model.results.Bm25Parameters; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.index.query.limit.QueryLimits; @@ -45,33 +44,37 @@ public class IndexProtobufCodec { .build(); } - public static SearchSubquery convertSearchSubquery(RpcSubquery subquery) { + public static SearchQuery convertRpcQuery(RpcQuery query) { List> coherences = new ArrayList<>(); - for (int j = 0; j < subquery.getCoherencesCount(); j++) { - var coh = subquery.getCoherences(j); + for (int j = 0; j < query.getCoherencesCount(); j++) { + var coh = query.getCoherences(j); coherences.add(new ArrayList<>(coh.getCoherencesList())); } - return new SearchSubquery( - subquery.getIncludeList(), - subquery.getExcludeList(), - subquery.getAdviceList(), - subquery.getPriorityList(), + return new SearchQuery( + 
query.getCompiledQuery(), + query.getIncludeList(), + query.getExcludeList(), + query.getAdviceList(), + query.getPriorityList(), coherences ); } - public static RpcSubquery convertSearchSubquery(SearchSubquery searchSubquery) { + public static RpcQuery convertRpcQuery(SearchQuery searchQuery) { var subqueryBuilder = - RpcSubquery.newBuilder() - .addAllAdvice(searchSubquery.getSearchTermsAdvice()) - .addAllExclude(searchSubquery.getSearchTermsExclude()) - .addAllInclude(searchSubquery.getSearchTermsInclude()) - .addAllPriority(searchSubquery.getSearchTermsPriority()); - for (var coherences : searchSubquery.searchTermCoherences) { + RpcQuery.newBuilder() + .setCompiledQuery(searchQuery.compiledQuery) + .addAllInclude(searchQuery.getSearchTermsInclude()) + .addAllAdvice(searchQuery.getSearchTermsAdvice()) + .addAllExclude(searchQuery.getSearchTermsExclude()) + .addAllPriority(searchQuery.getSearchTermsPriority()); + + for (var coherences : searchQuery.searchTermCoherences) { subqueryBuilder.addCoherencesBuilder().addAllCoherences(coherences); } + return subqueryBuilder.build(); } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java index 28d14c82..f0113870 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java @@ -2,7 +2,6 @@ package nu.marginalia.api.searchquery; import lombok.SneakyThrows; import nu.marginalia.api.searchquery.model.query.SearchSpecification; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.api.searchquery.model.results.SearchResultItem; @@ -14,7 +13,6 @@ import 
nu.marginalia.api.searchquery.model.query.QueryParams; import nu.marginalia.api.searchquery.model.query.QueryResponse; import java.util.ArrayList; -import java.util.List; public class QueryProtobufCodec { @@ -23,9 +21,7 @@ public class QueryProtobufCodec { builder.addAllDomains(request.getDomainIdsList()); - for (var subquery : query.specs.subqueries) { - builder.addSubqueries(IndexProtobufCodec.convertSearchSubquery(subquery)); - } + builder.setQuery(IndexProtobufCodec.convertRpcQuery(query.specs.query)); builder.setSearchSetIdentifier(query.specs.searchSetIdentifier); builder.setHumanQuery(request.getHumanQuery()); @@ -51,9 +47,7 @@ public class QueryProtobufCodec { public static RpcIndexQuery convertQuery(String humanQuery, ProcessedQuery query) { var builder = RpcIndexQuery.newBuilder(); - for (var subquery : query.specs.subqueries) { - builder.addSubqueries(IndexProtobufCodec.convertSearchSubquery(subquery)); - } + builder.setQuery(IndexProtobufCodec.convertRpcQuery(query.specs.query)); builder.setSearchSetIdentifier(query.specs.searchSetIdentifier); builder.setHumanQuery(humanQuery); @@ -147,8 +141,8 @@ public class QueryProtobufCodec { private static SearchResultKeywordScore convertKeywordScore(RpcResultKeywordScore keywordScores) { return new SearchResultKeywordScore( - keywordScores.getSubquery(), keywordScores.getKeyword(), + -1, // termId is internal to index service keywordScores.getEncodedWordMetadata(), keywordScores.getEncodedDocMetadata(), keywordScores.getHtmlFeatures() @@ -156,14 +150,8 @@ public class QueryProtobufCodec { } private static SearchSpecification convertSearchSpecification(RpcIndexQuery specs) { - List subqueries = new ArrayList<>(specs.getSubqueriesCount()); - - for (int i = 0; i < specs.getSubqueriesCount(); i++) { - subqueries.add(IndexProtobufCodec.convertSearchSubquery(specs.getSubqueries(i))); - } - return new SearchSpecification( - subqueries, + IndexProtobufCodec.convertRpcQuery(specs.getQuery()), specs.getDomainsList(), 
specs.getSearchSetIdentifier(), specs.getHumanQuery(), @@ -182,7 +170,6 @@ public class QueryProtobufCodec { .addAllDomainIds(params.domainIds()) .addAllTacitAdvice(params.tacitAdvice()) .addAllTacitExcludes(params.tacitExcludes()) - .addAllTacitIncludes(params.tacitIncludes()) .addAllTacitPriority(params.tacitPriority()) .setHumanQuery(params.humanQuery()) .setQueryLimits(IndexProtobufCodec.convertQueryLimits(params.limits())) diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQuery.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQuery.java new file mode 100644 index 00000000..3ae850a3 --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQuery.java @@ -0,0 +1,76 @@ +package nu.marginalia.api.searchquery.model.compiled; + +import org.jetbrains.annotations.NotNull; + +import java.util.Iterator; +import java.util.function.*; +import java.util.stream.IntStream; +import java.util.stream.Stream; + + +/** A compiled index service query. The class separates the topology of the query from the data, + * and it's possible to create new queries supplanting the data */ +public class CompiledQuery implements Iterable { + + /** The root expression, conveys the topology of the query */ + public final CqExpression root; + + private final CqData data; + + public CompiledQuery(CqExpression root, CqData data) { + this.root = root; + this.data = data; + } + + public CompiledQuery(CqExpression root, T[] data) { + this.root = root; + this.data = new CqData<>(data); + } + + /** Exists for testing, creates a simple query that ANDs all the provided items */ + public static CompiledQuery just(T... 
item) { + return new CompiledQuery<>(new CqExpression.And( + IntStream.range(0, item.length).mapToObj(CqExpression.Word::new).toList() + ), item); + } + + /** Create a new CompiledQuery mapping the leaf nodes using the provided mapper */ + public CompiledQuery map(Class clazz, Function mapper) { + return new CompiledQuery<>( + root, + data.map(clazz, mapper) + ); + } + + public CompiledQueryLong mapToLong(ToLongFunction mapper) { + return new CompiledQueryLong(root, data.mapToLong(mapper)); + } + + public CqExpression root() { + return root; + } + + public Stream stream() { + return data.stream(); + } + + public IntStream indices() { + return IntStream.range(0, data.size()); + } + + public T at(int index) { + return data.get(index); + } + + @NotNull + @Override + public Iterator iterator() { + return stream().iterator(); + } + + public int size() { + return data.size(); + } + + +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryLong.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryLong.java new file mode 100644 index 00000000..639778dc --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryLong.java @@ -0,0 +1,42 @@ +package nu.marginalia.api.searchquery.model.compiled; + +import org.jetbrains.annotations.NotNull; + +import java.util.Iterator; +import java.util.stream.IntStream; +import java.util.stream.LongStream; + + +/** A compiled index service query */ +public class CompiledQueryLong implements Iterable { + private final CqExpression root; + private final CqDataLong data; + + public CompiledQueryLong(CqExpression root, CqDataLong data) { + this.root = root; + this.data = data; + } + + + public CqExpression root() { + return root; + } + + public LongStream stream() { + return data.stream(); + } + + public IntStream indices() { + return IntStream.range(0, data.size()); + } + + public long 
at(int index) { + return data.get(index); + } + + @NotNull + @Override + public Iterator iterator() { + return stream().iterator(); + } +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryParser.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryParser.java new file mode 100644 index 00000000..ae197fb9 --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryParser.java @@ -0,0 +1,113 @@ +package nu.marginalia.api.searchquery.model.compiled; + +import org.apache.commons.lang3.StringUtils; + +import java.util.*; + +/** Parser for a compiled index query */ +public class CompiledQueryParser { + + public static CompiledQuery parse(String query) { + List parts = tokenize(query); + + if (parts.isEmpty()) { + return new CompiledQuery<>( + CqExpression.empty(), + new CqData<>(new String[0]) + ); + } + + // We aren't interested in a binary tree representation, but an n-ary tree one, + // so a somewhat unusual parsing technique is used to avoid having an additional + // flattening step at the end. 
+ + // This is only possible due to the trivial and unambiguous grammar of the compiled queries + + List parenState = new ArrayList<>(); + parenState.add(new AndOrState()); + + Map wordIds = new HashMap<>(); + + for (var part : parts) { + var head = parenState.getLast(); + + if (part.equals("|")) { + head.or(); + } + else if (part.equals("(")) { + parenState.addLast(new AndOrState()); + } + else if (part.equals(")")) { + if (parenState.size() < 2) { + throw new IllegalStateException("Mismatched parentheses in expression: " + query); + } + parenState.removeLast(); + parenState.getLast().and(head.closeOr()); + } + else { + head.and( + new CqExpression.Word( + wordIds.computeIfAbsent(part, p -> wordIds.size()) + ) + ); + } + } + + if (parenState.size() != 1) + throw new IllegalStateException("Mismatched parentheses in expression: " + query); + + // Construct the CompiledQuery object with String:s as leaves + var root = parenState.getLast().closeOr(); + + String[] cqData = new String[wordIds.size()]; + wordIds.forEach((w, i) -> cqData[i] = w); + return new CompiledQuery<>(root, new CqData<>(cqData)); + + } + + private static class AndOrState { + private List andState = new ArrayList<>(); + private List orState = new ArrayList<>(); + + /** Add a new item to the and-list */ + public void and(CqExpression e) { + andState.add(e); + } + + /** Turn the and-list into an expression on the or-list, and then start a new and-list */ + public void or() { + closeAnd(); + + andState = new ArrayList<>(); + } + + /** Turn the and-list into an And-expression in the or-list */ + private void closeAnd() { + if (andState.size() == 1) + orState.add(andState.getFirst()); + else if (!andState.isEmpty()) + orState.add(new CqExpression.And(andState)); + } + + /** Finalize the current and-list, then turn the or-list into an Or-expression */ + public CqExpression closeOr() { + closeAnd(); + + if (orState.isEmpty()) + return CqExpression.empty(); + if (orState.size() == 1) + return 
orState.getFirst(); + + return new CqExpression.Or(orState); + } + } + + private static List tokenize(String query) { + // Each token is guaranteed to be separated by one or more space characters + + return Arrays.stream(StringUtils.split(query, ' ')) + .filter(StringUtils::isNotBlank) + .toList(); + } + +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqData.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqData.java new file mode 100644 index 00000000..b1565dc0 --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqData.java @@ -0,0 +1,51 @@ +package nu.marginalia.api.searchquery.model.compiled; + +import java.lang.reflect.Array; +import java.util.Arrays; +import java.util.function.Function; +import java.util.function.ToDoubleFunction; +import java.util.function.ToLongFunction; +import java.util.stream.Stream; + +public class CqData { + private final T[] data; + + public CqData(T[] data) { + this.data = data; + } + + @SuppressWarnings("unchecked") + public CqData map(Class clazz, Function mapper) { + T2[] newData = (T2[]) Array.newInstance(clazz, data.length); + for (int i = 0; i < data.length; i++) { + newData[i] = mapper.apply((T) data[i]); + } + + return new CqData<>(newData); + } + + public CqDataLong mapToLong(ToLongFunction mapper) { + long[] newData = new long[data.length]; + for (int i = 0; i < data.length; i++) { + newData[i] = mapper.applyAsLong((T) data[i]); + } + + return new CqDataLong(newData); + } + + public T get(int i) { + return data[i]; + } + + public T get(CqExpression.Word w) { + return data[w.idx()]; + } + + public Stream stream() { + return Arrays.stream(data); + } + + public int size() { + return data.length; + } +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqDataLong.java 
b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqDataLong.java new file mode 100644 index 00000000..8049631e --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqDataLong.java @@ -0,0 +1,27 @@ +package nu.marginalia.api.searchquery.model.compiled; + +import java.util.Arrays; +import java.util.stream.LongStream; + +public class CqDataLong { + private final long[] data; + + public CqDataLong(long[] data) { + this.data = data; + } + + public long get(int i) { + return data[i]; + } + public long get(CqExpression.Word w) { + return data[w.idx()]; + } + + public LongStream stream() { + return Arrays.stream(data); + } + + public int size() { + return data.length; + } +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqExpression.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqExpression.java new file mode 100644 index 00000000..e9972526 --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqExpression.java @@ -0,0 +1,170 @@ +package nu.marginalia.api.searchquery.model.compiled; + +import java.util.List; +import java.util.StringJoiner; +import java.util.stream.Stream; + +/** Expression in a parsed index service query + * + */ +public sealed interface CqExpression { + + Stream stream(); + + /** @see nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates */ + long visit(LongVisitor visitor); + /** @see nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates */ + double visit(DoubleVisitor visitor); + /** @see nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates */ + int visit(IntVisitor visitor); + /** @see nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates */ + boolean visit(BoolVisitor visitor); + + T visit(ObjectVisitor visitor); + + static 
CqExpression empty() { + return new Or(List.of()); + } + + + record And(List parts) implements CqExpression { + @Override + public Stream stream() { + return parts.stream().flatMap(CqExpression::stream); + } + + @Override + public long visit(LongVisitor visitor) { + return visitor.onAnd(parts); + } + + @Override + public double visit(DoubleVisitor visitor) { + return visitor.onAnd(parts); + } + + @Override + public int visit(IntVisitor visitor) { + return visitor.onAnd(parts); + } + + @Override + public boolean visit(BoolVisitor visitor) { + return visitor.onAnd(parts); + } + + @Override + public T visit(ObjectVisitor visitor) { return visitor.onAnd(parts); } + + public String toString() { + StringJoiner sj = new StringJoiner(", ", "And[ ", "]"); + parts.forEach(part -> sj.add(part.toString())); + return sj.toString(); + } + + } + + record Or(List parts) implements CqExpression { + @Override + public Stream stream() { + return parts.stream().flatMap(CqExpression::stream); + } + + @Override + public long visit(LongVisitor visitor) { + return visitor.onOr(parts); + } + + @Override + public double visit(DoubleVisitor visitor) { + return visitor.onOr(parts); + } + + @Override + public int visit(IntVisitor visitor) { + return visitor.onOr(parts); + } + + @Override + public boolean visit(BoolVisitor visitor) { + return visitor.onOr(parts); + } + + @Override + public T visit(ObjectVisitor visitor) { return visitor.onOr(parts); } + + public String toString() { + StringJoiner sj = new StringJoiner(", ", "Or[ ", "]"); + parts.forEach(part -> sj.add(part.toString())); + return sj.toString(); + } + + + } + + record Word(int idx) implements CqExpression { + @Override + public Stream stream() { + return Stream.of(this); + } + + @Override + public long visit(LongVisitor visitor) { + return visitor.onLeaf(idx); + } + + @Override + public double visit(DoubleVisitor visitor) { + return visitor.onLeaf(idx); + } + + @Override + public int visit(IntVisitor visitor) { + return 
visitor.onLeaf(idx); + } + + @Override + public boolean visit(BoolVisitor visitor) { + return visitor.onLeaf(idx); + } + + @Override + public T visit(ObjectVisitor visitor) { return visitor.onLeaf(idx); } + + @Override + public String toString() { + return Integer.toString(idx); + } + } + + interface LongVisitor { + long onAnd(List parts); + long onOr(List parts); + long onLeaf(int idx); + } + + interface IntVisitor { + int onAnd(List parts); + int onOr(List parts); + int onLeaf(int idx); + } + + interface BoolVisitor { + boolean onAnd(List parts); + boolean onOr(List parts); + boolean onLeaf(int idx); + } + + interface DoubleVisitor { + double onAnd(List parts); + double onOr(List parts); + double onLeaf(int idx); + } + + interface ObjectVisitor { + T onAnd(List parts); + T onOr(List parts); + T onLeaf(int idx); + } + +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java new file mode 100644 index 00000000..209acbee --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java @@ -0,0 +1,46 @@ +package nu.marginalia.api.searchquery.model.compiled.aggregate; + +import it.unimi.dsi.fastutil.longs.LongSet; +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; + +import java.util.ArrayList; +import java.util.List; +import java.util.function.*; + +public class CompiledQueryAggregates { + /** Compiled query aggregate that for a single boolean that treats or-branches as logical OR, + * and and-branches as logical AND operations. Will return true if there exists a path through + * the query where the provided predicate returns true for each item. 
+ */ + static public boolean booleanAggregate(CompiledQuery query, Predicate predicate) { + return query.root.visit(new CqBooleanAggregate(query, predicate)); + } + + + /** Compiled query aggregate that for a 64b bitmask that treats or-branches as logical OR, + * and and-branches as logical AND operations. + */ + public static long longBitmaskAggregate(CompiledQuery query, ToLongFunction operator) { + return query.root.visit(new CqLongBitmaskOperator(query, operator)); + } + + + /** Apply the operator to each leaf node, then return the highest minimum value found along any path */ + public static int intMaxMinAggregate(CompiledQuery query, ToIntFunction operator) { + return query.root.visit(new CqIntMaxMinOperator(query, operator)); + } + + /** Apply the operator to each leaf node, and then return the highest sum of values possible + * through each branch in the compiled query. + * + */ + public static double doubleSumAggregate(CompiledQuery query, ToDoubleFunction operator) { + return query.root.visit(new CqDoubleSumOperator(query, operator)); + } + + /** Enumerate all possible paths through the compiled query */ + public static List queriesAggregate(CompiledQueryLong query) { + return new ArrayList<>(query.root().visit(new CqQueryPathsOperator(query))); + } +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqBooleanAggregate.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqBooleanAggregate.java new file mode 100644 index 00000000..05ebf4c7 --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqBooleanAggregate.java @@ -0,0 +1,40 @@ +package nu.marginalia.api.searchquery.model.compiled.aggregate; + +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CqExpression; + +import java.util.List; +import java.util.function.IntPredicate; 
+import java.util.function.Predicate; + +public class CqBooleanAggregate implements CqExpression.BoolVisitor { + + private final IntPredicate predicate; + + public CqBooleanAggregate(CompiledQuery query, Predicate objPred) { + this.predicate = idx -> objPred.test(query.at(idx)); + } + + @Override + public boolean onAnd(List parts) { + for (var part : parts) { + if (!part.visit(this)) // short-circuit + return false; + } + return true; + } + + @Override + public boolean onOr(List parts) { + for (var part : parts) { + if (part.visit(this)) // short-circuit + return true; + } + return false; + } + + @Override + public boolean onLeaf(int idx) { + return predicate.test(idx); + } +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqDoubleSumOperator.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqDoubleSumOperator.java new file mode 100644 index 00000000..23d1904e --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqDoubleSumOperator.java @@ -0,0 +1,40 @@ +package nu.marginalia.api.searchquery.model.compiled.aggregate; + +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CqExpression; + +import java.util.List; +import java.util.function.IntToDoubleFunction; +import java.util.function.ToDoubleFunction; + +public class CqDoubleSumOperator implements CqExpression.DoubleVisitor { + + private final IntToDoubleFunction operator; + + public CqDoubleSumOperator(CompiledQuery query, ToDoubleFunction operator) { + this.operator = idx -> operator.applyAsDouble(query.at(idx)); + } + + @Override + public double onAnd(List parts) { + double value = 0; + for (var part : parts) { + value += part.visit(this); + } + return value; + } + + @Override + public double onOr(List parts) { + double value = parts.getFirst().visit(this); + for (int i = 1; i < 
parts.size(); i++) { + value = Math.max(value, parts.get(i).visit(this)); + } + return value; + } + + @Override + public double onLeaf(int idx) { + return operator.applyAsDouble(idx); + } +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqIntMaxMinOperator.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqIntMaxMinOperator.java new file mode 100644 index 00000000..b3ec86bb --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqIntMaxMinOperator.java @@ -0,0 +1,41 @@ +package nu.marginalia.api.searchquery.model.compiled.aggregate; + +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CqExpression; + +import java.util.List; +import java.util.function.IntUnaryOperator; +import java.util.function.ToIntFunction; + +public class CqIntMaxMinOperator implements CqExpression.IntVisitor { + + private final IntUnaryOperator operator; + + + public CqIntMaxMinOperator(CompiledQuery query, ToIntFunction operator) { + this.operator = idx -> operator.applyAsInt(query.at(idx)); + } + + @Override + public int onAnd(List parts) { + int value = parts.getFirst().visit(this); + for (int i = 1; i < parts.size(); i++) { + value = Math.min(value, parts.get(i).visit(this)); + } + return value; + } + + @Override + public int onOr(List parts) { + int value = parts.getFirst().visit(this); + for (int i = 1; i < parts.size(); i++) { + value = Math.max(value, parts.get(i).visit(this)); + } + return value; + } + + @Override + public int onLeaf(int idx) { + return operator.applyAsInt(idx); + } +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqLongBitmaskOperator.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqLongBitmaskOperator.java new file mode 100644 
index 00000000..d9a4804b --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqLongBitmaskOperator.java @@ -0,0 +1,40 @@ +package nu.marginalia.api.searchquery.model.compiled.aggregate; + +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CqExpression; + +import java.util.List; +import java.util.function.IntToLongFunction; +import java.util.function.ToLongFunction; + +public class CqLongBitmaskOperator implements CqExpression.LongVisitor { + + private final IntToLongFunction operator; + + public CqLongBitmaskOperator(CompiledQuery query, ToLongFunction operator) { + this.operator = idx-> operator.applyAsLong(query.at(idx)); + } + + @Override + public long onAnd(List parts) { + long value = ~0L; + for (var part : parts) { + value &= part.visit(this); + } + return value; + } + + @Override + public long onOr(List parts) { + long value = 0L; + for (var part : parts) { + value |= part.visit(this); + } + return value; + } + + @Override + public long onLeaf(int idx) { + return operator.applyAsLong(idx); + } +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqQueryPathsOperator.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqQueryPathsOperator.java new file mode 100644 index 00000000..2339104e --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqQueryPathsOperator.java @@ -0,0 +1,75 @@ +package nu.marginalia.api.searchquery.model.compiled.aggregate; + +import it.unimi.dsi.fastutil.longs.LongArraySet; +import it.unimi.dsi.fastutil.longs.LongOpenHashSet; +import it.unimi.dsi.fastutil.longs.LongSet; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; +import nu.marginalia.api.searchquery.model.compiled.CqExpression; + +import java.util.ArrayList; +import 
java.util.List; + +public class CqQueryPathsOperator implements CqExpression.ObjectVisitor> { + private final CompiledQueryLong query; + + public CqQueryPathsOperator(CompiledQueryLong query) { + this.query = query; + } + + @Override + public List onAnd(List parts) { + return parts.stream() + .map(expr -> expr.visit(this)) + .reduce(List.of(), this::combineAnd); + } + + private List combineAnd(List a, List b) { + // No-op cases + if (a.isEmpty()) + return b; + if (b.isEmpty()) + return a; + + // Simple cases + if (a.size() == 1) { + b.forEach(set -> set.addAll(a.getFirst())); + return b; + } + else if (b.size() == 1) { + a.forEach(set -> set.addAll(b.getFirst())); + return a; + } + + // Case where we AND two ORs + List ret = new ArrayList<>(); + + for (var aPart : a) { + for (var bPart : b) { + LongSet set = new LongOpenHashSet(aPart.size() + bPart.size()); + set.addAll(aPart); + set.addAll(bPart); + ret.add(set); + } + } + + return ret; + } + + @Override + public List onOr(List parts) { + List ret = new ArrayList<>(); + + for (var part : parts) { + ret.addAll(part.visit(this)); + } + + return ret; + } + + @Override + public List onLeaf(int idx) { + var set = new LongArraySet(1); + set.add(query.at(idx)); + return List.of(set); + } +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/QueryResponse.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/QueryResponse.java index 80e5b61a..1834c08f 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/QueryResponse.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/QueryResponse.java @@ -13,10 +13,6 @@ public record QueryResponse(SearchSpecification specs, String domain) { public Set getAllKeywords() { - Set keywords = new HashSet<>(100); - for (var sq : specs.subqueries) { - keywords.addAll(sq.searchTermsInclude); - } - return keywords; + return new 
HashSet<>(specs.query.searchTermsInclude); } } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchSubquery.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchQuery.java similarity index 76% rename from code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchSubquery.java rename to code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchQuery.java index 3798ae89..9dd10396 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchSubquery.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchQuery.java @@ -13,9 +13,12 @@ import java.util.stream.Collectors; @AllArgsConstructor @With @EqualsAndHashCode -public class SearchSubquery { +public class SearchQuery { - /** These terms must be present in the document and are used in ranking*/ + /** An infix style expression that encodes the required terms in the query */ + public final String compiledQuery; + + /** All terms that appear in {@see compiledQuery} */ public final List searchTermsInclude; /** These terms must be absent from the document */ @@ -33,7 +36,8 @@ public class SearchSubquery { @Deprecated // why does this exist? 
private double value = 0; - public SearchSubquery() { + public SearchQuery() { + this.compiledQuery = ""; this.searchTermsInclude = new ArrayList<>(); this.searchTermsExclude = new ArrayList<>(); this.searchTermsAdvice = new ArrayList<>(); @@ -41,11 +45,13 @@ public class SearchSubquery { this.searchTermCoherences = new ArrayList<>(); } - public SearchSubquery(List searchTermsInclude, - List searchTermsExclude, - List searchTermsAdvice, - List searchTermsPriority, - List> searchTermCoherences) { + public SearchQuery(String compiledQuery, + List searchTermsInclude, + List searchTermsExclude, + List searchTermsAdvice, + List searchTermsPriority, + List> searchTermCoherences) { + this.compiledQuery = compiledQuery; this.searchTermsInclude = searchTermsInclude; this.searchTermsExclude = searchTermsExclude; this.searchTermsAdvice = searchTermsAdvice; @@ -54,7 +60,7 @@ public class SearchSubquery { } @Deprecated // why does this exist? - public SearchSubquery setValue(double value) { + public SearchQuery setValue(double value) { if (Double.isInfinite(value) || Double.isNaN(value)) { this.value = Double.MAX_VALUE; } else { @@ -66,7 +72,7 @@ public class SearchSubquery { @Override public String toString() { StringBuilder sb = new StringBuilder(); - if (!searchTermsInclude.isEmpty()) sb.append("include=").append(searchTermsInclude.stream().collect(Collectors.joining(",", "[", "] "))); + if (!compiledQuery.isEmpty()) sb.append("compiledQuery=").append(compiledQuery); if (!searchTermsExclude.isEmpty()) sb.append("exclude=").append(searchTermsExclude.stream().collect(Collectors.joining(",", "[", "] "))); if (!searchTermsAdvice.isEmpty()) sb.append("advice=").append(searchTermsAdvice.stream().collect(Collectors.joining(",", "[", "] "))); if (!searchTermsPriority.isEmpty()) sb.append("priority=").append(searchTermsPriority.stream().collect(Collectors.joining(",", "[", "] "))); diff --git 
a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchSpecification.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchSpecification.java index be2a6895..bbb5b7ae 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchSpecification.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchSpecification.java @@ -10,7 +10,7 @@ import java.util.List; @ToString @Getter @Builder @With @AllArgsConstructor public class SearchSpecification { - public List subqueries; + public SearchQuery query; /** If present and not empty, limit the search to these domain IDs */ public List domains; diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java index cc02ae28..8f50c9fb 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java @@ -21,9 +21,9 @@ public class SearchResultItem implements Comparable { /** How many other potential results existed in the same domain */ public int resultsFromDomain; - public SearchResultItem(long combinedId, int scoresCount) { + public SearchResultItem(long combinedId) { this.combinedId = combinedId; - this.keywordScores = new ArrayList<>(scoresCount); + this.keywordScores = new ArrayList<>(); } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultKeywordScore.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultKeywordScore.java index b84dad0b..f5a9fc02 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultKeywordScore.java 
+++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultKeywordScore.java @@ -7,19 +7,22 @@ import nu.marginalia.model.idx.DocumentMetadata; import java.util.Objects; public final class SearchResultKeywordScore { + @Deprecated public final int subquery; + public final long termId; public final String keyword; private final long encodedWordMetadata; private final long encodedDocMetadata; private final int htmlFeatures; - public SearchResultKeywordScore(int subquery, - String keyword, + public SearchResultKeywordScore(String keyword, + long termId, long encodedWordMetadata, long encodedDocMetadata, int htmlFeatures) { - this.subquery = subquery; + this.termId = termId; + this.subquery = -1; // FIXME, deprecated this.keyword = keyword; this.encodedWordMetadata = encodedWordMetadata; this.encodedDocMetadata = encodedDocMetadata; diff --git a/code/functions/search-query/api/src/main/protobuf/query-api.proto b/code/functions/search-query/api/src/main/protobuf/query-api.proto index f5ec5e8d..606b18f8 100644 --- a/code/functions/search-query/api/src/main/protobuf/query-api.proto +++ b/code/functions/search-query/api/src/main/protobuf/query-api.proto @@ -52,7 +52,7 @@ message RpcTemporalBias { /* Index service query request */ message RpcIndexQuery { - repeated RpcSubquery subqueries = 1; + RpcQuery query = 1; repeated int32 domains = 2; // (optional) A list of domain IDs to consider string searchSetIdentifier = 3; // (optional) A named set of domains to consider string humanQuery = 4; // The search query as the user entered it @@ -102,12 +102,11 @@ message RpcRawResultItem { /* Information about how well a keyword matches a query */ message RpcResultKeywordScore { - int32 subquery = 1; // index of the subquery this keyword relates to - string keyword = 2; // the keyword - int64 encodedWordMetadata = 3; // bit encoded word metadata - int64 encodedDocMetadata = 4; // bit encoded document metadata - bool hasPriorityTerms = 5; // true 
if this word is important to the document - int32 htmlFeatures = 6; // bit encoded document features + string keyword = 1; // the keyword + int64 encodedWordMetadata = 2; // bit encoded word metadata + int64 encodedDocMetadata = 3; // bit encoded document metadata + bool hasPriorityTerms = 4; // true if this word is important to the document + int32 htmlFeatures = 5; // bit encoded document features } /* Query execution parameters */ @@ -137,12 +136,13 @@ message RpcResultRankingParameters { } /* Defines a single subquery */ -message RpcSubquery { +message RpcQuery { repeated string include = 1; // These terms must be present repeated string exclude = 2; // These terms must be absent repeated string advice = 3; // These terms must be present, but do not affect ranking repeated string priority = 4; // These terms are not mandatory, but affect ranking positively if they are present repeated RpcCoherences coherences = 5; // Groups of terms that must exist in proximity of each other + string compiledQuery = 6; // Compiled query in infix notation } /* Defines a group of search terms that must exist in close proximity within the document */ diff --git a/code/functions/search-query/api/test/nu/marginalia/api/searchquery/model/compiled/CompiledQueryParserTest.java b/code/functions/search-query/api/test/nu/marginalia/api/searchquery/model/compiled/CompiledQueryParserTest.java new file mode 100644 index 00000000..47983820 --- /dev/null +++ b/code/functions/search-query/api/test/nu/marginalia/api/searchquery/model/compiled/CompiledQueryParserTest.java @@ -0,0 +1,79 @@ +package nu.marginalia.api.searchquery.model.compiled; + +import org.junit.jupiter.api.Test; + +import java.util.List; + +import static org.junit.jupiter.api.Assertions.*; + +class CompiledQueryParserTest { + + @Test + public void testEmpty() { + assertEquals(CqExpression.empty(), CompiledQueryParser.parse("").root); + assertEquals(CqExpression.empty(), CompiledQueryParser.parse("( )").root); + 
assertEquals(CqExpression.empty(), CompiledQueryParser.parse("( | )").root); + assertEquals(CqExpression.empty(), CompiledQueryParser.parse("| ( | ) |").root); + } + + @Test + public void testSingleWord() { + CompiledQuery q = CompiledQueryParser.parse("foo"); + assertEquals(w(q, "foo"), q.root); + } + + @Test + public void testAndTwoWords() { + CompiledQuery q = CompiledQueryParser.parse("foo bar"); + assertEquals(and(w(q, "foo"), w(q,"bar")), q.root); + } + + @Test + public void testOrTwoWords() { + CompiledQuery q = CompiledQueryParser.parse("foo | bar"); + assertEquals(or(w(q, "foo"), w(q,"bar")), q.root); + } + + @Test + public void testOrAndWords() { + CompiledQuery q = CompiledQueryParser.parse("foo | bar baz"); + assertEquals(or(w(q,"foo"), and(w(q,"bar"), w(q,"baz"))), q.root); + } + + @Test + public void testAndAndOrAndAndWords() { + CompiledQuery q = CompiledQueryParser.parse("foo foobar | bar baz"); + assertEquals(or( + and(w(q, "foo"), w(q, "foobar")), + and(w(q, "bar"), w(q, "baz"))) + , q.root); + } + @Test + public void testComplex1() { + CompiledQuery q = CompiledQueryParser.parse("foo ( bar | baz ) quux"); + assertEquals(and(w(q,"foo"), or(w(q, "bar"), w(q, "baz")), w(q, "quux")), q.root); + } + @Test + public void testComplex2() { + CompiledQuery q = CompiledQueryParser.parse("( ( ( a ) b ) c ) d"); + assertEquals(and(and(and(w(q, "a"), w(q, "b")), w(q, "c")), w(q, "d")), q.root); + } + + @Test + public void testNested() { + CompiledQuery q = CompiledQueryParser.parse("( ( ( a ) ) )"); + assertEquals(w(q,"a"), q.root); + } + + private CqExpression.Word w(CompiledQuery query, String word) { + return new CqExpression.Word(query.indices().filter(idx -> word.equals(query.at(idx))).findAny().orElseThrow()); + } + + private CqExpression and(CqExpression... parts) { + return new CqExpression.And(List.of(parts)); + } + + private CqExpression or(CqExpression... 
parts) { + return new CqExpression.Or(List.of(parts)); + } +} \ No newline at end of file diff --git a/code/functions/search-query/api/test/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregatesTest.java b/code/functions/search-query/api/test/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregatesTest.java new file mode 100644 index 00000000..c3e36180 --- /dev/null +++ b/code/functions/search-query/api/test/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregatesTest.java @@ -0,0 +1,35 @@ +package nu.marginalia.api.searchquery.model.compiled.aggregate; + +import static nu.marginalia.api.searchquery.model.compiled.CompiledQueryParser.parse; +import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.*; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.*; + +class CompiledQueryAggregatesTest { + + @Test + void booleanAggregates() { + assertFalse(booleanAggregate(parse("false"), Boolean::parseBoolean)); + assertTrue(booleanAggregate(parse("true"), Boolean::parseBoolean)); + assertFalse(booleanAggregate(parse("false true"), Boolean::parseBoolean)); + assertTrue(booleanAggregate(parse("( true ) | ( true false )"), Boolean::parseBoolean)); + assertTrue(booleanAggregate(parse("( false ) | ( true )"), Boolean::parseBoolean)); + assertTrue(booleanAggregate(parse("( true false ) | ( true true )"), Boolean::parseBoolean)); + assertFalse(booleanAggregate(parse("( true false ) | ( true false )"), Boolean::parseBoolean)); + } + + @Test + void intMaxMinAggregates() { + assertEquals(5, intMaxMinAggregate(parse("5"), Integer::parseInt)); + assertEquals(3, intMaxMinAggregate(parse("5 3"), Integer::parseInt)); + assertEquals(6, intMaxMinAggregate(parse("5 3 | 6 7"), Integer::parseInt)); + } + + @Test + void doubleSumAggregates() { + assertEquals(5, (int) doubleSumAggregate(parse("5"), Double::parseDouble)); + assertEquals(8, (int) 
doubleSumAggregate(parse("5 3"), Double::parseDouble)); + assertEquals(13, (int) doubleSumAggregate(parse("1 ( 5 3 | 2 10 )"), Double::parseDouble)); + } +} \ No newline at end of file diff --git a/code/functions/search-query/api/test/nu/marginalia/index/client/IndexProtobufCodecTest.java b/code/functions/search-query/api/test/nu/marginalia/index/client/IndexProtobufCodecTest.java index 1782765d..e93f715c 100644 --- a/code/functions/search-query/api/test/nu/marginalia/index/client/IndexProtobufCodecTest.java +++ b/code/functions/search-query/api/test/nu/marginalia/index/client/IndexProtobufCodecTest.java @@ -1,7 +1,7 @@ package nu.marginalia.index.client; import nu.marginalia.api.searchquery.IndexProtobufCodec; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.index.query.limit.QueryLimits; import nu.marginalia.index.query.limit.SpecificationLimit; @@ -35,14 +35,15 @@ class IndexProtobufCodecTest { } @Test public void testSubqery() { - verifyIsIdentityTransformation(new SearchSubquery( + verifyIsIdentityTransformation(new SearchQuery( + "qs", List.of("a", "b"), List.of("c", "d"), List.of("e", "f"), List.of("g", "h"), List.of(List.of("i", "j"), List.of("k")) ), - s -> IndexProtobufCodec.convertSearchSubquery(IndexProtobufCodec.convertSearchSubquery(s)) + s -> IndexProtobufCodec.convertRpcQuery(IndexProtobufCodec.convertRpcQuery(s)) ); } private void verifyIsIdentityTransformation(T val, Function transformation) { diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java index 3c0e5219..55467b4f 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java +++ 
b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java @@ -2,18 +2,16 @@ package nu.marginalia.functions.searchquery.svc; import com.google.inject.Inject; import com.google.inject.Singleton; -import nu.marginalia.LanguageModels; import nu.marginalia.api.searchquery.model.query.SearchSpecification; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; -import nu.marginalia.util.language.EnglishDictionary; +import nu.marginalia.functions.searchquery.query_parser.QueryExpansion; import nu.marginalia.language.WordPatterns; import nu.marginalia.api.searchquery.model.query.QueryParams; import nu.marginalia.api.searchquery.model.query.ProcessedQuery; import nu.marginalia.functions.searchquery.query_parser.QueryParser; import nu.marginalia.functions.searchquery.query_parser.token.Token; import nu.marginalia.functions.searchquery.query_parser.token.TokenType; -import nu.marginalia.term_frequency_dict.TermFrequencyDict; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -26,15 +24,14 @@ import java.util.List; public class QueryFactory { private final Logger logger = LoggerFactory.getLogger(getClass()); - private static final int RETAIN_QUERY_VARIANT_COUNT = 5; private final QueryParser queryParser = new QueryParser(); + private final QueryExpansion queryExpansion; @Inject - public QueryFactory(LanguageModels lm, - TermFrequencyDict dict, - EnglishDictionary englishDictionary) + public QueryFactory(QueryExpansion queryExpansion) { + this.queryExpansion = queryExpansion; } @@ -49,8 +46,6 @@ public class QueryFactory { List searchTermsHuman = new ArrayList<>(); List problems = new ArrayList<>(); - String domain = null; - List basicQuery = queryParser.parse(query); if (basicQuery.size() >= 12) { @@ -74,19 +69,8 @@ public class QueryFactory { t.visit(qualityLimits); } -// var 
queryPermutations = queryPermutation.permuteQueriesNew(basicQuery); - List subqueries = new ArrayList<>(); QuerySearchTermsAccumulator termsAccumulator = new QuerySearchTermsAccumulator(basicQuery); - domain = termsAccumulator.domain; - -// for (var parts : queryPermutations) { -// QuerySearchTermsAccumulator termsAccumulator = new QuerySearchTermsAccumulator(basicQuery); -// -// domain = termsAccumulator.domain; -// -// SearchSubquery subquery = termsAccumulator.createSubquery(); -// subqueries.add(subquery); -// } + String domain = termsAccumulator.domain; List domainIds = params.domainIds(); @@ -97,7 +81,18 @@ public class QueryFactory { } var specsBuilder = SearchSpecification.builder() - .subqueries(subqueries) + .query( + new SearchQuery( + queryExpansion.expandQuery( + termsAccumulator.searchTermsInclude + ), + termsAccumulator.searchTermsInclude, + termsAccumulator.searchTermsExclude, + termsAccumulator.searchTermsAdvice, + termsAccumulator.searchTermsPriority, + termsAccumulator.searchTermCoherences + ) + ) .humanQuery(query) .quality(qualityLimits.qualityLimit) .year(qualityLimits.year) @@ -111,12 +106,9 @@ public class QueryFactory { SearchSpecification specs = specsBuilder.build(); - for (var sq : specs.subqueries) { - sq.searchTermsAdvice.addAll(params.tacitAdvice()); - sq.searchTermsPriority.addAll(params.tacitPriority()); - sq.searchTermsInclude.addAll(params.tacitIncludes()); - sq.searchTermsExclude.addAll(params.tacitExcludes()); - } + specs.query.searchTermsAdvice.addAll(params.tacitAdvice()); + specs.query.searchTermsPriority.addAll(params.tacitPriority()); + specs.query.searchTermsExclude.addAll(params.tacitExcludes()); return new ProcessedQuery(specs, searchTermsHuman, domain); } diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QuerySearchTermsAccumulator.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QuerySearchTermsAccumulator.java index e4def0d0..cc3a7e56 100644 --- 
a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QuerySearchTermsAccumulator.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QuerySearchTermsAccumulator.java @@ -1,6 +1,6 @@ package nu.marginalia.functions.searchquery.svc; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.language.WordPatterns; import nu.marginalia.functions.searchquery.query_parser.token.Token; import nu.marginalia.functions.searchquery.query_parser.token.TokenVisitor; @@ -9,7 +9,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; -/** @see SearchSubquery */ +/** @see SearchQuery */ public class QuerySearchTermsAccumulator implements TokenVisitor { public List searchTermsExclude = new ArrayList<>(); public List searchTermsInclude = new ArrayList<>(); @@ -19,10 +19,6 @@ public class QuerySearchTermsAccumulator implements TokenVisitor { public String domain; - public SearchSubquery createSubquery() { - return new SearchSubquery(searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority, searchTermCoherences); - } - public QuerySearchTermsAccumulator(List parts) { for (Token t : parts) { t.visit(this); diff --git a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java index 24131143..132944c4 100644 --- a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java +++ b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java @@ -3,12 +3,13 @@ package nu.marginalia.query.svc; import nu.marginalia.WmsaHome; import nu.marginalia.api.searchquery.model.query.SearchSpecification; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; +import nu.marginalia.functions.searchquery.query_parser.QueryExpansion; import 
nu.marginalia.functions.searchquery.svc.QueryFactory; import nu.marginalia.index.query.limit.QueryLimits; import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.index.query.limit.SpecificationLimit; import nu.marginalia.index.query.limit.SpecificationLimitType; -import nu.marginalia.util.language.EnglishDictionary; +import nu.marginalia.segmentation.NgramLexicon; import nu.marginalia.api.searchquery.model.query.QueryParams; import nu.marginalia.term_frequency_dict.TermFrequencyDict; import org.junit.jupiter.api.BeforeAll; @@ -27,11 +28,9 @@ public class QueryFactoryTest { public static void setUpAll() throws IOException { var lm = WmsaHome.getLanguageModels(); - var tfd = new TermFrequencyDict(lm); - queryFactory = new QueryFactory(lm, - tfd, - new EnglishDictionary(tfd) + queryFactory = new QueryFactory( + new QueryExpansion(new TermFrequencyDict(lm), new NgramLexicon(lm)) ); } @@ -112,17 +111,15 @@ public class QueryFactoryTest { { // the is a stopword, so it should generate an ngram search term var specs = parseAndGetSpecs("\"the shining\""); - assertEquals(List.of("the_shining"), specs.subqueries.iterator().next().searchTermsInclude); - assertEquals(List.of(), specs.subqueries.iterator().next().searchTermsAdvice); - assertEquals(List.of(), specs.subqueries.iterator().next().searchTermCoherences); + assertEquals("the_shining", specs.query.compiledQuery); } { // tde isn't a stopword, so we should get the normal behavior var specs = parseAndGetSpecs("\"tde shining\""); - assertEquals(List.of("tde", "shining"), specs.subqueries.iterator().next().searchTermsInclude); - assertEquals(List.of("tde_shining"), specs.subqueries.iterator().next().searchTermsAdvice); - assertEquals(List.of(List.of("tde", "shining")), specs.subqueries.iterator().next().searchTermCoherences); + assertEquals("tde shining", specs.query.compiledQuery); + assertEquals(List.of("tde_shining"), specs.query.searchTermsAdvice); + assertEquals(List.of(List.of("tde", "shining")), 
specs.query.searchTermCoherences); } } @@ -150,8 +147,18 @@ public class QueryFactoryTest { @Test public void testPriorityTerm() { - var subquery = parseAndGetSpecs("physics ?tld:edu").subqueries.iterator().next(); + var subquery = parseAndGetSpecs("physics ?tld:edu").query; assertEquals(List.of("tld:edu"), subquery.searchTermsPriority); - assertEquals(List.of("physics"), subquery.searchTermsInclude); + assertEquals("physics", subquery.compiledQuery); + } + + @Test + public void testExpansion() { + + long start = System.currentTimeMillis(); + var subquery = parseAndGetSpecs("elden ring mechanical keyboard slackware linux duke nukem 3d").query; + System.out.println("Time: " + (System.currentTimeMillis() - start)); + System.out.println(subquery.compiledQuery); + } } \ No newline at end of file diff --git a/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexEntrySource.java b/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexEntrySource.java index 37c79941..7c12563b 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexEntrySource.java +++ b/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexEntrySource.java @@ -46,7 +46,7 @@ public class ReverseIndexEntrySource implements EntrySource { return; for (int ri = entrySize, wi=1; ri < buffer.end ; ri+=entrySize, wi++) { - buffer.data[wi] = buffer.data[ri]; + buffer.data.set(wi, buffer.data.get(ri)); } buffer.end /= entrySize; diff --git a/code/index/java/nu/marginalia/index/IndexGrpcService.java b/code/index/java/nu/marginalia/index/IndexGrpcService.java index a47c4684..b675f749 100644 --- a/code/index/java/nu/marginalia/index/IndexGrpcService.java +++ b/code/index/java/nu/marginalia/index/IndexGrpcService.java @@ -9,14 +9,14 @@ import io.prometheus.client.Histogram; import it.unimi.dsi.fastutil.longs.LongArrayList; import lombok.SneakyThrows; import nu.marginalia.api.searchquery.*; +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import 
nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.query.SearchSpecification; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; import nu.marginalia.api.searchquery.model.results.*; import nu.marginalia.array.buffer.LongQueryBuffer; import nu.marginalia.index.index.StatefulIndex; import nu.marginalia.index.model.SearchParameters; import nu.marginalia.index.model.SearchTerms; -import nu.marginalia.index.model.SearchTermsUtil; import nu.marginalia.index.query.IndexQuery; import nu.marginalia.index.query.IndexSearchBudget; import nu.marginalia.index.results.IndexResultValuatorService; @@ -143,7 +143,6 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { .setEncodedWordMetadata(score.encodedWordMetadata()) .setKeyword(score.keyword) .setHtmlFeatures(score.htmlFeatures()) - .setSubquery(score.subquery) ); } @@ -203,7 +202,9 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { return new SearchResultSet(List.of()); } - ResultRankingContext rankingContext = createRankingContext(params.rankingParams, params.subqueries); + ResultRankingContext rankingContext = createRankingContext(params.rankingParams, + params.compiledQuery, + params.compiledQueryIds); var queryExecution = new QueryExecution(rankingContext, params.fetchSize); @@ -255,14 +256,10 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { /** Execute a search query */ public SearchResultSet run(SearchParameters parameters) throws SQLException, InterruptedException { - for (var subquery : parameters.subqueries) { - var terms = new SearchTerms(subquery); - if (terms.isEmpty()) - continue; + var terms = new SearchTerms(parameters.query, parameters.compiledQueryIds); - for (var indexQuery : index.createQueries(terms, parameters.queryParams)) { - workerPool.execute(new IndexLookup(indexQuery, parameters.budget)); - } + for (var indexQuery : index.createQueries(terms, 
parameters.queryParams)) { + workerPool.execute(new IndexLookup(indexQuery, parameters.budget)); } for (int i = 0; i < indexValuationThreads; i++) { @@ -327,7 +324,9 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { buffer.reset(); query.getMoreResults(buffer); - results.addElements(0, buffer.data, 0, buffer.end); + for (int i = 0; i < buffer.end; i++) { + results.add(buffer.data.get(i)); + } if (results.size() < 512) { enqueueResults(new CombinedDocIdList(results)); @@ -413,8 +412,13 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { } - private ResultRankingContext createRankingContext(ResultRankingParameters rankingParams, List subqueries) { - final var termToId = SearchTermsUtil.getAllIncludeTerms(subqueries); + private ResultRankingContext createRankingContext(ResultRankingParameters rankingParams, + CompiledQuery query, + CompiledQueryLong compiledQueryIds) + { + Map termToId = new HashMap<>(query.size()); + query.indices().forEach(id -> termToId.put(query.at(id), compiledQueryIds.at(id))); + final Map termFrequencies = new HashMap<>(termToId.size()); final Map prioFrequencies = new HashMap<>(termToId.size()); diff --git a/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java b/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java index ea78739c..3846bad8 100644 --- a/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java +++ b/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java @@ -38,6 +38,13 @@ public class CombinedIndexReader { return new IndexQueryBuilderImpl(reverseIndexFullReader, reverseIndexPriorityReader, query); } + public QueryFilterStepIf hasWordFull(long termId) { + return reverseIndexFullReader.also(termId); + } + public QueryFilterStepIf hasWordPrio(long termId) { + return reverseIndexPriorityReader.also(termId); + } + /** Creates a query builder for terms in the priority index */ public IndexQueryBuilder findPriorityWord(long wordId) { diff --git 
a/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java b/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java index 825728ae..33ca033e 100644 --- a/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java +++ b/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java @@ -1,9 +1,11 @@ package nu.marginalia.index.index; +import java.util.List; import gnu.trove.set.hash.TLongHashSet; import nu.marginalia.index.ReverseIndexReader; import nu.marginalia.index.query.IndexQuery; import nu.marginalia.index.query.IndexQueryBuilder; +import nu.marginalia.index.query.filter.QueryFilterAnyOf; import nu.marginalia.index.query.filter.QueryFilterStepIf; public class IndexQueryBuilderImpl implements IndexQueryBuilder { @@ -66,6 +68,20 @@ public class IndexQueryBuilderImpl implements IndexQueryBuilder { return this; } + public IndexQueryBuilder addInclusionFilterAny(List filterSteps) { + if (filterSteps.isEmpty()) + return this; + + if (filterSteps.size() == 1) { + query.addInclusionFilter(filterSteps.getFirst()); + } + else { + query.addInclusionFilter(new QueryFilterAnyOf(filterSteps)); + } + + return this; + } + public IndexQuery build() { return query; } diff --git a/code/index/java/nu/marginalia/index/index/QueryBranchWalker.java b/code/index/java/nu/marginalia/index/index/QueryBranchWalker.java new file mode 100644 index 00000000..a465bd86 --- /dev/null +++ b/code/index/java/nu/marginalia/index/index/QueryBranchWalker.java @@ -0,0 +1,78 @@ +package nu.marginalia.index.index; + +import it.unimi.dsi.fastutil.longs.LongArrayList; +import it.unimi.dsi.fastutil.longs.LongSet; + +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.List; + +class QueryBranchWalker { + public final long[] priorityOrder; + public final List paths; + public final long termId; + + private QueryBranchWalker(long[] priorityOrder, List paths, long termId) { + this.priorityOrder = priorityOrder; + this.paths = paths; + 
this.termId = termId; + } + + public boolean atEnd() { + return priorityOrder.length == 0; + } + + public static List create(long[] priorityOrder, List paths) { + + List ret = new ArrayList<>(); + List remainingPaths = new LinkedList<>(paths); + + remainingPaths.removeIf(LongSet::isEmpty); + + for (int i = 0; i < priorityOrder.length; i++) { + long prio = priorityOrder[i]; + + var it = remainingPaths.iterator(); + List pathsForPrio = new ArrayList<>(); + + while (it.hasNext()) { + var path = it.next(); + + if (path.contains(prio)) { + path.remove(prio); + pathsForPrio.add(path); + it.remove(); + } + } + + if (!pathsForPrio.isEmpty()) { + LongArrayList remainingPrios = new LongArrayList(pathsForPrio.size()); + + for (var p : priorityOrder) { + for (var path : pathsForPrio) { + if (path.contains(p)) { + remainingPrios.add(p); + break; + } + } + } + + ret.add(new QueryBranchWalker(remainingPrios.elements(), pathsForPrio, prio)); + } + } + + if (!remainingPaths.isEmpty()) { + System.out.println("Dropping: " + remainingPaths); + } + + return ret; + } + + public List next() { + if (atEnd()) + return List.of(); + + return create(priorityOrder, paths); + } + +} diff --git a/code/index/java/nu/marginalia/index/index/StatefulIndex.java b/code/index/java/nu/marginalia/index/index/StatefulIndex.java index a49e740e..0f55c0c8 100644 --- a/code/index/java/nu/marginalia/index/index/StatefulIndex.java +++ b/code/index/java/nu/marginalia/index/index/StatefulIndex.java @@ -2,6 +2,13 @@ package nu.marginalia.index.index; import com.google.inject.Inject; import com.google.inject.Singleton; +import it.unimi.dsi.fastutil.longs.LongOpenHashSet; +import it.unimi.dsi.fastutil.longs.LongSet; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; +import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; +import nu.marginalia.index.query.filter.QueryFilterAllOf; +import nu.marginalia.index.query.filter.QueryFilterAnyOf; +import 
nu.marginalia.index.query.filter.QueryFilterStepIf; import nu.marginalia.index.results.model.ids.CombinedDocIdList; import nu.marginalia.index.results.model.ids.DocMetadataList; import nu.marginalia.index.model.QueryParams; @@ -14,12 +21,13 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; +import java.util.*; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; +import java.util.function.LongFunction; +import java.util.function.Predicate; +import java.util.stream.Collectors; /** This class delegates SearchIndexReader and deals with the stateful nature of the index, * i.e. it may be possible to reconstruct the index and load a new set of data. @@ -105,6 +113,61 @@ public class StatefulIndex { return combinedIndexReader != null && combinedIndexReader.isLoaded(); } + private Predicate containsOnly(long[] permitted) { + LongSet permittedTerms = new LongOpenHashSet(permitted); + return permittedTerms::containsAll; + } + + private List createBuilders(CompiledQueryLong query, + LongFunction builderFactory, + long[] termPriority) { + List paths = CompiledQueryAggregates.queriesAggregate(query); + + // Remove any paths that do not contain all prioritized terms, as this means + // the term is missing from the index and can never be found + paths.removeIf(containsOnly(termPriority).negate()); + + List helpers = QueryBranchWalker.create(termPriority, paths); + List builders = new ArrayList<>(); + + for (var helper : helpers) { + var builder = builderFactory.apply(helper.termId); + + builders.add(builder); + + if (helper.atEnd()) + continue; + + var filters = helper.next().stream() + .map(this::createFilter) + .toList(); + + builder.addInclusionFilterAny(filters); + } + + return builders; + } + + private QueryFilterStepIf createFilter(QueryBranchWalker helper) { + 
var selfCondition = combinedIndexReader.hasWordFull(helper.termId); + if (helper.atEnd()) + return selfCondition; + + var nextSteps = helper.next(); + var nextFilters = nextSteps.stream() + .map(this::createFilter) + .map(filter -> new QueryFilterAllOf(List.of(selfCondition, filter))) + .collect(Collectors.toList()); + + if (nextFilters.isEmpty()) + return selfCondition; + + if (nextFilters.size() == 1) + return nextFilters.getFirst(); + + + return new QueryFilterAnyOf(nextFilters); + } public List createQueries(SearchTerms terms, QueryParams params) { @@ -117,40 +180,13 @@ public class StatefulIndex { final long[] orderedIncludesPrio = terms.sortedDistinctIncludes(this::compareKeywordsPrio); List queryHeads = new ArrayList<>(10); + + queryHeads.addAll(createBuilders(terms.compiledQuery(), combinedIndexReader::findFullWord, orderedIncludes)); + queryHeads.addAll(createBuilders(terms.compiledQuery(), combinedIndexReader::findPriorityWord, orderedIncludesPrio)); + List queries = new ArrayList<>(10); - // To ensure that good results are discovered, create separate query heads for the priority index that - // filter for terms that contain pairs of two search terms - if (orderedIncludesPrio.length > 1) { - for (int i = 0; i + 1 < orderedIncludesPrio.length; i++) { - for (int j = i + 1; j < orderedIncludesPrio.length; j++) { - var entrySource = combinedIndexReader - .findPriorityWord(orderedIncludesPrio[i]) - .alsoPrio(orderedIncludesPrio[j]); - queryHeads.add(entrySource); - } - } - } - - // Next consider entries that appear only once in the priority index - for (var wordId : orderedIncludesPrio) { - queryHeads.add(combinedIndexReader.findPriorityWord(wordId)); - } - - // Finally consider terms in the full index - queryHeads.add(combinedIndexReader.findFullWord(orderedIncludes[0])); - for (var query : queryHeads) { - if (query == null) { - return Collections.emptyList(); - } - - // Note that we can add all includes as filters, even though - // they may not be present in 
the query head, as the query builder - // will ignore redundant include filters: - for (long orderedInclude : orderedIncludes) { - query = query.alsoFull(orderedInclude); - } for (long term : terms.excludes()) { query = query.notFull(term); @@ -161,6 +197,7 @@ public class StatefulIndex { queries.add(query.addInclusionFilter(combinedIndexReader.filterForParams(params)).build()); } + return queries; } diff --git a/code/index/java/nu/marginalia/index/model/SearchParameters.java b/code/index/java/nu/marginalia/index/model/SearchParameters.java index 7db25341..f0e851e5 100644 --- a/code/index/java/nu/marginalia/index/model/SearchParameters.java +++ b/code/index/java/nu/marginalia/index/model/SearchParameters.java @@ -2,16 +2,16 @@ package nu.marginalia.index.model; import nu.marginalia.api.searchquery.IndexProtobufCodec; import nu.marginalia.api.searchquery.RpcIndexQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryParser; import nu.marginalia.api.searchquery.model.query.SearchSpecification; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.index.query.IndexSearchBudget; import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.index.searchset.SearchSet; -import java.util.ArrayList; -import java.util.List; - import static nu.marginalia.api.searchquery.IndexProtobufCodec.convertSpecLimit; public class SearchParameters { @@ -21,13 +21,16 @@ public class SearchParameters { */ public final int fetchSize; public final IndexSearchBudget budget; - public final List subqueries; + public final SearchQuery query; public final QueryParams queryParams; public final ResultRankingParameters rankingParams; public final int 
limitByDomain; public final int limitTotal; + public final CompiledQuery compiledQuery; + public final CompiledQueryLong compiledQueryIds; + // mutable: /** @@ -40,7 +43,7 @@ public class SearchParameters { this.fetchSize = limits.fetchSize(); this.budget = new IndexSearchBudget(limits.timeoutMs()); - this.subqueries = specsSet.subqueries; + this.query = specsSet.query; this.limitByDomain = limits.resultsByDomain(); this.limitTotal = limits.resultsTotal(); @@ -52,6 +55,9 @@ public class SearchParameters { searchSet, specsSet.queryStrategy); + compiledQuery = CompiledQueryParser.parse(this.query.compiledQuery); + compiledQueryIds = compiledQuery.mapToLong(SearchTermsUtil::getWordId); + rankingParams = specsSet.rankingParams; } @@ -63,11 +69,8 @@ public class SearchParameters { // The time budget is halved because this is the point when we start to // wrap up the search and return the results. this.budget = new IndexSearchBudget(limits.timeoutMs() / 2); + this.query = IndexProtobufCodec.convertRpcQuery(request.getQuery()); - this.subqueries = new ArrayList<>(request.getSubqueriesCount()); - for (int i = 0; i < request.getSubqueriesCount(); i++) { - this.subqueries.add(IndexProtobufCodec.convertSearchSubquery(request.getSubqueries(i))); - } this.limitByDomain = limits.resultsByDomain(); this.limitTotal = limits.resultsTotal(); @@ -79,9 +82,13 @@ public class SearchParameters { searchSet, QueryStrategy.valueOf(request.getQueryStrategy())); + compiledQuery = CompiledQueryParser.parse(this.query.compiledQuery); + compiledQueryIds = compiledQuery.mapToLong(SearchTermsUtil::getWordId); + rankingParams = IndexProtobufCodec.convertRankingParameterss(request.getParameters()); } + public long getDataCost() { return dataCost; } diff --git a/code/index/java/nu/marginalia/index/model/SearchTerms.java b/code/index/java/nu/marginalia/index/model/SearchTerms.java index c32b1aa3..307e4179 100644 --- a/code/index/java/nu/marginalia/index/model/SearchTerms.java +++ 
b/code/index/java/nu/marginalia/index/model/SearchTerms.java @@ -4,7 +4,8 @@ import it.unimi.dsi.fastutil.longs.LongArrayList; import it.unimi.dsi.fastutil.longs.LongComparator; import it.unimi.dsi.fastutil.longs.LongList; import it.unimi.dsi.fastutil.longs.LongOpenHashSet; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; +import nu.marginalia.api.searchquery.model.query.SearchQuery; import java.util.ArrayList; import java.util.List; @@ -18,34 +19,39 @@ public final class SearchTerms { private final LongList priority; private final List coherences; + private final CompiledQueryLong compiledQueryIds; + public SearchTerms( LongList includes, LongList excludes, LongList priority, - List coherences + List coherences, + CompiledQueryLong compiledQueryIds ) { this.includes = includes; this.excludes = excludes; this.priority = priority; this.coherences = coherences; + this.compiledQueryIds = compiledQueryIds; } - public SearchTerms(SearchSubquery subquery) { + public SearchTerms(SearchQuery query, CompiledQueryLong compiledQueryIds) { this(new LongArrayList(), new LongArrayList(), new LongArrayList(), - new ArrayList<>()); + new ArrayList<>(), + compiledQueryIds); - for (var word : subquery.searchTermsInclude) { + for (var word : query.searchTermsInclude) { includes.add(getWordId(word)); } - for (var word : subquery.searchTermsAdvice) { + for (var word : query.searchTermsAdvice) { // This looks like a bug, but it's not includes.add(getWordId(word)); } - for (var coherence : subquery.searchTermCoherences) { + for (var coherence : query.searchTermCoherences) { LongList parts = new LongArrayList(coherence.size()); for (var word : coherence) { @@ -55,10 +61,10 @@ public final class SearchTerms { coherences.add(parts); } - for (var word : subquery.searchTermsExclude) { + for (var word : query.searchTermsExclude) { excludes.add(getWordId(word)); } - for (var word : 
subquery.searchTermsPriority) { + for (var word : query.searchTermsPriority) { priority.add(getWordId(word)); } } @@ -96,6 +102,8 @@ public final class SearchTerms { return coherences; } + public CompiledQueryLong compiledQuery() { return compiledQueryIds; } + @Override public boolean equals(Object obj) { if (obj == this) return true; diff --git a/code/index/java/nu/marginalia/index/model/SearchTermsUtil.java b/code/index/java/nu/marginalia/index/model/SearchTermsUtil.java index 9797ca95..fa516565 100644 --- a/code/index/java/nu/marginalia/index/model/SearchTermsUtil.java +++ b/code/index/java/nu/marginalia/index/model/SearchTermsUtil.java @@ -1,29 +1,9 @@ package nu.marginalia.index.model; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; import nu.marginalia.hash.MurmurHash3_128; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - public class SearchTermsUtil { - /** Extract all include-terms from the specified subqueries, - * and a return a map of the terms and their termIds. - */ - public static Map getAllIncludeTerms(List subqueries) { - Map ret = new HashMap<>(); - - for (var subquery : subqueries) { - for (var include : subquery.searchTermsInclude) { - ret.computeIfAbsent(include, i -> getWordId(include)); - } - } - - return ret; - } - private static final MurmurHash3_128 hasher = new MurmurHash3_128(); /** Translate the word to a unique id. 
*/ diff --git a/code/index/java/nu/marginalia/index/results/IndexMetadataService.java b/code/index/java/nu/marginalia/index/results/IndexMetadataService.java index 1932a5a4..977a87e7 100644 --- a/code/index/java/nu/marginalia/index/results/IndexMetadataService.java +++ b/code/index/java/nu/marginalia/index/results/IndexMetadataService.java @@ -4,7 +4,8 @@ import com.google.inject.Inject; import gnu.trove.map.hash.TObjectLongHashMap; import it.unimi.dsi.fastutil.longs.Long2ObjectArrayMap; import it.unimi.dsi.fastutil.longs.LongArrayList; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.index.index.StatefulIndex; import nu.marginalia.index.model.SearchTermsUtil; import nu.marginalia.index.results.model.QuerySearchTerms; @@ -13,9 +14,6 @@ import nu.marginalia.index.results.model.TermMetadataForCombinedDocumentIds; import nu.marginalia.index.results.model.ids.CombinedDocIdList; import nu.marginalia.index.results.model.ids.TermIdList; -import java.util.ArrayList; -import java.util.List; - import static nu.marginalia.index.results.model.TermCoherenceGroupList.TermCoherenceGroup; import static nu.marginalia.index.results.model.TermMetadataForCombinedDocumentIds.DocumentsWithMetadata; @@ -42,43 +40,24 @@ public class IndexMetadataService { return new TermMetadataForCombinedDocumentIds(termdocToMeta); } - public QuerySearchTerms getSearchTerms(List searchTermVariants) { + public QuerySearchTerms getSearchTerms(CompiledQuery compiledQuery, SearchQuery searchQuery) { LongArrayList termIdsList = new LongArrayList(); TObjectLongHashMap termToId = new TObjectLongHashMap<>(10, 0.75f, -1); - for (var subquery : searchTermVariants) { - for (var term : subquery.searchTermsInclude) { - if (termToId.containsKey(term)) { - continue; - } - - long id = SearchTermsUtil.getWordId(term); - termIdsList.add(id); - 
termToId.put(term, id); - } + for (String word : compiledQuery) { + long id = SearchTermsUtil.getWordId(word); + termIdsList.add(id); + termToId.put(word, id); } return new QuerySearchTerms(termToId, new TermIdList(termIdsList), - getTermCoherences(searchTermVariants)); - } - - - private TermCoherenceGroupList getTermCoherences(List searchTermVariants) { - List coherences = new ArrayList<>(); - - for (var subquery : searchTermVariants) { - for (var coh : subquery.searchTermCoherences) { - coherences.add(new TermCoherenceGroup(coh)); - } - - // It's assumed each subquery has identical coherences - break; - } - - return new TermCoherenceGroupList(coherences); + new TermCoherenceGroupList( + searchQuery.searchTermCoherences.stream().map(TermCoherenceGroup::new).toList() + ) + ); } } diff --git a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java b/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java index 967a600f..3777cf4f 100644 --- a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java +++ b/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java @@ -1,10 +1,13 @@ package nu.marginalia.index.results; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; +import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.SearchResultItem; import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; import nu.marginalia.index.index.StatefulIndex; +import nu.marginalia.index.model.SearchParameters; import nu.marginalia.index.results.model.ids.CombinedDocIdList; import nu.marginalia.index.model.QueryParams; import nu.marginalia.index.results.model.QuerySearchTerms; @@ 
-23,7 +26,6 @@ import java.util.List; * reasons to cache this data, and performs the calculations */ public class IndexResultValuationContext { private final StatefulIndex statefulIndex; - private final List> searchTermVariants; private final QueryParams queryParams; private final TermMetadataForCombinedDocumentIds termMetadataForCombinedDocumentIds; @@ -31,23 +33,26 @@ public class IndexResultValuationContext { private final ResultRankingContext rankingContext; private final ResultValuator searchResultValuator; + private final CompiledQuery compiledQuery; + private final CompiledQueryLong compiledQueryIds; public IndexResultValuationContext(IndexMetadataService metadataService, ResultValuator searchResultValuator, CombinedDocIdList ids, StatefulIndex statefulIndex, ResultRankingContext rankingContext, - List subqueries, - QueryParams queryParams + SearchParameters params ) { this.statefulIndex = statefulIndex; this.rankingContext = rankingContext; this.searchResultValuator = searchResultValuator; - this.searchTermVariants = subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList(); - this.queryParams = queryParams; + this.queryParams = params.queryParams; + this.compiledQuery = params.compiledQuery; + this.compiledQueryIds = params.compiledQueryIds; + + this.searchTerms = metadataService.getSearchTerms(params.compiledQuery, params.query); - this.searchTerms = metadataService.getSearchTerms(subqueries); this.termMetadataForCombinedDocumentIds = metadataService.getTermMetadataForDocuments(ids, searchTerms.termIdsAll); } @@ -65,68 +70,39 @@ public class IndexResultValuationContext { long docMetadata = statefulIndex.getDocumentMetadata(docId); int htmlFeatures = statefulIndex.getHtmlFeatures(docId); - int maxFlagsCount = 0; - boolean anyAllSynthetic = false; - int maxPositionsSet = 0; + SearchResultItem searchResult = new SearchResultItem(docId); - SearchResultItem searchResult = new SearchResultItem(docId, - 
searchTermVariants.stream().mapToInt(List::size).sum()); + SearchResultKeywordScore[] scores = compiledQuery.indices().mapToObj(idx -> + new SearchResultKeywordScore( + compiledQuery.at(idx), + compiledQueryIds.at(idx), + termMetadataForCombinedDocumentIds.getTermMetadata( + compiledQueryIds.at(idx), combinedId + ), + docMetadata, + htmlFeatures) + ) + .toArray(SearchResultKeywordScore[]::new); - for (int querySetId = 0; - querySetId < searchTermVariants.size(); - querySetId++) - { - var termList = searchTermVariants.get(querySetId); + // DANGER: IndexResultValuatorService assumes that searchResult.keywordScores has this specific order, as it needs + // to be able to re-construct its own CompiledQuery for re-ranking the results. This is + // a very flimsy assumption. + searchResult.keywordScores.addAll(List.of(scores)); - SearchResultKeywordScore[] termScoresForSet = new SearchResultKeywordScore[termList.size()]; + CompiledQuery queryGraphScores = new CompiledQuery<>(compiledQuery.root, scores); - boolean synthetic = true; + boolean allSynthetic = !CompiledQueryAggregates.booleanAggregate(queryGraphScores, score -> !score.hasTermFlag(WordFlags.Synthetic)); + int flagsCount = CompiledQueryAggregates.intMaxMinAggregate(queryGraphScores, score -> Long.bitCount(score.encodedWordMetadata() & flagsFilterMask)); + int positionsCount = CompiledQueryAggregates.intMaxMinAggregate(queryGraphScores, SearchResultKeywordScore::positionCount); - for (int termIdx = 0; termIdx < termList.size(); termIdx++) { - String searchTerm = termList.get(termIdx); - - long termMetadata = termMetadataForCombinedDocumentIds.getTermMetadata( - searchTerms.getIdForTerm(searchTerm), - combinedId - ); - - var score = new SearchResultKeywordScore( - querySetId, - searchTerm, - termMetadata, - docMetadata, - htmlFeatures - ); - - synthetic &= WordFlags.Synthetic.isPresent(termMetadata); - - searchResult.keywordScores.add(score); - - termScoresForSet[termIdx] = score; - } - - if 
(!meetsQueryStrategyRequirements(termScoresForSet, queryParams.queryStrategy())) { - continue; - } - - int minFlagsCount = 8; - int minPositionsSet = 4; - - for (var termScore : termScoresForSet) { - final int flagCount = Long.bitCount(termScore.encodedWordMetadata() & flagsFilterMask); - minFlagsCount = Math.min(minFlagsCount, flagCount); - minPositionsSet = Math.min(minPositionsSet, termScore.positionCount()); - } - - maxFlagsCount = Math.max(maxFlagsCount, minFlagsCount); - maxPositionsSet = Math.max(maxPositionsSet, minPositionsSet); - anyAllSynthetic |= synthetic; + if (!meetsQueryStrategyRequirements(queryGraphScores, queryParams.queryStrategy())) { + return null; } - if (maxFlagsCount == 0 && !anyAllSynthetic && maxPositionsSet == 0) + if (flagsCount == 0 && !allSynthetic && positionsCount == 0) return null; - double score = searchResultValuator.calculateSearchResultValue(searchResult.keywordScores, + double score = searchResultValuator.calculateSearchResultValue(queryGraphScores, 5000, // use a dummy value here as it's not present in the index rankingContext); @@ -135,20 +111,17 @@ public class IndexResultValuationContext { return searchResult; } - private boolean meetsQueryStrategyRequirements(SearchResultKeywordScore[] termSet, QueryStrategy queryStrategy) { + private boolean meetsQueryStrategyRequirements(CompiledQuery queryGraphScores, + QueryStrategy queryStrategy) + { if (queryStrategy == QueryStrategy.AUTO || queryStrategy == QueryStrategy.SENTENCE || queryStrategy == QueryStrategy.TOPIC) { return true; } - for (var keyword : termSet) { - if (!meetsQueryStrategyRequirements(keyword, queryParams.queryStrategy())) { - return false; - } - } - - return true; + return CompiledQueryAggregates.booleanAggregate(queryGraphScores, + docs -> meetsQueryStrategyRequirements(docs, queryParams.queryStrategy())); } private boolean meetsQueryStrategyRequirements(SearchResultKeywordScore termScore, QueryStrategy queryStrategy) { diff --git 
a/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java b/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java index 51e59c63..f1dabea4 100644 --- a/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java +++ b/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java @@ -4,10 +4,11 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import gnu.trove.list.TLongList; import gnu.trove.list.array.TLongArrayList; -import it.unimi.dsi.fastutil.longs.LongArrayList; +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.SearchResultItem; +import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; import nu.marginalia.index.index.StatefulIndex; import nu.marginalia.index.model.SearchParameters; import nu.marginalia.index.results.model.ids.CombinedDocIdList; @@ -19,8 +20,6 @@ import org.slf4j.LoggerFactory; import java.sql.SQLException; import java.util.*; -import java.util.function.Consumer; -import java.util.stream.Collectors; @Singleton public class IndexResultValuatorService { @@ -44,8 +43,8 @@ public class IndexResultValuatorService { } public List rankResults(SearchParameters params, - ResultRankingContext rankingContext, - CombinedDocIdList resultIds) + ResultRankingContext rankingContext, + CombinedDocIdList resultIds) { final var evaluator = createValuationContext(params, rankingContext, resultIds); @@ -70,8 +69,7 @@ public class IndexResultValuatorService { resultIds, statefulIndex, rankingContext, - params.subqueries, - params.queryParams); + params); } @@ -96,12 +94,13 @@ public class IndexResultValuatorService { item.resultsFromDomain = domainCountFilter.getCount(item); } - return decorateAndRerank(resultsList, rankingContext); + 
return decorateAndRerank(resultsList, params.compiledQuery, rankingContext); } /** Decorate the result items with additional information from the link database * and calculate an updated ranking with the additional information */ public List decorateAndRerank(List rawResults, + CompiledQuery compiledQuery, ResultRankingContext rankingContext) throws SQLException { @@ -125,13 +124,22 @@ public class IndexResultValuatorService { continue; } - resultItems.add(createCombinedItem(result, docData, rankingContext)); + // Reconstruct the SearchResultKeywordScore-compiledquery for re-valuation + // + // CAVEAT: This hinges on a very fragile assumption that IndexResultValuationContext puts them in the same + // order as the data for the CompiledQuery. + CompiledQuery resultQuery = + new CompiledQuery<>(compiledQuery.root, result.keywordScores.toArray(SearchResultKeywordScore[]::new)); + + + resultItems.add(createCombinedItem(result, docData, resultQuery, rankingContext)); } return resultItems; } private DecoratedSearchResultItem createCombinedItem(SearchResultItem result, DocdbUrlDetail docData, + CompiledQuery resultQuery, ResultRankingContext rankingContext) { return new DecoratedSearchResultItem( result, @@ -144,7 +152,7 @@ public class IndexResultValuatorService { docData.pubYear(), docData.dataHash(), docData.wordsTotal(), - resultValuator.calculateSearchResultValue(result.keywordScores, docData.wordsTotal(), rankingContext) + resultValuator.calculateSearchResultValue(resultQuery, docData.wordsTotal(), rankingContext) ); } diff --git a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java index 6c67559d..05ff83d2 100644 --- a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java +++ b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java @@ -1,5 +1,6 @@ package nu.marginalia.ranking.results; +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; import
nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; @@ -33,14 +34,17 @@ public class ResultValuator { this.termCoherenceFactor = termCoherenceFactor; } - public double calculateSearchResultValue(List scores, + public double calculateSearchResultValue(CompiledQuery scores, int length, ResultRankingContext ctx) { - int sets = numberOfSets(scores); + if (scores.size() == 0) + return Double.MAX_VALUE; + if (length < 0) + length = 5000; - long documentMetadata = documentMetadata(scores); - int features = htmlFeatures(scores); + long documentMetadata = scores.at(0).encodedDocMetadata(); + int features = scores.at(0).htmlFeatures(); var rankingParams = ctx.params; int rank = DocumentMetadata.decodeRank(documentMetadata); @@ -75,32 +79,16 @@ public class ResultValuator { + temporalBias + flagsPenalty; - double bestTcf = 0; - double bestBM25F = 0; - double bestBM25P = 0; - double bestBM25PN = 0; - - for (int set = 0; set < sets; set++) { - ResultKeywordSet keywordSet = createKeywordSet(scores, set); - - if (keywordSet.isEmpty()) - continue; - - bestTcf = Math.max(bestTcf, rankingParams.tcfWeight * termCoherenceFactor.calculate(keywordSet)); - bestBM25P = Math.max(bestBM25P, rankingParams.bm25PrioWeight * bm25Factor.calculateBm25Prio(rankingParams.prioParams, keywordSet, ctx)); - bestBM25F = Math.max(bestBM25F, rankingParams.bm25FullWeight * bm25Factor.calculateBm25(rankingParams.fullParams, keywordSet, length, ctx)); - if (keywordSet.hasNgram()) { - bestBM25PN = Math.max(bestBM25PN, rankingParams.bm25PrioWeight * bm25Factor.calculateBm25Prio(rankingParams.prioParams, keywordSet, ctx)); - } - } - + double bestTcf = rankingParams.tcfWeight * termCoherenceFactor.calculate(scores); + double bestBM25F = rankingParams.bm25FullWeight * bm25Factor.calculateBm25(rankingParams.fullParams, scores, length, ctx); + double
bestBM25P = rankingParams.bm25PrioWeight * bm25Factor.calculateBm25Prio(rankingParams.prioParams, scores, ctx); double overallPartPositive = Math.max(0, overallPart); double overallPartNegative = -Math.min(0, overallPart); // Renormalize to 0...15, where 0 is the best possible score; // this is a historical artifact of the original ranking function - return normalize(1.5 * bestTcf + bestBM25F + bestBM25P + 0.25 * bestBM25PN + overallPartPositive, overallPartNegative); + return normalize(1.5 * bestTcf + bestBM25F + bestBM25P + overallPartPositive, overallPartNegative); } private double calculateQualityPenalty(int size, int quality, ResultRankingParameters rankingParams) { diff --git a/code/index/java/nu/marginalia/ranking/results/factors/Bm25Factor.java b/code/index/java/nu/marginalia/ranking/results/factors/Bm25Factor.java index 335b5fa8..bc13671e 100644 --- a/code/index/java/nu/marginalia/ranking/results/factors/Bm25Factor.java +++ b/code/index/java/nu/marginalia/ranking/results/factors/Bm25Factor.java @@ -1,10 +1,11 @@ package nu.marginalia.ranking.results.factors; +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; import nu.marginalia.api.searchquery.model.results.Bm25Parameters; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; import nu.marginalia.model.idx.WordFlags; -import nu.marginalia.ranking.results.ResultKeywordSet; public class Bm25Factor { private static final int AVG_LENGTH = 5000; @@ -13,43 +14,33 @@ public class Bm25Factor { * * @see Bm25Parameters */ - public double calculateBm25(Bm25Parameters bm25Parameters, ResultKeywordSet keywordSet, int length, ResultRankingContext ctx) { + public double calculateBm25(Bm25Parameters bm25Parameters, CompiledQuery scores, int length, ResultRankingContext ctx) { final int docCount = ctx.termFreqDocCount(); - 
if (length <= 0) - length = AVG_LENGTH; - - double sum = 0.; - - for (var keyword : keywordSet.keywords()) { + return CompiledQueryAggregates.doubleSumAggregate(scores, keyword -> { double count = keyword.positionCount(); int freq = ctx.frequency(keyword.keyword); - sum += invFreq(docCount, freq) * f(bm25Parameters.k(), bm25Parameters.b(), count, length); - } - - return sum; + return invFreq(docCount, freq) * f(bm25Parameters.k(), bm25Parameters.b(), count, length); + }); } /** Bm25 calculation, except instead of counting positions in the document, * the number of relevance signals for the term is counted instead. */ - public double calculateBm25Prio(Bm25Parameters bm25Parameters, ResultKeywordSet keywordSet, ResultRankingContext ctx) { + public double calculateBm25Prio(Bm25Parameters bm25Parameters, CompiledQuery scores, ResultRankingContext ctx) { final int docCount = ctx.termFreqDocCount(); - double sum = 0.; - - for (var keyword : keywordSet.keywords()) { + return CompiledQueryAggregates.doubleSumAggregate(scores, keyword -> { double count = evaluatePriorityScore(keyword); int freq = ctx.priorityFrequency(keyword.keyword); // note we override b to zero for priority terms as they are independent of document length - sum += invFreq(docCount, freq) * f(bm25Parameters.k(), 0, count, 0); - } + return invFreq(docCount, freq) * f(bm25Parameters.k(), 0, count, 0); + }); - return sum; } private static double evaluatePriorityScore(SearchResultKeywordScore keyword) { diff --git a/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java b/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java index f956ce88..71159c58 100644 --- a/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java +++ b/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java @@ -1,14 +1,16 @@ package nu.marginalia.ranking.results.factors; +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; 
+import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; +import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; import nu.marginalia.model.idx.WordMetadata; -import nu.marginalia.ranking.results.ResultKeywordSet; /** Rewards documents where terms appear frequently within the same sentences */ public class TermCoherenceFactor { - public double calculate(ResultKeywordSet keywordSet) { - long mask = combinedMask(keywordSet); + public double calculate(CompiledQuery scores) { + long mask = CompiledQueryAggregates.longBitmaskAggregate(scores, score -> score.positions() & WordMetadata.POSITIONS_MASK); return bitsSetFactor(mask); } @@ -19,14 +21,5 @@ public class TermCoherenceFactor { return Math.pow(bitsSetInMask/(float) WordMetadata.POSITIONS_COUNT, 0.25); } - long combinedMask(ResultKeywordSet keywordSet) { - long mask = WordMetadata.POSITIONS_MASK; - - for (var keyword : keywordSet.keywords()) { - mask &= keyword.positions(); - } - - return mask; - } } \ No newline at end of file diff --git a/code/index/query/java/nu/marginalia/index/query/IndexQueryBuilder.java b/code/index/query/java/nu/marginalia/index/query/IndexQueryBuilder.java index 68a88625..74ebdea1 100644 --- a/code/index/query/java/nu/marginalia/index/query/IndexQueryBuilder.java +++ b/code/index/query/java/nu/marginalia/index/query/IndexQueryBuilder.java @@ -2,6 +2,8 @@ package nu.marginalia.index.query; import nu.marginalia.index.query.filter.QueryFilterStepIf; +import java.util.List; + /** Builds a query. *

* Note: The query builder may omit predicates that are deemed redundant. @@ -21,6 +23,7 @@ public interface IndexQueryBuilder { IndexQueryBuilder notFull(long termId); IndexQueryBuilder addInclusionFilter(QueryFilterStepIf filterStep); + IndexQueryBuilder addInclusionFilterAny(List filterStep); IndexQuery build(); } diff --git a/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterAllOf.java b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterAllOf.java new file mode 100644 index 00000000..8c20fe98 --- /dev/null +++ b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterAllOf.java @@ -0,0 +1,57 @@ +package nu.marginalia.index.query.filter; + +import nu.marginalia.array.buffer.LongQueryBuffer; + +import java.util.List; +import java.util.StringJoiner; + +public class QueryFilterAllOf implements QueryFilterStepIf { + private final List steps; + + public QueryFilterAllOf(List steps) { + this.steps = steps; + } + + public double cost() { + double prod = 1.; + + for (var step : steps) { + double cost = step.cost(); + if (cost > 1.0) { + prod *= Math.log(cost); + } + else { + prod += cost; + } + } + + return prod; + } + + @Override + public boolean test(long value) { + for (var step : steps) { + if (!step.test(value)) + return false; + } + return true; + } + + + public void apply(LongQueryBuffer buffer) { + if (steps.isEmpty()) + return; + + for (var step : steps) { + step.apply(buffer); + } + } + + public String describe() { + StringJoiner sj = new StringJoiner(",", "[All Of: ", "]"); + for (var step : steps) { + sj.add(step.describe()); + } + return sj.toString(); + } +} diff --git a/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterAnyOf.java b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterAnyOf.java index c9ee2c6e..2d177645 100644 --- a/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterAnyOf.java +++ 
b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterAnyOf.java @@ -2,7 +2,6 @@ package nu.marginalia.index.query.filter; import nu.marginalia.array.buffer.LongQueryBuffer; -import java.util.Arrays; import java.util.List; import java.util.StringJoiner; @@ -14,7 +13,7 @@ public class QueryFilterAnyOf implements QueryFilterStepIf { } public double cost() { - return steps.stream().mapToDouble(QueryFilterStepIf::cost).average().orElse(0.); + return steps.stream().mapToDouble(QueryFilterStepIf::cost).sum(); } @Override @@ -31,31 +30,23 @@ public class QueryFilterAnyOf implements QueryFilterStepIf { if (steps.isEmpty()) return; - int start; + int start = 0; int end = buffer.end; - steps.getFirst().apply(buffer); - - // The filter functions will partition the data in the buffer from 0 to END, - // and update END to the length of the retained items, keeping the retained - // items sorted but making no guarantees about the rejected half - // - // Therefore, we need to re-sort the rejected side, and to satisfy the - // constraint that the data is sorted up to END, finally sort it again. - // - // This sorting may seem like it's slower, but filter.apply(...) 
is - // typically much faster than iterating over filter.test(...); so this - // is more than made up for - - for (int fi = 1; fi < steps.size(); fi++) + for (var step : steps) { - start = buffer.end; - Arrays.sort(buffer.data, start, end); - buffer.startFilterForRange(start, end); - steps.get(fi).apply(buffer); + var slice = buffer.slice(start, end); + slice.data.quickSort(0, slice.size()); + + step.apply(slice); + start += slice.end; } - Arrays.sort(buffer.data, 0, buffer.end); + buffer.data.quickSort(0, start); + + // Special finalization + buffer.reset(); + buffer.end = start; } public String describe() { diff --git a/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterLetThrough.java b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterLetThrough.java index ed02dd6d..77f503cf 100644 --- a/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterLetThrough.java +++ b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterLetThrough.java @@ -16,7 +16,7 @@ public class QueryFilterLetThrough implements QueryFilterStepIf { } public double cost() { - return 0.; + return 1.; } public String describe() { diff --git a/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterNoPass.java b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterNoPass.java index 1bcd04ae..502e7c4c 100644 --- a/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterNoPass.java +++ b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterNoPass.java @@ -15,7 +15,7 @@ public class QueryFilterNoPass implements QueryFilterStepIf { } public double cost() { - return 0.; + return 1.; } public String describe() { diff --git a/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterStepExcludeFromPredicate.java b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterStepExcludeFromPredicate.java index 92c8c972..0d715863 100644 --- 
a/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterStepExcludeFromPredicate.java +++ b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterStepExcludeFromPredicate.java @@ -16,7 +16,7 @@ public class QueryFilterStepExcludeFromPredicate implements QueryFilterStepIf { @Override public double cost() { - return 0; + return 1; } @Override diff --git a/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterStepFromPredicate.java b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterStepFromPredicate.java index 56f08b71..9cd51d7a 100644 --- a/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterStepFromPredicate.java +++ b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterStepFromPredicate.java @@ -16,7 +16,7 @@ public class QueryFilterStepFromPredicate implements QueryFilterStepIf { @Override public double cost() { - return 0; + return 1; } @Override diff --git a/code/index/query/test/nu/marginalia/index/query/filter/QueryFilterStepIfTest.java b/code/index/query/test/nu/marginalia/index/query/filter/QueryFilterStepIfTest.java index a7450b11..b2ef1bdb 100644 --- a/code/index/query/test/nu/marginalia/index/query/filter/QueryFilterStepIfTest.java +++ b/code/index/query/test/nu/marginalia/index/query/filter/QueryFilterStepIfTest.java @@ -55,6 +55,32 @@ class QueryFilterStepIfTest { assertArrayEquals(new long[]{8, 10}, buffer.copyData()); } + @Test + public void testSuccessiveApplicationWithAllOf() { + var buffer = createBuffer(1, 2, 3, 4, 5, 6, 7, 8, 9, 10); + var filter1 = new QueryFilterStepFromPredicate(value -> value % 2 == 0); + var filter2 = new QueryFilterStepExcludeFromPredicate(value -> value <= 6); + new QueryFilterAllOf(List.of(filter1, filter2)).apply(buffer); + assertArrayEquals(new long[]{8, 10}, buffer.copyData()); + } + @Test + public void testCombinedOrAnd() { + var buffer = createBuffer(1, 2, 3, 4, 5, 6, 7, 8, 9, 10); + + var filter1 = new QueryFilterStepFromPredicate(value 
-> value % 2 == 0); + var filter2 = new QueryFilterStepFromPredicate(value -> value <= 5); + var filter1_2 = new QueryFilterAllOf(List.of(filter1, filter2)); + + var filter3 = new QueryFilterStepFromPredicate(value -> value % 2 == 1); + var filter4 = new QueryFilterStepFromPredicate(value -> value > 5); + var filter3_4 = new QueryFilterAllOf(List.of(filter3, filter4)); + + var filter12_34 = new QueryFilterAnyOf(List.of(filter1_2, filter3_4)); + + filter12_34.apply(buffer); + + assertArrayEquals(new long[]{2, 4, 7, 9}, buffer.copyData()); + } @Test public void testCombinedApplication() { var buffer = createBuffer(1, 2, 3, 4, 5, 6, 7, 8, 9, 10); diff --git a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java index 634481f4..301b5e19 100644 --- a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java +++ b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java @@ -5,7 +5,7 @@ import com.google.inject.Inject; import lombok.SneakyThrows; import nu.marginalia.IndexLocations; import nu.marginalia.api.searchquery.model.query.SearchSpecification; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.index.index.StatefulIndex; import nu.marginalia.process.control.FakeProcessHeartbeat; @@ -123,9 +123,10 @@ public class IndexQueryServiceIntegrationSmokeTest { .rankingParams(ResultRankingParameters.sensibleDefaults()) .domains(new ArrayList<>()) .searchSetIdentifier("NONE") - .subqueries(List.of(new SearchSubquery( + .query(new SearchQuery( + "2 3 5", List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(), - Collections.emptyList()))).build()); + Collections.emptyList())).build()); int[] idxes = new int[] { 30, 510, 90, 
150, 210, 270, 330, 390, 450 }; long[] ids = IntStream.of(idxes).mapToLong(this::fullId).toArray(); @@ -166,9 +167,13 @@ public class IndexQueryServiceIntegrationSmokeTest { .rankingParams(ResultRankingParameters.sensibleDefaults()) .queryStrategy(QueryStrategy.SENTENCE) .domains(List.of(2)) - .subqueries(List.of(new SearchSubquery( - List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(), - Collections.emptyList()))).build()); + .query(new SearchQuery( + "2 3 5", + List.of("3", "5", "2"), + List.of("4"), + Collections.emptyList(), + Collections.emptyList(), + Collections.emptyList())).build()); int[] idxes = new int[] { 210, 270 }; long[] ids = IntStream.of(idxes).mapToLong(id -> UrlIdCodec.encodeId(id/100, id)).toArray(); long[] actual = rsp.results.stream().mapToLong(i -> i.rawIndexResult.getDocumentId()).toArray(); @@ -202,9 +207,8 @@ public class IndexQueryServiceIntegrationSmokeTest { .queryStrategy(QueryStrategy.SENTENCE) .searchSetIdentifier("NONE") .rankingParams(ResultRankingParameters.sensibleDefaults()) - .subqueries(List.of(new SearchSubquery( - List.of("4"), Collections.emptyList(), Collections.emptyList(), Collections.emptyList(), - Collections.emptyList())) + .query( + new SearchQuery("4", List.of("4"), Collections.emptyList(), Collections.emptyList(), Collections.emptyList(), Collections.emptyList()) ).build()); diff --git a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTest.java b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTest.java index 6def5bbc..e29f8751 100644 --- a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTest.java +++ b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTest.java @@ -4,7 +4,7 @@ import com.google.inject.Guice; import com.google.inject.Inject; import nu.marginalia.IndexLocations; import nu.marginalia.api.searchquery.model.query.SearchSpecification; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import 
nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.index.index.StatefulIndex; import nu.marginalia.storage.FileStorageService; @@ -35,6 +35,7 @@ import nu.marginalia.process.control.ProcessHeartbeat; import nu.marginalia.index.domainrankings.DomainRankings; import nu.marginalia.service.control.ServiceHeartbeat; import nu.marginalia.service.server.Initialization; +import org.apache.logging.log4j.util.Strings; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -108,7 +109,7 @@ public class IndexQueryServiceIntegrationTest { w("world", WordFlags.Title) ).load(); - var query = basicQuery(builder -> builder.subqueries(justInclude("hello", "world"))); + var query = basicQuery(builder -> builder.query(justInclude("hello", "world"))); executeSearch(query) .expectDocumentsInOrder(d(1,1)); @@ -127,57 +128,51 @@ public class IndexQueryServiceIntegrationTest { ).load(); var queryMissingExclude = basicQuery(builder -> - builder.subqueries(includeAndExclude("hello", "missing"))); + builder.query(includeAndExclude("hello", "missing"))); executeSearch(queryMissingExclude) .expectDocumentsInOrder(d(1,1)); var queryMissingInclude = basicQuery(builder -> - builder.subqueries(justInclude("missing"))); + builder.query(justInclude("missing"))); executeSearch(queryMissingInclude) .expectCount(0); var queryMissingPriority = basicQuery(builder -> - builder.subqueries( - List.of( - new SearchSubquery( - List.of("hello"), - List.of(), - List.of(), - List.of("missing"), - List.of() - ) - ))); + builder.query(new SearchQuery( + "hello", + List.of("hello"), + List.of(), + List.of(), + List.of("missing"), + List.of()) + )); executeSearch(queryMissingPriority) .expectCount(1); var queryMissingAdvice = basicQuery(builder -> - builder.subqueries( - List.of( - new SearchSubquery( - List.of("hello"), - List.of(), - List.of("missing"), 
- List.of(), - List.of() - ) + builder.query( + new SearchQuery("hello", + List.of("hello"), + List.of(), + List.of("missing"), + List.of(), + List.of() ))); executeSearch(queryMissingAdvice) .expectCount(0); var queryMissingCoherence = basicQuery(builder -> - builder.subqueries( - List.of( - new SearchSubquery( - List.of("hello"), - List.of(), - List.of(), - List.of(), - List.of(List.of("missing", "hello")) - ) + builder.query( + new SearchQuery("hello", + List.of("hello"), + List.of(), + List.of(), + List.of(), + List.of(List.of("missing", "hello")) ))); executeSearch(queryMissingCoherence) @@ -202,7 +197,7 @@ public class IndexQueryServiceIntegrationTest { ).load(); - var query = basicQuery(builder -> builder.subqueries(justInclude("hello", "world"))); + var query = basicQuery(builder -> builder.query(justInclude("hello", "world"))); executeSearch(query) .expectDocumentsInOrder(d(1,1)); @@ -234,15 +229,15 @@ public class IndexQueryServiceIntegrationTest { var beforeY2K = basicQuery(builder -> - builder.subqueries(justInclude("hello", "world")) + builder.query(justInclude("hello", "world")) .year(SpecificationLimit.lessThan(2000)) ); var atY2K = basicQuery(builder -> - builder.subqueries(justInclude("hello", "world")) + builder.query(justInclude("hello", "world")) .year(SpecificationLimit.equals(2000)) ); var afterY2K = basicQuery(builder -> - builder.subqueries(justInclude("hello", "world")) + builder.query(justInclude("hello", "world")) .year(SpecificationLimit.greaterThan(2000)) ); @@ -296,11 +291,11 @@ public class IndexQueryServiceIntegrationTest { var domain1 = basicQuery(builder -> - builder.subqueries(justInclude("hello", "world")) + builder.query(justInclude("hello", "world")) .domains(List.of(1)) ); var domain2 = basicQuery(builder -> - builder.subqueries(justInclude("hello", "world")) + builder.query(justInclude("hello", "world")) .domains(List.of(2)) ); @@ -334,7 +329,7 @@ public class IndexQueryServiceIntegrationTest { ).load(); var query = 
basicQuery(builder -> - builder.subqueries(includeAndExclude("hello", "my_darling")) + builder.query(includeAndExclude("hello", "my_darling")) ); executeSearch(query) @@ -403,7 +398,7 @@ public class IndexQueryServiceIntegrationTest { .load(); var rsp = queryService.justQuery( - basicQuery(builder -> builder.subqueries( + basicQuery(builder -> builder.query( // note coherence requriement includeAndCohere("hello", "world") ))); @@ -424,50 +419,53 @@ public class IndexQueryServiceIntegrationTest { .rank(SpecificationLimit.none()) .rankingParams(ResultRankingParameters.sensibleDefaults()) .domains(new ArrayList<>()) - .searchSetIdentifier("NONE") - .subqueries(List.of()); + .searchSetIdentifier("NONE"); return mutator.apply(builder).build(); } - List justInclude(String... includes) { - return List.of(new SearchSubquery( + SearchQuery justInclude(String... includes) { + return new SearchQuery( + Strings.join(List.of(includes), ' '), List.of(includes), List.of(), List.of(), List.of(), List.of() - )); + ); } - List includeAndExclude(List includes, List excludes) { - return List.of(new SearchSubquery( + SearchQuery includeAndExclude(List includes, List excludes) { + return new SearchQuery( + Strings.join(List.of(includes), ' '), includes, excludes, List.of(), List.of(), List.of() - )); + ); } - List includeAndExclude(String include, String exclude) { - return List.of(new SearchSubquery( + SearchQuery includeAndExclude(String include, String exclude) { + return new SearchQuery( + include, List.of(include), List.of(exclude), List.of(), List.of(), List.of() - )); + ); } - List includeAndCohere(String... includes) { - return List.of(new SearchSubquery( + SearchQuery includeAndCohere(String... 
includes) { + return new SearchQuery( + Strings.join(List.of(includes), ' '), List.of(includes), List.of(), List.of(), List.of(), List.of(List.of(includes)) - )); + ); } private MockDataDocument d(int domainId, int ordinal) { return new MockDataDocument(domainId, ordinal); diff --git a/code/index/test/nu/marginalia/index/index/QueryBranchWalkerTest.java b/code/index/test/nu/marginalia/index/index/QueryBranchWalkerTest.java new file mode 100644 index 00000000..8d2f45c8 --- /dev/null +++ b/code/index/test/nu/marginalia/index/index/QueryBranchWalkerTest.java @@ -0,0 +1,59 @@ +package nu.marginalia.index.index; + +import it.unimi.dsi.fastutil.longs.LongArraySet; +import it.unimi.dsi.fastutil.longs.LongSet; +import org.junit.jupiter.api.Test; + +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.junit.jupiter.api.Assertions.*; + +class QueryBranchWalkerTest { + @Test + public void testNoOverlap() { + var paths = QueryBranchWalker.create( + new long[] { 1, 2 }, + List.of(set(1), set(2)) + ); + assertEquals(2, paths.size()); + assertEquals(Set.of(1L, 2L), paths.stream().map(path -> path.termId).collect(Collectors.toSet())); + } + + @Test + public void testCond() { + var paths = QueryBranchWalker.create( + new long[] { 1, 2, 3, 4 }, + List.of(set(1,2,3), set(1,4,3)) + ); + assertEquals(1, paths.size()); + assertEquals(Set.of(1L), paths.stream().map(path -> path.termId).collect(Collectors.toSet())); + System.out.println(Arrays.toString(paths.getFirst().priorityOrder)); + assertArrayEquals(new long[] { 2, 3, 4 }, paths.getFirst().priorityOrder); + + var next = paths.getFirst().next(); + assertEquals(2, next.size()); + assertEquals(Set.of(2L, 3L), next.stream().map(path -> path.termId).collect(Collectors.toSet())); + Map byId = next.stream().collect(Collectors.toMap(w -> w.termId, w->w)); + assertArrayEquals(new long[] { 3L }, byId.get(2L).priorityOrder ); + 
assertArrayEquals(new long[] { 4L }, byId.get(3L).priorityOrder ); + } + + @Test + public void testNoOverlapFirst() { + var paths = QueryBranchWalker.create( + new long[] { 1, 2, 3 }, + List.of(set(1, 2), set(1, 3)) + ); + assertEquals(1, paths.size()); + assertArrayEquals(new long[] { 2, 3 }, paths.getFirst().priorityOrder); + assertEquals(Set.of(1L), paths.stream().map(path -> path.termId).collect(Collectors.toSet())); + } + + LongSet set(long... args) { + return new LongArraySet(args); + } +} \ No newline at end of file diff --git a/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java b/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java index 4f5a12cd..948c5857 100644 --- a/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java +++ b/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java @@ -2,9 +2,10 @@ package nu.marginalia.index.results; import nu.marginalia.api.searchquery.model.results.SearchResultItem; import nu.marginalia.model.id.UrlIdCodec; -import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Test; +import java.util.List; + import static org.junit.jupiter.api.Assertions.*; class IndexResultDomainDeduplicatorTest { @@ -24,7 +25,7 @@ class IndexResultDomainDeduplicatorTest { } SearchResultItem forId(int domain, int ordinal) { - return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), 4); + return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), List.of(), 4, Double.NaN); } } \ No newline at end of file diff --git a/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java b/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java index 8f8f7eaa..243ae90d 100644 --- a/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java +++ b/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java @@ -1,5 +1,6 @@ package nu.marginalia.ranking.results; +import 
nu.marginalia.api.searchquery.model.compiled.CompiledQuery; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; @@ -35,21 +36,21 @@ class ResultValuatorTest { ); } - List titleOnlyLowCountSet = List.of( - new SearchResultKeywordScore(0, "bob", + CompiledQuery titleOnlyLowCountSet = CompiledQuery.just( + new SearchResultKeywordScore("bob", 1, wordMetadata(Set.of(1), EnumSet.of(WordFlags.Title)), docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)), 0) ); - List highCountNoTitleSet = List.of( - new SearchResultKeywordScore(0, "bob", + CompiledQuery highCountNoTitleSet = CompiledQuery.just( + new SearchResultKeywordScore("bob", 1, wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh)), docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)), 0) ); - List highCountSubjectSet = List.of( - new SearchResultKeywordScore(0, "bob", + CompiledQuery highCountSubjectSet = CompiledQuery.just( + new SearchResultKeywordScore("bob", 1, wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh, WordFlags.Subjects)), docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)), 0) @@ -75,7 +76,10 @@ class ResultValuatorTest { System.out.println(highCountSubject); } - private long docMetadata(int topology, int year, int quality, EnumSet flags) { + private long docMetadata(int topology, + int year, + int quality, + EnumSet flags) { return new DocumentMetadata(topology, PubDate.toYearByte(year), quality, flags).encode(); } diff --git a/code/index/test/nu/marginalia/ranking/results/factors/TermCoherenceFactorTest.java b/code/index/test/nu/marginalia/ranking/results/factors/TermCoherenceFactorTest.java index a5bca54e..028896d9 100644 --- a/code/index/test/nu/marginalia/ranking/results/factors/TermCoherenceFactorTest.java +++ 
b/code/index/test/nu/marginalia/ranking/results/factors/TermCoherenceFactorTest.java @@ -1,9 +1,10 @@ package nu.marginalia.ranking.results.factors; +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; import nu.marginalia.bbpc.BrailleBlockPunchCards; import nu.marginalia.model.idx.WordMetadata; -import nu.marginalia.ranking.results.ResultKeywordSet; import org.junit.jupiter.api.Test; import java.util.ArrayList; @@ -20,7 +21,7 @@ class TermCoherenceFactorTest { WordMetadata.POSITIONS_MASK, WordMetadata.POSITIONS_MASK ); - long mask = termCoherenceFactor.combinedMask(allPositionsSet); + long mask = CompiledQueryAggregates.longBitmaskAggregate(allPositionsSet, score -> score.positions() & WordMetadata.POSITIONS_MASK); assertEquals(1.0, termCoherenceFactor.bitsSetFactor(mask), 0.01); @@ -33,7 +34,7 @@ class TermCoherenceFactorTest { 0, 0 ); - long mask = termCoherenceFactor.combinedMask(allPositionsSet); + long mask = CompiledQueryAggregates.longBitmaskAggregate(allPositionsSet, score -> score.positions() & WordMetadata.POSITIONS_MASK); assertEquals(0, termCoherenceFactor.bitsSetFactor(mask), 0.01); @@ -46,7 +47,7 @@ class TermCoherenceFactorTest { List.of(0, 1, 2, 3), List.of(0, 1, 2, 3) ); - long mask = termCoherenceFactor.combinedMask(positions); + long mask = CompiledQueryAggregates.longBitmaskAggregate(positions, score -> score.positions() & WordMetadata.POSITIONS_MASK); printMask(mask); } @@ -57,7 +58,7 @@ class TermCoherenceFactorTest { List.of(55, 54, 53, 52), List.of(55, 54, 53, 52) ); - long mask = termCoherenceFactor.combinedMask(positions); + long mask = CompiledQueryAggregates.longBitmaskAggregate(positions, score -> score.positions() & WordMetadata.POSITIONS_MASK); printMask(mask); } @@ -72,7 +73,7 @@ class TermCoherenceFactorTest { 
System.out.println(BrailleBlockPunchCards.printBits(mask, 48)); } - ResultKeywordSet createSet(List... maskPositions) { + CompiledQuery createSet(List... maskPositions) { long[] positions = new long[maskPositions.length]; for (int i = 0; i < maskPositions.length; i++) { @@ -84,14 +85,14 @@ class TermCoherenceFactorTest { return createSet(positions); } - ResultKeywordSet createSet(long... positionMasks) { + CompiledQuery createSet(long... positionMasks) { List keywords = new ArrayList<>(); for (int i = 0; i < positionMasks.length; i++) { - keywords.add(new SearchResultKeywordScore(0, "", + keywords.add(new SearchResultKeywordScore("", 0, new WordMetadata(positionMasks[i], (byte) 0).encode(), 0, 0)); } - return new ResultKeywordSet(keywords); + return CompiledQuery.just(keywords.toArray(SearchResultKeywordScore[]::new)); } } \ No newline at end of file diff --git a/code/libraries/array/java/nu/marginalia/array/algo/LongArrayBase.java b/code/libraries/array/java/nu/marginalia/array/algo/LongArrayBase.java index 39d9bff7..ab7f18bd 100644 --- a/code/libraries/array/java/nu/marginalia/array/algo/LongArrayBase.java +++ b/code/libraries/array/java/nu/marginalia/array/algo/LongArrayBase.java @@ -1,5 +1,7 @@ package nu.marginalia.array.algo; +import nu.marginalia.array.LongArray; + import java.io.IOException; import java.nio.LongBuffer; import java.nio.channels.FileChannel; @@ -61,6 +63,12 @@ public interface LongArrayBase extends BulkTransferArray { } } + default void get(long start, long end, LongArray buffer, int bufferStart) { + for (int i = 0; i < (end-start); i++) { + buffer.set(i + bufferStart, get(start + i)); + } + } + default void get(long start, LongBuffer buffer) { get(start, start + buffer.remaining(), buffer, buffer.position()); } diff --git a/code/libraries/array/java/nu/marginalia/array/buffer/LongQueryBuffer.java b/code/libraries/array/java/nu/marginalia/array/buffer/LongQueryBuffer.java index 390325ee..d5b44389 100644 --- 
a/code/libraries/array/java/nu/marginalia/array/buffer/LongQueryBuffer.java +++ b/code/libraries/array/java/nu/marginalia/array/buffer/LongQueryBuffer.java @@ -1,5 +1,8 @@ package nu.marginalia.array.buffer; +import nu.marginalia.array.LongArray; +import nu.marginalia.array.LongArrayFactory; + import java.util.Arrays; /** A buffer for long values that can be used to filter and manipulate the data. @@ -17,7 +20,7 @@ import java.util.Arrays; public class LongQueryBuffer { /** Direct access to the data in the buffer, * guaranteed to be populated until `end` */ - public final long[] data; + public final LongArray data; /** Number of items in the data buffer */ public int end; @@ -25,18 +28,27 @@ public class LongQueryBuffer { private int read = 0; private int write = 0; + private LongQueryBuffer(LongArray array, int size) { + this.data = array; + this.end = size; + } + public LongQueryBuffer(int size) { - this.data = new long[size]; + this.data = LongArrayFactory.onHeapConfined(size); this.end = size; } public LongQueryBuffer(long[] data, int size) { - this.data = data; + this.data = LongArrayFactory.onHeapConfined(size); + this.data.set(0, data); + this.end = size; } public long[] copyData() { - return Arrays.copyOf(data, end); + long[] copy = new long[end]; + data.forEach(0, end, (pos, val) -> copy[(int)pos]=val ); + return copy; } public boolean isEmpty() { @@ -48,7 +60,7 @@ public class LongQueryBuffer { } public void reset() { - end = data.length; + end = (int) data.size(); read = 0; write = 0; } @@ -59,12 +71,16 @@ public class LongQueryBuffer { write = 0; } + public LongQueryBuffer slice(int start, int end) { + return new LongQueryBuffer(data.range(start, end), end - start); + } + /* == Filtering methods == */ /** Returns the current value at the read pointer. */ public long currentValue() { - return data[read]; + return data.get(read); } /** Advances the read pointer and returns true if there are more values to read. 
*/ @@ -79,9 +95,9 @@ public class LongQueryBuffer { */ public boolean retainAndAdvance() { if (read != write) { - long tmp = data[write]; - data[write] = data[read]; - data[read] = tmp; + long tmp = data.get(write); + data.set(write, data.get(read)); + data.set(read, tmp); } write++; @@ -117,9 +133,10 @@ public class LongQueryBuffer { write = 0; } - public void startFilterForRange(int pos, int end) { - read = write = pos; - this.end = end; + public void finalizeFiltering(int pos) { + end = write; + read = pos; + write = pos; } /** Retain only unique values in the buffer, and update the end pointer to the new length. @@ -153,7 +170,7 @@ public class LongQueryBuffer { "read = " + read + ",write = " + write + ",end = " + end + - ",data = [" + Arrays.toString(Arrays.copyOf(data, end)) + "]]"; + ",data = [" + Arrays.toString(copyData()) + "]]"; } diff --git a/code/libraries/array/test/nu/marginalia/array/algo/LongArraySearchTest.java b/code/libraries/array/test/nu/marginalia/array/algo/LongArraySearchTest.java index a515917b..fa50045e 100644 --- a/code/libraries/array/test/nu/marginalia/array/algo/LongArraySearchTest.java +++ b/code/libraries/array/test/nu/marginalia/array/algo/LongArraySearchTest.java @@ -143,7 +143,7 @@ class LongArraySearchTest { assertEquals(43, buffer.size()); for (int i = 0; i < 43; i++) { - assertEquals(buffer.data[i], i*3); + assertEquals(buffer.data.get(i), i*3); } } @@ -160,7 +160,7 @@ class LongArraySearchTest { int j = 0; for (int i = 0; i < 43; i++) { if (++j % 3 == 0) j++; - assertEquals(buffer.data[i], j); + assertEquals(buffer.data.get(i), j); } } } \ No newline at end of file diff --git a/code/libraries/btree/java/nu/marginalia/btree/BTreeReader.java b/code/libraries/btree/java/nu/marginalia/btree/BTreeReader.java index 048e0301..bc40bb43 100644 --- a/code/libraries/btree/java/nu/marginalia/btree/BTreeReader.java +++ b/code/libraries/btree/java/nu/marginalia/btree/BTreeReader.java @@ -109,8 +109,8 @@ public class BTreeReader { return 
ip.findData(key); } - public void readData(long[] buf, int n, long pos) { - data.get(pos, pos + n, buf); + public void readData(LongArray buf, int n, long pos) { + data.get(pos, pos + n, buf, 0); } /** Used for querying interlaced data in the btree. diff --git a/code/libraries/btree/test/nu/marginalia/btree/BTreeReaderRejectRetainWithIndexTest.java b/code/libraries/btree/test/nu/marginalia/btree/BTreeReaderRejectRetainWithIndexTest.java index 8b65753d..be24de10 100644 --- a/code/libraries/btree/test/nu/marginalia/btree/BTreeReaderRejectRetainWithIndexTest.java +++ b/code/libraries/btree/test/nu/marginalia/btree/BTreeReaderRejectRetainWithIndexTest.java @@ -32,7 +32,8 @@ public class BTreeReaderRejectRetainWithIndexTest { @Test public void testRetain() { LongQueryBuffer odds = new LongQueryBuffer(50); - Arrays.setAll(odds.data, i -> 2L*i + 1); + for (int i = 0; i < 50; i++) + odds.data.set(i, 2L*i + 1); BTreeReader reader = new BTreeReader(array, ctx, 0); reader.retainEntries(odds); @@ -46,7 +47,8 @@ public class BTreeReaderRejectRetainWithIndexTest { @Test public void testReject() { LongQueryBuffer odds = new LongQueryBuffer(50); - Arrays.setAll(odds.data, i -> 2L*i + 1); + for (int i = 0; i < 50; i++) + odds.data.set(i, 2L*i + 1); BTreeReader reader = new BTreeReader(array, ctx, 0); reader.rejectEntries(odds); diff --git a/code/libraries/btree/test/nu/marginalia/btree/BTreeReaderRejectRetainWithoutIndexTest.java b/code/libraries/btree/test/nu/marginalia/btree/BTreeReaderRejectRetainWithoutIndexTest.java index e5d4dc79..fc3b71df 100644 --- a/code/libraries/btree/test/nu/marginalia/btree/BTreeReaderRejectRetainWithoutIndexTest.java +++ b/code/libraries/btree/test/nu/marginalia/btree/BTreeReaderRejectRetainWithoutIndexTest.java @@ -32,7 +32,8 @@ public class BTreeReaderRejectRetainWithoutIndexTest { @Test public void testRetain() { LongQueryBuffer odds = new LongQueryBuffer(50); - Arrays.setAll(odds.data, i -> 2L*i + 1); + for (int i = 0; i < 50; i++) + 
odds.data.set(i, 2L*i + 1); BTreeReader reader = new BTreeReader(array, ctx, 0); reader.retainEntries(odds); @@ -46,7 +47,9 @@ public class BTreeReaderRejectRetainWithoutIndexTest { @Test public void testReject() { LongQueryBuffer odds = new LongQueryBuffer(50); - Arrays.setAll(odds.data, i -> 2L*i + 1); + for (int i = 0; i < 50; i++) + odds.data.set(i, 2L*i + 1); + BTreeReader reader = new BTreeReader(array, ctx, 0); reader.rejectEntries(odds); diff --git a/code/services-application/search-service/java/nu/marginalia/search/SearchQueryParamFactory.java b/code/services-application/search-service/java/nu/marginalia/search/SearchQueryParamFactory.java index 15c8567e..cc28b209 100644 --- a/code/services-application/search-service/java/nu/marginalia/search/SearchQueryParamFactory.java +++ b/code/services-application/search-service/java/nu/marginalia/search/SearchQueryParamFactory.java @@ -1,7 +1,7 @@ package nu.marginalia.search; import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.index.query.limit.QueryLimits; import nu.marginalia.index.query.limit.QueryStrategy; @@ -14,7 +14,7 @@ import java.util.List; public class SearchQueryParamFactory { public QueryParams forRegularSearch(SearchParameters userParams) { - SearchSubquery prototype = new SearchSubquery(); + SearchQuery prototype = new SearchQuery(); var profile = userParams.profile(); profile.addTacitTerms(prototype); diff --git a/code/services-application/search-service/java/nu/marginalia/search/command/SearchAdtechParameter.java b/code/services-application/search-service/java/nu/marginalia/search/command/SearchAdtechParameter.java index 9e8383f3..ce3bf099 100644 --- a/code/services-application/search-service/java/nu/marginalia/search/command/SearchAdtechParameter.java +++ 
b/code/services-application/search-service/java/nu/marginalia/search/command/SearchAdtechParameter.java @@ -1,6 +1,6 @@ package nu.marginalia.search.command; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import nu.marginalia.api.searchquery.model.query.SearchQuery; import javax.annotation.Nullable; import java.util.Arrays; @@ -23,7 +23,7 @@ public enum SearchAdtechParameter { return DEFAULT; } - public void addTacitTerms(SearchSubquery subquery) { + public void addTacitTerms(SearchQuery subquery) { subquery.searchTermsExclude.addAll(Arrays.asList(implictExcludeSearchTerms)); } } diff --git a/code/services-application/search-service/java/nu/marginalia/search/command/SearchJsParameter.java b/code/services-application/search-service/java/nu/marginalia/search/command/SearchJsParameter.java index 6c8634ac..8cf6aada 100644 --- a/code/services-application/search-service/java/nu/marginalia/search/command/SearchJsParameter.java +++ b/code/services-application/search-service/java/nu/marginalia/search/command/SearchJsParameter.java @@ -1,6 +1,6 @@ package nu.marginalia.search.command; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import nu.marginalia.api.searchquery.model.query.SearchQuery; import javax.annotation.Nullable; import java.util.Arrays; @@ -25,7 +25,7 @@ public enum SearchJsParameter { return DEFAULT; } - public void addTacitTerms(SearchSubquery subquery) { + public void addTacitTerms(SearchQuery subquery) { subquery.searchTermsExclude.addAll(Arrays.asList(implictExcludeSearchTerms)); } } diff --git a/code/services-application/search-service/java/nu/marginalia/search/model/SearchProfile.java b/code/services-application/search-service/java/nu/marginalia/search/model/SearchProfile.java index 27d9f4aa..955c3fcb 100644 --- a/code/services-application/search-service/java/nu/marginalia/search/model/SearchProfile.java +++ b/code/services-application/search-service/java/nu/marginalia/search/model/SearchProfile.java @@ -2,7 +2,7 @@ 
package nu.marginalia.search.model; import nu.marginalia.index.query.limit.SpecificationLimit; import nu.marginalia.model.crawl.HtmlFeature; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier; import java.util.Objects; @@ -47,7 +47,7 @@ public enum SearchProfile { return NO_FILTER; } - public void addTacitTerms(SearchSubquery subquery) { + public void addTacitTerms(SearchQuery subquery) { if (this == ACADEMIA) { subquery.searchTermsAdvice.add("special:academia"); }