(qs, index) New query model integrated with index service.

Seems to work; tests are green and initial testing finds no errors. Still somewhat untested — committing WIP as-is because it would suck to lose weeks of work to a drive failure or similar.
This commit is contained in:
Viktor Lofgren 2024-04-04 20:17:58 +02:00
parent 8cb9455c32
commit a3a6d6292b
66 changed files with 1613 additions and 503 deletions

View File

@ -30,6 +30,7 @@ dependencies {
implementation libs.notnull
implementation libs.guice
implementation libs.gson
implementation libs.commons.lang3
implementation libs.bundles.protobuf
implementation libs.bundles.grpc
implementation libs.fastutil

View File

@ -1,7 +1,6 @@
package nu.marginalia.api.searchquery;
import nu.marginalia.api.searchquery.*;
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.results.Bm25Parameters;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.index.query.limit.QueryLimits;
@ -45,33 +44,37 @@ public class IndexProtobufCodec {
.build();
}
public static SearchSubquery convertSearchSubquery(RpcSubquery subquery) {
public static SearchQuery convertRpcQuery(RpcQuery query) {
List<List<String>> coherences = new ArrayList<>();
for (int j = 0; j < subquery.getCoherencesCount(); j++) {
var coh = subquery.getCoherences(j);
for (int j = 0; j < query.getCoherencesCount(); j++) {
var coh = query.getCoherences(j);
coherences.add(new ArrayList<>(coh.getCoherencesList()));
}
return new SearchSubquery(
subquery.getIncludeList(),
subquery.getExcludeList(),
subquery.getAdviceList(),
subquery.getPriorityList(),
return new SearchQuery(
query.getCompiledQuery(),
query.getIncludeList(),
query.getExcludeList(),
query.getAdviceList(),
query.getPriorityList(),
coherences
);
}
public static RpcSubquery convertSearchSubquery(SearchSubquery searchSubquery) {
public static RpcQuery convertRpcQuery(SearchQuery searchQuery) {
var subqueryBuilder =
RpcSubquery.newBuilder()
.addAllAdvice(searchSubquery.getSearchTermsAdvice())
.addAllExclude(searchSubquery.getSearchTermsExclude())
.addAllInclude(searchSubquery.getSearchTermsInclude())
.addAllPriority(searchSubquery.getSearchTermsPriority());
for (var coherences : searchSubquery.searchTermCoherences) {
RpcQuery.newBuilder()
.setCompiledQuery(searchQuery.compiledQuery)
.addAllInclude(searchQuery.getSearchTermsInclude())
.addAllAdvice(searchQuery.getSearchTermsAdvice())
.addAllExclude(searchQuery.getSearchTermsExclude())
.addAllPriority(searchQuery.getSearchTermsPriority());
for (var coherences : searchQuery.searchTermCoherences) {
subqueryBuilder.addCoherencesBuilder().addAllCoherences(coherences);
}
return subqueryBuilder.build();
}

View File

@ -2,7 +2,6 @@ package nu.marginalia.api.searchquery;
import lombok.SneakyThrows;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
@ -14,7 +13,6 @@ import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.query.QueryResponse;
import java.util.ArrayList;
import java.util.List;
public class QueryProtobufCodec {
@ -23,9 +21,7 @@ public class QueryProtobufCodec {
builder.addAllDomains(request.getDomainIdsList());
for (var subquery : query.specs.subqueries) {
builder.addSubqueries(IndexProtobufCodec.convertSearchSubquery(subquery));
}
builder.setQuery(IndexProtobufCodec.convertRpcQuery(query.specs.query));
builder.setSearchSetIdentifier(query.specs.searchSetIdentifier);
builder.setHumanQuery(request.getHumanQuery());
@ -51,9 +47,7 @@ public class QueryProtobufCodec {
public static RpcIndexQuery convertQuery(String humanQuery, ProcessedQuery query) {
var builder = RpcIndexQuery.newBuilder();
for (var subquery : query.specs.subqueries) {
builder.addSubqueries(IndexProtobufCodec.convertSearchSubquery(subquery));
}
builder.setQuery(IndexProtobufCodec.convertRpcQuery(query.specs.query));
builder.setSearchSetIdentifier(query.specs.searchSetIdentifier);
builder.setHumanQuery(humanQuery);
@ -147,8 +141,8 @@ public class QueryProtobufCodec {
private static SearchResultKeywordScore convertKeywordScore(RpcResultKeywordScore keywordScores) {
return new SearchResultKeywordScore(
keywordScores.getSubquery(),
keywordScores.getKeyword(),
-1, // termId is internal to index service
keywordScores.getEncodedWordMetadata(),
keywordScores.getEncodedDocMetadata(),
keywordScores.getHtmlFeatures()
@ -156,14 +150,8 @@ public class QueryProtobufCodec {
}
private static SearchSpecification convertSearchSpecification(RpcIndexQuery specs) {
List<SearchSubquery> subqueries = new ArrayList<>(specs.getSubqueriesCount());
for (int i = 0; i < specs.getSubqueriesCount(); i++) {
subqueries.add(IndexProtobufCodec.convertSearchSubquery(specs.getSubqueries(i)));
}
return new SearchSpecification(
subqueries,
IndexProtobufCodec.convertRpcQuery(specs.getQuery()),
specs.getDomainsList(),
specs.getSearchSetIdentifier(),
specs.getHumanQuery(),
@ -182,7 +170,6 @@ public class QueryProtobufCodec {
.addAllDomainIds(params.domainIds())
.addAllTacitAdvice(params.tacitAdvice())
.addAllTacitExcludes(params.tacitExcludes())
.addAllTacitIncludes(params.tacitIncludes())
.addAllTacitPriority(params.tacitPriority())
.setHumanQuery(params.humanQuery())
.setQueryLimits(IndexProtobufCodec.convertQueryLimits(params.limits()))

View File

@ -0,0 +1,76 @@
package nu.marginalia.api.searchquery.model.compiled;
import org.jetbrains.annotations.NotNull;
import java.util.Iterator;
import java.util.function.*;
import java.util.stream.IntStream;
import java.util.stream.Stream;
/** A compiled index service query. The class separates the topology of the query
 * (the expression tree in {@link #root}) from the data held at the leaves, and it's
 * possible to create new queries supplanting the data via {@link #map(Class, Function)}
 * and {@link #mapToLong(ToLongFunction)}.
 */
public class CompiledQuery<T> implements Iterable<T> {

    /** The root expression, conveys the topology of the query */
    public final CqExpression root;

    private final CqData<T> data;

    public CompiledQuery(CqExpression root, CqData<T> data) {
        this.root = root;
        this.data = data;
    }

    public CompiledQuery(CqExpression root, T[] data) {
        this.root = root;
        this.data = new CqData<>(data);
    }

    /** Exists for testing, creates a simple query that ANDs all the provided items.
     * Safe: the varargs array is only read, never stored into with a foreign type. */
    @SafeVarargs
    public static <T> CompiledQuery<T> just(T... item) {
        return new CompiledQuery<>(new CqExpression.And(
                IntStream.range(0, item.length).mapToObj(CqExpression.Word::new).toList()
        ), item);
    }

    /** Create a new CompiledQuery mapping the leaf nodes using the provided mapper */
    public <T2> CompiledQuery<T2> map(Class<T2> clazz, Function<T, T2> mapper) {
        return new CompiledQuery<>(
                root,
                data.map(clazz, mapper)
        );
    }

    /** Create a CompiledQueryLong by mapping each leaf datum to a long value */
    public CompiledQueryLong mapToLong(ToLongFunction<T> mapper) {
        return new CompiledQueryLong(root, data.mapToLong(mapper));
    }

    /** Accessor mirror of the public {@link #root} field */
    public CqExpression root() {
        return root;
    }

    /** Stream of the leaf data in index order */
    public Stream<T> stream() {
        return data.stream();
    }

    /** Stream of the valid leaf indices, 0 (inclusive) to {@link #size()} (exclusive) */
    public IntStream indices() {
        return IntStream.range(0, data.size());
    }

    /** The leaf datum at the given index */
    public T at(int index) {
        return data.get(index);
    }

    @NotNull
    @Override
    public Iterator<T> iterator() {
        return stream().iterator();
    }

    /** Number of leaf data items */
    public int size() {
        return data.size();
    }
}

View File

@ -0,0 +1,42 @@
package nu.marginalia.api.searchquery.model.compiled;
import org.jetbrains.annotations.NotNull;
import java.util.Iterator;
import java.util.stream.IntStream;
import java.util.stream.LongStream;
/** A compiled index service query whose leaf data are primitive longs. */
public class CompiledQueryLong implements Iterable<Long> {
    private final CqExpression root;
    private final CqDataLong data;

    public CompiledQueryLong(CqExpression root, CqDataLong data) {
        this.root = root;
        this.data = data;
    }

    /** The expression tree describing the query topology. */
    public CqExpression root() {
        return this.root;
    }

    /** The leaf values, streamed in index order. */
    public LongStream stream() {
        return this.data.stream();
    }

    /** The valid leaf indices, from 0 (inclusive) up to the data size (exclusive). */
    public IntStream indices() {
        return IntStream.range(0, this.data.size());
    }

    /** The leaf value at the given index. */
    public long at(int index) {
        return this.data.get(index);
    }

    @NotNull
    @Override
    public Iterator<Long> iterator() {
        return this.data.stream().boxed().iterator();
    }
}

View File

@ -0,0 +1,113 @@
package nu.marginalia.api.searchquery.model.compiled;
import org.apache.commons.lang3.StringUtils;
import java.util.*;
/** Parser for a compiled index query.
 *
 * Grammar (tokens separated by spaces): words, "|" for alternation, and "("/")"
 * for grouping.  Adjacent words are implicitly AND:ed; "|" binds looser than
 * juxtaposition.  Produces a CompiledQuery whose leaves index into a data array
 * of the distinct words encountered.
 */
public class CompiledQueryParser {

    public static CompiledQuery<String> parse(String query) {
        List<String> parts = tokenize(query);

        if (parts.isEmpty()) {
            // Blank input: canonical empty expression with an empty data array
            return new CompiledQuery<>(
                    CqExpression.empty(),
                    new CqData<>(new String[0])
            );
        }

        // We aren't interested in a binary tree representation, but an n-ary tree one,
        // so a somewhat unusual parsing technique is used to avoid having an additional
        // flattening step at the end.

        // This is only possible due to the trivial and unambiguous grammar of the compiled queries

        // Stack of nesting levels; the last element is the innermost open group
        List<AndOrState> parenState = new ArrayList<>();
        parenState.add(new AndOrState());

        // Maps each distinct word to its leaf index; first occurrence assigns the index
        Map<String, Integer> wordIds = new HashMap<>();

        for (var part : parts) {
            var head = parenState.getLast();

            if (part.equals("|")) {
                head.or();
            }
            else if (part.equals("(")) {
                parenState.addLast(new AndOrState());
            }
            else if (part.equals(")")) {
                // ")" with no matching "(" would pop the root state
                if (parenState.size() < 2) {
                    throw new IllegalStateException("Mismatched parentheses in expression: " + query);
                }
                parenState.removeLast();
                // 'head' is the state just popped; fold the finished group into the enclosing level
                parenState.getLast().and(head.closeOr());
            }
            else {
                // plain word: AND it into the current group, reusing its id if seen before
                head.and(
                        new CqExpression.Word(
                                wordIds.computeIfAbsent(part, p -> wordIds.size())
                        )
                );
            }
        }

        // Any still-open "(" leaves extra states on the stack
        if (parenState.size() != 1)
            throw new IllegalStateException("Mismatched parentheses in expression: " + query);

        // Construct the CompiledQuery object with String:s as leaves
        var root = parenState.getLast().closeOr();

        String[] cqData = new String[wordIds.size()];
        wordIds.forEach((w, i) -> cqData[i] = w);

        return new CompiledQuery<>(root, new CqData<>(cqData));
    }

    /** Accumulator for one nesting level: collects an and-list of terms, which
     * "|" flushes into an or-list of alternatives. */
    private static class AndOrState {
        private List<CqExpression> andState = new ArrayList<>();
        private List<CqExpression> orState = new ArrayList<>();

        /** Add a new item to the and-list */
        public void and(CqExpression e) {
            andState.add(e);
        }

        /** Turn the and-list into an expression on the or-list, and then start a new and-list */
        public void or() {
            closeAnd();
            andState = new ArrayList<>();
        }

        /** Turn the and-list into an And-expression in the or-list */
        private void closeAnd() {
            // single-element and-lists are unwrapped; empty ones contribute nothing
            if (andState.size() == 1)
                orState.add(andState.getFirst());
            else if (!andState.isEmpty())
                orState.add(new CqExpression.And(andState));
        }

        /** Finalize the current and-list, then turn the or-list into an Or-expression */
        public CqExpression closeOr() {
            closeAnd();

            // same unwrapping discipline as closeAnd(): avoid trivial Or nodes
            if (orState.isEmpty())
                return CqExpression.empty();
            if (orState.size() == 1)
                return orState.getFirst();

            return new CqExpression.Or(orState);
        }
    }

    private static List<String> tokenize(String query) {
        // Each token is guaranteed to be separated by one or more space characters
        return Arrays.stream(StringUtils.split(query, ' '))
                .filter(StringUtils::isNotBlank)
                .toList();
    }
}

View File

@ -0,0 +1,51 @@
package nu.marginalia.api.searchquery.model.compiled;
import java.lang.reflect.Array;
import java.util.Arrays;
import java.util.function.Function;
import java.util.function.ToDoubleFunction;
import java.util.function.ToLongFunction;
import java.util.stream.Stream;
/** Holds the generic leaf data of a compiled query, in leaf-index order. */
public class CqData<T> {
    private final T[] data;

    public CqData(T[] data) {
        this.data = data;
    }

    /** Create a new CqData by applying the mapper to every element in order.
     *
     * @param clazz runtime class of the target element type, needed to allocate the array
     * @param mapper applied to each element
     */
    @SuppressWarnings("unchecked") // Array.newInstance(clazz, ...) yields a T2[] by construction
    public <T2> CqData<T2> map(Class<T2> clazz, Function<T, T2> mapper) {
        T2[] newData = (T2[]) Array.newInstance(clazz, data.length);
        for (int i = 0; i < data.length; i++) {
            // data is already T[]; no cast needed on the element
            newData[i] = mapper.apply(data[i]);
        }
        return new CqData<>(newData);
    }

    /** Create a CqDataLong by applying the mapper to every element in order. */
    public CqDataLong mapToLong(ToLongFunction<T> mapper) {
        long[] newData = new long[data.length];
        for (int i = 0; i < data.length; i++) {
            newData[i] = mapper.applyAsLong(data[i]);
        }
        return new CqDataLong(newData);
    }

    /** Element at position {@code i}. */
    public T get(int i) {
        return data[i];
    }

    /** Element backing the given leaf word. */
    public T get(CqExpression.Word w) {
        return data[w.idx()];
    }

    /** All elements in index order. */
    public Stream<T> stream() {
        return Arrays.stream(data);
    }

    /** Number of elements held. */
    public int size() {
        return data.length;
    }
}

View File

@ -0,0 +1,27 @@
package nu.marginalia.api.searchquery.model.compiled;
import java.util.Arrays;
import java.util.stream.LongStream;
public class CqDataLong {
private final long[] data;
public CqDataLong(long[] data) {
this.data = data;
}
public long get(int i) {
return data[i];
}
public long get(CqExpression.Word w) {
return data[w.idx()];
}
public LongStream stream() {
return Arrays.stream(data);
}
public int size() {
return data.length;
}
}

View File

@ -0,0 +1,170 @@
package nu.marginalia.api.searchquery.model.compiled;
import java.util.List;
import java.util.StringJoiner;
import java.util.stream.Stream;
/** Expression in a parsed index service query
 *
 * A sealed tree: {@link And} and {@link Or} hold sub-expressions, {@link Word} is a
 * leaf referring to an index into the query's data array.  The visit() overloads form
 * a visitor pattern (one per primitive result type, plus a generic object variant)
 * used to fold the tree into a single value without instanceof chains.
 */
public sealed interface CqExpression {

    /** All leaf words in the expression, in depth-first order */
    Stream<Word> stream();

    /** @see nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates */
    long visit(LongVisitor visitor);
    /** @see nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates */
    double visit(DoubleVisitor visitor);
    /** @see nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates */
    int visit(IntVisitor visitor);
    /** @see nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates */
    boolean visit(BoolVisitor visitor);

    <T> T visit(ObjectVisitor<T> visitor);

    /** The canonical empty expression: an Or over no branches */
    static CqExpression empty() {
        return new Or(List.of());
    }

    /** Conjunction node: all parts must hold */
    record And(List<? extends CqExpression> parts) implements CqExpression {
        @Override
        public Stream<Word> stream() {
            return parts.stream().flatMap(CqExpression::stream);
        }

        @Override
        public long visit(LongVisitor visitor) {
            return visitor.onAnd(parts);
        }

        @Override
        public double visit(DoubleVisitor visitor) {
            return visitor.onAnd(parts);
        }

        @Override
        public int visit(IntVisitor visitor) {
            return visitor.onAnd(parts);
        }

        @Override
        public boolean visit(BoolVisitor visitor) {
            return visitor.onAnd(parts);
        }

        @Override
        public <T> T visit(ObjectVisitor<T> visitor) { return visitor.onAnd(parts); }

        public String toString() {
            StringJoiner sj = new StringJoiner(", ", "And[ ", "]");
            parts.forEach(part -> sj.add(part.toString()));
            return sj.toString();
        }
    }

    /** Disjunction node: any one part sufficing */
    record Or(List<? extends CqExpression> parts) implements CqExpression {
        @Override
        public Stream<Word> stream() {
            return parts.stream().flatMap(CqExpression::stream);
        }

        @Override
        public long visit(LongVisitor visitor) {
            return visitor.onOr(parts);
        }

        @Override
        public double visit(DoubleVisitor visitor) {
            return visitor.onOr(parts);
        }

        @Override
        public int visit(IntVisitor visitor) {
            return visitor.onOr(parts);
        }

        @Override
        public boolean visit(BoolVisitor visitor) {
            return visitor.onOr(parts);
        }

        @Override
        public <T> T visit(ObjectVisitor<T> visitor) { return visitor.onOr(parts); }

        public String toString() {
            StringJoiner sj = new StringJoiner(", ", "Or[ ", "]");
            parts.forEach(part -> sj.add(part.toString()));
            return sj.toString();
        }
    }

    /** Leaf node: idx is a position in the query's data array */
    record Word(int idx) implements CqExpression {
        @Override
        public Stream<Word> stream() {
            return Stream.of(this);
        }

        @Override
        public long visit(LongVisitor visitor) {
            return visitor.onLeaf(idx);
        }

        @Override
        public double visit(DoubleVisitor visitor) {
            return visitor.onLeaf(idx);
        }

        @Override
        public int visit(IntVisitor visitor) {
            return visitor.onLeaf(idx);
        }

        @Override
        public boolean visit(BoolVisitor visitor) {
            return visitor.onLeaf(idx);
        }

        @Override
        public <T> T visit(ObjectVisitor<T> visitor) { return visitor.onLeaf(idx); }

        @Override
        public String toString() {
            return Integer.toString(idx);
        }
    }

    /** Visitor producing a long result */
    interface LongVisitor {
        long onAnd(List<? extends CqExpression> parts);
        long onOr(List<? extends CqExpression> parts);
        long onLeaf(int idx);
    }

    /** Visitor producing an int result */
    interface IntVisitor {
        int onAnd(List<? extends CqExpression> parts);
        int onOr(List<? extends CqExpression> parts);
        int onLeaf(int idx);
    }

    /** Visitor producing a boolean result */
    interface BoolVisitor {
        boolean onAnd(List<? extends CqExpression> parts);
        boolean onOr(List<? extends CqExpression> parts);
        boolean onLeaf(int idx);
    }

    /** Visitor producing a double result */
    interface DoubleVisitor {
        double onAnd(List<? extends CqExpression> parts);
        double onOr(List<? extends CqExpression> parts);
        double onLeaf(int idx);
    }

    /** Visitor producing an arbitrary object result */
    interface ObjectVisitor<T> {
        T onAnd(List<? extends CqExpression> parts);
        T onOr(List<? extends CqExpression> parts);
        T onLeaf(int idx);
    }
}

View File

@ -0,0 +1,46 @@
package nu.marginalia.api.searchquery.model.compiled.aggregate;
import it.unimi.dsi.fastutil.longs.LongSet;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import java.util.ArrayList;
import java.util.List;
import java.util.function.*;
/** Static entry points for folding a compiled query tree into a single aggregate
 * value.  Each aggregate gives and-branches and or-branches its own semantics;
 * see the individual methods.
 */
public class CompiledQueryAggregates {
    /** Compiled query aggregate for a single boolean that treats or-branches as logical OR,
     * and and-branches as logical AND operations. Will return true if there exists a path through
     * the query where the provided predicate returns true for each item.
     */
    public static <T> boolean booleanAggregate(CompiledQuery<T> query, Predicate<T> predicate) {
        return query.root.visit(new CqBooleanAggregate(query, predicate));
    }

    /** Compiled query aggregate for a 64b bitmask that treats or-branches as bitwise OR,
     * and and-branches as bitwise AND operations.
     */
    public static <T> long longBitmaskAggregate(CompiledQuery<T> query, ToLongFunction<T> operator) {
        return query.root.visit(new CqLongBitmaskOperator(query, operator));
    }

    /** Apply the operator to each leaf node, then return the highest minimum value found along any path */
    public static <T> int intMaxMinAggregate(CompiledQuery<T> query, ToIntFunction<T> operator) {
        return query.root.visit(new CqIntMaxMinOperator(query, operator));
    }

    /** Apply the operator to each leaf node, and then return the highest sum of values possible
     * through each branch in the compiled query.
     */
    public static <T> double doubleSumAggregate(CompiledQuery<T> query, ToDoubleFunction<T> operator) {
        return query.root.visit(new CqDoubleSumOperator(query, operator));
    }

    /** Enumerate all possible paths through the compiled query */
    public static List<LongSet> queriesAggregate(CompiledQueryLong query) {
        return new ArrayList<>(query.root().visit(new CqQueryPathsOperator(query)));
    }
}

View File

@ -0,0 +1,40 @@
package nu.marginalia.api.searchquery.model.compiled.aggregate;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
import java.util.List;
import java.util.function.IntPredicate;
import java.util.function.Predicate;
/** Visitor evaluating a compiled query as a boolean expression: an and-node is
 * satisfied when every child is, an or-node when any child is, and a leaf is
 * decided by the supplied predicate applied to the query's datum at that index.
 */
public class CqBooleanAggregate implements CqExpression.BoolVisitor {
    private final IntPredicate predicate;

    public <T> CqBooleanAggregate(CompiledQuery<T> query, Predicate<T> objPred) {
        // adapt the object predicate to leaf indices via the query's data
        this.predicate = idx -> objPred.test(query.at(idx));
    }

    @Override
    public boolean onAnd(List<? extends CqExpression> parts) {
        // conjunction; allMatch short-circuits on the first false child
        return parts.stream().allMatch(part -> part.visit(this));
    }

    @Override
    public boolean onOr(List<? extends CqExpression> parts) {
        // disjunction; anyMatch short-circuits on the first true child
        return parts.stream().anyMatch(part -> part.visit(this));
    }

    @Override
    public boolean onLeaf(int idx) {
        return predicate.test(idx);
    }
}

View File

@ -0,0 +1,40 @@
package nu.marginalia.api.searchquery.model.compiled.aggregate;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
import java.util.List;
import java.util.function.IntToDoubleFunction;
import java.util.function.ToDoubleFunction;
/** Visitor computing the highest sum of leaf values attainable along any path
 * through the query: and-nodes add their children, or-nodes take the best child,
 * leaves are valued by the supplied operator over the query's data.
 */
public class CqDoubleSumOperator implements CqExpression.DoubleVisitor {
    private final IntToDoubleFunction operator;

    public <T> CqDoubleSumOperator(CompiledQuery<T> query, ToDoubleFunction<T> operator) {
        // adapt the object-valued operator to leaf indices via the query's data
        this.operator = idx -> operator.applyAsDouble(query.at(idx));
    }

    @Override
    public double onAnd(List<? extends CqExpression> parts) {
        // every child along an and-branch contributes; accumulate the sum
        double sum = 0;
        for (var part : parts) {
            sum += part.visit(this);
        }
        return sum;
    }

    @Override
    public double onOr(List<? extends CqExpression> parts) {
        // or-branches compete; keep the maximum
        var iter = parts.iterator();
        double best = iter.next().visit(this); // NoSuchElementException on empty, as before
        while (iter.hasNext()) {
            best = Math.max(best, iter.next().visit(this));
        }
        return best;
    }

    @Override
    public double onLeaf(int idx) {
        return operator.applyAsDouble(idx);
    }
}

View File

@ -0,0 +1,41 @@
package nu.marginalia.api.searchquery.model.compiled.aggregate;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
import java.util.List;
import java.util.function.IntUnaryOperator;
import java.util.function.ToIntFunction;
/** Visitor computing the highest minimum value found along any path through the
 * query: and-nodes take the minimum of their children (a path is only as strong
 * as its weakest member), or-nodes the maximum (the best alternative wins), and
 * leaves are valued by the supplied operator over the query's data.
 */
public class CqIntMaxMinOperator implements CqExpression.IntVisitor {
    private final IntUnaryOperator operator;

    public <T> CqIntMaxMinOperator(CompiledQuery<T> query, ToIntFunction<T> operator) {
        // adapt the object-valued operator to leaf indices via the query's data
        this.operator = idx -> operator.applyAsInt(query.at(idx));
    }

    @Override
    public int onAnd(List<? extends CqExpression> parts) {
        var iter = parts.iterator();
        int result = iter.next().visit(this); // NoSuchElementException on empty, as before
        while (iter.hasNext()) {
            result = Math.min(result, iter.next().visit(this));
        }
        return result;
    }

    @Override
    public int onOr(List<? extends CqExpression> parts) {
        var iter = parts.iterator();
        int result = iter.next().visit(this);
        while (iter.hasNext()) {
            result = Math.max(result, iter.next().visit(this));
        }
        return result;
    }

    @Override
    public int onLeaf(int idx) {
        return operator.applyAsInt(idx);
    }
}

View File

@ -0,0 +1,40 @@
package nu.marginalia.api.searchquery.model.compiled.aggregate;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
import java.util.List;
import java.util.function.IntToLongFunction;
import java.util.function.ToLongFunction;
/** Visitor folding a compiled query into a 64-bit mask: and-nodes intersect their
 * children's bits, or-nodes union them, and leaves are valued by the supplied
 * operator over the query's data.
 */
public class CqLongBitmaskOperator implements CqExpression.LongVisitor {
    private final IntToLongFunction operator;

    public <T> CqLongBitmaskOperator(CompiledQuery<T> query, ToLongFunction<T> operator) {
        // adapt the object-valued operator to leaf indices via the query's data
        this.operator = idx -> operator.applyAsLong(query.at(idx));
    }

    @Override
    public long onAnd(List<? extends CqExpression> parts) {
        // bitwise intersection; ~0L (all bits set) is the identity for AND
        return parts.stream()
                .mapToLong(part -> part.visit(this))
                .reduce(~0L, (a, b) -> a & b);
    }

    @Override
    public long onOr(List<? extends CqExpression> parts) {
        // bitwise union; 0L is the identity for OR
        return parts.stream()
                .mapToLong(part -> part.visit(this))
                .reduce(0L, (a, b) -> a | b);
    }

    @Override
    public long onLeaf(int idx) {
        return operator.applyAsLong(idx);
    }
}

View File

@ -0,0 +1,75 @@
package nu.marginalia.api.searchquery.model.compiled.aggregate;
import it.unimi.dsi.fastutil.longs.LongArraySet;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import it.unimi.dsi.fastutil.longs.LongSet;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
import java.util.ArrayList;
import java.util.List;
/** Visitor enumerating all possible paths through a compiled query.
 *
 * Each path is a LongSet of the leaf values that must all be present for that
 * path to match; or-nodes multiply the number of paths, and-nodes combine the
 * path-sets pairwise.
 *
 * NOTE(review): combineAnd mutates the sets of one argument in the single-path
 * cases (via set.addAll) and may return that argument's list directly.  This is
 * safe as used here, where each list is freshly built per visit, but callers
 * must not treat the intermediate results as immutable.
 */
public class CqQueryPathsOperator implements CqExpression.ObjectVisitor<List<LongSet>> {
    private final CompiledQueryLong query;

    public CqQueryPathsOperator(CompiledQueryLong query) {
        this.query = query;
    }

    @Override
    public List<LongSet> onAnd(List<? extends CqExpression> parts) {
        // fold the children's path lists together pairwise; List.of() is the identity
        return parts.stream()
                .map(expr -> expr.visit(this))
                .reduce(List.of(), this::combineAnd);
    }

    private List<LongSet> combineAnd(List<LongSet> a, List<LongSet> b) {
        // No-op cases
        if (a.isEmpty())
            return b;
        if (b.isEmpty())
            return a;

        // Simple cases: merge the single path on one side into every path on the other
        if (a.size() == 1) {
            b.forEach(set -> set.addAll(a.getFirst()));
            return b;
        }
        else if (b.size() == 1) {
            a.forEach(set -> set.addAll(b.getFirst()));
            return a;
        }

        // Case where we AND two ORs: cartesian product of the two path lists
        List<LongSet> ret = new ArrayList<>();

        for (var aPart : a) {
            for (var bPart : b) {
                LongSet set = new LongOpenHashSet(aPart.size() + bPart.size());
                set.addAll(aPart);
                set.addAll(bPart);
                ret.add(set);
            }
        }

        return ret;
    }

    @Override
    public List<LongSet> onOr(List<? extends CqExpression> parts) {
        // each child contributes its paths independently
        List<LongSet> ret = new ArrayList<>();
        for (var part : parts) {
            ret.addAll(part.visit(this));
        }
        return ret;
    }

    @Override
    public List<LongSet> onLeaf(int idx) {
        // a single path containing just this leaf's value
        var set = new LongArraySet(1);
        set.add(query.at(idx));
        return List.of(set);
    }
}

View File

@ -13,10 +13,6 @@ public record QueryResponse(SearchSpecification specs,
String domain)
{
public Set<String> getAllKeywords() {
Set<String> keywords = new HashSet<>(100);
for (var sq : specs.subqueries) {
keywords.addAll(sq.searchTermsInclude);
}
return keywords;
return new HashSet<>(specs.query.searchTermsInclude);
}
}

View File

@ -13,9 +13,12 @@ import java.util.stream.Collectors;
@AllArgsConstructor
@With
@EqualsAndHashCode
public class SearchSubquery {
public class SearchQuery {
/** These terms must be present in the document and are used in ranking*/
/** An infix style expression that encodes the required terms in the query */
public final String compiledQuery;
/** All terms that appear in {@see compiledQuery} */
public final List<String> searchTermsInclude;
/** These terms must be absent from the document */
@ -33,7 +36,8 @@ public class SearchSubquery {
@Deprecated // why does this exist?
private double value = 0;
public SearchSubquery() {
public SearchQuery() {
this.compiledQuery = "";
this.searchTermsInclude = new ArrayList<>();
this.searchTermsExclude = new ArrayList<>();
this.searchTermsAdvice = new ArrayList<>();
@ -41,11 +45,13 @@ public class SearchSubquery {
this.searchTermCoherences = new ArrayList<>();
}
public SearchSubquery(List<String> searchTermsInclude,
List<String> searchTermsExclude,
List<String> searchTermsAdvice,
List<String> searchTermsPriority,
List<List<String>> searchTermCoherences) {
public SearchQuery(String compiledQuery,
List<String> searchTermsInclude,
List<String> searchTermsExclude,
List<String> searchTermsAdvice,
List<String> searchTermsPriority,
List<List<String>> searchTermCoherences) {
this.compiledQuery = compiledQuery;
this.searchTermsInclude = searchTermsInclude;
this.searchTermsExclude = searchTermsExclude;
this.searchTermsAdvice = searchTermsAdvice;
@ -54,7 +60,7 @@ public class SearchSubquery {
}
@Deprecated // why does this exist?
public SearchSubquery setValue(double value) {
public SearchQuery setValue(double value) {
if (Double.isInfinite(value) || Double.isNaN(value)) {
this.value = Double.MAX_VALUE;
} else {
@ -66,7 +72,7 @@ public class SearchSubquery {
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
if (!searchTermsInclude.isEmpty()) sb.append("include=").append(searchTermsInclude.stream().collect(Collectors.joining(",", "[", "] ")));
if (!compiledQuery.isEmpty()) sb.append("compiledQuery=").append(compiledQuery);
if (!searchTermsExclude.isEmpty()) sb.append("exclude=").append(searchTermsExclude.stream().collect(Collectors.joining(",", "[", "] ")));
if (!searchTermsAdvice.isEmpty()) sb.append("advice=").append(searchTermsAdvice.stream().collect(Collectors.joining(",", "[", "] ")));
if (!searchTermsPriority.isEmpty()) sb.append("priority=").append(searchTermsPriority.stream().collect(Collectors.joining(",", "[", "] ")));

View File

@ -10,7 +10,7 @@ import java.util.List;
@ToString @Getter @Builder @With @AllArgsConstructor
public class SearchSpecification {
public List<SearchSubquery> subqueries;
public SearchQuery query;
/** If present and not empty, limit the search to these domain IDs */
public List<Integer> domains;

View File

@ -21,9 +21,9 @@ public class SearchResultItem implements Comparable<SearchResultItem> {
/** How many other potential results existed in the same domain */
public int resultsFromDomain;
public SearchResultItem(long combinedId, int scoresCount) {
public SearchResultItem(long combinedId) {
this.combinedId = combinedId;
this.keywordScores = new ArrayList<>(scoresCount);
this.keywordScores = new ArrayList<>();
}

View File

@ -7,19 +7,22 @@ import nu.marginalia.model.idx.DocumentMetadata;
import java.util.Objects;
public final class SearchResultKeywordScore {
@Deprecated
public final int subquery;
public final long termId;
public final String keyword;
private final long encodedWordMetadata;
private final long encodedDocMetadata;
private final int htmlFeatures;
public SearchResultKeywordScore(int subquery,
String keyword,
public SearchResultKeywordScore(String keyword,
long termId,
long encodedWordMetadata,
long encodedDocMetadata,
int htmlFeatures) {
this.subquery = subquery;
this.termId = termId;
this.subquery = -1; // FIXME, deprecated
this.keyword = keyword;
this.encodedWordMetadata = encodedWordMetadata;
this.encodedDocMetadata = encodedDocMetadata;

View File

@ -52,7 +52,7 @@ message RpcTemporalBias {
/* Index service query request */
message RpcIndexQuery {
repeated RpcSubquery subqueries = 1;
RpcQuery query = 1;
repeated int32 domains = 2; // (optional) A list of domain IDs to consider
string searchSetIdentifier = 3; // (optional) A named set of domains to consider
string humanQuery = 4; // The search query as the user entered it
@ -102,12 +102,11 @@ message RpcRawResultItem {
/* Information about how well a keyword matches a query */
message RpcResultKeywordScore {
int32 subquery = 1; // index of the subquery this keyword relates to
string keyword = 2; // the keyword
int64 encodedWordMetadata = 3; // bit encoded word metadata
int64 encodedDocMetadata = 4; // bit encoded document metadata
bool hasPriorityTerms = 5; // true if this word is important to the document
int32 htmlFeatures = 6; // bit encoded document features
string keyword = 1; // the keyword
int64 encodedWordMetadata = 2; // bit encoded word metadata
int64 encodedDocMetadata = 3; // bit encoded document metadata
bool hasPriorityTerms = 4; // true if this word is important to the document
int32 htmlFeatures = 5; // bit encoded document features
}
/* Query execution parameters */
@ -137,12 +136,13 @@ message RpcResultRankingParameters {
}
/* Defines a single subquery */
message RpcSubquery {
message RpcQuery {
repeated string include = 1; // These terms must be present
repeated string exclude = 2; // These terms must be absent
repeated string advice = 3; // These terms must be present, but do not affect ranking
repeated string priority = 4; // These terms are not mandatory, but affect ranking positively if they are present
repeated RpcCoherences coherences = 5; // Groups of terms that must exist in proximity of each other
string compiledQuery = 6; // Compiled query in infix notation
}
/* Defines a group of search terms that must exist in close proximity within the document */

View File

@ -0,0 +1,79 @@
package nu.marginalia.api.searchquery.model.compiled;
import org.junit.jupiter.api.Test;
import java.util.List;
import static org.junit.jupiter.api.Assertions.*;
class CompiledQueryParserTest {

    @Test
    public void testEmpty() {
        // blank input and all-empty groupings collapse to the canonical empty expression
        assertEquals(CqExpression.empty(), CompiledQueryParser.parse("").root);
        assertEquals(CqExpression.empty(), CompiledQueryParser.parse("( )").root);
        assertEquals(CqExpression.empty(), CompiledQueryParser.parse("( | )").root);
        assertEquals(CqExpression.empty(), CompiledQueryParser.parse("| ( | ) |").root);
    }

    @Test
    public void testSingleWord() {
        // a lone word becomes a bare Word leaf, not wrapped in And/Or
        CompiledQuery<String> q = CompiledQueryParser.parse("foo");
        assertEquals(w(q, "foo"), q.root);
    }

    @Test
    public void testAndTwoWords() {
        // juxtaposition is implicit AND
        CompiledQuery<String> q = CompiledQueryParser.parse("foo bar");
        assertEquals(and(w(q, "foo"), w(q,"bar")), q.root);
    }

    @Test
    public void testOrTwoWords() {
        CompiledQuery<String> q = CompiledQueryParser.parse("foo | bar");
        assertEquals(or(w(q, "foo"), w(q,"bar")), q.root);
    }

    @Test
    public void testOrAndWords() {
        // "|" binds looser than juxtaposition
        CompiledQuery<String> q = CompiledQueryParser.parse("foo | bar baz");
        assertEquals(or(w(q,"foo"), and(w(q,"bar"), w(q,"baz"))), q.root);
    }

    @Test
    public void testAndAndOrAndAndWords() {
        CompiledQuery<String> q = CompiledQueryParser.parse("foo foobar | bar baz");
        assertEquals(or(
                and(w(q, "foo"), w(q, "foobar")),
                and(w(q, "bar"), w(q, "baz")))
                , q.root);
    }

    @Test
    public void testComplex1() {
        // parenthesized Or participates as one operand of the surrounding And
        CompiledQuery<String> q = CompiledQueryParser.parse("foo ( bar | baz ) quux");
        assertEquals(and(w(q,"foo"), or(w(q, "bar"), w(q, "baz")), w(q, "quux")), q.root);
    }

    @Test
    public void testComplex2() {
        // nesting is preserved as left-leaning And nodes, not flattened
        CompiledQuery<String> q = CompiledQueryParser.parse("( ( ( a ) b ) c ) d");
        assertEquals(and(and(and(w(q, "a"), w(q, "b")), w(q, "c")), w(q, "d")), q.root);
    }

    @Test
    public void testNested() {
        // redundant parentheses around a single word unwrap completely
        CompiledQuery<String> q = CompiledQueryParser.parse("( ( ( a ) ) )");
        assertEquals(w(q,"a"), q.root);
    }

    /** Build the Word leaf for the given string by looking up its index in the parsed query's data */
    private CqExpression.Word w(CompiledQuery<String> query, String word) {
        return new CqExpression.Word(query.indices().filter(idx -> word.equals(query.at(idx))).findAny().orElseThrow());
    }

    /** Shorthand for an And node over the given parts */
    private CqExpression and(CqExpression... parts) {
        return new CqExpression.And(List.of(parts));
    }

    /** Shorthand for an Or node over the given parts */
    private CqExpression or(CqExpression... parts) {
        return new CqExpression.Or(List.of(parts));
    }
}

View File

@ -0,0 +1,35 @@
package nu.marginalia.api.searchquery.model.compiled.aggregate;
import static nu.marginalia.api.searchquery.model.compiled.CompiledQueryParser.parse;
import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.*;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.*;
/** Tests for the aggregate operations over compiled queries: AND-semantics
 * within a group, OR-semantics between '|'-separated groups. */
class CompiledQueryAggregatesTest {
@Test
void booleanAggregates() {
assertFalse(evalBool("false"));
assertTrue(evalBool("true"));
assertFalse(evalBool("false true"));
assertTrue(evalBool("( true ) | ( true false )"));
assertTrue(evalBool("( false ) | ( true )"));
assertTrue(evalBool("( true false ) | ( true true )"));
assertFalse(evalBool("( true false ) | ( true false )"));
}
@Test
void intMaxMinAggregates() {
assertEquals(5, evalMaxMin("5"));
assertEquals(3, evalMaxMin("5 3"));
assertEquals(6, evalMaxMin("5 3 | 6 7"));
}
@Test
void doubleSumAggregates() {
assertEquals(5, (int) evalSum("5"));
assertEquals(8, (int) evalSum("5 3"));
assertEquals(13, (int) evalSum("1 ( 5 3 | 2 10 )"));
}
/** Parses the query and applies the boolean aggregate with each word read as a boolean. */
private boolean evalBool(String query) {
return booleanAggregate(parse(query), Boolean::parseBoolean);
}
/** Parses the query and applies the max/min aggregate with each word read as an int. */
private int evalMaxMin(String query) {
return intMaxMinAggregate(parse(query), Integer::parseInt);
}
/** Parses the query and applies the summing aggregate with each word read as a double. */
private double evalSum(String query) {
return doubleSumAggregate(parse(query), Double::parseDouble);
}
}

View File

@ -1,7 +1,7 @@
package nu.marginalia.index.client;
import nu.marginalia.api.searchquery.IndexProtobufCodec;
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.index.query.limit.QueryLimits;
import nu.marginalia.index.query.limit.SpecificationLimit;
@ -35,14 +35,15 @@ class IndexProtobufCodecTest {
}
@Test
public void testSubqery() {
verifyIsIdentityTransformation(new SearchSubquery(
verifyIsIdentityTransformation(new SearchQuery(
"qs",
List.of("a", "b"),
List.of("c", "d"),
List.of("e", "f"),
List.of("g", "h"),
List.of(List.of("i", "j"), List.of("k"))
),
s -> IndexProtobufCodec.convertSearchSubquery(IndexProtobufCodec.convertSearchSubquery(s))
s -> IndexProtobufCodec.convertRpcQuery(IndexProtobufCodec.convertRpcQuery(s))
);
}
private <T> void verifyIsIdentityTransformation(T val, Function<T,T> transformation) {

View File

@ -2,18 +2,16 @@ package nu.marginalia.functions.searchquery.svc;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.LanguageModels;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.util.language.EnglishDictionary;
import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.query.ProcessedQuery;
import nu.marginalia.functions.searchquery.query_parser.QueryParser;
import nu.marginalia.functions.searchquery.query_parser.token.Token;
import nu.marginalia.functions.searchquery.query_parser.token.TokenType;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -26,15 +24,14 @@ import java.util.List;
public class QueryFactory {
private final Logger logger = LoggerFactory.getLogger(getClass());
private static final int RETAIN_QUERY_VARIANT_COUNT = 5;
private final QueryParser queryParser = new QueryParser();
private final QueryExpansion queryExpansion;
@Inject
public QueryFactory(LanguageModels lm,
TermFrequencyDict dict,
EnglishDictionary englishDictionary)
public QueryFactory(QueryExpansion queryExpansion)
{
this.queryExpansion = queryExpansion;
}
@ -49,8 +46,6 @@ public class QueryFactory {
List<String> searchTermsHuman = new ArrayList<>();
List<String> problems = new ArrayList<>();
String domain = null;
List<Token> basicQuery = queryParser.parse(query);
if (basicQuery.size() >= 12) {
@ -74,19 +69,8 @@ public class QueryFactory {
t.visit(qualityLimits);
}
// var queryPermutations = queryPermutation.permuteQueriesNew(basicQuery);
List<SearchSubquery> subqueries = new ArrayList<>();
QuerySearchTermsAccumulator termsAccumulator = new QuerySearchTermsAccumulator(basicQuery);
domain = termsAccumulator.domain;
// for (var parts : queryPermutations) {
// QuerySearchTermsAccumulator termsAccumulator = new QuerySearchTermsAccumulator(basicQuery);
//
// domain = termsAccumulator.domain;
//
// SearchSubquery subquery = termsAccumulator.createSubquery();
// subqueries.add(subquery);
// }
String domain = termsAccumulator.domain;
List<Integer> domainIds = params.domainIds();
@ -97,7 +81,18 @@ public class QueryFactory {
}
var specsBuilder = SearchSpecification.builder()
.subqueries(subqueries)
.query(
new SearchQuery(
queryExpansion.expandQuery(
termsAccumulator.searchTermsInclude
),
termsAccumulator.searchTermsInclude,
termsAccumulator.searchTermsExclude,
termsAccumulator.searchTermsAdvice,
termsAccumulator.searchTermsPriority,
termsAccumulator.searchTermCoherences
)
)
.humanQuery(query)
.quality(qualityLimits.qualityLimit)
.year(qualityLimits.year)
@ -111,12 +106,9 @@ public class QueryFactory {
SearchSpecification specs = specsBuilder.build();
for (var sq : specs.subqueries) {
sq.searchTermsAdvice.addAll(params.tacitAdvice());
sq.searchTermsPriority.addAll(params.tacitPriority());
sq.searchTermsInclude.addAll(params.tacitIncludes());
sq.searchTermsExclude.addAll(params.tacitExcludes());
}
specs.query.searchTermsAdvice.addAll(params.tacitAdvice());
specs.query.searchTermsPriority.addAll(params.tacitPriority());
specs.query.searchTermsExclude.addAll(params.tacitExcludes());
return new ProcessedQuery(specs, searchTermsHuman, domain);
}

View File

@ -1,6 +1,6 @@
package nu.marginalia.functions.searchquery.svc;
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.functions.searchquery.query_parser.token.Token;
import nu.marginalia.functions.searchquery.query_parser.token.TokenVisitor;
@ -9,7 +9,7 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/** @see SearchSubquery */
/** @see SearchQuery */
public class QuerySearchTermsAccumulator implements TokenVisitor {
public List<String> searchTermsExclude = new ArrayList<>();
public List<String> searchTermsInclude = new ArrayList<>();
@ -19,10 +19,6 @@ public class QuerySearchTermsAccumulator implements TokenVisitor {
public String domain;
public SearchSubquery createSubquery() {
return new SearchSubquery(searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority, searchTermCoherences);
}
public QuerySearchTermsAccumulator(List<Token> parts) {
for (Token t : parts) {
t.visit(this);

View File

@ -3,12 +3,13 @@ package nu.marginalia.query.svc;
import nu.marginalia.WmsaHome;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
import nu.marginalia.functions.searchquery.svc.QueryFactory;
import nu.marginalia.index.query.limit.QueryLimits;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.index.query.limit.SpecificationLimitType;
import nu.marginalia.util.language.EnglishDictionary;
import nu.marginalia.segmentation.NgramLexicon;
import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.junit.jupiter.api.BeforeAll;
@ -27,11 +28,9 @@ public class QueryFactoryTest {
public static void setUpAll() throws IOException {
var lm = WmsaHome.getLanguageModels();
var tfd = new TermFrequencyDict(lm);
queryFactory = new QueryFactory(lm,
tfd,
new EnglishDictionary(tfd)
queryFactory = new QueryFactory(
new QueryExpansion(new TermFrequencyDict(lm), new NgramLexicon(lm))
);
}
@ -112,17 +111,15 @@ public class QueryFactoryTest {
{
// the is a stopword, so it should generate an ngram search term
var specs = parseAndGetSpecs("\"the shining\"");
assertEquals(List.of("the_shining"), specs.subqueries.iterator().next().searchTermsInclude);
assertEquals(List.of(), specs.subqueries.iterator().next().searchTermsAdvice);
assertEquals(List.of(), specs.subqueries.iterator().next().searchTermCoherences);
assertEquals("the_shining", specs.query.compiledQuery);
}
{
// tde isn't a stopword, so we should get the normal behavior
var specs = parseAndGetSpecs("\"tde shining\"");
assertEquals(List.of("tde", "shining"), specs.subqueries.iterator().next().searchTermsInclude);
assertEquals(List.of("tde_shining"), specs.subqueries.iterator().next().searchTermsAdvice);
assertEquals(List.of(List.of("tde", "shining")), specs.subqueries.iterator().next().searchTermCoherences);
assertEquals("tde shining", specs.query.compiledQuery);
assertEquals(List.of("tde_shining"), specs.query.searchTermsAdvice);
assertEquals(List.of(List.of("tde", "shining")), specs.query.searchTermCoherences);
}
}
@ -150,8 +147,18 @@ public class QueryFactoryTest {
@Test
public void testPriorityTerm() {
var subquery = parseAndGetSpecs("physics ?tld:edu").subqueries.iterator().next();
var subquery = parseAndGetSpecs("physics ?tld:edu").query;
assertEquals(List.of("tld:edu"), subquery.searchTermsPriority);
assertEquals(List.of("physics"), subquery.searchTermsInclude);
assertEquals("physics", subquery.compiledQuery);
}
@Test
public void testExpansion() {
long start = System.currentTimeMillis();
var subquery = parseAndGetSpecs("elden ring mechanical keyboard slackware linux duke nukem 3d").query;
System.out.println("Time: " + (System.currentTimeMillis() - start));
System.out.println(subquery.compiledQuery);
}
}

View File

@ -46,7 +46,7 @@ public class ReverseIndexEntrySource implements EntrySource {
return;
for (int ri = entrySize, wi=1; ri < buffer.end ; ri+=entrySize, wi++) {
buffer.data[wi] = buffer.data[ri];
buffer.data.set(wi, buffer.data.get(ri));
}
buffer.end /= entrySize;

View File

@ -9,14 +9,14 @@ import io.prometheus.client.Histogram;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import lombok.SneakyThrows;
import nu.marginalia.api.searchquery.*;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
import nu.marginalia.api.searchquery.model.results.*;
import nu.marginalia.array.buffer.LongQueryBuffer;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.model.SearchParameters;
import nu.marginalia.index.model.SearchTerms;
import nu.marginalia.index.model.SearchTermsUtil;
import nu.marginalia.index.query.IndexQuery;
import nu.marginalia.index.query.IndexSearchBudget;
import nu.marginalia.index.results.IndexResultValuatorService;
@ -143,7 +143,6 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
.setEncodedWordMetadata(score.encodedWordMetadata())
.setKeyword(score.keyword)
.setHtmlFeatures(score.htmlFeatures())
.setSubquery(score.subquery)
);
}
@ -203,7 +202,9 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
return new SearchResultSet(List.of());
}
ResultRankingContext rankingContext = createRankingContext(params.rankingParams, params.subqueries);
ResultRankingContext rankingContext = createRankingContext(params.rankingParams,
params.compiledQuery,
params.compiledQueryIds);
var queryExecution = new QueryExecution(rankingContext, params.fetchSize);
@ -255,14 +256,10 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
/** Execute a search query */
public SearchResultSet run(SearchParameters parameters) throws SQLException, InterruptedException {
for (var subquery : parameters.subqueries) {
var terms = new SearchTerms(subquery);
if (terms.isEmpty())
continue;
var terms = new SearchTerms(parameters.query, parameters.compiledQueryIds);
for (var indexQuery : index.createQueries(terms, parameters.queryParams)) {
workerPool.execute(new IndexLookup(indexQuery, parameters.budget));
}
for (var indexQuery : index.createQueries(terms, parameters.queryParams)) {
workerPool.execute(new IndexLookup(indexQuery, parameters.budget));
}
for (int i = 0; i < indexValuationThreads; i++) {
@ -327,7 +324,9 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
buffer.reset();
query.getMoreResults(buffer);
results.addElements(0, buffer.data, 0, buffer.end);
for (int i = 0; i < buffer.end; i++) {
results.add(buffer.data.get(i));
}
if (results.size() < 512) {
enqueueResults(new CombinedDocIdList(results));
@ -413,8 +412,13 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
}
private ResultRankingContext createRankingContext(ResultRankingParameters rankingParams, List<SearchSubquery> subqueries) {
final var termToId = SearchTermsUtil.getAllIncludeTerms(subqueries);
private ResultRankingContext createRankingContext(ResultRankingParameters rankingParams,
CompiledQuery<String> query,
CompiledQueryLong compiledQueryIds)
{
Map<String, Long> termToId = new HashMap<>(query.size());
query.indices().forEach(id -> termToId.put(query.at(id), compiledQueryIds.at(id)));
final Map<String, Integer> termFrequencies = new HashMap<>(termToId.size());
final Map<String, Integer> prioFrequencies = new HashMap<>(termToId.size());

View File

@ -38,6 +38,13 @@ public class CombinedIndexReader {
return new IndexQueryBuilderImpl(reverseIndexFullReader, reverseIndexPriorityReader, query);
}
/** Returns a filter step that passes documents containing the given term id
 * in the full index (delegates to reverseIndexFullReader.also). */
public QueryFilterStepIf hasWordFull(long termId) {
return reverseIndexFullReader.also(termId);
}
/** Returns a filter step that passes documents containing the given term id
 * in the priority index (delegates to reverseIndexPriorityReader.also). */
public QueryFilterStepIf hasWordPrio(long termId) {
return reverseIndexPriorityReader.also(termId);
}
/** Creates a query builder for terms in the priority index */
public IndexQueryBuilder findPriorityWord(long wordId) {

View File

@ -1,9 +1,11 @@
package nu.marginalia.index.index;
import java.util.List;
import gnu.trove.set.hash.TLongHashSet;
import nu.marginalia.index.ReverseIndexReader;
import nu.marginalia.index.query.IndexQuery;
import nu.marginalia.index.query.IndexQueryBuilder;
import nu.marginalia.index.query.filter.QueryFilterAnyOf;
import nu.marginalia.index.query.filter.QueryFilterStepIf;
public class IndexQueryBuilderImpl implements IndexQueryBuilder {
@ -66,6 +68,20 @@ public class IndexQueryBuilderImpl implements IndexQueryBuilder {
return this;
}
/** Adds a disjunctive inclusion filter to the query: a document passes if any
 * of the given filter steps accepts it.  An empty list adds nothing; a single
 * step is added directly without the any-of wrapper. */
public IndexQueryBuilder addInclusionFilterAny(List<QueryFilterStepIf> filterSteps) {
if (!filterSteps.isEmpty()) {
QueryFilterStepIf step = (filterSteps.size() == 1)
? filterSteps.getFirst()
: new QueryFilterAnyOf(filterSteps);
query.addInclusionFilter(step);
}
return this;
}
public IndexQuery build() {
return query;
}

View File

@ -0,0 +1,78 @@
package nu.marginalia.index.index;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import it.unimi.dsi.fastutil.longs.LongSet;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
/** Helper for constructing index query plans: walks the alternative evaluation
 * paths (each a set of term ids) of a compiled query, branching on one
 * prioritized term at a time.
 *
 * NOTE(review): create() mutates the LongSet instances inside the caller's
 * paths list (path.remove(prio)) -- callers must not reuse those sets
 * afterwards.  TODO confirm all call sites pass freshly-built sets.
 */
class QueryBranchWalker {
/** Remaining terms to branch on, in priority order. */
public final long[] priorityOrder;
/** Paths that pass through {@link #termId}, with termId already removed from each. */
public final List<LongSet> paths;
/** The term id this walker branches on. */
public final long termId;
private QueryBranchWalker(long[] priorityOrder, List<LongSet> paths, long termId) {
this.priorityOrder = priorityOrder;
this.paths = paths;
this.termId = termId;
}
/** True when no further terms remain to branch on below this walker. */
public boolean atEnd() {
return priorityOrder.length == 0;
}
/** Groups the given paths by the highest-priority term they contain,
 * producing one walker per such term.  Empty paths are ignored; paths that
 * contain none of the prioritized terms are dropped (see note below). */
public static List<QueryBranchWalker> create(long[] priorityOrder, List<LongSet> paths) {
List<QueryBranchWalker> ret = new ArrayList<>();
List<LongSet> remainingPaths = new LinkedList<>(paths);
remainingPaths.removeIf(LongSet::isEmpty);
for (int i = 0; i < priorityOrder.length; i++) {
long prio = priorityOrder[i];
var it = remainingPaths.iterator();
List<LongSet> pathsForPrio = new ArrayList<>();
// Claim every remaining path containing this term; the term itself is
// removed from the path since this walker accounts for it.
while (it.hasNext()) {
var path = it.next();
if (path.contains(prio)) {
path.remove(prio);
pathsForPrio.add(path);
it.remove();
}
}
if (!pathsForPrio.isEmpty()) {
// Keep only the priorities still present in some claimed path,
// preserving the original priority order.
LongArrayList remainingPrios = new LongArrayList(pathsForPrio.size());
for (var p : priorityOrder) {
for (var path : pathsForPrio) {
if (path.contains(p)) {
remainingPrios.add(p);
break;
}
}
}
ret.add(new QueryBranchWalker(remainingPrios.elements(), pathsForPrio, prio));
}
}
if (!remainingPaths.isEmpty()) {
// NOTE(review): debug output on stdout; consider a logger instead
System.out.println("Dropping: " + remainingPaths);
}
return ret;
}
/** Walkers for the next level of branching beneath this walker's term. */
public List<QueryBranchWalker> next() {
if (atEnd())
return List.of();
return create(priorityOrder, paths);
}
}

View File

@ -2,6 +2,13 @@ package nu.marginalia.index.index;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import it.unimi.dsi.fastutil.longs.LongSet;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
import nu.marginalia.index.query.filter.QueryFilterAllOf;
import nu.marginalia.index.query.filter.QueryFilterAnyOf;
import nu.marginalia.index.query.filter.QueryFilterStepIf;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.index.results.model.ids.DocMetadataList;
import nu.marginalia.index.model.QueryParams;
@ -14,12 +21,13 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.*;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.function.LongFunction;
import java.util.function.Predicate;
import java.util.stream.Collectors;
/** This class delegates SearchIndexReader and deals with the stateful nature of the index,
* i.e. it may be possible to reconstruct the index and load a new set of data.
@ -105,6 +113,61 @@ public class StatefulIndex {
return combinedIndexReader != null && combinedIndexReader.isLoaded();
}
/** Returns a predicate that is true when every term id in the tested set
 * is among the permitted ids. */
private Predicate<LongSet> containsOnly(long[] permitted) {
LongSet permittedTerms = new LongOpenHashSet(permitted);
return permittedTerms::containsAll;
}
/** Creates one query builder per prioritized root term of the compiled query,
 * attaching any-of inclusion filters for the deeper branch terms via
 * QueryBranchWalker.
 *
 * NOTE(review): QueryBranchWalker.create mutates the path sets; this appears
 * safe here because the paths are freshly produced by queriesAggregate --
 * TODO confirm queriesAggregate returns fresh sets. */
private List<IndexQueryBuilder> createBuilders(CompiledQueryLong query,
LongFunction<IndexQueryBuilder> builderFactory,
long[] termPriority) {
List<LongSet> paths = CompiledQueryAggregates.queriesAggregate(query);
// Remove any paths that do not contain all prioritized terms, as this means
// the term is missing from the index and can never be found
paths.removeIf(containsOnly(termPriority).negate());
List<QueryBranchWalker> helpers = QueryBranchWalker.create(termPriority, paths);
List<IndexQueryBuilder> builders = new ArrayList<>();
for (var helper : helpers) {
var builder = builderFactory.apply(helper.termId);
builders.add(builder);
if (helper.atEnd())
continue;
// The remaining branch terms become a disjunctive filter tree on the builder
var filters = helper.next().stream()
.map(this::createFilter)
.toList();
builder.addInclusionFilterAny(filters);
}
return builders;
}
/** Recursively converts a branch walker into a filter step: the walker's own
 * term must be present (full index), and -- when deeper branches exist -- at
 * least one descendant chain must also match. */
private QueryFilterStepIf createFilter(QueryBranchWalker helper) {
var selfCondition = combinedIndexReader.hasWordFull(helper.termId);
if (helper.atEnd())
return selfCondition;
var nextSteps = helper.next();
// Each child branch requires this term AND the child's own recursive filter
var nextFilters = nextSteps.stream()
.map(this::createFilter)
.map(filter -> new QueryFilterAllOf(List.of(selfCondition, filter)))
.collect(Collectors.toList());
if (nextFilters.isEmpty())
return selfCondition;
if (nextFilters.size() == 1)
return nextFilters.getFirst();
return new QueryFilterAnyOf(nextFilters);
}
public List<IndexQuery> createQueries(SearchTerms terms, QueryParams params) {
@ -117,40 +180,13 @@ public class StatefulIndex {
final long[] orderedIncludesPrio = terms.sortedDistinctIncludes(this::compareKeywordsPrio);
List<IndexQueryBuilder> queryHeads = new ArrayList<>(10);
queryHeads.addAll(createBuilders(terms.compiledQuery(), combinedIndexReader::findFullWord, orderedIncludes));
queryHeads.addAll(createBuilders(terms.compiledQuery(), combinedIndexReader::findPriorityWord, orderedIncludesPrio));
List<IndexQuery> queries = new ArrayList<>(10);
// To ensure that good results are discovered, create separate query heads for the priority index that
// filter for terms that contain pairs of two search terms
if (orderedIncludesPrio.length > 1) {
for (int i = 0; i + 1 < orderedIncludesPrio.length; i++) {
for (int j = i + 1; j < orderedIncludesPrio.length; j++) {
var entrySource = combinedIndexReader
.findPriorityWord(orderedIncludesPrio[i])
.alsoPrio(orderedIncludesPrio[j]);
queryHeads.add(entrySource);
}
}
}
// Next consider entries that appear only once in the priority index
for (var wordId : orderedIncludesPrio) {
queryHeads.add(combinedIndexReader.findPriorityWord(wordId));
}
// Finally consider terms in the full index
queryHeads.add(combinedIndexReader.findFullWord(orderedIncludes[0]));
for (var query : queryHeads) {
if (query == null) {
return Collections.emptyList();
}
// Note that we can add all includes as filters, even though
// they may not be present in the query head, as the query builder
// will ignore redundant include filters:
for (long orderedInclude : orderedIncludes) {
query = query.alsoFull(orderedInclude);
}
for (long term : terms.excludes()) {
query = query.notFull(term);
@ -161,6 +197,7 @@ public class StatefulIndex {
queries.add(query.addInclusionFilter(combinedIndexReader.filterForParams(params)).build());
}
return queries;
}

View File

@ -2,16 +2,16 @@ package nu.marginalia.index.model;
import nu.marginalia.api.searchquery.IndexProtobufCodec;
import nu.marginalia.api.searchquery.RpcIndexQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryParser;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.index.query.IndexSearchBudget;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.searchset.SearchSet;
import java.util.ArrayList;
import java.util.List;
import static nu.marginalia.api.searchquery.IndexProtobufCodec.convertSpecLimit;
public class SearchParameters {
@ -21,13 +21,16 @@ public class SearchParameters {
*/
public final int fetchSize;
public final IndexSearchBudget budget;
public final List<SearchSubquery> subqueries;
public final SearchQuery query;
public final QueryParams queryParams;
public final ResultRankingParameters rankingParams;
public final int limitByDomain;
public final int limitTotal;
public final CompiledQuery<String> compiledQuery;
public final CompiledQueryLong compiledQueryIds;
// mutable:
/**
@ -40,7 +43,7 @@ public class SearchParameters {
this.fetchSize = limits.fetchSize();
this.budget = new IndexSearchBudget(limits.timeoutMs());
this.subqueries = specsSet.subqueries;
this.query = specsSet.query;
this.limitByDomain = limits.resultsByDomain();
this.limitTotal = limits.resultsTotal();
@ -52,6 +55,9 @@ public class SearchParameters {
searchSet,
specsSet.queryStrategy);
compiledQuery = CompiledQueryParser.parse(this.query.compiledQuery);
compiledQueryIds = compiledQuery.mapToLong(SearchTermsUtil::getWordId);
rankingParams = specsSet.rankingParams;
}
@ -63,11 +69,8 @@ public class SearchParameters {
// The time budget is halved because this is the point when we start to
// wrap up the search and return the results.
this.budget = new IndexSearchBudget(limits.timeoutMs() / 2);
this.query = IndexProtobufCodec.convertRpcQuery(request.getQuery());
this.subqueries = new ArrayList<>(request.getSubqueriesCount());
for (int i = 0; i < request.getSubqueriesCount(); i++) {
this.subqueries.add(IndexProtobufCodec.convertSearchSubquery(request.getSubqueries(i)));
}
this.limitByDomain = limits.resultsByDomain();
this.limitTotal = limits.resultsTotal();
@ -79,9 +82,13 @@ public class SearchParameters {
searchSet,
QueryStrategy.valueOf(request.getQueryStrategy()));
compiledQuery = CompiledQueryParser.parse(this.query.compiledQuery);
compiledQueryIds = compiledQuery.mapToLong(SearchTermsUtil::getWordId);
rankingParams = IndexProtobufCodec.convertRankingParameterss(request.getParameters());
}
public long getDataCost() {
return dataCost;
}

View File

@ -4,7 +4,8 @@ import it.unimi.dsi.fastutil.longs.LongArrayList;
import it.unimi.dsi.fastutil.longs.LongComparator;
import it.unimi.dsi.fastutil.longs.LongList;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import java.util.ArrayList;
import java.util.List;
@ -18,34 +19,39 @@ public final class SearchTerms {
private final LongList priority;
private final List<LongList> coherences;
private final CompiledQueryLong compiledQueryIds;
public SearchTerms(
LongList includes,
LongList excludes,
LongList priority,
List<LongList> coherences
List<LongList> coherences,
CompiledQueryLong compiledQueryIds
) {
this.includes = includes;
this.excludes = excludes;
this.priority = priority;
this.coherences = coherences;
this.compiledQueryIds = compiledQueryIds;
}
public SearchTerms(SearchSubquery subquery) {
public SearchTerms(SearchQuery query, CompiledQueryLong compiledQueryIds) {
this(new LongArrayList(),
new LongArrayList(),
new LongArrayList(),
new ArrayList<>());
new ArrayList<>(),
compiledQueryIds);
for (var word : subquery.searchTermsInclude) {
for (var word : query.searchTermsInclude) {
includes.add(getWordId(word));
}
for (var word : subquery.searchTermsAdvice) {
for (var word : query.searchTermsAdvice) {
// This looks like a bug, but it's not
includes.add(getWordId(word));
}
for (var coherence : subquery.searchTermCoherences) {
for (var coherence : query.searchTermCoherences) {
LongList parts = new LongArrayList(coherence.size());
for (var word : coherence) {
@ -55,10 +61,10 @@ public final class SearchTerms {
coherences.add(parts);
}
for (var word : subquery.searchTermsExclude) {
for (var word : query.searchTermsExclude) {
excludes.add(getWordId(word));
}
for (var word : subquery.searchTermsPriority) {
for (var word : query.searchTermsPriority) {
priority.add(getWordId(word));
}
}
@ -96,6 +102,8 @@ public final class SearchTerms {
return coherences;
}
public CompiledQueryLong compiledQuery() { return compiledQueryIds; }
@Override
public boolean equals(Object obj) {
if (obj == this) return true;

View File

@ -1,29 +1,9 @@
package nu.marginalia.index.model;
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
import nu.marginalia.hash.MurmurHash3_128;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class SearchTermsUtil {
/** Extract all include-terms from the specified subqueries,
* and a return a map of the terms and their termIds.
*/
public static Map<String, Long> getAllIncludeTerms(List<SearchSubquery> subqueries) {
Map<String, Long> ret = new HashMap<>();
for (var subquery : subqueries) {
for (var include : subquery.searchTermsInclude) {
ret.computeIfAbsent(include, i -> getWordId(include));
}
}
return ret;
}
private static final MurmurHash3_128 hasher = new MurmurHash3_128();
/** Translate the word to a unique id. */

View File

@ -4,7 +4,8 @@ import com.google.inject.Inject;
import gnu.trove.map.hash.TObjectLongHashMap;
import it.unimi.dsi.fastutil.longs.Long2ObjectArrayMap;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.model.SearchTermsUtil;
import nu.marginalia.index.results.model.QuerySearchTerms;
@ -13,9 +14,6 @@ import nu.marginalia.index.results.model.TermMetadataForCombinedDocumentIds;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.index.results.model.ids.TermIdList;
import java.util.ArrayList;
import java.util.List;
import static nu.marginalia.index.results.model.TermCoherenceGroupList.TermCoherenceGroup;
import static nu.marginalia.index.results.model.TermMetadataForCombinedDocumentIds.DocumentsWithMetadata;
@ -42,43 +40,24 @@ public class IndexMetadataService {
return new TermMetadataForCombinedDocumentIds(termdocToMeta);
}
public QuerySearchTerms getSearchTerms(List<SearchSubquery> searchTermVariants) {
public QuerySearchTerms getSearchTerms(CompiledQuery<String> compiledQuery, SearchQuery searchQuery) {
LongArrayList termIdsList = new LongArrayList();
TObjectLongHashMap<String> termToId = new TObjectLongHashMap<>(10, 0.75f, -1);
for (var subquery : searchTermVariants) {
for (var term : subquery.searchTermsInclude) {
if (termToId.containsKey(term)) {
continue;
}
long id = SearchTermsUtil.getWordId(term);
termIdsList.add(id);
termToId.put(term, id);
}
for (String word : compiledQuery) {
long id = SearchTermsUtil.getWordId(word);
termIdsList.add(id);
termToId.put(word, id);
}
return new QuerySearchTerms(termToId,
new TermIdList(termIdsList),
getTermCoherences(searchTermVariants));
}
private TermCoherenceGroupList getTermCoherences(List<SearchSubquery> searchTermVariants) {
List<TermCoherenceGroup> coherences = new ArrayList<>();
for (var subquery : searchTermVariants) {
for (var coh : subquery.searchTermCoherences) {
coherences.add(new TermCoherenceGroup(coh));
}
// It's assumed each subquery has identical coherences
break;
}
return new TermCoherenceGroupList(coherences);
new TermCoherenceGroupList(
searchQuery.searchTermCoherences.stream().map(TermCoherenceGroup::new).toList()
)
);
}
}

View File

@ -1,10 +1,13 @@
package nu.marginalia.index.results;
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.model.SearchParameters;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.index.model.QueryParams;
import nu.marginalia.index.results.model.QuerySearchTerms;
@ -23,7 +26,6 @@ import java.util.List;
* reasons to cache this data, and performs the calculations */
public class IndexResultValuationContext {
private final StatefulIndex statefulIndex;
private final List<List<String>> searchTermVariants;
private final QueryParams queryParams;
private final TermMetadataForCombinedDocumentIds termMetadataForCombinedDocumentIds;
@ -31,23 +33,26 @@ public class IndexResultValuationContext {
private final ResultRankingContext rankingContext;
private final ResultValuator searchResultValuator;
private final CompiledQuery<String> compiledQuery;
private final CompiledQueryLong compiledQueryIds;
public IndexResultValuationContext(IndexMetadataService metadataService,
ResultValuator searchResultValuator,
CombinedDocIdList ids,
StatefulIndex statefulIndex,
ResultRankingContext rankingContext,
List<SearchSubquery> subqueries,
QueryParams queryParams
SearchParameters params
) {
this.statefulIndex = statefulIndex;
this.rankingContext = rankingContext;
this.searchResultValuator = searchResultValuator;
this.searchTermVariants = subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList();
this.queryParams = queryParams;
this.queryParams = params.queryParams;
this.compiledQuery = params.compiledQuery;
this.compiledQueryIds = params.compiledQueryIds;
this.searchTerms = metadataService.getSearchTerms(params.compiledQuery, params.query);
this.searchTerms = metadataService.getSearchTerms(subqueries);
this.termMetadataForCombinedDocumentIds = metadataService.getTermMetadataForDocuments(ids, searchTerms.termIdsAll);
}
@ -65,68 +70,39 @@ public class IndexResultValuationContext {
long docMetadata = statefulIndex.getDocumentMetadata(docId);
int htmlFeatures = statefulIndex.getHtmlFeatures(docId);
int maxFlagsCount = 0;
boolean anyAllSynthetic = false;
int maxPositionsSet = 0;
SearchResultItem searchResult = new SearchResultItem(docId);
SearchResultItem searchResult = new SearchResultItem(docId,
searchTermVariants.stream().mapToInt(List::size).sum());
SearchResultKeywordScore[] scores = compiledQuery.indices().mapToObj(idx ->
new SearchResultKeywordScore(
compiledQuery.at(idx),
compiledQueryIds.at(idx),
termMetadataForCombinedDocumentIds.getTermMetadata(
compiledQueryIds.at(idx), combinedId
),
docMetadata,
htmlFeatures)
)
.toArray(SearchResultKeywordScore[]::new);
for (int querySetId = 0;
querySetId < searchTermVariants.size();
querySetId++)
{
var termList = searchTermVariants.get(querySetId);
// DANGER: IndexResultValuatorService assumes that searchResult.keywordScores has this specific order, as it needs
// to be able to re-construct its own CompiledQuery<SearchResultKeywordScore> for re-ranking the results. This is
// a very flimsy assumption.
searchResult.keywordScores.addAll(List.of(scores));
SearchResultKeywordScore[] termScoresForSet = new SearchResultKeywordScore[termList.size()];
CompiledQuery<SearchResultKeywordScore> queryGraphScores = new CompiledQuery<>(compiledQuery.root, scores);
boolean synthetic = true;
boolean allSynthetic = !CompiledQueryAggregates.booleanAggregate(queryGraphScores, score -> !score.hasTermFlag(WordFlags.Synthetic));
int flagsCount = CompiledQueryAggregates.intMaxMinAggregate(queryGraphScores, score -> Long.bitCount(score.encodedWordMetadata() & flagsFilterMask));
int positionsCount = CompiledQueryAggregates.intMaxMinAggregate(queryGraphScores, SearchResultKeywordScore::positionCount);
for (int termIdx = 0; termIdx < termList.size(); termIdx++) {
String searchTerm = termList.get(termIdx);
long termMetadata = termMetadataForCombinedDocumentIds.getTermMetadata(
searchTerms.getIdForTerm(searchTerm),
combinedId
);
var score = new SearchResultKeywordScore(
querySetId,
searchTerm,
termMetadata,
docMetadata,
htmlFeatures
);
synthetic &= WordFlags.Synthetic.isPresent(termMetadata);
searchResult.keywordScores.add(score);
termScoresForSet[termIdx] = score;
}
if (!meetsQueryStrategyRequirements(termScoresForSet, queryParams.queryStrategy())) {
continue;
}
int minFlagsCount = 8;
int minPositionsSet = 4;
for (var termScore : termScoresForSet) {
final int flagCount = Long.bitCount(termScore.encodedWordMetadata() & flagsFilterMask);
minFlagsCount = Math.min(minFlagsCount, flagCount);
minPositionsSet = Math.min(minPositionsSet, termScore.positionCount());
}
maxFlagsCount = Math.max(maxFlagsCount, minFlagsCount);
maxPositionsSet = Math.max(maxPositionsSet, minPositionsSet);
anyAllSynthetic |= synthetic;
if (!meetsQueryStrategyRequirements(queryGraphScores, queryParams.queryStrategy())) {
return null;
}
if (maxFlagsCount == 0 && !anyAllSynthetic && maxPositionsSet == 0)
if (flagsCount == 0 && !allSynthetic && positionsCount == 0)
return null;
double score = searchResultValuator.calculateSearchResultValue(searchResult.keywordScores,
double score = searchResultValuator.calculateSearchResultValue(queryGraphScores,
5000, // use a dummy value here as it's not present in the index
rankingContext);
@ -135,20 +111,17 @@ public class IndexResultValuationContext {
return searchResult;
}
private boolean meetsQueryStrategyRequirements(SearchResultKeywordScore[] termSet, QueryStrategy queryStrategy) {
private boolean meetsQueryStrategyRequirements(CompiledQuery<SearchResultKeywordScore> queryGraphScores,
QueryStrategy queryStrategy)
{
if (queryStrategy == QueryStrategy.AUTO ||
queryStrategy == QueryStrategy.SENTENCE ||
queryStrategy == QueryStrategy.TOPIC) {
return true;
}
for (var keyword : termSet) {
if (!meetsQueryStrategyRequirements(keyword, queryParams.queryStrategy())) {
return false;
}
}
return true;
return CompiledQueryAggregates.booleanAggregate(queryGraphScores,
docs -> meetsQueryStrategyRequirements(docs, queryParams.queryStrategy()));
}
private boolean meetsQueryStrategyRequirements(SearchResultKeywordScore termScore, QueryStrategy queryStrategy) {

View File

@ -4,10 +4,11 @@ import com.google.inject.Inject;
import com.google.inject.Singleton;
import gnu.trove.list.TLongList;
import gnu.trove.list.array.TLongArrayList;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.model.SearchParameters;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
@ -19,8 +20,6 @@ import org.slf4j.LoggerFactory;
import java.sql.SQLException;
import java.util.*;
import java.util.function.Consumer;
import java.util.stream.Collectors;
@Singleton
public class IndexResultValuatorService {
@ -44,8 +43,8 @@ public class IndexResultValuatorService {
}
public List<SearchResultItem> rankResults(SearchParameters params,
ResultRankingContext rankingContext,
CombinedDocIdList resultIds)
ResultRankingContext rankingContext,
CombinedDocIdList resultIds)
{
final var evaluator = createValuationContext(params, rankingContext, resultIds);
@ -70,8 +69,7 @@ public class IndexResultValuatorService {
resultIds,
statefulIndex,
rankingContext,
params.subqueries,
params.queryParams);
params);
}
@ -96,12 +94,13 @@ public class IndexResultValuatorService {
item.resultsFromDomain = domainCountFilter.getCount(item);
}
return decorateAndRerank(resultsList, rankingContext);
return decorateAndRerank(resultsList, params.compiledQuery, rankingContext);
}
/** Decorate the result items with additional information from the link database
* and calculate an updated ranking with the additional information */
public List<DecoratedSearchResultItem> decorateAndRerank(List<SearchResultItem> rawResults,
CompiledQuery<String> compiledQuery,
ResultRankingContext rankingContext)
throws SQLException
{
@ -125,13 +124,22 @@ public class IndexResultValuatorService {
continue;
}
resultItems.add(createCombinedItem(result, docData, rankingContext));
// Reconstruct the SearchResultKeywordScore-compiledquery for re-valuation
//
// CAVEAT: This hinges on a very fragile assumption that IndexResultValuationContext puts them in the same
// order as the data for the CompiledQuery<String>.
CompiledQuery<SearchResultKeywordScore> resultQuery =
new CompiledQuery<>(compiledQuery.root, result.keywordScores.toArray(SearchResultKeywordScore[]::new));
resultItems.add(createCombinedItem(result, docData, resultQuery, rankingContext));
}
return resultItems;
}
private DecoratedSearchResultItem createCombinedItem(SearchResultItem result,
DocdbUrlDetail docData,
CompiledQuery<SearchResultKeywordScore> resultQuery,
ResultRankingContext rankingContext) {
return new DecoratedSearchResultItem(
result,
@ -144,7 +152,7 @@ public class IndexResultValuatorService {
docData.pubYear(),
docData.dataHash(),
docData.wordsTotal(),
resultValuator.calculateSearchResultValue(result.keywordScores, docData.wordsTotal(), rankingContext)
resultValuator.calculateSearchResultValue(resultQuery, docData.wordsTotal(), rankingContext)
);
}

View File

@ -1,5 +1,6 @@
package nu.marginalia.ranking.results;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
@ -33,14 +34,17 @@ public class ResultValuator {
this.termCoherenceFactor = termCoherenceFactor;
}
public double calculateSearchResultValue(List<SearchResultKeywordScore> scores,
public double calculateSearchResultValue(CompiledQuery<SearchResultKeywordScore> scores,
int length,
ResultRankingContext ctx)
{
int sets = numberOfSets(scores);
if (scores.size() == 0)
return Double.MAX_VALUE;
if (length < 0)
length = 5000;
long documentMetadata = documentMetadata(scores);
int features = htmlFeatures(scores);
long documentMetadata = scores.at(0).encodedDocMetadata();
int features = scores.at(0).htmlFeatures();
var rankingParams = ctx.params;
int rank = DocumentMetadata.decodeRank(documentMetadata);
@ -75,32 +79,16 @@ public class ResultValuator {
+ temporalBias
+ flagsPenalty;
double bestTcf = 0;
double bestBM25F = 0;
double bestBM25P = 0;
double bestBM25PN = 0;
for (int set = 0; set < sets; set++) {
ResultKeywordSet keywordSet = createKeywordSet(scores, set);
if (keywordSet.isEmpty())
continue;
bestTcf = Math.max(bestTcf, rankingParams.tcfWeight * termCoherenceFactor.calculate(keywordSet));
bestBM25P = Math.max(bestBM25P, rankingParams.bm25PrioWeight * bm25Factor.calculateBm25Prio(rankingParams.prioParams, keywordSet, ctx));
bestBM25F = Math.max(bestBM25F, rankingParams.bm25FullWeight * bm25Factor.calculateBm25(rankingParams.fullParams, keywordSet, length, ctx));
if (keywordSet.hasNgram()) {
bestBM25PN = Math.max(bestBM25PN, rankingParams.bm25PrioWeight * bm25Factor.calculateBm25Prio(rankingParams.prioParams, keywordSet, ctx));
}
}
double bestTcf = rankingParams.tcfWeight * termCoherenceFactor.calculate(scores);
double bestBM25F = rankingParams.bm25FullWeight * bm25Factor.calculateBm25(rankingParams.prioParams, scores, length, ctx);
double bestBM25P = rankingParams.bm25PrioWeight * bm25Factor.calculateBm25Prio(rankingParams.prioParams, scores, ctx);
double overallPartPositive = Math.max(0, overallPart);
double overallPartNegative = -Math.min(0, overallPart);
// Renormalize to 0...15, where 0 is the best possible score;
// this is a historical artifact of the original ranking function
return normalize(1.5 * bestTcf + bestBM25F + bestBM25P + 0.25 * bestBM25PN + overallPartPositive, overallPartNegative);
return normalize(1.5 * bestTcf + bestBM25F + bestBM25P + overallPartPositive, overallPartNegative);
}
private double calculateQualityPenalty(int size, int quality, ResultRankingParameters rankingParams) {

View File

@ -1,10 +1,11 @@
package nu.marginalia.ranking.results.factors;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
import nu.marginalia.api.searchquery.model.results.Bm25Parameters;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.ranking.results.ResultKeywordSet;
public class Bm25Factor {
private static final int AVG_LENGTH = 5000;
@ -13,43 +14,33 @@ public class Bm25Factor {
*
* @see Bm25Parameters
*/
public double calculateBm25(Bm25Parameters bm25Parameters, ResultKeywordSet keywordSet, int length, ResultRankingContext ctx) {
public double calculateBm25(Bm25Parameters bm25Parameters, CompiledQuery<SearchResultKeywordScore> scores, int length, ResultRankingContext ctx) {
final int docCount = ctx.termFreqDocCount();
if (length <= 0)
length = AVG_LENGTH;
double sum = 0.;
for (var keyword : keywordSet.keywords()) {
return CompiledQueryAggregates.doubleSumAggregate(scores, keyword -> {
double count = keyword.positionCount();
int freq = ctx.frequency(keyword.keyword);
sum += invFreq(docCount, freq) * f(bm25Parameters.k(), bm25Parameters.b(), count, length);
}
return sum;
return invFreq(docCount, freq) * f(bm25Parameters.k(), bm25Parameters.b(), count, length);
});
}
/** Bm25 calculation, except instead of counting positions in the document,
* the number of relevance signals for the term is counted instead.
*/
public double calculateBm25Prio(Bm25Parameters bm25Parameters, ResultKeywordSet keywordSet, ResultRankingContext ctx) {
public double calculateBm25Prio(Bm25Parameters bm25Parameters, CompiledQuery<SearchResultKeywordScore> scores, ResultRankingContext ctx) {
final int docCount = ctx.termFreqDocCount();
double sum = 0.;
for (var keyword : keywordSet.keywords()) {
return CompiledQueryAggregates.doubleSumAggregate(scores, keyword -> {
double count = evaluatePriorityScore(keyword);
int freq = ctx.priorityFrequency(keyword.keyword);
// note we override b to zero for priority terms as they are independent of document length
sum += invFreq(docCount, freq) * f(bm25Parameters.k(), 0, count, 0);
}
return invFreq(docCount, freq) * f(bm25Parameters.k(), 0, count, 0);
});
return sum;
}
private static double evaluatePriorityScore(SearchResultKeywordScore keyword) {

View File

@ -1,14 +1,16 @@
package nu.marginalia.ranking.results.factors;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.ranking.results.ResultKeywordSet;
/** Rewards documents where terms appear frequently within the same sentences
*/
public class TermCoherenceFactor {
public double calculate(ResultKeywordSet keywordSet) {
long mask = combinedMask(keywordSet);
public double calculate(CompiledQuery<SearchResultKeywordScore> scores) {
long mask = CompiledQueryAggregates.longBitmaskAggregate(scores, score -> score.positions() & WordMetadata.POSITIONS_MASK);
return bitsSetFactor(mask);
}
@ -19,14 +21,5 @@ public class TermCoherenceFactor {
return Math.pow(bitsSetInMask/(float) WordMetadata.POSITIONS_COUNT, 0.25);
}
long combinedMask(ResultKeywordSet keywordSet) {
long mask = WordMetadata.POSITIONS_MASK;
for (var keyword : keywordSet.keywords()) {
mask &= keyword.positions();
}
return mask;
}
}

View File

@ -2,6 +2,8 @@ package nu.marginalia.index.query;
import nu.marginalia.index.query.filter.QueryFilterStepIf;
import java.util.List;
/** Builds a query.
* <p />
* Note: The query builder may omit predicates that are deemed redundant.
@ -21,6 +23,7 @@ public interface IndexQueryBuilder {
IndexQueryBuilder notFull(long termId);
IndexQueryBuilder addInclusionFilter(QueryFilterStepIf filterStep);
IndexQueryBuilder addInclusionFilterAny(List<QueryFilterStepIf> filterStep);
IndexQuery build();
}

View File

@ -0,0 +1,57 @@
package nu.marginalia.index.query.filter;
import nu.marginalia.array.buffer.LongQueryBuffer;
import java.util.List;
import java.util.StringJoiner;
/** Conjunctive query filter: a value passes only if every constituent step accepts it.
 * Applying the filter to a buffer successively narrows it to the intersection of
 * all the steps' accepted sets. */
public class QueryFilterAllOf implements QueryFilterStepIf {
    private final List<? extends QueryFilterStepIf> steps;

    public QueryFilterAllOf(List<? extends QueryFilterStepIf> steps) {
        this.steps = steps;
    }

    /** Combined cost estimate for executing all steps.
     * Steps costing more than 1 contribute multiplicatively via their logarithm,
     * cheaper steps contribute additively. */
    public double cost() {
        double acc = 1.;

        for (var step : steps) {
            double stepCost = step.cost();
            acc = (stepCost > 1.0) ? acc * Math.log(stepCost) : acc + stepCost;
        }

        return acc;
    }

    @Override
    public boolean test(long value) {
        // allMatch short-circuits on the first rejecting step,
        // just like an explicit loop with an early return would
        return steps.stream().allMatch(step -> step.test(value));
    }

    public void apply(LongQueryBuffer buffer) {
        if (steps.isEmpty())
            return;

        // Each step retains only the values it accepts, so applying them in
        // sequence leaves the buffer holding the conjunction of all steps
        steps.forEach(step -> step.apply(buffer));
    }

    public String describe() {
        var sj = new StringJoiner(",", "[All Of: ", "]");
        steps.forEach(step -> sj.add(step.describe()));
        return sj.toString();
    }
}

View File

@ -2,7 +2,6 @@ package nu.marginalia.index.query.filter;
import nu.marginalia.array.buffer.LongQueryBuffer;
import java.util.Arrays;
import java.util.List;
import java.util.StringJoiner;
@ -14,7 +13,7 @@ public class QueryFilterAnyOf implements QueryFilterStepIf {
}
public double cost() {
return steps.stream().mapToDouble(QueryFilterStepIf::cost).average().orElse(0.);
return steps.stream().mapToDouble(QueryFilterStepIf::cost).sum();
}
@Override
@ -31,31 +30,23 @@ public class QueryFilterAnyOf implements QueryFilterStepIf {
if (steps.isEmpty())
return;
int start;
int start = 0;
int end = buffer.end;
steps.getFirst().apply(buffer);
// The filter functions will partition the data in the buffer from 0 to END,
// and update END to the length of the retained items, keeping the retained
// items sorted but making no guarantees about the rejected half
//
// Therefore, we need to re-sort the rejected side, and to satisfy the
// constraint that the data is sorted up to END, finally sort it again.
//
// This sorting may seem like it's slower, but filter.apply(...) is
// typically much faster than iterating over filter.test(...); so this
// is more than made up for
for (int fi = 1; fi < steps.size(); fi++)
for (var step : steps)
{
start = buffer.end;
Arrays.sort(buffer.data, start, end);
buffer.startFilterForRange(start, end);
steps.get(fi).apply(buffer);
var slice = buffer.slice(start, end);
slice.data.quickSort(0, slice.size());
step.apply(slice);
start += slice.end;
}
Arrays.sort(buffer.data, 0, buffer.end);
buffer.data.quickSort(0, start);
// Special finalization
buffer.reset();
buffer.end = start;
}
public String describe() {

View File

@ -16,7 +16,7 @@ public class QueryFilterLetThrough implements QueryFilterStepIf {
}
public double cost() {
return 0.;
return 1.;
}
public String describe() {

View File

@ -15,7 +15,7 @@ public class QueryFilterNoPass implements QueryFilterStepIf {
}
public double cost() {
return 0.;
return 1.;
}
public String describe() {

View File

@ -16,7 +16,7 @@ public class QueryFilterStepExcludeFromPredicate implements QueryFilterStepIf {
@Override
public double cost() {
return 0;
return 1;
}
@Override

View File

@ -16,7 +16,7 @@ public class QueryFilterStepFromPredicate implements QueryFilterStepIf {
@Override
public double cost() {
return 0;
return 1;
}
@Override

View File

@ -55,6 +55,32 @@ class QueryFilterStepIfTest {
assertArrayEquals(new long[]{8, 10}, buffer.copyData());
}
@Test
public void testSuccessiveApplicationWithAllOf() {
    // Start with values 1..10 in the buffer
    var buffer = createBuffer(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
    // First step keeps only even values: {2, 4, 6, 8, 10}
    var filter1 = new QueryFilterStepFromPredicate(value -> value % 2 == 0);
    // Second step excludes values <= 6, i.e. retains values > 6
    var filter2 = new QueryFilterStepExcludeFromPredicate(value -> value <= 6);
    // AllOf applies both steps in sequence; the survivors are even AND > 6
    new QueryFilterAllOf(List.of(filter1, filter2)).apply(buffer);
    assertArrayEquals(new long[]{8, 10}, buffer.copyData());
}
@Test
public void testCombinedOrAnd() {
    // Start with values 1..10 in the buffer
    var buffer = createBuffer(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);

    // Branch A: even AND <= 5  -> {2, 4}
    var filter1 = new QueryFilterStepFromPredicate(value -> value % 2 == 0);
    var filter2 = new QueryFilterStepFromPredicate(value -> value <= 5);
    var filter1_2 = new QueryFilterAllOf(List.of(filter1, filter2));

    // Branch B: odd AND > 5  -> {7, 9}
    var filter3 = new QueryFilterStepFromPredicate(value -> value % 2 == 1);
    var filter4 = new QueryFilterStepFromPredicate(value -> value > 5);
    var filter3_4 = new QueryFilterAllOf(List.of(filter3, filter4));

    // Disjunction of the two conjunctions: expect {2, 4} union {7, 9}, sorted
    var filter12_34 = new QueryFilterAnyOf(List.of(filter1_2, filter3_4));

    filter12_34.apply(buffer);

    assertArrayEquals(new long[]{2, 4, 7, 9}, buffer.copyData());
}
@Test
public void testCombinedApplication() {
var buffer = createBuffer(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);

View File

@ -5,7 +5,7 @@ import com.google.inject.Inject;
import lombok.SneakyThrows;
import nu.marginalia.IndexLocations;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.process.control.FakeProcessHeartbeat;
@ -123,9 +123,10 @@ public class IndexQueryServiceIntegrationSmokeTest {
.rankingParams(ResultRankingParameters.sensibleDefaults())
.domains(new ArrayList<>())
.searchSetIdentifier("NONE")
.subqueries(List.of(new SearchSubquery(
.query(new SearchQuery(
"2 3 5",
List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(),
Collections.emptyList()))).build());
Collections.emptyList())).build());
int[] idxes = new int[] { 30, 510, 90, 150, 210, 270, 330, 390, 450 };
long[] ids = IntStream.of(idxes).mapToLong(this::fullId).toArray();
@ -166,9 +167,13 @@ public class IndexQueryServiceIntegrationSmokeTest {
.rankingParams(ResultRankingParameters.sensibleDefaults())
.queryStrategy(QueryStrategy.SENTENCE)
.domains(List.of(2))
.subqueries(List.of(new SearchSubquery(
List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(),
Collections.emptyList()))).build());
.query(new SearchQuery(
"2 3 5",
List.of("3", "5", "2"),
List.of("4"),
Collections.emptyList(),
Collections.emptyList(),
Collections.emptyList())).build());
int[] idxes = new int[] { 210, 270 };
long[] ids = IntStream.of(idxes).mapToLong(id -> UrlIdCodec.encodeId(id/100, id)).toArray();
long[] actual = rsp.results.stream().mapToLong(i -> i.rawIndexResult.getDocumentId()).toArray();
@ -202,9 +207,8 @@ public class IndexQueryServiceIntegrationSmokeTest {
.queryStrategy(QueryStrategy.SENTENCE)
.searchSetIdentifier("NONE")
.rankingParams(ResultRankingParameters.sensibleDefaults())
.subqueries(List.of(new SearchSubquery(
List.of("4"), Collections.emptyList(), Collections.emptyList(), Collections.emptyList(),
Collections.emptyList()))
.query(
new SearchQuery("4", List.of("4"), Collections.emptyList(), Collections.emptyList(), Collections.emptyList(), Collections.emptyList())
).build());

View File

@ -4,7 +4,7 @@ import com.google.inject.Guice;
import com.google.inject.Inject;
import nu.marginalia.IndexLocations;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.storage.FileStorageService;
@ -35,6 +35,7 @@ import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.service.control.ServiceHeartbeat;
import nu.marginalia.service.server.Initialization;
import org.apache.logging.log4j.util.Strings;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@ -108,7 +109,7 @@ public class IndexQueryServiceIntegrationTest {
w("world", WordFlags.Title)
).load();
var query = basicQuery(builder -> builder.subqueries(justInclude("hello", "world")));
var query = basicQuery(builder -> builder.query(justInclude("hello", "world")));
executeSearch(query)
.expectDocumentsInOrder(d(1,1));
@ -127,57 +128,51 @@ public class IndexQueryServiceIntegrationTest {
).load();
var queryMissingExclude = basicQuery(builder ->
builder.subqueries(includeAndExclude("hello", "missing")));
builder.query(includeAndExclude("hello", "missing")));
executeSearch(queryMissingExclude)
.expectDocumentsInOrder(d(1,1));
var queryMissingInclude = basicQuery(builder ->
builder.subqueries(justInclude("missing")));
builder.query(justInclude("missing")));
executeSearch(queryMissingInclude)
.expectCount(0);
var queryMissingPriority = basicQuery(builder ->
builder.subqueries(
List.of(
new SearchSubquery(
List.of("hello"),
List.of(),
List.of(),
List.of("missing"),
List.of()
)
)));
builder.query(new SearchQuery(
"hello",
List.of("hello"),
List.of(),
List.of(),
List.of("missing"),
List.of())
));
executeSearch(queryMissingPriority)
.expectCount(1);
var queryMissingAdvice = basicQuery(builder ->
builder.subqueries(
List.of(
new SearchSubquery(
List.of("hello"),
List.of(),
List.of("missing"),
List.of(),
List.of()
)
builder.query(
new SearchQuery("hello",
List.of("hello"),
List.of(),
List.of("missing"),
List.of(),
List.of()
)));
executeSearch(queryMissingAdvice)
.expectCount(0);
var queryMissingCoherence = basicQuery(builder ->
builder.subqueries(
List.of(
new SearchSubquery(
List.of("hello"),
List.of(),
List.of(),
List.of(),
List.of(List.of("missing", "hello"))
)
builder.query(
new SearchQuery("hello",
List.of("hello"),
List.of(),
List.of(),
List.of(),
List.of(List.of("missing", "hello"))
)));
executeSearch(queryMissingCoherence)
@ -202,7 +197,7 @@ public class IndexQueryServiceIntegrationTest {
).load();
var query = basicQuery(builder -> builder.subqueries(justInclude("hello", "world")));
var query = basicQuery(builder -> builder.query(justInclude("hello", "world")));
executeSearch(query)
.expectDocumentsInOrder(d(1,1));
@ -234,15 +229,15 @@ public class IndexQueryServiceIntegrationTest {
var beforeY2K = basicQuery(builder ->
builder.subqueries(justInclude("hello", "world"))
builder.query(justInclude("hello", "world"))
.year(SpecificationLimit.lessThan(2000))
);
var atY2K = basicQuery(builder ->
builder.subqueries(justInclude("hello", "world"))
builder.query(justInclude("hello", "world"))
.year(SpecificationLimit.equals(2000))
);
var afterY2K = basicQuery(builder ->
builder.subqueries(justInclude("hello", "world"))
builder.query(justInclude("hello", "world"))
.year(SpecificationLimit.greaterThan(2000))
);
@ -296,11 +291,11 @@ public class IndexQueryServiceIntegrationTest {
var domain1 = basicQuery(builder ->
builder.subqueries(justInclude("hello", "world"))
builder.query(justInclude("hello", "world"))
.domains(List.of(1))
);
var domain2 = basicQuery(builder ->
builder.subqueries(justInclude("hello", "world"))
builder.query(justInclude("hello", "world"))
.domains(List.of(2))
);
@ -334,7 +329,7 @@ public class IndexQueryServiceIntegrationTest {
).load();
var query = basicQuery(builder ->
builder.subqueries(includeAndExclude("hello", "my_darling"))
builder.query(includeAndExclude("hello", "my_darling"))
);
executeSearch(query)
@ -403,7 +398,7 @@ public class IndexQueryServiceIntegrationTest {
.load();
var rsp = queryService.justQuery(
basicQuery(builder -> builder.subqueries(
basicQuery(builder -> builder.query(
// note coherence requirement
includeAndCohere("hello", "world")
)));
@ -424,50 +419,53 @@ public class IndexQueryServiceIntegrationTest {
.rank(SpecificationLimit.none())
.rankingParams(ResultRankingParameters.sensibleDefaults())
.domains(new ArrayList<>())
.searchSetIdentifier("NONE")
.subqueries(List.of());
.searchSetIdentifier("NONE");
return mutator.apply(builder).build();
}
List<SearchSubquery> justInclude(String... includes) {
return List.of(new SearchSubquery(
SearchQuery justInclude(String... includes) {
return new SearchQuery(
Strings.join(List.of(includes), ' '),
List.of(includes),
List.of(),
List.of(),
List.of(),
List.of()
));
);
}
List<SearchSubquery> includeAndExclude(List<String> includes, List<String> excludes) {
return List.of(new SearchSubquery(
SearchQuery includeAndExclude(List<String> includes, List<String> excludes) {
return new SearchQuery(
Strings.join(List.of(includes), ' '),
includes,
excludes,
List.of(),
List.of(),
List.of()
));
);
}
List<SearchSubquery> includeAndExclude(String include, String exclude) {
return List.of(new SearchSubquery(
SearchQuery includeAndExclude(String include, String exclude) {
return new SearchQuery(
include,
List.of(include),
List.of(exclude),
List.of(),
List.of(),
List.of()
));
);
}
List<SearchSubquery> includeAndCohere(String... includes) {
return List.of(new SearchSubquery(
SearchQuery includeAndCohere(String... includes) {
return new SearchQuery(
Strings.join(List.of(includes), ' '),
List.of(includes),
List.of(),
List.of(),
List.of(),
List.of(List.of(includes))
));
);
}
private MockDataDocument d(int domainId, int ordinal) {
return new MockDataDocument(domainId, ordinal);

View File

@ -0,0 +1,59 @@
package nu.marginalia.index.index;
import it.unimi.dsi.fastutil.longs.LongArraySet;
import it.unimi.dsi.fastutil.longs.LongSet;
import org.junit.jupiter.api.Test;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import static org.junit.jupiter.api.Assertions.*;
/** Tests for QueryBranchWalker's partitioning of term ids into branch paths. */
class QueryBranchWalkerTest {
    @Test
    public void testNoOverlap() {
        // Two disjoint term sets should produce two independent root walkers
        var walkers = QueryBranchWalker.create(
                new long[] { 1, 2 },
                List.of(set(1), set(2))
        );

        assertEquals(2, walkers.size());
        assertEquals(Set.of(1L, 2L), walkers.stream().map(w -> w.termId).collect(Collectors.toSet()));
    }

    @Test
    public void testCond() {
        // Term 1 is common to both paths, so it becomes the single root
        var walkers = QueryBranchWalker.create(
                new long[] { 1, 2, 3, 4 },
                List.of(set(1, 2, 3), set(1, 4, 3))
        );

        assertEquals(1, walkers.size());
        assertEquals(Set.of(1L), walkers.stream().map(w -> w.termId).collect(Collectors.toSet()));

        System.out.println(Arrays.toString(walkers.getFirst().priorityOrder));
        assertArrayEquals(new long[] { 2, 3, 4 }, walkers.getFirst().priorityOrder);

        // Descending past the shared root splits into a walker per remaining branch
        var children = walkers.getFirst().next();
        assertEquals(2, children.size());
        assertEquals(Set.of(2L, 3L), children.stream().map(w -> w.termId).collect(Collectors.toSet()));

        Map<Long, QueryBranchWalker> byTermId = children.stream().collect(Collectors.toMap(w -> w.termId, w -> w));
        assertArrayEquals(new long[] { 3L }, byTermId.get(2L).priorityOrder);
        assertArrayEquals(new long[] { 4L }, byTermId.get(3L).priorityOrder);
    }

    @Test
    public void testNoOverlapFirst() {
        // Shared prefix term 1 roots both paths; 2 and 3 remain in priority order
        var walkers = QueryBranchWalker.create(
                new long[] { 1, 2, 3 },
                List.of(set(1, 2), set(1, 3))
        );

        assertEquals(1, walkers.size());
        assertArrayEquals(new long[] { 2, 3 }, walkers.getFirst().priorityOrder);
        assertEquals(Set.of(1L), walkers.stream().map(w -> w.termId).collect(Collectors.toSet()));
    }

    /** Convenience factory for a fastutil long set. */
    LongSet set(long... args) {
        return new LongArraySet(args);
    }
}

View File

@ -2,9 +2,10 @@ package nu.marginalia.index.results;
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
import nu.marginalia.model.id.UrlIdCodec;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Test;
import java.util.List;
import static org.junit.jupiter.api.Assertions.*;
class IndexResultDomainDeduplicatorTest {
@ -24,7 +25,7 @@ class IndexResultDomainDeduplicatorTest {
}
SearchResultItem forId(int domain, int ordinal) {
return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), 4);
return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), List.of(), 4, Double.NaN);
}
}

View File

@ -1,5 +1,6 @@
package nu.marginalia.ranking.results;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
@ -35,21 +36,21 @@ class ResultValuatorTest {
);
}
List<SearchResultKeywordScore> titleOnlyLowCountSet = List.of(
new SearchResultKeywordScore(0, "bob",
CompiledQuery<SearchResultKeywordScore> titleOnlyLowCountSet = CompiledQuery.just(
new SearchResultKeywordScore("bob", 1,
wordMetadata(Set.of(1), EnumSet.of(WordFlags.Title)),
docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)),
0)
);
List<SearchResultKeywordScore> highCountNoTitleSet = List.of(
new SearchResultKeywordScore(0, "bob",
CompiledQuery<SearchResultKeywordScore> highCountNoTitleSet = CompiledQuery.just(
new SearchResultKeywordScore("bob", 1,
wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh)),
docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)),
0)
);
List<SearchResultKeywordScore> highCountSubjectSet = List.of(
new SearchResultKeywordScore(0, "bob",
CompiledQuery<SearchResultKeywordScore> highCountSubjectSet = CompiledQuery.just(
new SearchResultKeywordScore("bob", 1,
wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh, WordFlags.Subjects)),
docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)),
0)
@ -75,7 +76,10 @@ class ResultValuatorTest {
System.out.println(highCountSubject);
}
private long docMetadata(int topology, int year, int quality, EnumSet<DocumentFlags> flags) {
private long docMetadata(int topology,
int year,
int quality,
EnumSet<DocumentFlags> flags) {
return new DocumentMetadata(topology, PubDate.toYearByte(year), quality, flags).encode();
}

View File

@ -1,9 +1,10 @@
package nu.marginalia.ranking.results.factors;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
import nu.marginalia.bbpc.BrailleBlockPunchCards;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.ranking.results.ResultKeywordSet;
import org.junit.jupiter.api.Test;
import java.util.ArrayList;
@ -20,7 +21,7 @@ class TermCoherenceFactorTest {
WordMetadata.POSITIONS_MASK, WordMetadata.POSITIONS_MASK
);
long mask = termCoherenceFactor.combinedMask(allPositionsSet);
long mask = CompiledQueryAggregates.longBitmaskAggregate(allPositionsSet, score -> score.positions() & WordMetadata.POSITIONS_MASK);
assertEquals(1.0, termCoherenceFactor.bitsSetFactor(mask), 0.01);
@ -33,7 +34,7 @@ class TermCoherenceFactorTest {
0, 0
);
long mask = termCoherenceFactor.combinedMask(allPositionsSet);
long mask = CompiledQueryAggregates.longBitmaskAggregate(allPositionsSet, score -> score.positions() & WordMetadata.POSITIONS_MASK);
assertEquals(0, termCoherenceFactor.bitsSetFactor(mask), 0.01);
@ -46,7 +47,7 @@ class TermCoherenceFactorTest {
List.of(0, 1, 2, 3), List.of(0, 1, 2, 3)
);
long mask = termCoherenceFactor.combinedMask(positions);
long mask = CompiledQueryAggregates.longBitmaskAggregate(positions, score -> score.positions() & WordMetadata.POSITIONS_MASK);
printMask(mask);
}
@ -57,7 +58,7 @@ class TermCoherenceFactorTest {
List.of(55, 54, 53, 52), List.of(55, 54, 53, 52)
);
long mask = termCoherenceFactor.combinedMask(positions);
long mask = CompiledQueryAggregates.longBitmaskAggregate(positions, score -> score.positions() & WordMetadata.POSITIONS_MASK);
printMask(mask);
}
@ -72,7 +73,7 @@ class TermCoherenceFactorTest {
System.out.println(BrailleBlockPunchCards.printBits(mask, 48));
}
ResultKeywordSet createSet(List<Integer>... maskPositions) {
CompiledQuery<SearchResultKeywordScore> createSet(List<Integer>... maskPositions) {
long[] positions = new long[maskPositions.length];
for (int i = 0; i < maskPositions.length; i++) {
@ -84,14 +85,14 @@ class TermCoherenceFactorTest {
return createSet(positions);
}
ResultKeywordSet createSet(long... positionMasks) {
CompiledQuery<SearchResultKeywordScore> createSet(long... positionMasks) {
List<SearchResultKeywordScore> keywords = new ArrayList<>();
for (int i = 0; i < positionMasks.length; i++) {
keywords.add(new SearchResultKeywordScore(0, "",
keywords.add(new SearchResultKeywordScore("", 0,
new WordMetadata(positionMasks[i], (byte) 0).encode(), 0, 0));
}
return new ResultKeywordSet(keywords);
return CompiledQuery.just(keywords.toArray(SearchResultKeywordScore[]::new));
}
}

View File

@ -1,5 +1,7 @@
package nu.marginalia.array.algo;
import nu.marginalia.array.LongArray;
import java.io.IOException;
import java.nio.LongBuffer;
import java.nio.channels.FileChannel;
@ -61,6 +63,12 @@ public interface LongArrayBase extends BulkTransferArray<LongBuffer> {
}
}
default void get(long start, long end, LongArray buffer, int bufferStart) {
for (int i = 0; i < (end-start); i++) {
buffer.set(i + bufferStart, get(start + i));
}
}
default void get(long start, LongBuffer buffer) {
get(start, start + buffer.remaining(), buffer, buffer.position());
}

View File

@ -1,5 +1,8 @@
package nu.marginalia.array.buffer;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import java.util.Arrays;
/** A buffer for long values that can be used to filter and manipulate the data.
@ -17,7 +20,7 @@ import java.util.Arrays;
public class LongQueryBuffer {
/** Direct access to the data in the buffer,
* guaranteed to be populated until `end` */
public final long[] data;
public final LongArray data;
/** Number of items in the data buffer */
public int end;
@ -25,18 +28,27 @@ public class LongQueryBuffer {
private int read = 0;
private int write = 0;
private LongQueryBuffer(LongArray array, int size) {
this.data = array;
this.end = size;
}
public LongQueryBuffer(int size) {
this.data = new long[size];
this.data = LongArrayFactory.onHeapConfined(size);
this.end = size;
}
public LongQueryBuffer(long[] data, int size) {
this.data = data;
this.data = LongArrayFactory.onHeapConfined(size);
this.data.set(0, data);
this.end = size;
}
public long[] copyData() {
return Arrays.copyOf(data, end);
long[] copy = new long[end];
data.forEach(0, end, (pos, val) -> copy[(int)pos]=val );
return copy;
}
public boolean isEmpty() {
@ -48,7 +60,7 @@ public class LongQueryBuffer {
}
public void reset() {
end = data.length;
end = (int) data.size();
read = 0;
write = 0;
}
@ -59,12 +71,16 @@ public class LongQueryBuffer {
write = 0;
}
public LongQueryBuffer slice(int start, int end) {
return new LongQueryBuffer(data.range(start, end), end - start);
}
/* == Filtering methods == */
/** Returns the current value at the read pointer.
*/
public long currentValue() {
return data[read];
return data.get(read);
}
/** Advances the read pointer and returns true if there are more values to read. */
@ -79,9 +95,9 @@ public class LongQueryBuffer {
*/
public boolean retainAndAdvance() {
if (read != write) {
long tmp = data[write];
data[write] = data[read];
data[read] = tmp;
long tmp = data.get(write);
data.set(write, data.get(read));
data.set(read, tmp);
}
write++;
@ -117,9 +133,10 @@ public class LongQueryBuffer {
write = 0;
}
public void startFilterForRange(int pos, int end) {
read = write = pos;
this.end = end;
public void finalizeFiltering(int pos) {
end = write;
read = pos;
write = pos;
}
/** Retain only unique values in the buffer, and update the end pointer to the new length.
@ -153,7 +170,7 @@ public class LongQueryBuffer {
"read = " + read +
",write = " + write +
",end = " + end +
",data = [" + Arrays.toString(Arrays.copyOf(data, end)) + "]]";
",data = [" + Arrays.toString(copyData()) + "]]";
}

View File

@ -143,7 +143,7 @@ class LongArraySearchTest {
assertEquals(43, buffer.size());
for (int i = 0; i < 43; i++) {
assertEquals(buffer.data[i], i*3);
assertEquals(buffer.data.get(i), i*3);
}
}
@ -160,7 +160,7 @@ class LongArraySearchTest {
int j = 0;
for (int i = 0; i < 43; i++) {
if (++j % 3 == 0) j++;
assertEquals(buffer.data[i], j);
assertEquals(buffer.data.get(i), j);
}
}
}

View File

@ -109,8 +109,8 @@ public class BTreeReader {
return ip.findData(key);
}
public void readData(long[] buf, int n, long pos) {
data.get(pos, pos + n, buf);
public void readData(LongArray buf, int n, long pos) {
data.get(pos, pos + n, buf, 0);
}
/** Used for querying interlaced data in the btree.

View File

@ -32,7 +32,8 @@ public class BTreeReaderRejectRetainWithIndexTest {
@Test
public void testRetain() {
LongQueryBuffer odds = new LongQueryBuffer(50);
Arrays.setAll(odds.data, i -> 2L*i + 1);
for (int i = 0; i < 50; i++)
odds.data.set(i, 2L*i + 1);
BTreeReader reader = new BTreeReader(array, ctx, 0);
reader.retainEntries(odds);
@ -46,7 +47,8 @@ public class BTreeReaderRejectRetainWithIndexTest {
@Test
public void testReject() {
LongQueryBuffer odds = new LongQueryBuffer(50);
Arrays.setAll(odds.data, i -> 2L*i + 1);
for (int i = 0; i < 50; i++)
odds.data.set(i, 2L*i + 1);
BTreeReader reader = new BTreeReader(array, ctx, 0);
reader.rejectEntries(odds);

View File

@ -32,7 +32,8 @@ public class BTreeReaderRejectRetainWithoutIndexTest {
@Test
public void testRetain() {
LongQueryBuffer odds = new LongQueryBuffer(50);
Arrays.setAll(odds.data, i -> 2L*i + 1);
for (int i = 0; i < 50; i++)
odds.data.set(i, 2L*i + 1);
BTreeReader reader = new BTreeReader(array, ctx, 0);
reader.retainEntries(odds);
@ -46,7 +47,9 @@ public class BTreeReaderRejectRetainWithoutIndexTest {
@Test
public void testReject() {
LongQueryBuffer odds = new LongQueryBuffer(50);
Arrays.setAll(odds.data, i -> 2L*i + 1);
for (int i = 0; i < 50; i++)
odds.data.set(i, 2L*i + 1);
BTreeReader reader = new BTreeReader(array, ctx, 0);
reader.rejectEntries(odds);

View File

@ -1,7 +1,7 @@
package nu.marginalia.search;
import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier;
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.index.query.limit.QueryLimits;
import nu.marginalia.index.query.limit.QueryStrategy;
@ -14,7 +14,7 @@ import java.util.List;
public class SearchQueryParamFactory {
public QueryParams forRegularSearch(SearchParameters userParams) {
SearchSubquery prototype = new SearchSubquery();
SearchQuery prototype = new SearchQuery();
var profile = userParams.profile();
profile.addTacitTerms(prototype);

View File

@ -1,6 +1,6 @@
package nu.marginalia.search.command;
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import javax.annotation.Nullable;
import java.util.Arrays;
@ -23,7 +23,7 @@ public enum SearchAdtechParameter {
return DEFAULT;
}
public void addTacitTerms(SearchSubquery subquery) {
public void addTacitTerms(SearchQuery subquery) {
subquery.searchTermsExclude.addAll(Arrays.asList(implictExcludeSearchTerms));
}
}

View File

@ -1,6 +1,6 @@
package nu.marginalia.search.command;
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import javax.annotation.Nullable;
import java.util.Arrays;
@ -25,7 +25,7 @@ public enum SearchJsParameter {
return DEFAULT;
}
public void addTacitTerms(SearchSubquery subquery) {
public void addTacitTerms(SearchQuery subquery) {
subquery.searchTermsExclude.addAll(Arrays.asList(implictExcludeSearchTerms));
}
}

View File

@ -2,7 +2,7 @@ package nu.marginalia.search.model;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier;
import java.util.Objects;
@ -47,7 +47,7 @@ public enum SearchProfile {
return NO_FILTER;
}
public void addTacitTerms(SearchSubquery subquery) {
public void addTacitTerms(SearchQuery subquery) {
if (this == ACADEMIA) {
subquery.searchTermsAdvice.add("special:academia");
}