Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-23 13:09:00 +00:00

(qs, index) New query model integrated with index service.

Seems to work: tests are green, and initial testing finds no errors. Still only lightly tested; committing the WIP as-is because it would suck to lose weeks of work to a drive failure or something.

parent 8cb9455c32
commit a3a6d6292b
@ -30,6 +30,7 @@ dependencies {
    implementation libs.notnull
    implementation libs.guice
    implementation libs.gson
    implementation libs.commons.lang3
    implementation libs.bundles.protobuf
    implementation libs.bundles.grpc
    implementation libs.fastutil
@ -1,7 +1,6 @@
  package nu.marginalia.api.searchquery;

  import nu.marginalia.api.searchquery.*;
- import nu.marginalia.api.searchquery.model.query.SearchSubquery;
+ import nu.marginalia.api.searchquery.model.query.SearchQuery;
  import nu.marginalia.api.searchquery.model.results.Bm25Parameters;
  import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
  import nu.marginalia.index.query.limit.QueryLimits;

@ -45,33 +44,37 @@ public class IndexProtobufCodec {
              .build();
      }

-     public static SearchSubquery convertSearchSubquery(RpcSubquery subquery) {
+     public static SearchQuery convertRpcQuery(RpcQuery query) {
          List<List<String>> coherences = new ArrayList<>();

-         for (int j = 0; j < subquery.getCoherencesCount(); j++) {
-             var coh = subquery.getCoherences(j);
+         for (int j = 0; j < query.getCoherencesCount(); j++) {
+             var coh = query.getCoherences(j);
              coherences.add(new ArrayList<>(coh.getCoherencesList()));
          }

-         return new SearchSubquery(
-                 subquery.getIncludeList(),
-                 subquery.getExcludeList(),
-                 subquery.getAdviceList(),
-                 subquery.getPriorityList(),
+         return new SearchQuery(
+                 query.getCompiledQuery(),
+                 query.getIncludeList(),
+                 query.getExcludeList(),
+                 query.getAdviceList(),
+                 query.getPriorityList(),
                  coherences
          );
      }

-     public static RpcSubquery convertSearchSubquery(SearchSubquery searchSubquery) {
+     public static RpcQuery convertRpcQuery(SearchQuery searchQuery) {
          var subqueryBuilder =
-                 RpcSubquery.newBuilder()
-                         .addAllAdvice(searchSubquery.getSearchTermsAdvice())
-                         .addAllExclude(searchSubquery.getSearchTermsExclude())
-                         .addAllInclude(searchSubquery.getSearchTermsInclude())
-                         .addAllPriority(searchSubquery.getSearchTermsPriority());
-         for (var coherences : searchSubquery.searchTermCoherences) {
+                 RpcQuery.newBuilder()
+                         .setCompiledQuery(searchQuery.compiledQuery)
+                         .addAllInclude(searchQuery.getSearchTermsInclude())
+                         .addAllAdvice(searchQuery.getSearchTermsAdvice())
+                         .addAllExclude(searchQuery.getSearchTermsExclude())
+                         .addAllPriority(searchQuery.getSearchTermsPriority());
+
+         for (var coherences : searchQuery.searchTermCoherences) {
              subqueryBuilder.addCoherencesBuilder().addAllCoherences(coherences);
          }

          return subqueryBuilder.build();
      }
@ -2,7 +2,6 @@ package nu.marginalia.api.searchquery;

  import lombok.SneakyThrows;
  import nu.marginalia.api.searchquery.model.query.SearchSpecification;
- import nu.marginalia.api.searchquery.model.query.SearchSubquery;
  import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
  import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
  import nu.marginalia.api.searchquery.model.results.SearchResultItem;

@ -14,7 +13,6 @@ import nu.marginalia.api.searchquery.model.query.QueryParams;
  import nu.marginalia.api.searchquery.model.query.QueryResponse;

  import java.util.ArrayList;
  import java.util.List;

  public class QueryProtobufCodec {

@ -23,9 +21,7 @@ public class QueryProtobufCodec {

      builder.addAllDomains(request.getDomainIdsList());

-     for (var subquery : query.specs.subqueries) {
-         builder.addSubqueries(IndexProtobufCodec.convertSearchSubquery(subquery));
-     }
+     builder.setQuery(IndexProtobufCodec.convertRpcQuery(query.specs.query));

      builder.setSearchSetIdentifier(query.specs.searchSetIdentifier);
      builder.setHumanQuery(request.getHumanQuery());

@ -51,9 +47,7 @@ public class QueryProtobufCodec {
  public static RpcIndexQuery convertQuery(String humanQuery, ProcessedQuery query) {
      var builder = RpcIndexQuery.newBuilder();

-     for (var subquery : query.specs.subqueries) {
-         builder.addSubqueries(IndexProtobufCodec.convertSearchSubquery(subquery));
-     }
+     builder.setQuery(IndexProtobufCodec.convertRpcQuery(query.specs.query));

      builder.setSearchSetIdentifier(query.specs.searchSetIdentifier);
      builder.setHumanQuery(humanQuery);

@ -147,8 +141,8 @@ public class QueryProtobufCodec {

  private static SearchResultKeywordScore convertKeywordScore(RpcResultKeywordScore keywordScores) {
      return new SearchResultKeywordScore(
-         keywordScores.getSubquery(),
          keywordScores.getKeyword(),
+         -1, // termId is internal to index service
          keywordScores.getEncodedWordMetadata(),
          keywordScores.getEncodedDocMetadata(),
          keywordScores.getHtmlFeatures()

@ -156,14 +150,8 @@ public class QueryProtobufCodec {
  }

  private static SearchSpecification convertSearchSpecification(RpcIndexQuery specs) {
-     List<SearchSubquery> subqueries = new ArrayList<>(specs.getSubqueriesCount());
-
-     for (int i = 0; i < specs.getSubqueriesCount(); i++) {
-         subqueries.add(IndexProtobufCodec.convertSearchSubquery(specs.getSubqueries(i)));
-     }
-
      return new SearchSpecification(
-         subqueries,
+         IndexProtobufCodec.convertRpcQuery(specs.getQuery()),
          specs.getDomainsList(),
          specs.getSearchSetIdentifier(),
          specs.getHumanQuery(),

@ -182,7 +170,6 @@ public class QueryProtobufCodec {
      .addAllDomainIds(params.domainIds())
      .addAllTacitAdvice(params.tacitAdvice())
      .addAllTacitExcludes(params.tacitExcludes())
      .addAllTacitIncludes(params.tacitIncludes())
      .addAllTacitPriority(params.tacitPriority())
      .setHumanQuery(params.humanQuery())
      .setQueryLimits(IndexProtobufCodec.convertQueryLimits(params.limits()))
@ -0,0 +1,76 @@
package nu.marginalia.api.searchquery.model.compiled;

import org.jetbrains.annotations.NotNull;

import java.util.Iterator;
import java.util.function.*;
import java.util.stream.IntStream;
import java.util.stream.Stream;


/** A compiled index service query. The class separates the topology of the query from the data,
 * and it's possible to create new queries supplanting the data */
public class CompiledQuery<T> implements Iterable<T> {

    /** The root expression, conveys the topology of the query */
    public final CqExpression root;

    private final CqData<T> data;

    public CompiledQuery(CqExpression root, CqData<T> data) {
        this.root = root;
        this.data = data;
    }

    public CompiledQuery(CqExpression root, T[] data) {
        this.root = root;
        this.data = new CqData<>(data);
    }

    /** Exists for testing, creates a simple query that ANDs all the provided items */
    public static <T> CompiledQuery<T> just(T... item) {
        return new CompiledQuery<>(new CqExpression.And(
                IntStream.range(0, item.length).mapToObj(CqExpression.Word::new).toList()
        ), item);
    }

    /** Create a new CompiledQuery mapping the leaf nodes using the provided mapper */
    public <T2> CompiledQuery<T2> map(Class<T2> clazz, Function<T, T2> mapper) {
        return new CompiledQuery<>(
                root,
                data.map(clazz, mapper)
        );
    }

    public CompiledQueryLong mapToLong(ToLongFunction<T> mapper) {
        return new CompiledQueryLong(root, data.mapToLong(mapper));
    }

    public CqExpression root() {
        return root;
    }

    public Stream<T> stream() {
        return data.stream();
    }

    public IntStream indices() {
        return IntStream.range(0, data.size());
    }

    public T at(int index) {
        return data.get(index);
    }

    @NotNull
    @Override
    public Iterator<T> iterator() {
        return stream().iterator();
    }

    public int size() {
        return data.size();
    }

}
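A minimal usage sketch (illustrative only, not part of this commit; values are hypothetical):

    // Build a query that ANDs two words, then re-map its data while keeping the topology.
    CompiledQuery<String> words = CompiledQuery.just("foo", "bar");

    // Same CqExpression tree, new leaf data: map each String to its length.
    CompiledQuery<Integer> lengths = words.map(Integer.class, String::length);

    // Long-specialized view, e.g. for term ids; hashCode stands in for a real id here.
    CompiledQueryLong ids = words.mapToLong(String::hashCode);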
@ -0,0 +1,42 @@
package nu.marginalia.api.searchquery.model.compiled;

import org.jetbrains.annotations.NotNull;

import java.util.Iterator;
import java.util.stream.IntStream;
import java.util.stream.LongStream;


/** A compiled index service query */
public class CompiledQueryLong implements Iterable<Long> {
    private final CqExpression root;
    private final CqDataLong data;

    public CompiledQueryLong(CqExpression root, CqDataLong data) {
        this.root = root;
        this.data = data;
    }


    public CqExpression root() {
        return root;
    }

    public LongStream stream() {
        return data.stream();
    }

    public IntStream indices() {
        return IntStream.range(0, data.size());
    }

    public long at(int index) {
        return data.get(index);
    }

    @NotNull
    @Override
    public Iterator<Long> iterator() {
        return stream().iterator();
    }
}
@ -0,0 +1,113 @@
package nu.marginalia.api.searchquery.model.compiled;

import org.apache.commons.lang3.StringUtils;

import java.util.*;

/** Parser for a compiled index query */
public class CompiledQueryParser {

    public static CompiledQuery<String> parse(String query) {
        List<String> parts = tokenize(query);

        if (parts.isEmpty()) {
            return new CompiledQuery<>(
                    CqExpression.empty(),
                    new CqData<>(new String[0])
            );
        }

        // We aren't interested in a binary tree representation, but an n-ary tree one,
        // so a somewhat unusual parsing technique is used to avoid having an additional
        // flattening step at the end.

        // This is only possible due to the trivial and unambiguous grammar of the compiled queries

        List<AndOrState> parenState = new ArrayList<>();
        parenState.add(new AndOrState());

        Map<String, Integer> wordIds = new HashMap<>();

        for (var part : parts) {
            var head = parenState.getLast();

            if (part.equals("|")) {
                head.or();
            }
            else if (part.equals("(")) {
                parenState.addLast(new AndOrState());
            }
            else if (part.equals(")")) {
                if (parenState.size() < 2) {
                    throw new IllegalStateException("Mismatched parentheses in expression: " + query);
                }
                parenState.removeLast();
                parenState.getLast().and(head.closeOr());
            }
            else {
                head.and(
                        new CqExpression.Word(
                                wordIds.computeIfAbsent(part, p -> wordIds.size())
                        )
                );
            }
        }

        if (parenState.size() != 1)
            throw new IllegalStateException("Mismatched parentheses in expression: " + query);

        // Construct the CompiledQuery object with String:s as leaves
        var root = parenState.getLast().closeOr();

        String[] cqData = new String[wordIds.size()];
        wordIds.forEach((w, i) -> cqData[i] = w);
        return new CompiledQuery<>(root, new CqData<>(cqData));
    }

    private static class AndOrState {
        private List<CqExpression> andState = new ArrayList<>();
        private List<CqExpression> orState = new ArrayList<>();

        /** Add a new item to the and-list */
        public void and(CqExpression e) {
            andState.add(e);
        }

        /** Turn the and-list into an expression on the or-list, and then start a new and-list */
        public void or() {
            closeAnd();

            andState = new ArrayList<>();
        }

        /** Turn the and-list into an And-expression in the or-list */
        private void closeAnd() {
            if (andState.size() == 1)
                orState.add(andState.getFirst());
            else if (!andState.isEmpty())
                orState.add(new CqExpression.And(andState));
        }

        /** Finalize the current and-list, then turn the or-list into an Or-expression */
        public CqExpression closeOr() {
            closeAnd();

            if (orState.isEmpty())
                return CqExpression.empty();
            if (orState.size() == 1)
                return orState.getFirst();

            return new CqExpression.Or(orState);
        }
    }

    private static List<String> tokenize(String query) {
        // Each token is guaranteed to be separated by one or more space characters

        return Arrays.stream(StringUtils.split(query, ' '))
                .filter(StringUtils::isNotBlank)
                .toList();
    }

}
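A worked example of the grammar (a sketch, not from the commit): tokens are space-separated, "|" separates or-branches, parentheses group, adjacency means AND, and word indices are assigned in order of first appearance:

    CompiledQuery<String> q = CompiledQueryParser.parse("foo ( bar | baz ) quux");
    System.out.println(q.root());  // prints And[ 0, Or[ 1, 2], 3]
    System.out.println(q.at(1));   // prints bar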
@ -0,0 +1,51 @@
package nu.marginalia.api.searchquery.model.compiled;

import java.lang.reflect.Array;
import java.util.Arrays;
import java.util.function.Function;
import java.util.function.ToDoubleFunction;
import java.util.function.ToLongFunction;
import java.util.stream.Stream;

public class CqData<T> {
    private final T[] data;

    public CqData(T[] data) {
        this.data = data;
    }

    @SuppressWarnings("unchecked")
    public <T2> CqData<T2> map(Class<T2> clazz, Function<T, T2> mapper) {
        T2[] newData = (T2[]) Array.newInstance(clazz, data.length);
        for (int i = 0; i < data.length; i++) {
            newData[i] = mapper.apply(data[i]);
        }

        return new CqData<>(newData);
    }

    public CqDataLong mapToLong(ToLongFunction<T> mapper) {
        long[] newData = new long[data.length];
        for (int i = 0; i < data.length; i++) {
            newData[i] = mapper.applyAsLong(data[i]);
        }

        return new CqDataLong(newData);
    }

    public T get(int i) {
        return data[i];
    }

    public T get(CqExpression.Word w) {
        return data[w.idx()];
    }

    public Stream<T> stream() {
        return Arrays.stream(data);
    }

    public int size() {
        return data.length;
    }
}
@ -0,0 +1,27 @@
package nu.marginalia.api.searchquery.model.compiled;

import java.util.Arrays;
import java.util.stream.LongStream;

public class CqDataLong {
    private final long[] data;

    public CqDataLong(long[] data) {
        this.data = data;
    }

    public long get(int i) {
        return data[i];
    }

    public long get(CqExpression.Word w) {
        return data[w.idx()];
    }

    public LongStream stream() {
        return Arrays.stream(data);
    }

    public int size() {
        return data.length;
    }
}
@ -0,0 +1,170 @@
package nu.marginalia.api.searchquery.model.compiled;

import java.util.List;
import java.util.StringJoiner;
import java.util.stream.Stream;

/** Expression in a parsed index service query */
public sealed interface CqExpression {

    Stream<Word> stream();

    /** @see nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates */
    long visit(LongVisitor visitor);
    /** @see nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates */
    double visit(DoubleVisitor visitor);
    /** @see nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates */
    int visit(IntVisitor visitor);
    /** @see nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates */
    boolean visit(BoolVisitor visitor);

    <T> T visit(ObjectVisitor<T> visitor);

    static CqExpression empty() {
        return new Or(List.of());
    }


    record And(List<? extends CqExpression> parts) implements CqExpression {
        @Override
        public Stream<Word> stream() {
            return parts.stream().flatMap(CqExpression::stream);
        }

        @Override
        public long visit(LongVisitor visitor) {
            return visitor.onAnd(parts);
        }

        @Override
        public double visit(DoubleVisitor visitor) {
            return visitor.onAnd(parts);
        }

        @Override
        public int visit(IntVisitor visitor) {
            return visitor.onAnd(parts);
        }

        @Override
        public boolean visit(BoolVisitor visitor) {
            return visitor.onAnd(parts);
        }

        @Override
        public <T> T visit(ObjectVisitor<T> visitor) { return visitor.onAnd(parts); }

        public String toString() {
            StringJoiner sj = new StringJoiner(", ", "And[ ", "]");
            parts.forEach(part -> sj.add(part.toString()));
            return sj.toString();
        }
    }

    record Or(List<? extends CqExpression> parts) implements CqExpression {
        @Override
        public Stream<Word> stream() {
            return parts.stream().flatMap(CqExpression::stream);
        }

        @Override
        public long visit(LongVisitor visitor) {
            return visitor.onOr(parts);
        }

        @Override
        public double visit(DoubleVisitor visitor) {
            return visitor.onOr(parts);
        }

        @Override
        public int visit(IntVisitor visitor) {
            return visitor.onOr(parts);
        }

        @Override
        public boolean visit(BoolVisitor visitor) {
            return visitor.onOr(parts);
        }

        @Override
        public <T> T visit(ObjectVisitor<T> visitor) { return visitor.onOr(parts); }

        public String toString() {
            StringJoiner sj = new StringJoiner(", ", "Or[ ", "]");
            parts.forEach(part -> sj.add(part.toString()));
            return sj.toString();
        }
    }

    record Word(int idx) implements CqExpression {
        @Override
        public Stream<Word> stream() {
            return Stream.of(this);
        }

        @Override
        public long visit(LongVisitor visitor) {
            return visitor.onLeaf(idx);
        }

        @Override
        public double visit(DoubleVisitor visitor) {
            return visitor.onLeaf(idx);
        }

        @Override
        public int visit(IntVisitor visitor) {
            return visitor.onLeaf(idx);
        }

        @Override
        public boolean visit(BoolVisitor visitor) {
            return visitor.onLeaf(idx);
        }

        @Override
        public <T> T visit(ObjectVisitor<T> visitor) { return visitor.onLeaf(idx); }

        @Override
        public String toString() {
            return Integer.toString(idx);
        }
    }

    interface LongVisitor {
        long onAnd(List<? extends CqExpression> parts);
        long onOr(List<? extends CqExpression> parts);
        long onLeaf(int idx);
    }

    interface IntVisitor {
        int onAnd(List<? extends CqExpression> parts);
        int onOr(List<? extends CqExpression> parts);
        int onLeaf(int idx);
    }

    interface BoolVisitor {
        boolean onAnd(List<? extends CqExpression> parts);
        boolean onOr(List<? extends CqExpression> parts);
        boolean onLeaf(int idx);
    }

    interface DoubleVisitor {
        double onAnd(List<? extends CqExpression> parts);
        double onOr(List<? extends CqExpression> parts);
        double onLeaf(int idx);
    }

    interface ObjectVisitor<T> {
        T onAnd(List<? extends CqExpression> parts);
        T onOr(List<? extends CqExpression> parts);
        T onLeaf(int idx);
    }

}
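A small sketch (not from the commit) of how the visitor interfaces compose, here counting the leaves of an expression:

    // Hypothetical leaf-counting visitor over a parsed expression.
    CqExpression expr = CompiledQueryParser.parse("a ( b | c )").root;
    int leaves = expr.visit(new CqExpression.IntVisitor() {
        public int onAnd(List<? extends CqExpression> parts) {
            return parts.stream().mapToInt(p -> p.visit(this)).sum();
        }
        public int onOr(List<? extends CqExpression> parts) {
            return parts.stream().mapToInt(p -> p.visit(this)).sum();
        }
        public int onLeaf(int idx) { return 1; }
    }); // leaves == 3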
@ -0,0 +1,46 @@
package nu.marginalia.api.searchquery.model.compiled.aggregate;

import it.unimi.dsi.fastutil.longs.LongSet;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;

import java.util.ArrayList;
import java.util.List;
import java.util.function.*;

public class CompiledQueryAggregates {
    /** Compiled query aggregate for a single boolean that treats or-branches as logical OR,
     * and and-branches as logical AND operations. Will return true if there exists a path through
     * the query where the provided predicate returns true for each item.
     */
    static public <T> boolean booleanAggregate(CompiledQuery<T> query, Predicate<T> predicate) {
        return query.root.visit(new CqBooleanAggregate(query, predicate));
    }


    /** Compiled query aggregate for a 64b bitmask that treats or-branches as logical OR,
     * and and-branches as logical AND operations.
     */
    public static <T> long longBitmaskAggregate(CompiledQuery<T> query, ToLongFunction<T> operator) {
        return query.root.visit(new CqLongBitmaskOperator(query, operator));
    }


    /** Apply the operator to each leaf node, then return the highest minimum value found along any path */
    public static <T> int intMaxMinAggregate(CompiledQuery<T> query, ToIntFunction<T> operator) {
        return query.root.visit(new CqIntMaxMinOperator(query, operator));
    }

    /** Apply the operator to each leaf node, then return the highest sum of values possible
     * through each branch in the compiled query.
     */
    public static <T> double doubleSumAggregate(CompiledQuery<T> query, ToDoubleFunction<T> operator) {
        return query.root.visit(new CqDoubleSumOperator(query, operator));
    }

    /** Enumerate all possible paths through the compiled query */
    public static List<LongSet> queriesAggregate(CompiledQueryLong query) {
        return new ArrayList<>(query.root().visit(new CqQueryPathsOperator(query)));
    }
}
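Illustrative only: combining the parser with the aggregates. Sums run along and-branches and the maximum is taken across or-branches, so the best path through the query wins:

    var q = CompiledQueryParser.parse("1 ( 5 3 | 2 10 )");
    // 1 + max(5 + 3, 2 + 10) = 13
    double sum = CompiledQueryAggregates.doubleSumAggregate(q, Double::parseDouble);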
@ -0,0 +1,40 @@
package nu.marginalia.api.searchquery.model.compiled.aggregate;

import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CqExpression;

import java.util.List;
import java.util.function.IntPredicate;
import java.util.function.Predicate;

public class CqBooleanAggregate implements CqExpression.BoolVisitor {

    private final IntPredicate predicate;

    public <T> CqBooleanAggregate(CompiledQuery<T> query, Predicate<T> objPred) {
        this.predicate = idx -> objPred.test(query.at(idx));
    }

    @Override
    public boolean onAnd(List<? extends CqExpression> parts) {
        for (var part : parts) {
            if (!part.visit(this)) // short-circuit
                return false;
        }
        return true;
    }

    @Override
    public boolean onOr(List<? extends CqExpression> parts) {
        for (var part : parts) {
            if (part.visit(this)) // short-circuit
                return true;
        }
        return false;
    }

    @Override
    public boolean onLeaf(int idx) {
        return predicate.test(idx);
    }
}
@ -0,0 +1,40 @@
package nu.marginalia.api.searchquery.model.compiled.aggregate;

import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CqExpression;

import java.util.List;
import java.util.function.IntToDoubleFunction;
import java.util.function.ToDoubleFunction;

public class CqDoubleSumOperator implements CqExpression.DoubleVisitor {

    private final IntToDoubleFunction operator;

    public <T> CqDoubleSumOperator(CompiledQuery<T> query, ToDoubleFunction<T> operator) {
        this.operator = idx -> operator.applyAsDouble(query.at(idx));
    }

    @Override
    public double onAnd(List<? extends CqExpression> parts) {
        double value = 0;
        for (var part : parts) {
            value += part.visit(this);
        }
        return value;
    }

    @Override
    public double onOr(List<? extends CqExpression> parts) {
        double value = parts.getFirst().visit(this);
        for (int i = 1; i < parts.size(); i++) {
            value = Math.max(value, parts.get(i).visit(this));
        }
        return value;
    }

    @Override
    public double onLeaf(int idx) {
        return operator.applyAsDouble(idx);
    }
}
@ -0,0 +1,41 @@
package nu.marginalia.api.searchquery.model.compiled.aggregate;

import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CqExpression;

import java.util.List;
import java.util.function.IntUnaryOperator;
import java.util.function.ToIntFunction;

public class CqIntMaxMinOperator implements CqExpression.IntVisitor {

    private final IntUnaryOperator operator;


    public <T> CqIntMaxMinOperator(CompiledQuery<T> query, ToIntFunction<T> operator) {
        this.operator = idx -> operator.applyAsInt(query.at(idx));
    }

    @Override
    public int onAnd(List<? extends CqExpression> parts) {
        int value = parts.getFirst().visit(this);
        for (int i = 1; i < parts.size(); i++) {
            value = Math.min(value, parts.get(i).visit(this));
        }
        return value;
    }

    @Override
    public int onOr(List<? extends CqExpression> parts) {
        int value = parts.getFirst().visit(this);
        for (int i = 1; i < parts.size(); i++) {
            value = Math.max(value, parts.get(i).visit(this));
        }
        return value;
    }

    @Override
    public int onLeaf(int idx) {
        return operator.applyAsInt(idx);
    }
}
@ -0,0 +1,40 @@
package nu.marginalia.api.searchquery.model.compiled.aggregate;

import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CqExpression;

import java.util.List;
import java.util.function.IntToLongFunction;
import java.util.function.ToLongFunction;

public class CqLongBitmaskOperator implements CqExpression.LongVisitor {

    private final IntToLongFunction operator;

    public <T> CqLongBitmaskOperator(CompiledQuery<T> query, ToLongFunction<T> operator) {
        this.operator = idx -> operator.applyAsLong(query.at(idx));
    }

    @Override
    public long onAnd(List<? extends CqExpression> parts) {
        long value = ~0L;
        for (var part : parts) {
            value &= part.visit(this);
        }
        return value;
    }

    @Override
    public long onOr(List<? extends CqExpression> parts) {
        long value = 0L;
        for (var part : parts) {
            value |= part.visit(this);
        }
        return value;
    }

    @Override
    public long onLeaf(int idx) {
        return operator.applyAsLong(idx);
    }
}
@ -0,0 +1,75 @@
package nu.marginalia.api.searchquery.model.compiled.aggregate;

import it.unimi.dsi.fastutil.longs.LongArraySet;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import it.unimi.dsi.fastutil.longs.LongSet;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.CqExpression;

import java.util.ArrayList;
import java.util.List;

public class CqQueryPathsOperator implements CqExpression.ObjectVisitor<List<LongSet>> {
    private final CompiledQueryLong query;

    public CqQueryPathsOperator(CompiledQueryLong query) {
        this.query = query;
    }

    @Override
    public List<LongSet> onAnd(List<? extends CqExpression> parts) {
        return parts.stream()
                .map(expr -> expr.visit(this))
                .reduce(List.of(), this::combineAnd);
    }

    private List<LongSet> combineAnd(List<LongSet> a, List<LongSet> b) {
        // No-op cases
        if (a.isEmpty())
            return b;
        if (b.isEmpty())
            return a;

        // Simple cases
        if (a.size() == 1) {
            b.forEach(set -> set.addAll(a.getFirst()));
            return b;
        }
        else if (b.size() == 1) {
            a.forEach(set -> set.addAll(b.getFirst()));
            return a;
        }

        // Case where we AND two ORs
        List<LongSet> ret = new ArrayList<>();

        for (var aPart : a) {
            for (var bPart : b) {
                LongSet set = new LongOpenHashSet(aPart.size() + bPart.size());
                set.addAll(aPart);
                set.addAll(bPart);
                ret.add(set);
            }
        }

        return ret;
    }

    @Override
    public List<LongSet> onOr(List<? extends CqExpression> parts) {
        List<LongSet> ret = new ArrayList<>();

        for (var part : parts) {
            ret.addAll(part.visit(this));
        }

        return ret;
    }

    @Override
    public List<LongSet> onLeaf(int idx) {
        var set = new LongArraySet(1);
        set.add(query.at(idx));
        return List.of(set);
    }
}
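As a worked example (a sketch, not from the commit): for a query shaped like a ( b | c ), the operator yields the two term-id sets {a, b} and {a, c}; combineAnd is the cartesian-product step between or-branches.

    CompiledQueryLong ids = CompiledQueryParser.parse("a ( b | c )")
            .mapToLong(String::hashCode);  // hashCode stands in for real term ids here
    List<LongSet> paths = CompiledQueryAggregates.queriesAggregate(ids);
    // paths holds two sets: {id(a), id(b)} and {id(a), id(c)}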
@ -13,10 +13,6 @@ public record QueryResponse(SearchSpecification specs,
                              String domain)
  {
      public Set<String> getAllKeywords() {
-         Set<String> keywords = new HashSet<>(100);
-         for (var sq : specs.subqueries) {
-             keywords.addAll(sq.searchTermsInclude);
-         }
-         return keywords;
+         return new HashSet<>(specs.query.searchTermsInclude);
      }
  }
@ -13,9 +13,12 @@ import java.util.stream.Collectors;
  @AllArgsConstructor
  @With
  @EqualsAndHashCode
- public class SearchSubquery {
+ public class SearchQuery {

-     /** These terms must be present in the document and are used in ranking*/
+     /** An infix style expression that encodes the required terms in the query */
+     public final String compiledQuery;
+
+     /** All terms that appear in {@see compiledQuery} */
      public final List<String> searchTermsInclude;

      /** These terms must be absent from the document */

@ -33,7 +36,8 @@ public class SearchSubquery {
      @Deprecated // why does this exist?
      private double value = 0;

-     public SearchSubquery() {
+     public SearchQuery() {
+         this.compiledQuery = "";
          this.searchTermsInclude = new ArrayList<>();
          this.searchTermsExclude = new ArrayList<>();
          this.searchTermsAdvice = new ArrayList<>();

@ -41,11 +45,13 @@ public class SearchSubquery {
          this.searchTermCoherences = new ArrayList<>();
      }

-     public SearchSubquery(List<String> searchTermsInclude,
-                           List<String> searchTermsExclude,
-                           List<String> searchTermsAdvice,
-                           List<String> searchTermsPriority,
-                           List<List<String>> searchTermCoherences) {
+     public SearchQuery(String compiledQuery,
+                        List<String> searchTermsInclude,
+                        List<String> searchTermsExclude,
+                        List<String> searchTermsAdvice,
+                        List<String> searchTermsPriority,
+                        List<List<String>> searchTermCoherences) {
+         this.compiledQuery = compiledQuery;
          this.searchTermsInclude = searchTermsInclude;
          this.searchTermsExclude = searchTermsExclude;
          this.searchTermsAdvice = searchTermsAdvice;

@ -54,7 +60,7 @@ public class SearchSubquery {
      }

      @Deprecated // why does this exist?
-     public SearchSubquery setValue(double value) {
+     public SearchQuery setValue(double value) {
          if (Double.isInfinite(value) || Double.isNaN(value)) {
              this.value = Double.MAX_VALUE;
          } else {

@ -66,7 +72,7 @@ public class SearchSubquery {
      @Override
      public String toString() {
          StringBuilder sb = new StringBuilder();
-         if (!searchTermsInclude.isEmpty()) sb.append("include=").append(searchTermsInclude.stream().collect(Collectors.joining(",", "[", "] ")));
+         if (!compiledQuery.isEmpty()) sb.append("compiledQuery=").append(compiledQuery);
          if (!searchTermsExclude.isEmpty()) sb.append("exclude=").append(searchTermsExclude.stream().collect(Collectors.joining(",", "[", "] ")));
          if (!searchTermsAdvice.isEmpty()) sb.append("advice=").append(searchTermsAdvice.stream().collect(Collectors.joining(",", "[", "] ")));
          if (!searchTermsPriority.isEmpty()) sb.append("priority=").append(searchTermsPriority.stream().collect(Collectors.joining(",", "[", "] ")));
@ -10,7 +10,7 @@ import java.util.List;

  @ToString @Getter @Builder @With @AllArgsConstructor
  public class SearchSpecification {
-     public List<SearchSubquery> subqueries;
+     public SearchQuery query;

      /** If present and not empty, limit the search to these domain IDs */
      public List<Integer> domains;
@ -21,9 +21,9 @@ public class SearchResultItem implements Comparable<SearchResultItem> {
      /** How many other potential results existed in the same domain */
      public int resultsFromDomain;

-     public SearchResultItem(long combinedId, int scoresCount) {
+     public SearchResultItem(long combinedId) {
          this.combinedId = combinedId;
-         this.keywordScores = new ArrayList<>(scoresCount);
+         this.keywordScores = new ArrayList<>();
      }

@ -7,19 +7,22 @@ import nu.marginalia.model.idx.DocumentMetadata;
  import java.util.Objects;

  public final class SearchResultKeywordScore {
+     @Deprecated
      public final int subquery;
+     public final long termId;
      public final String keyword;
      private final long encodedWordMetadata;
      private final long encodedDocMetadata;

      private final int htmlFeatures;

-     public SearchResultKeywordScore(int subquery,
-                                     String keyword,
+     public SearchResultKeywordScore(String keyword,
+                                     long termId,
                                      long encodedWordMetadata,
                                      long encodedDocMetadata,
                                      int htmlFeatures) {
-         this.subquery = subquery;
+         this.termId = termId;
+         this.subquery = -1; // FIXME, deprecated
          this.keyword = keyword;
          this.encodedWordMetadata = encodedWordMetadata;
          this.encodedDocMetadata = encodedDocMetadata;
@ -52,7 +52,7 @@ message RpcTemporalBias {

  /* Index service query request */
  message RpcIndexQuery {
-     repeated RpcSubquery subqueries = 1;
+     RpcQuery query = 1;
      repeated int32 domains = 2; // (optional) A list of domain IDs to consider
      string searchSetIdentifier = 3; // (optional) A named set of domains to consider
      string humanQuery = 4; // The search query as the user entered it

@ -102,12 +102,11 @@ message RpcRawResultItem {

  /* Information about how well a keyword matches a query */
  message RpcResultKeywordScore {
-     int32 subquery = 1; // index of the subquery this keyword relates to
-     string keyword = 2; // the keyword
-     int64 encodedWordMetadata = 3; // bit encoded word metadata
-     int64 encodedDocMetadata = 4; // bit encoded document metadata
-     bool hasPriorityTerms = 5; // true if this word is important to the document
-     int32 htmlFeatures = 6; // bit encoded document features
+     string keyword = 1; // the keyword
+     int64 encodedWordMetadata = 2; // bit encoded word metadata
+     int64 encodedDocMetadata = 3; // bit encoded document metadata
+     bool hasPriorityTerms = 4; // true if this word is important to the document
+     int32 htmlFeatures = 5; // bit encoded document features
  }

  /* Query execution parameters */

@ -137,12 +136,13 @@ message RpcResultRankingParameters {
  }

  /* Defines a single subquery */
- message RpcSubquery {
+ message RpcQuery {
      repeated string include = 1; // These terms must be present
      repeated string exclude = 2; // These terms must be absent
      repeated string advice = 3; // These terms must be present, but do not affect ranking
      repeated string priority = 4; // These terms are not mandatory, but affect ranking positively if they are present
      repeated RpcCoherences coherences = 5; // Groups of terms that must exist in proximity of each other
+     string compiledQuery = 6; // Compiled query in infix notation
  }

  /* Defines a group of search terms that must exist in close proximity within the document */
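For reference, a hedged sketch of populating the new message from Java via the generated protobuf builder (field values here are hypothetical):

    RpcQuery rpc = RpcQuery.newBuilder()
            .setCompiledQuery("foo ( bar | baz )")
            .addAllInclude(List.of("foo", "bar", "baz"))
            .addExclude("spam")
            .build();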
@ -0,0 +1,79 @@
package nu.marginalia.api.searchquery.model.compiled;

import org.junit.jupiter.api.Test;

import java.util.List;

import static org.junit.jupiter.api.Assertions.*;

class CompiledQueryParserTest {

    @Test
    public void testEmpty() {
        assertEquals(CqExpression.empty(), CompiledQueryParser.parse("").root);
        assertEquals(CqExpression.empty(), CompiledQueryParser.parse("( )").root);
        assertEquals(CqExpression.empty(), CompiledQueryParser.parse("( | )").root);
        assertEquals(CqExpression.empty(), CompiledQueryParser.parse("| ( | ) |").root);
    }

    @Test
    public void testSingleWord() {
        CompiledQuery<String> q = CompiledQueryParser.parse("foo");
        assertEquals(w(q, "foo"), q.root);
    }

    @Test
    public void testAndTwoWords() {
        CompiledQuery<String> q = CompiledQueryParser.parse("foo bar");
        assertEquals(and(w(q, "foo"), w(q, "bar")), q.root);
    }

    @Test
    public void testOrTwoWords() {
        CompiledQuery<String> q = CompiledQueryParser.parse("foo | bar");
        assertEquals(or(w(q, "foo"), w(q, "bar")), q.root);
    }

    @Test
    public void testOrAndWords() {
        CompiledQuery<String> q = CompiledQueryParser.parse("foo | bar baz");
        assertEquals(or(w(q, "foo"), and(w(q, "bar"), w(q, "baz"))), q.root);
    }

    @Test
    public void testAndAndOrAndAndWords() {
        CompiledQuery<String> q = CompiledQueryParser.parse("foo foobar | bar baz");
        assertEquals(or(
                        and(w(q, "foo"), w(q, "foobar")),
                        and(w(q, "bar"), w(q, "baz")))
                , q.root);
    }

    @Test
    public void testComplex1() {
        CompiledQuery<String> q = CompiledQueryParser.parse("foo ( bar | baz ) quux");
        assertEquals(and(w(q, "foo"), or(w(q, "bar"), w(q, "baz")), w(q, "quux")), q.root);
    }

    @Test
    public void testComplex2() {
        CompiledQuery<String> q = CompiledQueryParser.parse("( ( ( a ) b ) c ) d");
        assertEquals(and(and(and(w(q, "a"), w(q, "b")), w(q, "c")), w(q, "d")), q.root);
    }

    @Test
    public void testNested() {
        CompiledQuery<String> q = CompiledQueryParser.parse("( ( ( a ) ) )");
        assertEquals(w(q, "a"), q.root);
    }

    private CqExpression.Word w(CompiledQuery<String> query, String word) {
        return new CqExpression.Word(query.indices().filter(idx -> word.equals(query.at(idx))).findAny().orElseThrow());
    }

    private CqExpression and(CqExpression... parts) {
        return new CqExpression.And(List.of(parts));
    }

    private CqExpression or(CqExpression... parts) {
        return new CqExpression.Or(List.of(parts));
    }
}
@ -0,0 +1,35 @@
package nu.marginalia.api.searchquery.model.compiled.aggregate;

import static nu.marginalia.api.searchquery.model.compiled.CompiledQueryParser.parse;
import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.*;

import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.*;

class CompiledQueryAggregatesTest {

    @Test
    void booleanAggregates() {
        assertFalse(booleanAggregate(parse("false"), Boolean::parseBoolean));
        assertTrue(booleanAggregate(parse("true"), Boolean::parseBoolean));
        assertFalse(booleanAggregate(parse("false true"), Boolean::parseBoolean));
        assertTrue(booleanAggregate(parse("( true ) | ( true false )"), Boolean::parseBoolean));
        assertTrue(booleanAggregate(parse("( false ) | ( true )"), Boolean::parseBoolean));
        assertTrue(booleanAggregate(parse("( true false ) | ( true true )"), Boolean::parseBoolean));
        assertFalse(booleanAggregate(parse("( true false ) | ( true false )"), Boolean::parseBoolean));
    }

    @Test
    void intMaxMinAggregates() {
        assertEquals(5, intMaxMinAggregate(parse("5"), Integer::parseInt));
        assertEquals(3, intMaxMinAggregate(parse("5 3"), Integer::parseInt));
        assertEquals(6, intMaxMinAggregate(parse("5 3 | 6 7"), Integer::parseInt));
    }

    @Test
    void doubleSumAggregates() {
        assertEquals(5, (int) doubleSumAggregate(parse("5"), Double::parseDouble));
        assertEquals(8, (int) doubleSumAggregate(parse("5 3"), Double::parseDouble));
        assertEquals(13, (int) doubleSumAggregate(parse("1 ( 5 3 | 2 10 )"), Double::parseDouble));
    }
}
@ -1,7 +1,7 @@
  package nu.marginalia.index.client;

  import nu.marginalia.api.searchquery.IndexProtobufCodec;
- import nu.marginalia.api.searchquery.model.query.SearchSubquery;
+ import nu.marginalia.api.searchquery.model.query.SearchQuery;
  import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
  import nu.marginalia.index.query.limit.QueryLimits;
  import nu.marginalia.index.query.limit.SpecificationLimit;

@ -35,14 +35,15 @@ class IndexProtobufCodecTest {
  }
  @Test
  public void testSubqery() {
-     verifyIsIdentityTransformation(new SearchSubquery(
+     verifyIsIdentityTransformation(new SearchQuery(
+             "qs",
              List.of("a", "b"),
              List.of("c", "d"),
              List.of("e", "f"),
              List.of("g", "h"),
              List.of(List.of("i", "j"), List.of("k"))
          ),
-         s -> IndexProtobufCodec.convertSearchSubquery(IndexProtobufCodec.convertSearchSubquery(s))
+         s -> IndexProtobufCodec.convertRpcQuery(IndexProtobufCodec.convertRpcQuery(s))
      );
  }
  private <T> void verifyIsIdentityTransformation(T val, Function<T,T> transformation) {
@ -2,18 +2,16 @@ package nu.marginalia.functions.searchquery.svc;

  import com.google.inject.Inject;
  import com.google.inject.Singleton;
  import nu.marginalia.LanguageModels;
  import nu.marginalia.api.searchquery.model.query.SearchSpecification;
- import nu.marginalia.api.searchquery.model.query.SearchSubquery;
+ import nu.marginalia.api.searchquery.model.query.SearchQuery;
  import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
- import nu.marginalia.util.language.EnglishDictionary;
+ import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
  import nu.marginalia.language.WordPatterns;
  import nu.marginalia.api.searchquery.model.query.QueryParams;
  import nu.marginalia.api.searchquery.model.query.ProcessedQuery;
  import nu.marginalia.functions.searchquery.query_parser.QueryParser;
  import nu.marginalia.functions.searchquery.query_parser.token.Token;
  import nu.marginalia.functions.searchquery.query_parser.token.TokenType;
  import nu.marginalia.term_frequency_dict.TermFrequencyDict;
  import org.slf4j.Logger;
  import org.slf4j.LoggerFactory;

@ -26,15 +24,14 @@ import java.util.List;
  public class QueryFactory {
      private final Logger logger = LoggerFactory.getLogger(getClass());

-     private static final int RETAIN_QUERY_VARIANT_COUNT = 5;
      private final QueryParser queryParser = new QueryParser();
+     private final QueryExpansion queryExpansion;


      @Inject
-     public QueryFactory(LanguageModels lm,
-                         TermFrequencyDict dict,
-                         EnglishDictionary englishDictionary)
+     public QueryFactory(QueryExpansion queryExpansion)
      {
+         this.queryExpansion = queryExpansion;
      }

@ -49,8 +46,6 @@ public class QueryFactory {
      List<String> searchTermsHuman = new ArrayList<>();
      List<String> problems = new ArrayList<>();

-     String domain = null;

      List<Token> basicQuery = queryParser.parse(query);

      if (basicQuery.size() >= 12) {

@ -74,19 +69,8 @@ public class QueryFactory {
          t.visit(qualityLimits);
      }

-     // var queryPermutations = queryPermutation.permuteQueriesNew(basicQuery);
-     List<SearchSubquery> subqueries = new ArrayList<>();
      QuerySearchTermsAccumulator termsAccumulator = new QuerySearchTermsAccumulator(basicQuery);
-     domain = termsAccumulator.domain;
-
-     // for (var parts : queryPermutations) {
-     //     QuerySearchTermsAccumulator termsAccumulator = new QuerySearchTermsAccumulator(basicQuery);
-     //
-     //     domain = termsAccumulator.domain;
-     //
-     //     SearchSubquery subquery = termsAccumulator.createSubquery();
-     //     subqueries.add(subquery);
-     // }
+     String domain = termsAccumulator.domain;

      List<Integer> domainIds = params.domainIds();

@ -97,7 +81,18 @@ public class QueryFactory {
      }

      var specsBuilder = SearchSpecification.builder()
-             .subqueries(subqueries)
+             .query(
+                     new SearchQuery(
+                             queryExpansion.expandQuery(
+                                     termsAccumulator.searchTermsInclude
+                             ),
+                             termsAccumulator.searchTermsInclude,
+                             termsAccumulator.searchTermsExclude,
+                             termsAccumulator.searchTermsAdvice,
+                             termsAccumulator.searchTermsPriority,
+                             termsAccumulator.searchTermCoherences
+                     )
+             )
              .humanQuery(query)
              .quality(qualityLimits.qualityLimit)
              .year(qualityLimits.year)

@ -111,12 +106,9 @@ public class QueryFactory {

      SearchSpecification specs = specsBuilder.build();

-     for (var sq : specs.subqueries) {
-         sq.searchTermsAdvice.addAll(params.tacitAdvice());
-         sq.searchTermsPriority.addAll(params.tacitPriority());
-         sq.searchTermsInclude.addAll(params.tacitIncludes());
-         sq.searchTermsExclude.addAll(params.tacitExcludes());
-     }
+     specs.query.searchTermsAdvice.addAll(params.tacitAdvice());
+     specs.query.searchTermsPriority.addAll(params.tacitPriority());
+     specs.query.searchTermsExclude.addAll(params.tacitExcludes());

      return new ProcessedQuery(specs, searchTermsHuman, domain);
  }
@ -1,6 +1,6 @@
  package nu.marginalia.functions.searchquery.svc;

- import nu.marginalia.api.searchquery.model.query.SearchSubquery;
+ import nu.marginalia.api.searchquery.model.query.SearchQuery;
  import nu.marginalia.language.WordPatterns;
  import nu.marginalia.functions.searchquery.query_parser.token.Token;
  import nu.marginalia.functions.searchquery.query_parser.token.TokenVisitor;

@ -9,7 +9,7 @@ import java.util.ArrayList;
  import java.util.Arrays;
  import java.util.List;

- /** @see SearchSubquery */
+ /** @see SearchQuery */
  public class QuerySearchTermsAccumulator implements TokenVisitor {
      public List<String> searchTermsExclude = new ArrayList<>();
      public List<String> searchTermsInclude = new ArrayList<>();

@ -19,10 +19,6 @@ public class QuerySearchTermsAccumulator implements TokenVisitor {

      public String domain;

-     public SearchSubquery createSubquery() {
-         return new SearchSubquery(searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority, searchTermCoherences);
-     }
-
      public QuerySearchTermsAccumulator(List<Token> parts) {
          for (Token t : parts) {
              t.visit(this);
@ -3,12 +3,13 @@ package nu.marginalia.query.svc;
  import nu.marginalia.WmsaHome;
  import nu.marginalia.api.searchquery.model.query.SearchSpecification;
  import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
+ import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
  import nu.marginalia.functions.searchquery.svc.QueryFactory;
  import nu.marginalia.index.query.limit.QueryLimits;
  import nu.marginalia.index.query.limit.QueryStrategy;
  import nu.marginalia.index.query.limit.SpecificationLimit;
  import nu.marginalia.index.query.limit.SpecificationLimitType;
- import nu.marginalia.util.language.EnglishDictionary;
+ import nu.marginalia.segmentation.NgramLexicon;
  import nu.marginalia.api.searchquery.model.query.QueryParams;
  import nu.marginalia.term_frequency_dict.TermFrequencyDict;
  import org.junit.jupiter.api.BeforeAll;

@ -27,11 +28,9 @@ public class QueryFactoryTest {
  public static void setUpAll() throws IOException {

      var lm = WmsaHome.getLanguageModels();
-     var tfd = new TermFrequencyDict(lm);

-     queryFactory = new QueryFactory(lm,
-             tfd,
-             new EnglishDictionary(tfd)
+     queryFactory = new QueryFactory(
+             new QueryExpansion(new TermFrequencyDict(lm), new NgramLexicon(lm))
      );
  }

@ -112,17 +111,15 @@ public class QueryFactoryTest {
  {
      // the is a stopword, so it should generate an ngram search term
      var specs = parseAndGetSpecs("\"the shining\"");
-     assertEquals(List.of("the_shining"), specs.subqueries.iterator().next().searchTermsInclude);
-     assertEquals(List.of(), specs.subqueries.iterator().next().searchTermsAdvice);
-     assertEquals(List.of(), specs.subqueries.iterator().next().searchTermCoherences);
+     assertEquals("the_shining", specs.query.compiledQuery);
  }

  {
      // tde isn't a stopword, so we should get the normal behavior
      var specs = parseAndGetSpecs("\"tde shining\"");
-     assertEquals(List.of("tde", "shining"), specs.subqueries.iterator().next().searchTermsInclude);
-     assertEquals(List.of("tde_shining"), specs.subqueries.iterator().next().searchTermsAdvice);
-     assertEquals(List.of(List.of("tde", "shining")), specs.subqueries.iterator().next().searchTermCoherences);
+     assertEquals("tde shining", specs.query.compiledQuery);
+     assertEquals(List.of("tde_shining"), specs.query.searchTermsAdvice);
+     assertEquals(List.of(List.of("tde", "shining")), specs.query.searchTermCoherences);
  }
}

@ -150,8 +147,18 @@ public class QueryFactoryTest {

  @Test
  public void testPriorityTerm() {
-     var subquery = parseAndGetSpecs("physics ?tld:edu").subqueries.iterator().next();
+     var subquery = parseAndGetSpecs("physics ?tld:edu").query;
      assertEquals(List.of("tld:edu"), subquery.searchTermsPriority);
      assertEquals(List.of("physics"), subquery.searchTermsInclude);
+     assertEquals("physics", subquery.compiledQuery);
  }
+
+ @Test
+ public void testExpansion() {
+
+     long start = System.currentTimeMillis();
+     var subquery = parseAndGetSpecs("elden ring mechanical keyboard slackware linux duke nukem 3d").query;
+     System.out.println("Time: " + (System.currentTimeMillis() - start));
+     System.out.println(subquery.compiledQuery);
+
+ }
}
@ -46,7 +46,7 @@ public class ReverseIndexEntrySource implements EntrySource {
          return;

      for (int ri = entrySize, wi=1; ri < buffer.end ; ri+=entrySize, wi++) {
-         buffer.data[wi] = buffer.data[ri];
+         buffer.data.set(wi, buffer.data.get(ri));
      }

      buffer.end /= entrySize;
@ -9,14 +9,14 @@ import io.prometheus.client.Histogram;
  import it.unimi.dsi.fastutil.longs.LongArrayList;
  import lombok.SneakyThrows;
  import nu.marginalia.api.searchquery.*;
+ import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
+ import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
  import nu.marginalia.api.searchquery.model.query.SearchSpecification;
- import nu.marginalia.api.searchquery.model.query.SearchSubquery;
  import nu.marginalia.api.searchquery.model.results.*;
  import nu.marginalia.array.buffer.LongQueryBuffer;
  import nu.marginalia.index.index.StatefulIndex;
  import nu.marginalia.index.model.SearchParameters;
  import nu.marginalia.index.model.SearchTerms;
  import nu.marginalia.index.model.SearchTermsUtil;
  import nu.marginalia.index.query.IndexQuery;
  import nu.marginalia.index.query.IndexSearchBudget;
  import nu.marginalia.index.results.IndexResultValuatorService;

@ -143,7 +141,6 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
          .setEncodedWordMetadata(score.encodedWordMetadata())
          .setKeyword(score.keyword)
          .setHtmlFeatures(score.htmlFeatures())
-         .setSubquery(score.subquery)
  );
}

@ -203,7 +202,9 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
      return new SearchResultSet(List.of());
  }

- ResultRankingContext rankingContext = createRankingContext(params.rankingParams, params.subqueries);
+ ResultRankingContext rankingContext = createRankingContext(params.rankingParams,
+         params.compiledQuery,
+         params.compiledQueryIds);

  var queryExecution = new QueryExecution(rankingContext, params.fetchSize);

@ -255,14 +256,10 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
  /** Execute a search query */
  public SearchResultSet run(SearchParameters parameters) throws SQLException, InterruptedException {

-     for (var subquery : parameters.subqueries) {
-         var terms = new SearchTerms(subquery);
-         if (terms.isEmpty())
-             continue;
+     var terms = new SearchTerms(parameters.query, parameters.compiledQueryIds);

-         for (var indexQuery : index.createQueries(terms, parameters.queryParams)) {
-             workerPool.execute(new IndexLookup(indexQuery, parameters.budget));
-         }
+     for (var indexQuery : index.createQueries(terms, parameters.queryParams)) {
+         workerPool.execute(new IndexLookup(indexQuery, parameters.budget));
      }

      for (int i = 0; i < indexValuationThreads; i++) {

@ -327,7 +324,9 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
  buffer.reset();
  query.getMoreResults(buffer);

- results.addElements(0, buffer.data, 0, buffer.end);
+ for (int i = 0; i < buffer.end; i++) {
+     results.add(buffer.data.get(i));
+ }

  if (results.size() < 512) {
      enqueueResults(new CombinedDocIdList(results));

@ -413,8 +412,13 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {

  }

- private ResultRankingContext createRankingContext(ResultRankingParameters rankingParams, List<SearchSubquery> subqueries) {
-     final var termToId = SearchTermsUtil.getAllIncludeTerms(subqueries);
+ private ResultRankingContext createRankingContext(ResultRankingParameters rankingParams,
+                                                   CompiledQuery<String> query,
+                                                   CompiledQueryLong compiledQueryIds)
+ {
+     Map<String, Long> termToId = new HashMap<>(query.size());
+     query.indices().forEach(id -> termToId.put(query.at(id), compiledQueryIds.at(id)));

      final Map<String, Integer> termFrequencies = new HashMap<>(termToId.size());
      final Map<String, Integer> prioFrequencies = new HashMap<>(termToId.size());
@ -38,6 +38,13 @@ public class CombinedIndexReader {
      return new IndexQueryBuilderImpl(reverseIndexFullReader, reverseIndexPriorityReader, query);
  }

+ public QueryFilterStepIf hasWordFull(long termId) {
+     return reverseIndexFullReader.also(termId);
+ }
+
+ public QueryFilterStepIf hasWordPrio(long termId) {
+     return reverseIndexPriorityReader.also(termId);
+ }

  /** Creates a query builder for terms in the priority index */
  public IndexQueryBuilder findPriorityWord(long wordId) {
@ -1,9 +1,11 @@
package nu.marginalia.index.index;

import java.util.List;
import gnu.trove.set.hash.TLongHashSet;
import nu.marginalia.index.ReverseIndexReader;
import nu.marginalia.index.query.IndexQuery;
import nu.marginalia.index.query.IndexQueryBuilder;
import nu.marginalia.index.query.filter.QueryFilterAnyOf;
import nu.marginalia.index.query.filter.QueryFilterStepIf;

public class IndexQueryBuilderImpl implements IndexQueryBuilder {

@ -66,6 +68,20 @@ public class IndexQueryBuilderImpl implements IndexQueryBuilder {
        return this;
    }

    public IndexQueryBuilder addInclusionFilterAny(List<QueryFilterStepIf> filterSteps) {
        if (filterSteps.isEmpty())
            return this;

        if (filterSteps.size() == 1) {
            query.addInclusionFilter(filterSteps.getFirst());
        }
        else {
            query.addInclusionFilter(new QueryFilterAnyOf(filterSteps));
        }

        return this;
    }

    public IndexQuery build() {
        return query;
    }
@ -0,0 +1,78 @@
package nu.marginalia.index.index;

import it.unimi.dsi.fastutil.longs.LongArrayList;
import it.unimi.dsi.fastutil.longs.LongSet;

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

class QueryBranchWalker {
    public final long[] priorityOrder;
    public final List<LongSet> paths;
    public final long termId;

    private QueryBranchWalker(long[] priorityOrder, List<LongSet> paths, long termId) {
        this.priorityOrder = priorityOrder;
        this.paths = paths;
        this.termId = termId;
    }

    public boolean atEnd() {
        return priorityOrder.length == 0;
    }

    public static List<QueryBranchWalker> create(long[] priorityOrder, List<LongSet> paths) {
        List<QueryBranchWalker> ret = new ArrayList<>();
        List<LongSet> remainingPaths = new LinkedList<>(paths);

        remainingPaths.removeIf(LongSet::isEmpty);

        for (int i = 0; i < priorityOrder.length; i++) {
            long prio = priorityOrder[i];

            var it = remainingPaths.iterator();
            List<LongSet> pathsForPrio = new ArrayList<>();

            while (it.hasNext()) {
                var path = it.next();

                if (path.contains(prio)) {
                    path.remove(prio);
                    pathsForPrio.add(path);
                    it.remove();
                }
            }

            if (!pathsForPrio.isEmpty()) {
                LongArrayList remainingPrios = new LongArrayList(pathsForPrio.size());

                for (var p : priorityOrder) {
                    for (var path : pathsForPrio) {
                        if (path.contains(p)) {
                            remainingPrios.add(p);
                            break;
                        }
                    }
                }

                ret.add(new QueryBranchWalker(remainingPrios.elements(), pathsForPrio, prio));
            }
        }

        if (!remainingPaths.isEmpty()) {
            System.out.println("Dropping: " + remainingPaths);
        }

        return ret;
    }

    public List<QueryBranchWalker> next() {
        if (atEnd())
            return List.of();

        return create(priorityOrder, paths);
    }

}
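A sketch of how QueryBranchWalker partitions paths, mirroring the testCond case in QueryBranchWalkerTest further down in this commit:

    long[] priorityOrder = { 1, 2, 3, 4 };
    List<LongSet> paths = List.of(
            new LongArraySet(new long[] { 1, 2, 3 }),
            new LongArraySet(new long[] { 1, 4, 3 }));

    // Both paths contain term 1, so create() yields a single walker with termId == 1;
    // walker.next() then splits the remainder into walkers for terms 2 and 3.
    List<QueryBranchWalker> walkers = QueryBranchWalker.create(priorityOrder, paths);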
@ -2,6 +2,13 @@ package nu.marginalia.index.index;

import com.google.inject.Inject;
import com.google.inject.Singleton;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import it.unimi.dsi.fastutil.longs.LongSet;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
import nu.marginalia.index.query.filter.QueryFilterAllOf;
import nu.marginalia.index.query.filter.QueryFilterAnyOf;
import nu.marginalia.index.query.filter.QueryFilterStepIf;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.index.results.model.ids.DocMetadataList;
import nu.marginalia.index.model.QueryParams;

@ -14,12 +21,13 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.*;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.function.LongFunction;
import java.util.function.Predicate;
import java.util.stream.Collectors;

/** This class delegates SearchIndexReader and deals with the stateful nature of the index,
 * i.e. it may be possible to reconstruct the index and load a new set of data.

@ -105,6 +113,61 @@ public class StatefulIndex {
        return combinedIndexReader != null && combinedIndexReader.isLoaded();
    }

    private Predicate<LongSet> containsOnly(long[] permitted) {
        LongSet permittedTerms = new LongOpenHashSet(permitted);
        return permittedTerms::containsAll;
    }

    private List<IndexQueryBuilder> createBuilders(CompiledQueryLong query,
                                                   LongFunction<IndexQueryBuilder> builderFactory,
                                                   long[] termPriority) {
        List<LongSet> paths = CompiledQueryAggregates.queriesAggregate(query);

        // Remove any path that contains a term not in the priority list, as such a term
        // is missing from the index and the path can never be satisfied
        paths.removeIf(containsOnly(termPriority).negate());

        List<QueryBranchWalker> helpers = QueryBranchWalker.create(termPriority, paths);
        List<IndexQueryBuilder> builders = new ArrayList<>();

        for (var helper : helpers) {
            var builder = builderFactory.apply(helper.termId);

            builders.add(builder);

            if (helper.atEnd())
                continue;

            var filters = helper.next().stream()
                    .map(this::createFilter)
                    .toList();

            builder.addInclusionFilterAny(filters);
        }

        return builders;
    }
    private QueryFilterStepIf createFilter(QueryBranchWalker helper) {
        var selfCondition = combinedIndexReader.hasWordFull(helper.termId);
        if (helper.atEnd())
            return selfCondition;

        var nextSteps = helper.next();
        var nextFilters = nextSteps.stream()
                .map(this::createFilter)
                .map(filter -> new QueryFilterAllOf(List.of(selfCondition, filter)))
                .collect(Collectors.toList());

        if (nextFilters.isEmpty())
            return selfCondition;

        if (nextFilters.size() == 1)
            return nextFilters.getFirst();

        return new QueryFilterAnyOf(nextFilters);
    }

    public List<IndexQuery> createQueries(SearchTerms terms, QueryParams params) {
@ -117,40 +180,13 @@ public class StatefulIndex {
        final long[] orderedIncludesPrio = terms.sortedDistinctIncludes(this::compareKeywordsPrio);

        List<IndexQueryBuilder> queryHeads = new ArrayList<>(10);

        queryHeads.addAll(createBuilders(terms.compiledQuery(), combinedIndexReader::findFullWord, orderedIncludes));
        queryHeads.addAll(createBuilders(terms.compiledQuery(), combinedIndexReader::findPriorityWord, orderedIncludesPrio));

        List<IndexQuery> queries = new ArrayList<>(10);

        // To ensure that good results are discovered, create separate query heads for the
        // priority index that filter for documents containing pairs of the search terms
        if (orderedIncludesPrio.length > 1) {
            for (int i = 0; i + 1 < orderedIncludesPrio.length; i++) {
                for (int j = i + 1; j < orderedIncludesPrio.length; j++) {
                    var entrySource = combinedIndexReader
                            .findPriorityWord(orderedIncludesPrio[i])
                            .alsoPrio(orderedIncludesPrio[j]);
                    queryHeads.add(entrySource);
                }
            }
        }

        // Next consider entries that appear only once in the priority index
        for (var wordId : orderedIncludesPrio) {
            queryHeads.add(combinedIndexReader.findPriorityWord(wordId));
        }

        // Finally consider terms in the full index
        queryHeads.add(combinedIndexReader.findFullWord(orderedIncludes[0]));

        for (var query : queryHeads) {
            if (query == null) {
                return Collections.emptyList();
            }

            // Note that we can add all includes as filters, even though
            // they may not be present in the query head, as the query builder
            // will ignore redundant include filters:
            for (long orderedInclude : orderedIncludes) {
                query = query.alsoFull(orderedInclude);
            }

            for (long term : terms.excludes()) {
                query = query.notFull(term);

@ -161,6 +197,7 @@ public class StatefulIndex {
            queries.add(query.addInclusionFilter(combinedIndexReader.filterForParams(params)).build());
        }

        return queries;
    }
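The net effect of createBuilders and createFilter above: each walker's head term becomes an entry source, and the branches below it become an or-of-ands inclusion filter. A hedged illustration, for a compiled query whose paths are {a, b} and {a, c}:

    // QueryBranchWalker.create([a, b, c], [{a,b}, {a,c}]) roots a single walker at a,
    // and the resulting builder is effectively:
    //
    //   findFullWord(a)
    //       .addInclusionFilterAny(List.of(hasWordFull(b), hasWordFull(c)))
    //
    // i.e. a document must contain a, and at least one of b or c.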
@ -2,16 +2,16 @@ package nu.marginalia.index.model;

import nu.marginalia.api.searchquery.IndexProtobufCodec;
import nu.marginalia.api.searchquery.RpcIndexQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryParser;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.index.query.IndexSearchBudget;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.searchset.SearchSet;

import java.util.ArrayList;
import java.util.List;

import static nu.marginalia.api.searchquery.IndexProtobufCodec.convertSpecLimit;

public class SearchParameters {

@ -21,13 +21,16 @@ public class SearchParameters {
     */
    public final int fetchSize;
    public final IndexSearchBudget budget;
    public final List<SearchSubquery> subqueries;
    public final SearchQuery query;
    public final QueryParams queryParams;
    public final ResultRankingParameters rankingParams;

    public final int limitByDomain;
    public final int limitTotal;

    public final CompiledQuery<String> compiledQuery;
    public final CompiledQueryLong compiledQueryIds;

    // mutable:

    /**

@ -40,7 +43,7 @@ public class SearchParameters {

        this.fetchSize = limits.fetchSize();
        this.budget = new IndexSearchBudget(limits.timeoutMs());
        this.subqueries = specsSet.subqueries;
        this.query = specsSet.query;
        this.limitByDomain = limits.resultsByDomain();
        this.limitTotal = limits.resultsTotal();

@ -52,6 +55,9 @@ public class SearchParameters {
                searchSet,
                specsSet.queryStrategy);

        compiledQuery = CompiledQueryParser.parse(this.query.compiledQuery);
        compiledQueryIds = compiledQuery.mapToLong(SearchTermsUtil::getWordId);

        rankingParams = specsSet.rankingParams;
    }

@ -63,11 +69,8 @@ public class SearchParameters {
        // The time budget is halved because this is the point when we start to
        // wrap up the search and return the results.
        this.budget = new IndexSearchBudget(limits.timeoutMs() / 2);
        this.query = IndexProtobufCodec.convertRpcQuery(request.getQuery());

        this.subqueries = new ArrayList<>(request.getSubqueriesCount());
        for (int i = 0; i < request.getSubqueriesCount(); i++) {
            this.subqueries.add(IndexProtobufCodec.convertSearchSubquery(request.getSubqueries(i)));
        }
        this.limitByDomain = limits.resultsByDomain();
        this.limitTotal = limits.resultsTotal();

@ -79,9 +82,13 @@ public class SearchParameters {
                searchSet,
                QueryStrategy.valueOf(request.getQueryStrategy()));

        compiledQuery = CompiledQueryParser.parse(this.query.compiledQuery);
        compiledQueryIds = compiledQuery.mapToLong(SearchTermsUtil::getWordId);

        rankingParams = IndexProtobufCodec.convertRankingParameterss(request.getParameters());
    }

    public long getDataCost() {
        return dataCost;
    }
@ -4,7 +4,8 @@ import it.unimi.dsi.fastutil.longs.LongArrayList;
import it.unimi.dsi.fastutil.longs.LongComparator;
import it.unimi.dsi.fastutil.longs.LongList;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.query.SearchQuery;

import java.util.ArrayList;
import java.util.List;

@ -18,34 +19,39 @@ public final class SearchTerms {
    private final LongList priority;
    private final List<LongList> coherences;

    private final CompiledQueryLong compiledQueryIds;

    public SearchTerms(
            LongList includes,
            LongList excludes,
            LongList priority,
            List<LongList> coherences
            List<LongList> coherences,
            CompiledQueryLong compiledQueryIds
    ) {
        this.includes = includes;
        this.excludes = excludes;
        this.priority = priority;
        this.coherences = coherences;
        this.compiledQueryIds = compiledQueryIds;
    }

    public SearchTerms(SearchSubquery subquery) {
    public SearchTerms(SearchQuery query, CompiledQueryLong compiledQueryIds) {
        this(new LongArrayList(),
             new LongArrayList(),
             new LongArrayList(),
             new ArrayList<>());
             new ArrayList<>(),
             compiledQueryIds);

        for (var word : subquery.searchTermsInclude) {
        for (var word : query.searchTermsInclude) {
            includes.add(getWordId(word));
        }
        for (var word : subquery.searchTermsAdvice) {
        for (var word : query.searchTermsAdvice) {
            // This looks like a bug, but it's not
            includes.add(getWordId(word));
        }

        for (var coherence : subquery.searchTermCoherences) {
        for (var coherence : query.searchTermCoherences) {
            LongList parts = new LongArrayList(coherence.size());

            for (var word : coherence) {

@ -55,10 +61,10 @@ public final class SearchTerms {
            coherences.add(parts);
        }

        for (var word : subquery.searchTermsExclude) {
        for (var word : query.searchTermsExclude) {
            excludes.add(getWordId(word));
        }
        for (var word : subquery.searchTermsPriority) {
        for (var word : query.searchTermsPriority) {
            priority.add(getWordId(word));
        }
    }

@ -96,6 +102,8 @@ public final class SearchTerms {
        return coherences;
    }

    public CompiledQueryLong compiledQuery() { return compiledQueryIds; }

    @Override
    public boolean equals(Object obj) {
        if (obj == this) return true;
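On the "looks like a bug" comment above: advice terms are folded into the includes list, presumably because they are required to be present in a matching document even though they are not meant to contribute to ranking. The queryMissingAdvice case in IndexQueryServiceIntegrationTest below exercises this: a query whose advice term is absent from the index returns zero results.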
@ -1,29 +1,9 @@
package nu.marginalia.index.model;

import nu.marginalia.api.searchquery.model.query.SearchSubquery;
import nu.marginalia.hash.MurmurHash3_128;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class SearchTermsUtil {

    /** Extract all include-terms from the specified subqueries,
     * and return a map of the terms and their termIds.
     */
    public static Map<String, Long> getAllIncludeTerms(List<SearchSubquery> subqueries) {
        Map<String, Long> ret = new HashMap<>();

        for (var subquery : subqueries) {
            for (var include : subquery.searchTermsInclude) {
                ret.computeIfAbsent(include, i -> getWordId(include));
            }
        }

        return ret;
    }

    private static final MurmurHash3_128 hasher = new MurmurHash3_128();

    /** Translate the word to a unique id. */
@ -4,7 +4,8 @@ import com.google.inject.Inject;
import gnu.trove.map.hash.TObjectLongHashMap;
import it.unimi.dsi.fastutil.longs.Long2ObjectArrayMap;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.model.SearchTermsUtil;
import nu.marginalia.index.results.model.QuerySearchTerms;

@ -13,9 +14,6 @@ import nu.marginalia.index.results.model.TermMetadataForCombinedDocumentIds;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.index.results.model.ids.TermIdList;

import java.util.ArrayList;
import java.util.List;

import static nu.marginalia.index.results.model.TermCoherenceGroupList.TermCoherenceGroup;
import static nu.marginalia.index.results.model.TermMetadataForCombinedDocumentIds.DocumentsWithMetadata;

@ -42,43 +40,24 @@ public class IndexMetadataService {
        return new TermMetadataForCombinedDocumentIds(termdocToMeta);
    }

    public QuerySearchTerms getSearchTerms(List<SearchSubquery> searchTermVariants) {
    public QuerySearchTerms getSearchTerms(CompiledQuery<String> compiledQuery, SearchQuery searchQuery) {

        LongArrayList termIdsList = new LongArrayList();

        TObjectLongHashMap<String> termToId = new TObjectLongHashMap<>(10, 0.75f, -1);

        for (var subquery : searchTermVariants) {
            for (var term : subquery.searchTermsInclude) {
                if (termToId.containsKey(term)) {
                    continue;
                }

                long id = SearchTermsUtil.getWordId(term);
                termIdsList.add(id);
                termToId.put(term, id);
            }
        for (String word : compiledQuery) {
            long id = SearchTermsUtil.getWordId(word);
            termIdsList.add(id);
            termToId.put(word, id);
        }

        return new QuerySearchTerms(termToId,
                new TermIdList(termIdsList),
                getTermCoherences(searchTermVariants));
    }

    private TermCoherenceGroupList getTermCoherences(List<SearchSubquery> searchTermVariants) {
        List<TermCoherenceGroup> coherences = new ArrayList<>();

        for (var subquery : searchTermVariants) {
            for (var coh : subquery.searchTermCoherences) {
                coherences.add(new TermCoherenceGroup(coh));
            }

            // It's assumed each subquery has identical coherences
            break;
        }

        return new TermCoherenceGroupList(coherences);
                new TermCoherenceGroupList(
                        searchQuery.searchTermCoherences.stream().map(TermCoherenceGroup::new).toList()
                )
        );
    }

}
@ -1,10 +1,13 @@
package nu.marginalia.index.results;

import nu.marginalia.api.searchquery.model.query.SearchSubquery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.model.SearchParameters;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.index.model.QueryParams;
import nu.marginalia.index.results.model.QuerySearchTerms;

@ -23,7 +26,6 @@ import java.util.List;
 * reasons to cache this data, and performs the calculations */
public class IndexResultValuationContext {
    private final StatefulIndex statefulIndex;
    private final List<List<String>> searchTermVariants;
    private final QueryParams queryParams;

    private final TermMetadataForCombinedDocumentIds termMetadataForCombinedDocumentIds;

@ -31,23 +33,26 @@ public class IndexResultValuationContext {

    private final ResultRankingContext rankingContext;
    private final ResultValuator searchResultValuator;
    private final CompiledQuery<String> compiledQuery;
    private final CompiledQueryLong compiledQueryIds;

    public IndexResultValuationContext(IndexMetadataService metadataService,
                                       ResultValuator searchResultValuator,
                                       CombinedDocIdList ids,
                                       StatefulIndex statefulIndex,
                                       ResultRankingContext rankingContext,
                                       List<SearchSubquery> subqueries,
                                       QueryParams queryParams
                                       SearchParameters params
    ) {
        this.statefulIndex = statefulIndex;
        this.rankingContext = rankingContext;
        this.searchResultValuator = searchResultValuator;

        this.searchTermVariants = subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList();
        this.queryParams = queryParams;
        this.queryParams = params.queryParams;
        this.compiledQuery = params.compiledQuery;
        this.compiledQueryIds = params.compiledQueryIds;

        this.searchTerms = metadataService.getSearchTerms(params.compiledQuery, params.query);

        this.searchTerms = metadataService.getSearchTerms(subqueries);
        this.termMetadataForCombinedDocumentIds = metadataService.getTermMetadataForDocuments(ids, searchTerms.termIdsAll);
    }

@ -65,68 +70,39 @@ public class IndexResultValuationContext {
        long docMetadata = statefulIndex.getDocumentMetadata(docId);
        int htmlFeatures = statefulIndex.getHtmlFeatures(docId);

        int maxFlagsCount = 0;
        boolean anyAllSynthetic = false;
        int maxPositionsSet = 0;
        SearchResultItem searchResult = new SearchResultItem(docId);

        SearchResultItem searchResult = new SearchResultItem(docId,
                searchTermVariants.stream().mapToInt(List::size).sum());
        SearchResultKeywordScore[] scores = compiledQuery.indices().mapToObj(idx ->
                new SearchResultKeywordScore(
                        compiledQuery.at(idx),
                        compiledQueryIds.at(idx),
                        termMetadataForCombinedDocumentIds.getTermMetadata(
                                compiledQueryIds.at(idx), combinedId
                        ),
                        docMetadata,
                        htmlFeatures)
                )
                .toArray(SearchResultKeywordScore[]::new);

        for (int querySetId = 0;
             querySetId < searchTermVariants.size();
             querySetId++)
        {
            var termList = searchTermVariants.get(querySetId);
        // DANGER: IndexResultValuatorService assumes that searchResult.keywordScores has this specific order, as it needs
        // to be able to re-construct its own CompiledQuery<SearchResultKeywordScore> for re-ranking the results. This is
        // a very flimsy assumption.
        searchResult.keywordScores.addAll(List.of(scores));

            SearchResultKeywordScore[] termScoresForSet = new SearchResultKeywordScore[termList.size()];
        CompiledQuery<SearchResultKeywordScore> queryGraphScores = new CompiledQuery<>(compiledQuery.root, scores);

            boolean synthetic = true;
        boolean allSynthetic = !CompiledQueryAggregates.booleanAggregate(queryGraphScores, score -> !score.hasTermFlag(WordFlags.Synthetic));
        int flagsCount = CompiledQueryAggregates.intMaxMinAggregate(queryGraphScores, score -> Long.bitCount(score.encodedWordMetadata() & flagsFilterMask));
        int positionsCount = CompiledQueryAggregates.intMaxMinAggregate(queryGraphScores, SearchResultKeywordScore::positionCount);

            for (int termIdx = 0; termIdx < termList.size(); termIdx++) {
                String searchTerm = termList.get(termIdx);

                long termMetadata = termMetadataForCombinedDocumentIds.getTermMetadata(
                        searchTerms.getIdForTerm(searchTerm),
                        combinedId
                );

                var score = new SearchResultKeywordScore(
                        querySetId,
                        searchTerm,
                        termMetadata,
                        docMetadata,
                        htmlFeatures
                );

                synthetic &= WordFlags.Synthetic.isPresent(termMetadata);

                searchResult.keywordScores.add(score);

                termScoresForSet[termIdx] = score;
            }

            if (!meetsQueryStrategyRequirements(termScoresForSet, queryParams.queryStrategy())) {
                continue;
            }

            int minFlagsCount = 8;
            int minPositionsSet = 4;

            for (var termScore : termScoresForSet) {
                final int flagCount = Long.bitCount(termScore.encodedWordMetadata() & flagsFilterMask);
                minFlagsCount = Math.min(minFlagsCount, flagCount);
                minPositionsSet = Math.min(minPositionsSet, termScore.positionCount());
            }

            maxFlagsCount = Math.max(maxFlagsCount, minFlagsCount);
            maxPositionsSet = Math.max(maxPositionsSet, minPositionsSet);
            anyAllSynthetic |= synthetic;
        if (!meetsQueryStrategyRequirements(queryGraphScores, queryParams.queryStrategy())) {
            return null;
        }

        if (maxFlagsCount == 0 && !anyAllSynthetic && maxPositionsSet == 0)
        if (flagsCount == 0 && !allSynthetic && positionsCount == 0)
            return null;

        double score = searchResultValuator.calculateSearchResultValue(searchResult.keywordScores,
        double score = searchResultValuator.calculateSearchResultValue(queryGraphScores,
                5000, // use a dummy value here as it's not present in the index
                rankingContext);

@ -135,20 +111,17 @@ public class IndexResultValuationContext {
        return searchResult;
    }

    private boolean meetsQueryStrategyRequirements(SearchResultKeywordScore[] termSet, QueryStrategy queryStrategy) {
    private boolean meetsQueryStrategyRequirements(CompiledQuery<SearchResultKeywordScore> queryGraphScores,
                                                   QueryStrategy queryStrategy)
    {
        if (queryStrategy == QueryStrategy.AUTO ||
            queryStrategy == QueryStrategy.SENTENCE ||
            queryStrategy == QueryStrategy.TOPIC) {
            return true;
        }

        for (var keyword : termSet) {
            if (!meetsQueryStrategyRequirements(keyword, queryParams.queryStrategy())) {
                return false;
            }
        }

        return true;
        return CompiledQueryAggregates.booleanAggregate(queryGraphScores,
                docs -> meetsQueryStrategyRequirements(docs, queryParams.queryStrategy()));
    }

    private boolean meetsQueryStrategyRequirements(SearchResultKeywordScore termScore, QueryStrategy queryStrategy) {
@ -4,10 +4,11 @@ import com.google.inject.Inject;
import com.google.inject.Singleton;
import gnu.trove.list.TLongList;
import gnu.trove.list.array.TLongArrayList;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.model.SearchParameters;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;

@ -19,8 +20,6 @@ import org.slf4j.LoggerFactory;

import java.sql.SQLException;
import java.util.*;
import java.util.function.Consumer;
import java.util.stream.Collectors;

@Singleton
public class IndexResultValuatorService {

@ -44,8 +43,8 @@ public class IndexResultValuatorService {
    }

    public List<SearchResultItem> rankResults(SearchParameters params,
                                              ResultRankingContext rankingContext,
                                              CombinedDocIdList resultIds)
    {
        final var evaluator = createValuationContext(params, rankingContext, resultIds);

@ -70,8 +69,7 @@ public class IndexResultValuatorService {
                resultIds,
                statefulIndex,
                rankingContext,
                params.subqueries,
                params.queryParams);
                params);
    }

@ -96,12 +94,13 @@ public class IndexResultValuatorService {
            item.resultsFromDomain = domainCountFilter.getCount(item);
        }

        return decorateAndRerank(resultsList, rankingContext);
        return decorateAndRerank(resultsList, params.compiledQuery, rankingContext);
    }

    /** Decorate the result items with additional information from the link database
     * and calculate an updated ranking with the additional information */
    public List<DecoratedSearchResultItem> decorateAndRerank(List<SearchResultItem> rawResults,
                                                             CompiledQuery<String> compiledQuery,
                                                             ResultRankingContext rankingContext)
            throws SQLException
    {

@ -125,13 +124,22 @@ public class IndexResultValuatorService {
                continue;
            }

            resultItems.add(createCombinedItem(result, docData, rankingContext));
            // Reconstruct the SearchResultKeywordScore compiled query for re-valuation
            //
            // CAVEAT: This hinges on the very fragile assumption that IndexResultValuationContext
            // puts them in the same order as the data for the CompiledQuery<String>.
            CompiledQuery<SearchResultKeywordScore> resultQuery =
                    new CompiledQuery<>(compiledQuery.root, result.keywordScores.toArray(SearchResultKeywordScore[]::new));

            resultItems.add(createCombinedItem(result, docData, resultQuery, rankingContext));
        }
        return resultItems;
    }

    private DecoratedSearchResultItem createCombinedItem(SearchResultItem result,
                                                         DocdbUrlDetail docData,
                                                         CompiledQuery<SearchResultKeywordScore> resultQuery,
                                                         ResultRankingContext rankingContext) {
        return new DecoratedSearchResultItem(
                result,

@ -144,7 +152,7 @@ public class IndexResultValuatorService {
                docData.pubYear(),
                docData.dataHash(),
                docData.wordsTotal(),
                resultValuator.calculateSearchResultValue(result.keywordScores, docData.wordsTotal(), rankingContext)
                resultValuator.calculateSearchResultValue(resultQuery, docData.wordsTotal(), rankingContext)
        );

    }
@ -1,5 +1,6 @@
package nu.marginalia.ranking.results;

import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;

@ -33,14 +34,17 @@ public class ResultValuator {
        this.termCoherenceFactor = termCoherenceFactor;
    }

    public double calculateSearchResultValue(List<SearchResultKeywordScore> scores,
    public double calculateSearchResultValue(CompiledQuery<SearchResultKeywordScore> scores,
                                             int length,
                                             ResultRankingContext ctx)
    {
        int sets = numberOfSets(scores);
        if (scores.size() == 0)
            return Double.MAX_VALUE;
        if (length < 0)
            length = 5000;

        long documentMetadata = documentMetadata(scores);
        int features = htmlFeatures(scores);
        long documentMetadata = scores.at(0).encodedDocMetadata();
        int features = scores.at(0).htmlFeatures();
        var rankingParams = ctx.params;

        int rank = DocumentMetadata.decodeRank(documentMetadata);

@ -75,32 +79,16 @@ public class ResultValuator {
                + temporalBias
                + flagsPenalty;

        double bestTcf = 0;
        double bestBM25F = 0;
        double bestBM25P = 0;
        double bestBM25PN = 0;

        for (int set = 0; set < sets; set++) {
            ResultKeywordSet keywordSet = createKeywordSet(scores, set);

            if (keywordSet.isEmpty())
                continue;

            bestTcf = Math.max(bestTcf, rankingParams.tcfWeight * termCoherenceFactor.calculate(keywordSet));
            bestBM25P = Math.max(bestBM25P, rankingParams.bm25PrioWeight * bm25Factor.calculateBm25Prio(rankingParams.prioParams, keywordSet, ctx));
            bestBM25F = Math.max(bestBM25F, rankingParams.bm25FullWeight * bm25Factor.calculateBm25(rankingParams.fullParams, keywordSet, length, ctx));
            if (keywordSet.hasNgram()) {
                bestBM25PN = Math.max(bestBM25PN, rankingParams.bm25PrioWeight * bm25Factor.calculateBm25Prio(rankingParams.prioParams, keywordSet, ctx));
            }
        }

        double bestTcf = rankingParams.tcfWeight * termCoherenceFactor.calculate(scores);
        double bestBM25F = rankingParams.bm25FullWeight * bm25Factor.calculateBm25(rankingParams.fullParams, scores, length, ctx);
        double bestBM25P = rankingParams.bm25PrioWeight * bm25Factor.calculateBm25Prio(rankingParams.prioParams, scores, ctx);

        double overallPartPositive = Math.max(0, overallPart);
        double overallPartNegative = -Math.min(0, overallPart);

        // Renormalize to 0...15, where 0 is the best possible score;
        // this is a historical artifact of the original ranking function
        return normalize(1.5 * bestTcf + bestBM25F + bestBM25P + 0.25 * bestBM25PN + overallPartPositive, overallPartNegative);
        return normalize(1.5 * bestTcf + bestBM25F + bestBM25P + overallPartPositive, overallPartNegative);
    }

    private double calculateQualityPenalty(int size, int quality, ResultRankingParameters rankingParams) {
@ -1,10 +1,11 @@
package nu.marginalia.ranking.results.factors;

import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
import nu.marginalia.api.searchquery.model.results.Bm25Parameters;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.ranking.results.ResultKeywordSet;

public class Bm25Factor {
    private static final int AVG_LENGTH = 5000;

@ -13,43 +14,33 @@ public class Bm25Factor {
     *
     * @see Bm25Parameters
     */
    public double calculateBm25(Bm25Parameters bm25Parameters, ResultKeywordSet keywordSet, int length, ResultRankingContext ctx) {
    public double calculateBm25(Bm25Parameters bm25Parameters, CompiledQuery<SearchResultKeywordScore> scores, int length, ResultRankingContext ctx) {
        final int docCount = ctx.termFreqDocCount();

        if (length <= 0)
            length = AVG_LENGTH;

        double sum = 0.;

        for (var keyword : keywordSet.keywords()) {
        return CompiledQueryAggregates.doubleSumAggregate(scores, keyword -> {
            double count = keyword.positionCount();

            int freq = ctx.frequency(keyword.keyword);

            sum += invFreq(docCount, freq) * f(bm25Parameters.k(), bm25Parameters.b(), count, length);
        }

        return sum;
            return invFreq(docCount, freq) * f(bm25Parameters.k(), bm25Parameters.b(), count, length);
        });
    }

    /** Bm25 calculation, except instead of counting positions in the document,
     * the number of relevance signals for the term is counted instead.
     */
    public double calculateBm25Prio(Bm25Parameters bm25Parameters, ResultKeywordSet keywordSet, ResultRankingContext ctx) {
    public double calculateBm25Prio(Bm25Parameters bm25Parameters, CompiledQuery<SearchResultKeywordScore> scores, ResultRankingContext ctx) {
        final int docCount = ctx.termFreqDocCount();

        double sum = 0.;

        for (var keyword : keywordSet.keywords()) {
        return CompiledQueryAggregates.doubleSumAggregate(scores, keyword -> {
            double count = evaluatePriorityScore(keyword);

            int freq = ctx.priorityFrequency(keyword.keyword);

            // note we override b to zero for priority terms as they are independent of document length
            sum += invFreq(docCount, freq) * f(bm25Parameters.k(), 0, count, 0);
        }
            return invFreq(docCount, freq) * f(bm25Parameters.k(), 0, count, 0);
        });

        return sum;
    }

    private static double evaluatePriorityScore(SearchResultKeywordScore keyword) {
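For reference, the per-keyword quantity summed by doubleSumAggregate above has the conventional BM25 shape. invFreq and f are defined outside this diff, so the exact form below is an assumption based on the standard definition:

    score(d, q) = \sum_{t \in q} \log\frac{N - n_t + 0.5}{n_t + 0.5} \cdot \frac{c_t (k + 1)}{c_t + k (1 - b + b |d| / \mathrm{avgdl})}

where N is ctx.termFreqDocCount(), n_t the term's document frequency, and c_t the term's position count in the document. The prio variant passes b = 0 and length 0, which removes the document-length normalization, as the comment in the diff notes.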
@ -1,14 +1,16 @@
package nu.marginalia.ranking.results.factors;

import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.ranking.results.ResultKeywordSet;

/** Rewards documents where terms appear frequently within the same sentences
 */
public class TermCoherenceFactor {

    public double calculate(ResultKeywordSet keywordSet) {
        long mask = combinedMask(keywordSet);
    public double calculate(CompiledQuery<SearchResultKeywordScore> scores) {
        long mask = CompiledQueryAggregates.longBitmaskAggregate(scores, score -> score.positions() & WordMetadata.POSITIONS_MASK);

        return bitsSetFactor(mask);
    }

@ -19,14 +21,5 @@ public class TermCoherenceFactor {
        return Math.pow(bitsSetInMask/(float) WordMetadata.POSITIONS_COUNT, 0.25);
    }

    long combinedMask(ResultKeywordSet keywordSet) {
        long mask = WordMetadata.POSITIONS_MASK;

        for (var keyword : keywordSet.keywords()) {
            mask &= keyword.positions();
        }

        return mask;
    }

}
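A worked example of the coherence factor, assuming a plain two-term conjunction: for sentence-position bitmasks 0b1010 and 0b0110, the bitmask aggregate ANDs them to 0b0010, one bit out of WordMetadata.POSITIONS_COUNT is set, and the factor is (1 / POSITIONS_COUNT)^0.25. The more positions all terms share, the closer the factor gets to 1.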
@ -2,6 +2,8 @@ package nu.marginalia.index.query;

import nu.marginalia.index.query.filter.QueryFilterStepIf;

import java.util.List;

/** Builds a query.
 * <p />
 * Note: The query builder may omit predicates that are deemed redundant.

@ -21,6 +23,7 @@ public interface IndexQueryBuilder {
    IndexQueryBuilder notFull(long termId);

    IndexQueryBuilder addInclusionFilter(QueryFilterStepIf filterStep);
    IndexQueryBuilder addInclusionFilterAny(List<QueryFilterStepIf> filterStep);

    IndexQuery build();
}
@ -0,0 +1,57 @@
package nu.marginalia.index.query.filter;

import nu.marginalia.array.buffer.LongQueryBuffer;

import java.util.List;
import java.util.StringJoiner;

public class QueryFilterAllOf implements QueryFilterStepIf {
    private final List<? extends QueryFilterStepIf> steps;

    public QueryFilterAllOf(List<? extends QueryFilterStepIf> steps) {
        this.steps = steps;
    }

    public double cost() {
        double prod = 1.;

        for (var step : steps) {
            double cost = step.cost();
            if (cost > 1.0) {
                prod *= Math.log(cost);
            }
            else {
                prod += cost;
            }
        }

        return prod;
    }

    @Override
    public boolean test(long value) {
        for (var step : steps) {
            if (!step.test(value))
                return false;
        }
        return true;
    }

    public void apply(LongQueryBuffer buffer) {
        if (steps.isEmpty())
            return;

        for (var step : steps) {
            step.apply(buffer);
        }
    }

    public String describe() {
        StringJoiner sj = new StringJoiner(",", "[All Of: ", "]");
        for (var step : steps) {
            sj.add(step.describe());
        }
        return sj.toString();
    }
}
@ -2,7 +2,6 @@ package nu.marginalia.index.query.filter;

import nu.marginalia.array.buffer.LongQueryBuffer;

import java.util.Arrays;
import java.util.List;
import java.util.StringJoiner;

@ -14,7 +13,7 @@ public class QueryFilterAnyOf implements QueryFilterStepIf {
    }

    public double cost() {
        return steps.stream().mapToDouble(QueryFilterStepIf::cost).average().orElse(0.);
        return steps.stream().mapToDouble(QueryFilterStepIf::cost).sum();
    }

    @Override

@ -31,31 +30,23 @@ public class QueryFilterAnyOf implements QueryFilterStepIf {
        if (steps.isEmpty())
            return;

        int start;
        int start = 0;
        int end = buffer.end;

        steps.getFirst().apply(buffer);

        // The filter functions will partition the data in the buffer from 0 to END,
        // and update END to the length of the retained items, keeping the retained
        // items sorted but making no guarantees about the rejected half
        //
        // Therefore, we need to re-sort the rejected side, and to satisfy the
        // constraint that the data is sorted up to END, finally sort it again.
        //
        // This sorting may seem like it's slower, but filter.apply(...) is
        // typically much faster than iterating over filter.test(...); so this
        // is more than made up for

        for (int fi = 1; fi < steps.size(); fi++)
        for (var step : steps)
        {
            start = buffer.end;
            Arrays.sort(buffer.data, start, end);
            buffer.startFilterForRange(start, end);
            steps.get(fi).apply(buffer);
            var slice = buffer.slice(start, end);
            slice.data.quickSort(0, slice.size());

            step.apply(slice);
            start += slice.end;
        }

        Arrays.sort(buffer.data, 0, buffer.end);
        buffer.data.quickSort(0, start);

        // Special finalization
        buffer.reset();
        buffer.end = start;
    }

    public String describe() {
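A hedged reading of the rewritten apply() above, assuming LongQueryBuffer.slice yields an in-place view of the backing array: each step is applied to the span of values not yet accepted by an earlier step, the values each step retains accumulate at the head of the buffer in [0, start), and the final quickSort over that prefix restores the sorted-output invariant before buffer.end is set. The result is the union of the steps' outputs, which the testCombinedOrAnd case added below exercises.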
@ -16,7 +16,7 @@ public class QueryFilterLetThrough implements QueryFilterStepIf {
    }

    public double cost() {
        return 0.;
        return 1.;
    }

    public String describe() {

@ -15,7 +15,7 @@ public class QueryFilterNoPass implements QueryFilterStepIf {
    }

    public double cost() {
        return 0.;
        return 1.;
    }

    public String describe() {

@ -16,7 +16,7 @@ public class QueryFilterStepExcludeFromPredicate implements QueryFilterStepIf {

    @Override
    public double cost() {
        return 0;
        return 1;
    }

    @Override

@ -16,7 +16,7 @@ public class QueryFilterStepFromPredicate implements QueryFilterStepIf {

    @Override
    public double cost() {
        return 0;
        return 1;
    }

    @Override
@ -55,6 +55,32 @@ class QueryFilterStepIfTest {
        assertArrayEquals(new long[]{8, 10}, buffer.copyData());
    }

    @Test
    public void testSuccessiveApplicationWithAllOf() {
        var buffer = createBuffer(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
        var filter1 = new QueryFilterStepFromPredicate(value -> value % 2 == 0);
        var filter2 = new QueryFilterStepExcludeFromPredicate(value -> value <= 6);
        new QueryFilterAllOf(List.of(filter1, filter2)).apply(buffer);
        assertArrayEquals(new long[]{8, 10}, buffer.copyData());
    }

    @Test
    public void testCombinedOrAnd() {
        var buffer = createBuffer(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);

        var filter1 = new QueryFilterStepFromPredicate(value -> value % 2 == 0);
        var filter2 = new QueryFilterStepFromPredicate(value -> value <= 5);
        var filter1_2 = new QueryFilterAllOf(List.of(filter1, filter2));

        var filter3 = new QueryFilterStepFromPredicate(value -> value % 2 == 1);
        var filter4 = new QueryFilterStepFromPredicate(value -> value > 5);
        var filter3_4 = new QueryFilterAllOf(List.of(filter3, filter4));

        var filter12_34 = new QueryFilterAnyOf(List.of(filter1_2, filter3_4));

        filter12_34.apply(buffer);

        assertArrayEquals(new long[]{2, 4, 7, 9}, buffer.copyData());
    }

    @Test
    public void testCombinedApplication() {
        var buffer = createBuffer(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
@ -5,7 +5,7 @@ import com.google.inject.Inject;
import lombok.SneakyThrows;
import nu.marginalia.IndexLocations;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.process.control.FakeProcessHeartbeat;

@ -123,9 +123,10 @@ public class IndexQueryServiceIntegrationSmokeTest {
                .rankingParams(ResultRankingParameters.sensibleDefaults())
                .domains(new ArrayList<>())
                .searchSetIdentifier("NONE")
                .subqueries(List.of(new SearchSubquery(
                .query(new SearchQuery(
                        "2 3 5",
                        List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(),
                        Collections.emptyList()))).build());
                        Collections.emptyList())).build());

        int[] idxes = new int[] { 30, 510, 90, 150, 210, 270, 330, 390, 450 };
        long[] ids = IntStream.of(idxes).mapToLong(this::fullId).toArray();

@ -166,9 +167,13 @@ public class IndexQueryServiceIntegrationSmokeTest {
                .rankingParams(ResultRankingParameters.sensibleDefaults())
                .queryStrategy(QueryStrategy.SENTENCE)
                .domains(List.of(2))
                .subqueries(List.of(new SearchSubquery(
                        List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(),
                        Collections.emptyList()))).build());
                .query(new SearchQuery(
                        "2 3 5",
                        List.of("3", "5", "2"),
                        List.of("4"),
                        Collections.emptyList(),
                        Collections.emptyList(),
                        Collections.emptyList())).build());
        int[] idxes = new int[] { 210, 270 };
        long[] ids = IntStream.of(idxes).mapToLong(id -> UrlIdCodec.encodeId(id/100, id)).toArray();
        long[] actual = rsp.results.stream().mapToLong(i -> i.rawIndexResult.getDocumentId()).toArray();

@ -202,9 +207,8 @@ public class IndexQueryServiceIntegrationSmokeTest {
                .queryStrategy(QueryStrategy.SENTENCE)
                .searchSetIdentifier("NONE")
                .rankingParams(ResultRankingParameters.sensibleDefaults())
                .subqueries(List.of(new SearchSubquery(
                        List.of("4"), Collections.emptyList(), Collections.emptyList(), Collections.emptyList(),
                        Collections.emptyList()))
                .query(
                        new SearchQuery("4", List.of("4"), Collections.emptyList(), Collections.emptyList(), Collections.emptyList(), Collections.emptyList())
                ).build());
@ -4,7 +4,7 @@ import com.google.inject.Guice;
import com.google.inject.Inject;
import nu.marginalia.IndexLocations;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.storage.FileStorageService;

@ -35,6 +35,7 @@ import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.service.control.ServiceHeartbeat;
import nu.marginalia.service.server.Initialization;
import org.apache.logging.log4j.util.Strings;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

@ -108,7 +109,7 @@ public class IndexQueryServiceIntegrationTest {
                w("world", WordFlags.Title)
        ).load();

        var query = basicQuery(builder -> builder.subqueries(justInclude("hello", "world")));
        var query = basicQuery(builder -> builder.query(justInclude("hello", "world")));

        executeSearch(query)
                .expectDocumentsInOrder(d(1,1));

@ -127,57 +128,51 @@ public class IndexQueryServiceIntegrationTest {
        ).load();

        var queryMissingExclude = basicQuery(builder ->
                builder.subqueries(includeAndExclude("hello", "missing")));
                builder.query(includeAndExclude("hello", "missing")));

        executeSearch(queryMissingExclude)
                .expectDocumentsInOrder(d(1,1));

        var queryMissingInclude = basicQuery(builder ->
                builder.subqueries(justInclude("missing")));
                builder.query(justInclude("missing")));

        executeSearch(queryMissingInclude)
                .expectCount(0);

        var queryMissingPriority = basicQuery(builder ->
                builder.subqueries(
                        List.of(
                                new SearchSubquery(
                                        List.of("hello"),
                                        List.of(),
                                        List.of(),
                                        List.of("missing"),
                                        List.of()
                                )
                        )));
                builder.query(new SearchQuery(
                        "hello",
                        List.of("hello"),
                        List.of(),
                        List.of(),
                        List.of("missing"),
                        List.of())
                ));

        executeSearch(queryMissingPriority)
                .expectCount(1);

        var queryMissingAdvice = basicQuery(builder ->
                builder.subqueries(
                        List.of(
                                new SearchSubquery(
                                        List.of("hello"),
                                        List.of(),
                                        List.of("missing"),
                                        List.of(),
                                        List.of()
                                )
                builder.query(
                        new SearchQuery("hello",
                                List.of("hello"),
                                List.of(),
                                List.of("missing"),
                                List.of(),
                                List.of()
                        )));

        executeSearch(queryMissingAdvice)
                .expectCount(0);

        var queryMissingCoherence = basicQuery(builder ->
                builder.subqueries(
                        List.of(
                                new SearchSubquery(
                                        List.of("hello"),
                                        List.of(),
                                        List.of(),
                                        List.of(),
                                        List.of(List.of("missing", "hello"))
                                )
                builder.query(
                        new SearchQuery("hello",
                                List.of("hello"),
                                List.of(),
                                List.of(),
                                List.of(),
                                List.of(List.of("missing", "hello"))
                        )));

        executeSearch(queryMissingCoherence)
@ -202,7 +197,7 @@ public class IndexQueryServiceIntegrationTest {
        ).load();

        var query = basicQuery(builder -> builder.subqueries(justInclude("hello", "world")));
        var query = basicQuery(builder -> builder.query(justInclude("hello", "world")));

        executeSearch(query)
                .expectDocumentsInOrder(d(1,1));

@ -234,15 +229,15 @@ public class IndexQueryServiceIntegrationTest {

        var beforeY2K = basicQuery(builder ->
                builder.subqueries(justInclude("hello", "world"))
                builder.query(justInclude("hello", "world"))
                        .year(SpecificationLimit.lessThan(2000))
        );
        var atY2K = basicQuery(builder ->
                builder.subqueries(justInclude("hello", "world"))
                builder.query(justInclude("hello", "world"))
                        .year(SpecificationLimit.equals(2000))
        );
        var afterY2K = basicQuery(builder ->
                builder.subqueries(justInclude("hello", "world"))
                builder.query(justInclude("hello", "world"))
                        .year(SpecificationLimit.greaterThan(2000))
        );

@ -296,11 +291,11 @@ public class IndexQueryServiceIntegrationTest {

        var domain1 = basicQuery(builder ->
                builder.subqueries(justInclude("hello", "world"))
                builder.query(justInclude("hello", "world"))
                        .domains(List.of(1))
        );
        var domain2 = basicQuery(builder ->
                builder.subqueries(justInclude("hello", "world"))
                builder.query(justInclude("hello", "world"))
                        .domains(List.of(2))
        );

@ -334,7 +329,7 @@ public class IndexQueryServiceIntegrationTest {
        ).load();

        var query = basicQuery(builder ->
                builder.subqueries(includeAndExclude("hello", "my_darling"))
                builder.query(includeAndExclude("hello", "my_darling"))
        );

        executeSearch(query)

@ -403,7 +398,7 @@ public class IndexQueryServiceIntegrationTest {
                .load();

        var rsp = queryService.justQuery(
                basicQuery(builder -> builder.subqueries(
                basicQuery(builder -> builder.query(
                        // note coherence requirement
                        includeAndCohere("hello", "world")
                )));

@ -424,50 +419,53 @@ public class IndexQueryServiceIntegrationTest {
                .rank(SpecificationLimit.none())
                .rankingParams(ResultRankingParameters.sensibleDefaults())
                .domains(new ArrayList<>())
                .searchSetIdentifier("NONE")
                .subqueries(List.of());
                .searchSetIdentifier("NONE");

        return mutator.apply(builder).build();
    }

    List<SearchSubquery> justInclude(String... includes) {
        return List.of(new SearchSubquery(
    SearchQuery justInclude(String... includes) {
        return new SearchQuery(
                Strings.join(List.of(includes), ' '),
                List.of(includes),
                List.of(),
                List.of(),
                List.of(),
                List.of()
        ));
        );
    }

    List<SearchSubquery> includeAndExclude(List<String> includes, List<String> excludes) {
        return List.of(new SearchSubquery(
    SearchQuery includeAndExclude(List<String> includes, List<String> excludes) {
        return new SearchQuery(
                Strings.join(List.of(includes), ' '),
                includes,
                excludes,
                List.of(),
                List.of(),
                List.of()
        ));
        );
    }

    List<SearchSubquery> includeAndExclude(String include, String exclude) {
        return List.of(new SearchSubquery(
    SearchQuery includeAndExclude(String include, String exclude) {
        return new SearchQuery(
                include,
                List.of(include),
                List.of(exclude),
                List.of(),
                List.of(),
                List.of()
        ));
        );
    }

    List<SearchSubquery> includeAndCohere(String... includes) {
        return List.of(new SearchSubquery(
    SearchQuery includeAndCohere(String... includes) {
        return new SearchQuery(
                Strings.join(List.of(includes), ' '),
                List.of(includes),
                List.of(),
                List.of(),
                List.of(),
                List.of(List.of(includes))
        ));
        );
    }
    private MockDataDocument d(int domainId, int ordinal) {
        return new MockDataDocument(domainId, ordinal);
@ -0,0 +1,59 @@
|
||||
package nu.marginalia.index.index;
|
||||
|
||||
import it.unimi.dsi.fastutil.longs.LongArraySet;
|
||||
import it.unimi.dsi.fastutil.longs.LongSet;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
class QueryBranchWalkerTest {
|
||||
@Test
|
||||
public void testNoOverlap() {
|
||||
var paths = QueryBranchWalker.create(
|
||||
new long[] { 1, 2 },
|
||||
List.of(set(1), set(2))
|
||||
);
|
||||
assertEquals(2, paths.size());
|
||||
assertEquals(Set.of(1L, 2L), paths.stream().map(path -> path.termId).collect(Collectors.toSet()));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCond() {
|
||||
var paths = QueryBranchWalker.create(
|
||||
new long[] { 1, 2, 3, 4 },
|
||||
List.of(set(1,2,3), set(1,4,3))
|
||||
);
|
||||
assertEquals(1, paths.size());
|
||||
assertEquals(Set.of(1L), paths.stream().map(path -> path.termId).collect(Collectors.toSet()));
|
||||
System.out.println(Arrays.toString(paths.getFirst().priorityOrder));
|
||||
assertArrayEquals(new long[] { 2, 3, 4 }, paths.getFirst().priorityOrder);
|
||||
|
||||
var next = paths.getFirst().next();
|
||||
assertEquals(2, next.size());
|
||||
assertEquals(Set.of(2L, 3L), next.stream().map(path -> path.termId).collect(Collectors.toSet()));
|
||||
Map<Long, QueryBranchWalker> byId = next.stream().collect(Collectors.toMap(w -> w.termId, w->w));
|
||||
assertArrayEquals(new long[] { 3L }, byId.get(2L).priorityOrder );
|
||||
assertArrayEquals(new long[] { 4L }, byId.get(3L).priorityOrder );
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNoOverlapFirst() {
|
||||
var paths = QueryBranchWalker.create(
|
||||
new long[] { 1, 2, 3 },
|
||||
List.of(set(1, 2), set(1, 3))
|
||||
);
|
||||
assertEquals(1, paths.size());
|
||||
assertArrayEquals(new long[] { 2, 3 }, paths.getFirst().priorityOrder);
|
||||
assertEquals(Set.of(1L), paths.stream().map(path -> path.termId).collect(Collectors.toSet()));
|
||||
}
|
||||
|
||||
LongSet set(long... args) {
|
||||
return new LongArraySet(args);
|
||||
}
|
||||
}
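
A minimal traversal sketch (not part of the commit), using only the API these tests exercise: create(), termId, priorityOrder, and next(). It assumes next() returns an empty list at the leaves, so the recursion terminates:

static void printPaths(List<QueryBranchWalker> walkers, String indent) {
    // Walk the branch tree depth-first, printing each branch point's term id
    for (QueryBranchWalker walker : walkers) {
        System.out.println(indent + walker.termId);
        printPaths(walker.next(), indent + "  ");
    }
}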
@ -2,9 +2,10 @@ package nu.marginalia.index.results;

import nu.marginalia.api.searchquery.model.results.SearchResultItem;
import nu.marginalia.model.id.UrlIdCodec;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Test;

import java.util.List;

import static org.junit.jupiter.api.Assertions.*;

class IndexResultDomainDeduplicatorTest {
@ -24,7 +25,7 @@ class IndexResultDomainDeduplicatorTest {
}

SearchResultItem forId(int domain, int ordinal) {
return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), 4);
return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), List.of(), 4, Double.NaN);
}

}

@ -1,5 +1,6 @@
package nu.marginalia.ranking.results;

import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
@ -35,21 +36,21 @@ class ResultValuatorTest {
);

}
List<SearchResultKeywordScore> titleOnlyLowCountSet = List.of(
new SearchResultKeywordScore(0, "bob",
CompiledQuery<SearchResultKeywordScore> titleOnlyLowCountSet = CompiledQuery.just(
new SearchResultKeywordScore("bob", 1,
wordMetadata(Set.of(1), EnumSet.of(WordFlags.Title)),
docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)),
0)
);
List<SearchResultKeywordScore> highCountNoTitleSet = List.of(
new SearchResultKeywordScore(0, "bob",
CompiledQuery<SearchResultKeywordScore> highCountNoTitleSet = CompiledQuery.just(
new SearchResultKeywordScore("bob", 1,
wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh)),
docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)),
0)
);

List<SearchResultKeywordScore> highCountSubjectSet = List.of(
new SearchResultKeywordScore(0, "bob",
CompiledQuery<SearchResultKeywordScore> highCountSubjectSet = CompiledQuery.just(
new SearchResultKeywordScore("bob", 1,
wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh, WordFlags.Subjects)),
docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)),
0)
@ -75,7 +76,10 @@ class ResultValuatorTest {
System.out.println(highCountSubject);
}

private long docMetadata(int topology, int year, int quality, EnumSet<DocumentFlags> flags) {
private long docMetadata(int topology,
int year,
int quality,
EnumSet<DocumentFlags> flags) {
return new DocumentMetadata(topology, PubDate.toYearByte(year), quality, flags).encode();
}


@ -1,9 +1,10 @@
package nu.marginalia.ranking.results.factors;

import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
import nu.marginalia.bbpc.BrailleBlockPunchCards;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.ranking.results.ResultKeywordSet;
import org.junit.jupiter.api.Test;

import java.util.ArrayList;
@ -20,7 +21,7 @@ class TermCoherenceFactorTest {
WordMetadata.POSITIONS_MASK, WordMetadata.POSITIONS_MASK
);

long mask = termCoherenceFactor.combinedMask(allPositionsSet);
long mask = CompiledQueryAggregates.longBitmaskAggregate(allPositionsSet, score -> score.positions() & WordMetadata.POSITIONS_MASK);

assertEquals(1.0, termCoherenceFactor.bitsSetFactor(mask), 0.01);

@ -33,7 +34,7 @@ class TermCoherenceFactorTest {
0, 0
);

long mask = termCoherenceFactor.combinedMask(allPositionsSet);
long mask = CompiledQueryAggregates.longBitmaskAggregate(allPositionsSet, score -> score.positions() & WordMetadata.POSITIONS_MASK);

assertEquals(0, termCoherenceFactor.bitsSetFactor(mask), 0.01);

@ -46,7 +47,7 @@ class TermCoherenceFactorTest {
List.of(0, 1, 2, 3), List.of(0, 1, 2, 3)
);

long mask = termCoherenceFactor.combinedMask(positions);
long mask = CompiledQueryAggregates.longBitmaskAggregate(positions, score -> score.positions() & WordMetadata.POSITIONS_MASK);
printMask(mask);

}
@ -57,7 +58,7 @@ class TermCoherenceFactorTest {
List.of(55, 54, 53, 52), List.of(55, 54, 53, 52)
);

long mask = termCoherenceFactor.combinedMask(positions);
long mask = CompiledQueryAggregates.longBitmaskAggregate(positions, score -> score.positions() & WordMetadata.POSITIONS_MASK);
printMask(mask);
}

@ -72,7 +73,7 @@ class TermCoherenceFactorTest {
System.out.println(BrailleBlockPunchCards.printBits(mask, 48));
}

ResultKeywordSet createSet(List<Integer>... maskPositions) {
CompiledQuery<SearchResultKeywordScore> createSet(List<Integer>... maskPositions) {
long[] positions = new long[maskPositions.length];

for (int i = 0; i < maskPositions.length; i++) {
@ -84,14 +85,14 @@ class TermCoherenceFactorTest {
return createSet(positions);
}

ResultKeywordSet createSet(long... positionMasks) {
CompiledQuery<SearchResultKeywordScore> createSet(long... positionMasks) {
List<SearchResultKeywordScore> keywords = new ArrayList<>();

for (int i = 0; i < positionMasks.length; i++) {
keywords.add(new SearchResultKeywordScore(0, "",
keywords.add(new SearchResultKeywordScore("", 0,
new WordMetadata(positionMasks[i], (byte) 0).encode(), 0, 0));
}

return new ResultKeywordSet(keywords);
return CompiledQuery.just(keywords.toArray(SearchResultKeywordScore[]::new));
}
}
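
The longBitmaskAggregate calls above replace the old combinedMask() helper. A rough plain-loop equivalent, assuming the aggregate folds the mapped values with bitwise AND (which is what a coherence check implies: only positions shared by every term survive):

long mask = WordMetadata.POSITIONS_MASK;    // start with every position bit set
for (SearchResultKeywordScore score : keywords) {
    mask &= score.positions() & WordMetadata.POSITIONS_MASK;
}
// bitsSetFactor(mask) then rewards results whose terms co-occur at the same positions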
@ -1,5 +1,7 @@
package nu.marginalia.array.algo;

import nu.marginalia.array.LongArray;

import java.io.IOException;
import java.nio.LongBuffer;
import java.nio.channels.FileChannel;
@ -61,6 +63,12 @@ public interface LongArrayBase extends BulkTransferArray<LongBuffer> {
}
}

default void get(long start, long end, LongArray buffer, int bufferStart) {
for (int i = 0; i < (end-start); i++) {
buffer.set(i + bufferStart, get(start + i));
}
}

default void get(long start, LongBuffer buffer) {
get(start, start + buffer.remaining(), buffer, buffer.position());
}
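
A small usage sketch for the new LongArray-to-LongArray getter; LongArrayFactory.onHeapConfined() appears later in this commit, and the sizes here are arbitrary:

LongArray src = LongArrayFactory.onHeapConfined(100);
LongArray dst = LongArrayFactory.onHeapConfined(10);
src.get(40, 50, dst, 0);    // copies src[40..50) into dst[0..10)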
@ -1,5 +1,8 @@
package nu.marginalia.array.buffer;

import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;

import java.util.Arrays;

/** A buffer for long values that can be used to filter and manipulate the data.
@ -17,7 +20,7 @@ import java.util.Arrays;
public class LongQueryBuffer {
/** Direct access to the data in the buffer,
* guaranteed to be populated until `end` */
public final long[] data;
public final LongArray data;

/** Number of items in the data buffer */
public int end;
@ -25,18 +28,27 @@ public class LongQueryBuffer {
private int read = 0;
private int write = 0;

private LongQueryBuffer(LongArray array, int size) {
this.data = array;
this.end = size;
}

public LongQueryBuffer(int size) {
this.data = new long[size];
this.data = LongArrayFactory.onHeapConfined(size);
this.end = size;
}

public LongQueryBuffer(long[] data, int size) {
this.data = data;
this.data = LongArrayFactory.onHeapConfined(size);
this.data.set(0, data);

this.end = size;
}

public long[] copyData() {
return Arrays.copyOf(data, end);
long[] copy = new long[end];
data.forEach(0, end, (pos, val) -> copy[(int)pos]=val );
return copy;
}

public boolean isEmpty() {
@ -48,7 +60,7 @@ public class LongQueryBuffer {
}

public void reset() {
end = data.length;
end = (int) data.size();
read = 0;
write = 0;
}
@ -59,12 +71,16 @@ public class LongQueryBuffer {
write = 0;
}

public LongQueryBuffer slice(int start, int end) {
return new LongQueryBuffer(data.range(start, end), end - start);
}

/* == Filtering methods == */

/** Returns the current value at the read pointer.
*/
public long currentValue() {
return data[read];
return data.get(read);
}

/** Advances the read pointer and returns true if there are more values to read. */
@ -79,9 +95,9 @@ public class LongQueryBuffer {
*/
public boolean retainAndAdvance() {
if (read != write) {
long tmp = data[write];
data[write] = data[read];
data[read] = tmp;
long tmp = data.get(write);
data.set(write, data.get(read));
data.set(read, tmp);
}

write++;
@ -117,9 +133,10 @@ public class LongQueryBuffer {
write = 0;
}

public void startFilterForRange(int pos, int end) {
read = write = pos;
this.end = end;
public void finalizeFiltering(int pos) {
end = write;
read = pos;
write = pos;
}

/** Retain only unique values in the buffer, and update the end pointer to the new length.
@ -153,7 +170,7 @@ public class LongQueryBuffer {
"read = " + read +
",write = " + write +
",end = " + end +
",data = [" + Arrays.toString(Arrays.copyOf(data, end)) + "]]";
",data = [" + Arrays.toString(copyData()) + "]]";
}
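
A sketch of the retain/reject filtering idiom this class is built for, assuming hasMore() and rejectAndAdvance() exist as counterparts to the methods shown above:

LongQueryBuffer buffer = new LongQueryBuffer(new long[] { 1, 2, 3, 4, 5, 6 }, 6);
while (buffer.hasMore()) {              // assumed: true while read < end
    if (buffer.currentValue() % 2 == 0)
        buffer.retainAndAdvance();      // keep even values
    else
        buffer.rejectAndAdvance();      // assumed counterpart: discard and advance
}
buffer.finalizeFiltering(0);            // end := write, read/write back to 0
// buffer.copyData() now returns { 2, 4, 6 }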
@ -143,7 +143,7 @@ class LongArraySearchTest {

assertEquals(43, buffer.size());
for (int i = 0; i < 43; i++) {
assertEquals(buffer.data[i], i*3);
assertEquals(buffer.data.get(i), i*3);
}
}

@ -160,7 +160,7 @@ class LongArraySearchTest {
int j = 0;
for (int i = 0; i < 43; i++) {
if (++j % 3 == 0) j++;
assertEquals(buffer.data[i], j);
assertEquals(buffer.data.get(i), j);
}
}
}

@ -109,8 +109,8 @@ public class BTreeReader {
return ip.findData(key);
}

public void readData(long[] buf, int n, long pos) {
data.get(pos, pos + n, buf);
public void readData(LongArray buf, int n, long pos) {
data.get(pos, pos + n, buf, 0);
}

/** Used for querying interlaced data in the btree.

@ -32,7 +32,8 @@ public class BTreeReaderRejectRetainWithIndexTest {
@Test
public void testRetain() {
LongQueryBuffer odds = new LongQueryBuffer(50);
Arrays.setAll(odds.data, i -> 2L*i + 1);
for (int i = 0; i < 50; i++)
odds.data.set(i, 2L*i + 1);

BTreeReader reader = new BTreeReader(array, ctx, 0);
reader.retainEntries(odds);
@ -46,7 +47,8 @@ public class BTreeReaderRejectRetainWithIndexTest {
@Test
public void testReject() {
LongQueryBuffer odds = new LongQueryBuffer(50);
Arrays.setAll(odds.data, i -> 2L*i + 1);
for (int i = 0; i < 50; i++)
odds.data.set(i, 2L*i + 1);

BTreeReader reader = new BTreeReader(array, ctx, 0);
reader.rejectEntries(odds);

@ -32,7 +32,8 @@ public class BTreeReaderRejectRetainWithoutIndexTest {
@Test
public void testRetain() {
LongQueryBuffer odds = new LongQueryBuffer(50);
Arrays.setAll(odds.data, i -> 2L*i + 1);
for (int i = 0; i < 50; i++)
odds.data.set(i, 2L*i + 1);

BTreeReader reader = new BTreeReader(array, ctx, 0);
reader.retainEntries(odds);
@ -46,7 +47,9 @@ public class BTreeReaderRejectRetainWithoutIndexTest {
@Test
public void testReject() {
LongQueryBuffer odds = new LongQueryBuffer(50);
Arrays.setAll(odds.data, i -> 2L*i + 1);
for (int i = 0; i < 50; i++)
odds.data.set(i, 2L*i + 1);


BTreeReader reader = new BTreeReader(array, ctx, 0);
reader.rejectEntries(odds);

@ -1,7 +1,7 @@
package nu.marginalia.search;

import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier;
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.index.query.limit.QueryLimits;
import nu.marginalia.index.query.limit.QueryStrategy;
@ -14,7 +14,7 @@ import java.util.List;
public class SearchQueryParamFactory {

public QueryParams forRegularSearch(SearchParameters userParams) {
SearchSubquery prototype = new SearchSubquery();
SearchQuery prototype = new SearchQuery();
var profile = userParams.profile();

profile.addTacitTerms(prototype);

@ -1,6 +1,6 @@
package nu.marginalia.search.command;

import nu.marginalia.api.searchquery.model.query.SearchSubquery;
import nu.marginalia.api.searchquery.model.query.SearchQuery;

import javax.annotation.Nullable;
import java.util.Arrays;
@ -23,7 +23,7 @@ public enum SearchAdtechParameter {
return DEFAULT;
}

public void addTacitTerms(SearchSubquery subquery) {
public void addTacitTerms(SearchQuery subquery) {
subquery.searchTermsExclude.addAll(Arrays.asList(implictExcludeSearchTerms));
}
}

@ -1,6 +1,6 @@
package nu.marginalia.search.command;

import nu.marginalia.api.searchquery.model.query.SearchSubquery;
import nu.marginalia.api.searchquery.model.query.SearchQuery;

import javax.annotation.Nullable;
import java.util.Arrays;
@ -25,7 +25,7 @@ public enum SearchJsParameter {
return DEFAULT;
}

public void addTacitTerms(SearchSubquery subquery) {
public void addTacitTerms(SearchQuery subquery) {
subquery.searchTermsExclude.addAll(Arrays.asList(implictExcludeSearchTerms));
}
}

@ -2,7 +2,7 @@ package nu.marginalia.search.model;

import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier;

import java.util.Objects;
@ -47,7 +47,7 @@ public enum SearchProfile {
return NO_FILTER;
}

public void addTacitTerms(SearchSubquery subquery) {
public void addTacitTerms(SearchQuery subquery) {
if (this == ACADEMIA) {
subquery.searchTermsAdvice.add("special:academia");
}