mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
(qs, index) New query model integrated with index service.
Seems to work, tests are green and initial testing finds no errors. Still a bit untested, committing WIP as-is because it would suck to lose weeks of work due to a drive failure or something.
This commit is contained in:
parent
8cb9455c32
commit
a3a6d6292b
@ -30,6 +30,7 @@ dependencies {
|
|||||||
implementation libs.notnull
|
implementation libs.notnull
|
||||||
implementation libs.guice
|
implementation libs.guice
|
||||||
implementation libs.gson
|
implementation libs.gson
|
||||||
|
implementation libs.commons.lang3
|
||||||
implementation libs.bundles.protobuf
|
implementation libs.bundles.protobuf
|
||||||
implementation libs.bundles.grpc
|
implementation libs.bundles.grpc
|
||||||
implementation libs.fastutil
|
implementation libs.fastutil
|
||||||
|
@ -1,7 +1,6 @@
|
|||||||
package nu.marginalia.api.searchquery;
|
package nu.marginalia.api.searchquery;
|
||||||
|
|
||||||
import nu.marginalia.api.searchquery.*;
|
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||||
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
|
|
||||||
import nu.marginalia.api.searchquery.model.results.Bm25Parameters;
|
import nu.marginalia.api.searchquery.model.results.Bm25Parameters;
|
||||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||||
import nu.marginalia.index.query.limit.QueryLimits;
|
import nu.marginalia.index.query.limit.QueryLimits;
|
||||||
@ -45,33 +44,37 @@ public class IndexProtobufCodec {
|
|||||||
.build();
|
.build();
|
||||||
}
|
}
|
||||||
|
|
||||||
public static SearchSubquery convertSearchSubquery(RpcSubquery subquery) {
|
public static SearchQuery convertRpcQuery(RpcQuery query) {
|
||||||
List<List<String>> coherences = new ArrayList<>();
|
List<List<String>> coherences = new ArrayList<>();
|
||||||
|
|
||||||
for (int j = 0; j < subquery.getCoherencesCount(); j++) {
|
for (int j = 0; j < query.getCoherencesCount(); j++) {
|
||||||
var coh = subquery.getCoherences(j);
|
var coh = query.getCoherences(j);
|
||||||
coherences.add(new ArrayList<>(coh.getCoherencesList()));
|
coherences.add(new ArrayList<>(coh.getCoherencesList()));
|
||||||
}
|
}
|
||||||
|
|
||||||
return new SearchSubquery(
|
return new SearchQuery(
|
||||||
subquery.getIncludeList(),
|
query.getCompiledQuery(),
|
||||||
subquery.getExcludeList(),
|
query.getIncludeList(),
|
||||||
subquery.getAdviceList(),
|
query.getExcludeList(),
|
||||||
subquery.getPriorityList(),
|
query.getAdviceList(),
|
||||||
|
query.getPriorityList(),
|
||||||
coherences
|
coherences
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static RpcSubquery convertSearchSubquery(SearchSubquery searchSubquery) {
|
public static RpcQuery convertRpcQuery(SearchQuery searchQuery) {
|
||||||
var subqueryBuilder =
|
var subqueryBuilder =
|
||||||
RpcSubquery.newBuilder()
|
RpcQuery.newBuilder()
|
||||||
.addAllAdvice(searchSubquery.getSearchTermsAdvice())
|
.setCompiledQuery(searchQuery.compiledQuery)
|
||||||
.addAllExclude(searchSubquery.getSearchTermsExclude())
|
.addAllInclude(searchQuery.getSearchTermsInclude())
|
||||||
.addAllInclude(searchSubquery.getSearchTermsInclude())
|
.addAllAdvice(searchQuery.getSearchTermsAdvice())
|
||||||
.addAllPriority(searchSubquery.getSearchTermsPriority());
|
.addAllExclude(searchQuery.getSearchTermsExclude())
|
||||||
for (var coherences : searchSubquery.searchTermCoherences) {
|
.addAllPriority(searchQuery.getSearchTermsPriority());
|
||||||
|
|
||||||
|
for (var coherences : searchQuery.searchTermCoherences) {
|
||||||
subqueryBuilder.addCoherencesBuilder().addAllCoherences(coherences);
|
subqueryBuilder.addCoherencesBuilder().addAllCoherences(coherences);
|
||||||
}
|
}
|
||||||
|
|
||||||
return subqueryBuilder.build();
|
return subqueryBuilder.build();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2,7 +2,6 @@ package nu.marginalia.api.searchquery;
|
|||||||
|
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
||||||
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
|
|
||||||
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
|
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
|
||||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||||
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
||||||
@ -14,7 +13,6 @@ import nu.marginalia.api.searchquery.model.query.QueryParams;
|
|||||||
import nu.marginalia.api.searchquery.model.query.QueryResponse;
|
import nu.marginalia.api.searchquery.model.query.QueryResponse;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
public class QueryProtobufCodec {
|
public class QueryProtobufCodec {
|
||||||
|
|
||||||
@ -23,9 +21,7 @@ public class QueryProtobufCodec {
|
|||||||
|
|
||||||
builder.addAllDomains(request.getDomainIdsList());
|
builder.addAllDomains(request.getDomainIdsList());
|
||||||
|
|
||||||
for (var subquery : query.specs.subqueries) {
|
builder.setQuery(IndexProtobufCodec.convertRpcQuery(query.specs.query));
|
||||||
builder.addSubqueries(IndexProtobufCodec.convertSearchSubquery(subquery));
|
|
||||||
}
|
|
||||||
|
|
||||||
builder.setSearchSetIdentifier(query.specs.searchSetIdentifier);
|
builder.setSearchSetIdentifier(query.specs.searchSetIdentifier);
|
||||||
builder.setHumanQuery(request.getHumanQuery());
|
builder.setHumanQuery(request.getHumanQuery());
|
||||||
@ -51,9 +47,7 @@ public class QueryProtobufCodec {
|
|||||||
public static RpcIndexQuery convertQuery(String humanQuery, ProcessedQuery query) {
|
public static RpcIndexQuery convertQuery(String humanQuery, ProcessedQuery query) {
|
||||||
var builder = RpcIndexQuery.newBuilder();
|
var builder = RpcIndexQuery.newBuilder();
|
||||||
|
|
||||||
for (var subquery : query.specs.subqueries) {
|
builder.setQuery(IndexProtobufCodec.convertRpcQuery(query.specs.query));
|
||||||
builder.addSubqueries(IndexProtobufCodec.convertSearchSubquery(subquery));
|
|
||||||
}
|
|
||||||
|
|
||||||
builder.setSearchSetIdentifier(query.specs.searchSetIdentifier);
|
builder.setSearchSetIdentifier(query.specs.searchSetIdentifier);
|
||||||
builder.setHumanQuery(humanQuery);
|
builder.setHumanQuery(humanQuery);
|
||||||
@ -147,8 +141,8 @@ public class QueryProtobufCodec {
|
|||||||
|
|
||||||
private static SearchResultKeywordScore convertKeywordScore(RpcResultKeywordScore keywordScores) {
|
private static SearchResultKeywordScore convertKeywordScore(RpcResultKeywordScore keywordScores) {
|
||||||
return new SearchResultKeywordScore(
|
return new SearchResultKeywordScore(
|
||||||
keywordScores.getSubquery(),
|
|
||||||
keywordScores.getKeyword(),
|
keywordScores.getKeyword(),
|
||||||
|
-1, // termId is internal to index service
|
||||||
keywordScores.getEncodedWordMetadata(),
|
keywordScores.getEncodedWordMetadata(),
|
||||||
keywordScores.getEncodedDocMetadata(),
|
keywordScores.getEncodedDocMetadata(),
|
||||||
keywordScores.getHtmlFeatures()
|
keywordScores.getHtmlFeatures()
|
||||||
@ -156,14 +150,8 @@ public class QueryProtobufCodec {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private static SearchSpecification convertSearchSpecification(RpcIndexQuery specs) {
|
private static SearchSpecification convertSearchSpecification(RpcIndexQuery specs) {
|
||||||
List<SearchSubquery> subqueries = new ArrayList<>(specs.getSubqueriesCount());
|
|
||||||
|
|
||||||
for (int i = 0; i < specs.getSubqueriesCount(); i++) {
|
|
||||||
subqueries.add(IndexProtobufCodec.convertSearchSubquery(specs.getSubqueries(i)));
|
|
||||||
}
|
|
||||||
|
|
||||||
return new SearchSpecification(
|
return new SearchSpecification(
|
||||||
subqueries,
|
IndexProtobufCodec.convertRpcQuery(specs.getQuery()),
|
||||||
specs.getDomainsList(),
|
specs.getDomainsList(),
|
||||||
specs.getSearchSetIdentifier(),
|
specs.getSearchSetIdentifier(),
|
||||||
specs.getHumanQuery(),
|
specs.getHumanQuery(),
|
||||||
@ -182,7 +170,6 @@ public class QueryProtobufCodec {
|
|||||||
.addAllDomainIds(params.domainIds())
|
.addAllDomainIds(params.domainIds())
|
||||||
.addAllTacitAdvice(params.tacitAdvice())
|
.addAllTacitAdvice(params.tacitAdvice())
|
||||||
.addAllTacitExcludes(params.tacitExcludes())
|
.addAllTacitExcludes(params.tacitExcludes())
|
||||||
.addAllTacitIncludes(params.tacitIncludes())
|
|
||||||
.addAllTacitPriority(params.tacitPriority())
|
.addAllTacitPriority(params.tacitPriority())
|
||||||
.setHumanQuery(params.humanQuery())
|
.setHumanQuery(params.humanQuery())
|
||||||
.setQueryLimits(IndexProtobufCodec.convertQueryLimits(params.limits()))
|
.setQueryLimits(IndexProtobufCodec.convertQueryLimits(params.limits()))
|
||||||
|
@ -0,0 +1,76 @@
|
|||||||
|
package nu.marginalia.api.searchquery.model.compiled;
|
||||||
|
|
||||||
|
import org.jetbrains.annotations.NotNull;
|
||||||
|
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.function.*;
|
||||||
|
import java.util.stream.IntStream;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
|
|
||||||
|
/** A compiled index service query. The class separates the topology of the query from the data,
|
||||||
|
* and it's possible to create new queries supplanting the data */
|
||||||
|
public class CompiledQuery<T> implements Iterable<T> {
|
||||||
|
|
||||||
|
/** The root expression, conveys the topology of the query */
|
||||||
|
public final CqExpression root;
|
||||||
|
|
||||||
|
private final CqData<T> data;
|
||||||
|
|
||||||
|
public CompiledQuery(CqExpression root, CqData<T> data) {
|
||||||
|
this.root = root;
|
||||||
|
this.data = data;
|
||||||
|
}
|
||||||
|
|
||||||
|
public CompiledQuery(CqExpression root, T[] data) {
|
||||||
|
this.root = root;
|
||||||
|
this.data = new CqData<>(data);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Exists for testing, creates a simple query that ANDs all the provided items */
|
||||||
|
public static <T> CompiledQuery<T> just(T... item) {
|
||||||
|
return new CompiledQuery<>(new CqExpression.And(
|
||||||
|
IntStream.range(0, item.length).mapToObj(CqExpression.Word::new).toList()
|
||||||
|
), item);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Create a new CompiledQuery mapping the leaf nodes using the provided mapper */
|
||||||
|
public <T2> CompiledQuery<T2> map(Class<T2> clazz, Function<T, T2> mapper) {
|
||||||
|
return new CompiledQuery<>(
|
||||||
|
root,
|
||||||
|
data.map(clazz, mapper)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
public CompiledQueryLong mapToLong(ToLongFunction<T> mapper) {
|
||||||
|
return new CompiledQueryLong(root, data.mapToLong(mapper));
|
||||||
|
}
|
||||||
|
|
||||||
|
public CqExpression root() {
|
||||||
|
return root;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Stream<T> stream() {
|
||||||
|
return data.stream();
|
||||||
|
}
|
||||||
|
|
||||||
|
public IntStream indices() {
|
||||||
|
return IntStream.range(0, data.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
public T at(int index) {
|
||||||
|
return data.get(index);
|
||||||
|
}
|
||||||
|
|
||||||
|
@NotNull
|
||||||
|
@Override
|
||||||
|
public Iterator<T> iterator() {
|
||||||
|
return stream().iterator();
|
||||||
|
}
|
||||||
|
|
||||||
|
public int size() {
|
||||||
|
return data.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,42 @@
|
|||||||
|
package nu.marginalia.api.searchquery.model.compiled;
|
||||||
|
|
||||||
|
import org.jetbrains.annotations.NotNull;
|
||||||
|
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.stream.IntStream;
|
||||||
|
import java.util.stream.LongStream;
|
||||||
|
|
||||||
|
|
||||||
|
/** A compiled index service query */
|
||||||
|
public class CompiledQueryLong implements Iterable<Long> {
|
||||||
|
private final CqExpression root;
|
||||||
|
private final CqDataLong data;
|
||||||
|
|
||||||
|
public CompiledQueryLong(CqExpression root, CqDataLong data) {
|
||||||
|
this.root = root;
|
||||||
|
this.data = data;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public CqExpression root() {
|
||||||
|
return root;
|
||||||
|
}
|
||||||
|
|
||||||
|
public LongStream stream() {
|
||||||
|
return data.stream();
|
||||||
|
}
|
||||||
|
|
||||||
|
public IntStream indices() {
|
||||||
|
return IntStream.range(0, data.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
public long at(int index) {
|
||||||
|
return data.get(index);
|
||||||
|
}
|
||||||
|
|
||||||
|
@NotNull
|
||||||
|
@Override
|
||||||
|
public Iterator<Long> iterator() {
|
||||||
|
return stream().iterator();
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,113 @@
|
|||||||
|
package nu.marginalia.api.searchquery.model.compiled;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
/** Parser for a compiled index query */
|
||||||
|
public class CompiledQueryParser {
|
||||||
|
|
||||||
|
public static CompiledQuery<String> parse(String query) {
|
||||||
|
List<String> parts = tokenize(query);
|
||||||
|
|
||||||
|
if (parts.isEmpty()) {
|
||||||
|
return new CompiledQuery<>(
|
||||||
|
CqExpression.empty(),
|
||||||
|
new CqData<>(new String[0])
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// We aren't interested in a binary tree representation, but an n-ary tree one,
|
||||||
|
// so a somewhat unusual parsing technique is used to avoid having an additional
|
||||||
|
// flattening step at the end.
|
||||||
|
|
||||||
|
// This is only possible due to the trivial and unambiguous grammar of the compiled queries
|
||||||
|
|
||||||
|
List<AndOrState> parenState = new ArrayList<>();
|
||||||
|
parenState.add(new AndOrState());
|
||||||
|
|
||||||
|
Map<String, Integer> wordIds = new HashMap<>();
|
||||||
|
|
||||||
|
for (var part : parts) {
|
||||||
|
var head = parenState.getLast();
|
||||||
|
|
||||||
|
if (part.equals("|")) {
|
||||||
|
head.or();
|
||||||
|
}
|
||||||
|
else if (part.equals("(")) {
|
||||||
|
parenState.addLast(new AndOrState());
|
||||||
|
}
|
||||||
|
else if (part.equals(")")) {
|
||||||
|
if (parenState.size() < 2) {
|
||||||
|
throw new IllegalStateException("Mismatched parentheses in expression: " + query);
|
||||||
|
}
|
||||||
|
parenState.removeLast();
|
||||||
|
parenState.getLast().and(head.closeOr());
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
head.and(
|
||||||
|
new CqExpression.Word(
|
||||||
|
wordIds.computeIfAbsent(part, p -> wordIds.size())
|
||||||
|
)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (parenState.size() != 1)
|
||||||
|
throw new IllegalStateException("Mismatched parentheses in expression: " + query);
|
||||||
|
|
||||||
|
// Construct the CompiledQuery object with String:s as leaves
|
||||||
|
var root = parenState.getLast().closeOr();
|
||||||
|
|
||||||
|
String[] cqData = new String[wordIds.size()];
|
||||||
|
wordIds.forEach((w, i) -> cqData[i] = w);
|
||||||
|
return new CompiledQuery<>(root, new CqData<>(cqData));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class AndOrState {
|
||||||
|
private List<CqExpression> andState = new ArrayList<>();
|
||||||
|
private List<CqExpression> orState = new ArrayList<>();
|
||||||
|
|
||||||
|
/** Add a new item to the and-list */
|
||||||
|
public void and(CqExpression e) {
|
||||||
|
andState.add(e);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Turn the and-list into an expression on the or-list, and then start a new and-list */
|
||||||
|
public void or() {
|
||||||
|
closeAnd();
|
||||||
|
|
||||||
|
andState = new ArrayList<>();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Turn the and-list into an And-expression in the or-list */
|
||||||
|
private void closeAnd() {
|
||||||
|
if (andState.size() == 1)
|
||||||
|
orState.add(andState.getFirst());
|
||||||
|
else if (!andState.isEmpty())
|
||||||
|
orState.add(new CqExpression.And(andState));
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Finalize the current and-list, then turn the or-list into an Or-expression */
|
||||||
|
public CqExpression closeOr() {
|
||||||
|
closeAnd();
|
||||||
|
|
||||||
|
if (orState.isEmpty())
|
||||||
|
return CqExpression.empty();
|
||||||
|
if (orState.size() == 1)
|
||||||
|
return orState.getFirst();
|
||||||
|
|
||||||
|
return new CqExpression.Or(orState);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static List<String> tokenize(String query) {
|
||||||
|
// Each token is guaranteed to be separated by one or more space characters
|
||||||
|
|
||||||
|
return Arrays.stream(StringUtils.split(query, ' '))
|
||||||
|
.filter(StringUtils::isNotBlank)
|
||||||
|
.toList();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,51 @@
|
|||||||
|
package nu.marginalia.api.searchquery.model.compiled;
|
||||||
|
|
||||||
|
import java.lang.reflect.Array;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.function.Function;
|
||||||
|
import java.util.function.ToDoubleFunction;
|
||||||
|
import java.util.function.ToLongFunction;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
|
public class CqData<T> {
|
||||||
|
private final T[] data;
|
||||||
|
|
||||||
|
public CqData(T[] data) {
|
||||||
|
this.data = data;
|
||||||
|
}
|
||||||
|
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
public <T2> CqData<T2> map(Class<T2> clazz, Function<T, T2> mapper) {
|
||||||
|
T2[] newData = (T2[]) Array.newInstance(clazz, data.length);
|
||||||
|
for (int i = 0; i < data.length; i++) {
|
||||||
|
newData[i] = mapper.apply((T) data[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
return new CqData<>(newData);
|
||||||
|
}
|
||||||
|
|
||||||
|
public CqDataLong mapToLong(ToLongFunction<T> mapper) {
|
||||||
|
long[] newData = new long[data.length];
|
||||||
|
for (int i = 0; i < data.length; i++) {
|
||||||
|
newData[i] = mapper.applyAsLong((T) data[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
return new CqDataLong(newData);
|
||||||
|
}
|
||||||
|
|
||||||
|
public T get(int i) {
|
||||||
|
return data[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
public T get(CqExpression.Word w) {
|
||||||
|
return data[w.idx()];
|
||||||
|
}
|
||||||
|
|
||||||
|
public Stream<T> stream() {
|
||||||
|
return Arrays.stream(data);
|
||||||
|
}
|
||||||
|
|
||||||
|
public int size() {
|
||||||
|
return data.length;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,27 @@
|
|||||||
|
package nu.marginalia.api.searchquery.model.compiled;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.stream.LongStream;
|
||||||
|
|
||||||
|
public class CqDataLong {
|
||||||
|
private final long[] data;
|
||||||
|
|
||||||
|
public CqDataLong(long[] data) {
|
||||||
|
this.data = data;
|
||||||
|
}
|
||||||
|
|
||||||
|
public long get(int i) {
|
||||||
|
return data[i];
|
||||||
|
}
|
||||||
|
public long get(CqExpression.Word w) {
|
||||||
|
return data[w.idx()];
|
||||||
|
}
|
||||||
|
|
||||||
|
public LongStream stream() {
|
||||||
|
return Arrays.stream(data);
|
||||||
|
}
|
||||||
|
|
||||||
|
public int size() {
|
||||||
|
return data.length;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,170 @@
|
|||||||
|
package nu.marginalia.api.searchquery.model.compiled;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.StringJoiner;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
|
/** Expression in a parsed index service query.
 * <p>
 * An expression is either an {@link And} or an {@link Or} of sub-expressions,
 * or a {@link Word} leaf that carries an integer index.  Expressions are evaluated
 * by passing one of the visitor interfaces declared here to the matching
 * <code>visit</code> overload.
 */
public sealed interface CqExpression {

    /** Stream of all the Word leaves beneath (and including) this expression */
    Stream<Word> stream();

    /** @see nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates */
    long visit(LongVisitor visitor);
    /** @see nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates */
    double visit(DoubleVisitor visitor);
    /** @see nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates */
    int visit(IntVisitor visitor);
    /** @see nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates */
    boolean visit(BoolVisitor visitor);

    <T> T visit(ObjectVisitor<T> visitor);

    /** The empty expression, represented as an Or with no parts */
    static CqExpression empty() {
        return new Or(List.of());
    }

    /** Renders the parts as a comma separated list between the prefix and a closing bracket */
    private static String describe(String prefix, List<? extends CqExpression> parts) {
        StringJoiner joiner = new StringJoiner(", ", prefix, "]");
        for (CqExpression part : parts) {
            joiner.add(part.toString());
        }
        return joiner.toString();
    }

    /** Conjunction of the provided parts */
    record And(List<? extends CqExpression> parts) implements CqExpression {
        @Override
        public Stream<Word> stream() {
            return parts.stream().flatMap(CqExpression::stream);
        }

        @Override
        public long visit(LongVisitor visitor) { return visitor.onAnd(parts); }

        @Override
        public double visit(DoubleVisitor visitor) { return visitor.onAnd(parts); }

        @Override
        public int visit(IntVisitor visitor) { return visitor.onAnd(parts); }

        @Override
        public boolean visit(BoolVisitor visitor) { return visitor.onAnd(parts); }

        @Override
        public <T> T visit(ObjectVisitor<T> visitor) { return visitor.onAnd(parts); }

        @Override
        public String toString() {
            return describe("And[ ", parts);
        }
    }

    /** Disjunction of the provided parts */
    record Or(List<? extends CqExpression> parts) implements CqExpression {
        @Override
        public Stream<Word> stream() {
            return parts.stream().flatMap(CqExpression::stream);
        }

        @Override
        public long visit(LongVisitor visitor) { return visitor.onOr(parts); }

        @Override
        public double visit(DoubleVisitor visitor) { return visitor.onOr(parts); }

        @Override
        public int visit(IntVisitor visitor) { return visitor.onOr(parts); }

        @Override
        public boolean visit(BoolVisitor visitor) { return visitor.onOr(parts); }

        @Override
        public <T> T visit(ObjectVisitor<T> visitor) { return visitor.onOr(parts); }

        @Override
        public String toString() {
            return describe("Or[ ", parts);
        }
    }

    /** Leaf expression, carrying the index of a word */
    record Word(int idx) implements CqExpression {
        @Override
        public Stream<Word> stream() {
            return Stream.of(this);
        }

        @Override
        public long visit(LongVisitor visitor) { return visitor.onLeaf(idx); }

        @Override
        public double visit(DoubleVisitor visitor) { return visitor.onLeaf(idx); }

        @Override
        public int visit(IntVisitor visitor) { return visitor.onLeaf(idx); }

        @Override
        public boolean visit(BoolVisitor visitor) { return visitor.onLeaf(idx); }

        @Override
        public <T> T visit(ObjectVisitor<T> visitor) { return visitor.onLeaf(idx); }

        @Override
        public String toString() {
            return String.valueOf(idx);
        }
    }

    /** Visitor that evaluates an expression to a long */
    interface LongVisitor {
        long onAnd(List<? extends CqExpression> parts);
        long onOr(List<? extends CqExpression> parts);
        long onLeaf(int idx);
    }

    /** Visitor that evaluates an expression to an int */
    interface IntVisitor {
        int onAnd(List<? extends CqExpression> parts);
        int onOr(List<? extends CqExpression> parts);
        int onLeaf(int idx);
    }

    /** Visitor that evaluates an expression to a boolean */
    interface BoolVisitor {
        boolean onAnd(List<? extends CqExpression> parts);
        boolean onOr(List<? extends CqExpression> parts);
        boolean onLeaf(int idx);
    }

    /** Visitor that evaluates an expression to a double */
    interface DoubleVisitor {
        double onAnd(List<? extends CqExpression> parts);
        double onOr(List<? extends CqExpression> parts);
        double onLeaf(int idx);
    }

    /** Visitor that evaluates an expression to an object of type T */
    interface ObjectVisitor<T> {
        T onAnd(List<? extends CqExpression> parts);
        T onOr(List<? extends CqExpression> parts);
        T onLeaf(int idx);
    }

}
|
@ -0,0 +1,46 @@
|
|||||||
|
package nu.marginalia.api.searchquery.model.compiled.aggregate;
|
||||||
|
|
||||||
|
import it.unimi.dsi.fastutil.longs.LongSet;
|
||||||
|
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||||
|
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.function.*;
|
||||||
|
|
||||||
|
public class CompiledQueryAggregates {
|
||||||
|
/** Compiled query aggregate that for a single boolean that treats or-branches as logical OR,
|
||||||
|
* and and-branches as logical AND operations. Will return true if there exists a path through
|
||||||
|
* the query where the provided predicate returns true for each item.
|
||||||
|
*/
|
||||||
|
static public <T> boolean booleanAggregate(CompiledQuery<T> query, Predicate<T> predicate) {
|
||||||
|
return query.root.visit(new CqBooleanAggregate(query, predicate));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/** Compiled query aggregate that for a 64b bitmask that treats or-branches as logical OR,
|
||||||
|
* and and-branches as logical AND operations.
|
||||||
|
*/
|
||||||
|
public static <T> long longBitmaskAggregate(CompiledQuery<T> query, ToLongFunction<T> operator) {
|
||||||
|
return query.root.visit(new CqLongBitmaskOperator(query, operator));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/** Apply the operator to each leaf node, then return the highest minimum value found along any path */
|
||||||
|
public static <T> int intMaxMinAggregate(CompiledQuery<T> query, ToIntFunction<T> operator) {
|
||||||
|
return query.root.visit(new CqIntMaxMinOperator(query, operator));
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Apply the operator to each leaf node, and then return the highest sum of values possible
|
||||||
|
* through each branch in the compiled query.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public static <T> double doubleSumAggregate(CompiledQuery<T> query, ToDoubleFunction<T> operator) {
|
||||||
|
return query.root.visit(new CqDoubleSumOperator(query, operator));
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Enumerate all possible paths through the compiled query */
|
||||||
|
public static List<LongSet> queriesAggregate(CompiledQueryLong query) {
|
||||||
|
return new ArrayList<>(query.root().visit(new CqQueryPathsOperator(query)));
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,40 @@
|
|||||||
|
package nu.marginalia.api.searchquery.model.compiled.aggregate;
|
||||||
|
|
||||||
|
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||||
|
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.function.IntPredicate;
|
||||||
|
import java.util.function.Predicate;
|
||||||
|
|
||||||
|
public class CqBooleanAggregate implements CqExpression.BoolVisitor {
|
||||||
|
|
||||||
|
private final IntPredicate predicate;
|
||||||
|
|
||||||
|
public <T> CqBooleanAggregate(CompiledQuery<T> query, Predicate<T> objPred) {
|
||||||
|
this.predicate = idx -> objPred.test(query.at(idx));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean onAnd(List<? extends CqExpression> parts) {
|
||||||
|
for (var part : parts) {
|
||||||
|
if (!part.visit(this)) // short-circuit
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean onOr(List<? extends CqExpression> parts) {
|
||||||
|
for (var part : parts) {
|
||||||
|
if (part.visit(this)) // short-circuit
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean onLeaf(int idx) {
|
||||||
|
return predicate.test(idx);
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,40 @@
|
|||||||
|
package nu.marginalia.api.searchquery.model.compiled.aggregate;
|
||||||
|
|
||||||
|
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||||
|
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.function.IntToDoubleFunction;
|
||||||
|
import java.util.function.ToDoubleFunction;
|
||||||
|
|
||||||
|
public class CqDoubleSumOperator implements CqExpression.DoubleVisitor {
|
||||||
|
|
||||||
|
private final IntToDoubleFunction operator;
|
||||||
|
|
||||||
|
public <T> CqDoubleSumOperator(CompiledQuery<T> query, ToDoubleFunction<T> operator) {
|
||||||
|
this.operator = idx -> operator.applyAsDouble(query.at(idx));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public double onAnd(List<? extends CqExpression> parts) {
|
||||||
|
double value = 0;
|
||||||
|
for (var part : parts) {
|
||||||
|
value += part.visit(this);
|
||||||
|
}
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public double onOr(List<? extends CqExpression> parts) {
|
||||||
|
double value = parts.getFirst().visit(this);
|
||||||
|
for (int i = 1; i < parts.size(); i++) {
|
||||||
|
value = Math.max(value, parts.get(i).visit(this));
|
||||||
|
}
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public double onLeaf(int idx) {
|
||||||
|
return operator.applyAsDouble(idx);
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,41 @@
|
|||||||
|
package nu.marginalia.api.searchquery.model.compiled.aggregate;
|
||||||
|
|
||||||
|
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||||
|
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.function.IntUnaryOperator;
|
||||||
|
import java.util.function.ToIntFunction;
|
||||||
|
|
||||||
|
public class CqIntMaxMinOperator implements CqExpression.IntVisitor {
|
||||||
|
|
||||||
|
private final IntUnaryOperator operator;
|
||||||
|
|
||||||
|
|
||||||
|
public <T> CqIntMaxMinOperator(CompiledQuery<T> query, ToIntFunction<T> operator) {
|
||||||
|
this.operator = idx -> operator.applyAsInt(query.at(idx));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int onAnd(List<? extends CqExpression> parts) {
|
||||||
|
int value = parts.getFirst().visit(this);
|
||||||
|
for (int i = 1; i < parts.size(); i++) {
|
||||||
|
value = Math.min(value, parts.get(i).visit(this));
|
||||||
|
}
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int onOr(List<? extends CqExpression> parts) {
|
||||||
|
int value = parts.getFirst().visit(this);
|
||||||
|
for (int i = 1; i < parts.size(); i++) {
|
||||||
|
value = Math.max(value, parts.get(i).visit(this));
|
||||||
|
}
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int onLeaf(int idx) {
|
||||||
|
return operator.applyAsInt(idx);
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,40 @@
|
|||||||
|
package nu.marginalia.api.searchquery.model.compiled.aggregate;
|
||||||
|
|
||||||
|
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||||
|
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.function.IntToLongFunction;
|
||||||
|
import java.util.function.ToLongFunction;
|
||||||
|
|
||||||
|
public class CqLongBitmaskOperator implements CqExpression.LongVisitor {
|
||||||
|
|
||||||
|
private final IntToLongFunction operator;
|
||||||
|
|
||||||
|
public <T> CqLongBitmaskOperator(CompiledQuery<T> query, ToLongFunction<T> operator) {
|
||||||
|
this.operator = idx-> operator.applyAsLong(query.at(idx));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long onAnd(List<? extends CqExpression> parts) {
|
||||||
|
long value = ~0L;
|
||||||
|
for (var part : parts) {
|
||||||
|
value &= part.visit(this);
|
||||||
|
}
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long onOr(List<? extends CqExpression> parts) {
|
||||||
|
long value = 0L;
|
||||||
|
for (var part : parts) {
|
||||||
|
value |= part.visit(this);
|
||||||
|
}
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long onLeaf(int idx) {
|
||||||
|
return operator.applyAsLong(idx);
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,75 @@
|
|||||||
|
package nu.marginalia.api.searchquery.model.compiled.aggregate;
|
||||||
|
|
||||||
|
import it.unimi.dsi.fastutil.longs.LongArraySet;
|
||||||
|
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
|
||||||
|
import it.unimi.dsi.fastutil.longs.LongSet;
|
||||||
|
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
|
||||||
|
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
public class CqQueryPathsOperator implements CqExpression.ObjectVisitor<List<LongSet>> {
|
||||||
|
private final CompiledQueryLong query;
|
||||||
|
|
||||||
|
public CqQueryPathsOperator(CompiledQueryLong query) {
|
||||||
|
this.query = query;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<LongSet> onAnd(List<? extends CqExpression> parts) {
|
||||||
|
return parts.stream()
|
||||||
|
.map(expr -> expr.visit(this))
|
||||||
|
.reduce(List.of(), this::combineAnd);
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<LongSet> combineAnd(List<LongSet> a, List<LongSet> b) {
|
||||||
|
// No-op cases
|
||||||
|
if (a.isEmpty())
|
||||||
|
return b;
|
||||||
|
if (b.isEmpty())
|
||||||
|
return a;
|
||||||
|
|
||||||
|
// Simple cases
|
||||||
|
if (a.size() == 1) {
|
||||||
|
b.forEach(set -> set.addAll(a.getFirst()));
|
||||||
|
return b;
|
||||||
|
}
|
||||||
|
else if (b.size() == 1) {
|
||||||
|
a.forEach(set -> set.addAll(b.getFirst()));
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Case where we AND two ORs
|
||||||
|
List<LongSet> ret = new ArrayList<>();
|
||||||
|
|
||||||
|
for (var aPart : a) {
|
||||||
|
for (var bPart : b) {
|
||||||
|
LongSet set = new LongOpenHashSet(aPart.size() + bPart.size());
|
||||||
|
set.addAll(aPart);
|
||||||
|
set.addAll(bPart);
|
||||||
|
ret.add(set);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<LongSet> onOr(List<? extends CqExpression> parts) {
|
||||||
|
List<LongSet> ret = new ArrayList<>();
|
||||||
|
|
||||||
|
for (var part : parts) {
|
||||||
|
ret.addAll(part.visit(this));
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<LongSet> onLeaf(int idx) {
|
||||||
|
var set = new LongArraySet(1);
|
||||||
|
set.add(query.at(idx));
|
||||||
|
return List.of(set);
|
||||||
|
}
|
||||||
|
}
|
@ -13,10 +13,6 @@ public record QueryResponse(SearchSpecification specs,
|
|||||||
String domain)
|
String domain)
|
||||||
{
|
{
|
||||||
public Set<String> getAllKeywords() {
|
public Set<String> getAllKeywords() {
|
||||||
Set<String> keywords = new HashSet<>(100);
|
return new HashSet<>(specs.query.searchTermsInclude);
|
||||||
for (var sq : specs.subqueries) {
|
|
||||||
keywords.addAll(sq.searchTermsInclude);
|
|
||||||
}
|
|
||||||
return keywords;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -13,9 +13,12 @@ import java.util.stream.Collectors;
|
|||||||
@AllArgsConstructor
|
@AllArgsConstructor
|
||||||
@With
|
@With
|
||||||
@EqualsAndHashCode
|
@EqualsAndHashCode
|
||||||
public class SearchSubquery {
|
public class SearchQuery {
|
||||||
|
|
||||||
/** These terms must be present in the document and are used in ranking*/
|
/** An infix style expression that encodes the required terms in the query */
|
||||||
|
public final String compiledQuery;
|
||||||
|
|
||||||
|
/** All terms that appear in {@link #compiledQuery} */
|
||||||
public final List<String> searchTermsInclude;
|
public final List<String> searchTermsInclude;
|
||||||
|
|
||||||
/** These terms must be absent from the document */
|
/** These terms must be absent from the document */
|
||||||
@ -33,7 +36,8 @@ public class SearchSubquery {
|
|||||||
@Deprecated // why does this exist?
|
@Deprecated // why does this exist?
|
||||||
private double value = 0;
|
private double value = 0;
|
||||||
|
|
||||||
public SearchSubquery() {
|
public SearchQuery() {
|
||||||
|
this.compiledQuery = "";
|
||||||
this.searchTermsInclude = new ArrayList<>();
|
this.searchTermsInclude = new ArrayList<>();
|
||||||
this.searchTermsExclude = new ArrayList<>();
|
this.searchTermsExclude = new ArrayList<>();
|
||||||
this.searchTermsAdvice = new ArrayList<>();
|
this.searchTermsAdvice = new ArrayList<>();
|
||||||
@ -41,11 +45,13 @@ public class SearchSubquery {
|
|||||||
this.searchTermCoherences = new ArrayList<>();
|
this.searchTermCoherences = new ArrayList<>();
|
||||||
}
|
}
|
||||||
|
|
||||||
public SearchSubquery(List<String> searchTermsInclude,
|
public SearchQuery(String compiledQuery,
|
||||||
List<String> searchTermsExclude,
|
List<String> searchTermsInclude,
|
||||||
List<String> searchTermsAdvice,
|
List<String> searchTermsExclude,
|
||||||
List<String> searchTermsPriority,
|
List<String> searchTermsAdvice,
|
||||||
List<List<String>> searchTermCoherences) {
|
List<String> searchTermsPriority,
|
||||||
|
List<List<String>> searchTermCoherences) {
|
||||||
|
this.compiledQuery = compiledQuery;
|
||||||
this.searchTermsInclude = searchTermsInclude;
|
this.searchTermsInclude = searchTermsInclude;
|
||||||
this.searchTermsExclude = searchTermsExclude;
|
this.searchTermsExclude = searchTermsExclude;
|
||||||
this.searchTermsAdvice = searchTermsAdvice;
|
this.searchTermsAdvice = searchTermsAdvice;
|
||||||
@ -54,7 +60,7 @@ public class SearchSubquery {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Deprecated // why does this exist?
|
@Deprecated // why does this exist?
|
||||||
public SearchSubquery setValue(double value) {
|
public SearchQuery setValue(double value) {
|
||||||
if (Double.isInfinite(value) || Double.isNaN(value)) {
|
if (Double.isInfinite(value) || Double.isNaN(value)) {
|
||||||
this.value = Double.MAX_VALUE;
|
this.value = Double.MAX_VALUE;
|
||||||
} else {
|
} else {
|
||||||
@ -66,7 +72,7 @@ public class SearchSubquery {
|
|||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
StringBuilder sb = new StringBuilder();
|
StringBuilder sb = new StringBuilder();
|
||||||
if (!searchTermsInclude.isEmpty()) sb.append("include=").append(searchTermsInclude.stream().collect(Collectors.joining(",", "[", "] ")));
|
if (!compiledQuery.isEmpty()) sb.append("compiledQuery=").append(compiledQuery);
|
||||||
if (!searchTermsExclude.isEmpty()) sb.append("exclude=").append(searchTermsExclude.stream().collect(Collectors.joining(",", "[", "] ")));
|
if (!searchTermsExclude.isEmpty()) sb.append("exclude=").append(searchTermsExclude.stream().collect(Collectors.joining(",", "[", "] ")));
|
||||||
if (!searchTermsAdvice.isEmpty()) sb.append("advice=").append(searchTermsAdvice.stream().collect(Collectors.joining(",", "[", "] ")));
|
if (!searchTermsAdvice.isEmpty()) sb.append("advice=").append(searchTermsAdvice.stream().collect(Collectors.joining(",", "[", "] ")));
|
||||||
if (!searchTermsPriority.isEmpty()) sb.append("priority=").append(searchTermsPriority.stream().collect(Collectors.joining(",", "[", "] ")));
|
if (!searchTermsPriority.isEmpty()) sb.append("priority=").append(searchTermsPriority.stream().collect(Collectors.joining(",", "[", "] ")));
|
@ -10,7 +10,7 @@ import java.util.List;
|
|||||||
|
|
||||||
@ToString @Getter @Builder @With @AllArgsConstructor
|
@ToString @Getter @Builder @With @AllArgsConstructor
|
||||||
public class SearchSpecification {
|
public class SearchSpecification {
|
||||||
public List<SearchSubquery> subqueries;
|
public SearchQuery query;
|
||||||
|
|
||||||
/** If present and not empty, limit the search to these domain IDs */
|
/** If present and not empty, limit the search to these domain IDs */
|
||||||
public List<Integer> domains;
|
public List<Integer> domains;
|
||||||
|
@ -21,9 +21,9 @@ public class SearchResultItem implements Comparable<SearchResultItem> {
|
|||||||
/** How many other potential results existed in the same domain */
|
/** How many other potential results existed in the same domain */
|
||||||
public int resultsFromDomain;
|
public int resultsFromDomain;
|
||||||
|
|
||||||
public SearchResultItem(long combinedId, int scoresCount) {
|
public SearchResultItem(long combinedId) {
|
||||||
this.combinedId = combinedId;
|
this.combinedId = combinedId;
|
||||||
this.keywordScores = new ArrayList<>(scoresCount);
|
this.keywordScores = new ArrayList<>();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -7,19 +7,22 @@ import nu.marginalia.model.idx.DocumentMetadata;
|
|||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
|
|
||||||
public final class SearchResultKeywordScore {
|
public final class SearchResultKeywordScore {
|
||||||
|
@Deprecated
|
||||||
public final int subquery;
|
public final int subquery;
|
||||||
|
public final long termId;
|
||||||
public final String keyword;
|
public final String keyword;
|
||||||
private final long encodedWordMetadata;
|
private final long encodedWordMetadata;
|
||||||
private final long encodedDocMetadata;
|
private final long encodedDocMetadata;
|
||||||
|
|
||||||
private final int htmlFeatures;
|
private final int htmlFeatures;
|
||||||
|
|
||||||
public SearchResultKeywordScore(int subquery,
|
public SearchResultKeywordScore(String keyword,
|
||||||
String keyword,
|
long termId,
|
||||||
long encodedWordMetadata,
|
long encodedWordMetadata,
|
||||||
long encodedDocMetadata,
|
long encodedDocMetadata,
|
||||||
int htmlFeatures) {
|
int htmlFeatures) {
|
||||||
this.subquery = subquery;
|
this.termId = termId;
|
||||||
|
this.subquery = -1; // FIXME, deprecated
|
||||||
this.keyword = keyword;
|
this.keyword = keyword;
|
||||||
this.encodedWordMetadata = encodedWordMetadata;
|
this.encodedWordMetadata = encodedWordMetadata;
|
||||||
this.encodedDocMetadata = encodedDocMetadata;
|
this.encodedDocMetadata = encodedDocMetadata;
|
||||||
|
@ -52,7 +52,7 @@ message RpcTemporalBias {
|
|||||||
|
|
||||||
/* Index service query request */
|
/* Index service query request */
|
||||||
message RpcIndexQuery {
|
message RpcIndexQuery {
|
||||||
repeated RpcSubquery subqueries = 1;
|
RpcQuery query = 1;
|
||||||
repeated int32 domains = 2; // (optional) A list of domain IDs to consider
|
repeated int32 domains = 2; // (optional) A list of domain IDs to consider
|
||||||
string searchSetIdentifier = 3; // (optional) A named set of domains to consider
|
string searchSetIdentifier = 3; // (optional) A named set of domains to consider
|
||||||
string humanQuery = 4; // The search query as the user entered it
|
string humanQuery = 4; // The search query as the user entered it
|
||||||
@ -102,12 +102,11 @@ message RpcRawResultItem {
|
|||||||
|
|
||||||
/* Information about how well a keyword matches a query */
|
/* Information about how well a keyword matches a query */
|
||||||
message RpcResultKeywordScore {
|
message RpcResultKeywordScore {
|
||||||
int32 subquery = 1; // index of the subquery this keyword relates to
|
string keyword = 1; // the keyword
|
||||||
string keyword = 2; // the keyword
|
int64 encodedWordMetadata = 2; // bit encoded word metadata
|
||||||
int64 encodedWordMetadata = 3; // bit encoded word metadata
|
int64 encodedDocMetadata = 3; // bit encoded document metadata
|
||||||
int64 encodedDocMetadata = 4; // bit encoded document metadata
|
bool hasPriorityTerms = 4; // true if this word is important to the document
|
||||||
bool hasPriorityTerms = 5; // true if this word is important to the document
|
int32 htmlFeatures = 5; // bit encoded document features
|
||||||
int32 htmlFeatures = 6; // bit encoded document features
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Query execution parameters */
|
/* Query execution parameters */
|
||||||
@ -137,12 +136,13 @@ message RpcResultRankingParameters {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Defines a single subquery */
|
/* Defines the search query */
|
||||||
message RpcSubquery {
|
message RpcQuery {
|
||||||
repeated string include = 1; // These terms must be present
|
repeated string include = 1; // These terms must be present
|
||||||
repeated string exclude = 2; // These terms must be absent
|
repeated string exclude = 2; // These terms must be absent
|
||||||
repeated string advice = 3; // These terms must be present, but do not affect ranking
|
repeated string advice = 3; // These terms must be present, but do not affect ranking
|
||||||
repeated string priority = 4; // These terms are not mandatory, but affect ranking positively if they are present
|
repeated string priority = 4; // These terms are not mandatory, but affect ranking positively if they are present
|
||||||
repeated RpcCoherences coherences = 5; // Groups of terms that must exist in proximity of each other
|
repeated RpcCoherences coherences = 5; // Groups of terms that must exist in proximity of each other
|
||||||
|
string compiledQuery = 6; // Compiled query in infix notation
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Defines a group of search terms that must exist in close proximity within the document */
|
/* Defines a group of search terms that must exist in close proximity within the document */
|
||||||
|
@ -0,0 +1,79 @@
|
|||||||
|
package nu.marginalia.api.searchquery.model.compiled;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
|
|
||||||
|
class CompiledQueryParserTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testEmpty() {
|
||||||
|
assertEquals(CqExpression.empty(), CompiledQueryParser.parse("").root);
|
||||||
|
assertEquals(CqExpression.empty(), CompiledQueryParser.parse("( )").root);
|
||||||
|
assertEquals(CqExpression.empty(), CompiledQueryParser.parse("( | )").root);
|
||||||
|
assertEquals(CqExpression.empty(), CompiledQueryParser.parse("| ( | ) |").root);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testSingleWord() {
|
||||||
|
CompiledQuery<String> q = CompiledQueryParser.parse("foo");
|
||||||
|
assertEquals(w(q, "foo"), q.root);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testAndTwoWords() {
|
||||||
|
CompiledQuery<String> q = CompiledQueryParser.parse("foo bar");
|
||||||
|
assertEquals(and(w(q, "foo"), w(q,"bar")), q.root);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testOrTwoWords() {
|
||||||
|
CompiledQuery<String> q = CompiledQueryParser.parse("foo | bar");
|
||||||
|
assertEquals(or(w(q, "foo"), w(q,"bar")), q.root);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testOrAndWords() {
|
||||||
|
CompiledQuery<String> q = CompiledQueryParser.parse("foo | bar baz");
|
||||||
|
assertEquals(or(w(q,"foo"), and(w(q,"bar"), w(q,"baz"))), q.root);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testAndAndOrAndAndWords() {
|
||||||
|
CompiledQuery<String> q = CompiledQueryParser.parse("foo foobar | bar baz");
|
||||||
|
assertEquals(or(
|
||||||
|
and(w(q, "foo"), w(q, "foobar")),
|
||||||
|
and(w(q, "bar"), w(q, "baz")))
|
||||||
|
, q.root);
|
||||||
|
}
|
||||||
|
@Test
|
||||||
|
public void testComplex1() {
|
||||||
|
CompiledQuery<String> q = CompiledQueryParser.parse("foo ( bar | baz ) quux");
|
||||||
|
assertEquals(and(w(q,"foo"), or(w(q, "bar"), w(q, "baz")), w(q, "quux")), q.root);
|
||||||
|
}
|
||||||
|
@Test
|
||||||
|
public void testComplex2() {
|
||||||
|
CompiledQuery<String> q = CompiledQueryParser.parse("( ( ( a ) b ) c ) d");
|
||||||
|
assertEquals(and(and(and(w(q, "a"), w(q, "b")), w(q, "c")), w(q, "d")), q.root);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testNested() {
|
||||||
|
CompiledQuery<String> q = CompiledQueryParser.parse("( ( ( a ) ) )");
|
||||||
|
assertEquals(w(q,"a"), q.root);
|
||||||
|
}
|
||||||
|
|
||||||
|
private CqExpression.Word w(CompiledQuery<String> query, String word) {
|
||||||
|
return new CqExpression.Word(query.indices().filter(idx -> word.equals(query.at(idx))).findAny().orElseThrow());
|
||||||
|
}
|
||||||
|
|
||||||
|
private CqExpression and(CqExpression... parts) {
|
||||||
|
return new CqExpression.And(List.of(parts));
|
||||||
|
}
|
||||||
|
|
||||||
|
private CqExpression or(CqExpression... parts) {
|
||||||
|
return new CqExpression.Or(List.of(parts));
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,35 @@
|
|||||||
|
package nu.marginalia.api.searchquery.model.compiled.aggregate;
|
||||||
|
|
||||||
|
import static nu.marginalia.api.searchquery.model.compiled.CompiledQueryParser.parse;
|
||||||
|
import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.*;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
|
|
||||||
|
class CompiledQueryAggregatesTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void booleanAggregates() {
|
||||||
|
assertFalse(booleanAggregate(parse("false"), Boolean::parseBoolean));
|
||||||
|
assertTrue(booleanAggregate(parse("true"), Boolean::parseBoolean));
|
||||||
|
assertFalse(booleanAggregate(parse("false true"), Boolean::parseBoolean));
|
||||||
|
assertTrue(booleanAggregate(parse("( true ) | ( true false )"), Boolean::parseBoolean));
|
||||||
|
assertTrue(booleanAggregate(parse("( false ) | ( true )"), Boolean::parseBoolean));
|
||||||
|
assertTrue(booleanAggregate(parse("( true false ) | ( true true )"), Boolean::parseBoolean));
|
||||||
|
assertFalse(booleanAggregate(parse("( true false ) | ( true false )"), Boolean::parseBoolean));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void intMaxMinAggregates() {
|
||||||
|
assertEquals(5, intMaxMinAggregate(parse("5"), Integer::parseInt));
|
||||||
|
assertEquals(3, intMaxMinAggregate(parse("5 3"), Integer::parseInt));
|
||||||
|
assertEquals(6, intMaxMinAggregate(parse("5 3 | 6 7"), Integer::parseInt));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void doubleSumAggregates() {
|
||||||
|
assertEquals(5, (int) doubleSumAggregate(parse("5"), Double::parseDouble));
|
||||||
|
assertEquals(8, (int) doubleSumAggregate(parse("5 3"), Double::parseDouble));
|
||||||
|
assertEquals(13, (int) doubleSumAggregate(parse("1 ( 5 3 | 2 10 )"), Double::parseDouble));
|
||||||
|
}
|
||||||
|
}
|
@ -1,7 +1,7 @@
|
|||||||
package nu.marginalia.index.client;
|
package nu.marginalia.index.client;
|
||||||
|
|
||||||
import nu.marginalia.api.searchquery.IndexProtobufCodec;
|
import nu.marginalia.api.searchquery.IndexProtobufCodec;
|
||||||
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
|
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||||
import nu.marginalia.index.query.limit.QueryLimits;
|
import nu.marginalia.index.query.limit.QueryLimits;
|
||||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||||
@ -35,14 +35,15 @@ class IndexProtobufCodecTest {
|
|||||||
}
|
}
|
||||||
@Test
|
@Test
|
||||||
public void testSubqery() {
|
public void testSubqery() {
|
||||||
verifyIsIdentityTransformation(new SearchSubquery(
|
verifyIsIdentityTransformation(new SearchQuery(
|
||||||
|
"qs",
|
||||||
List.of("a", "b"),
|
List.of("a", "b"),
|
||||||
List.of("c", "d"),
|
List.of("c", "d"),
|
||||||
List.of("e", "f"),
|
List.of("e", "f"),
|
||||||
List.of("g", "h"),
|
List.of("g", "h"),
|
||||||
List.of(List.of("i", "j"), List.of("k"))
|
List.of(List.of("i", "j"), List.of("k"))
|
||||||
),
|
),
|
||||||
s -> IndexProtobufCodec.convertSearchSubquery(IndexProtobufCodec.convertSearchSubquery(s))
|
s -> IndexProtobufCodec.convertRpcQuery(IndexProtobufCodec.convertRpcQuery(s))
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
private <T> void verifyIsIdentityTransformation(T val, Function<T,T> transformation) {
|
private <T> void verifyIsIdentityTransformation(T val, Function<T,T> transformation) {
|
||||||
|
@ -2,18 +2,16 @@ package nu.marginalia.functions.searchquery.svc;
|
|||||||
|
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
import nu.marginalia.LanguageModels;
|
|
||||||
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
||||||
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
|
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||||
import nu.marginalia.util.language.EnglishDictionary;
|
import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
|
||||||
import nu.marginalia.language.WordPatterns;
|
import nu.marginalia.language.WordPatterns;
|
||||||
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||||
import nu.marginalia.api.searchquery.model.query.ProcessedQuery;
|
import nu.marginalia.api.searchquery.model.query.ProcessedQuery;
|
||||||
import nu.marginalia.functions.searchquery.query_parser.QueryParser;
|
import nu.marginalia.functions.searchquery.query_parser.QueryParser;
|
||||||
import nu.marginalia.functions.searchquery.query_parser.token.Token;
|
import nu.marginalia.functions.searchquery.query_parser.token.Token;
|
||||||
import nu.marginalia.functions.searchquery.query_parser.token.TokenType;
|
import nu.marginalia.functions.searchquery.query_parser.token.TokenType;
|
||||||
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
@ -26,15 +24,14 @@ import java.util.List;
|
|||||||
public class QueryFactory {
|
public class QueryFactory {
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
|
||||||
private static final int RETAIN_QUERY_VARIANT_COUNT = 5;
|
|
||||||
private final QueryParser queryParser = new QueryParser();
|
private final QueryParser queryParser = new QueryParser();
|
||||||
|
private final QueryExpansion queryExpansion;
|
||||||
|
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public QueryFactory(LanguageModels lm,
|
public QueryFactory(QueryExpansion queryExpansion)
|
||||||
TermFrequencyDict dict,
|
|
||||||
EnglishDictionary englishDictionary)
|
|
||||||
{
|
{
|
||||||
|
this.queryExpansion = queryExpansion;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -49,8 +46,6 @@ public class QueryFactory {
|
|||||||
List<String> searchTermsHuman = new ArrayList<>();
|
List<String> searchTermsHuman = new ArrayList<>();
|
||||||
List<String> problems = new ArrayList<>();
|
List<String> problems = new ArrayList<>();
|
||||||
|
|
||||||
String domain = null;
|
|
||||||
|
|
||||||
List<Token> basicQuery = queryParser.parse(query);
|
List<Token> basicQuery = queryParser.parse(query);
|
||||||
|
|
||||||
if (basicQuery.size() >= 12) {
|
if (basicQuery.size() >= 12) {
|
||||||
@ -74,19 +69,8 @@ public class QueryFactory {
|
|||||||
t.visit(qualityLimits);
|
t.visit(qualityLimits);
|
||||||
}
|
}
|
||||||
|
|
||||||
// var queryPermutations = queryPermutation.permuteQueriesNew(basicQuery);
|
|
||||||
List<SearchSubquery> subqueries = new ArrayList<>();
|
|
||||||
QuerySearchTermsAccumulator termsAccumulator = new QuerySearchTermsAccumulator(basicQuery);
|
QuerySearchTermsAccumulator termsAccumulator = new QuerySearchTermsAccumulator(basicQuery);
|
||||||
domain = termsAccumulator.domain;
|
String domain = termsAccumulator.domain;
|
||||||
|
|
||||||
// for (var parts : queryPermutations) {
|
|
||||||
// QuerySearchTermsAccumulator termsAccumulator = new QuerySearchTermsAccumulator(basicQuery);
|
|
||||||
//
|
|
||||||
// domain = termsAccumulator.domain;
|
|
||||||
//
|
|
||||||
// SearchSubquery subquery = termsAccumulator.createSubquery();
|
|
||||||
// subqueries.add(subquery);
|
|
||||||
// }
|
|
||||||
|
|
||||||
List<Integer> domainIds = params.domainIds();
|
List<Integer> domainIds = params.domainIds();
|
||||||
|
|
||||||
@ -97,7 +81,18 @@ public class QueryFactory {
|
|||||||
}
|
}
|
||||||
|
|
||||||
var specsBuilder = SearchSpecification.builder()
|
var specsBuilder = SearchSpecification.builder()
|
||||||
.subqueries(subqueries)
|
.query(
|
||||||
|
new SearchQuery(
|
||||||
|
queryExpansion.expandQuery(
|
||||||
|
termsAccumulator.searchTermsInclude
|
||||||
|
),
|
||||||
|
termsAccumulator.searchTermsInclude,
|
||||||
|
termsAccumulator.searchTermsExclude,
|
||||||
|
termsAccumulator.searchTermsAdvice,
|
||||||
|
termsAccumulator.searchTermsPriority,
|
||||||
|
termsAccumulator.searchTermCoherences
|
||||||
|
)
|
||||||
|
)
|
||||||
.humanQuery(query)
|
.humanQuery(query)
|
||||||
.quality(qualityLimits.qualityLimit)
|
.quality(qualityLimits.qualityLimit)
|
||||||
.year(qualityLimits.year)
|
.year(qualityLimits.year)
|
||||||
@ -111,12 +106,9 @@ public class QueryFactory {
|
|||||||
|
|
||||||
SearchSpecification specs = specsBuilder.build();
|
SearchSpecification specs = specsBuilder.build();
|
||||||
|
|
||||||
for (var sq : specs.subqueries) {
|
specs.query.searchTermsAdvice.addAll(params.tacitAdvice());
|
||||||
sq.searchTermsAdvice.addAll(params.tacitAdvice());
|
specs.query.searchTermsPriority.addAll(params.tacitPriority());
|
||||||
sq.searchTermsPriority.addAll(params.tacitPriority());
|
specs.query.searchTermsExclude.addAll(params.tacitExcludes());
|
||||||
sq.searchTermsInclude.addAll(params.tacitIncludes());
|
|
||||||
sq.searchTermsExclude.addAll(params.tacitExcludes());
|
|
||||||
}
|
|
||||||
|
|
||||||
return new ProcessedQuery(specs, searchTermsHuman, domain);
|
return new ProcessedQuery(specs, searchTermsHuman, domain);
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
package nu.marginalia.functions.searchquery.svc;
|
package nu.marginalia.functions.searchquery.svc;
|
||||||
|
|
||||||
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
|
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||||
import nu.marginalia.language.WordPatterns;
|
import nu.marginalia.language.WordPatterns;
|
||||||
import nu.marginalia.functions.searchquery.query_parser.token.Token;
|
import nu.marginalia.functions.searchquery.query_parser.token.Token;
|
||||||
import nu.marginalia.functions.searchquery.query_parser.token.TokenVisitor;
|
import nu.marginalia.functions.searchquery.query_parser.token.TokenVisitor;
|
||||||
@ -9,7 +9,7 @@ import java.util.ArrayList;
|
|||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
/** @see SearchSubquery */
|
/** @see SearchQuery */
|
||||||
public class QuerySearchTermsAccumulator implements TokenVisitor {
|
public class QuerySearchTermsAccumulator implements TokenVisitor {
|
||||||
public List<String> searchTermsExclude = new ArrayList<>();
|
public List<String> searchTermsExclude = new ArrayList<>();
|
||||||
public List<String> searchTermsInclude = new ArrayList<>();
|
public List<String> searchTermsInclude = new ArrayList<>();
|
||||||
@ -19,10 +19,6 @@ public class QuerySearchTermsAccumulator implements TokenVisitor {
|
|||||||
|
|
||||||
public String domain;
|
public String domain;
|
||||||
|
|
||||||
public SearchSubquery createSubquery() {
|
|
||||||
return new SearchSubquery(searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority, searchTermCoherences);
|
|
||||||
}
|
|
||||||
|
|
||||||
public QuerySearchTermsAccumulator(List<Token> parts) {
|
public QuerySearchTermsAccumulator(List<Token> parts) {
|
||||||
for (Token t : parts) {
|
for (Token t : parts) {
|
||||||
t.visit(this);
|
t.visit(this);
|
||||||
|
@ -3,12 +3,13 @@ package nu.marginalia.query.svc;
|
|||||||
import nu.marginalia.WmsaHome;
|
import nu.marginalia.WmsaHome;
|
||||||
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
||||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||||
|
import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
|
||||||
import nu.marginalia.functions.searchquery.svc.QueryFactory;
|
import nu.marginalia.functions.searchquery.svc.QueryFactory;
|
||||||
import nu.marginalia.index.query.limit.QueryLimits;
|
import nu.marginalia.index.query.limit.QueryLimits;
|
||||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||||
import nu.marginalia.index.query.limit.SpecificationLimitType;
|
import nu.marginalia.index.query.limit.SpecificationLimitType;
|
||||||
import nu.marginalia.util.language.EnglishDictionary;
|
import nu.marginalia.segmentation.NgramLexicon;
|
||||||
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||||
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
||||||
import org.junit.jupiter.api.BeforeAll;
|
import org.junit.jupiter.api.BeforeAll;
|
||||||
@ -27,11 +28,9 @@ public class QueryFactoryTest {
|
|||||||
public static void setUpAll() throws IOException {
|
public static void setUpAll() throws IOException {
|
||||||
|
|
||||||
var lm = WmsaHome.getLanguageModels();
|
var lm = WmsaHome.getLanguageModels();
|
||||||
var tfd = new TermFrequencyDict(lm);
|
|
||||||
|
|
||||||
queryFactory = new QueryFactory(lm,
|
queryFactory = new QueryFactory(
|
||||||
tfd,
|
new QueryExpansion(new TermFrequencyDict(lm), new NgramLexicon(lm))
|
||||||
new EnglishDictionary(tfd)
|
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -112,17 +111,15 @@ public class QueryFactoryTest {
|
|||||||
{
|
{
|
||||||
// the is a stopword, so it should generate an ngram search term
|
// the is a stopword, so it should generate an ngram search term
|
||||||
var specs = parseAndGetSpecs("\"the shining\"");
|
var specs = parseAndGetSpecs("\"the shining\"");
|
||||||
assertEquals(List.of("the_shining"), specs.subqueries.iterator().next().searchTermsInclude);
|
assertEquals("the_shining", specs.query.compiledQuery);
|
||||||
assertEquals(List.of(), specs.subqueries.iterator().next().searchTermsAdvice);
|
|
||||||
assertEquals(List.of(), specs.subqueries.iterator().next().searchTermCoherences);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
// tde isn't a stopword, so we should get the normal behavior
|
// tde isn't a stopword, so we should get the normal behavior
|
||||||
var specs = parseAndGetSpecs("\"tde shining\"");
|
var specs = parseAndGetSpecs("\"tde shining\"");
|
||||||
assertEquals(List.of("tde", "shining"), specs.subqueries.iterator().next().searchTermsInclude);
|
assertEquals("tde shining", specs.query.compiledQuery);
|
||||||
assertEquals(List.of("tde_shining"), specs.subqueries.iterator().next().searchTermsAdvice);
|
assertEquals(List.of("tde_shining"), specs.query.searchTermsAdvice);
|
||||||
assertEquals(List.of(List.of("tde", "shining")), specs.subqueries.iterator().next().searchTermCoherences);
|
assertEquals(List.of(List.of("tde", "shining")), specs.query.searchTermCoherences);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -150,8 +147,18 @@ public class QueryFactoryTest {
|
|||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testPriorityTerm() {
|
public void testPriorityTerm() {
|
||||||
var subquery = parseAndGetSpecs("physics ?tld:edu").subqueries.iterator().next();
|
var subquery = parseAndGetSpecs("physics ?tld:edu").query;
|
||||||
assertEquals(List.of("tld:edu"), subquery.searchTermsPriority);
|
assertEquals(List.of("tld:edu"), subquery.searchTermsPriority);
|
||||||
assertEquals(List.of("physics"), subquery.searchTermsInclude);
|
assertEquals("physics", subquery.compiledQuery);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testExpansion() {
|
||||||
|
|
||||||
|
long start = System.currentTimeMillis();
|
||||||
|
var subquery = parseAndGetSpecs("elden ring mechanical keyboard slackware linux duke nukem 3d").query;
|
||||||
|
System.out.println("Time: " + (System.currentTimeMillis() - start));
|
||||||
|
System.out.println(subquery.compiledQuery);
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
@ -46,7 +46,7 @@ public class ReverseIndexEntrySource implements EntrySource {
|
|||||||
return;
|
return;
|
||||||
|
|
||||||
for (int ri = entrySize, wi=1; ri < buffer.end ; ri+=entrySize, wi++) {
|
for (int ri = entrySize, wi=1; ri < buffer.end ; ri+=entrySize, wi++) {
|
||||||
buffer.data[wi] = buffer.data[ri];
|
buffer.data.set(wi, buffer.data.get(ri));
|
||||||
}
|
}
|
||||||
|
|
||||||
buffer.end /= entrySize;
|
buffer.end /= entrySize;
|
||||||
|
@ -9,14 +9,14 @@ import io.prometheus.client.Histogram;
|
|||||||
import it.unimi.dsi.fastutil.longs.LongArrayList;
|
import it.unimi.dsi.fastutil.longs.LongArrayList;
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import nu.marginalia.api.searchquery.*;
|
import nu.marginalia.api.searchquery.*;
|
||||||
|
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||||
|
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
|
||||||
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
||||||
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
|
|
||||||
import nu.marginalia.api.searchquery.model.results.*;
|
import nu.marginalia.api.searchquery.model.results.*;
|
||||||
import nu.marginalia.array.buffer.LongQueryBuffer;
|
import nu.marginalia.array.buffer.LongQueryBuffer;
|
||||||
import nu.marginalia.index.index.StatefulIndex;
|
import nu.marginalia.index.index.StatefulIndex;
|
||||||
import nu.marginalia.index.model.SearchParameters;
|
import nu.marginalia.index.model.SearchParameters;
|
||||||
import nu.marginalia.index.model.SearchTerms;
|
import nu.marginalia.index.model.SearchTerms;
|
||||||
import nu.marginalia.index.model.SearchTermsUtil;
|
|
||||||
import nu.marginalia.index.query.IndexQuery;
|
import nu.marginalia.index.query.IndexQuery;
|
||||||
import nu.marginalia.index.query.IndexSearchBudget;
|
import nu.marginalia.index.query.IndexSearchBudget;
|
||||||
import nu.marginalia.index.results.IndexResultValuatorService;
|
import nu.marginalia.index.results.IndexResultValuatorService;
|
||||||
@ -143,7 +143,6 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
|
|||||||
.setEncodedWordMetadata(score.encodedWordMetadata())
|
.setEncodedWordMetadata(score.encodedWordMetadata())
|
||||||
.setKeyword(score.keyword)
|
.setKeyword(score.keyword)
|
||||||
.setHtmlFeatures(score.htmlFeatures())
|
.setHtmlFeatures(score.htmlFeatures())
|
||||||
.setSubquery(score.subquery)
|
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -203,7 +202,9 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
|
|||||||
return new SearchResultSet(List.of());
|
return new SearchResultSet(List.of());
|
||||||
}
|
}
|
||||||
|
|
||||||
ResultRankingContext rankingContext = createRankingContext(params.rankingParams, params.subqueries);
|
ResultRankingContext rankingContext = createRankingContext(params.rankingParams,
|
||||||
|
params.compiledQuery,
|
||||||
|
params.compiledQueryIds);
|
||||||
|
|
||||||
var queryExecution = new QueryExecution(rankingContext, params.fetchSize);
|
var queryExecution = new QueryExecution(rankingContext, params.fetchSize);
|
||||||
|
|
||||||
@ -255,14 +256,10 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
|
|||||||
/** Execute a search query */
|
/** Execute a search query */
|
||||||
public SearchResultSet run(SearchParameters parameters) throws SQLException, InterruptedException {
|
public SearchResultSet run(SearchParameters parameters) throws SQLException, InterruptedException {
|
||||||
|
|
||||||
for (var subquery : parameters.subqueries) {
|
var terms = new SearchTerms(parameters.query, parameters.compiledQueryIds);
|
||||||
var terms = new SearchTerms(subquery);
|
|
||||||
if (terms.isEmpty())
|
|
||||||
continue;
|
|
||||||
|
|
||||||
for (var indexQuery : index.createQueries(terms, parameters.queryParams)) {
|
for (var indexQuery : index.createQueries(terms, parameters.queryParams)) {
|
||||||
workerPool.execute(new IndexLookup(indexQuery, parameters.budget));
|
workerPool.execute(new IndexLookup(indexQuery, parameters.budget));
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int i = 0; i < indexValuationThreads; i++) {
|
for (int i = 0; i < indexValuationThreads; i++) {
|
||||||
@ -327,7 +324,9 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
|
|||||||
buffer.reset();
|
buffer.reset();
|
||||||
query.getMoreResults(buffer);
|
query.getMoreResults(buffer);
|
||||||
|
|
||||||
results.addElements(0, buffer.data, 0, buffer.end);
|
for (int i = 0; i < buffer.end; i++) {
|
||||||
|
results.add(buffer.data.get(i));
|
||||||
|
}
|
||||||
|
|
||||||
if (results.size() < 512) {
|
if (results.size() < 512) {
|
||||||
enqueueResults(new CombinedDocIdList(results));
|
enqueueResults(new CombinedDocIdList(results));
|
||||||
@ -413,8 +412,13 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private ResultRankingContext createRankingContext(ResultRankingParameters rankingParams, List<SearchSubquery> subqueries) {
|
private ResultRankingContext createRankingContext(ResultRankingParameters rankingParams,
|
||||||
final var termToId = SearchTermsUtil.getAllIncludeTerms(subqueries);
|
CompiledQuery<String> query,
|
||||||
|
CompiledQueryLong compiledQueryIds)
|
||||||
|
{
|
||||||
|
Map<String, Long> termToId = new HashMap<>(query.size());
|
||||||
|
query.indices().forEach(id -> termToId.put(query.at(id), compiledQueryIds.at(id)));
|
||||||
|
|
||||||
final Map<String, Integer> termFrequencies = new HashMap<>(termToId.size());
|
final Map<String, Integer> termFrequencies = new HashMap<>(termToId.size());
|
||||||
final Map<String, Integer> prioFrequencies = new HashMap<>(termToId.size());
|
final Map<String, Integer> prioFrequencies = new HashMap<>(termToId.size());
|
||||||
|
|
||||||
|
@ -38,6 +38,13 @@ public class CombinedIndexReader {
|
|||||||
return new IndexQueryBuilderImpl(reverseIndexFullReader, reverseIndexPriorityReader, query);
|
return new IndexQueryBuilderImpl(reverseIndexFullReader, reverseIndexPriorityReader, query);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public QueryFilterStepIf hasWordFull(long termId) {
|
||||||
|
return reverseIndexFullReader.also(termId);
|
||||||
|
}
|
||||||
|
public QueryFilterStepIf hasWordPrio(long termId) {
|
||||||
|
return reverseIndexPriorityReader.also(termId);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/** Creates a query builder for terms in the priority index */
|
/** Creates a query builder for terms in the priority index */
|
||||||
public IndexQueryBuilder findPriorityWord(long wordId) {
|
public IndexQueryBuilder findPriorityWord(long wordId) {
|
||||||
|
@ -1,9 +1,11 @@
|
|||||||
package nu.marginalia.index.index;
|
package nu.marginalia.index.index;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
import gnu.trove.set.hash.TLongHashSet;
|
import gnu.trove.set.hash.TLongHashSet;
|
||||||
import nu.marginalia.index.ReverseIndexReader;
|
import nu.marginalia.index.ReverseIndexReader;
|
||||||
import nu.marginalia.index.query.IndexQuery;
|
import nu.marginalia.index.query.IndexQuery;
|
||||||
import nu.marginalia.index.query.IndexQueryBuilder;
|
import nu.marginalia.index.query.IndexQueryBuilder;
|
||||||
|
import nu.marginalia.index.query.filter.QueryFilterAnyOf;
|
||||||
import nu.marginalia.index.query.filter.QueryFilterStepIf;
|
import nu.marginalia.index.query.filter.QueryFilterStepIf;
|
||||||
|
|
||||||
public class IndexQueryBuilderImpl implements IndexQueryBuilder {
|
public class IndexQueryBuilderImpl implements IndexQueryBuilder {
|
||||||
@ -66,6 +68,20 @@ public class IndexQueryBuilderImpl implements IndexQueryBuilder {
|
|||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public IndexQueryBuilder addInclusionFilterAny(List<QueryFilterStepIf> filterSteps) {
|
||||||
|
if (filterSteps.isEmpty())
|
||||||
|
return this;
|
||||||
|
|
||||||
|
if (filterSteps.size() == 1) {
|
||||||
|
query.addInclusionFilter(filterSteps.getFirst());
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
query.addInclusionFilter(new QueryFilterAnyOf(filterSteps));
|
||||||
|
}
|
||||||
|
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
public IndexQuery build() {
|
public IndexQuery build() {
|
||||||
return query;
|
return query;
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,78 @@
|
|||||||
|
package nu.marginalia.index.index;
|
||||||
|
|
||||||
|
import it.unimi.dsi.fastutil.longs.LongArrayList;
|
||||||
|
import it.unimi.dsi.fastutil.longs.LongSet;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
class QueryBranchWalker {
|
||||||
|
public final long[] priorityOrder;
|
||||||
|
public final List<LongSet> paths;
|
||||||
|
public final long termId;
|
||||||
|
|
||||||
|
private QueryBranchWalker(long[] priorityOrder, List<LongSet> paths, long termId) {
|
||||||
|
this.priorityOrder = priorityOrder;
|
||||||
|
this.paths = paths;
|
||||||
|
this.termId = termId;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean atEnd() {
|
||||||
|
return priorityOrder.length == 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static List<QueryBranchWalker> create(long[] priorityOrder, List<LongSet> paths) {
|
||||||
|
|
||||||
|
List<QueryBranchWalker> ret = new ArrayList<>();
|
||||||
|
List<LongSet> remainingPaths = new LinkedList<>(paths);
|
||||||
|
|
||||||
|
remainingPaths.removeIf(LongSet::isEmpty);
|
||||||
|
|
||||||
|
for (int i = 0; i < priorityOrder.length; i++) {
|
||||||
|
long prio = priorityOrder[i];
|
||||||
|
|
||||||
|
var it = remainingPaths.iterator();
|
||||||
|
List<LongSet> pathsForPrio = new ArrayList<>();
|
||||||
|
|
||||||
|
while (it.hasNext()) {
|
||||||
|
var path = it.next();
|
||||||
|
|
||||||
|
if (path.contains(prio)) {
|
||||||
|
path.remove(prio);
|
||||||
|
pathsForPrio.add(path);
|
||||||
|
it.remove();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!pathsForPrio.isEmpty()) {
|
||||||
|
LongArrayList remainingPrios = new LongArrayList(pathsForPrio.size());
|
||||||
|
|
||||||
|
for (var p : priorityOrder) {
|
||||||
|
for (var path : pathsForPrio) {
|
||||||
|
if (path.contains(p)) {
|
||||||
|
remainingPrios.add(p);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
ret.add(new QueryBranchWalker(remainingPrios.elements(), pathsForPrio, prio));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!remainingPaths.isEmpty()) {
|
||||||
|
System.out.println("Dropping: " + remainingPaths);
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<QueryBranchWalker> next() {
|
||||||
|
if (atEnd())
|
||||||
|
return List.of();
|
||||||
|
|
||||||
|
return create(priorityOrder, paths);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -2,6 +2,13 @@ package nu.marginalia.index.index;
|
|||||||
|
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
|
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
|
||||||
|
import it.unimi.dsi.fastutil.longs.LongSet;
|
||||||
|
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
|
||||||
|
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
|
||||||
|
import nu.marginalia.index.query.filter.QueryFilterAllOf;
|
||||||
|
import nu.marginalia.index.query.filter.QueryFilterAnyOf;
|
||||||
|
import nu.marginalia.index.query.filter.QueryFilterStepIf;
|
||||||
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
|
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
|
||||||
import nu.marginalia.index.results.model.ids.DocMetadataList;
|
import nu.marginalia.index.results.model.ids.DocMetadataList;
|
||||||
import nu.marginalia.index.model.QueryParams;
|
import nu.marginalia.index.model.QueryParams;
|
||||||
@ -14,12 +21,13 @@ import org.slf4j.Logger;
|
|||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.*;
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.concurrent.locks.Lock;
|
import java.util.concurrent.locks.Lock;
|
||||||
import java.util.concurrent.locks.ReadWriteLock;
|
import java.util.concurrent.locks.ReadWriteLock;
|
||||||
import java.util.concurrent.locks.ReentrantReadWriteLock;
|
import java.util.concurrent.locks.ReentrantReadWriteLock;
|
||||||
|
import java.util.function.LongFunction;
|
||||||
|
import java.util.function.Predicate;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
/** This class delegates SearchIndexReader and deals with the stateful nature of the index,
|
/** This class delegates SearchIndexReader and deals with the stateful nature of the index,
|
||||||
* i.e. it may be possible to reconstruct the index and load a new set of data.
|
* i.e. it may be possible to reconstruct the index and load a new set of data.
|
||||||
@ -105,6 +113,61 @@ public class StatefulIndex {
|
|||||||
return combinedIndexReader != null && combinedIndexReader.isLoaded();
|
return combinedIndexReader != null && combinedIndexReader.isLoaded();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private Predicate<LongSet> containsOnly(long[] permitted) {
|
||||||
|
LongSet permittedTerms = new LongOpenHashSet(permitted);
|
||||||
|
return permittedTerms::containsAll;
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<IndexQueryBuilder> createBuilders(CompiledQueryLong query,
|
||||||
|
LongFunction<IndexQueryBuilder> builderFactory,
|
||||||
|
long[] termPriority) {
|
||||||
|
List<LongSet> paths = CompiledQueryAggregates.queriesAggregate(query);
|
||||||
|
|
||||||
|
// Remove any paths that do not contain all prioritized terms, as this means
|
||||||
|
// the term is missing from the index and can never be found
|
||||||
|
paths.removeIf(containsOnly(termPriority).negate());
|
||||||
|
|
||||||
|
List<QueryBranchWalker> helpers = QueryBranchWalker.create(termPriority, paths);
|
||||||
|
List<IndexQueryBuilder> builders = new ArrayList<>();
|
||||||
|
|
||||||
|
for (var helper : helpers) {
|
||||||
|
var builder = builderFactory.apply(helper.termId);
|
||||||
|
|
||||||
|
builders.add(builder);
|
||||||
|
|
||||||
|
if (helper.atEnd())
|
||||||
|
continue;
|
||||||
|
|
||||||
|
var filters = helper.next().stream()
|
||||||
|
.map(this::createFilter)
|
||||||
|
.toList();
|
||||||
|
|
||||||
|
builder.addInclusionFilterAny(filters);
|
||||||
|
}
|
||||||
|
|
||||||
|
return builders;
|
||||||
|
}
|
||||||
|
|
||||||
|
private QueryFilterStepIf createFilter(QueryBranchWalker helper) {
|
||||||
|
var selfCondition = combinedIndexReader.hasWordFull(helper.termId);
|
||||||
|
if (helper.atEnd())
|
||||||
|
return selfCondition;
|
||||||
|
|
||||||
|
var nextSteps = helper.next();
|
||||||
|
var nextFilters = nextSteps.stream()
|
||||||
|
.map(this::createFilter)
|
||||||
|
.map(filter -> new QueryFilterAllOf(List.of(selfCondition, filter)))
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
|
||||||
|
if (nextFilters.isEmpty())
|
||||||
|
return selfCondition;
|
||||||
|
|
||||||
|
if (nextFilters.size() == 1)
|
||||||
|
return nextFilters.getFirst();
|
||||||
|
|
||||||
|
|
||||||
|
return new QueryFilterAnyOf(nextFilters);
|
||||||
|
}
|
||||||
|
|
||||||
public List<IndexQuery> createQueries(SearchTerms terms, QueryParams params) {
|
public List<IndexQuery> createQueries(SearchTerms terms, QueryParams params) {
|
||||||
|
|
||||||
@ -117,40 +180,13 @@ public class StatefulIndex {
|
|||||||
final long[] orderedIncludesPrio = terms.sortedDistinctIncludes(this::compareKeywordsPrio);
|
final long[] orderedIncludesPrio = terms.sortedDistinctIncludes(this::compareKeywordsPrio);
|
||||||
|
|
||||||
List<IndexQueryBuilder> queryHeads = new ArrayList<>(10);
|
List<IndexQueryBuilder> queryHeads = new ArrayList<>(10);
|
||||||
|
|
||||||
|
queryHeads.addAll(createBuilders(terms.compiledQuery(), combinedIndexReader::findFullWord, orderedIncludes));
|
||||||
|
queryHeads.addAll(createBuilders(terms.compiledQuery(), combinedIndexReader::findPriorityWord, orderedIncludesPrio));
|
||||||
|
|
||||||
List<IndexQuery> queries = new ArrayList<>(10);
|
List<IndexQuery> queries = new ArrayList<>(10);
|
||||||
|
|
||||||
// To ensure that good results are discovered, create separate query heads for the priority index that
|
|
||||||
// filter for terms that contain pairs of two search terms
|
|
||||||
if (orderedIncludesPrio.length > 1) {
|
|
||||||
for (int i = 0; i + 1 < orderedIncludesPrio.length; i++) {
|
|
||||||
for (int j = i + 1; j < orderedIncludesPrio.length; j++) {
|
|
||||||
var entrySource = combinedIndexReader
|
|
||||||
.findPriorityWord(orderedIncludesPrio[i])
|
|
||||||
.alsoPrio(orderedIncludesPrio[j]);
|
|
||||||
queryHeads.add(entrySource);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Next consider entries that appear only once in the priority index
|
|
||||||
for (var wordId : orderedIncludesPrio) {
|
|
||||||
queryHeads.add(combinedIndexReader.findPriorityWord(wordId));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Finally consider terms in the full index
|
|
||||||
queryHeads.add(combinedIndexReader.findFullWord(orderedIncludes[0]));
|
|
||||||
|
|
||||||
for (var query : queryHeads) {
|
for (var query : queryHeads) {
|
||||||
if (query == null) {
|
|
||||||
return Collections.emptyList();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Note that we can add all includes as filters, even though
|
|
||||||
// they may not be present in the query head, as the query builder
|
|
||||||
// will ignore redundant include filters:
|
|
||||||
for (long orderedInclude : orderedIncludes) {
|
|
||||||
query = query.alsoFull(orderedInclude);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (long term : terms.excludes()) {
|
for (long term : terms.excludes()) {
|
||||||
query = query.notFull(term);
|
query = query.notFull(term);
|
||||||
@ -161,6 +197,7 @@ public class StatefulIndex {
|
|||||||
queries.add(query.addInclusionFilter(combinedIndexReader.filterForParams(params)).build());
|
queries.add(query.addInclusionFilter(combinedIndexReader.filterForParams(params)).build());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
return queries;
|
return queries;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2,16 +2,16 @@ package nu.marginalia.index.model;
|
|||||||
|
|
||||||
import nu.marginalia.api.searchquery.IndexProtobufCodec;
|
import nu.marginalia.api.searchquery.IndexProtobufCodec;
|
||||||
import nu.marginalia.api.searchquery.RpcIndexQuery;
|
import nu.marginalia.api.searchquery.RpcIndexQuery;
|
||||||
|
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||||
|
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
|
||||||
|
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryParser;
|
||||||
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
||||||
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
|
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||||
import nu.marginalia.index.query.IndexSearchBudget;
|
import nu.marginalia.index.query.IndexSearchBudget;
|
||||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||||
import nu.marginalia.index.searchset.SearchSet;
|
import nu.marginalia.index.searchset.SearchSet;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import static nu.marginalia.api.searchquery.IndexProtobufCodec.convertSpecLimit;
|
import static nu.marginalia.api.searchquery.IndexProtobufCodec.convertSpecLimit;
|
||||||
|
|
||||||
public class SearchParameters {
|
public class SearchParameters {
|
||||||
@ -21,13 +21,16 @@ public class SearchParameters {
|
|||||||
*/
|
*/
|
||||||
public final int fetchSize;
|
public final int fetchSize;
|
||||||
public final IndexSearchBudget budget;
|
public final IndexSearchBudget budget;
|
||||||
public final List<SearchSubquery> subqueries;
|
public final SearchQuery query;
|
||||||
public final QueryParams queryParams;
|
public final QueryParams queryParams;
|
||||||
public final ResultRankingParameters rankingParams;
|
public final ResultRankingParameters rankingParams;
|
||||||
|
|
||||||
public final int limitByDomain;
|
public final int limitByDomain;
|
||||||
public final int limitTotal;
|
public final int limitTotal;
|
||||||
|
|
||||||
|
public final CompiledQuery<String> compiledQuery;
|
||||||
|
public final CompiledQueryLong compiledQueryIds;
|
||||||
|
|
||||||
// mutable:
|
// mutable:
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -40,7 +43,7 @@ public class SearchParameters {
|
|||||||
|
|
||||||
this.fetchSize = limits.fetchSize();
|
this.fetchSize = limits.fetchSize();
|
||||||
this.budget = new IndexSearchBudget(limits.timeoutMs());
|
this.budget = new IndexSearchBudget(limits.timeoutMs());
|
||||||
this.subqueries = specsSet.subqueries;
|
this.query = specsSet.query;
|
||||||
this.limitByDomain = limits.resultsByDomain();
|
this.limitByDomain = limits.resultsByDomain();
|
||||||
this.limitTotal = limits.resultsTotal();
|
this.limitTotal = limits.resultsTotal();
|
||||||
|
|
||||||
@ -52,6 +55,9 @@ public class SearchParameters {
|
|||||||
searchSet,
|
searchSet,
|
||||||
specsSet.queryStrategy);
|
specsSet.queryStrategy);
|
||||||
|
|
||||||
|
compiledQuery = CompiledQueryParser.parse(this.query.compiledQuery);
|
||||||
|
compiledQueryIds = compiledQuery.mapToLong(SearchTermsUtil::getWordId);
|
||||||
|
|
||||||
rankingParams = specsSet.rankingParams;
|
rankingParams = specsSet.rankingParams;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -63,11 +69,8 @@ public class SearchParameters {
|
|||||||
// The time budget is halved because this is the point when we start to
|
// The time budget is halved because this is the point when we start to
|
||||||
// wrap up the search and return the results.
|
// wrap up the search and return the results.
|
||||||
this.budget = new IndexSearchBudget(limits.timeoutMs() / 2);
|
this.budget = new IndexSearchBudget(limits.timeoutMs() / 2);
|
||||||
|
this.query = IndexProtobufCodec.convertRpcQuery(request.getQuery());
|
||||||
|
|
||||||
this.subqueries = new ArrayList<>(request.getSubqueriesCount());
|
|
||||||
for (int i = 0; i < request.getSubqueriesCount(); i++) {
|
|
||||||
this.subqueries.add(IndexProtobufCodec.convertSearchSubquery(request.getSubqueries(i)));
|
|
||||||
}
|
|
||||||
this.limitByDomain = limits.resultsByDomain();
|
this.limitByDomain = limits.resultsByDomain();
|
||||||
this.limitTotal = limits.resultsTotal();
|
this.limitTotal = limits.resultsTotal();
|
||||||
|
|
||||||
@ -79,9 +82,13 @@ public class SearchParameters {
|
|||||||
searchSet,
|
searchSet,
|
||||||
QueryStrategy.valueOf(request.getQueryStrategy()));
|
QueryStrategy.valueOf(request.getQueryStrategy()));
|
||||||
|
|
||||||
|
compiledQuery = CompiledQueryParser.parse(this.query.compiledQuery);
|
||||||
|
compiledQueryIds = compiledQuery.mapToLong(SearchTermsUtil::getWordId);
|
||||||
|
|
||||||
rankingParams = IndexProtobufCodec.convertRankingParameterss(request.getParameters());
|
rankingParams = IndexProtobufCodec.convertRankingParameterss(request.getParameters());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public long getDataCost() {
|
public long getDataCost() {
|
||||||
return dataCost;
|
return dataCost;
|
||||||
}
|
}
|
||||||
|
@ -4,7 +4,8 @@ import it.unimi.dsi.fastutil.longs.LongArrayList;
|
|||||||
import it.unimi.dsi.fastutil.longs.LongComparator;
|
import it.unimi.dsi.fastutil.longs.LongComparator;
|
||||||
import it.unimi.dsi.fastutil.longs.LongList;
|
import it.unimi.dsi.fastutil.longs.LongList;
|
||||||
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
|
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
|
||||||
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
|
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
|
||||||
|
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@ -18,34 +19,39 @@ public final class SearchTerms {
|
|||||||
private final LongList priority;
|
private final LongList priority;
|
||||||
private final List<LongList> coherences;
|
private final List<LongList> coherences;
|
||||||
|
|
||||||
|
private final CompiledQueryLong compiledQueryIds;
|
||||||
|
|
||||||
public SearchTerms(
|
public SearchTerms(
|
||||||
LongList includes,
|
LongList includes,
|
||||||
LongList excludes,
|
LongList excludes,
|
||||||
LongList priority,
|
LongList priority,
|
||||||
List<LongList> coherences
|
List<LongList> coherences,
|
||||||
|
CompiledQueryLong compiledQueryIds
|
||||||
) {
|
) {
|
||||||
this.includes = includes;
|
this.includes = includes;
|
||||||
this.excludes = excludes;
|
this.excludes = excludes;
|
||||||
this.priority = priority;
|
this.priority = priority;
|
||||||
this.coherences = coherences;
|
this.coherences = coherences;
|
||||||
|
this.compiledQueryIds = compiledQueryIds;
|
||||||
}
|
}
|
||||||
|
|
||||||
public SearchTerms(SearchSubquery subquery) {
|
public SearchTerms(SearchQuery query, CompiledQueryLong compiledQueryIds) {
|
||||||
this(new LongArrayList(),
|
this(new LongArrayList(),
|
||||||
new LongArrayList(),
|
new LongArrayList(),
|
||||||
new LongArrayList(),
|
new LongArrayList(),
|
||||||
new ArrayList<>());
|
new ArrayList<>(),
|
||||||
|
compiledQueryIds);
|
||||||
|
|
||||||
for (var word : subquery.searchTermsInclude) {
|
for (var word : query.searchTermsInclude) {
|
||||||
includes.add(getWordId(word));
|
includes.add(getWordId(word));
|
||||||
}
|
}
|
||||||
for (var word : subquery.searchTermsAdvice) {
|
for (var word : query.searchTermsAdvice) {
|
||||||
// This looks like a bug, but it's not
|
// This looks like a bug, but it's not
|
||||||
includes.add(getWordId(word));
|
includes.add(getWordId(word));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
for (var coherence : subquery.searchTermCoherences) {
|
for (var coherence : query.searchTermCoherences) {
|
||||||
LongList parts = new LongArrayList(coherence.size());
|
LongList parts = new LongArrayList(coherence.size());
|
||||||
|
|
||||||
for (var word : coherence) {
|
for (var word : coherence) {
|
||||||
@ -55,10 +61,10 @@ public final class SearchTerms {
|
|||||||
coherences.add(parts);
|
coherences.add(parts);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (var word : subquery.searchTermsExclude) {
|
for (var word : query.searchTermsExclude) {
|
||||||
excludes.add(getWordId(word));
|
excludes.add(getWordId(word));
|
||||||
}
|
}
|
||||||
for (var word : subquery.searchTermsPriority) {
|
for (var word : query.searchTermsPriority) {
|
||||||
priority.add(getWordId(word));
|
priority.add(getWordId(word));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -96,6 +102,8 @@ public final class SearchTerms {
|
|||||||
return coherences;
|
return coherences;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** @return the {@code CompiledQueryLong} that was supplied when these search terms were constructed */
public CompiledQueryLong compiledQuery() {
    return compiledQueryIds;
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean equals(Object obj) {
|
public boolean equals(Object obj) {
|
||||||
if (obj == this) return true;
|
if (obj == this) return true;
|
||||||
|
@ -1,29 +1,9 @@
|
|||||||
package nu.marginalia.index.model;
|
package nu.marginalia.index.model;
|
||||||
|
|
||||||
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
|
|
||||||
import nu.marginalia.hash.MurmurHash3_128;
|
import nu.marginalia.hash.MurmurHash3_128;
|
||||||
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
public class SearchTermsUtil {
|
public class SearchTermsUtil {
|
||||||
|
|
||||||
/** Extract all include-terms from the specified subqueries,
|
|
||||||
* and a return a map of the terms and their termIds.
|
|
||||||
*/
|
|
||||||
public static Map<String, Long> getAllIncludeTerms(List<SearchSubquery> subqueries) {
|
|
||||||
Map<String, Long> ret = new HashMap<>();
|
|
||||||
|
|
||||||
for (var subquery : subqueries) {
|
|
||||||
for (var include : subquery.searchTermsInclude) {
|
|
||||||
ret.computeIfAbsent(include, i -> getWordId(include));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static final MurmurHash3_128 hasher = new MurmurHash3_128();
|
private static final MurmurHash3_128 hasher = new MurmurHash3_128();
|
||||||
|
|
||||||
/** Translate the word to a unique id. */
|
/** Translate the word to a unique id. */
|
||||||
|
@ -4,7 +4,8 @@ import com.google.inject.Inject;
|
|||||||
import gnu.trove.map.hash.TObjectLongHashMap;
|
import gnu.trove.map.hash.TObjectLongHashMap;
|
||||||
import it.unimi.dsi.fastutil.longs.Long2ObjectArrayMap;
|
import it.unimi.dsi.fastutil.longs.Long2ObjectArrayMap;
|
||||||
import it.unimi.dsi.fastutil.longs.LongArrayList;
|
import it.unimi.dsi.fastutil.longs.LongArrayList;
|
||||||
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
|
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||||
|
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||||
import nu.marginalia.index.index.StatefulIndex;
|
import nu.marginalia.index.index.StatefulIndex;
|
||||||
import nu.marginalia.index.model.SearchTermsUtil;
|
import nu.marginalia.index.model.SearchTermsUtil;
|
||||||
import nu.marginalia.index.results.model.QuerySearchTerms;
|
import nu.marginalia.index.results.model.QuerySearchTerms;
|
||||||
@ -13,9 +14,6 @@ import nu.marginalia.index.results.model.TermMetadataForCombinedDocumentIds;
|
|||||||
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
|
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
|
||||||
import nu.marginalia.index.results.model.ids.TermIdList;
|
import nu.marginalia.index.results.model.ids.TermIdList;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import static nu.marginalia.index.results.model.TermCoherenceGroupList.TermCoherenceGroup;
|
import static nu.marginalia.index.results.model.TermCoherenceGroupList.TermCoherenceGroup;
|
||||||
import static nu.marginalia.index.results.model.TermMetadataForCombinedDocumentIds.DocumentsWithMetadata;
|
import static nu.marginalia.index.results.model.TermMetadataForCombinedDocumentIds.DocumentsWithMetadata;
|
||||||
|
|
||||||
@ -42,43 +40,24 @@ public class IndexMetadataService {
|
|||||||
return new TermMetadataForCombinedDocumentIds(termdocToMeta);
|
return new TermMetadataForCombinedDocumentIds(termdocToMeta);
|
||||||
}
|
}
|
||||||
|
|
||||||
public QuerySearchTerms getSearchTerms(List<SearchSubquery> searchTermVariants) {
|
public QuerySearchTerms getSearchTerms(CompiledQuery<String> compiledQuery, SearchQuery searchQuery) {
|
||||||
|
|
||||||
LongArrayList termIdsList = new LongArrayList();
|
LongArrayList termIdsList = new LongArrayList();
|
||||||
|
|
||||||
TObjectLongHashMap<String> termToId = new TObjectLongHashMap<>(10, 0.75f, -1);
|
TObjectLongHashMap<String> termToId = new TObjectLongHashMap<>(10, 0.75f, -1);
|
||||||
|
|
||||||
for (var subquery : searchTermVariants) {
|
for (String word : compiledQuery) {
|
||||||
for (var term : subquery.searchTermsInclude) {
|
long id = SearchTermsUtil.getWordId(word);
|
||||||
if (termToId.containsKey(term)) {
|
termIdsList.add(id);
|
||||||
continue;
|
termToId.put(word, id);
|
||||||
}
|
|
||||||
|
|
||||||
long id = SearchTermsUtil.getWordId(term);
|
|
||||||
termIdsList.add(id);
|
|
||||||
termToId.put(term, id);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return new QuerySearchTerms(termToId,
|
return new QuerySearchTerms(termToId,
|
||||||
new TermIdList(termIdsList),
|
new TermIdList(termIdsList),
|
||||||
getTermCoherences(searchTermVariants));
|
new TermCoherenceGroupList(
|
||||||
}
|
searchQuery.searchTermCoherences.stream().map(TermCoherenceGroup::new).toList()
|
||||||
|
)
|
||||||
|
);
|
||||||
private TermCoherenceGroupList getTermCoherences(List<SearchSubquery> searchTermVariants) {
|
|
||||||
List<TermCoherenceGroup> coherences = new ArrayList<>();
|
|
||||||
|
|
||||||
for (var subquery : searchTermVariants) {
|
|
||||||
for (var coh : subquery.searchTermCoherences) {
|
|
||||||
coherences.add(new TermCoherenceGroup(coh));
|
|
||||||
}
|
|
||||||
|
|
||||||
// It's assumed each subquery has identical coherences
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
return new TermCoherenceGroupList(coherences);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -1,10 +1,13 @@
|
|||||||
package nu.marginalia.index.results;
|
package nu.marginalia.index.results;
|
||||||
|
|
||||||
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
|
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||||
|
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
|
||||||
|
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
|
||||||
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
||||||
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
||||||
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
|
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
|
||||||
import nu.marginalia.index.index.StatefulIndex;
|
import nu.marginalia.index.index.StatefulIndex;
|
||||||
|
import nu.marginalia.index.model.SearchParameters;
|
||||||
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
|
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
|
||||||
import nu.marginalia.index.model.QueryParams;
|
import nu.marginalia.index.model.QueryParams;
|
||||||
import nu.marginalia.index.results.model.QuerySearchTerms;
|
import nu.marginalia.index.results.model.QuerySearchTerms;
|
||||||
@ -23,7 +26,6 @@ import java.util.List;
|
|||||||
* reasons to cache this data, and performs the calculations */
|
* reasons to cache this data, and performs the calculations */
|
||||||
public class IndexResultValuationContext {
|
public class IndexResultValuationContext {
|
||||||
private final StatefulIndex statefulIndex;
|
private final StatefulIndex statefulIndex;
|
||||||
private final List<List<String>> searchTermVariants;
|
|
||||||
private final QueryParams queryParams;
|
private final QueryParams queryParams;
|
||||||
|
|
||||||
private final TermMetadataForCombinedDocumentIds termMetadataForCombinedDocumentIds;
|
private final TermMetadataForCombinedDocumentIds termMetadataForCombinedDocumentIds;
|
||||||
@ -31,23 +33,26 @@ public class IndexResultValuationContext {
|
|||||||
|
|
||||||
private final ResultRankingContext rankingContext;
|
private final ResultRankingContext rankingContext;
|
||||||
private final ResultValuator searchResultValuator;
|
private final ResultValuator searchResultValuator;
|
||||||
|
private final CompiledQuery<String> compiledQuery;
|
||||||
|
private final CompiledQueryLong compiledQueryIds;
|
||||||
|
|
||||||
public IndexResultValuationContext(IndexMetadataService metadataService,
|
public IndexResultValuationContext(IndexMetadataService metadataService,
|
||||||
ResultValuator searchResultValuator,
|
ResultValuator searchResultValuator,
|
||||||
CombinedDocIdList ids,
|
CombinedDocIdList ids,
|
||||||
StatefulIndex statefulIndex,
|
StatefulIndex statefulIndex,
|
||||||
ResultRankingContext rankingContext,
|
ResultRankingContext rankingContext,
|
||||||
List<SearchSubquery> subqueries,
|
SearchParameters params
|
||||||
QueryParams queryParams
|
|
||||||
) {
|
) {
|
||||||
this.statefulIndex = statefulIndex;
|
this.statefulIndex = statefulIndex;
|
||||||
this.rankingContext = rankingContext;
|
this.rankingContext = rankingContext;
|
||||||
this.searchResultValuator = searchResultValuator;
|
this.searchResultValuator = searchResultValuator;
|
||||||
|
|
||||||
this.searchTermVariants = subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList();
|
this.queryParams = params.queryParams;
|
||||||
this.queryParams = queryParams;
|
this.compiledQuery = params.compiledQuery;
|
||||||
|
this.compiledQueryIds = params.compiledQueryIds;
|
||||||
|
|
||||||
|
this.searchTerms = metadataService.getSearchTerms(params.compiledQuery, params.query);
|
||||||
|
|
||||||
this.searchTerms = metadataService.getSearchTerms(subqueries);
|
|
||||||
this.termMetadataForCombinedDocumentIds = metadataService.getTermMetadataForDocuments(ids, searchTerms.termIdsAll);
|
this.termMetadataForCombinedDocumentIds = metadataService.getTermMetadataForDocuments(ids, searchTerms.termIdsAll);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -65,68 +70,39 @@ public class IndexResultValuationContext {
|
|||||||
long docMetadata = statefulIndex.getDocumentMetadata(docId);
|
long docMetadata = statefulIndex.getDocumentMetadata(docId);
|
||||||
int htmlFeatures = statefulIndex.getHtmlFeatures(docId);
|
int htmlFeatures = statefulIndex.getHtmlFeatures(docId);
|
||||||
|
|
||||||
int maxFlagsCount = 0;
|
SearchResultItem searchResult = new SearchResultItem(docId);
|
||||||
boolean anyAllSynthetic = false;
|
|
||||||
int maxPositionsSet = 0;
|
|
||||||
|
|
||||||
SearchResultItem searchResult = new SearchResultItem(docId,
|
SearchResultKeywordScore[] scores = compiledQuery.indices().mapToObj(idx ->
|
||||||
searchTermVariants.stream().mapToInt(List::size).sum());
|
new SearchResultKeywordScore(
|
||||||
|
compiledQuery.at(idx),
|
||||||
|
compiledQueryIds.at(idx),
|
||||||
|
termMetadataForCombinedDocumentIds.getTermMetadata(
|
||||||
|
compiledQueryIds.at(idx), combinedId
|
||||||
|
),
|
||||||
|
docMetadata,
|
||||||
|
htmlFeatures)
|
||||||
|
)
|
||||||
|
.toArray(SearchResultKeywordScore[]::new);
|
||||||
|
|
||||||
for (int querySetId = 0;
|
// DANGER: IndexResultValuatorService assumes that searchResult.keywordScores has this specific order, as it needs
|
||||||
querySetId < searchTermVariants.size();
|
// to be able to re-construct its own CompiledQuery<SearchResultKeywordScore> for re-ranking the results. This is
|
||||||
querySetId++)
|
// a very flimsy assumption.
|
||||||
{
|
searchResult.keywordScores.addAll(List.of(scores));
|
||||||
var termList = searchTermVariants.get(querySetId);
|
|
||||||
|
|
||||||
SearchResultKeywordScore[] termScoresForSet = new SearchResultKeywordScore[termList.size()];
|
CompiledQuery<SearchResultKeywordScore> queryGraphScores = new CompiledQuery<>(compiledQuery.root, scores);
|
||||||
|
|
||||||
boolean synthetic = true;
|
boolean allSynthetic = !CompiledQueryAggregates.booleanAggregate(queryGraphScores, score -> !score.hasTermFlag(WordFlags.Synthetic));
|
||||||
|
int flagsCount = CompiledQueryAggregates.intMaxMinAggregate(queryGraphScores, score -> Long.bitCount(score.encodedWordMetadata() & flagsFilterMask));
|
||||||
|
int positionsCount = CompiledQueryAggregates.intMaxMinAggregate(queryGraphScores, SearchResultKeywordScore::positionCount);
|
||||||
|
|
||||||
for (int termIdx = 0; termIdx < termList.size(); termIdx++) {
|
if (!meetsQueryStrategyRequirements(queryGraphScores, queryParams.queryStrategy())) {
|
||||||
String searchTerm = termList.get(termIdx);
|
return null;
|
||||||
|
|
||||||
long termMetadata = termMetadataForCombinedDocumentIds.getTermMetadata(
|
|
||||||
searchTerms.getIdForTerm(searchTerm),
|
|
||||||
combinedId
|
|
||||||
);
|
|
||||||
|
|
||||||
var score = new SearchResultKeywordScore(
|
|
||||||
querySetId,
|
|
||||||
searchTerm,
|
|
||||||
termMetadata,
|
|
||||||
docMetadata,
|
|
||||||
htmlFeatures
|
|
||||||
);
|
|
||||||
|
|
||||||
synthetic &= WordFlags.Synthetic.isPresent(termMetadata);
|
|
||||||
|
|
||||||
searchResult.keywordScores.add(score);
|
|
||||||
|
|
||||||
termScoresForSet[termIdx] = score;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!meetsQueryStrategyRequirements(termScoresForSet, queryParams.queryStrategy())) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
int minFlagsCount = 8;
|
|
||||||
int minPositionsSet = 4;
|
|
||||||
|
|
||||||
for (var termScore : termScoresForSet) {
|
|
||||||
final int flagCount = Long.bitCount(termScore.encodedWordMetadata() & flagsFilterMask);
|
|
||||||
minFlagsCount = Math.min(minFlagsCount, flagCount);
|
|
||||||
minPositionsSet = Math.min(minPositionsSet, termScore.positionCount());
|
|
||||||
}
|
|
||||||
|
|
||||||
maxFlagsCount = Math.max(maxFlagsCount, minFlagsCount);
|
|
||||||
maxPositionsSet = Math.max(maxPositionsSet, minPositionsSet);
|
|
||||||
anyAllSynthetic |= synthetic;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (maxFlagsCount == 0 && !anyAllSynthetic && maxPositionsSet == 0)
|
if (flagsCount == 0 && !allSynthetic && positionsCount == 0)
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
double score = searchResultValuator.calculateSearchResultValue(searchResult.keywordScores,
|
double score = searchResultValuator.calculateSearchResultValue(queryGraphScores,
|
||||||
5000, // use a dummy value here as it's not present in the index
|
5000, // use a dummy value here as it's not present in the index
|
||||||
rankingContext);
|
rankingContext);
|
||||||
|
|
||||||
@ -135,20 +111,17 @@ public class IndexResultValuationContext {
|
|||||||
return searchResult;
|
return searchResult;
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean meetsQueryStrategyRequirements(SearchResultKeywordScore[] termSet, QueryStrategy queryStrategy) {
|
private boolean meetsQueryStrategyRequirements(CompiledQuery<SearchResultKeywordScore> queryGraphScores,
|
||||||
|
QueryStrategy queryStrategy)
|
||||||
|
{
|
||||||
if (queryStrategy == QueryStrategy.AUTO ||
|
if (queryStrategy == QueryStrategy.AUTO ||
|
||||||
queryStrategy == QueryStrategy.SENTENCE ||
|
queryStrategy == QueryStrategy.SENTENCE ||
|
||||||
queryStrategy == QueryStrategy.TOPIC) {
|
queryStrategy == QueryStrategy.TOPIC) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (var keyword : termSet) {
|
return CompiledQueryAggregates.booleanAggregate(queryGraphScores,
|
||||||
if (!meetsQueryStrategyRequirements(keyword, queryParams.queryStrategy())) {
|
docs -> meetsQueryStrategyRequirements(docs, queryParams.queryStrategy()));
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean meetsQueryStrategyRequirements(SearchResultKeywordScore termScore, QueryStrategy queryStrategy) {
|
private boolean meetsQueryStrategyRequirements(SearchResultKeywordScore termScore, QueryStrategy queryStrategy) {
|
||||||
|
@ -4,10 +4,11 @@ import com.google.inject.Inject;
|
|||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
import gnu.trove.list.TLongList;
|
import gnu.trove.list.TLongList;
|
||||||
import gnu.trove.list.array.TLongArrayList;
|
import gnu.trove.list.array.TLongArrayList;
|
||||||
import it.unimi.dsi.fastutil.longs.LongArrayList;
|
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||||
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
|
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
|
||||||
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
||||||
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
||||||
|
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
|
||||||
import nu.marginalia.index.index.StatefulIndex;
|
import nu.marginalia.index.index.StatefulIndex;
|
||||||
import nu.marginalia.index.model.SearchParameters;
|
import nu.marginalia.index.model.SearchParameters;
|
||||||
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
|
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
|
||||||
@ -19,8 +20,6 @@ import org.slf4j.LoggerFactory;
|
|||||||
|
|
||||||
import java.sql.SQLException;
|
import java.sql.SQLException;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.function.Consumer;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
@Singleton
|
@Singleton
|
||||||
public class IndexResultValuatorService {
|
public class IndexResultValuatorService {
|
||||||
@ -44,8 +43,8 @@ public class IndexResultValuatorService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public List<SearchResultItem> rankResults(SearchParameters params,
|
public List<SearchResultItem> rankResults(SearchParameters params,
|
||||||
ResultRankingContext rankingContext,
|
ResultRankingContext rankingContext,
|
||||||
CombinedDocIdList resultIds)
|
CombinedDocIdList resultIds)
|
||||||
{
|
{
|
||||||
final var evaluator = createValuationContext(params, rankingContext, resultIds);
|
final var evaluator = createValuationContext(params, rankingContext, resultIds);
|
||||||
|
|
||||||
@ -70,8 +69,7 @@ public class IndexResultValuatorService {
|
|||||||
resultIds,
|
resultIds,
|
||||||
statefulIndex,
|
statefulIndex,
|
||||||
rankingContext,
|
rankingContext,
|
||||||
params.subqueries,
|
params);
|
||||||
params.queryParams);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -96,12 +94,13 @@ public class IndexResultValuatorService {
|
|||||||
item.resultsFromDomain = domainCountFilter.getCount(item);
|
item.resultsFromDomain = domainCountFilter.getCount(item);
|
||||||
}
|
}
|
||||||
|
|
||||||
return decorateAndRerank(resultsList, rankingContext);
|
return decorateAndRerank(resultsList, params.compiledQuery, rankingContext);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Decorate the result items with additional information from the link database
|
/** Decorate the result items with additional information from the link database
|
||||||
* and calculate an updated ranking with the additional information */
|
* and calculate an updated ranking with the additional information */
|
||||||
public List<DecoratedSearchResultItem> decorateAndRerank(List<SearchResultItem> rawResults,
|
public List<DecoratedSearchResultItem> decorateAndRerank(List<SearchResultItem> rawResults,
|
||||||
|
CompiledQuery<String> compiledQuery,
|
||||||
ResultRankingContext rankingContext)
|
ResultRankingContext rankingContext)
|
||||||
throws SQLException
|
throws SQLException
|
||||||
{
|
{
|
||||||
@ -125,13 +124,22 @@ public class IndexResultValuatorService {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
resultItems.add(createCombinedItem(result, docData, rankingContext));
|
// Reconstruct the SearchResultKeywordScore-compiledquery for re-valuation
|
||||||
|
//
|
||||||
|
// CAVEAT: This hinges on a very fragile assumption that IndexResultValuationContext puts them in the same
|
||||||
|
// order as the data for the CompiledQuery<String>.
|
||||||
|
CompiledQuery<SearchResultKeywordScore> resultQuery =
|
||||||
|
new CompiledQuery<>(compiledQuery.root, result.keywordScores.toArray(SearchResultKeywordScore[]::new));
|
||||||
|
|
||||||
|
|
||||||
|
resultItems.add(createCombinedItem(result, docData, resultQuery, rankingContext));
|
||||||
}
|
}
|
||||||
return resultItems;
|
return resultItems;
|
||||||
}
|
}
|
||||||
|
|
||||||
private DecoratedSearchResultItem createCombinedItem(SearchResultItem result,
|
private DecoratedSearchResultItem createCombinedItem(SearchResultItem result,
|
||||||
DocdbUrlDetail docData,
|
DocdbUrlDetail docData,
|
||||||
|
CompiledQuery<SearchResultKeywordScore> resultQuery,
|
||||||
ResultRankingContext rankingContext) {
|
ResultRankingContext rankingContext) {
|
||||||
return new DecoratedSearchResultItem(
|
return new DecoratedSearchResultItem(
|
||||||
result,
|
result,
|
||||||
@ -144,7 +152,7 @@ public class IndexResultValuatorService {
|
|||||||
docData.pubYear(),
|
docData.pubYear(),
|
||||||
docData.dataHash(),
|
docData.dataHash(),
|
||||||
docData.wordsTotal(),
|
docData.wordsTotal(),
|
||||||
resultValuator.calculateSearchResultValue(result.keywordScores, docData.wordsTotal(), rankingContext)
|
resultValuator.calculateSearchResultValue(resultQuery, docData.wordsTotal(), rankingContext)
|
||||||
);
|
);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
package nu.marginalia.ranking.results;
|
package nu.marginalia.ranking.results;
|
||||||
|
|
||||||
|
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||||
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
||||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||||
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
|
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
|
||||||
@ -33,14 +34,17 @@ public class ResultValuator {
|
|||||||
this.termCoherenceFactor = termCoherenceFactor;
|
this.termCoherenceFactor = termCoherenceFactor;
|
||||||
}
|
}
|
||||||
|
|
||||||
public double calculateSearchResultValue(List<SearchResultKeywordScore> scores,
|
public double calculateSearchResultValue(CompiledQuery<SearchResultKeywordScore> scores,
|
||||||
int length,
|
int length,
|
||||||
ResultRankingContext ctx)
|
ResultRankingContext ctx)
|
||||||
{
|
{
|
||||||
int sets = numberOfSets(scores);
|
if (scores.size() == 0)
|
||||||
|
return Double.MAX_VALUE;
|
||||||
|
if (length < 0)
|
||||||
|
length = 5000;
|
||||||
|
|
||||||
long documentMetadata = documentMetadata(scores);
|
long documentMetadata = scores.at(0).encodedDocMetadata();
|
||||||
int features = htmlFeatures(scores);
|
int features = scores.at(0).htmlFeatures();
|
||||||
var rankingParams = ctx.params;
|
var rankingParams = ctx.params;
|
||||||
|
|
||||||
int rank = DocumentMetadata.decodeRank(documentMetadata);
|
int rank = DocumentMetadata.decodeRank(documentMetadata);
|
||||||
@ -75,32 +79,16 @@ public class ResultValuator {
|
|||||||
+ temporalBias
|
+ temporalBias
|
||||||
+ flagsPenalty;
|
+ flagsPenalty;
|
||||||
|
|
||||||
double bestTcf = 0;
|
double bestTcf = rankingParams.tcfWeight * termCoherenceFactor.calculate(scores);
|
||||||
double bestBM25F = 0;
|
double bestBM25F = rankingParams.bm25FullWeight * bm25Factor.calculateBm25(rankingParams.prioParams, scores, length, ctx);
|
||||||
double bestBM25P = 0;
|
double bestBM25P = rankingParams.bm25PrioWeight * bm25Factor.calculateBm25Prio(rankingParams.prioParams, scores, ctx);
|
||||||
double bestBM25PN = 0;
|
|
||||||
|
|
||||||
for (int set = 0; set < sets; set++) {
|
|
||||||
ResultKeywordSet keywordSet = createKeywordSet(scores, set);
|
|
||||||
|
|
||||||
if (keywordSet.isEmpty())
|
|
||||||
continue;
|
|
||||||
|
|
||||||
bestTcf = Math.max(bestTcf, rankingParams.tcfWeight * termCoherenceFactor.calculate(keywordSet));
|
|
||||||
bestBM25P = Math.max(bestBM25P, rankingParams.bm25PrioWeight * bm25Factor.calculateBm25Prio(rankingParams.prioParams, keywordSet, ctx));
|
|
||||||
bestBM25F = Math.max(bestBM25F, rankingParams.bm25FullWeight * bm25Factor.calculateBm25(rankingParams.fullParams, keywordSet, length, ctx));
|
|
||||||
if (keywordSet.hasNgram()) {
|
|
||||||
bestBM25PN = Math.max(bestBM25PN, rankingParams.bm25PrioWeight * bm25Factor.calculateBm25Prio(rankingParams.prioParams, keywordSet, ctx));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
double overallPartPositive = Math.max(0, overallPart);
|
double overallPartPositive = Math.max(0, overallPart);
|
||||||
double overallPartNegative = -Math.min(0, overallPart);
|
double overallPartNegative = -Math.min(0, overallPart);
|
||||||
|
|
||||||
// Renormalize to 0...15, where 0 is the best possible score;
|
// Renormalize to 0...15, where 0 is the best possible score;
|
||||||
// this is a historical artifact of the original ranking function
|
// this is a historical artifact of the original ranking function
|
||||||
return normalize(1.5 * bestTcf + bestBM25F + bestBM25P + 0.25 * bestBM25PN + overallPartPositive, overallPartNegative);
|
return normalize(1.5 * bestTcf + bestBM25F + bestBM25P + overallPartPositive, overallPartNegative);
|
||||||
}
|
}
|
||||||
|
|
||||||
private double calculateQualityPenalty(int size, int quality, ResultRankingParameters rankingParams) {
|
private double calculateQualityPenalty(int size, int quality, ResultRankingParameters rankingParams) {
|
||||||
|
@ -1,10 +1,11 @@
|
|||||||
package nu.marginalia.ranking.results.factors;
|
package nu.marginalia.ranking.results.factors;
|
||||||
|
|
||||||
|
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||||
|
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
|
||||||
import nu.marginalia.api.searchquery.model.results.Bm25Parameters;
|
import nu.marginalia.api.searchquery.model.results.Bm25Parameters;
|
||||||
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
||||||
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
|
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
|
||||||
import nu.marginalia.model.idx.WordFlags;
|
import nu.marginalia.model.idx.WordFlags;
|
||||||
import nu.marginalia.ranking.results.ResultKeywordSet;
|
|
||||||
|
|
||||||
public class Bm25Factor {
|
public class Bm25Factor {
|
||||||
private static final int AVG_LENGTH = 5000;
|
private static final int AVG_LENGTH = 5000;
|
||||||
@ -13,43 +14,33 @@ public class Bm25Factor {
|
|||||||
*
|
*
|
||||||
* @see Bm25Parameters
|
* @see Bm25Parameters
|
||||||
*/
|
*/
|
||||||
public double calculateBm25(Bm25Parameters bm25Parameters, ResultKeywordSet keywordSet, int length, ResultRankingContext ctx) {
|
public double calculateBm25(Bm25Parameters bm25Parameters, CompiledQuery<SearchResultKeywordScore> scores, int length, ResultRankingContext ctx) {
|
||||||
final int docCount = ctx.termFreqDocCount();
|
final int docCount = ctx.termFreqDocCount();
|
||||||
|
|
||||||
if (length <= 0)
|
return CompiledQueryAggregates.doubleSumAggregate(scores, keyword -> {
|
||||||
length = AVG_LENGTH;
|
|
||||||
|
|
||||||
double sum = 0.;
|
|
||||||
|
|
||||||
for (var keyword : keywordSet.keywords()) {
|
|
||||||
double count = keyword.positionCount();
|
double count = keyword.positionCount();
|
||||||
|
|
||||||
int freq = ctx.frequency(keyword.keyword);
|
int freq = ctx.frequency(keyword.keyword);
|
||||||
|
|
||||||
sum += invFreq(docCount, freq) * f(bm25Parameters.k(), bm25Parameters.b(), count, length);
|
return invFreq(docCount, freq) * f(bm25Parameters.k(), bm25Parameters.b(), count, length);
|
||||||
}
|
});
|
||||||
|
|
||||||
return sum;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Bm25 calculation, except instead of counting positions in the document,
|
/** Bm25 calculation, except instead of counting positions in the document,
|
||||||
* the number of relevance signals for the term is counted instead.
|
* the number of relevance signals for the term is counted instead.
|
||||||
*/
|
*/
|
||||||
public double calculateBm25Prio(Bm25Parameters bm25Parameters, ResultKeywordSet keywordSet, ResultRankingContext ctx) {
|
public double calculateBm25Prio(Bm25Parameters bm25Parameters, CompiledQuery<SearchResultKeywordScore> scores, ResultRankingContext ctx) {
|
||||||
final int docCount = ctx.termFreqDocCount();
|
final int docCount = ctx.termFreqDocCount();
|
||||||
|
|
||||||
double sum = 0.;
|
return CompiledQueryAggregates.doubleSumAggregate(scores, keyword -> {
|
||||||
|
|
||||||
for (var keyword : keywordSet.keywords()) {
|
|
||||||
double count = evaluatePriorityScore(keyword);
|
double count = evaluatePriorityScore(keyword);
|
||||||
|
|
||||||
int freq = ctx.priorityFrequency(keyword.keyword);
|
int freq = ctx.priorityFrequency(keyword.keyword);
|
||||||
|
|
||||||
// note we override b to zero for priority terms as they are independent of document length
|
// note we override b to zero for priority terms as they are independent of document length
|
||||||
sum += invFreq(docCount, freq) * f(bm25Parameters.k(), 0, count, 0);
|
return invFreq(docCount, freq) * f(bm25Parameters.k(), 0, count, 0);
|
||||||
}
|
});
|
||||||
|
|
||||||
return sum;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static double evaluatePriorityScore(SearchResultKeywordScore keyword) {
|
private static double evaluatePriorityScore(SearchResultKeywordScore keyword) {
|
||||||
|
@ -1,14 +1,16 @@
|
|||||||
package nu.marginalia.ranking.results.factors;
|
package nu.marginalia.ranking.results.factors;
|
||||||
|
|
||||||
|
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||||
|
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
|
||||||
|
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
|
||||||
import nu.marginalia.model.idx.WordMetadata;
|
import nu.marginalia.model.idx.WordMetadata;
|
||||||
import nu.marginalia.ranking.results.ResultKeywordSet;
|
|
||||||
|
|
||||||
/** Rewards documents where terms appear frequently within the same sentences
|
/** Rewards documents where terms appear frequently within the same sentences
|
||||||
*/
|
*/
|
||||||
public class TermCoherenceFactor {
|
public class TermCoherenceFactor {
|
||||||
|
|
||||||
public double calculate(ResultKeywordSet keywordSet) {
|
public double calculate(CompiledQuery<SearchResultKeywordScore> scores) {
|
||||||
long mask = combinedMask(keywordSet);
|
long mask = CompiledQueryAggregates.longBitmaskAggregate(scores, score -> score.positions() & WordMetadata.POSITIONS_MASK);
|
||||||
|
|
||||||
return bitsSetFactor(mask);
|
return bitsSetFactor(mask);
|
||||||
}
|
}
|
||||||
@ -19,14 +21,5 @@ public class TermCoherenceFactor {
|
|||||||
return Math.pow(bitsSetInMask/(float) WordMetadata.POSITIONS_COUNT, 0.25);
|
return Math.pow(bitsSetInMask/(float) WordMetadata.POSITIONS_COUNT, 0.25);
|
||||||
}
|
}
|
||||||
|
|
||||||
long combinedMask(ResultKeywordSet keywordSet) {
|
|
||||||
long mask = WordMetadata.POSITIONS_MASK;
|
|
||||||
|
|
||||||
for (var keyword : keywordSet.keywords()) {
|
|
||||||
mask &= keyword.positions();
|
|
||||||
}
|
|
||||||
|
|
||||||
return mask;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
@ -2,6 +2,8 @@ package nu.marginalia.index.query;
|
|||||||
|
|
||||||
import nu.marginalia.index.query.filter.QueryFilterStepIf;
|
import nu.marginalia.index.query.filter.QueryFilterStepIf;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
/** Builds a query.
|
/** Builds a query.
|
||||||
* <p />
|
* <p />
|
||||||
* Note: The query builder may omit predicates that are deemed redundant.
|
* Note: The query builder may omit predicates that are deemed redundant.
|
||||||
@ -21,6 +23,7 @@ public interface IndexQueryBuilder {
|
|||||||
IndexQueryBuilder notFull(long termId);
|
IndexQueryBuilder notFull(long termId);
|
||||||
|
|
||||||
IndexQueryBuilder addInclusionFilter(QueryFilterStepIf filterStep);
|
IndexQueryBuilder addInclusionFilter(QueryFilterStepIf filterStep);
|
||||||
|
IndexQueryBuilder addInclusionFilterAny(List<QueryFilterStepIf> filterStep);
|
||||||
|
|
||||||
IndexQuery build();
|
IndexQuery build();
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,57 @@
|
|||||||
|
package nu.marginalia.index.query.filter;
|
||||||
|
|
||||||
|
import nu.marginalia.array.buffer.LongQueryBuffer;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.StringJoiner;
|
||||||
|
|
||||||
|
public class QueryFilterAllOf implements QueryFilterStepIf {
|
||||||
|
private final List<? extends QueryFilterStepIf> steps;
|
||||||
|
|
||||||
|
public QueryFilterAllOf(List<? extends QueryFilterStepIf> steps) {
|
||||||
|
this.steps = steps;
|
||||||
|
}
|
||||||
|
|
||||||
|
public double cost() {
|
||||||
|
double prod = 1.;
|
||||||
|
|
||||||
|
for (var step : steps) {
|
||||||
|
double cost = step.cost();
|
||||||
|
if (cost > 1.0) {
|
||||||
|
prod *= Math.log(cost);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
prod += cost;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return prod;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean test(long value) {
|
||||||
|
for (var step : steps) {
|
||||||
|
if (!step.test(value))
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void apply(LongQueryBuffer buffer) {
|
||||||
|
if (steps.isEmpty())
|
||||||
|
return;
|
||||||
|
|
||||||
|
for (var step : steps) {
|
||||||
|
step.apply(buffer);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public String describe() {
|
||||||
|
StringJoiner sj = new StringJoiner(",", "[All Of: ", "]");
|
||||||
|
for (var step : steps) {
|
||||||
|
sj.add(step.describe());
|
||||||
|
}
|
||||||
|
return sj.toString();
|
||||||
|
}
|
||||||
|
}
|
@ -2,7 +2,6 @@ package nu.marginalia.index.query.filter;
|
|||||||
|
|
||||||
import nu.marginalia.array.buffer.LongQueryBuffer;
|
import nu.marginalia.array.buffer.LongQueryBuffer;
|
||||||
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.StringJoiner;
|
import java.util.StringJoiner;
|
||||||
|
|
||||||
@ -14,7 +13,7 @@ public class QueryFilterAnyOf implements QueryFilterStepIf {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public double cost() {
|
public double cost() {
|
||||||
return steps.stream().mapToDouble(QueryFilterStepIf::cost).average().orElse(0.);
|
return steps.stream().mapToDouble(QueryFilterStepIf::cost).sum();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@ -31,31 +30,23 @@ public class QueryFilterAnyOf implements QueryFilterStepIf {
|
|||||||
if (steps.isEmpty())
|
if (steps.isEmpty())
|
||||||
return;
|
return;
|
||||||
|
|
||||||
int start;
|
int start = 0;
|
||||||
int end = buffer.end;
|
int end = buffer.end;
|
||||||
|
|
||||||
steps.getFirst().apply(buffer);
|
for (var step : steps)
|
||||||
|
|
||||||
// The filter functions will partition the data in the buffer from 0 to END,
|
|
||||||
// and update END to the length of the retained items, keeping the retained
|
|
||||||
// items sorted but making no guarantees about the rejected half
|
|
||||||
//
|
|
||||||
// Therefore, we need to re-sort the rejected side, and to satisfy the
|
|
||||||
// constraint that the data is sorted up to END, finally sort it again.
|
|
||||||
//
|
|
||||||
// This sorting may seem like it's slower, but filter.apply(...) is
|
|
||||||
// typically much faster than iterating over filter.test(...); so this
|
|
||||||
// is more than made up for
|
|
||||||
|
|
||||||
for (int fi = 1; fi < steps.size(); fi++)
|
|
||||||
{
|
{
|
||||||
start = buffer.end;
|
var slice = buffer.slice(start, end);
|
||||||
Arrays.sort(buffer.data, start, end);
|
slice.data.quickSort(0, slice.size());
|
||||||
buffer.startFilterForRange(start, end);
|
|
||||||
steps.get(fi).apply(buffer);
|
step.apply(slice);
|
||||||
|
start += slice.end;
|
||||||
}
|
}
|
||||||
|
|
||||||
Arrays.sort(buffer.data, 0, buffer.end);
|
buffer.data.quickSort(0, start);
|
||||||
|
|
||||||
|
// Special finalization
|
||||||
|
buffer.reset();
|
||||||
|
buffer.end = start;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String describe() {
|
public String describe() {
|
||||||
|
@ -16,7 +16,7 @@ public class QueryFilterLetThrough implements QueryFilterStepIf {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public double cost() {
|
public double cost() {
|
||||||
return 0.;
|
return 1.;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String describe() {
|
public String describe() {
|
||||||
|
@ -15,7 +15,7 @@ public class QueryFilterNoPass implements QueryFilterStepIf {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public double cost() {
|
public double cost() {
|
||||||
return 0.;
|
return 1.;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String describe() {
|
public String describe() {
|
||||||
|
@ -16,7 +16,7 @@ public class QueryFilterStepExcludeFromPredicate implements QueryFilterStepIf {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double cost() {
|
public double cost() {
|
||||||
return 0;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -16,7 +16,7 @@ public class QueryFilterStepFromPredicate implements QueryFilterStepIf {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double cost() {
|
public double cost() {
|
||||||
return 0;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -55,6 +55,32 @@ class QueryFilterStepIfTest {
|
|||||||
assertArrayEquals(new long[]{8, 10}, buffer.copyData());
|
assertArrayEquals(new long[]{8, 10}, buffer.copyData());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testSuccessiveApplicationWithAllOf() {
|
||||||
|
var buffer = createBuffer(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
|
||||||
|
var filter1 = new QueryFilterStepFromPredicate(value -> value % 2 == 0);
|
||||||
|
var filter2 = new QueryFilterStepExcludeFromPredicate(value -> value <= 6);
|
||||||
|
new QueryFilterAllOf(List.of(filter1, filter2)).apply(buffer);
|
||||||
|
assertArrayEquals(new long[]{8, 10}, buffer.copyData());
|
||||||
|
}
|
||||||
|
@Test
|
||||||
|
public void testCombinedOrAnd() {
|
||||||
|
var buffer = createBuffer(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
|
||||||
|
|
||||||
|
var filter1 = new QueryFilterStepFromPredicate(value -> value % 2 == 0);
|
||||||
|
var filter2 = new QueryFilterStepFromPredicate(value -> value <= 5);
|
||||||
|
var filter1_2 = new QueryFilterAllOf(List.of(filter1, filter2));
|
||||||
|
|
||||||
|
var filter3 = new QueryFilterStepFromPredicate(value -> value % 2 == 1);
|
||||||
|
var filter4 = new QueryFilterStepFromPredicate(value -> value > 5);
|
||||||
|
var filter3_4 = new QueryFilterAllOf(List.of(filter3, filter4));
|
||||||
|
|
||||||
|
var filter12_34 = new QueryFilterAnyOf(List.of(filter1_2, filter3_4));
|
||||||
|
|
||||||
|
filter12_34.apply(buffer);
|
||||||
|
|
||||||
|
assertArrayEquals(new long[]{2, 4, 7, 9}, buffer.copyData());
|
||||||
|
}
|
||||||
@Test
|
@Test
|
||||||
public void testCombinedApplication() {
|
public void testCombinedApplication() {
|
||||||
var buffer = createBuffer(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
|
var buffer = createBuffer(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
|
||||||
|
@ -5,7 +5,7 @@ import com.google.inject.Inject;
|
|||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import nu.marginalia.IndexLocations;
|
import nu.marginalia.IndexLocations;
|
||||||
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
||||||
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
|
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||||
import nu.marginalia.index.index.StatefulIndex;
|
import nu.marginalia.index.index.StatefulIndex;
|
||||||
import nu.marginalia.process.control.FakeProcessHeartbeat;
|
import nu.marginalia.process.control.FakeProcessHeartbeat;
|
||||||
@ -123,9 +123,10 @@ public class IndexQueryServiceIntegrationSmokeTest {
|
|||||||
.rankingParams(ResultRankingParameters.sensibleDefaults())
|
.rankingParams(ResultRankingParameters.sensibleDefaults())
|
||||||
.domains(new ArrayList<>())
|
.domains(new ArrayList<>())
|
||||||
.searchSetIdentifier("NONE")
|
.searchSetIdentifier("NONE")
|
||||||
.subqueries(List.of(new SearchSubquery(
|
.query(new SearchQuery(
|
||||||
|
"2 3 5",
|
||||||
List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(),
|
List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(),
|
||||||
Collections.emptyList()))).build());
|
Collections.emptyList())).build());
|
||||||
|
|
||||||
int[] idxes = new int[] { 30, 510, 90, 150, 210, 270, 330, 390, 450 };
|
int[] idxes = new int[] { 30, 510, 90, 150, 210, 270, 330, 390, 450 };
|
||||||
long[] ids = IntStream.of(idxes).mapToLong(this::fullId).toArray();
|
long[] ids = IntStream.of(idxes).mapToLong(this::fullId).toArray();
|
||||||
@ -166,9 +167,13 @@ public class IndexQueryServiceIntegrationSmokeTest {
|
|||||||
.rankingParams(ResultRankingParameters.sensibleDefaults())
|
.rankingParams(ResultRankingParameters.sensibleDefaults())
|
||||||
.queryStrategy(QueryStrategy.SENTENCE)
|
.queryStrategy(QueryStrategy.SENTENCE)
|
||||||
.domains(List.of(2))
|
.domains(List.of(2))
|
||||||
.subqueries(List.of(new SearchSubquery(
|
.query(new SearchQuery(
|
||||||
List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(),
|
"2 3 5",
|
||||||
Collections.emptyList()))).build());
|
List.of("3", "5", "2"),
|
||||||
|
List.of("4"),
|
||||||
|
Collections.emptyList(),
|
||||||
|
Collections.emptyList(),
|
||||||
|
Collections.emptyList())).build());
|
||||||
int[] idxes = new int[] { 210, 270 };
|
int[] idxes = new int[] { 210, 270 };
|
||||||
long[] ids = IntStream.of(idxes).mapToLong(id -> UrlIdCodec.encodeId(id/100, id)).toArray();
|
long[] ids = IntStream.of(idxes).mapToLong(id -> UrlIdCodec.encodeId(id/100, id)).toArray();
|
||||||
long[] actual = rsp.results.stream().mapToLong(i -> i.rawIndexResult.getDocumentId()).toArray();
|
long[] actual = rsp.results.stream().mapToLong(i -> i.rawIndexResult.getDocumentId()).toArray();
|
||||||
@ -202,9 +207,8 @@ public class IndexQueryServiceIntegrationSmokeTest {
|
|||||||
.queryStrategy(QueryStrategy.SENTENCE)
|
.queryStrategy(QueryStrategy.SENTENCE)
|
||||||
.searchSetIdentifier("NONE")
|
.searchSetIdentifier("NONE")
|
||||||
.rankingParams(ResultRankingParameters.sensibleDefaults())
|
.rankingParams(ResultRankingParameters.sensibleDefaults())
|
||||||
.subqueries(List.of(new SearchSubquery(
|
.query(
|
||||||
List.of("4"), Collections.emptyList(), Collections.emptyList(), Collections.emptyList(),
|
new SearchQuery("4", List.of("4"), Collections.emptyList(), Collections.emptyList(), Collections.emptyList(), Collections.emptyList())
|
||||||
Collections.emptyList()))
|
|
||||||
).build());
|
).build());
|
||||||
|
|
||||||
|
|
||||||
|
@ -4,7 +4,7 @@ import com.google.inject.Guice;
|
|||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import nu.marginalia.IndexLocations;
|
import nu.marginalia.IndexLocations;
|
||||||
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
||||||
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
|
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||||
import nu.marginalia.index.index.StatefulIndex;
|
import nu.marginalia.index.index.StatefulIndex;
|
||||||
import nu.marginalia.storage.FileStorageService;
|
import nu.marginalia.storage.FileStorageService;
|
||||||
@ -35,6 +35,7 @@ import nu.marginalia.process.control.ProcessHeartbeat;
|
|||||||
import nu.marginalia.index.domainrankings.DomainRankings;
|
import nu.marginalia.index.domainrankings.DomainRankings;
|
||||||
import nu.marginalia.service.control.ServiceHeartbeat;
|
import nu.marginalia.service.control.ServiceHeartbeat;
|
||||||
import nu.marginalia.service.server.Initialization;
|
import nu.marginalia.service.server.Initialization;
|
||||||
|
import org.apache.logging.log4j.util.Strings;
|
||||||
import org.junit.jupiter.api.AfterEach;
|
import org.junit.jupiter.api.AfterEach;
|
||||||
import org.junit.jupiter.api.BeforeEach;
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
@ -108,7 +109,7 @@ public class IndexQueryServiceIntegrationTest {
|
|||||||
w("world", WordFlags.Title)
|
w("world", WordFlags.Title)
|
||||||
).load();
|
).load();
|
||||||
|
|
||||||
var query = basicQuery(builder -> builder.subqueries(justInclude("hello", "world")));
|
var query = basicQuery(builder -> builder.query(justInclude("hello", "world")));
|
||||||
|
|
||||||
executeSearch(query)
|
executeSearch(query)
|
||||||
.expectDocumentsInOrder(d(1,1));
|
.expectDocumentsInOrder(d(1,1));
|
||||||
@ -127,57 +128,51 @@ public class IndexQueryServiceIntegrationTest {
|
|||||||
).load();
|
).load();
|
||||||
|
|
||||||
var queryMissingExclude = basicQuery(builder ->
|
var queryMissingExclude = basicQuery(builder ->
|
||||||
builder.subqueries(includeAndExclude("hello", "missing")));
|
builder.query(includeAndExclude("hello", "missing")));
|
||||||
|
|
||||||
executeSearch(queryMissingExclude)
|
executeSearch(queryMissingExclude)
|
||||||
.expectDocumentsInOrder(d(1,1));
|
.expectDocumentsInOrder(d(1,1));
|
||||||
|
|
||||||
var queryMissingInclude = basicQuery(builder ->
|
var queryMissingInclude = basicQuery(builder ->
|
||||||
builder.subqueries(justInclude("missing")));
|
builder.query(justInclude("missing")));
|
||||||
|
|
||||||
executeSearch(queryMissingInclude)
|
executeSearch(queryMissingInclude)
|
||||||
.expectCount(0);
|
.expectCount(0);
|
||||||
|
|
||||||
var queryMissingPriority = basicQuery(builder ->
|
var queryMissingPriority = basicQuery(builder ->
|
||||||
builder.subqueries(
|
builder.query(new SearchQuery(
|
||||||
List.of(
|
"hello",
|
||||||
new SearchSubquery(
|
List.of("hello"),
|
||||||
List.of("hello"),
|
List.of(),
|
||||||
List.of(),
|
List.of(),
|
||||||
List.of(),
|
List.of("missing"),
|
||||||
List.of("missing"),
|
List.of())
|
||||||
List.of()
|
));
|
||||||
)
|
|
||||||
)));
|
|
||||||
|
|
||||||
executeSearch(queryMissingPriority)
|
executeSearch(queryMissingPriority)
|
||||||
.expectCount(1);
|
.expectCount(1);
|
||||||
|
|
||||||
var queryMissingAdvice = basicQuery(builder ->
|
var queryMissingAdvice = basicQuery(builder ->
|
||||||
builder.subqueries(
|
builder.query(
|
||||||
List.of(
|
new SearchQuery("hello",
|
||||||
new SearchSubquery(
|
List.of("hello"),
|
||||||
List.of("hello"),
|
List.of(),
|
||||||
List.of(),
|
List.of("missing"),
|
||||||
List.of("missing"),
|
List.of(),
|
||||||
List.of(),
|
List.of()
|
||||||
List.of()
|
|
||||||
)
|
|
||||||
)));
|
)));
|
||||||
|
|
||||||
executeSearch(queryMissingAdvice)
|
executeSearch(queryMissingAdvice)
|
||||||
.expectCount(0);
|
.expectCount(0);
|
||||||
|
|
||||||
var queryMissingCoherence = basicQuery(builder ->
|
var queryMissingCoherence = basicQuery(builder ->
|
||||||
builder.subqueries(
|
builder.query(
|
||||||
List.of(
|
new SearchQuery("hello",
|
||||||
new SearchSubquery(
|
List.of("hello"),
|
||||||
List.of("hello"),
|
List.of(),
|
||||||
List.of(),
|
List.of(),
|
||||||
List.of(),
|
List.of(),
|
||||||
List.of(),
|
List.of(List.of("missing", "hello"))
|
||||||
List.of(List.of("missing", "hello"))
|
|
||||||
)
|
|
||||||
)));
|
)));
|
||||||
|
|
||||||
executeSearch(queryMissingCoherence)
|
executeSearch(queryMissingCoherence)
|
||||||
@ -202,7 +197,7 @@ public class IndexQueryServiceIntegrationTest {
|
|||||||
).load();
|
).load();
|
||||||
|
|
||||||
|
|
||||||
var query = basicQuery(builder -> builder.subqueries(justInclude("hello", "world")));
|
var query = basicQuery(builder -> builder.query(justInclude("hello", "world")));
|
||||||
|
|
||||||
executeSearch(query)
|
executeSearch(query)
|
||||||
.expectDocumentsInOrder(d(1,1));
|
.expectDocumentsInOrder(d(1,1));
|
||||||
@ -234,15 +229,15 @@ public class IndexQueryServiceIntegrationTest {
|
|||||||
|
|
||||||
|
|
||||||
var beforeY2K = basicQuery(builder ->
|
var beforeY2K = basicQuery(builder ->
|
||||||
builder.subqueries(justInclude("hello", "world"))
|
builder.query(justInclude("hello", "world"))
|
||||||
.year(SpecificationLimit.lessThan(2000))
|
.year(SpecificationLimit.lessThan(2000))
|
||||||
);
|
);
|
||||||
var atY2K = basicQuery(builder ->
|
var atY2K = basicQuery(builder ->
|
||||||
builder.subqueries(justInclude("hello", "world"))
|
builder.query(justInclude("hello", "world"))
|
||||||
.year(SpecificationLimit.equals(2000))
|
.year(SpecificationLimit.equals(2000))
|
||||||
);
|
);
|
||||||
var afterY2K = basicQuery(builder ->
|
var afterY2K = basicQuery(builder ->
|
||||||
builder.subqueries(justInclude("hello", "world"))
|
builder.query(justInclude("hello", "world"))
|
||||||
.year(SpecificationLimit.greaterThan(2000))
|
.year(SpecificationLimit.greaterThan(2000))
|
||||||
);
|
);
|
||||||
|
|
||||||
@ -296,11 +291,11 @@ public class IndexQueryServiceIntegrationTest {
|
|||||||
|
|
||||||
|
|
||||||
var domain1 = basicQuery(builder ->
|
var domain1 = basicQuery(builder ->
|
||||||
builder.subqueries(justInclude("hello", "world"))
|
builder.query(justInclude("hello", "world"))
|
||||||
.domains(List.of(1))
|
.domains(List.of(1))
|
||||||
);
|
);
|
||||||
var domain2 = basicQuery(builder ->
|
var domain2 = basicQuery(builder ->
|
||||||
builder.subqueries(justInclude("hello", "world"))
|
builder.query(justInclude("hello", "world"))
|
||||||
.domains(List.of(2))
|
.domains(List.of(2))
|
||||||
);
|
);
|
||||||
|
|
||||||
@ -334,7 +329,7 @@ public class IndexQueryServiceIntegrationTest {
|
|||||||
).load();
|
).load();
|
||||||
|
|
||||||
var query = basicQuery(builder ->
|
var query = basicQuery(builder ->
|
||||||
builder.subqueries(includeAndExclude("hello", "my_darling"))
|
builder.query(includeAndExclude("hello", "my_darling"))
|
||||||
);
|
);
|
||||||
|
|
||||||
executeSearch(query)
|
executeSearch(query)
|
||||||
@ -403,7 +398,7 @@ public class IndexQueryServiceIntegrationTest {
|
|||||||
.load();
|
.load();
|
||||||
|
|
||||||
var rsp = queryService.justQuery(
|
var rsp = queryService.justQuery(
|
||||||
basicQuery(builder -> builder.subqueries(
|
basicQuery(builder -> builder.query(
|
||||||
// note coherence requirement
|
// note coherence requirement
|
||||||
includeAndCohere("hello", "world")
|
includeAndCohere("hello", "world")
|
||||||
)));
|
)));
|
||||||
@ -424,50 +419,53 @@ public class IndexQueryServiceIntegrationTest {
|
|||||||
.rank(SpecificationLimit.none())
|
.rank(SpecificationLimit.none())
|
||||||
.rankingParams(ResultRankingParameters.sensibleDefaults())
|
.rankingParams(ResultRankingParameters.sensibleDefaults())
|
||||||
.domains(new ArrayList<>())
|
.domains(new ArrayList<>())
|
||||||
.searchSetIdentifier("NONE")
|
.searchSetIdentifier("NONE");
|
||||||
.subqueries(List.of());
|
|
||||||
|
|
||||||
return mutator.apply(builder).build();
|
return mutator.apply(builder).build();
|
||||||
}
|
}
|
||||||
|
|
||||||
List<SearchSubquery> justInclude(String... includes) {
|
SearchQuery justInclude(String... includes) {
|
||||||
return List.of(new SearchSubquery(
|
return new SearchQuery(
|
||||||
|
Strings.join(List.of(includes), ' '),
|
||||||
List.of(includes),
|
List.of(includes),
|
||||||
List.of(),
|
List.of(),
|
||||||
List.of(),
|
List.of(),
|
||||||
List.of(),
|
List.of(),
|
||||||
List.of()
|
List.of()
|
||||||
));
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
List<SearchSubquery> includeAndExclude(List<String> includes, List<String> excludes) {
|
SearchQuery includeAndExclude(List<String> includes, List<String> excludes) {
|
||||||
return List.of(new SearchSubquery(
|
return new SearchQuery(
|
||||||
|
Strings.join(List.of(includes), ' '),
|
||||||
includes,
|
includes,
|
||||||
excludes,
|
excludes,
|
||||||
List.of(),
|
List.of(),
|
||||||
List.of(),
|
List.of(),
|
||||||
List.of()
|
List.of()
|
||||||
));
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
List<SearchSubquery> includeAndExclude(String include, String exclude) {
|
SearchQuery includeAndExclude(String include, String exclude) {
|
||||||
return List.of(new SearchSubquery(
|
return new SearchQuery(
|
||||||
|
include,
|
||||||
List.of(include),
|
List.of(include),
|
||||||
List.of(exclude),
|
List.of(exclude),
|
||||||
List.of(),
|
List.of(),
|
||||||
List.of(),
|
List.of(),
|
||||||
List.of()
|
List.of()
|
||||||
));
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
List<SearchSubquery> includeAndCohere(String... includes) {
|
SearchQuery includeAndCohere(String... includes) {
|
||||||
return List.of(new SearchSubquery(
|
return new SearchQuery(
|
||||||
|
Strings.join(List.of(includes), ' '),
|
||||||
List.of(includes),
|
List.of(includes),
|
||||||
List.of(),
|
List.of(),
|
||||||
List.of(),
|
List.of(),
|
||||||
List.of(),
|
List.of(),
|
||||||
List.of(List.of(includes))
|
List.of(List.of(includes))
|
||||||
));
|
);
|
||||||
}
|
}
|
||||||
private MockDataDocument d(int domainId, int ordinal) {
|
private MockDataDocument d(int domainId, int ordinal) {
|
||||||
return new MockDataDocument(domainId, ordinal);
|
return new MockDataDocument(domainId, ordinal);
|
||||||
|
@ -0,0 +1,59 @@
|
|||||||
|
package nu.marginalia.index.index;
|
||||||
|
|
||||||
|
import it.unimi.dsi.fastutil.longs.LongArraySet;
|
||||||
|
import it.unimi.dsi.fastutil.longs.LongSet;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
|
|
||||||
|
class QueryBranchWalkerTest {
|
||||||
|
@Test
|
||||||
|
public void testNoOverlap() {
|
||||||
|
var paths = QueryBranchWalker.create(
|
||||||
|
new long[] { 1, 2 },
|
||||||
|
List.of(set(1), set(2))
|
||||||
|
);
|
||||||
|
assertEquals(2, paths.size());
|
||||||
|
assertEquals(Set.of(1L, 2L), paths.stream().map(path -> path.termId).collect(Collectors.toSet()));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testCond() {
|
||||||
|
var paths = QueryBranchWalker.create(
|
||||||
|
new long[] { 1, 2, 3, 4 },
|
||||||
|
List.of(set(1,2,3), set(1,4,3))
|
||||||
|
);
|
||||||
|
assertEquals(1, paths.size());
|
||||||
|
assertEquals(Set.of(1L), paths.stream().map(path -> path.termId).collect(Collectors.toSet()));
|
||||||
|
System.out.println(Arrays.toString(paths.getFirst().priorityOrder));
|
||||||
|
assertArrayEquals(new long[] { 2, 3, 4 }, paths.getFirst().priorityOrder);
|
||||||
|
|
||||||
|
var next = paths.getFirst().next();
|
||||||
|
assertEquals(2, next.size());
|
||||||
|
assertEquals(Set.of(2L, 3L), next.stream().map(path -> path.termId).collect(Collectors.toSet()));
|
||||||
|
Map<Long, QueryBranchWalker> byId = next.stream().collect(Collectors.toMap(w -> w.termId, w->w));
|
||||||
|
assertArrayEquals(new long[] { 3L }, byId.get(2L).priorityOrder );
|
||||||
|
assertArrayEquals(new long[] { 4L }, byId.get(3L).priorityOrder );
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testNoOverlapFirst() {
|
||||||
|
var paths = QueryBranchWalker.create(
|
||||||
|
new long[] { 1, 2, 3 },
|
||||||
|
List.of(set(1, 2), set(1, 3))
|
||||||
|
);
|
||||||
|
assertEquals(1, paths.size());
|
||||||
|
assertArrayEquals(new long[] { 2, 3 }, paths.getFirst().priorityOrder);
|
||||||
|
assertEquals(Set.of(1L), paths.stream().map(path -> path.termId).collect(Collectors.toSet()));
|
||||||
|
}
|
||||||
|
|
||||||
|
LongSet set(long... args) {
|
||||||
|
return new LongArraySet(args);
|
||||||
|
}
|
||||||
|
}
|
@ -2,9 +2,10 @@ package nu.marginalia.index.results;
|
|||||||
|
|
||||||
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
||||||
import nu.marginalia.model.id.UrlIdCodec;
|
import nu.marginalia.model.id.UrlIdCodec;
|
||||||
import org.junit.jupiter.api.AfterEach;
|
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.*;
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
|
|
||||||
class IndexResultDomainDeduplicatorTest {
|
class IndexResultDomainDeduplicatorTest {
|
||||||
@ -24,7 +25,7 @@ class IndexResultDomainDeduplicatorTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
SearchResultItem forId(int domain, int ordinal) {
|
SearchResultItem forId(int domain, int ordinal) {
|
||||||
return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), 4);
|
return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), List.of(), 4, Double.NaN);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
@ -1,5 +1,6 @@
|
|||||||
package nu.marginalia.ranking.results;
|
package nu.marginalia.ranking.results;
|
||||||
|
|
||||||
|
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||||
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
||||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||||
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
|
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
|
||||||
@ -35,21 +36,21 @@ class ResultValuatorTest {
|
|||||||
);
|
);
|
||||||
|
|
||||||
}
|
}
|
||||||
List<SearchResultKeywordScore> titleOnlyLowCountSet = List.of(
|
CompiledQuery<SearchResultKeywordScore> titleOnlyLowCountSet = CompiledQuery.just(
|
||||||
new SearchResultKeywordScore(0, "bob",
|
new SearchResultKeywordScore("bob", 1,
|
||||||
wordMetadata(Set.of(1), EnumSet.of(WordFlags.Title)),
|
wordMetadata(Set.of(1), EnumSet.of(WordFlags.Title)),
|
||||||
docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)),
|
docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)),
|
||||||
0)
|
0)
|
||||||
);
|
);
|
||||||
List<SearchResultKeywordScore> highCountNoTitleSet = List.of(
|
CompiledQuery<SearchResultKeywordScore> highCountNoTitleSet = CompiledQuery.just(
|
||||||
new SearchResultKeywordScore(0, "bob",
|
new SearchResultKeywordScore("bob", 1,
|
||||||
wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh)),
|
wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh)),
|
||||||
docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)),
|
docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)),
|
||||||
0)
|
0)
|
||||||
);
|
);
|
||||||
|
|
||||||
List<SearchResultKeywordScore> highCountSubjectSet = List.of(
|
CompiledQuery<SearchResultKeywordScore> highCountSubjectSet = CompiledQuery.just(
|
||||||
new SearchResultKeywordScore(0, "bob",
|
new SearchResultKeywordScore("bob", 1,
|
||||||
wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh, WordFlags.Subjects)),
|
wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh, WordFlags.Subjects)),
|
||||||
docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)),
|
docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)),
|
||||||
0)
|
0)
|
||||||
@ -75,7 +76,10 @@ class ResultValuatorTest {
|
|||||||
System.out.println(highCountSubject);
|
System.out.println(highCountSubject);
|
||||||
}
|
}
|
||||||
|
|
||||||
private long docMetadata(int topology, int year, int quality, EnumSet<DocumentFlags> flags) {
|
private long docMetadata(int topology,
|
||||||
|
int year,
|
||||||
|
int quality,
|
||||||
|
EnumSet<DocumentFlags> flags) {
|
||||||
return new DocumentMetadata(topology, PubDate.toYearByte(year), quality, flags).encode();
|
return new DocumentMetadata(topology, PubDate.toYearByte(year), quality, flags).encode();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,9 +1,10 @@
|
|||||||
package nu.marginalia.ranking.results.factors;
|
package nu.marginalia.ranking.results.factors;
|
||||||
|
|
||||||
|
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||||
|
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
|
||||||
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
|
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
|
||||||
import nu.marginalia.bbpc.BrailleBlockPunchCards;
|
import nu.marginalia.bbpc.BrailleBlockPunchCards;
|
||||||
import nu.marginalia.model.idx.WordMetadata;
|
import nu.marginalia.model.idx.WordMetadata;
|
||||||
import nu.marginalia.ranking.results.ResultKeywordSet;
|
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
@ -20,7 +21,7 @@ class TermCoherenceFactorTest {
|
|||||||
WordMetadata.POSITIONS_MASK, WordMetadata.POSITIONS_MASK
|
WordMetadata.POSITIONS_MASK, WordMetadata.POSITIONS_MASK
|
||||||
);
|
);
|
||||||
|
|
||||||
long mask = termCoherenceFactor.combinedMask(allPositionsSet);
|
long mask = CompiledQueryAggregates.longBitmaskAggregate(allPositionsSet, score -> score.positions() & WordMetadata.POSITIONS_MASK);
|
||||||
|
|
||||||
assertEquals(1.0, termCoherenceFactor.bitsSetFactor(mask), 0.01);
|
assertEquals(1.0, termCoherenceFactor.bitsSetFactor(mask), 0.01);
|
||||||
|
|
||||||
@ -33,7 +34,7 @@ class TermCoherenceFactorTest {
|
|||||||
0, 0
|
0, 0
|
||||||
);
|
);
|
||||||
|
|
||||||
long mask = termCoherenceFactor.combinedMask(allPositionsSet);
|
long mask = CompiledQueryAggregates.longBitmaskAggregate(allPositionsSet, score -> score.positions() & WordMetadata.POSITIONS_MASK);
|
||||||
|
|
||||||
assertEquals(0, termCoherenceFactor.bitsSetFactor(mask), 0.01);
|
assertEquals(0, termCoherenceFactor.bitsSetFactor(mask), 0.01);
|
||||||
|
|
||||||
@ -46,7 +47,7 @@ class TermCoherenceFactorTest {
|
|||||||
List.of(0, 1, 2, 3), List.of(0, 1, 2, 3)
|
List.of(0, 1, 2, 3), List.of(0, 1, 2, 3)
|
||||||
);
|
);
|
||||||
|
|
||||||
long mask = termCoherenceFactor.combinedMask(positions);
|
long mask = CompiledQueryAggregates.longBitmaskAggregate(positions, score -> score.positions() & WordMetadata.POSITIONS_MASK);
|
||||||
printMask(mask);
|
printMask(mask);
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -57,7 +58,7 @@ class TermCoherenceFactorTest {
|
|||||||
List.of(55, 54, 53, 52), List.of(55, 54, 53, 52)
|
List.of(55, 54, 53, 52), List.of(55, 54, 53, 52)
|
||||||
);
|
);
|
||||||
|
|
||||||
long mask = termCoherenceFactor.combinedMask(positions);
|
long mask = CompiledQueryAggregates.longBitmaskAggregate(positions, score -> score.positions() & WordMetadata.POSITIONS_MASK);
|
||||||
printMask(mask);
|
printMask(mask);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -72,7 +73,7 @@ class TermCoherenceFactorTest {
|
|||||||
System.out.println(BrailleBlockPunchCards.printBits(mask, 48));
|
System.out.println(BrailleBlockPunchCards.printBits(mask, 48));
|
||||||
}
|
}
|
||||||
|
|
||||||
ResultKeywordSet createSet(List<Integer>... maskPositions) {
|
CompiledQuery<SearchResultKeywordScore> createSet(List<Integer>... maskPositions) {
|
||||||
long[] positions = new long[maskPositions.length];
|
long[] positions = new long[maskPositions.length];
|
||||||
|
|
||||||
for (int i = 0; i < maskPositions.length; i++) {
|
for (int i = 0; i < maskPositions.length; i++) {
|
||||||
@ -84,14 +85,14 @@ class TermCoherenceFactorTest {
|
|||||||
return createSet(positions);
|
return createSet(positions);
|
||||||
}
|
}
|
||||||
|
|
||||||
ResultKeywordSet createSet(long... positionMasks) {
|
CompiledQuery<SearchResultKeywordScore> createSet(long... positionMasks) {
|
||||||
List<SearchResultKeywordScore> keywords = new ArrayList<>();
|
List<SearchResultKeywordScore> keywords = new ArrayList<>();
|
||||||
|
|
||||||
for (int i = 0; i < positionMasks.length; i++) {
|
for (int i = 0; i < positionMasks.length; i++) {
|
||||||
keywords.add(new SearchResultKeywordScore(0, "",
|
keywords.add(new SearchResultKeywordScore("", 0,
|
||||||
new WordMetadata(positionMasks[i], (byte) 0).encode(), 0, 0));
|
new WordMetadata(positionMasks[i], (byte) 0).encode(), 0, 0));
|
||||||
}
|
}
|
||||||
|
|
||||||
return new ResultKeywordSet(keywords);
|
return CompiledQuery.just(keywords.toArray(SearchResultKeywordScore[]::new));
|
||||||
}
|
}
|
||||||
}
|
}
|
@ -1,5 +1,7 @@
|
|||||||
package nu.marginalia.array.algo;
|
package nu.marginalia.array.algo;
|
||||||
|
|
||||||
|
import nu.marginalia.array.LongArray;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.LongBuffer;
|
import java.nio.LongBuffer;
|
||||||
import java.nio.channels.FileChannel;
|
import java.nio.channels.FileChannel;
|
||||||
@ -61,6 +63,12 @@ public interface LongArrayBase extends BulkTransferArray<LongBuffer> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
default void get(long start, long end, LongArray buffer, int bufferStart) {
|
||||||
|
for (int i = 0; i < (end-start); i++) {
|
||||||
|
buffer.set(i + bufferStart, get(start + i));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
default void get(long start, LongBuffer buffer) {
|
default void get(long start, LongBuffer buffer) {
|
||||||
get(start, start + buffer.remaining(), buffer, buffer.position());
|
get(start, start + buffer.remaining(), buffer, buffer.position());
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,8 @@
|
|||||||
package nu.marginalia.array.buffer;
|
package nu.marginalia.array.buffer;
|
||||||
|
|
||||||
|
import nu.marginalia.array.LongArray;
|
||||||
|
import nu.marginalia.array.LongArrayFactory;
|
||||||
|
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
|
||||||
/** A buffer for long values that can be used to filter and manipulate the data.
|
/** A buffer for long values that can be used to filter and manipulate the data.
|
||||||
@ -17,7 +20,7 @@ import java.util.Arrays;
|
|||||||
public class LongQueryBuffer {
|
public class LongQueryBuffer {
|
||||||
/** Direct access to the data in the buffer,
|
/** Direct access to the data in the buffer,
|
||||||
* guaranteed to be populated until `end` */
|
* guaranteed to be populated until `end` */
|
||||||
public final long[] data;
|
public final LongArray data;
|
||||||
|
|
||||||
/** Number of items in the data buffer */
|
/** Number of items in the data buffer */
|
||||||
public int end;
|
public int end;
|
||||||
@ -25,18 +28,27 @@ public class LongQueryBuffer {
|
|||||||
private int read = 0;
|
private int read = 0;
|
||||||
private int write = 0;
|
private int write = 0;
|
||||||
|
|
||||||
|
private LongQueryBuffer(LongArray array, int size) {
|
||||||
|
this.data = array;
|
||||||
|
this.end = size;
|
||||||
|
}
|
||||||
|
|
||||||
public LongQueryBuffer(int size) {
|
public LongQueryBuffer(int size) {
|
||||||
this.data = new long[size];
|
this.data = LongArrayFactory.onHeapConfined(size);
|
||||||
this.end = size;
|
this.end = size;
|
||||||
}
|
}
|
||||||
|
|
||||||
public LongQueryBuffer(long[] data, int size) {
|
public LongQueryBuffer(long[] data, int size) {
|
||||||
this.data = data;
|
this.data = LongArrayFactory.onHeapConfined(size);
|
||||||
|
this.data.set(0, data);
|
||||||
|
|
||||||
this.end = size;
|
this.end = size;
|
||||||
}
|
}
|
||||||
|
|
||||||
public long[] copyData() {
|
public long[] copyData() {
|
||||||
return Arrays.copyOf(data, end);
|
long[] copy = new long[end];
|
||||||
|
data.forEach(0, end, (pos, val) -> copy[(int)pos]=val );
|
||||||
|
return copy;
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean isEmpty() {
|
public boolean isEmpty() {
|
||||||
@ -48,7 +60,7 @@ public class LongQueryBuffer {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public void reset() {
|
public void reset() {
|
||||||
end = data.length;
|
end = (int) data.size();
|
||||||
read = 0;
|
read = 0;
|
||||||
write = 0;
|
write = 0;
|
||||||
}
|
}
|
||||||
@ -59,12 +71,16 @@ public class LongQueryBuffer {
|
|||||||
write = 0;
|
write = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public LongQueryBuffer slice(int start, int end) {
|
||||||
|
return new LongQueryBuffer(data.range(start, end), end - start);
|
||||||
|
}
|
||||||
|
|
||||||
/* == Filtering methods == */
|
/* == Filtering methods == */
|
||||||
|
|
||||||
/** Returns the current value at the read pointer.
|
/** Returns the current value at the read pointer.
|
||||||
*/
|
*/
|
||||||
public long currentValue() {
|
public long currentValue() {
|
||||||
return data[read];
|
return data.get(read);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Advances the read pointer and returns true if there are more values to read. */
|
/** Advances the read pointer and returns true if there are more values to read. */
|
||||||
@ -79,9 +95,9 @@ public class LongQueryBuffer {
|
|||||||
*/
|
*/
|
||||||
public boolean retainAndAdvance() {
|
public boolean retainAndAdvance() {
|
||||||
if (read != write) {
|
if (read != write) {
|
||||||
long tmp = data[write];
|
long tmp = data.get(write);
|
||||||
data[write] = data[read];
|
data.set(write, data.get(read));
|
||||||
data[read] = tmp;
|
data.set(read, tmp);
|
||||||
}
|
}
|
||||||
|
|
||||||
write++;
|
write++;
|
||||||
@ -117,9 +133,10 @@ public class LongQueryBuffer {
|
|||||||
write = 0;
|
write = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void startFilterForRange(int pos, int end) {
|
public void finalizeFiltering(int pos) {
|
||||||
read = write = pos;
|
end = write;
|
||||||
this.end = end;
|
read = pos;
|
||||||
|
write = pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Retain only unique values in the buffer, and update the end pointer to the new length.
|
/** Retain only unique values in the buffer, and update the end pointer to the new length.
|
||||||
@ -153,7 +170,7 @@ public class LongQueryBuffer {
|
|||||||
"read = " + read +
|
"read = " + read +
|
||||||
",write = " + write +
|
",write = " + write +
|
||||||
",end = " + end +
|
",end = " + end +
|
||||||
",data = [" + Arrays.toString(Arrays.copyOf(data, end)) + "]]";
|
",data = [" + Arrays.toString(copyData()) + "]]";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -143,7 +143,7 @@ class LongArraySearchTest {
|
|||||||
|
|
||||||
assertEquals(43, buffer.size());
|
assertEquals(43, buffer.size());
|
||||||
for (int i = 0; i < 43; i++) {
|
for (int i = 0; i < 43; i++) {
|
||||||
assertEquals(buffer.data[i], i*3);
|
assertEquals(buffer.data.get(i), i*3);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -160,7 +160,7 @@ class LongArraySearchTest {
|
|||||||
int j = 0;
|
int j = 0;
|
||||||
for (int i = 0; i < 43; i++) {
|
for (int i = 0; i < 43; i++) {
|
||||||
if (++j % 3 == 0) j++;
|
if (++j % 3 == 0) j++;
|
||||||
assertEquals(buffer.data[i], j);
|
assertEquals(buffer.data.get(i), j);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
@ -109,8 +109,8 @@ public class BTreeReader {
|
|||||||
return ip.findData(key);
|
return ip.findData(key);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void readData(long[] buf, int n, long pos) {
|
public void readData(LongArray buf, int n, long pos) {
|
||||||
data.get(pos, pos + n, buf);
|
data.get(pos, pos + n, buf, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Used for querying interlaced data in the btree.
|
/** Used for querying interlaced data in the btree.
|
||||||
|
@ -32,7 +32,8 @@ public class BTreeReaderRejectRetainWithIndexTest {
|
|||||||
@Test
|
@Test
|
||||||
public void testRetain() {
|
public void testRetain() {
|
||||||
LongQueryBuffer odds = new LongQueryBuffer(50);
|
LongQueryBuffer odds = new LongQueryBuffer(50);
|
||||||
Arrays.setAll(odds.data, i -> 2L*i + 1);
|
for (int i = 0; i < 50; i++)
|
||||||
|
odds.data.set(i, 2L*i + 1);
|
||||||
|
|
||||||
BTreeReader reader = new BTreeReader(array, ctx, 0);
|
BTreeReader reader = new BTreeReader(array, ctx, 0);
|
||||||
reader.retainEntries(odds);
|
reader.retainEntries(odds);
|
||||||
@ -46,7 +47,8 @@ public class BTreeReaderRejectRetainWithIndexTest {
|
|||||||
@Test
|
@Test
|
||||||
public void testReject() {
|
public void testReject() {
|
||||||
LongQueryBuffer odds = new LongQueryBuffer(50);
|
LongQueryBuffer odds = new LongQueryBuffer(50);
|
||||||
Arrays.setAll(odds.data, i -> 2L*i + 1);
|
for (int i = 0; i < 50; i++)
|
||||||
|
odds.data.set(i, 2L*i + 1);
|
||||||
|
|
||||||
BTreeReader reader = new BTreeReader(array, ctx, 0);
|
BTreeReader reader = new BTreeReader(array, ctx, 0);
|
||||||
reader.rejectEntries(odds);
|
reader.rejectEntries(odds);
|
||||||
|
@ -32,7 +32,8 @@ public class BTreeReaderRejectRetainWithoutIndexTest {
|
|||||||
@Test
|
@Test
|
||||||
public void testRetain() {
|
public void testRetain() {
|
||||||
LongQueryBuffer odds = new LongQueryBuffer(50);
|
LongQueryBuffer odds = new LongQueryBuffer(50);
|
||||||
Arrays.setAll(odds.data, i -> 2L*i + 1);
|
for (int i = 0; i < 50; i++)
|
||||||
|
odds.data.set(i, 2L*i + 1);
|
||||||
|
|
||||||
BTreeReader reader = new BTreeReader(array, ctx, 0);
|
BTreeReader reader = new BTreeReader(array, ctx, 0);
|
||||||
reader.retainEntries(odds);
|
reader.retainEntries(odds);
|
||||||
@ -46,7 +47,9 @@ public class BTreeReaderRejectRetainWithoutIndexTest {
|
|||||||
@Test
|
@Test
|
||||||
public void testReject() {
|
public void testReject() {
|
||||||
LongQueryBuffer odds = new LongQueryBuffer(50);
|
LongQueryBuffer odds = new LongQueryBuffer(50);
|
||||||
Arrays.setAll(odds.data, i -> 2L*i + 1);
|
for (int i = 0; i < 50; i++)
|
||||||
|
odds.data.set(i, 2L*i + 1);
|
||||||
|
|
||||||
|
|
||||||
BTreeReader reader = new BTreeReader(array, ctx, 0);
|
BTreeReader reader = new BTreeReader(array, ctx, 0);
|
||||||
reader.rejectEntries(odds);
|
reader.rejectEntries(odds);
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
package nu.marginalia.search;
|
package nu.marginalia.search;
|
||||||
|
|
||||||
import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier;
|
import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier;
|
||||||
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
|
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||||
import nu.marginalia.index.query.limit.QueryLimits;
|
import nu.marginalia.index.query.limit.QueryLimits;
|
||||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||||
@ -14,7 +14,7 @@ import java.util.List;
|
|||||||
public class SearchQueryParamFactory {
|
public class SearchQueryParamFactory {
|
||||||
|
|
||||||
public QueryParams forRegularSearch(SearchParameters userParams) {
|
public QueryParams forRegularSearch(SearchParameters userParams) {
|
||||||
SearchSubquery prototype = new SearchSubquery();
|
SearchQuery prototype = new SearchQuery();
|
||||||
var profile = userParams.profile();
|
var profile = userParams.profile();
|
||||||
|
|
||||||
profile.addTacitTerms(prototype);
|
profile.addTacitTerms(prototype);
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
package nu.marginalia.search.command;
|
package nu.marginalia.search.command;
|
||||||
|
|
||||||
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
|
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||||
|
|
||||||
import javax.annotation.Nullable;
|
import javax.annotation.Nullable;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
@ -23,7 +23,7 @@ public enum SearchAdtechParameter {
|
|||||||
return DEFAULT;
|
return DEFAULT;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void addTacitTerms(SearchSubquery subquery) {
|
public void addTacitTerms(SearchQuery subquery) {
|
||||||
subquery.searchTermsExclude.addAll(Arrays.asList(implictExcludeSearchTerms));
|
subquery.searchTermsExclude.addAll(Arrays.asList(implictExcludeSearchTerms));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
package nu.marginalia.search.command;
|
package nu.marginalia.search.command;
|
||||||
|
|
||||||
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
|
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||||
|
|
||||||
import javax.annotation.Nullable;
|
import javax.annotation.Nullable;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
@ -25,7 +25,7 @@ public enum SearchJsParameter {
|
|||||||
return DEFAULT;
|
return DEFAULT;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void addTacitTerms(SearchSubquery subquery) {
|
public void addTacitTerms(SearchQuery subquery) {
|
||||||
subquery.searchTermsExclude.addAll(Arrays.asList(implictExcludeSearchTerms));
|
subquery.searchTermsExclude.addAll(Arrays.asList(implictExcludeSearchTerms));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -2,7 +2,7 @@ package nu.marginalia.search.model;
|
|||||||
|
|
||||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||||
import nu.marginalia.model.crawl.HtmlFeature;
|
import nu.marginalia.model.crawl.HtmlFeature;
|
||||||
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
|
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||||
import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier;
|
import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier;
|
||||||
|
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
@ -47,7 +47,7 @@ public enum SearchProfile {
|
|||||||
return NO_FILTER;
|
return NO_FILTER;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void addTacitTerms(SearchSubquery subquery) {
|
public void addTacitTerms(SearchQuery subquery) {
|
||||||
if (this == ACADEMIA) {
|
if (this == ACADEMIA) {
|
||||||
subquery.searchTermsAdvice.add("special:academia");
|
subquery.searchTermsAdvice.add("special:academia");
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user