diff --git a/code/common/model/java/nu/marginalia/model/idx/WordFlags.java b/code/common/model/java/nu/marginalia/model/idx/WordFlags.java index dc627715..db54df77 100644 --- a/code/common/model/java/nu/marginalia/model/idx/WordFlags.java +++ b/code/common/model/java/nu/marginalia/model/idx/WordFlags.java @@ -50,6 +50,10 @@ public enum WordFlags { return (asBit() & value) > 0; } + public boolean isAbsent(long value) { + return (asBit() & value) == 0; + } + public static EnumSet decode(long encodedValue) { EnumSet ret = EnumSet.noneOf(WordFlags.class); @@ -61,4 +65,5 @@ public enum WordFlags { return ret; } + } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java index b705917e..5a43df1b 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java @@ -134,6 +134,8 @@ public class QueryProtobufCodec { return new SearchResultItem( rawItem.getCombinedId(), + rawItem.getEncodedDocMetadata(), + rawItem.getHtmlFeatures(), keywordScores, rawItem.getResultsFromDomain(), Double.NaN // Not set @@ -144,9 +146,7 @@ public class QueryProtobufCodec { return new SearchResultKeywordScore( keywordScores.getKeyword(), -1, // termId is internal to index service - keywordScores.getEncodedWordMetadata(), - keywordScores.getEncodedDocMetadata(), - keywordScores.getHtmlFeatures() + keywordScores.getEncodedWordMetadata() ); } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQuery.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQuery.java index 3ae850a3..356a1d86 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQuery.java +++ 
b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQuery.java @@ -46,6 +46,10 @@ public class CompiledQuery implements Iterable { return new CompiledQueryLong(root, data.mapToLong(mapper)); } + public CompiledQueryLong mapToInt(ToIntFunction mapper) { + return new CompiledQueryLong(root, data.mapToInt(mapper)); + } + public CqExpression root() { return root; } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryInt.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryInt.java new file mode 100644 index 00000000..9e26c35c --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryInt.java @@ -0,0 +1,44 @@ +package nu.marginalia.api.searchquery.model.compiled; + +import java.util.stream.IntStream; + + +/** A compiled index service query */ +public class CompiledQueryInt { + private final CqExpression root; + private final CqDataInt data; + + public CompiledQueryInt(CqExpression root, CqDataInt data) { + this.root = root; + this.data = data; + } + + + public CqExpression root() { + return root; + } + + public IntStream stream() { + return data.stream(); + } + + public IntStream indices() { + return IntStream.range(0, data.size()); + } + + public long at(int index) { + return data.get(index); + } + + public int[] copyData() { + return data.copyData(); + } + + public boolean isEmpty() { + return data.size() == 0; + } + + public int size() { + return data.size(); + } +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryLong.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryLong.java index 94fa0e8b..718aaca7 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryLong.java +++ 
b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryLong.java @@ -9,8 +9,8 @@ import java.util.stream.LongStream; /** A compiled index service query */ public class CompiledQueryLong implements Iterable { - private final CqExpression root; - private final CqDataLong data; + public final CqExpression root; + public final CqDataLong data; public CompiledQueryLong(CqExpression root, CqDataLong data) { this.root = root; @@ -47,4 +47,8 @@ public class CompiledQueryLong implements Iterable { public boolean isEmpty() { return data.size() == 0; } + + public int size() { + return data.size(); + } } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqData.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqData.java index b1565dc0..145f3f0f 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqData.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqData.java @@ -3,7 +3,7 @@ package nu.marginalia.api.searchquery.model.compiled; import java.lang.reflect.Array; import java.util.Arrays; import java.util.function.Function; -import java.util.function.ToDoubleFunction; +import java.util.function.ToIntFunction; import java.util.function.ToLongFunction; import java.util.stream.Stream; @@ -33,6 +33,15 @@ public class CqData { return new CqDataLong(newData); } + public CqDataLong mapToInt(ToIntFunction mapper) { + long[] newData = new long[data.length]; + for (int i = 0; i < data.length; i++) { + newData[i] = mapper.applyAsInt((T) data[i]); + } + + return new CqDataLong(newData); + } + public T get(int i) { return data[i]; } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqDataInt.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqDataInt.java new file mode 100644 index 
00000000..24991686 --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqDataInt.java @@ -0,0 +1,31 @@ +package nu.marginalia.api.searchquery.model.compiled; + +import java.util.Arrays; +import java.util.stream.IntStream; + +public class CqDataInt { + private final int[] data; + + public CqDataInt(int[] data) { + this.data = data; + } + + public int get(int i) { + return data[i]; + } + public int get(CqExpression.Word w) { + return data[w.idx()]; + } + + public IntStream stream() { + return Arrays.stream(data); + } + + public int size() { + return data.length; + } + + public int[] copyData() { + return Arrays.copyOf(data, data.length); + } +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java index 0ab0647d..7e8ca8ec 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java @@ -17,6 +17,9 @@ public class CompiledQueryAggregates { static public boolean booleanAggregate(CompiledQuery query, Predicate predicate) { return query.root.visit(new CqBooleanAggregate(query, predicate)); } + static public boolean booleanAggregate(CompiledQueryLong query, LongPredicate predicate) { + return query.root.visit(new CqBooleanAggregate(query, predicate)); + } /** Compiled query aggregate that for a 64b bitmask that treats or-branches as logical OR, @@ -25,13 +28,20 @@ public class CompiledQueryAggregates { public static long longBitmaskAggregate(CompiledQuery query, ToLongFunction operator) { return query.root.visit(new CqLongBitmaskOperator(query, operator)); } - + public static long longBitmaskAggregate(CompiledQueryLong query, 
LongUnaryOperator operator) { + return query.root.visit(new CqLongBitmaskOperator(query, operator)); + } /** Apply the operator to each leaf node, then return the highest minimum value found along any path */ public static int intMaxMinAggregate(CompiledQuery query, ToIntFunction operator) { return query.root.visit(new CqIntMaxMinOperator(query, operator)); } + /** Apply the operator to each leaf node, then return the highest minimum value found along any path */ + public static int intMaxMinAggregate(CompiledQueryLong query, LongToIntFunction operator) { + return query.root.visit(new CqIntMaxMinOperator(query, operator)); + } + /** Apply the operator to each leaf node, and then return the highest sum of values possible * through each branch in the compiled query. * @@ -49,4 +59,9 @@ public class CompiledQueryAggregates { public static LongSet positionsAggregate(CompiledQuery query, ToLongFunction operator) { return query.root().visit(new CqPositionsOperator(query, operator)); } + + /** Using the bitwise AND operator, aggregate all possible combined values of the long generated by the provided operator */ + public static LongSet positionsAggregate(CompiledQueryLong query, LongUnaryOperator operator) { + return query.root().visit(new CqPositionsOperator(query, operator)); + } } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqBooleanAggregate.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqBooleanAggregate.java index 05ebf4c7..2a87ec79 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqBooleanAggregate.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqBooleanAggregate.java @@ -1,10 +1,12 @@ package nu.marginalia.api.searchquery.model.compiled.aggregate; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import 
nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.compiled.CqExpression; import java.util.List; import java.util.function.IntPredicate; +import java.util.function.LongPredicate; import java.util.function.Predicate; public class CqBooleanAggregate implements CqExpression.BoolVisitor { @@ -15,6 +17,10 @@ public class CqBooleanAggregate implements CqExpression.BoolVisitor { this.predicate = idx -> objPred.test(query.at(idx)); } + public CqBooleanAggregate(CompiledQueryLong query, LongPredicate longPredicate) { + this.predicate = idx -> longPredicate.test(query.at(idx)); + } + @Override public boolean onAnd(List parts) { for (var part : parts) { diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqDoubleSumOperator.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqDoubleSumOperator.java index 23d1904e..082de29e 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqDoubleSumOperator.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqDoubleSumOperator.java @@ -1,10 +1,12 @@ package nu.marginalia.api.searchquery.model.compiled.aggregate; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.compiled.CqExpression; import java.util.List; import java.util.function.IntToDoubleFunction; +import java.util.function.LongToDoubleFunction; import java.util.function.ToDoubleFunction; public class CqDoubleSumOperator implements CqExpression.DoubleVisitor { @@ -15,6 +17,10 @@ public class CqDoubleSumOperator implements CqExpression.DoubleVisitor { this.operator = idx -> operator.applyAsDouble(query.at(idx)); } + public CqDoubleSumOperator(IntToDoubleFunction operator) { + this.operator = 
operator; + } + @Override public double onAnd(List parts) { double value = 0; diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqIntMaxMinOperator.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqIntMaxMinOperator.java index b3ec86bb..621dff73 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqIntMaxMinOperator.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqIntMaxMinOperator.java @@ -1,10 +1,12 @@ package nu.marginalia.api.searchquery.model.compiled.aggregate; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.compiled.CqExpression; import java.util.List; import java.util.function.IntUnaryOperator; +import java.util.function.LongToIntFunction; import java.util.function.ToIntFunction; public class CqIntMaxMinOperator implements CqExpression.IntVisitor { @@ -16,6 +18,10 @@ public class CqIntMaxMinOperator implements CqExpression.IntVisitor { this.operator = idx -> operator.applyAsInt(query.at(idx)); } + public CqIntMaxMinOperator(CompiledQueryLong query, LongToIntFunction operator) { + this.operator = idx -> operator.applyAsInt(query.at(idx)); + } + @Override public int onAnd(List parts) { int value = parts.getFirst().visit(this); diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqLongBitmaskOperator.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqLongBitmaskOperator.java index d9a4804b..b64029c1 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqLongBitmaskOperator.java +++ 
b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqLongBitmaskOperator.java @@ -1,10 +1,12 @@ package nu.marginalia.api.searchquery.model.compiled.aggregate; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.compiled.CqExpression; import java.util.List; import java.util.function.IntToLongFunction; +import java.util.function.LongUnaryOperator; import java.util.function.ToLongFunction; public class CqLongBitmaskOperator implements CqExpression.LongVisitor { @@ -14,6 +16,9 @@ public class CqLongBitmaskOperator implements CqExpression.LongVisitor { public CqLongBitmaskOperator(CompiledQuery query, ToLongFunction operator) { this.operator = idx-> operator.applyAsLong(query.at(idx)); } + public CqLongBitmaskOperator(CompiledQueryLong query, LongUnaryOperator operator) { + this.operator = idx-> operator.applyAsLong(query.at(idx)); + } @Override public long onAnd(List parts) { diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqPositionsOperator.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqPositionsOperator.java index 19db2d4b..715c4cb2 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqPositionsOperator.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqPositionsOperator.java @@ -4,10 +4,12 @@ import it.unimi.dsi.fastutil.longs.LongArraySet; import it.unimi.dsi.fastutil.longs.LongOpenHashSet; import it.unimi.dsi.fastutil.longs.LongSet; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.compiled.CqExpression; import java.util.List; import 
java.util.function.IntToLongFunction; +import java.util.function.LongUnaryOperator; import java.util.function.ToLongFunction; public class CqPositionsOperator implements CqExpression.ObjectVisitor { @@ -17,6 +19,10 @@ public class CqPositionsOperator implements CqExpression.ObjectVisitor this.operator = idx -> operator.applyAsLong(query.at(idx)); } + public CqPositionsOperator(CompiledQueryLong query, LongUnaryOperator operator) { + this.operator = idx -> operator.applyAsLong(query.at(idx)); + } + @Override public LongSet onAnd(List parts) { LongSet ret = new LongArraySet(); diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingContext.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingContext.java index f0ad172f..9052345a 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingContext.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingContext.java @@ -1,38 +1,34 @@ package nu.marginalia.api.searchquery.model.results; -import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; import lombok.ToString; - -import java.util.Map; +import nu.marginalia.api.searchquery.model.compiled.CqDataInt; @ToString public class ResultRankingContext { private final int docCount; public final ResultRankingParameters params; - private final Object2IntOpenHashMap fullCounts = new Object2IntOpenHashMap<>(10, 0.5f); - private final Object2IntOpenHashMap priorityCounts = new Object2IntOpenHashMap<>(10, 0.5f); + /** CqDataInt associated with frequency information of the terms in the query + * in the full index. The dataset is indexed by the compiled query. */ + public final CqDataInt fullCounts; + + /** CqDataInt associated with frequency information of the terms in the query + * in the priority index. The dataset is indexed by the compiled query. 
*/ + public final CqDataInt priorityCounts; public ResultRankingContext(int docCount, ResultRankingParameters params, - Map fullCounts, - Map prioCounts - ) { + CqDataInt fullCounts, + CqDataInt prioCounts) + { this.docCount = docCount; this.params = params; - this.fullCounts.putAll(fullCounts); - this.priorityCounts.putAll(prioCounts); + this.fullCounts = fullCounts; + this.priorityCounts = prioCounts; } public int termFreqDocCount() { return docCount; } - public int frequency(String keyword) { - return fullCounts.getOrDefault(keyword, 1); - } - - public int priorityFrequency(String keyword) { - return priorityCounts.getOrDefault(keyword, 1); - } } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java index 8f50c9fb..7cd95b96 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java @@ -15,15 +15,24 @@ public class SearchResultItem implements Comparable { * probably not what you want, use getDocumentId() instead */ public final long combinedId; + /** Encoded document metadata */ + public final long encodedDocMetadata; + + /** Encoded html features of document */ + + public final int htmlFeatures; + /** How did the subqueries match against the document ? 
*/ public final List keywordScores; /** How many other potential results existed in the same domain */ public int resultsFromDomain; - public SearchResultItem(long combinedId) { + public SearchResultItem(long combinedId, long encodedDocMetadata, int htmlFeatures) { this.combinedId = combinedId; + this.encodedDocMetadata = encodedDocMetadata; this.keywordScores = new ArrayList<>(); + this.htmlFeatures = htmlFeatures; } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultKeywordScore.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultKeywordScore.java index a0fd2156..212b2302 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultKeywordScore.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultKeywordScore.java @@ -2,7 +2,6 @@ package nu.marginalia.api.searchquery.model.results; import nu.marginalia.model.idx.WordFlags; import nu.marginalia.model.idx.WordMetadata; -import nu.marginalia.model.idx.DocumentMetadata; import java.util.Objects; @@ -10,34 +9,20 @@ public final class SearchResultKeywordScore { public final long termId; public final String keyword; private final long encodedWordMetadata; - private final long encodedDocMetadata; - - private final int htmlFeatures; public SearchResultKeywordScore(String keyword, long termId, - long encodedWordMetadata, - long encodedDocMetadata, - int htmlFeatures) { + long encodedWordMetadata) { this.termId = termId; this.keyword = keyword; this.encodedWordMetadata = encodedWordMetadata; - this.encodedDocMetadata = encodedDocMetadata; - this.htmlFeatures = htmlFeatures; } public boolean hasTermFlag(WordFlags flag) { return WordMetadata.hasFlags(encodedWordMetadata, flag.asBit()); } - public int positionCount() { - return Long.bitCount(positions()); - } - @Deprecated // FIXME 2024-04-06 - public int subquery() { - return -1; 
- } public long positions() { return WordMetadata.decodePositions(encodedWordMetadata); } @@ -46,44 +31,28 @@ public final class SearchResultKeywordScore { return keyword.contains(":") || hasTermFlag(WordFlags.Synthetic); } - public boolean isKeywordRegular() { - return !keyword.contains(":") - && !hasTermFlag(WordFlags.Synthetic); - } - public long encodedWordMetadata() { return encodedWordMetadata; } - public long encodedDocMetadata() { - return encodedDocMetadata; - } - - public int htmlFeatures() { - return htmlFeatures; - } - @Override public boolean equals(Object obj) { if (obj == this) return true; if (obj == null || obj.getClass() != this.getClass()) return false; var that = (SearchResultKeywordScore) obj; - return Objects.equals(this.keyword, that.keyword) && - this.encodedWordMetadata == that.encodedWordMetadata && - this.encodedDocMetadata == that.encodedDocMetadata; + return Objects.equals(this.termId, that.termId); } @Override public int hashCode() { - return Objects.hash(keyword, encodedWordMetadata, encodedDocMetadata); + return Objects.hash(termId); } @Override public String toString() { return "SearchResultKeywordScore[" + "keyword=" + keyword + ", " + - "encodedWordMetadata=" + new WordMetadata(encodedWordMetadata) + ", " + - "encodedDocMetadata=" + new DocumentMetadata(encodedDocMetadata) + ']'; + "encodedWordMetadata=" + new WordMetadata(encodedWordMetadata) + ']'; } } diff --git a/code/functions/search-query/api/src/main/protobuf/query-api.proto b/code/functions/search-query/api/src/main/protobuf/query-api.proto index df25c494..3094699b 100644 --- a/code/functions/search-query/api/src/main/protobuf/query-api.proto +++ b/code/functions/search-query/api/src/main/protobuf/query-api.proto @@ -98,16 +98,16 @@ message RpcDecoratedResultItem { message RpcRawResultItem { int64 combinedId = 1; // raw ID with bit-encoded ranking information still present int32 resultsFromDomain = 2; // number of other results from the same domain - repeated 
RpcResultKeywordScore keywordScores = 3; + int64 encodedDocMetadata = 3; // bit encoded document metadata + int32 htmlFeatures = 4; // bitmask encoding features of the document + repeated RpcResultKeywordScore keywordScores = 5; } /* Information about how well a keyword matches a query */ message RpcResultKeywordScore { string keyword = 1; // the keyword int64 encodedWordMetadata = 2; // bit encoded word metadata - int64 encodedDocMetadata = 3; // bit encoded document metadata - bool hasPriorityTerms = 4; // true if this word is important to the document - int32 htmlFeatures = 5; // bit encoded document features + bool hasPriorityTerms = 3; // true if this word is important to the document } /* Query execution parameters */ diff --git a/code/index/java/nu/marginalia/index/IndexGrpcService.java b/code/index/java/nu/marginalia/index/IndexGrpcService.java index 36b611ff..fa0a8343 100644 --- a/code/index/java/nu/marginalia/index/IndexGrpcService.java +++ b/code/index/java/nu/marginalia/index/IndexGrpcService.java @@ -11,6 +11,7 @@ import lombok.SneakyThrows; import nu.marginalia.api.searchquery.*; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; +import nu.marginalia.api.searchquery.model.compiled.CqDataInt; import nu.marginalia.api.searchquery.model.query.SearchSpecification; import nu.marginalia.api.searchquery.model.results.*; import nu.marginalia.array.buffer.LongQueryBuffer; @@ -135,14 +136,14 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { var rawItem = RpcRawResultItem.newBuilder(); rawItem.setCombinedId(rawResult.combinedId); rawItem.setResultsFromDomain(rawResult.resultsFromDomain); + rawItem.setHtmlFeatures(rawResult.htmlFeatures); + rawItem.setEncodedDocMetadata(rawResult.encodedDocMetadata); for (var score : rawResult.keywordScores) { rawItem.addKeywordScores( RpcResultKeywordScore.newBuilder() - 
.setEncodedDocMetadata(score.encodedDocMetadata()) .setEncodedWordMetadata(score.encodedWordMetadata()) .setKeyword(score.keyword) - .setHtmlFeatures(score.htmlFeatures()) ); } @@ -203,9 +204,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { return new SearchResultSet(List.of()); } - ResultRankingContext rankingContext = createRankingContext(params.rankingParams, - params.compiledQuery, - params.compiledQueryIds); + ResultRankingContext rankingContext = createRankingContext(params.rankingParams, params.compiledQueryIds); var queryExecution = new QueryExecution(rankingContext, params.fetchSize); @@ -414,22 +413,22 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { } private ResultRankingContext createRankingContext(ResultRankingParameters rankingParams, - CompiledQuery query, CompiledQueryLong compiledQueryIds) { - Map termToId = new HashMap<>(query.size()); - query.indices().forEach(id -> termToId.put(query.at(id), compiledQueryIds.at(id))); - final Map termFrequencies = new HashMap<>(termToId.size()); - final Map prioFrequencies = new HashMap<>(termToId.size()); + int[] full = new int[compiledQueryIds.size()]; + int[] prio = new int[compiledQueryIds.size()]; - termToId.forEach((key, id) -> termFrequencies.put(key, index.getTermFrequency(id))); - termToId.forEach((key, id) -> prioFrequencies.put(key, index.getTermFrequencyPrio(id))); + for (int idx = 0; idx < compiledQueryIds.size(); idx++) { + long id = compiledQueryIds.at(idx); + full[idx] = index.getTermFrequency(id); + prio[idx] = index.getTermFrequencyPrio(id); + } return new ResultRankingContext(index.getTotalDocCount(), rankingParams, - termFrequencies, - prioFrequencies); + new CqDataInt(full), + new CqDataInt(prio)); } } diff --git a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java b/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java index 3777cf4f..89b4c543 100644 --- 
a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java +++ b/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java @@ -1,7 +1,6 @@ package nu.marginalia.index.results; -import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; -import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; +import nu.marginalia.api.searchquery.model.compiled.*; import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.SearchResultItem; @@ -70,39 +69,42 @@ public class IndexResultValuationContext { long docMetadata = statefulIndex.getDocumentMetadata(docId); int htmlFeatures = statefulIndex.getHtmlFeatures(docId); - SearchResultItem searchResult = new SearchResultItem(docId); + SearchResultItem searchResult = new SearchResultItem(docId, docMetadata, htmlFeatures); + + long[] wordMetas = new long[compiledQuery.size()]; + SearchResultKeywordScore[] scores = new SearchResultKeywordScore[compiledQuery.size()]; + + for (int i = 0; i < wordMetas.length; i++) { + final long termId = compiledQueryIds.at(i); + final String term = compiledQuery.at(i); + + wordMetas[i] = termMetadataForCombinedDocumentIds.getTermMetadata(termId, combinedId); + scores[i] = new SearchResultKeywordScore(term, termId, wordMetas[i]); + } - SearchResultKeywordScore[] scores = compiledQuery.indices().mapToObj(idx -> - new SearchResultKeywordScore( - compiledQuery.at(idx), - compiledQueryIds.at(idx), - termMetadataForCombinedDocumentIds.getTermMetadata( - compiledQueryIds.at(idx), combinedId - ), - docMetadata, - htmlFeatures) - ) - .toArray(SearchResultKeywordScore[]::new); // DANGER: IndexResultValuatorService assumes that searchResult.keywordScores has this specific order, as it needs // to be able to re-construct its own CompiledQuery for re-ranking the results. This is // a very flimsy assumption. 
searchResult.keywordScores.addAll(List.of(scores)); - CompiledQuery queryGraphScores = new CompiledQuery<>(compiledQuery.root, scores); + CompiledQueryLong wordMetasQuery = new CompiledQueryLong(compiledQuery.root, new CqDataLong(wordMetas)); - boolean allSynthetic = !CompiledQueryAggregates.booleanAggregate(queryGraphScores, score -> !score.hasTermFlag(WordFlags.Synthetic)); - int flagsCount = CompiledQueryAggregates.intMaxMinAggregate(queryGraphScores, score -> Long.bitCount(score.encodedWordMetadata() & flagsFilterMask)); - int positionsCount = CompiledQueryAggregates.intMaxMinAggregate(queryGraphScores, SearchResultKeywordScore::positionCount); + boolean allSynthetic = !CompiledQueryAggregates.booleanAggregate(wordMetasQuery, WordFlags.Synthetic::isAbsent); + int flagsCount = CompiledQueryAggregates.intMaxMinAggregate(wordMetasQuery, wordMeta -> Long.bitCount(wordMeta & flagsFilterMask)); + int positionsCount = CompiledQueryAggregates.intMaxMinAggregate(wordMetasQuery, wordMeta -> Long.bitCount(WordMetadata.decodePositions(wordMeta))); - if (!meetsQueryStrategyRequirements(queryGraphScores, queryParams.queryStrategy())) { + if (!meetsQueryStrategyRequirements(wordMetasQuery, queryParams.queryStrategy())) { return null; } if (flagsCount == 0 && !allSynthetic && positionsCount == 0) return null; - double score = searchResultValuator.calculateSearchResultValue(queryGraphScores, + double score = searchResultValuator.calculateSearchResultValue( + wordMetasQuery, + docMetadata, + htmlFeatures, 5000, // use a dummy value here as it's not present in the index rankingContext); @@ -111,7 +113,7 @@ public class IndexResultValuationContext { return searchResult; } - private boolean meetsQueryStrategyRequirements(CompiledQuery queryGraphScores, + private boolean meetsQueryStrategyRequirements(CompiledQueryLong queryGraphScores, QueryStrategy queryStrategy) { if (queryStrategy == QueryStrategy.AUTO || @@ -124,24 +126,24 @@ public class IndexResultValuationContext { docs -> 
meetsQueryStrategyRequirements(docs, queryParams.queryStrategy())); } - private boolean meetsQueryStrategyRequirements(SearchResultKeywordScore termScore, QueryStrategy queryStrategy) { + private boolean meetsQueryStrategyRequirements(long wordMeta, QueryStrategy queryStrategy) { if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SITE) { - return WordMetadata.hasFlags(termScore.encodedWordMetadata(), WordFlags.Site.asBit()); + return WordFlags.Site.isPresent(wordMeta); } else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SUBJECT) { - return WordMetadata.hasFlags(termScore.encodedWordMetadata(), WordFlags.Subjects.asBit()); + return WordFlags.Subjects.isPresent(wordMeta); } else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_TITLE) { - return WordMetadata.hasFlags(termScore.encodedWordMetadata(), WordFlags.Title.asBit()); + return WordFlags.Title.isPresent(wordMeta); } else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_URL) { - return WordMetadata.hasFlags(termScore.encodedWordMetadata(), WordFlags.UrlPath.asBit()); + return WordFlags.UrlPath.isPresent(wordMeta); } else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_DOMAIN) { - return WordMetadata.hasFlags(termScore.encodedWordMetadata(), WordFlags.UrlDomain.asBit()); + return WordFlags.UrlDomain.isPresent(wordMeta); } else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_LINK) { - return WordMetadata.hasFlags(termScore.encodedWordMetadata(), WordFlags.ExternalLink.asBit()); + return WordFlags.ExternalLink.isPresent(wordMeta); } return true; } diff --git a/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java b/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java index a84e5f4f..2fa44c31 100644 --- a/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java +++ b/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java @@ -6,16 +6,19 @@ import gnu.trove.list.TLongList; import gnu.trove.list.array.TLongArrayList; import 
it.unimi.dsi.fastutil.longs.LongSet; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; +import nu.marginalia.api.searchquery.model.compiled.CqDataInt; +import nu.marginalia.api.searchquery.model.compiled.CqDataLong; import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.SearchResultItem; -import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; import nu.marginalia.index.index.StatefulIndex; import nu.marginalia.index.model.SearchParameters; import nu.marginalia.index.results.model.ids.CombinedDocIdList; import nu.marginalia.linkdb.docs.DocumentDbReader; import nu.marginalia.linkdb.model.DocdbUrlDetail; +import nu.marginalia.model.idx.WordMetadata; import nu.marginalia.ranking.results.ResultValuator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -126,22 +129,31 @@ public class IndexResultValuatorService { continue; } - // Reconstruct the SearchResultKeywordScore-compiledquery for re-valuation + // Reconstruct the compiledquery for re-valuation // // CAVEAT: This hinges on a very fragile that IndexResultValuationContext puts them in the same // order as the data for the CompiledQuery. 
- CompiledQuery resultQuery = - new CompiledQuery<>(compiledQuery.root, result.keywordScores.toArray(SearchResultKeywordScore[]::new)); + long[] wordMetas = new long[compiledQuery.size()]; + for (int i = 0; i < compiledQuery.size(); i++) { + var score = result.keywordScores.get(i); + wordMetas[i] = score.encodedWordMetadata(); + } - resultItems.add(createCombinedItem(result, docData, resultQuery, rankingContext)); + CompiledQueryLong metaQuery = new CompiledQueryLong(compiledQuery.root, new CqDataLong(wordMetas)); + + resultItems.add(createCombinedItem( + result, + docData, + metaQuery, + rankingContext)); } return resultItems; } private DecoratedSearchResultItem createCombinedItem(SearchResultItem result, DocdbUrlDetail docData, - CompiledQuery resultQuery, + CompiledQueryLong wordMetas, ResultRankingContext rankingContext) { return new DecoratedSearchResultItem( result, @@ -154,13 +166,19 @@ public class IndexResultValuatorService { docData.pubYear(), docData.dataHash(), docData.wordsTotal(), - bestPositions(resultQuery), - resultValuator.calculateSearchResultValue(resultQuery, docData.wordsTotal(), rankingContext) + bestPositions(wordMetas), + + resultValuator.calculateSearchResultValue(wordMetas, + result.encodedDocMetadata, + result.htmlFeatures, + docData.wordsTotal(), + rankingContext) ); } - private long bestPositions(CompiledQuery resultQuery) { - LongSet positionsSet = CompiledQueryAggregates.positionsAggregate(resultQuery, SearchResultKeywordScore::positions); + private long bestPositions(CompiledQueryLong wordMetas) { + LongSet positionsSet = CompiledQueryAggregates.positionsAggregate(wordMetas, WordMetadata::decodePositions); + int bestPc = 0; long bestPositions = 0; diff --git a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java index 862978c9..4d257349 100644 --- a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java +++ 
b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java @@ -1,9 +1,8 @@ package nu.marginalia.ranking.results; -import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; -import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.idx.DocumentFlags; @@ -15,36 +14,32 @@ import com.google.inject.Singleton; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.ArrayList; -import java.util.List; - @Singleton public class ResultValuator { final static double scalingFactor = 500.; - private final Bm25Factor bm25Factor; private final TermCoherenceFactor termCoherenceFactor; private static final Logger logger = LoggerFactory.getLogger(ResultValuator.class); @Inject - public ResultValuator(Bm25Factor bm25Factor, - TermCoherenceFactor termCoherenceFactor) { - this.bm25Factor = bm25Factor; + public ResultValuator(TermCoherenceFactor termCoherenceFactor) { this.termCoherenceFactor = termCoherenceFactor; } - public double calculateSearchResultValue(CompiledQuery scores, + public double calculateSearchResultValue(CompiledQueryLong wordMeta, + long documentMetadata, + int features, int length, ResultRankingContext ctx) { - if (scores.size() == 0) + if (wordMeta.isEmpty()) return Double.MAX_VALUE; - if (length < 0) - length = 5000; - long documentMetadata = scores.at(0).encodedDocMetadata(); - int features = scores.at(0).htmlFeatures(); + if (length < 0) { + length = 5000; + } + var rankingParams = ctx.params; int rank = DocumentMetadata.decodeRank(documentMetadata); @@ -79,9 +74,10 @@ public class ResultValuator { + temporalBias + flagsPenalty; - double bestTcf = rankingParams.tcfWeight * 
termCoherenceFactor.calculate(scores); - double bestBM25F = rankingParams.bm25FullWeight * bm25Factor.calculateBm25(rankingParams.prioParams, scores, length, ctx); - double bestBM25P = rankingParams.bm25PrioWeight * bm25Factor.calculateBm25Prio(rankingParams.prioParams, scores, ctx); + double bestTcf = rankingParams.tcfWeight * termCoherenceFactor.calculate(wordMeta); + + double bestBM25F = rankingParams.bm25FullWeight * wordMeta.root.visit(new Bm25FullGraphVisitor(rankingParams.fullParams, wordMeta.data, length, ctx)); + double bestBM25P = rankingParams.bm25PrioWeight * wordMeta.root.visit(new Bm25PrioGraphVisitor(rankingParams.prioParams, wordMeta.data, ctx)); double overallPartPositive = Math.max(0, overallPart); double overallPartNegative = -Math.min(0, overallPart); diff --git a/code/index/java/nu/marginalia/ranking/results/factors/Bm25Factor.java b/code/index/java/nu/marginalia/ranking/results/factors/Bm25Factor.java deleted file mode 100644 index bc13671e..00000000 --- a/code/index/java/nu/marginalia/ranking/results/factors/Bm25Factor.java +++ /dev/null @@ -1,113 +0,0 @@ -package nu.marginalia.ranking.results.factors; - -import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; -import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; -import nu.marginalia.api.searchquery.model.results.Bm25Parameters; -import nu.marginalia.api.searchquery.model.results.ResultRankingContext; -import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; -import nu.marginalia.model.idx.WordFlags; - -public class Bm25Factor { - private static final int AVG_LENGTH = 5000; - - /** This is an estimation of BM-25. 
- * - * @see Bm25Parameters - */ - public double calculateBm25(Bm25Parameters bm25Parameters, CompiledQuery scores, int length, ResultRankingContext ctx) { - final int docCount = ctx.termFreqDocCount(); - - return CompiledQueryAggregates.doubleSumAggregate(scores, keyword -> { - double count = keyword.positionCount(); - - int freq = ctx.frequency(keyword.keyword); - - return invFreq(docCount, freq) * f(bm25Parameters.k(), bm25Parameters.b(), count, length); - }); - } - - /** Bm25 calculation, except instead of counting positions in the document, - * the number of relevance signals for the term is counted instead. - */ - public double calculateBm25Prio(Bm25Parameters bm25Parameters, CompiledQuery scores, ResultRankingContext ctx) { - final int docCount = ctx.termFreqDocCount(); - - return CompiledQueryAggregates.doubleSumAggregate(scores, keyword -> { - double count = evaluatePriorityScore(keyword); - - int freq = ctx.priorityFrequency(keyword.keyword); - - // note we override b to zero for priority terms as they are independent of document length - return invFreq(docCount, freq) * f(bm25Parameters.k(), 0, count, 0); - }); - - } - - private static double evaluatePriorityScore(SearchResultKeywordScore keyword) { - int pcount = keyword.positionCount(); - - double qcount = 0.; - - if ((keyword.encodedWordMetadata() & WordFlags.ExternalLink.asBit()) != 0) { - - qcount += 2.5; - - if ((keyword.encodedWordMetadata() & WordFlags.UrlDomain.asBit()) != 0) - qcount += 2.5; - else if ((keyword.encodedWordMetadata() & WordFlags.UrlPath.asBit()) != 0) - qcount += 1.5; - - if ((keyword.encodedWordMetadata() & WordFlags.Site.asBit()) != 0) - qcount += 1.25; - if ((keyword.encodedWordMetadata() & WordFlags.SiteAdjacent.asBit()) != 0) - qcount += 1.25; - } - else { - if ((keyword.encodedWordMetadata() & WordFlags.UrlDomain.asBit()) != 0) - qcount += 3; - else if ((keyword.encodedWordMetadata() & WordFlags.UrlPath.asBit()) != 0) - qcount += 1; - - if ((keyword.encodedWordMetadata() & 
WordFlags.Site.asBit()) != 0) - qcount += 0.5; - if ((keyword.encodedWordMetadata() & WordFlags.SiteAdjacent.asBit()) != 0) - qcount += 0.5; - } - - if ((keyword.encodedWordMetadata() & WordFlags.Title.asBit()) != 0) - qcount += 1.5; - - if (pcount > 2) { - if ((keyword.encodedWordMetadata() & WordFlags.Subjects.asBit()) != 0) - qcount += 1.25; - if ((keyword.encodedWordMetadata() & WordFlags.NamesWords.asBit()) != 0) - qcount += 0.25; - if ((keyword.encodedWordMetadata() & WordFlags.TfIdfHigh.asBit()) != 0) - qcount += 0.5; - } - - return qcount; - } - - /** - * - * @param docCount Number of documents - * @param freq Number of matching documents - */ - private double invFreq(int docCount, int freq) { - return Math.log(1.0 + (docCount - freq + 0.5) / (freq + 0.5)); - } - - /** - * - * @param k determines the size of the impact of a single term - * @param b determines the magnitude of the length normalization - * @param count number of occurrences in the document - * @param length document length - */ - private double f(double k, double b, double count, int length) { - final double lengthRatio = (double) length / AVG_LENGTH; - - return (count * (k + 1)) / (count + k * (1 - b + b * lengthRatio)); - } -} diff --git a/code/index/java/nu/marginalia/ranking/results/factors/Bm25FullGraphVisitor.java b/code/index/java/nu/marginalia/ranking/results/factors/Bm25FullGraphVisitor.java new file mode 100644 index 00000000..9c46261d --- /dev/null +++ b/code/index/java/nu/marginalia/ranking/results/factors/Bm25FullGraphVisitor.java @@ -0,0 +1,81 @@ +package nu.marginalia.ranking.results.factors; + +import nu.marginalia.api.searchquery.model.compiled.CqDataInt; +import nu.marginalia.api.searchquery.model.compiled.CqDataLong; +import nu.marginalia.api.searchquery.model.compiled.CqExpression; +import nu.marginalia.api.searchquery.model.results.Bm25Parameters; +import nu.marginalia.api.searchquery.model.results.ResultRankingContext; +import nu.marginalia.model.idx.WordMetadata; + 
+import java.util.List; + +public class Bm25FullGraphVisitor implements CqExpression.DoubleVisitor { + private static final long AVG_LENGTH = 5000; + + private final CqDataLong wordMetaData; + private final CqDataInt frequencies; + private final Bm25Parameters bm25Parameters; + + private final int docCount; + private final int length; + + public Bm25FullGraphVisitor(Bm25Parameters bm25Parameters, + CqDataLong wordMetaData, + int length, + ResultRankingContext ctx) { + this.length = length; + this.bm25Parameters = bm25Parameters; + this.docCount = ctx.termFreqDocCount(); + this.wordMetaData = wordMetaData; + this.frequencies = ctx.fullCounts; + } + + @Override + public double onAnd(List parts) { + double value = 0; + for (var part : parts) { + value += part.visit(this); + } + return value; + } + + @Override + public double onOr(List parts) { + double value = 0; + for (var part : parts) { + value = Math.max(value, part.visit(this)); + } + return value; + } + + @Override + public double onLeaf(int idx) { + double count = Long.bitCount(WordMetadata.decodePositions(wordMetaData.get(idx))); + + int freq = frequencies.get(idx); + + return invFreq(docCount, freq) * f(bm25Parameters.k(), bm25Parameters.b(), count, length); + } + + /** + * + * @param docCount Number of documents + * @param freq Number of matching documents + */ + private double invFreq(int docCount, int freq) { + return Math.log(1.0 + (docCount - freq + 0.5) / (freq + 0.5)); + } + + /** + * + * @param k determines the size of the impact of a single term + * @param b determines the magnitude of the length normalization + * @param count number of occurrences in the document + * @param length document length + */ + private double f(double k, double b, double count, int length) { + final double lengthRatio = (double) length / AVG_LENGTH; + + return (count * (k + 1)) / (count + k * (1 - b + b * lengthRatio)); + } +} diff --git a/code/index/java/nu/marginalia/ranking/results/factors/Bm25PrioGraphVisitor.java 
b/code/index/java/nu/marginalia/ranking/results/factors/Bm25PrioGraphVisitor.java new file mode 100644 index 00000000..1fb26f6b --- /dev/null +++ b/code/index/java/nu/marginalia/ranking/results/factors/Bm25PrioGraphVisitor.java @@ -0,0 +1,127 @@ +package nu.marginalia.ranking.results.factors; + +import nu.marginalia.api.searchquery.model.compiled.CqDataInt; +import nu.marginalia.api.searchquery.model.compiled.CqDataLong; +import nu.marginalia.api.searchquery.model.compiled.CqExpression; +import nu.marginalia.api.searchquery.model.results.Bm25Parameters; +import nu.marginalia.api.searchquery.model.results.ResultRankingContext; +import nu.marginalia.model.idx.WordFlags; +import nu.marginalia.model.idx.WordMetadata; + +import java.util.List; + +public class Bm25PrioGraphVisitor implements CqExpression.DoubleVisitor { + private static final long AVG_LENGTH = 5000; + + private final CqDataLong wordMetaData; + private final CqDataInt frequencies; + private final Bm25Parameters bm25Parameters; + + private final int docCount; + + public Bm25PrioGraphVisitor(Bm25Parameters bm25Parameters, + CqDataLong wordMetaData, + ResultRankingContext ctx) { + this.bm25Parameters = bm25Parameters; + this.docCount = ctx.termFreqDocCount(); + this.wordMetaData = wordMetaData; + this.frequencies = ctx.fullCounts; + } + + @Override + public double onAnd(List parts) { + double value = 0; + for (var part : parts) { + value += part.visit(this); + } + return value; + } + + @Override + public double onOr(List parts) { + double value = 0; + for (var part : parts) { + value = Math.max(value, part.visit(this)); + } + return value; + } + + @Override + public double onLeaf(int idx) { + double count = evaluatePriorityScore(wordMetaData.get(idx)); + + int freq = frequencies.get(idx); + + // note we override b to zero for priority terms as they are independent of document length + return invFreq(docCount, freq) * f(bm25Parameters.k(), 0, count, 0); + } + + private static double evaluatePriorityScore(long 
wordMeta) { + int pcount = Long.bitCount(WordMetadata.decodePositions(wordMeta)); + + double qcount = 0.; + + if ((wordMeta & WordFlags.ExternalLink.asBit()) != 0) { + + qcount += 2.5; + + if ((wordMeta & WordFlags.UrlDomain.asBit()) != 0) + qcount += 2.5; + else if ((wordMeta & WordFlags.UrlPath.asBit()) != 0) + qcount += 1.5; + + if ((wordMeta & WordFlags.Site.asBit()) != 0) + qcount += 1.25; + if ((wordMeta & WordFlags.SiteAdjacent.asBit()) != 0) + qcount += 1.25; + } + else { + if ((wordMeta & WordFlags.UrlDomain.asBit()) != 0) + qcount += 3; + else if ((wordMeta & WordFlags.UrlPath.asBit()) != 0) + qcount += 1; + + if ((wordMeta & WordFlags.Site.asBit()) != 0) + qcount += 0.5; + if ((wordMeta & WordFlags.SiteAdjacent.asBit()) != 0) + qcount += 0.5; + } + + if ((wordMeta & WordFlags.Title.asBit()) != 0) + qcount += 1.5; + + if (pcount > 2) { + if ((wordMeta & WordFlags.Subjects.asBit()) != 0) + qcount += 1.25; + if ((wordMeta & WordFlags.NamesWords.asBit()) != 0) + qcount += 0.25; + if ((wordMeta & WordFlags.TfIdfHigh.asBit()) != 0) + qcount += 0.5; + } + + return qcount; + } + + + /** + * + * @param docCount Number of documents + * @param freq Number of matching documents + */ + private double invFreq(int docCount, int freq) { + return Math.log(1.0 + (docCount - freq + 0.5) / (freq + 0.5)); + } + + /** + * + * @param k determines the size of the impact of a single term + * @param b determines the magnitude of the length normalization + * @param count number of occurrences in the document + * @param length document length + */ + private double f(double k, double b, double count, int length) { + final double lengthRatio = (double) length / AVG_LENGTH; + + return (count * (k + 1)) / (count + k * (1 - b + b * lengthRatio)); + } +} diff --git a/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java b/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java index 71159c58..e617549d 100644 --- 
a/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java +++ b/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java @@ -1,16 +1,16 @@ package nu.marginalia.ranking.results.factors; -import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; -import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; import nu.marginalia.model.idx.WordMetadata; /** Rewards documents where terms appear frequently within the same sentences */ public class TermCoherenceFactor { - public double calculate(CompiledQuery scores) { - long mask = CompiledQueryAggregates.longBitmaskAggregate(scores, score -> score.positions() & WordMetadata.POSITIONS_MASK); + public double calculate(CompiledQueryLong wordMetadataQuery) { + long mask = CompiledQueryAggregates.longBitmaskAggregate(wordMetadataQuery, + score -> score >>> WordMetadata.POSITIONS_SHIFT); return bitsSetFactor(mask); } diff --git a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java index 301b5e19..7b0a6a24 100644 --- a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java +++ b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java @@ -215,9 +215,7 @@ public class IndexQueryServiceIntegrationSmokeTest { Set years = new HashSet<>(); for (var res : rsp.results) { - for (var score : res.rawIndexResult.getKeywordScores()) { - years.add(DocumentMetadata.decodeYear(score.encodedDocMetadata())); - } + years.add(DocumentMetadata.decodeYear(res.rawIndexResult.encodedDocMetadata)); } assertEquals(Set.of(1998), years); diff --git a/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java 
b/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java index 948c5857..c605a0a8 100644 --- a/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java +++ b/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java @@ -25,7 +25,7 @@ class IndexResultDomainDeduplicatorTest { } SearchResultItem forId(int domain, int ordinal) { - return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), List.of(), 4, Double.NaN); + return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), 0, 0, List.of(), 4, Double.NaN); } } \ No newline at end of file diff --git a/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java b/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java index 243ae90d..a1b66b04 100644 --- a/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java +++ b/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java @@ -1,6 +1,8 @@ package nu.marginalia.ranking.results; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; +import nu.marginalia.api.searchquery.model.compiled.CqDataInt; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; @@ -31,30 +33,27 @@ class ResultValuatorTest { when(dict.docCount()).thenReturn(100_000); valuator = new ResultValuator( - new Bm25Factor(), new TermCoherenceFactor() ); } - CompiledQuery titleOnlyLowCountSet = CompiledQuery.just( - new SearchResultKeywordScore("bob", 1, - wordMetadata(Set.of(1), EnumSet.of(WordFlags.Title)), - docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)), - 0) - ); - CompiledQuery highCountNoTitleSet = CompiledQuery.just( - new SearchResultKeywordScore("bob", 1, - 
wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh)), - docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)), - 0) - ); - CompiledQuery highCountSubjectSet = CompiledQuery.just( + CqDataInt frequencyData = new CqDataInt(new int[] { 10 }); + + CompiledQueryLong titleOnlyLowCountSet = CompiledQuery.just( new SearchResultKeywordScore("bob", 1, - wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh, WordFlags.Subjects)), - docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)), - 0) - ); + wordMetadata(Set.of(1), EnumSet.of(WordFlags.Title))) + ).mapToLong(SearchResultKeywordScore::encodedWordMetadata); + + CompiledQueryLong highCountNoTitleSet = CompiledQuery.just( + new SearchResultKeywordScore("bob", 1, + wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh))) + ).mapToLong(SearchResultKeywordScore::encodedWordMetadata);; + + CompiledQueryLong highCountSubjectSet = CompiledQuery.just( + new SearchResultKeywordScore("bob", 1, + wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh, WordFlags.Subjects))) + ).mapToLong(SearchResultKeywordScore::encodedWordMetadata);; @Test @@ -63,12 +62,16 @@ class ResultValuatorTest { when(dict.getTermFreq("bob")).thenReturn(10); ResultRankingContext context = new ResultRankingContext(100000, ResultRankingParameters.sensibleDefaults(), - Map.of("bob", 10), Collections.emptyMap()); + frequencyData, + frequencyData); - double titleOnlyLowCount = valuator.calculateSearchResultValue(titleOnlyLowCountSet, 10_000, context); - double titleLongOnlyLowCount = valuator.calculateSearchResultValue(titleOnlyLowCountSet, 10_000, context); - double highCountNoTitle = valuator.calculateSearchResultValue(highCountNoTitleSet, 10_000, context); - double highCountSubject = valuator.calculateSearchResultValue(highCountSubjectSet, 10_000, context); + long docMeta = docMetadata(0, 2010, 5, 
EnumSet.noneOf(DocumentFlags.class)); + int features = 0; + + double titleOnlyLowCount = valuator.calculateSearchResultValue(titleOnlyLowCountSet, docMeta, features, 10_000, context); + double titleLongOnlyLowCount = valuator.calculateSearchResultValue(titleOnlyLowCountSet, docMeta, features, 10_000, context); + double highCountNoTitle = valuator.calculateSearchResultValue(highCountNoTitleSet, docMeta, features, 10_000, context); + double highCountSubject = valuator.calculateSearchResultValue(highCountSubjectSet, docMeta, features, 10_000, context); System.out.println(titleOnlyLowCount); System.out.println(titleLongOnlyLowCount); diff --git a/code/index/test/nu/marginalia/ranking/results/factors/TermCoherenceFactorTest.java b/code/index/test/nu/marginalia/ranking/results/factors/TermCoherenceFactorTest.java index 028896d9..d0abe443 100644 --- a/code/index/test/nu/marginalia/ranking/results/factors/TermCoherenceFactorTest.java +++ b/code/index/test/nu/marginalia/ranking/results/factors/TermCoherenceFactorTest.java @@ -18,14 +18,23 @@ class TermCoherenceFactorTest { @Test public void testAllBitsSet() { var allPositionsSet = createSet( - WordMetadata.POSITIONS_MASK, WordMetadata.POSITIONS_MASK + ~0L, + ~0L ); - long mask = CompiledQueryAggregates.longBitmaskAggregate(allPositionsSet, score -> score.positions() & WordMetadata.POSITIONS_MASK); + long mask = CompiledQueryAggregates.longBitmaskAggregate( + allPositionsSet, + SearchResultKeywordScore::positions + ); assertEquals(1.0, termCoherenceFactor.bitsSetFactor(mask), 0.01); - assertEquals(1.0, termCoherenceFactor.calculate(allPositionsSet)); + assertEquals(1.0, + termCoherenceFactor.calculate( + allPositionsSet.mapToLong(SearchResultKeywordScore::encodedWordMetadata) + ) + ); + } @Test @@ -38,7 +47,7 @@ class TermCoherenceFactorTest { assertEquals(0, termCoherenceFactor.bitsSetFactor(mask), 0.01); - assertEquals(0, termCoherenceFactor.calculate(allPositionsSet)); + assertEquals(0, 
termCoherenceFactor.calculate(allPositionsSet.mapToLong(SearchResultKeywordScore::encodedWordMetadata))); } @Test @SuppressWarnings("unchecked") @@ -90,7 +99,7 @@ class TermCoherenceFactorTest { for (int i = 0; i < positionMasks.length; i++) { keywords.add(new SearchResultKeywordScore("", 0, - new WordMetadata(positionMasks[i], (byte) 0).encode(), 0, 0)); + new WordMetadata(positionMasks[i] & WordMetadata.POSITIONS_MASK, (byte) 0).encode())); } return CompiledQuery.just(keywords.toArray(SearchResultKeywordScore[]::new)); diff --git a/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/NgramLexiconTest.java b/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/NgramLexiconTest.java index f5068d07..df24ec10 100644 --- a/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/NgramLexiconTest.java +++ b/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/NgramLexiconTest.java @@ -15,7 +15,7 @@ class NgramLexiconTest { } void addNgram(String... ngram) { - lexicon.incOrdered(HasherGroup.ordered().rollingHash(ngram)); + lexicon.incOrderedTitle(HasherGroup.ordered().rollingHash(ngram)); } @Test diff --git a/code/services-application/search-service/java/nu/marginalia/search/model/ClusteredUrlDetails.java b/code/services-application/search-service/java/nu/marginalia/search/model/ClusteredUrlDetails.java index a67582bd..faba9eb7 100644 --- a/code/services-application/search-service/java/nu/marginalia/search/model/ClusteredUrlDetails.java +++ b/code/services-application/search-service/java/nu/marginalia/search/model/ClusteredUrlDetails.java @@ -38,7 +38,7 @@ public class ClusteredUrlDetails implements Comparable { for (var keywordScore : urlDetail.resultItem.keywordScores) { if (keywordScore.isKeywordSpecial()) continue; - if (keywordScore.positionCount() == 0) + if (keywordScore.positions() == 0) continue; if (keywordScore.hasTermFlag(WordFlags.Title))