From b6d365bacd7715b1744a7c74ff082a0d3f8e64ee Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 15 Apr 2024 16:04:07 +0200 Subject: [PATCH] (index) Clean up data model The change set cleans up the data model for the term-level data. This used to contain a bunch of fields with document-level metadata. This data-duplication means a larger memory footprint and worse memory locality. The ranking code is also modified to not accept SearchResultKeywordScores, but rather CompiledQueryLong and CqDataInts containing only the term metadata and the frequency information needed for ranking. This is again an effort to improve memory locality. --- .../nu/marginalia/model/idx/WordFlags.java | 5 + .../api/searchquery/QueryProtobufCodec.java | 6 +- .../model/compiled/CompiledQuery.java | 4 + .../model/compiled/CompiledQueryInt.java | 44 ++++++ .../model/compiled/CompiledQueryLong.java | 8 +- .../searchquery/model/compiled/CqData.java | 11 +- .../searchquery/model/compiled/CqDataInt.java | 31 +++++ .../aggregate/CompiledQueryAggregates.java | 17 ++- .../aggregate/CqBooleanAggregate.java | 6 + .../aggregate/CqDoubleSumOperator.java | 6 + .../aggregate/CqIntMaxMinOperator.java | 6 + .../aggregate/CqLongBitmaskOperator.java | 5 + .../aggregate/CqPositionsOperator.java | 6 + .../model/results/ResultRankingContext.java | 30 ++--- .../model/results/SearchResultItem.java | 11 +- .../results/SearchResultKeywordScore.java | 39 +----- .../api/src/main/protobuf/query-api.proto | 8 +- .../nu/marginalia/index/IndexGrpcService.java | 27 ++-- .../results/IndexResultValuationContext.java | 58 ++++---- .../results/IndexResultValuatorService.java | 38 ++++-- .../ranking/results/ResultValuator.java | 32 ++--- .../ranking/results/factors/Bm25Factor.java | 113 ---------------- .../results/factors/Bm25FullGraphVisitor.java | 81 +++++++++++ .../results/factors/Bm25PrioGraphVisitor.java | 127 ++++++++++++++++++ .../results/factors/TermCoherenceFactor.java | 8 +- 
...IndexQueryServiceIntegrationSmokeTest.java | 4 +- .../IndexResultDomainDeduplicatorTest.java | 2 +- .../ranking/results/ResultValuatorTest.java | 49 +++---- .../factors/TermCoherenceFactorTest.java | 19 ++- .../segmentation/NgramLexiconTest.java | 2 +- .../search/model/ClusteredUrlDetails.java | 2 +- 31 files changed, 520 insertions(+), 285 deletions(-) create mode 100644 code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryInt.java create mode 100644 code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqDataInt.java delete mode 100644 code/index/java/nu/marginalia/ranking/results/factors/Bm25Factor.java create mode 100644 code/index/java/nu/marginalia/ranking/results/factors/Bm25FullGraphVisitor.java create mode 100644 code/index/java/nu/marginalia/ranking/results/factors/Bm25PrioGraphVisitor.java diff --git a/code/common/model/java/nu/marginalia/model/idx/WordFlags.java b/code/common/model/java/nu/marginalia/model/idx/WordFlags.java index dc627715..db54df77 100644 --- a/code/common/model/java/nu/marginalia/model/idx/WordFlags.java +++ b/code/common/model/java/nu/marginalia/model/idx/WordFlags.java @@ -50,6 +50,10 @@ public enum WordFlags { return (asBit() & value) > 0; } + public boolean isAbsent(long value) { + return (asBit() & value) == 0; + } + public static EnumSet decode(long encodedValue) { EnumSet ret = EnumSet.noneOf(WordFlags.class); @@ -61,4 +65,5 @@ public enum WordFlags { return ret; } + } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java index b705917e..5a43df1b 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java @@ -134,6 +134,8 @@ public class QueryProtobufCodec { return new 
SearchResultItem( rawItem.getCombinedId(), + rawItem.getEncodedDocMetadata(), + rawItem.getHtmlFeatures(), keywordScores, rawItem.getResultsFromDomain(), Double.NaN // Not set @@ -144,9 +146,7 @@ public class QueryProtobufCodec { return new SearchResultKeywordScore( keywordScores.getKeyword(), -1, // termId is internal to index service - keywordScores.getEncodedWordMetadata(), - keywordScores.getEncodedDocMetadata(), - keywordScores.getHtmlFeatures() + keywordScores.getEncodedWordMetadata() ); } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQuery.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQuery.java index 3ae850a3..356a1d86 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQuery.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQuery.java @@ -46,6 +46,10 @@ public class CompiledQuery implements Iterable { return new CompiledQueryLong(root, data.mapToLong(mapper)); } + public CompiledQueryLong mapToInt(ToIntFunction mapper) { + return new CompiledQueryLong(root, data.mapToInt(mapper)); + } + public CqExpression root() { return root; } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryInt.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryInt.java new file mode 100644 index 00000000..9e26c35c --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryInt.java @@ -0,0 +1,44 @@ +package nu.marginalia.api.searchquery.model.compiled; + +import java.util.stream.IntStream; + + +/** A compiled index service query */ +public class CompiledQueryInt { + private final CqExpression root; + private final CqDataInt data; + + public CompiledQueryInt(CqExpression root, CqDataInt data) { + this.root = root; + 
this.data = data; + } + + + public CqExpression root() { + return root; + } + + public IntStream stream() { + return data.stream(); + } + + public IntStream indices() { + return IntStream.range(0, data.size()); + } + + public long at(int index) { + return data.get(index); + } + + public int[] copyData() { + return data.copyData(); + } + + public boolean isEmpty() { + return data.size() == 0; + } + + public int size() { + return data.size(); + } +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryLong.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryLong.java index 94fa0e8b..718aaca7 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryLong.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryLong.java @@ -9,8 +9,8 @@ import java.util.stream.LongStream; /** A compiled index service query */ public class CompiledQueryLong implements Iterable { - private final CqExpression root; - private final CqDataLong data; + public final CqExpression root; + public final CqDataLong data; public CompiledQueryLong(CqExpression root, CqDataLong data) { this.root = root; @@ -47,4 +47,8 @@ public class CompiledQueryLong implements Iterable { public boolean isEmpty() { return data.size() == 0; } + + public int size() { + return data.size(); + } } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqData.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqData.java index b1565dc0..145f3f0f 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqData.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqData.java @@ -3,7 +3,7 @@ package nu.marginalia.api.searchquery.model.compiled; import 
java.lang.reflect.Array; import java.util.Arrays; import java.util.function.Function; -import java.util.function.ToDoubleFunction; +import java.util.function.ToIntFunction; import java.util.function.ToLongFunction; import java.util.stream.Stream; @@ -33,6 +33,15 @@ public class CqData { return new CqDataLong(newData); } + public CqDataLong mapToInt(ToIntFunction mapper) { + long[] newData = new long[data.length]; + for (int i = 0; i < data.length; i++) { + newData[i] = mapper.applyAsInt((T) data[i]); + } + + return new CqDataLong(newData); + } + public T get(int i) { return data[i]; } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqDataInt.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqDataInt.java new file mode 100644 index 00000000..24991686 --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqDataInt.java @@ -0,0 +1,31 @@ +package nu.marginalia.api.searchquery.model.compiled; + +import java.util.Arrays; +import java.util.stream.IntStream; + +public class CqDataInt { + private final int[] data; + + public CqDataInt(int[] data) { + this.data = data; + } + + public int get(int i) { + return data[i]; + } + public int get(CqExpression.Word w) { + return data[w.idx()]; + } + + public IntStream stream() { + return Arrays.stream(data); + } + + public int size() { + return data.length; + } + + public int[] copyData() { + return Arrays.copyOf(data, data.length); + } +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java index 0ab0647d..7e8ca8ec 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java +++ 
b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java @@ -17,6 +17,9 @@ public class CompiledQueryAggregates { static public boolean booleanAggregate(CompiledQuery query, Predicate predicate) { return query.root.visit(new CqBooleanAggregate(query, predicate)); } + static public boolean booleanAggregate(CompiledQueryLong query, LongPredicate predicate) { + return query.root.visit(new CqBooleanAggregate(query, predicate)); + } /** Compiled query aggregate that for a 64b bitmask that treats or-branches as logical OR, @@ -25,13 +28,20 @@ public class CompiledQueryAggregates { public static long longBitmaskAggregate(CompiledQuery query, ToLongFunction operator) { return query.root.visit(new CqLongBitmaskOperator(query, operator)); } - + public static long longBitmaskAggregate(CompiledQueryLong query, LongUnaryOperator operator) { + return query.root.visit(new CqLongBitmaskOperator(query, operator)); + } /** Apply the operator to each leaf node, then return the highest minimum value found along any path */ public static int intMaxMinAggregate(CompiledQuery query, ToIntFunction operator) { return query.root.visit(new CqIntMaxMinOperator(query, operator)); } + /** Apply the operator to each leaf node, then return the highest minimum value found along any path */ + public static int intMaxMinAggregate(CompiledQueryLong query, LongToIntFunction operator) { + return query.root.visit(new CqIntMaxMinOperator(query, operator)); + } + /** Apply the operator to each leaf node, and then return the highest sum of values possible * through each branch in the compiled query. 
* @@ -49,4 +59,9 @@ public class CompiledQueryAggregates { public static LongSet positionsAggregate(CompiledQuery query, ToLongFunction operator) { return query.root().visit(new CqPositionsOperator(query, operator)); } + + /** Using the bitwise AND operator, aggregate all possible combined values of the long generated by the provided operator */ + public static LongSet positionsAggregate(CompiledQueryLong query, LongUnaryOperator operator) { + return query.root().visit(new CqPositionsOperator(query, operator)); + } } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqBooleanAggregate.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqBooleanAggregate.java index 05ebf4c7..2a87ec79 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqBooleanAggregate.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqBooleanAggregate.java @@ -1,10 +1,12 @@ package nu.marginalia.api.searchquery.model.compiled.aggregate; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.compiled.CqExpression; import java.util.List; import java.util.function.IntPredicate; +import java.util.function.LongPredicate; import java.util.function.Predicate; public class CqBooleanAggregate implements CqExpression.BoolVisitor { @@ -15,6 +17,10 @@ public class CqBooleanAggregate implements CqExpression.BoolVisitor { this.predicate = idx -> objPred.test(query.at(idx)); } + public CqBooleanAggregate(CompiledQueryLong query, LongPredicate longPredicate) { + this.predicate = idx -> longPredicate.test(query.at(idx)); + } + @Override public boolean onAnd(List parts) { for (var part : parts) { diff --git 
a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqDoubleSumOperator.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqDoubleSumOperator.java index 23d1904e..082de29e 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqDoubleSumOperator.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqDoubleSumOperator.java @@ -1,10 +1,12 @@ package nu.marginalia.api.searchquery.model.compiled.aggregate; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.compiled.CqExpression; import java.util.List; import java.util.function.IntToDoubleFunction; +import java.util.function.LongToDoubleFunction; import java.util.function.ToDoubleFunction; public class CqDoubleSumOperator implements CqExpression.DoubleVisitor { @@ -15,6 +17,10 @@ public class CqDoubleSumOperator implements CqExpression.DoubleVisitor { this.operator = idx -> operator.applyAsDouble(query.at(idx)); } + public CqDoubleSumOperator(IntToDoubleFunction operator) { + this.operator = operator; + } + @Override public double onAnd(List parts) { double value = 0; diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqIntMaxMinOperator.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqIntMaxMinOperator.java index b3ec86bb..621dff73 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqIntMaxMinOperator.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqIntMaxMinOperator.java @@ -1,10 +1,12 @@ package nu.marginalia.api.searchquery.model.compiled.aggregate; import 
nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.compiled.CqExpression; import java.util.List; import java.util.function.IntUnaryOperator; +import java.util.function.LongToIntFunction; import java.util.function.ToIntFunction; public class CqIntMaxMinOperator implements CqExpression.IntVisitor { @@ -16,6 +18,10 @@ public class CqIntMaxMinOperator implements CqExpression.IntVisitor { this.operator = idx -> operator.applyAsInt(query.at(idx)); } + public CqIntMaxMinOperator(CompiledQueryLong query, LongToIntFunction operator) { + this.operator = idx -> operator.applyAsInt(query.at(idx)); + } + @Override public int onAnd(List parts) { int value = parts.getFirst().visit(this); diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqLongBitmaskOperator.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqLongBitmaskOperator.java index d9a4804b..b64029c1 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqLongBitmaskOperator.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqLongBitmaskOperator.java @@ -1,10 +1,12 @@ package nu.marginalia.api.searchquery.model.compiled.aggregate; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.compiled.CqExpression; import java.util.List; import java.util.function.IntToLongFunction; +import java.util.function.LongUnaryOperator; import java.util.function.ToLongFunction; public class CqLongBitmaskOperator implements CqExpression.LongVisitor { @@ -14,6 +16,9 @@ public class CqLongBitmaskOperator implements CqExpression.LongVisitor { public CqLongBitmaskOperator(CompiledQuery query, 
ToLongFunction operator) { this.operator = idx-> operator.applyAsLong(query.at(idx)); } + public CqLongBitmaskOperator(CompiledQueryLong query, LongUnaryOperator operator) { + this.operator = idx-> operator.applyAsLong(query.at(idx)); + } @Override public long onAnd(List parts) { diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqPositionsOperator.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqPositionsOperator.java index 19db2d4b..715c4cb2 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqPositionsOperator.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqPositionsOperator.java @@ -4,10 +4,12 @@ import it.unimi.dsi.fastutil.longs.LongArraySet; import it.unimi.dsi.fastutil.longs.LongOpenHashSet; import it.unimi.dsi.fastutil.longs.LongSet; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.compiled.CqExpression; import java.util.List; import java.util.function.IntToLongFunction; +import java.util.function.LongUnaryOperator; import java.util.function.ToLongFunction; public class CqPositionsOperator implements CqExpression.ObjectVisitor { @@ -17,6 +19,10 @@ public class CqPositionsOperator implements CqExpression.ObjectVisitor this.operator = idx -> operator.applyAsLong(query.at(idx)); } + public CqPositionsOperator(CompiledQueryLong query, LongUnaryOperator operator) { + this.operator = idx -> operator.applyAsLong(query.at(idx)); + } + @Override public LongSet onAnd(List parts) { LongSet ret = new LongArraySet(); diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingContext.java 
b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingContext.java index f0ad172f..9052345a 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingContext.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingContext.java @@ -1,38 +1,34 @@ package nu.marginalia.api.searchquery.model.results; -import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; import lombok.ToString; - -import java.util.Map; +import nu.marginalia.api.searchquery.model.compiled.CqDataInt; @ToString public class ResultRankingContext { private final int docCount; public final ResultRankingParameters params; - private final Object2IntOpenHashMap fullCounts = new Object2IntOpenHashMap<>(10, 0.5f); - private final Object2IntOpenHashMap priorityCounts = new Object2IntOpenHashMap<>(10, 0.5f); + /** CqDataInt associated with frequency information of the terms in the query + * in the full index. The dataset is indexed by the compiled query. */ + public final CqDataInt fullCounts; + + /** CqDataInt associated with frequency information of the terms in the query + * in the priority index. The dataset is indexed by the compiled query. 
*/ + public final CqDataInt priorityCounts; public ResultRankingContext(int docCount, ResultRankingParameters params, - Map fullCounts, - Map prioCounts - ) { + CqDataInt fullCounts, + CqDataInt prioCounts) + { this.docCount = docCount; this.params = params; - this.fullCounts.putAll(fullCounts); - this.priorityCounts.putAll(prioCounts); + this.fullCounts = fullCounts; + this.priorityCounts = prioCounts; } public int termFreqDocCount() { return docCount; } - public int frequency(String keyword) { - return fullCounts.getOrDefault(keyword, 1); - } - - public int priorityFrequency(String keyword) { - return priorityCounts.getOrDefault(keyword, 1); - } } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java index 8f50c9fb..7cd95b96 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java @@ -15,15 +15,24 @@ public class SearchResultItem implements Comparable { * probably not what you want, use getDocumentId() instead */ public final long combinedId; + /** Encoded document metadata */ + public final long encodedDocMetadata; + + /** Encoded html features of document */ + + public final int htmlFeatures; + /** How did the subqueries match against the document ? 
*/ public final List keywordScores; /** How many other potential results existed in the same domain */ public int resultsFromDomain; - public SearchResultItem(long combinedId) { + public SearchResultItem(long combinedId, long encodedDocMetadata, int htmlFeatures) { this.combinedId = combinedId; + this.encodedDocMetadata = encodedDocMetadata; this.keywordScores = new ArrayList<>(); + this.htmlFeatures = htmlFeatures; } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultKeywordScore.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultKeywordScore.java index a0fd2156..212b2302 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultKeywordScore.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultKeywordScore.java @@ -2,7 +2,6 @@ package nu.marginalia.api.searchquery.model.results; import nu.marginalia.model.idx.WordFlags; import nu.marginalia.model.idx.WordMetadata; -import nu.marginalia.model.idx.DocumentMetadata; import java.util.Objects; @@ -10,34 +9,20 @@ public final class SearchResultKeywordScore { public final long termId; public final String keyword; private final long encodedWordMetadata; - private final long encodedDocMetadata; - - private final int htmlFeatures; public SearchResultKeywordScore(String keyword, long termId, - long encodedWordMetadata, - long encodedDocMetadata, - int htmlFeatures) { + long encodedWordMetadata) { this.termId = termId; this.keyword = keyword; this.encodedWordMetadata = encodedWordMetadata; - this.encodedDocMetadata = encodedDocMetadata; - this.htmlFeatures = htmlFeatures; } public boolean hasTermFlag(WordFlags flag) { return WordMetadata.hasFlags(encodedWordMetadata, flag.asBit()); } - public int positionCount() { - return Long.bitCount(positions()); - } - @Deprecated // FIXME 2024-04-06 - public int subquery() { - return -1; 
- } public long positions() { return WordMetadata.decodePositions(encodedWordMetadata); } @@ -46,44 +31,28 @@ public final class SearchResultKeywordScore { return keyword.contains(":") || hasTermFlag(WordFlags.Synthetic); } - public boolean isKeywordRegular() { - return !keyword.contains(":") - && !hasTermFlag(WordFlags.Synthetic); - } - public long encodedWordMetadata() { return encodedWordMetadata; } - public long encodedDocMetadata() { - return encodedDocMetadata; - } - - public int htmlFeatures() { - return htmlFeatures; - } - @Override public boolean equals(Object obj) { if (obj == this) return true; if (obj == null || obj.getClass() != this.getClass()) return false; var that = (SearchResultKeywordScore) obj; - return Objects.equals(this.keyword, that.keyword) && - this.encodedWordMetadata == that.encodedWordMetadata && - this.encodedDocMetadata == that.encodedDocMetadata; + return Objects.equals(this.termId, that.termId); } @Override public int hashCode() { - return Objects.hash(keyword, encodedWordMetadata, encodedDocMetadata); + return Objects.hash(termId); } @Override public String toString() { return "SearchResultKeywordScore[" + "keyword=" + keyword + ", " + - "encodedWordMetadata=" + new WordMetadata(encodedWordMetadata) + ", " + - "encodedDocMetadata=" + new DocumentMetadata(encodedDocMetadata) + ']'; + "encodedWordMetadata=" + new WordMetadata(encodedWordMetadata) + ']'; } } diff --git a/code/functions/search-query/api/src/main/protobuf/query-api.proto b/code/functions/search-query/api/src/main/protobuf/query-api.proto index df25c494..3094699b 100644 --- a/code/functions/search-query/api/src/main/protobuf/query-api.proto +++ b/code/functions/search-query/api/src/main/protobuf/query-api.proto @@ -98,16 +98,16 @@ message RpcDecoratedResultItem { message RpcRawResultItem { int64 combinedId = 1; // raw ID with bit-encoded ranking information still present int32 resultsFromDomain = 2; // number of other results from the same domain - repeated 
RpcResultKeywordScore keywordScores = 3; + int64 encodedDocMetadata = 3; // bit encoded document metadata + int32 htmlFeatures = 4; // bitmask encoding features of the document + repeated RpcResultKeywordScore keywordScores = 5; } /* Information about how well a keyword matches a query */ message RpcResultKeywordScore { string keyword = 1; // the keyword int64 encodedWordMetadata = 2; // bit encoded word metadata - int64 encodedDocMetadata = 3; // bit encoded document metadata - bool hasPriorityTerms = 4; // true if this word is important to the document - int32 htmlFeatures = 5; // bit encoded document features + bool hasPriorityTerms = 3; // true if this word is important to the document } /* Query execution parameters */ diff --git a/code/index/java/nu/marginalia/index/IndexGrpcService.java b/code/index/java/nu/marginalia/index/IndexGrpcService.java index 36b611ff..fa0a8343 100644 --- a/code/index/java/nu/marginalia/index/IndexGrpcService.java +++ b/code/index/java/nu/marginalia/index/IndexGrpcService.java @@ -11,6 +11,7 @@ import lombok.SneakyThrows; import nu.marginalia.api.searchquery.*; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; +import nu.marginalia.api.searchquery.model.compiled.CqDataInt; import nu.marginalia.api.searchquery.model.query.SearchSpecification; import nu.marginalia.api.searchquery.model.results.*; import nu.marginalia.array.buffer.LongQueryBuffer; @@ -135,14 +136,14 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { var rawItem = RpcRawResultItem.newBuilder(); rawItem.setCombinedId(rawResult.combinedId); rawItem.setResultsFromDomain(rawResult.resultsFromDomain); + rawItem.setHtmlFeatures(rawResult.htmlFeatures); + rawItem.setEncodedDocMetadata(rawResult.encodedDocMetadata); for (var score : rawResult.keywordScores) { rawItem.addKeywordScores( RpcResultKeywordScore.newBuilder() - 
.setEncodedDocMetadata(score.encodedDocMetadata()) .setEncodedWordMetadata(score.encodedWordMetadata()) .setKeyword(score.keyword) - .setHtmlFeatures(score.htmlFeatures()) ); } @@ -203,9 +204,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { return new SearchResultSet(List.of()); } - ResultRankingContext rankingContext = createRankingContext(params.rankingParams, - params.compiledQuery, - params.compiledQueryIds); + ResultRankingContext rankingContext = createRankingContext(params.rankingParams, params.compiledQueryIds); var queryExecution = new QueryExecution(rankingContext, params.fetchSize); @@ -414,22 +413,22 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { } private ResultRankingContext createRankingContext(ResultRankingParameters rankingParams, - CompiledQuery query, CompiledQueryLong compiledQueryIds) { - Map termToId = new HashMap<>(query.size()); - query.indices().forEach(id -> termToId.put(query.at(id), compiledQueryIds.at(id))); - final Map termFrequencies = new HashMap<>(termToId.size()); - final Map prioFrequencies = new HashMap<>(termToId.size()); + int[] full = new int[compiledQueryIds.size()]; + int[] prio = new int[compiledQueryIds.size()]; - termToId.forEach((key, id) -> termFrequencies.put(key, index.getTermFrequency(id))); - termToId.forEach((key, id) -> prioFrequencies.put(key, index.getTermFrequencyPrio(id))); + for (int idx = 0; idx < compiledQueryIds.size(); idx++) { + long id = compiledQueryIds.at(idx); + full[idx] = index.getTermFrequency(id); + prio[idx] = index.getTermFrequencyPrio(id); + } return new ResultRankingContext(index.getTotalDocCount(), rankingParams, - termFrequencies, - prioFrequencies); + new CqDataInt(full), + new CqDataInt(prio)); } } diff --git a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java b/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java index 3777cf4f..89b4c543 100644 --- 
a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java +++ b/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java @@ -1,7 +1,6 @@ package nu.marginalia.index.results; -import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; -import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; +import nu.marginalia.api.searchquery.model.compiled.*; import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.SearchResultItem; @@ -70,39 +69,42 @@ public class IndexResultValuationContext { long docMetadata = statefulIndex.getDocumentMetadata(docId); int htmlFeatures = statefulIndex.getHtmlFeatures(docId); - SearchResultItem searchResult = new SearchResultItem(docId); + SearchResultItem searchResult = new SearchResultItem(docId, docMetadata, htmlFeatures); + + long[] wordMetas = new long[compiledQuery.size()]; + SearchResultKeywordScore[] scores = new SearchResultKeywordScore[compiledQuery.size()]; + + for (int i = 0; i < wordMetas.length; i++) { + final long termId = compiledQueryIds.at(i); + final String term = compiledQuery.at(i); + + wordMetas[i] = termMetadataForCombinedDocumentIds.getTermMetadata(termId, combinedId); + scores[i] = new SearchResultKeywordScore(term, termId, wordMetas[i]); + } - SearchResultKeywordScore[] scores = compiledQuery.indices().mapToObj(idx -> - new SearchResultKeywordScore( - compiledQuery.at(idx), - compiledQueryIds.at(idx), - termMetadataForCombinedDocumentIds.getTermMetadata( - compiledQueryIds.at(idx), combinedId - ), - docMetadata, - htmlFeatures) - ) - .toArray(SearchResultKeywordScore[]::new); // DANGER: IndexResultValuatorService assumes that searchResult.keywordScores has this specific order, as it needs // to be able to re-construct its own CompiledQuery for re-ranking the results. This is // a very flimsy assumption. 
searchResult.keywordScores.addAll(List.of(scores)); - CompiledQuery queryGraphScores = new CompiledQuery<>(compiledQuery.root, scores); + CompiledQueryLong wordMetasQuery = new CompiledQueryLong(compiledQuery.root, new CqDataLong(wordMetas)); - boolean allSynthetic = !CompiledQueryAggregates.booleanAggregate(queryGraphScores, score -> !score.hasTermFlag(WordFlags.Synthetic)); - int flagsCount = CompiledQueryAggregates.intMaxMinAggregate(queryGraphScores, score -> Long.bitCount(score.encodedWordMetadata() & flagsFilterMask)); - int positionsCount = CompiledQueryAggregates.intMaxMinAggregate(queryGraphScores, SearchResultKeywordScore::positionCount); + boolean allSynthetic = !CompiledQueryAggregates.booleanAggregate(wordMetasQuery, WordFlags.Synthetic::isAbsent); + int flagsCount = CompiledQueryAggregates.intMaxMinAggregate(wordMetasQuery, wordMeta -> Long.bitCount(wordMeta & flagsFilterMask)); + int positionsCount = CompiledQueryAggregates.intMaxMinAggregate(wordMetasQuery, wordMeta -> Long.bitCount(WordMetadata.decodePositions(wordMeta))); - if (!meetsQueryStrategyRequirements(queryGraphScores, queryParams.queryStrategy())) { + if (!meetsQueryStrategyRequirements(wordMetasQuery, queryParams.queryStrategy())) { return null; } if (flagsCount == 0 && !allSynthetic && positionsCount == 0) return null; - double score = searchResultValuator.calculateSearchResultValue(queryGraphScores, + double score = searchResultValuator.calculateSearchResultValue( + wordMetasQuery, + docMetadata, + htmlFeatures, 5000, // use a dummy value here as it's not present in the index rankingContext); @@ -111,7 +113,7 @@ public class IndexResultValuationContext { return searchResult; } - private boolean meetsQueryStrategyRequirements(CompiledQuery queryGraphScores, + private boolean meetsQueryStrategyRequirements(CompiledQueryLong queryGraphScores, QueryStrategy queryStrategy) { if (queryStrategy == QueryStrategy.AUTO || @@ -124,24 +126,24 @@ public class IndexResultValuationContext { docs -> 
meetsQueryStrategyRequirements(docs, queryParams.queryStrategy())); } - private boolean meetsQueryStrategyRequirements(SearchResultKeywordScore termScore, QueryStrategy queryStrategy) { + private boolean meetsQueryStrategyRequirements(long wordMeta, QueryStrategy queryStrategy) { if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SITE) { - return WordMetadata.hasFlags(termScore.encodedWordMetadata(), WordFlags.Site.asBit()); + return WordFlags.Site.isPresent(wordMeta); } else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SUBJECT) { - return WordMetadata.hasFlags(termScore.encodedWordMetadata(), WordFlags.Subjects.asBit()); + return WordFlags.Subjects.isPresent(wordMeta); } else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_TITLE) { - return WordMetadata.hasFlags(termScore.encodedWordMetadata(), WordFlags.Title.asBit()); + return WordFlags.Title.isPresent(wordMeta); } else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_URL) { - return WordMetadata.hasFlags(termScore.encodedWordMetadata(), WordFlags.UrlPath.asBit()); + return WordFlags.UrlPath.isPresent(wordMeta); } else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_DOMAIN) { - return WordMetadata.hasFlags(termScore.encodedWordMetadata(), WordFlags.UrlDomain.asBit()); + return WordFlags.UrlDomain.isPresent(wordMeta); } else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_LINK) { - return WordMetadata.hasFlags(termScore.encodedWordMetadata(), WordFlags.ExternalLink.asBit()); + return WordFlags.ExternalLink.isPresent(wordMeta); } return true; } diff --git a/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java b/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java index a84e5f4f..2fa44c31 100644 --- a/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java +++ b/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java @@ -6,16 +6,19 @@ import gnu.trove.list.TLongList; import gnu.trove.list.array.TLongArrayList; import 
it.unimi.dsi.fastutil.longs.LongSet; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; +import nu.marginalia.api.searchquery.model.compiled.CqDataInt; +import nu.marginalia.api.searchquery.model.compiled.CqDataLong; import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.SearchResultItem; -import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; import nu.marginalia.index.index.StatefulIndex; import nu.marginalia.index.model.SearchParameters; import nu.marginalia.index.results.model.ids.CombinedDocIdList; import nu.marginalia.linkdb.docs.DocumentDbReader; import nu.marginalia.linkdb.model.DocdbUrlDetail; +import nu.marginalia.model.idx.WordMetadata; import nu.marginalia.ranking.results.ResultValuator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -126,22 +129,31 @@ public class IndexResultValuatorService { continue; } - // Reconstruct the SearchResultKeywordScore-compiledquery for re-valuation + // Reconstruct the compiledquery for re-valuation // // CAVEAT: This hinges on a very fragile that IndexResultValuationContext puts them in the same // order as the data for the CompiledQuery. 
- CompiledQuery resultQuery = - new CompiledQuery<>(compiledQuery.root, result.keywordScores.toArray(SearchResultKeywordScore[]::new)); + long[] wordMetas = new long[compiledQuery.size()]; + for (int i = 0; i < compiledQuery.size(); i++) { + var score = result.keywordScores.get(i); + wordMetas[i] = score.encodedWordMetadata(); + } - resultItems.add(createCombinedItem(result, docData, resultQuery, rankingContext)); + CompiledQueryLong metaQuery = new CompiledQueryLong(compiledQuery.root, new CqDataLong(wordMetas)); + + resultItems.add(createCombinedItem( + result, + docData, + metaQuery, + rankingContext)); } return resultItems; } private DecoratedSearchResultItem createCombinedItem(SearchResultItem result, DocdbUrlDetail docData, - CompiledQuery resultQuery, + CompiledQueryLong wordMetas, ResultRankingContext rankingContext) { return new DecoratedSearchResultItem( result, @@ -154,13 +166,19 @@ public class IndexResultValuatorService { docData.pubYear(), docData.dataHash(), docData.wordsTotal(), - bestPositions(resultQuery), - resultValuator.calculateSearchResultValue(resultQuery, docData.wordsTotal(), rankingContext) + bestPositions(wordMetas), + + resultValuator.calculateSearchResultValue(wordMetas, + result.encodedDocMetadata, + result.htmlFeatures, + docData.wordsTotal(), + rankingContext) ); } - private long bestPositions(CompiledQuery resultQuery) { - LongSet positionsSet = CompiledQueryAggregates.positionsAggregate(resultQuery, SearchResultKeywordScore::positions); + private long bestPositions(CompiledQueryLong wordMetas) { + LongSet positionsSet = CompiledQueryAggregates.positionsAggregate(wordMetas, WordMetadata::decodePositions); + int bestPc = 0; long bestPositions = 0; diff --git a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java index 862978c9..4d257349 100644 --- a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java +++ 
b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java @@ -1,9 +1,8 @@ package nu.marginalia.ranking.results; -import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; -import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.idx.DocumentFlags; @@ -15,36 +14,32 @@ import com.google.inject.Singleton; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.ArrayList; -import java.util.List; - @Singleton public class ResultValuator { final static double scalingFactor = 500.; - private final Bm25Factor bm25Factor; private final TermCoherenceFactor termCoherenceFactor; private static final Logger logger = LoggerFactory.getLogger(ResultValuator.class); @Inject - public ResultValuator(Bm25Factor bm25Factor, - TermCoherenceFactor termCoherenceFactor) { - this.bm25Factor = bm25Factor; + public ResultValuator(TermCoherenceFactor termCoherenceFactor) { this.termCoherenceFactor = termCoherenceFactor; } - public double calculateSearchResultValue(CompiledQuery scores, + public double calculateSearchResultValue(CompiledQueryLong wordMeta, + long documentMetadata, + int features, int length, ResultRankingContext ctx) { - if (scores.size() == 0) + if (wordMeta.isEmpty()) return Double.MAX_VALUE; - if (length < 0) - length = 5000; - long documentMetadata = scores.at(0).encodedDocMetadata(); - int features = scores.at(0).htmlFeatures(); + if (length < 0) { + length = 5000; + } + var rankingParams = ctx.params; int rank = DocumentMetadata.decodeRank(documentMetadata); @@ -79,9 +74,10 @@ public class ResultValuator { + temporalBias + flagsPenalty; - double bestTcf = rankingParams.tcfWeight * 
termCoherenceFactor.calculate(scores); - double bestBM25F = rankingParams.bm25FullWeight * bm25Factor.calculateBm25(rankingParams.prioParams, scores, length, ctx); - double bestBM25P = rankingParams.bm25PrioWeight * bm25Factor.calculateBm25Prio(rankingParams.prioParams, scores, ctx); + double bestTcf = rankingParams.tcfWeight * termCoherenceFactor.calculate(wordMeta); + + double bestBM25F = rankingParams.bm25FullWeight * wordMeta.root.visit(new Bm25FullGraphVisitor(rankingParams.fullParams, wordMeta.data, length, ctx)); + double bestBM25P = rankingParams.bm25PrioWeight * wordMeta.root.visit(new Bm25PrioGraphVisitor(rankingParams.prioParams, wordMeta.data, ctx)); double overallPartPositive = Math.max(0, overallPart); double overallPartNegative = -Math.min(0, overallPart); diff --git a/code/index/java/nu/marginalia/ranking/results/factors/Bm25Factor.java b/code/index/java/nu/marginalia/ranking/results/factors/Bm25Factor.java deleted file mode 100644 index bc13671e..00000000 --- a/code/index/java/nu/marginalia/ranking/results/factors/Bm25Factor.java +++ /dev/null @@ -1,113 +0,0 @@ -package nu.marginalia.ranking.results.factors; - -import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; -import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; -import nu.marginalia.api.searchquery.model.results.Bm25Parameters; -import nu.marginalia.api.searchquery.model.results.ResultRankingContext; -import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; -import nu.marginalia.model.idx.WordFlags; - -public class Bm25Factor { - private static final int AVG_LENGTH = 5000; - - /** This is an estimation of BM-25. 
- * - * @see Bm25Parameters - */ - public double calculateBm25(Bm25Parameters bm25Parameters, CompiledQuery scores, int length, ResultRankingContext ctx) { - final int docCount = ctx.termFreqDocCount(); - - return CompiledQueryAggregates.doubleSumAggregate(scores, keyword -> { - double count = keyword.positionCount(); - - int freq = ctx.frequency(keyword.keyword); - - return invFreq(docCount, freq) * f(bm25Parameters.k(), bm25Parameters.b(), count, length); - }); - } - - /** Bm25 calculation, except instead of counting positions in the document, - * the number of relevance signals for the term is counted instead. - */ - public double calculateBm25Prio(Bm25Parameters bm25Parameters, CompiledQuery scores, ResultRankingContext ctx) { - final int docCount = ctx.termFreqDocCount(); - - return CompiledQueryAggregates.doubleSumAggregate(scores, keyword -> { - double count = evaluatePriorityScore(keyword); - - int freq = ctx.priorityFrequency(keyword.keyword); - - // note we override b to zero for priority terms as they are independent of document length - return invFreq(docCount, freq) * f(bm25Parameters.k(), 0, count, 0); - }); - - } - - private static double evaluatePriorityScore(SearchResultKeywordScore keyword) { - int pcount = keyword.positionCount(); - - double qcount = 0.; - - if ((keyword.encodedWordMetadata() & WordFlags.ExternalLink.asBit()) != 0) { - - qcount += 2.5; - - if ((keyword.encodedWordMetadata() & WordFlags.UrlDomain.asBit()) != 0) - qcount += 2.5; - else if ((keyword.encodedWordMetadata() & WordFlags.UrlPath.asBit()) != 0) - qcount += 1.5; - - if ((keyword.encodedWordMetadata() & WordFlags.Site.asBit()) != 0) - qcount += 1.25; - if ((keyword.encodedWordMetadata() & WordFlags.SiteAdjacent.asBit()) != 0) - qcount += 1.25; - } - else { - if ((keyword.encodedWordMetadata() & WordFlags.UrlDomain.asBit()) != 0) - qcount += 3; - else if ((keyword.encodedWordMetadata() & WordFlags.UrlPath.asBit()) != 0) - qcount += 1; - - if ((keyword.encodedWordMetadata() & 
WordFlags.Site.asBit()) != 0) - qcount += 0.5; - if ((keyword.encodedWordMetadata() & WordFlags.SiteAdjacent.asBit()) != 0) - qcount += 0.5; - } - - if ((keyword.encodedWordMetadata() & WordFlags.Title.asBit()) != 0) - qcount += 1.5; - - if (pcount > 2) { - if ((keyword.encodedWordMetadata() & WordFlags.Subjects.asBit()) != 0) - qcount += 1.25; - if ((keyword.encodedWordMetadata() & WordFlags.NamesWords.asBit()) != 0) - qcount += 0.25; - if ((keyword.encodedWordMetadata() & WordFlags.TfIdfHigh.asBit()) != 0) - qcount += 0.5; - } - - return qcount; - } - - /** - * - * @param docCount Number of documents - * @param freq Number of matching documents - */ - private double invFreq(int docCount, int freq) { - return Math.log(1.0 + (docCount - freq + 0.5) / (freq + 0.5)); - } - - /** - * - * @param k determines the size of the impact of a single term - * @param b determines the magnitude of the length normalization - * @param count number of occurrences in the document - * @param length document length - */ - private double f(double k, double b, double count, int length) { - final double lengthRatio = (double) length / AVG_LENGTH; - - return (count * (k + 1)) / (count + k * (1 - b + b * lengthRatio)); - } -} diff --git a/code/index/java/nu/marginalia/ranking/results/factors/Bm25FullGraphVisitor.java b/code/index/java/nu/marginalia/ranking/results/factors/Bm25FullGraphVisitor.java new file mode 100644 index 00000000..9c46261d --- /dev/null +++ b/code/index/java/nu/marginalia/ranking/results/factors/Bm25FullGraphVisitor.java @@ -0,0 +1,81 @@ +package nu.marginalia.ranking.results.factors; + +import nu.marginalia.api.searchquery.model.compiled.CqDataInt; +import nu.marginalia.api.searchquery.model.compiled.CqDataLong; +import nu.marginalia.api.searchquery.model.compiled.CqExpression; +import nu.marginalia.api.searchquery.model.results.Bm25Parameters; +import nu.marginalia.api.searchquery.model.results.ResultRankingContext; +import nu.marginalia.model.idx.WordMetadata; + 
+import java.util.List; + +public class Bm25FullGraphVisitor implements CqExpression.DoubleVisitor { + private static final long AVG_LENGTH = 5000; + + private final CqDataLong wordMetaData; + private final CqDataInt frequencies; + private final Bm25Parameters bm25Parameters; + + private final int docCount; + private final int length; + + public Bm25FullGraphVisitor(Bm25Parameters bm25Parameters, + CqDataLong wordMetaData, + int length, + ResultRankingContext ctx) { + this.length = length; + this.bm25Parameters = bm25Parameters; + this.docCount = ctx.termFreqDocCount(); + this.wordMetaData = wordMetaData; + this.frequencies = ctx.fullCounts; + } + + @Override + public double onAnd(List parts) { + double value = 0; + for (var part : parts) { + value += part.visit(this); + } + return value; + } + + @Override + public double onOr(List parts) { + double value = 0; + for (var part : parts) { + value = Math.max(value, part.visit(this)); + } + return value; + } + + @Override + public double onLeaf(int idx) { + double count = Long.bitCount(WordMetadata.decodePositions(wordMetaData.get(idx))); + + int freq = frequencies.get(idx); + + return invFreq(docCount, freq) * f(bm25Parameters.k(), bm25Parameters.b(), count, length); + } + + /** + * + * @param docCount Number of documents + * @param freq Number of matching documents + */ + private double invFreq(int docCount, int freq) { + return Math.log(1.0 + (docCount - freq + 0.5) / (freq + 0.5)); + } + + /** + * + * @param k determines the size of the impact of a single term + * @param b determines the magnitude of the length normalization + * @param count number of occurrences in the document + * @param length document length + */ + private double f(double k, double b, double count, int length) { + final double lengthRatio = (double) length / AVG_LENGTH; + + return (count * (k + 1)) / (count + k * (1 - b + b * lengthRatio)); + } +} diff --git a/code/index/java/nu/marginalia/ranking/results/factors/Bm25PrioGraphVisitor.java 
b/code/index/java/nu/marginalia/ranking/results/factors/Bm25PrioGraphVisitor.java new file mode 100644 index 00000000..1fb26f6b --- /dev/null +++ b/code/index/java/nu/marginalia/ranking/results/factors/Bm25PrioGraphVisitor.java @@ -0,0 +1,127 @@ +package nu.marginalia.ranking.results.factors; + +import nu.marginalia.api.searchquery.model.compiled.CqDataInt; +import nu.marginalia.api.searchquery.model.compiled.CqDataLong; +import nu.marginalia.api.searchquery.model.compiled.CqExpression; +import nu.marginalia.api.searchquery.model.results.Bm25Parameters; +import nu.marginalia.api.searchquery.model.results.ResultRankingContext; +import nu.marginalia.model.idx.WordFlags; +import nu.marginalia.model.idx.WordMetadata; + +import java.util.List; + +public class Bm25PrioGraphVisitor implements CqExpression.DoubleVisitor { + private static final long AVG_LENGTH = 5000; + + private final CqDataLong wordMetaData; + private final CqDataInt frequencies; + private final Bm25Parameters bm25Parameters; + + private final int docCount; + + public Bm25PrioGraphVisitor(Bm25Parameters bm25Parameters, + CqDataLong wordMetaData, + ResultRankingContext ctx) { + this.bm25Parameters = bm25Parameters; + this.docCount = ctx.termFreqDocCount(); + this.wordMetaData = wordMetaData; + this.frequencies = ctx.fullCounts; + } + + @Override + public double onAnd(List parts) { + double value = 0; + for (var part : parts) { + value += part.visit(this); + } + return value; + } + + @Override + public double onOr(List parts) { + double value = 0; + for (var part : parts) { + value = Math.max(value, part.visit(this)); + } + return value; + } + + @Override + public double onLeaf(int idx) { + double count = evaluatePriorityScore(wordMetaData.get(idx)); + + int freq = frequencies.get(idx); + + // note we override b to zero for priority terms as they are independent of document length + return invFreq(docCount, freq) * f(bm25Parameters.k(), 0, count, 0); + } + + private static double evaluatePriorityScore(long 
wordMeta) { + int pcount = Long.bitCount(WordMetadata.decodePositions(wordMeta)); + + double qcount = 0.; + + if ((wordMeta & WordFlags.ExternalLink.asBit()) != 0) { + + qcount += 2.5; + + if ((wordMeta & WordFlags.UrlDomain.asBit()) != 0) + qcount += 2.5; + else if ((wordMeta & WordFlags.UrlPath.asBit()) != 0) + qcount += 1.5; + + if ((wordMeta & WordFlags.Site.asBit()) != 0) + qcount += 1.25; + if ((wordMeta & WordFlags.SiteAdjacent.asBit()) != 0) + qcount += 1.25; + } + else { + if ((wordMeta & WordFlags.UrlDomain.asBit()) != 0) + qcount += 3; + else if ((wordMeta & WordFlags.UrlPath.asBit()) != 0) + qcount += 1; + + if ((wordMeta & WordFlags.Site.asBit()) != 0) + qcount += 0.5; + if ((wordMeta & WordFlags.SiteAdjacent.asBit()) != 0) + qcount += 0.5; + } + + if ((wordMeta & WordFlags.Title.asBit()) != 0) + qcount += 1.5; + + if (pcount > 2) { + if ((wordMeta & WordFlags.Subjects.asBit()) != 0) + qcount += 1.25; + if ((wordMeta & WordFlags.NamesWords.asBit()) != 0) + qcount += 0.25; + if ((wordMeta & WordFlags.TfIdfHigh.asBit()) != 0) + qcount += 0.5; + } + + return qcount; + } + + + /** + * + * @param docCount Number of documents + * @param freq Number of matching documents + */ + private double invFreq(int docCount, int freq) { + return Math.log(1.0 + (docCount - freq + 0.5) / (freq + 0.5)); + } + + /** + * + * @param k determines the size of the impact of a single term + * @param b determines the magnitude of the length normalization + * @param count number of occurrences in the document + * @param length document length + */ + private double f(double k, double b, double count, int length) { + final double lengthRatio = (double) length / AVG_LENGTH; + + return (count * (k + 1)) / (count + k * (1 - b + b * lengthRatio)); + } +} diff --git a/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java b/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java index 71159c58..e617549d 100644 --- 
a/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java +++ b/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java @@ -1,16 +1,16 @@ package nu.marginalia.ranking.results.factors; -import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; -import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; import nu.marginalia.model.idx.WordMetadata; /** Rewards documents where terms appear frequently within the same sentences */ public class TermCoherenceFactor { - public double calculate(CompiledQuery scores) { - long mask = CompiledQueryAggregates.longBitmaskAggregate(scores, score -> score.positions() & WordMetadata.POSITIONS_MASK); + public double calculate(CompiledQueryLong wordMetadataQuery) { + long mask = CompiledQueryAggregates.longBitmaskAggregate(wordMetadataQuery, + score -> score >>> WordMetadata.POSITIONS_SHIFT); return bitsSetFactor(mask); } diff --git a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java index 301b5e19..7b0a6a24 100644 --- a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java +++ b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java @@ -215,9 +215,7 @@ public class IndexQueryServiceIntegrationSmokeTest { Set years = new HashSet<>(); for (var res : rsp.results) { - for (var score : res.rawIndexResult.getKeywordScores()) { - years.add(DocumentMetadata.decodeYear(score.encodedDocMetadata())); - } + years.add(DocumentMetadata.decodeYear(res.rawIndexResult.encodedDocMetadata)); } assertEquals(Set.of(1998), years); diff --git a/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java 
b/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java index 948c5857..c605a0a8 100644 --- a/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java +++ b/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java @@ -25,7 +25,7 @@ class IndexResultDomainDeduplicatorTest { } SearchResultItem forId(int domain, int ordinal) { - return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), List.of(), 4, Double.NaN); + return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), 0, 0, List.of(), 4, Double.NaN); } } \ No newline at end of file diff --git a/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java b/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java index 243ae90d..a1b66b04 100644 --- a/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java +++ b/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java @@ -1,6 +1,8 @@ package nu.marginalia.ranking.results; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; +import nu.marginalia.api.searchquery.model.compiled.CqDataInt; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; @@ -31,30 +33,27 @@ class ResultValuatorTest { when(dict.docCount()).thenReturn(100_000); valuator = new ResultValuator( - new Bm25Factor(), new TermCoherenceFactor() ); } - CompiledQuery titleOnlyLowCountSet = CompiledQuery.just( - new SearchResultKeywordScore("bob", 1, - wordMetadata(Set.of(1), EnumSet.of(WordFlags.Title)), - docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)), - 0) - ); - CompiledQuery highCountNoTitleSet = CompiledQuery.just( - new SearchResultKeywordScore("bob", 1, - 
wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh)), - docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)), - 0) - ); - CompiledQuery highCountSubjectSet = CompiledQuery.just( + CqDataInt frequencyData = new CqDataInt(new int[] { 10 }); + + CompiledQueryLong titleOnlyLowCountSet = CompiledQuery.just( new SearchResultKeywordScore("bob", 1, - wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh, WordFlags.Subjects)), - docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)), - 0) - ); + wordMetadata(Set.of(1), EnumSet.of(WordFlags.Title))) + ).mapToLong(SearchResultKeywordScore::encodedWordMetadata); + + CompiledQueryLong highCountNoTitleSet = CompiledQuery.just( + new SearchResultKeywordScore("bob", 1, + wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh))) + ).mapToLong(SearchResultKeywordScore::encodedWordMetadata);; + + CompiledQueryLong highCountSubjectSet = CompiledQuery.just( + new SearchResultKeywordScore("bob", 1, + wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh, WordFlags.Subjects))) + ).mapToLong(SearchResultKeywordScore::encodedWordMetadata);; @Test @@ -63,12 +62,16 @@ class ResultValuatorTest { when(dict.getTermFreq("bob")).thenReturn(10); ResultRankingContext context = new ResultRankingContext(100000, ResultRankingParameters.sensibleDefaults(), - Map.of("bob", 10), Collections.emptyMap()); + frequencyData, + frequencyData); - double titleOnlyLowCount = valuator.calculateSearchResultValue(titleOnlyLowCountSet, 10_000, context); - double titleLongOnlyLowCount = valuator.calculateSearchResultValue(titleOnlyLowCountSet, 10_000, context); - double highCountNoTitle = valuator.calculateSearchResultValue(highCountNoTitleSet, 10_000, context); - double highCountSubject = valuator.calculateSearchResultValue(highCountSubjectSet, 10_000, context); + long docMeta = docMetadata(0, 2010, 5, 
EnumSet.noneOf(DocumentFlags.class)); + int features = 0; + + double titleOnlyLowCount = valuator.calculateSearchResultValue(titleOnlyLowCountSet, docMeta, features, 10_000, context); + double titleLongOnlyLowCount = valuator.calculateSearchResultValue(titleOnlyLowCountSet, docMeta, features, 10_000, context); + double highCountNoTitle = valuator.calculateSearchResultValue(highCountNoTitleSet, docMeta, features, 10_000, context); + double highCountSubject = valuator.calculateSearchResultValue(highCountSubjectSet, docMeta, features, 10_000, context); System.out.println(titleOnlyLowCount); System.out.println(titleLongOnlyLowCount); diff --git a/code/index/test/nu/marginalia/ranking/results/factors/TermCoherenceFactorTest.java b/code/index/test/nu/marginalia/ranking/results/factors/TermCoherenceFactorTest.java index 028896d9..d0abe443 100644 --- a/code/index/test/nu/marginalia/ranking/results/factors/TermCoherenceFactorTest.java +++ b/code/index/test/nu/marginalia/ranking/results/factors/TermCoherenceFactorTest.java @@ -18,14 +18,23 @@ class TermCoherenceFactorTest { @Test public void testAllBitsSet() { var allPositionsSet = createSet( - WordMetadata.POSITIONS_MASK, WordMetadata.POSITIONS_MASK + ~0L, + ~0L ); - long mask = CompiledQueryAggregates.longBitmaskAggregate(allPositionsSet, score -> score.positions() & WordMetadata.POSITIONS_MASK); + long mask = CompiledQueryAggregates.longBitmaskAggregate( + allPositionsSet, + SearchResultKeywordScore::positions + ); assertEquals(1.0, termCoherenceFactor.bitsSetFactor(mask), 0.01); - assertEquals(1.0, termCoherenceFactor.calculate(allPositionsSet)); + assertEquals(1.0, + termCoherenceFactor.calculate( + allPositionsSet.mapToLong(SearchResultKeywordScore::encodedWordMetadata) + ) + ); + } @Test @@ -38,7 +47,7 @@ class TermCoherenceFactorTest { assertEquals(0, termCoherenceFactor.bitsSetFactor(mask), 0.01); - assertEquals(0, termCoherenceFactor.calculate(allPositionsSet)); + assertEquals(0, 
termCoherenceFactor.calculate(allPositionsSet.mapToLong(SearchResultKeywordScore::encodedWordMetadata))); } @Test @SuppressWarnings("unchecked") @@ -90,7 +99,7 @@ class TermCoherenceFactorTest { for (int i = 0; i < positionMasks.length; i++) { keywords.add(new SearchResultKeywordScore("", 0, - new WordMetadata(positionMasks[i], (byte) 0).encode(), 0, 0)); + new WordMetadata(positionMasks[i] & WordMetadata.POSITIONS_MASK, (byte) 0).encode())); } return CompiledQuery.just(keywords.toArray(SearchResultKeywordScore[]::new)); diff --git a/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/NgramLexiconTest.java b/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/NgramLexiconTest.java index f5068d07..df24ec10 100644 --- a/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/NgramLexiconTest.java +++ b/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/NgramLexiconTest.java @@ -15,7 +15,7 @@ class NgramLexiconTest { } void addNgram(String... ngram) { - lexicon.incOrdered(HasherGroup.ordered().rollingHash(ngram)); + lexicon.incOrderedTitle(HasherGroup.ordered().rollingHash(ngram)); } @Test diff --git a/code/services-application/search-service/java/nu/marginalia/search/model/ClusteredUrlDetails.java b/code/services-application/search-service/java/nu/marginalia/search/model/ClusteredUrlDetails.java index a67582bd..faba9eb7 100644 --- a/code/services-application/search-service/java/nu/marginalia/search/model/ClusteredUrlDetails.java +++ b/code/services-application/search-service/java/nu/marginalia/search/model/ClusteredUrlDetails.java @@ -38,7 +38,7 @@ public class ClusteredUrlDetails implements Comparable { for (var keywordScore : urlDetail.resultItem.keywordScores) { if (keywordScore.isKeywordSpecial()) continue; - if (keywordScore.positionCount() == 0) + if (keywordScore.positions() == 0) continue; if (keywordScore.hasTermFlag(WordFlags.Title))