From ad3857938daa58b2b33acaafb0bf4191b2ad2213 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 15 Jul 2024 04:49:28 +0200 Subject: [PATCH] (search-api, ranking) Update with new ranking parameters Adding new ranking parameters to the API and routing them through the system, in order to permit integration of the new position data with the ranking algorithm. The change also cleans out several parameters that no longer filled any function. --- .../api/searchquery/IndexProtobufCodec.java | 33 ++++++---------- .../api/searchquery/QueryProtobufCodec.java | 9 ++--- .../results/ResultRankingParameters.java | 23 ++++------- .../results/debug/ResultRankingOutputs.java | 9 ++--- .../api/src/main/protobuf/query-api.proto | 29 +++++++------- .../ranking/results/ResultValuator.java | 18 ++++----- .../marginalia/query/QueryBasicInterface.java | 18 +++------ .../resources/templates/qdebug.hdb | 38 +++++++------------ 8 files changed, 70 insertions(+), 107 deletions(-) diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java index 3a57cfe6..2b5cbaa0 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java @@ -98,19 +98,16 @@ public class IndexProtobufCodec { return ResultRankingParameters.sensibleDefaults(); return new ResultRankingParameters( - new Bm25Parameters(params.getFullK(), params.getFullB()), - new Bm25Parameters(params.getPrioK(), params.getPrioB()), + new Bm25Parameters(params.getBm25K(), params.getBm25B()), params.getShortDocumentThreshold(), params.getShortDocumentPenalty(), params.getDomainRankBonus(), params.getQualityPenalty(), params.getShortSentenceThreshold(), params.getShortSentencePenalty(), - params.getBm25FullWeight(), - params.getBm25NgramWeight(), - 
params.getBm25PrioWeight(), - params.getTcfJaccardWeight(), - params.getTcfOverlapWeight(), + params.getBm25Weight(), + params.getTcfFirstPositionWeight(), + params.getTcfAvgDistWeight(), ResultRankingParameters.TemporalBias.valueOf(params.getTemporalBias().getBias().name()), params.getTemporalBiasWeight(), params.getExportDebugData() @@ -125,21 +122,17 @@ public class IndexProtobufCodec { } var builder = RpcResultRankingParameters.newBuilder() - .setFullB(rankingParams.fullParams.b()) - .setFullK(rankingParams.fullParams.k()) - .setPrioB(rankingParams.prioParams.b()) - .setPrioK(rankingParams.prioParams.k()) + .setBm25B(rankingParams.bm25Params.b()) + .setBm25K(rankingParams.bm25Params.k()) .setShortDocumentThreshold(rankingParams.shortDocumentThreshold) .setShortDocumentPenalty(rankingParams.shortDocumentPenalty) .setDomainRankBonus(rankingParams.domainRankBonus) .setQualityPenalty(rankingParams.qualityPenalty) .setShortSentenceThreshold(rankingParams.shortSentenceThreshold) .setShortSentencePenalty(rankingParams.shortSentencePenalty) - .setBm25FullWeight(rankingParams.bm25FullWeight) - .setBm25NgramWeight(rankingParams.bm25NgramWeight) - .setBm25PrioWeight(rankingParams.bm25PrioWeight) - .setTcfOverlapWeight(rankingParams.tcfOverlapWeight) - .setTcfJaccardWeight(rankingParams.tcfJaccardWeight) + .setBm25Weight(rankingParams.bm25Weight) + .setTcfAvgDistWeight(rankingParams.tcfAvgDist) + .setTcfFirstPositionWeight(rankingParams.tcfFirstPosition) .setTemporalBiasWeight(rankingParams.temporalBiasWeight) .setExportDebugData(rankingParams.exportDebugData); @@ -176,11 +169,9 @@ public class IndexProtobufCodec { .setTemporalBias(outputs.temporalBias()) .setFlagsPenalty(outputs.flagsPenalty()) .setOverallPart(outputs.overallPart()) - .setTcfOverlap(outputs.tcfOverlap()) - .setTcfJaccard(outputs.tcfJaccard()) - .setBM25F(outputs.bM25F()) - .setBM25N(outputs.bM25N()) - .setBM25P(outputs.bM25P()) + .setTcfAvgDist(outputs.tcfAvgDist()) + 
.setTcfFirstPosition(outputs.tcfFirstPosition()) + .setBm25Part(outputs.bm25()) .build(); } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java index 58a20a8a..5d79cfea 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java @@ -157,11 +157,10 @@ public class QueryProtobufCodec { outputs.getTemporalBias(), outputs.getFlagsPenalty(), outputs.getOverallPart(), - outputs.getTcfOverlap(), - outputs.getTcfJaccard(), - outputs.getBM25F(), - outputs.getBM25N(), - outputs.getBM25P() + outputs.getBm25Part(), + outputs.getTcfAvgDist(), + outputs.getTcfFirstPosition() + ); } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingParameters.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingParameters.java index 04f699aa..7a5b7937 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingParameters.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingParameters.java @@ -10,9 +10,7 @@ import lombok.*; public class ResultRankingParameters { /** Tuning for BM25 when applied to full document matches */ - public final Bm25Parameters fullParams; - /** Tuning for BM25 when applied to priority matches, terms with relevance signal indicators */ - public final Bm25Parameters prioParams; + public final Bm25Parameters bm25Params; /** Documents below this length are penalized */ public int shortDocumentThreshold; @@ -32,11 +30,9 @@ public class ResultRankingParameters { /** Magnitude of penalty for documents with low average sentence length */ public double shortSentencePenalty; - public double 
bm25FullWeight; - public double bm25NgramWeight; - public double bm25PrioWeight; - public double tcfJaccardWeight; - public double tcfOverlapWeight; + public double bm25Weight; + public double tcfFirstPosition; + public double tcfAvgDist; public TemporalBias temporalBias; public double temporalBiasWeight; @@ -45,19 +41,16 @@ public class ResultRankingParameters { public static ResultRankingParameters sensibleDefaults() { return builder() - .fullParams(new Bm25Parameters(1.2, 0.5)) - .prioParams(new Bm25Parameters(1.5, 0)) + .bm25Params(new Bm25Parameters(1.2, 0.5)) .shortDocumentThreshold(2000) .shortDocumentPenalty(2.) .domainRankBonus(1/25.) .qualityPenalty(1/15.) .shortSentenceThreshold(2) .shortSentencePenalty(5) - .bm25FullWeight(1.) - .bm25NgramWeight(.25) - .bm25PrioWeight(1.) - .tcfOverlapWeight(3.) - .tcfJaccardWeight(1) + .bm25Weight(1.) + .tcfAvgDist(25.) + .tcfFirstPosition(1) // FIXME: what's a good default? .temporalBias(TemporalBias.NONE) .temporalBiasWeight(1. / (5.)) .exportDebugData(false) diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/ResultRankingOutputs.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/ResultRankingOutputs.java index bd4b943d..e9c490e8 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/ResultRankingOutputs.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/ResultRankingOutputs.java @@ -1,5 +1,6 @@ package nu.marginalia.api.searchquery.model.results.debug; + public record ResultRankingOutputs(double averageSentenceLengthPenalty, double qualityPenalty, double rankingBonus, @@ -8,10 +9,8 @@ public record ResultRankingOutputs(double averageSentenceLengthPenalty, double temporalBias, double flagsPenalty, double overallPart, - double tcfOverlap, - double tcfJaccard, - double bM25F, - double bM25N, - double bM25P) + double bm25, + double 
tcfAvgDist, + double tcfFirstPosition) { } diff --git a/code/functions/search-query/api/src/main/protobuf/query-api.proto b/code/functions/search-query/api/src/main/protobuf/query-api.proto index 589c5143..a29b7010 100644 --- a/code/functions/search-query/api/src/main/protobuf/query-api.proto +++ b/code/functions/search-query/api/src/main/protobuf/query-api.proto @@ -119,25 +119,26 @@ message RpcQueryLimits { int32 fetchSize = 4; // Size of the fetch buffer in the index service } +/** Parameters for the result ranking function */ message RpcResultRankingParameters { - double fullK = 1; // BM25 parameter - double fullB = 2; // BM25 parameter - double prioK = 3; // BM25 parameter - double prioB = 4; // BM25 parameter + double bm25K = 1; // BM25 parameter + double bm25B = 2; // BM25 parameter + int32 shortDocumentThreshold = 5; double shortDocumentPenalty = 6; double domainRankBonus = 7; double qualityPenalty = 8; int32 shortSentenceThreshold = 9; double shortSentencePenalty = 10; - double bm25FullWeight = 11; - double bm25NgramWeight = 12; - double bm25PrioWeight = 13; - double tcfOverlapWeight = 14; - double tcfJaccardWeight = 15; + double bm25Weight = 11; + double tcfAvgDistWeight = 12; + double tcfFirstPositionWeight = 13; + // 14, 15 unused RpcTemporalBias temporalBias = 16; double temporalBiasWeight = 17; + bool exportDebugData = 18; + } message RpcResultRankingDetails { @@ -155,6 +156,7 @@ message RpcResultRankingInputs { repeated string flags = 7; } +/** Summary of the output of the ranking function */ message RpcResultRankingOutputs { double averageSentenceLengthPenalty = 1; double qualityPenalty = 2; @@ -164,11 +166,10 @@ message RpcResultRankingOutputs { double temporalBias = 6; double flagsPenalty = 7; double overallPart = 8; - double tcfOverlap = 9; - double tcfJaccard = 10; - double bM25F = 11; - double bM25N = 12; - double bM25P = 13; + double bm25Part = 9; + // 10-14 unused + double tcfAvgDist = 15; + double tcfFirstPosition = 16; } /* Defines a single 
subquery */ diff --git a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java index 379a1d9d..6ab72eef 100644 --- a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java +++ b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java @@ -88,10 +88,10 @@ public class ResultValuator { + bestCoherence; // FIXME: need a weighting factor here - double tcfAvgDist = 25. / termCoherenceFactor.calculateAvgMinDistance(positionsQuery, ctx); + double tcfAvgDist = rankingParams.tcfAvgDist * (1.0 / termCoherenceFactor.calculateAvgMinDistance(positionsQuery, ctx)); + double tcfFirstPosition = 0.; - double bM25F = rankingParams.bm25FullWeight * wordFlagsQuery.root.visit(new Bm25FullGraphVisitor(rankingParams.fullParams, positionsCountQuery.data, length, ctx)); - double bM25P = rankingParams.bm25PrioWeight * wordFlagsQuery.root.visit(new Bm25PrioGraphVisitor(rankingParams.prioParams, wordFlagsQuery.data, ctx)); + double bM25 = rankingParams.bm25Weight * wordFlagsQuery.root.visit(new Bm25FullGraphVisitor(rankingParams.bm25Params, positionsCountQuery.data, length, ctx)); double overallPartPositive = Math.max(0, overallPart); double overallPartNegative = -Math.min(0, overallPart); @@ -116,11 +116,9 @@ public class ResultValuator { temporalBias, flagsPenalty, overallPart, - 0, - 0, - bM25F, - 0, // FIXME: Remove from model - bM25P) + bM25, + tcfAvgDist, + tcfFirstPosition) ); detailsConsumer.accept(details); @@ -129,8 +127,8 @@ public class ResultValuator { // Renormalize to 0...15, where 0 is the best possible score; // this is a historical artifact of the original ranking function double ret = normalize( - tcfAvgDist - + bM25F + bM25P + tcfAvgDist + tcfFirstPosition + + bM25 + overallPartPositive, overallPartNegative); diff --git a/code/services-core/query-service/java/nu/marginalia/query/QueryBasicInterface.java 
b/code/services-core/query-service/java/nu/marginalia/query/QueryBasicInterface.java index 62af8591..4ed3b838 100644 --- a/code/services-core/query-service/java/nu/marginalia/query/QueryBasicInterface.java +++ b/code/services-core/query-service/java/nu/marginalia/query/QueryBasicInterface.java @@ -106,23 +106,17 @@ public class QueryBasicInterface { .qualityPenalty(doubleFromRequest(request, "qualityPenalty", sensibleDefaults.qualityPenalty)) .shortDocumentThreshold(intFromRequest(request, "shortDocumentThreshold", sensibleDefaults.shortDocumentThreshold)) .shortDocumentPenalty(doubleFromRequest(request, "shortDocumentPenalty", sensibleDefaults.shortDocumentPenalty)) - .tcfJaccardWeight(doubleFromRequest(request, "tcfJaccardWeight", sensibleDefaults.tcfJaccardWeight)) - .tcfOverlapWeight(doubleFromRequest(request, "tcfOverlapWeight", sensibleDefaults.tcfOverlapWeight)) - .fullParams(new Bm25Parameters( - doubleFromRequest(request, "fullParams.k1", sensibleDefaults.fullParams.k()), - doubleFromRequest(request, "fullParams.b", sensibleDefaults.fullParams.b()) - )) - .prioParams(new Bm25Parameters( - doubleFromRequest(request, "prioParams.k1", sensibleDefaults.prioParams.k()), - doubleFromRequest(request, "prioParams.b", sensibleDefaults.prioParams.b()) + .tcfFirstPosition(doubleFromRequest(request, "tcfFirstPosition", sensibleDefaults.tcfFirstPosition)) + .tcfAvgDist(doubleFromRequest(request, "tcfAvgDist", sensibleDefaults.tcfAvgDist)) + .bm25Params(new Bm25Parameters( + doubleFromRequest(request, "bm25.k1", sensibleDefaults.bm25Params.k()), + doubleFromRequest(request, "bm25.b", sensibleDefaults.bm25Params.b()) )) .temporalBias(ResultRankingParameters.TemporalBias.valueOf(stringFromRequest(request, "temporalBias", sensibleDefaults.temporalBias.toString()))) .temporalBiasWeight(doubleFromRequest(request, "temporalBiasWeight", sensibleDefaults.temporalBiasWeight)) .shortSentenceThreshold(intFromRequest(request, "shortSentenceThreshold", 
sensibleDefaults.shortSentenceThreshold)) .shortSentencePenalty(doubleFromRequest(request, "shortSentencePenalty", sensibleDefaults.shortSentencePenalty)) - .bm25FullWeight(doubleFromRequest(request, "bm25FullWeight", sensibleDefaults.bm25FullWeight)) - .bm25NgramWeight(doubleFromRequest(request, "bm25NgramWeight", sensibleDefaults.bm25NgramWeight)) - .bm25PrioWeight(doubleFromRequest(request, "bm25PrioWeight", sensibleDefaults.bm25PrioWeight)) + .bm25Weight(doubleFromRequest(request, "bm25Weight", sensibleDefaults.bm25Weight)) .exportDebugData(true) .build(); } diff --git a/code/services-core/query-service/resources/templates/qdebug.hdb b/code/services-core/query-service/resources/templates/qdebug.hdb index 4081317f..4d2e7e41 100644 --- a/code/services-core/query-service/resources/templates/qdebug.hdb +++ b/code/services-core/query-service/resources/templates/qdebug.hdb @@ -31,22 +31,16 @@
-
-
-
-
+
+
+
+
-
-
-
-
-
-
-
-
-
-
+
+
+
+
@@ -67,12 +61,8 @@
-
-
-
-
-
-
+
+
{{/with}} @@ -130,11 +120,9 @@
Temporal Bias: {{temporalBias}}
Flags Penalty: {{flagsPenalty}}
Overall Part: {{overallPart}}
-
TCF Overlap: {{tcfOverlap}}
-
TCF Jaccard: {{tcfJaccard}}
-
BM25 Full: {{bM25F}}
-
BM25 Ngram: {{bM25N}}
-
BM25 Prio: {{bM25P}}
+
TCF Avg Distance: {{tcfAvgDist}}
+
TCF First Position: {{tcfFirstPosition}}
+
BM25: {{bm25}}
{{/with}}