diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java index 3a57cfe6..2b5cbaa0 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java @@ -98,19 +98,16 @@ public class IndexProtobufCodec { return ResultRankingParameters.sensibleDefaults(); return new ResultRankingParameters( - new Bm25Parameters(params.getFullK(), params.getFullB()), - new Bm25Parameters(params.getPrioK(), params.getPrioB()), + new Bm25Parameters(params.getBm25K(), params.getBm25B()), params.getShortDocumentThreshold(), params.getShortDocumentPenalty(), params.getDomainRankBonus(), params.getQualityPenalty(), params.getShortSentenceThreshold(), params.getShortSentencePenalty(), - params.getBm25FullWeight(), - params.getBm25NgramWeight(), - params.getBm25PrioWeight(), - params.getTcfJaccardWeight(), - params.getTcfOverlapWeight(), + params.getBm25Weight(), + params.getTcfFirstPositionWeight(), + params.getTcfAvgDistWeight(), ResultRankingParameters.TemporalBias.valueOf(params.getTemporalBias().getBias().name()), params.getTemporalBiasWeight(), params.getExportDebugData() @@ -125,21 +122,17 @@ public class IndexProtobufCodec { } var builder = RpcResultRankingParameters.newBuilder() - .setFullB(rankingParams.fullParams.b()) - .setFullK(rankingParams.fullParams.k()) - .setPrioB(rankingParams.prioParams.b()) - .setPrioK(rankingParams.prioParams.k()) + .setBm25B(rankingParams.bm25Params.b()) + .setBm25K(rankingParams.bm25Params.k()) .setShortDocumentThreshold(rankingParams.shortDocumentThreshold) .setShortDocumentPenalty(rankingParams.shortDocumentPenalty) .setDomainRankBonus(rankingParams.domainRankBonus) .setQualityPenalty(rankingParams.qualityPenalty) .setShortSentenceThreshold(rankingParams.shortSentenceThreshold) .setShortSentencePenalty(rankingParams.shortSentencePenalty) - .setBm25FullWeight(rankingParams.bm25FullWeight) - .setBm25NgramWeight(rankingParams.bm25NgramWeight) - .setBm25PrioWeight(rankingParams.bm25PrioWeight) - .setTcfOverlapWeight(rankingParams.tcfOverlapWeight) - .setTcfJaccardWeight(rankingParams.tcfJaccardWeight) + .setBm25Weight(rankingParams.bm25Weight) + .setTcfAvgDistWeight(rankingParams.tcfAvgDist) + .setTcfFirstPositionWeight(rankingParams.tcfFirstPosition) .setTemporalBiasWeight(rankingParams.temporalBiasWeight) .setExportDebugData(rankingParams.exportDebugData); @@ -176,11 +169,9 @@ public class IndexProtobufCodec { .setTemporalBias(outputs.temporalBias()) .setFlagsPenalty(outputs.flagsPenalty()) .setOverallPart(outputs.overallPart()) - .setTcfOverlap(outputs.tcfOverlap()) - .setTcfJaccard(outputs.tcfJaccard()) - .setBM25F(outputs.bM25F()) - .setBM25N(outputs.bM25N()) - .setBM25P(outputs.bM25P()) + .setTcfAvgDist(outputs.tcfAvgDist()) + .setTcfFirstPosition(outputs.tcfFirstPosition()) + .setBm25Part(outputs.bm25()) .build(); } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java index 58a20a8a..5d79cfea 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java @@ -157,11 +157,10 @@ public class QueryProtobufCodec { outputs.getTemporalBias(), outputs.getFlagsPenalty(), outputs.getOverallPart(), - outputs.getTcfOverlap(), - outputs.getTcfJaccard(), - outputs.getBM25F(), - outputs.getBM25N(), - outputs.getBM25P() + outputs.getBm25Part(), + outputs.getTcfAvgDist(), + outputs.getTcfFirstPosition() + ); } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingParameters.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingParameters.java index 04f699aa..7a5b7937 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingParameters.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingParameters.java @@ -10,9 +10,7 @@ import lombok.*; public class ResultRankingParameters { /** Tuning for BM25 when applied to full document matches */ - public final Bm25Parameters fullParams; - /** Tuning for BM25 when applied to priority matches, terms with relevance signal indicators */ - public final Bm25Parameters prioParams; + public final Bm25Parameters bm25Params; /** Documents below this length are penalized */ public int shortDocumentThreshold; @@ -32,11 +30,9 @@ public class ResultRankingParameters { /** Magnitude of penalty for documents with low average sentence length */ public double shortSentencePenalty; - public double bm25FullWeight; - public double bm25NgramWeight; - public double bm25PrioWeight; - public double tcfJaccardWeight; - public double tcfOverlapWeight; + public double bm25Weight; + public double tcfFirstPosition; + public double tcfAvgDist; public TemporalBias temporalBias; public double temporalBiasWeight; @@ -45,19 +41,16 @@ public class ResultRankingParameters { public static ResultRankingParameters sensibleDefaults() { return builder() - .fullParams(new Bm25Parameters(1.2, 0.5)) - .prioParams(new Bm25Parameters(1.5, 0)) + .bm25Params(new Bm25Parameters(1.2, 0.5)) .shortDocumentThreshold(2000) .shortDocumentPenalty(2.) .domainRankBonus(1/25.) .qualityPenalty(1/15.) .shortSentenceThreshold(2) .shortSentencePenalty(5) - .bm25FullWeight(1.) - .bm25NgramWeight(.25) - .bm25PrioWeight(1.) - .tcfOverlapWeight(3.) - .tcfJaccardWeight(1) + .bm25Weight(1.) + .tcfAvgDist(25.) + .tcfFirstPosition(1) // FIXME: what's a good default? .temporalBias(TemporalBias.NONE) .temporalBiasWeight(1. / (5.)) .exportDebugData(false) diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/ResultRankingOutputs.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/ResultRankingOutputs.java index bd4b943d..e9c490e8 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/ResultRankingOutputs.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/ResultRankingOutputs.java @@ -1,5 +1,6 @@ package nu.marginalia.api.searchquery.model.results.debug; + public record ResultRankingOutputs(double averageSentenceLengthPenalty, double qualityPenalty, double rankingBonus, @@ -8,10 +9,8 @@ public record ResultRankingOutputs(double averageSentenceLengthPenalty, double temporalBias, double flagsPenalty, double overallPart, - double tcfOverlap, - double tcfJaccard, - double bM25F, - double bM25N, - double bM25P) + double bm25, + double tcfAvgDist, + double tcfFirstPosition) { } diff --git a/code/functions/search-query/api/src/main/protobuf/query-api.proto b/code/functions/search-query/api/src/main/protobuf/query-api.proto index 589c5143..a29b7010 100644 --- a/code/functions/search-query/api/src/main/protobuf/query-api.proto +++ b/code/functions/search-query/api/src/main/protobuf/query-api.proto @@ -119,25 +119,26 @@ message RpcQueryLimits { int32 fetchSize = 4; // Size of the fetch buffer in the index service } +/** Parameters for the result ranking function */ message RpcResultRankingParameters { - double fullK = 1; // BM25 parameter - double fullB = 2; // BM25 parameter - double prioK = 3; // BM25 parameter - double prioB = 4; // BM25 parameter + double bm25K = 1; // BM25 parameter + double bm25B = 2; // BM25 parameter + int32 shortDocumentThreshold = 5; double shortDocumentPenalty = 6; double domainRankBonus = 7; double qualityPenalty = 8; int32 shortSentenceThreshold = 9; double shortSentencePenalty = 10; - double bm25FullWeight = 11; - double bm25NgramWeight = 12; - double bm25PrioWeight = 13; - double tcfOverlapWeight = 14; - double tcfJaccardWeight = 15; + double bm25Weight = 11; + double tcfAvgDistWeight = 12; + double tcfFirstPositionWeight = 13; + // 14, 15 unused RpcTemporalBias temporalBias = 16; double temporalBiasWeight = 17; + bool exportDebugData = 18; + } message RpcResultRankingDetails { @@ -155,6 +156,7 @@ message RpcResultRankingInputs { repeated string flags = 7; } +/** Summary of the output of the ranking function */ message RpcResultRankingOutputs { double averageSentenceLengthPenalty = 1; double qualityPenalty = 2; @@ -164,11 +166,10 @@ message RpcResultRankingOutputs { double temporalBias = 6; double flagsPenalty = 7; double overallPart = 8; - double tcfOverlap = 9; - double tcfJaccard = 10; - double bM25F = 11; - double bM25N = 12; - double bM25P = 13; + double bm25Part = 9; + // 10-14 unused + double tcfAvgDist = 15; + double tcfFirstPosition = 16; } /* Defines a single subquery */ diff --git a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java index 379a1d9d..6ab72eef 100644 --- a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java +++ b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java @@ -88,10 +88,10 @@ public class ResultValuator { + bestCoherence; // FIXME: need a weighting factor here - double tcfAvgDist = 25. / termCoherenceFactor.calculateAvgMinDistance(positionsQuery, ctx); + double tcfAvgDist = rankingParams.tcfAvgDist * (1.0 / termCoherenceFactor.calculateAvgMinDistance(positionsQuery, ctx)); + double tcfFirstPosition = 0.; - double bM25F = rankingParams.bm25FullWeight * wordFlagsQuery.root.visit(new Bm25FullGraphVisitor(rankingParams.fullParams, positionsCountQuery.data, length, ctx)); - double bM25P = rankingParams.bm25PrioWeight * wordFlagsQuery.root.visit(new Bm25PrioGraphVisitor(rankingParams.prioParams, wordFlagsQuery.data, ctx)); + double bM25 = rankingParams.bm25Weight * wordFlagsQuery.root.visit(new Bm25FullGraphVisitor(rankingParams.bm25Params, positionsCountQuery.data, length, ctx)); double overallPartPositive = Math.max(0, overallPart); double overallPartNegative = -Math.min(0, overallPart); @@ -116,11 +116,9 @@ public class ResultValuator { temporalBias, flagsPenalty, overallPart, - 0, - 0, - bM25F, - 0, // FIXME: Remove from model - bM25P) + bM25, + tcfAvgDist, + tcfFirstPosition) ); detailsConsumer.accept(details); @@ -129,8 +127,8 @@ public class ResultValuator { // Renormalize to 0...15, where 0 is the best possible score; // this is a historical artifact of the original ranking function double ret = normalize( - tcfAvgDist - + bM25F + bM25P + tcfAvgDist + tcfFirstPosition + + bM25 + overallPartPositive, overallPartNegative); diff --git a/code/services-core/query-service/java/nu/marginalia/query/QueryBasicInterface.java b/code/services-core/query-service/java/nu/marginalia/query/QueryBasicInterface.java index 62af8591..4ed3b838 100644 --- a/code/services-core/query-service/java/nu/marginalia/query/QueryBasicInterface.java +++ b/code/services-core/query-service/java/nu/marginalia/query/QueryBasicInterface.java @@ -106,23 +106,17 @@ public class QueryBasicInterface { .qualityPenalty(doubleFromRequest(request, "qualityPenalty", sensibleDefaults.qualityPenalty)) .shortDocumentThreshold(intFromRequest(request, "shortDocumentThreshold", sensibleDefaults.shortDocumentThreshold)) .shortDocumentPenalty(doubleFromRequest(request, "shortDocumentPenalty", sensibleDefaults.shortDocumentPenalty)) - .tcfJaccardWeight(doubleFromRequest(request, "tcfJaccardWeight", sensibleDefaults.tcfJaccardWeight)) - .tcfOverlapWeight(doubleFromRequest(request, "tcfOverlapWeight", sensibleDefaults.tcfOverlapWeight)) - .fullParams(new Bm25Parameters( - doubleFromRequest(request, "fullParams.k1", sensibleDefaults.fullParams.k()), - doubleFromRequest(request, "fullParams.b", sensibleDefaults.fullParams.b()) - )) - .prioParams(new Bm25Parameters( - doubleFromRequest(request, "prioParams.k1", sensibleDefaults.prioParams.k()), - doubleFromRequest(request, "prioParams.b", sensibleDefaults.prioParams.b()) + .tcfFirstPosition(doubleFromRequest(request, "tcfFirstPosition", sensibleDefaults.tcfFirstPosition)) + .tcfAvgDist(doubleFromRequest(request, "tcfAvgDist", sensibleDefaults.tcfAvgDist)) + .bm25Params(new Bm25Parameters( + doubleFromRequest(request, "bm25.k1", sensibleDefaults.bm25Params.k()), + doubleFromRequest(request, "bm25.b", sensibleDefaults.bm25Params.b()) )) .temporalBias(ResultRankingParameters.TemporalBias.valueOf(stringFromRequest(request, "temporalBias", sensibleDefaults.temporalBias.toString()))) .temporalBiasWeight(doubleFromRequest(request, "temporalBiasWeight", sensibleDefaults.temporalBiasWeight)) .shortSentenceThreshold(intFromRequest(request, "shortSentenceThreshold", sensibleDefaults.shortSentenceThreshold)) .shortSentencePenalty(doubleFromRequest(request, "shortSentencePenalty", sensibleDefaults.shortSentencePenalty)) - .bm25FullWeight(doubleFromRequest(request, "bm25FullWeight", sensibleDefaults.bm25FullWeight)) - .bm25NgramWeight(doubleFromRequest(request, "bm25NgramWeight", sensibleDefaults.bm25NgramWeight)) - .bm25PrioWeight(doubleFromRequest(request, "bm25PrioWeight", sensibleDefaults.bm25PrioWeight)) + .bm25Weight(doubleFromRequest(request, "bm25Weight", sensibleDefaults.bm25Weight)) .exportDebugData(true) .build(); } diff --git a/code/services-core/query-service/resources/templates/qdebug.hdb b/code/services-core/query-service/resources/templates/qdebug.hdb index 4081317f..4d2e7e41 100644 --- a/code/services-core/query-service/resources/templates/qdebug.hdb +++ b/code/services-core/query-service/resources/templates/qdebug.hdb @@ -31,22 +31,16 @@