(search-api, ranking) Update with new ranking parameters

Adds new ranking parameters to the API and routes them through the system, in order to permit integration of the new position data with the ranking algorithm.

The change also cleans out several parameters that no longer served any purpose.
This commit is contained in:
Viktor Lofgren 2024-07-15 04:49:28 +02:00
parent 179a6002c2
commit ad3857938d
8 changed files with 70 additions and 107 deletions

View File

@ -98,19 +98,16 @@ public class IndexProtobufCodec {
return ResultRankingParameters.sensibleDefaults();
return new ResultRankingParameters(
new Bm25Parameters(params.getFullK(), params.getFullB()),
new Bm25Parameters(params.getPrioK(), params.getPrioB()),
new Bm25Parameters(params.getBm25K(), params.getBm25B()),
params.getShortDocumentThreshold(),
params.getShortDocumentPenalty(),
params.getDomainRankBonus(),
params.getQualityPenalty(),
params.getShortSentenceThreshold(),
params.getShortSentencePenalty(),
params.getBm25FullWeight(),
params.getBm25NgramWeight(),
params.getBm25PrioWeight(),
params.getTcfJaccardWeight(),
params.getTcfOverlapWeight(),
params.getBm25Weight(),
params.getTcfFirstPositionWeight(),
params.getTcfAvgDistWeight(),
ResultRankingParameters.TemporalBias.valueOf(params.getTemporalBias().getBias().name()),
params.getTemporalBiasWeight(),
params.getExportDebugData()
@ -125,21 +122,17 @@ public class IndexProtobufCodec {
}
var builder = RpcResultRankingParameters.newBuilder()
.setFullB(rankingParams.fullParams.b())
.setFullK(rankingParams.fullParams.k())
.setPrioB(rankingParams.prioParams.b())
.setPrioK(rankingParams.prioParams.k())
.setBm25B(rankingParams.bm25Params.b())
.setBm25K(rankingParams.bm25Params.k())
.setShortDocumentThreshold(rankingParams.shortDocumentThreshold)
.setShortDocumentPenalty(rankingParams.shortDocumentPenalty)
.setDomainRankBonus(rankingParams.domainRankBonus)
.setQualityPenalty(rankingParams.qualityPenalty)
.setShortSentenceThreshold(rankingParams.shortSentenceThreshold)
.setShortSentencePenalty(rankingParams.shortSentencePenalty)
.setBm25FullWeight(rankingParams.bm25FullWeight)
.setBm25NgramWeight(rankingParams.bm25NgramWeight)
.setBm25PrioWeight(rankingParams.bm25PrioWeight)
.setTcfOverlapWeight(rankingParams.tcfOverlapWeight)
.setTcfJaccardWeight(rankingParams.tcfJaccardWeight)
.setBm25Weight(rankingParams.bm25Weight)
.setTcfAvgDistWeight(rankingParams.tcfAvgDist)
.setTcfFirstPositionWeight(rankingParams.tcfFirstPosition)
.setTemporalBiasWeight(rankingParams.temporalBiasWeight)
.setExportDebugData(rankingParams.exportDebugData);
@ -176,11 +169,9 @@ public class IndexProtobufCodec {
.setTemporalBias(outputs.temporalBias())
.setFlagsPenalty(outputs.flagsPenalty())
.setOverallPart(outputs.overallPart())
.setTcfOverlap(outputs.tcfOverlap())
.setTcfJaccard(outputs.tcfJaccard())
.setBM25F(outputs.bM25F())
.setBM25N(outputs.bM25N())
.setBM25P(outputs.bM25P())
.setTcfAvgDist(outputs.tcfAvgDist())
.setTcfFirstPosition(outputs.tcfFirstPosition())
.setBm25Part(outputs.bm25())
.build();
}

View File

@ -157,11 +157,10 @@ public class QueryProtobufCodec {
outputs.getTemporalBias(),
outputs.getFlagsPenalty(),
outputs.getOverallPart(),
outputs.getTcfOverlap(),
outputs.getTcfJaccard(),
outputs.getBM25F(),
outputs.getBM25N(),
outputs.getBM25P()
outputs.getBm25Part(),
outputs.getTcfAvgDist(),
outputs.getTcfFirstPosition()
);
}

View File

@ -10,9 +10,7 @@ import lombok.*;
public class ResultRankingParameters {
/** Tuning for BM25 when applied to full document matches */
public final Bm25Parameters fullParams;
/** Tuning for BM25 when applied to priority matches, terms with relevance signal indicators */
public final Bm25Parameters prioParams;
public final Bm25Parameters bm25Params;
/** Documents below this length are penalized */
public int shortDocumentThreshold;
@ -32,11 +30,9 @@ public class ResultRankingParameters {
/** Magnitude of penalty for documents with low average sentence length */
public double shortSentencePenalty;
public double bm25FullWeight;
public double bm25NgramWeight;
public double bm25PrioWeight;
public double tcfJaccardWeight;
public double tcfOverlapWeight;
public double bm25Weight;
public double tcfFirstPosition;
public double tcfAvgDist;
public TemporalBias temporalBias;
public double temporalBiasWeight;
@ -45,19 +41,16 @@ public class ResultRankingParameters {
public static ResultRankingParameters sensibleDefaults() {
return builder()
.fullParams(new Bm25Parameters(1.2, 0.5))
.prioParams(new Bm25Parameters(1.5, 0))
.bm25Params(new Bm25Parameters(1.2, 0.5))
.shortDocumentThreshold(2000)
.shortDocumentPenalty(2.)
.domainRankBonus(1/25.)
.qualityPenalty(1/15.)
.shortSentenceThreshold(2)
.shortSentencePenalty(5)
.bm25FullWeight(1.)
.bm25NgramWeight(.25)
.bm25PrioWeight(1.)
.tcfOverlapWeight(3.)
.tcfJaccardWeight(1)
.bm25Weight(1.)
.tcfAvgDist(25.)
.tcfFirstPosition(1) // FIXME: what's a good default?
.temporalBias(TemporalBias.NONE)
.temporalBiasWeight(1. / (5.))
.exportDebugData(false)

View File

@ -1,5 +1,6 @@
package nu.marginalia.api.searchquery.model.results.debug;
public record ResultRankingOutputs(double averageSentenceLengthPenalty,
double qualityPenalty,
double rankingBonus,
@ -8,10 +9,8 @@ public record ResultRankingOutputs(double averageSentenceLengthPenalty,
double temporalBias,
double flagsPenalty,
double overallPart,
double tcfOverlap,
double tcfJaccard,
double bM25F,
double bM25N,
double bM25P)
double bm25,
double tcfAvgDist,
double tcfFirstPosition)
{
}

View File

@ -119,25 +119,26 @@ message RpcQueryLimits {
int32 fetchSize = 4; // Size of the fetch buffer in the index service
}
/** Parameters for the result ranking function */
message RpcResultRankingParameters {
double fullK = 1; // BM25 parameter
double fullB = 2; // BM25 parameter
double prioK = 3; // BM25 parameter
double prioB = 4; // BM25 parameter
double bm25K = 1; // BM25 parameter
double bm25B = 2; // BM25 parameter
int32 shortDocumentThreshold = 5;
double shortDocumentPenalty = 6;
double domainRankBonus = 7;
double qualityPenalty = 8;
int32 shortSentenceThreshold = 9;
double shortSentencePenalty = 10;
double bm25FullWeight = 11;
double bm25NgramWeight = 12;
double bm25PrioWeight = 13;
double tcfOverlapWeight = 14;
double tcfJaccardWeight = 15;
double bm25Weight = 11;
double tcfAvgDistWeight = 12;
double tcfFirstPositionWeight = 13;
// 3, 4, 14, 15 unused
RpcTemporalBias temporalBias = 16;
double temporalBiasWeight = 17;
bool exportDebugData = 18;
}
message RpcResultRankingDetails {
@ -155,6 +156,7 @@ message RpcResultRankingInputs {
repeated string flags = 7;
}
/** Summary of the output of the ranking function */
message RpcResultRankingOutputs {
double averageSentenceLengthPenalty = 1;
double qualityPenalty = 2;
@ -164,11 +166,10 @@ message RpcResultRankingOutputs {
double temporalBias = 6;
double flagsPenalty = 7;
double overallPart = 8;
double tcfOverlap = 9;
double tcfJaccard = 10;
double bM25F = 11;
double bM25N = 12;
double bM25P = 13;
double bm25Part = 9;
// 10-14 unused
double tcfAvgDist = 15;
double tcfFirstPosition = 16;
}
/* Defines a single subquery */

View File

@ -88,10 +88,10 @@ public class ResultValuator {
+ bestCoherence;
// FIXME: need a weighting factor here
double tcfAvgDist = 25. / termCoherenceFactor.calculateAvgMinDistance(positionsQuery, ctx);
double tcfAvgDist = rankingParams.tcfAvgDist * (1.0 / termCoherenceFactor.calculateAvgMinDistance(positionsQuery, ctx));
double tcfFirstPosition = 0.;
double bM25F = rankingParams.bm25FullWeight * wordFlagsQuery.root.visit(new Bm25FullGraphVisitor(rankingParams.fullParams, positionsCountQuery.data, length, ctx));
double bM25P = rankingParams.bm25PrioWeight * wordFlagsQuery.root.visit(new Bm25PrioGraphVisitor(rankingParams.prioParams, wordFlagsQuery.data, ctx));
double bM25 = rankingParams.bm25Weight * wordFlagsQuery.root.visit(new Bm25FullGraphVisitor(rankingParams.bm25Params, positionsCountQuery.data, length, ctx));
double overallPartPositive = Math.max(0, overallPart);
double overallPartNegative = -Math.min(0, overallPart);
@ -116,11 +116,9 @@ public class ResultValuator {
temporalBias,
flagsPenalty,
overallPart,
0,
0,
bM25F,
0, // FIXME: Remove from model
bM25P)
bM25,
tcfAvgDist,
tcfFirstPosition)
);
detailsConsumer.accept(details);
@ -129,8 +127,8 @@ public class ResultValuator {
// Renormalize to 0...15, where 0 is the best possible score;
// this is a historical artifact of the original ranking function
double ret = normalize(
tcfAvgDist
+ bM25F + bM25P
tcfAvgDist + tcfFirstPosition
+ bM25
+ overallPartPositive,
overallPartNegative);

View File

@ -106,23 +106,17 @@ public class QueryBasicInterface {
.qualityPenalty(doubleFromRequest(request, "qualityPenalty", sensibleDefaults.qualityPenalty))
.shortDocumentThreshold(intFromRequest(request, "shortDocumentThreshold", sensibleDefaults.shortDocumentThreshold))
.shortDocumentPenalty(doubleFromRequest(request, "shortDocumentPenalty", sensibleDefaults.shortDocumentPenalty))
.tcfJaccardWeight(doubleFromRequest(request, "tcfJaccardWeight", sensibleDefaults.tcfJaccardWeight))
.tcfOverlapWeight(doubleFromRequest(request, "tcfOverlapWeight", sensibleDefaults.tcfOverlapWeight))
.fullParams(new Bm25Parameters(
doubleFromRequest(request, "fullParams.k1", sensibleDefaults.fullParams.k()),
doubleFromRequest(request, "fullParams.b", sensibleDefaults.fullParams.b())
))
.prioParams(new Bm25Parameters(
doubleFromRequest(request, "prioParams.k1", sensibleDefaults.prioParams.k()),
doubleFromRequest(request, "prioParams.b", sensibleDefaults.prioParams.b())
.tcfFirstPosition(doubleFromRequest(request, "tcfFirstPosition", sensibleDefaults.tcfFirstPosition))
.tcfAvgDist(doubleFromRequest(request, "tcfAvgDist", sensibleDefaults.tcfAvgDist))
.bm25Params(new Bm25Parameters(
doubleFromRequest(request, "bm25.k1", sensibleDefaults.bm25Params.k()),
doubleFromRequest(request, "bm25.b", sensibleDefaults.bm25Params.b())
))
.temporalBias(ResultRankingParameters.TemporalBias.valueOf(stringFromRequest(request, "temporalBias", sensibleDefaults.temporalBias.toString())))
.temporalBiasWeight(doubleFromRequest(request, "temporalBiasWeight", sensibleDefaults.temporalBiasWeight))
.shortSentenceThreshold(intFromRequest(request, "shortSentenceThreshold", sensibleDefaults.shortSentenceThreshold))
.shortSentencePenalty(doubleFromRequest(request, "shortSentencePenalty", sensibleDefaults.shortSentencePenalty))
.bm25FullWeight(doubleFromRequest(request, "bm25FullWeight", sensibleDefaults.bm25FullWeight))
.bm25NgramWeight(doubleFromRequest(request, "bm25NgramWeight", sensibleDefaults.bm25NgramWeight))
.bm25PrioWeight(doubleFromRequest(request, "bm25PrioWeight", sensibleDefaults.bm25PrioWeight))
.bm25Weight(doubleFromRequest(request, "bm25Weight", sensibleDefaults.bm25Weight))
.exportDebugData(true)
.build();
}

View File

@ -31,22 +31,16 @@
<div class="col-sm-2"><input type="text" class="form-control" id="shortDocumentPenalty" name="shortDocumentPenalty" value="{{shortDocumentPenalty}}"></div>
</div>
<div class="row my-2">
<div class="col-sm-2"><label for="tcfJaccardWeight">TCF Jaccard Weight</label></div>
<div class="col-sm-2"><input type="text" class="form-control" id="tcfJaccardWeight" name="tcfJaccardWeight" value="{{tcfJaccardWeight}}"></div>
<div class="col-sm-2"><label for="tcfOverlapWeight">TCF Overlap Weight</label></div>
<div class="col-sm-2"><input type="text" class="form-control" id="tcfOverlapWeight" name="tcfOverlapWeight" value="{{tcfOverlapWeight}}"></div>
<div class="col-sm-2"><label for="tcfAvgDist">TCF Average Distance</label></div>
<div class="col-sm-2"><input type="text" class="form-control" id="tcfAvgDist" name="tcfAvgDist" value="{{tcfAvgDist}}"></div>
<div class="col-sm-2"><label for="tcfFirstPosition">TCF First Position Weight</label></div>
<div class="col-sm-2"><input type="text" class="form-control" id="tcfFirstPosition" name="tcfFirstPosition" value="{{tcfFirstPosition}}"></div>
</div>
<div class="row my-2">
<div class="col-sm-2"><label for="fullParams.k1">Full Params K1</label></div>
<div class="col-sm-2"><input type="text" class="form-control" id="fullParams.k1" name="fullParams.k1" value="{{fullParams.k}}"></div>
<div class="col-sm-2"><label for="fullParams.b">Full Params B</label></div>
<div class="col-sm-2"><input type="text" class="form-control" id="fullParams.b" name="fullParams.b" value="{{fullParams.b}}"></div>
</div>
<div class="row my-2">
<div class="col-sm-2"><label for="prioParams.k1">Prio Params K1</label></div>
<div class="col-sm-2"><input type="text" class="form-control" id="prioParams.k1" name="prioParams.k1" value="{{prioParams.k}}"></div>
<div class="col-sm-2"><label for="prioParams.b">Prio Params B</label></div>
<div class="col-sm-2"><input type="text" class="form-control" id="prioParams.b" name="prioParams.b" value="{{prioParams.b}}"></div>
<div class="col-sm-2"><label for="bm25.k1">BM25 K1</label></div>
<div class="col-sm-2"><input type="text" class="form-control" id="bm25.k1" name="bm25.k1" value="{{bm25Params.k}}"></div>
<div class="col-sm-2"><label for="bm25.b">BM25 B</label></div>
<div class="col-sm-2"><input type="text" class="form-control" id="bm25.b" name="bm25.b" value="{{bm25Params.b}}"></div>
</div>
<div class="row my-2">
<div class="col-sm-2"><label for="temporalBias">Temporal Bias</label></div>
@ -67,12 +61,8 @@
<div class="col-sm-2"><input type="text" class="form-control" id="shortSentencePenalty" name="shortSentencePenalty" value="{{shortSentencePenalty}}"></div>
</div>
<div class="row my-2">
<div class="col-sm-2"><label for="bm25FullWeight">BM25 Full Weight</label></div>
<div class="col-sm-2"><input type="text" class="form-control" id="bm25FullWeight" name="bm25FullWeight" value="{{bm25FullWeight}}"></div>
<div class="col-sm-2"><label for="bm25NgramWeight">BM25 Ngram Weight</label></div>
<div class="col-sm-2"><input type="text" class="form-control" id="bm25NgramWeight" name="bm25NgramWeight" value="{{bm25NgramWeight}}"></div>
<div class="col-sm-2"><label for="bm25PrioWeight">BM25 Prio Weight</label></div>
<div class="col-sm-2"><input type="text" class="form-control" id="bm25PrioWeight" name="bm25PrioWeight" value="{{bm25PrioWeight}}"></div>
{{! The label's "for" attribute must match the input's id so that clicking the label focuses the field. }}
<div class="col-sm-2"><label for="bm25Weight">BM25 Weight</label></div>
<div class="col-sm-2"><input type="text" class="form-control" id="bm25Weight" name="bm25Weight" value="{{bm25Weight}}"></div>
</div>
{{/with}}
@ -130,11 +120,9 @@
<div><small class="text-muted">Temporal Bias: {{temporalBias}}</small></div>
<div><small class="text-muted">Flags Penalty: {{flagsPenalty}}</small></div>
<div><small class="text-muted">Overall Part: {{overallPart}}</small></div>
<div><small class="text-muted">TCF Overlap: {{tcfOverlap}}</small></div>
<div><small class="text-muted">TCF Jaccard: {{tcfJaccard}}</small></div>
<div><small class="text-muted">BM25 Full: {{bM25F}}</small></div>
<div><small class="text-muted">BM25 Ngram: {{bM25N}}</small></div>
<div><small class="text-muted">BM25 Prio: {{bM25P}}</small></div>
<div><small class="text-muted">TCF Avg Distance: {{tcfAvgDist}}</small></div>
<div><small class="text-muted">TCF First Position: {{tcfFirstPosition}}</small></div>
{{! The property name is case-sensitive; the ranking outputs record exposes this value as "bm25". }}
<div><small class="text-muted">BM25: {{bm25}}</small></div>
{{/with}}
</div>