mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
(search-api, ranking) Update with new ranking parameters
This change adds new ranking parameters to the API and routes them through the system, in order to permit integration of the new position data with the ranking algorithm. It also removes several parameters that no longer served any purpose.
This commit is contained in:
parent
179a6002c2
commit
ad3857938d
@ -98,19 +98,16 @@ public class IndexProtobufCodec {
|
||||
return ResultRankingParameters.sensibleDefaults();
|
||||
|
||||
return new ResultRankingParameters(
|
||||
new Bm25Parameters(params.getFullK(), params.getFullB()),
|
||||
new Bm25Parameters(params.getPrioK(), params.getPrioB()),
|
||||
new Bm25Parameters(params.getBm25K(), params.getBm25B()),
|
||||
params.getShortDocumentThreshold(),
|
||||
params.getShortDocumentPenalty(),
|
||||
params.getDomainRankBonus(),
|
||||
params.getQualityPenalty(),
|
||||
params.getShortSentenceThreshold(),
|
||||
params.getShortSentencePenalty(),
|
||||
params.getBm25FullWeight(),
|
||||
params.getBm25NgramWeight(),
|
||||
params.getBm25PrioWeight(),
|
||||
params.getTcfJaccardWeight(),
|
||||
params.getTcfOverlapWeight(),
|
||||
params.getBm25Weight(),
|
||||
params.getTcfFirstPositionWeight(),
|
||||
params.getTcfAvgDistWeight(),
|
||||
ResultRankingParameters.TemporalBias.valueOf(params.getTemporalBias().getBias().name()),
|
||||
params.getTemporalBiasWeight(),
|
||||
params.getExportDebugData()
|
||||
@ -125,21 +122,17 @@ public class IndexProtobufCodec {
|
||||
}
|
||||
|
||||
var builder = RpcResultRankingParameters.newBuilder()
|
||||
.setFullB(rankingParams.fullParams.b())
|
||||
.setFullK(rankingParams.fullParams.k())
|
||||
.setPrioB(rankingParams.prioParams.b())
|
||||
.setPrioK(rankingParams.prioParams.k())
|
||||
.setBm25B(rankingParams.bm25Params.b())
|
||||
.setBm25K(rankingParams.bm25Params.k())
|
||||
.setShortDocumentThreshold(rankingParams.shortDocumentThreshold)
|
||||
.setShortDocumentPenalty(rankingParams.shortDocumentPenalty)
|
||||
.setDomainRankBonus(rankingParams.domainRankBonus)
|
||||
.setQualityPenalty(rankingParams.qualityPenalty)
|
||||
.setShortSentenceThreshold(rankingParams.shortSentenceThreshold)
|
||||
.setShortSentencePenalty(rankingParams.shortSentencePenalty)
|
||||
.setBm25FullWeight(rankingParams.bm25FullWeight)
|
||||
.setBm25NgramWeight(rankingParams.bm25NgramWeight)
|
||||
.setBm25PrioWeight(rankingParams.bm25PrioWeight)
|
||||
.setTcfOverlapWeight(rankingParams.tcfOverlapWeight)
|
||||
.setTcfJaccardWeight(rankingParams.tcfJaccardWeight)
|
||||
.setBm25Weight(rankingParams.bm25Weight)
|
||||
.setTcfAvgDistWeight(rankingParams.tcfAvgDist)
|
||||
.setTcfFirstPositionWeight(rankingParams.tcfFirstPosition)
|
||||
.setTemporalBiasWeight(rankingParams.temporalBiasWeight)
|
||||
.setExportDebugData(rankingParams.exportDebugData);
|
||||
|
||||
@ -176,11 +169,9 @@ public class IndexProtobufCodec {
|
||||
.setTemporalBias(outputs.temporalBias())
|
||||
.setFlagsPenalty(outputs.flagsPenalty())
|
||||
.setOverallPart(outputs.overallPart())
|
||||
.setTcfOverlap(outputs.tcfOverlap())
|
||||
.setTcfJaccard(outputs.tcfJaccard())
|
||||
.setBM25F(outputs.bM25F())
|
||||
.setBM25N(outputs.bM25N())
|
||||
.setBM25P(outputs.bM25P())
|
||||
.setTcfAvgDist(outputs.tcfAvgDist())
|
||||
.setTcfFirstPosition(outputs.tcfFirstPosition())
|
||||
.setBm25Part(outputs.bm25())
|
||||
.build();
|
||||
}
|
||||
|
||||
|
@ -157,11 +157,10 @@ public class QueryProtobufCodec {
|
||||
outputs.getTemporalBias(),
|
||||
outputs.getFlagsPenalty(),
|
||||
outputs.getOverallPart(),
|
||||
outputs.getTcfOverlap(),
|
||||
outputs.getTcfJaccard(),
|
||||
outputs.getBM25F(),
|
||||
outputs.getBM25N(),
|
||||
outputs.getBM25P()
|
||||
outputs.getBm25Part(),
|
||||
outputs.getTcfAvgDist(),
|
||||
outputs.getTcfFirstPosition()
|
||||
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -10,9 +10,7 @@ import lombok.*;
|
||||
public class ResultRankingParameters {
|
||||
|
||||
/** Tuning for BM25 when applied to full document matches */
|
||||
public final Bm25Parameters fullParams;
|
||||
/** Tuning for BM25 when applied to priority matches, terms with relevance signal indicators */
|
||||
public final Bm25Parameters prioParams;
|
||||
public final Bm25Parameters bm25Params;
|
||||
|
||||
/** Documents below this length are penalized */
|
||||
public int shortDocumentThreshold;
|
||||
@ -32,11 +30,9 @@ public class ResultRankingParameters {
|
||||
/** Magnitude of penalty for documents with low average sentence length */
|
||||
public double shortSentencePenalty;
|
||||
|
||||
public double bm25FullWeight;
|
||||
public double bm25NgramWeight;
|
||||
public double bm25PrioWeight;
|
||||
public double tcfJaccardWeight;
|
||||
public double tcfOverlapWeight;
|
||||
public double bm25Weight;
|
||||
public double tcfFirstPosition;
|
||||
public double tcfAvgDist;
|
||||
|
||||
public TemporalBias temporalBias;
|
||||
public double temporalBiasWeight;
|
||||
@ -45,19 +41,16 @@ public class ResultRankingParameters {
|
||||
|
||||
public static ResultRankingParameters sensibleDefaults() {
|
||||
return builder()
|
||||
.fullParams(new Bm25Parameters(1.2, 0.5))
|
||||
.prioParams(new Bm25Parameters(1.5, 0))
|
||||
.bm25Params(new Bm25Parameters(1.2, 0.5))
|
||||
.shortDocumentThreshold(2000)
|
||||
.shortDocumentPenalty(2.)
|
||||
.domainRankBonus(1/25.)
|
||||
.qualityPenalty(1/15.)
|
||||
.shortSentenceThreshold(2)
|
||||
.shortSentencePenalty(5)
|
||||
.bm25FullWeight(1.)
|
||||
.bm25NgramWeight(.25)
|
||||
.bm25PrioWeight(1.)
|
||||
.tcfOverlapWeight(3.)
|
||||
.tcfJaccardWeight(1)
|
||||
.bm25Weight(1.)
|
||||
.tcfAvgDist(25.)
|
||||
.tcfFirstPosition(1) // FIXME: what's a good default?
|
||||
.temporalBias(TemporalBias.NONE)
|
||||
.temporalBiasWeight(1. / (5.))
|
||||
.exportDebugData(false)
|
||||
|
@ -1,5 +1,6 @@
|
||||
package nu.marginalia.api.searchquery.model.results.debug;
|
||||
|
||||
|
||||
public record ResultRankingOutputs(double averageSentenceLengthPenalty,
|
||||
double qualityPenalty,
|
||||
double rankingBonus,
|
||||
@ -8,10 +9,8 @@ public record ResultRankingOutputs(double averageSentenceLengthPenalty,
|
||||
double temporalBias,
|
||||
double flagsPenalty,
|
||||
double overallPart,
|
||||
double tcfOverlap,
|
||||
double tcfJaccard,
|
||||
double bM25F,
|
||||
double bM25N,
|
||||
double bM25P)
|
||||
double bm25,
|
||||
double tcfAvgDist,
|
||||
double tcfFirstPosition)
|
||||
{
|
||||
}
|
||||
|
@ -119,25 +119,26 @@ message RpcQueryLimits {
|
||||
int32 fetchSize = 4; // Size of the fetch buffer in the index service
|
||||
}
|
||||
|
||||
/** Parameters for the result ranking function */
|
||||
message RpcResultRankingParameters {
|
||||
double fullK = 1; // BM25 parameter
|
||||
double fullB = 2; // BM25 parameter
|
||||
double prioK = 3; // BM25 parameter
|
||||
double prioB = 4; // BM25 parameter
|
||||
double bm25K = 1; // BM25 parameter
|
||||
double bm25B = 2; // BM25 parameter
|
||||
|
||||
int32 shortDocumentThreshold = 5;
|
||||
double shortDocumentPenalty = 6;
|
||||
double domainRankBonus = 7;
|
||||
double qualityPenalty = 8;
|
||||
int32 shortSentenceThreshold = 9;
|
||||
double shortSentencePenalty = 10;
|
||||
double bm25FullWeight = 11;
|
||||
double bm25NgramWeight = 12;
|
||||
double bm25PrioWeight = 13;
|
||||
double tcfOverlapWeight = 14;
|
||||
double tcfJaccardWeight = 15;
|
||||
double bm25Weight = 11;
|
||||
double tcfAvgDistWeight = 12;
|
||||
double tcfFirstPositionWeight = 13;
|
||||
// 14, 15 unused
|
||||
RpcTemporalBias temporalBias = 16;
|
||||
double temporalBiasWeight = 17;
|
||||
|
||||
bool exportDebugData = 18;
|
||||
|
||||
}
|
||||
|
||||
message RpcResultRankingDetails {
|
||||
@ -155,6 +156,7 @@ message RpcResultRankingInputs {
|
||||
repeated string flags = 7;
|
||||
}
|
||||
|
||||
/** Summary of the output of the ranking function */
|
||||
message RpcResultRankingOutputs {
|
||||
double averageSentenceLengthPenalty = 1;
|
||||
double qualityPenalty = 2;
|
||||
@ -164,11 +166,10 @@ message RpcResultRankingOutputs {
|
||||
double temporalBias = 6;
|
||||
double flagsPenalty = 7;
|
||||
double overallPart = 8;
|
||||
double tcfOverlap = 9;
|
||||
double tcfJaccard = 10;
|
||||
double bM25F = 11;
|
||||
double bM25N = 12;
|
||||
double bM25P = 13;
|
||||
double bm25Part = 9;
|
||||
// 10-14 unused
|
||||
double tcfAvgDist = 15;
|
||||
double tcfFirstPosition = 16;
|
||||
}
|
||||
|
||||
/* Defines a single subquery */
|
||||
|
@ -88,10 +88,10 @@ public class ResultValuator {
|
||||
+ bestCoherence;
|
||||
|
||||
// FIXME: need a weighting factor here
|
||||
double tcfAvgDist = 25. / termCoherenceFactor.calculateAvgMinDistance(positionsQuery, ctx);
|
||||
double tcfAvgDist = rankingParams.tcfAvgDist * (1.0 / termCoherenceFactor.calculateAvgMinDistance(positionsQuery, ctx));
|
||||
double tcfFirstPosition = 0.;
|
||||
|
||||
double bM25F = rankingParams.bm25FullWeight * wordFlagsQuery.root.visit(new Bm25FullGraphVisitor(rankingParams.fullParams, positionsCountQuery.data, length, ctx));
|
||||
double bM25P = rankingParams.bm25PrioWeight * wordFlagsQuery.root.visit(new Bm25PrioGraphVisitor(rankingParams.prioParams, wordFlagsQuery.data, ctx));
|
||||
double bM25 = rankingParams.bm25Weight * wordFlagsQuery.root.visit(new Bm25FullGraphVisitor(rankingParams.bm25Params, positionsCountQuery.data, length, ctx));
|
||||
|
||||
double overallPartPositive = Math.max(0, overallPart);
|
||||
double overallPartNegative = -Math.min(0, overallPart);
|
||||
@ -116,11 +116,9 @@ public class ResultValuator {
|
||||
temporalBias,
|
||||
flagsPenalty,
|
||||
overallPart,
|
||||
0,
|
||||
0,
|
||||
bM25F,
|
||||
0, // FIXME: Remove from model
|
||||
bM25P)
|
||||
bM25,
|
||||
tcfAvgDist,
|
||||
tcfFirstPosition)
|
||||
);
|
||||
|
||||
detailsConsumer.accept(details);
|
||||
@ -129,8 +127,8 @@ public class ResultValuator {
|
||||
// Renormalize to 0...15, where 0 is the best possible score;
|
||||
// this is a historical artifact of the original ranking function
|
||||
double ret = normalize(
|
||||
tcfAvgDist
|
||||
+ bM25F + bM25P
|
||||
tcfAvgDist + tcfFirstPosition
|
||||
+ bM25
|
||||
+ overallPartPositive,
|
||||
overallPartNegative);
|
||||
|
||||
|
@ -106,23 +106,17 @@ public class QueryBasicInterface {
|
||||
.qualityPenalty(doubleFromRequest(request, "qualityPenalty", sensibleDefaults.qualityPenalty))
|
||||
.shortDocumentThreshold(intFromRequest(request, "shortDocumentThreshold", sensibleDefaults.shortDocumentThreshold))
|
||||
.shortDocumentPenalty(doubleFromRequest(request, "shortDocumentPenalty", sensibleDefaults.shortDocumentPenalty))
|
||||
.tcfJaccardWeight(doubleFromRequest(request, "tcfJaccardWeight", sensibleDefaults.tcfJaccardWeight))
|
||||
.tcfOverlapWeight(doubleFromRequest(request, "tcfOverlapWeight", sensibleDefaults.tcfOverlapWeight))
|
||||
.fullParams(new Bm25Parameters(
|
||||
doubleFromRequest(request, "fullParams.k1", sensibleDefaults.fullParams.k()),
|
||||
doubleFromRequest(request, "fullParams.b", sensibleDefaults.fullParams.b())
|
||||
))
|
||||
.prioParams(new Bm25Parameters(
|
||||
doubleFromRequest(request, "prioParams.k1", sensibleDefaults.prioParams.k()),
|
||||
doubleFromRequest(request, "prioParams.b", sensibleDefaults.prioParams.b())
|
||||
.tcfFirstPosition(doubleFromRequest(request, "tcfFirstPosition", sensibleDefaults.tcfFirstPosition))
|
||||
.tcfAvgDist(doubleFromRequest(request, "tcfAvgDist", sensibleDefaults.tcfAvgDist))
|
||||
.bm25Params(new Bm25Parameters(
|
||||
doubleFromRequest(request, "bm25.k1", sensibleDefaults.bm25Params.k()),
|
||||
doubleFromRequest(request, "bm25.b", sensibleDefaults.bm25Params.b())
|
||||
))
|
||||
.temporalBias(ResultRankingParameters.TemporalBias.valueOf(stringFromRequest(request, "temporalBias", sensibleDefaults.temporalBias.toString())))
|
||||
.temporalBiasWeight(doubleFromRequest(request, "temporalBiasWeight", sensibleDefaults.temporalBiasWeight))
|
||||
.shortSentenceThreshold(intFromRequest(request, "shortSentenceThreshold", sensibleDefaults.shortSentenceThreshold))
|
||||
.shortSentencePenalty(doubleFromRequest(request, "shortSentencePenalty", sensibleDefaults.shortSentencePenalty))
|
||||
.bm25FullWeight(doubleFromRequest(request, "bm25FullWeight", sensibleDefaults.bm25FullWeight))
|
||||
.bm25NgramWeight(doubleFromRequest(request, "bm25NgramWeight", sensibleDefaults.bm25NgramWeight))
|
||||
.bm25PrioWeight(doubleFromRequest(request, "bm25PrioWeight", sensibleDefaults.bm25PrioWeight))
|
||||
.bm25Weight(doubleFromRequest(request, "bm25Weight", sensibleDefaults.bm25Weight))
|
||||
.exportDebugData(true)
|
||||
.build();
|
||||
}
|
||||
|
@ -31,22 +31,16 @@
|
||||
<div class="col-sm-2"><input type="text" class="form-control" id="shortDocumentPenalty" name="shortDocumentPenalty" value="{{shortDocumentPenalty}}"></div>
|
||||
</div>
|
||||
<div class="row my-2">
|
||||
<div class="col-sm-2"><label for="tcfJaccardWeight">TCF Jaccard Weight</label></div>
|
||||
<div class="col-sm-2"><input type="text" class="form-control" id="tcfJaccardWeight" name="tcfJaccardWeight" value="{{tcfJaccardWeight}}"></div>
|
||||
<div class="col-sm-2"><label for="tcfOverlapWeight">TCF Overlap Weight</label></div>
|
||||
<div class="col-sm-2"><input type="text" class="form-control" id="tcfOverlapWeight" name="tcfOverlapWeight" value="{{tcfOverlapWeight}}"></div>
|
||||
<div class="col-sm-2"><label for="tcfAvgDist">TCF Average Distance</label></div>
|
||||
<div class="col-sm-2"><input type="text" class="form-control" id="tcfAvgDist" name="tcfAvgDist" value="{{tcfAvgDist}}"></div>
|
||||
<div class="col-sm-2"><label for="tcfFirstPosition">TCF First Position Weight</label></div>
|
||||
<div class="col-sm-2"><input type="text" class="form-control" id="tcfFirstPosition" name="tcfFirstPosition" value="{{tcfFirstPosition}}"></div>
|
||||
</div>
|
||||
<div class="row my-2">
|
||||
<div class="col-sm-2"><label for="fullParams.k1">Full Params K1</label></div>
|
||||
<div class="col-sm-2"><input type="text" class="form-control" id="fullParams.k1" name="fullParams.k1" value="{{fullParams.k}}"></div>
|
||||
<div class="col-sm-2"><label for="fullParams.b">Full Params B</label></div>
|
||||
<div class="col-sm-2"><input type="text" class="form-control" id="fullParams.b" name="fullParams.b" value="{{fullParams.b}}"></div>
|
||||
</div>
|
||||
<div class="row my-2">
|
||||
<div class="col-sm-2"><label for="prioParams.k1">Prio Params K1</label></div>
|
||||
<div class="col-sm-2"><input type="text" class="form-control" id="prioParams.k1" name="prioParams.k1" value="{{prioParams.k}}"></div>
|
||||
<div class="col-sm-2"><label for="prioParams.b">Prio Params B</label></div>
|
||||
<div class="col-sm-2"><input type="text" class="form-control" id="prioParams.b" name="prioParams.b" value="{{prioParams.b}}"></div>
|
||||
<div class="col-sm-2"><label for="bm25.k1">BM25 K1</label></div>
|
||||
<div class="col-sm-2"><input type="text" class="form-control" id="bm25.k1" name="bm25.k1" value="{{bm25Params.k}}"></div>
|
||||
<div class="col-sm-2"><label for="bm25.b">BM25 B</label></div>
|
||||
<div class="col-sm-2"><input type="text" class="form-control" id="bm25.b" name="bm25.b" value="{{bm25Params.b}}"></div>
|
||||
</div>
|
||||
<div class="row my-2">
|
||||
<div class="col-sm-2"><label for="temporalBias">Temporal Bias</label></div>
|
||||
@ -67,12 +61,8 @@
|
||||
<div class="col-sm-2"><input type="text" class="form-control" id="shortSentencePenalty" name="shortSentencePenalty" value="{{shortSentencePenalty}}"></div>
|
||||
</div>
|
||||
<div class="row my-2">
|
||||
<div class="col-sm-2"><label for="bm25FullWeight">BM25 Full Weight</label></div>
|
||||
<div class="col-sm-2"><input type="text" class="form-control" id="bm25FullWeight" name="bm25FullWeight" value="{{bm25FullWeight}}"></div>
|
||||
<div class="col-sm-2"><label for="bm25NgramWeight">BM25 Ngram Weight</label></div>
|
||||
<div class="col-sm-2"><input type="text" class="form-control" id="bm25NgramWeight" name="bm25NgramWeight" value="{{bm25NgramWeight}}"></div>
|
||||
<div class="col-sm-2"><label for="bm25PrioWeight">BM25 Prio Weight</label></div>
|
||||
<div class="col-sm-2"><input type="text" class="form-control" id="bm25PrioWeight" name="bm25PrioWeight" value="{{bm25PrioWeight}}"></div>
|
||||
<div class="col-sm-2"><label for="bm25Weight">BM25 Weight</label></div>
|
||||
<div class="col-sm-2"><input type="text" class="form-control" id="bm25Weight" name="bm25Weight" value="{{bm25Weight}}"></div>
|
||||
</div>
|
||||
|
||||
{{/with}}
|
||||
@ -130,11 +120,9 @@
|
||||
<div><small class="text-muted">Temporal Bias: {{temporalBias}}</small></div>
|
||||
<div><small class="text-muted">Flags Penalty: {{flagsPenalty}}</small></div>
|
||||
<div><small class="text-muted">Overall Part: {{overallPart}}</small></div>
|
||||
<div><small class="text-muted">TCF Overlap: {{tcfOverlap}}</small></div>
|
||||
<div><small class="text-muted">TCF Jaccard: {{tcfJaccard}}</small></div>
|
||||
<div><small class="text-muted">BM25 Full: {{bM25F}}</small></div>
|
||||
<div><small class="text-muted">BM25 Ngram: {{bM25N}}</small></div>
|
||||
<div><small class="text-muted">BM25 Prio: {{bM25P}}</small></div>
|
||||
<div><small class="text-muted">TCF Avg Distance: {{tcfAvgDist}}</small></div>
|
||||
<div><small class="text-muted">TCF First Position: {{tcfFirstPosition}}</small></div>
|
||||
<div><small class="text-muted">BM25: {{bm25}}</small></div>
|
||||
{{/with}}
|
||||
|
||||
</div>
|
||||
|
Loading…
Reference in New Issue
Block a user