mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(index) Return some variant of the previously removed 'Bm25PrioGraphVisitor'
This commit is contained in:
parent
d8a99784e5
commit
c6c8b059bf
@ -266,12 +266,14 @@ public class IndexResultScoreCalculator {
|
|||||||
double tcfFirstPosition = rankingParams.tcfFirstPosition * (1.0 / Math.max(1, firstPosition));
|
double tcfFirstPosition = rankingParams.tcfFirstPosition * (1.0 / Math.max(1, firstPosition));
|
||||||
|
|
||||||
double bM25 = rankingParams.bm25Weight * wordFlagsQuery.root.visit(new Bm25GraphVisitor(rankingParams.bm25Params, weightedCounts, length, ctx));
|
double bM25 = rankingParams.bm25Weight * wordFlagsQuery.root.visit(new Bm25GraphVisitor(rankingParams.bm25Params, weightedCounts, length, ctx));
|
||||||
|
double bFlags = rankingParams.bm25Weight * wordFlagsQuery.root.visit(new TermFlagsGraphVisitor(rankingParams.bm25Params, wordFlagsQuery.data, weightedCounts, ctx));
|
||||||
|
|
||||||
// Renormalize to 0...15, where 0 is the best possible score;
|
// Renormalize to 0...15, where 0 is the best possible score;
|
||||||
// this is a historical artifact of the original ranking function
|
// this is a historical artifact of the original ranking function
|
||||||
double ret = normalize(
|
double ret = normalize(
|
||||||
tcfAvgDist + tcfFirstPosition
|
tcfAvgDist + tcfFirstPosition
|
||||||
+ bM25
|
+ bM25
|
||||||
|
+ bFlags
|
||||||
+ Math.max(0, overallPart),
|
+ Math.max(0, overallPart),
|
||||||
-Math.min(0, overallPart));
|
-Math.min(0, overallPart));
|
||||||
|
|
||||||
|
@ -0,0 +1,128 @@
|
|||||||
|
package nu.marginalia.index.results;
|
||||||
|
|
||||||
|
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
|
||||||
|
import nu.marginalia.api.searchquery.model.compiled.CqDataLong;
|
||||||
|
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
|
||||||
|
import nu.marginalia.api.searchquery.model.results.Bm25Parameters;
|
||||||
|
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
||||||
|
import nu.marginalia.model.idx.WordFlags;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
public class TermFlagsGraphVisitor implements CqExpression.DoubleVisitor {
|
||||||
|
private static final long AVG_LENGTH = 5000;
|
||||||
|
|
||||||
|
private final CqDataLong wordMetaData;
|
||||||
|
private final CqDataInt frequencies;
|
||||||
|
private final float[] counts;
|
||||||
|
private final Bm25Parameters bm25Parameters;
|
||||||
|
|
||||||
|
private final int docCount;
|
||||||
|
|
||||||
|
public TermFlagsGraphVisitor(Bm25Parameters bm25Parameters,
|
||||||
|
CqDataLong wordMetaData,
|
||||||
|
float[] counts,
|
||||||
|
ResultRankingContext ctx) {
|
||||||
|
this.bm25Parameters = bm25Parameters;
|
||||||
|
this.counts = counts;
|
||||||
|
this.docCount = ctx.termFreqDocCount();
|
||||||
|
this.wordMetaData = wordMetaData;
|
||||||
|
this.frequencies = ctx.fullCounts;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public double onAnd(List<? extends CqExpression> parts) {
|
||||||
|
double value = 0;
|
||||||
|
for (var part : parts) {
|
||||||
|
value += part.visit(this);
|
||||||
|
}
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public double onOr(List<? extends CqExpression> parts) {
|
||||||
|
double value = 0;
|
||||||
|
for (var part : parts) {
|
||||||
|
value = Math.max(value, part.visit(this));
|
||||||
|
}
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public double onLeaf(int idx) {
|
||||||
|
double count = evaluatePriorityScore(idx);
|
||||||
|
|
||||||
|
int freq = frequencies.get(idx);
|
||||||
|
|
||||||
|
// note we override b to zero for priority terms as they are independent of document length
|
||||||
|
return invFreq(docCount, freq) * f(bm25Parameters.k(), 0, count, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
private double evaluatePriorityScore(int idx) {
|
||||||
|
byte wordMeta = (byte) wordMetaData.get(idx);
|
||||||
|
float pcount = counts[idx];
|
||||||
|
|
||||||
|
double qcount = 0.;
|
||||||
|
|
||||||
|
if ((wordMeta & WordFlags.ExternalLink.asBit()) != 0) {
|
||||||
|
|
||||||
|
qcount += 2.5;
|
||||||
|
|
||||||
|
if ((wordMeta & WordFlags.UrlDomain.asBit()) != 0)
|
||||||
|
qcount += 2.5;
|
||||||
|
else if ((wordMeta & WordFlags.UrlPath.asBit()) != 0)
|
||||||
|
qcount += 1.5;
|
||||||
|
|
||||||
|
if ((wordMeta & WordFlags.Site.asBit()) != 0)
|
||||||
|
qcount += 1.25;
|
||||||
|
if ((wordMeta & WordFlags.SiteAdjacent.asBit()) != 0)
|
||||||
|
qcount += 1.25;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
if ((wordMeta & WordFlags.UrlDomain.asBit()) != 0)
|
||||||
|
qcount += 3;
|
||||||
|
else if ((wordMeta & WordFlags.UrlPath.asBit()) != 0)
|
||||||
|
qcount += 1;
|
||||||
|
|
||||||
|
if ((wordMeta & WordFlags.Site.asBit()) != 0)
|
||||||
|
qcount += 0.5;
|
||||||
|
if ((wordMeta & WordFlags.SiteAdjacent.asBit()) != 0)
|
||||||
|
qcount += 0.5;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((wordMeta & WordFlags.Title.asBit()) != 0)
|
||||||
|
qcount += 1.5;
|
||||||
|
|
||||||
|
if (pcount > 2) {
|
||||||
|
if ((wordMeta & WordFlags.Subjects.asBit()) != 0)
|
||||||
|
qcount += 1.25;
|
||||||
|
if ((wordMeta & WordFlags.NamesWords.asBit()) != 0)
|
||||||
|
qcount += 0.25;
|
||||||
|
}
|
||||||
|
|
||||||
|
return qcount;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param docCount Number of documents
|
||||||
|
* @param freq Number of matching documents
|
||||||
|
*/
|
||||||
|
private double invFreq(int docCount, int freq) {
|
||||||
|
return Math.log(1.0 + (docCount - freq + 0.5) / (freq + 0.5));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param k determines the size of the impact of a single term
|
||||||
|
* @param b determines the magnitude of the length normalization
|
||||||
|
* @param count number of occurrences in the document
|
||||||
|
* @param length document length
|
||||||
|
*/
|
||||||
|
private double f(double k, double b, double count, int length) {
|
||||||
|
final double lengthRatio = (double) length / AVG_LENGTH;
|
||||||
|
|
||||||
|
return (count * (k + 1)) / (count + k * (1 - b + b * lengthRatio));
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user