(index) Return some variant of the previously removed 'Bm25PrioGraphVisitor'

This commit is contained in:
Viktor Lofgren 2024-08-03 10:10:12 +02:00
parent d8a99784e5
commit c6c8b059bf
2 changed files with 130 additions and 0 deletions

View File

@ -266,12 +266,14 @@ public class IndexResultScoreCalculator {
double tcfFirstPosition = rankingParams.tcfFirstPosition * (1.0 / Math.max(1, firstPosition));
double bM25 = rankingParams.bm25Weight * wordFlagsQuery.root.visit(new Bm25GraphVisitor(rankingParams.bm25Params, weightedCounts, length, ctx));
double bFlags = rankingParams.bm25Weight * wordFlagsQuery.root.visit(new TermFlagsGraphVisitor(rankingParams.bm25Params, wordFlagsQuery.data, weightedCounts, ctx));
// Renormalize to 0...15, where 0 is the best possible score;
// this is a historical artifact of the original ranking function
double ret = normalize(
tcfAvgDist + tcfFirstPosition
+ bM25
+ bFlags
+ Math.max(0, overallPart),
-Math.min(0, overallPart));

View File

@ -0,0 +1,128 @@
package nu.marginalia.index.results;
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
import nu.marginalia.api.searchquery.model.compiled.CqDataLong;
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
import nu.marginalia.api.searchquery.model.results.Bm25Parameters;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.model.idx.WordFlags;
import java.util.List;
/**
 * Visitor over a compiled query graph that scores a document from the flag
 * bits attached to each query term, using a BM25-shaped formula.
 *
 * <p>AND nodes add the scores of their children, OR nodes take the largest
 * child score (never less than zero), and each leaf contributes
 * {@code invFreq(docCount, freq) * f(k, 0, flagScore, 0)}, where
 * {@code flagScore} is derived from the term's {@link WordFlags} bits.
 */
public class TermFlagsGraphVisitor implements CqExpression.DoubleVisitor {
    /** Divisor of the length ratio in {@link #f}; irrelevant whenever {@code b} is zero. */
    private static final long AVG_LENGTH = 5000;

    /** Per-term metadata words, indexed by leaf index. */
    private final CqDataLong wordMetaData;
    /** Per-term frequencies taken from the ranking context, indexed by leaf index. */
    private final CqDataInt frequencies;
    /** Per-term counts supplied by the caller, indexed by leaf index. */
    private final float[] counts;
    private final Bm25Parameters bm25Parameters;
    /** Document count taken from the ranking context. */
    private final int docCount;

    /**
     * @param bm25Parameters source of the {@code k} parameter used at each leaf
     * @param wordMetaData   per-term metadata words holding the flag bits
     * @param counts         per-term counts; gates the Subjects/NamesWords bonuses
     * @param ctx            ranking context supplying the document count and term frequencies
     */
    public TermFlagsGraphVisitor(Bm25Parameters bm25Parameters,
                                 CqDataLong wordMetaData,
                                 float[] counts,
                                 ResultRankingContext ctx) {
        this.wordMetaData = wordMetaData;
        this.counts = counts;
        this.bm25Parameters = bm25Parameters;
        this.docCount = ctx.termFreqDocCount();
        this.frequencies = ctx.fullCounts;
    }

    /** Scores a conjunction as the sum of its parts, visited in list order. */
    @Override
    public double onAnd(List<? extends CqExpression> parts) {
        double sum = 0;
        for (CqExpression child : parts) {
            sum += child.visit(this);
        }
        return sum;
    }

    /** Scores a disjunction as the largest score among its parts, with a floor of zero. */
    @Override
    public double onOr(List<? extends CqExpression> parts) {
        double best = 0;
        for (CqExpression child : parts) {
            final double childScore = child.visit(this);
            best = Math.max(best, childScore);
        }
        return best;
    }

    /**
     * Scores a single term: the flag-derived pseudo-count is passed through the
     * BM25 term function and weighted by the term's inverse frequency.
     *
     * @param idx index of the term in the per-term data arrays
     */
    @Override
    public double onLeaf(int idx) {
        final double flagScore = evaluatePriorityScore(idx);
        final double inverseFrequency = invFreq(docCount, frequencies.get(idx));

        // b is forced to zero for priority terms because they are independent
        // of document length; the length argument is therefore a dummy
        final double termWeight = f(bm25Parameters.k(), 0, flagScore, 0);

        return inverseFrequency * termWeight;
    }

    /**
     * Converts the flag bits of a term into a pseudo-count.
     *
     * <p>URL, site and site-adjacent flags carry different weights depending on
     * whether the ExternalLink flag is also set; the ExternalLink flag adds 2.5
     * on its own. The Title flag always adds 1.5. The Subjects and NamesWords
     * bonuses apply only when the term's count exceeds 2.
     *
     * @param idx index of the term in the per-term data arrays
     */
    private double evaluatePriorityScore(int idx) {
        // NOTE(review): the narrowing cast keeps only the low 8 bits of the
        // metadata word -- confirm every WordFlags bit tested below fits there
        final byte meta = (byte) wordMetaData.get(idx);
        final float occurrences = counts[idx];

        final boolean external = hasFlag(meta, WordFlags.ExternalLink);
        double score = external ? 2.5 : 0.0;

        // UrlDomain takes precedence over UrlPath; at most one of them counts
        if (hasFlag(meta, WordFlags.UrlDomain)) {
            score += external ? 2.5 : 3;
        } else if (hasFlag(meta, WordFlags.UrlPath)) {
            score += external ? 1.5 : 1;
        }

        final double siteBonus = external ? 1.25 : 0.5;
        if (hasFlag(meta, WordFlags.Site)) {
            score += siteBonus;
        }
        if (hasFlag(meta, WordFlags.SiteAdjacent)) {
            score += siteBonus;
        }

        if (hasFlag(meta, WordFlags.Title)) {
            score += 1.5;
        }

        if (occurrences > 2) {
            if (hasFlag(meta, WordFlags.Subjects)) {
                score += 1.25;
            }
            if (hasFlag(meta, WordFlags.NamesWords)) {
                score += 0.25;
            }
        }

        return score;
    }

    /** Tests whether the bit belonging to {@code flag} is set in {@code meta}. */
    private static boolean hasFlag(byte meta, WordFlags flag) {
        return (meta & flag.asBit()) != 0;
    }

    /**
     * Inverse frequency term: {@code ln(1 + (docCount - freq + 0.5) / (freq + 0.5))}.
     *
     * @param docCount Number of documents
     * @param freq Number of matching documents
     */
    private double invFreq(int docCount, int freq) {
        final double nonMatching = docCount - freq + 0.5;
        final double matching = freq + 0.5;
        return Math.log(1.0 + nonMatching / matching);
    }

    /**
     * BM25 term function:
     * {@code count * (k + 1) / (count + k * (1 - b + b * length / AVG_LENGTH))}.
     *
     * @param k determines the size of the impact of a single term
     * @param b determines the magnitude of the length normalization
     * @param count number of occurrences in the document
     * @param length document length
     */
    private double f(double k, double b, double count, int length) {
        final double lengthRatio = (double) length / AVG_LENGTH;
        final double numerator = count * (k + 1);
        final double denominator = count + k * (1 - b + b * lengthRatio);
        return numerator / denominator;
    }
}