mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(index) Return some variant of the previously removed 'Bm25PrioGraphVisitor'
This commit is contained in:
parent
d8a99784e5
commit
c6c8b059bf
@ -266,12 +266,14 @@ public class IndexResultScoreCalculator {
|
||||
double tcfFirstPosition = rankingParams.tcfFirstPosition * (1.0 / Math.max(1, firstPosition));
|
||||
|
||||
double bM25 = rankingParams.bm25Weight * wordFlagsQuery.root.visit(new Bm25GraphVisitor(rankingParams.bm25Params, weightedCounts, length, ctx));
|
||||
double bFlags = rankingParams.bm25Weight * wordFlagsQuery.root.visit(new TermFlagsGraphVisitor(rankingParams.bm25Params, wordFlagsQuery.data, weightedCounts, ctx));
|
||||
|
||||
// Renormalize to 0...15, where 0 is the best possible score;
|
||||
// this is a historical artifact of the original ranking function
|
||||
double ret = normalize(
|
||||
tcfAvgDist + tcfFirstPosition
|
||||
+ bM25
|
||||
+ bFlags
|
||||
+ Math.max(0, overallPart),
|
||||
-Math.min(0, overallPart));
|
||||
|
||||
|
@ -0,0 +1,128 @@
|
||||
package nu.marginalia.index.results;
|
||||
|
||||
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CqDataLong;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
|
||||
import nu.marginalia.api.searchquery.model.results.Bm25Parameters;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
||||
import nu.marginalia.model.idx.WordFlags;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public class TermFlagsGraphVisitor implements CqExpression.DoubleVisitor {
|
||||
private static final long AVG_LENGTH = 5000;
|
||||
|
||||
private final CqDataLong wordMetaData;
|
||||
private final CqDataInt frequencies;
|
||||
private final float[] counts;
|
||||
private final Bm25Parameters bm25Parameters;
|
||||
|
||||
private final int docCount;
|
||||
|
||||
public TermFlagsGraphVisitor(Bm25Parameters bm25Parameters,
|
||||
CqDataLong wordMetaData,
|
||||
float[] counts,
|
||||
ResultRankingContext ctx) {
|
||||
this.bm25Parameters = bm25Parameters;
|
||||
this.counts = counts;
|
||||
this.docCount = ctx.termFreqDocCount();
|
||||
this.wordMetaData = wordMetaData;
|
||||
this.frequencies = ctx.fullCounts;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double onAnd(List<? extends CqExpression> parts) {
|
||||
double value = 0;
|
||||
for (var part : parts) {
|
||||
value += part.visit(this);
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double onOr(List<? extends CqExpression> parts) {
|
||||
double value = 0;
|
||||
for (var part : parts) {
|
||||
value = Math.max(value, part.visit(this));
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double onLeaf(int idx) {
|
||||
double count = evaluatePriorityScore(idx);
|
||||
|
||||
int freq = frequencies.get(idx);
|
||||
|
||||
// note we override b to zero for priority terms as they are independent of document length
|
||||
return invFreq(docCount, freq) * f(bm25Parameters.k(), 0, count, 0);
|
||||
}
|
||||
|
||||
private double evaluatePriorityScore(int idx) {
|
||||
byte wordMeta = (byte) wordMetaData.get(idx);
|
||||
float pcount = counts[idx];
|
||||
|
||||
double qcount = 0.;
|
||||
|
||||
if ((wordMeta & WordFlags.ExternalLink.asBit()) != 0) {
|
||||
|
||||
qcount += 2.5;
|
||||
|
||||
if ((wordMeta & WordFlags.UrlDomain.asBit()) != 0)
|
||||
qcount += 2.5;
|
||||
else if ((wordMeta & WordFlags.UrlPath.asBit()) != 0)
|
||||
qcount += 1.5;
|
||||
|
||||
if ((wordMeta & WordFlags.Site.asBit()) != 0)
|
||||
qcount += 1.25;
|
||||
if ((wordMeta & WordFlags.SiteAdjacent.asBit()) != 0)
|
||||
qcount += 1.25;
|
||||
}
|
||||
else {
|
||||
if ((wordMeta & WordFlags.UrlDomain.asBit()) != 0)
|
||||
qcount += 3;
|
||||
else if ((wordMeta & WordFlags.UrlPath.asBit()) != 0)
|
||||
qcount += 1;
|
||||
|
||||
if ((wordMeta & WordFlags.Site.asBit()) != 0)
|
||||
qcount += 0.5;
|
||||
if ((wordMeta & WordFlags.SiteAdjacent.asBit()) != 0)
|
||||
qcount += 0.5;
|
||||
}
|
||||
|
||||
if ((wordMeta & WordFlags.Title.asBit()) != 0)
|
||||
qcount += 1.5;
|
||||
|
||||
if (pcount > 2) {
|
||||
if ((wordMeta & WordFlags.Subjects.asBit()) != 0)
|
||||
qcount += 1.25;
|
||||
if ((wordMeta & WordFlags.NamesWords.asBit()) != 0)
|
||||
qcount += 0.25;
|
||||
}
|
||||
|
||||
return qcount;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* @param docCount Number of documents
|
||||
* @param freq Number of matching documents
|
||||
*/
|
||||
private double invFreq(int docCount, int freq) {
|
||||
return Math.log(1.0 + (docCount - freq + 0.5) / (freq + 0.5));
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param k determines the size of the impact of a single term
|
||||
* @param b determines the magnitude of the length normalization
|
||||
* @param count number of occurrences in the document
|
||||
* @param length document length
|
||||
*/
|
||||
private double f(double k, double b, double count, int length) {
|
||||
final double lengthRatio = (double) length / AVG_LENGTH;
|
||||
|
||||
return (count * (k + 1)) / (count + k * (1 - b + b * lengthRatio));
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user