(index) Clean up jaccard index term code and down-tune the parameter's importance a bit

This commit is contained in:
Viktor Lofgren 2024-04-17 17:40:16 +02:00
parent 2f0b648fad
commit 44b33798f3
2 changed files with 40 additions and 10 deletions

View File

@ -74,8 +74,8 @@ public class ResultValuator {
+ temporalBias
+ flagsPenalty;
double tcfOverlap = rankingParams.tcfWeight * termCoherenceFactor.calculateOverlap(wordMeta);
double tcfJaccard = rankingParams.tcfWeight * termCoherenceFactor.calculateAvgMutualJaccard(wordMeta, ctx);
double tcfOverlap = 1.5 * rankingParams.tcfWeight * termCoherenceFactor.calculateOverlap(wordMeta);
double tcfJaccard = 0.5 * rankingParams.tcfWeight * termCoherenceFactor.calculateAvgMutualJaccard(wordMeta, ctx);
double bM25F = rankingParams.bm25FullWeight * wordMeta.root.visit(Bm25FullGraphVisitor.forRegular(rankingParams.fullParams, wordMeta.data, length, ctx));
double bM25N = 0.25 * rankingParams.bm25FullWeight * wordMeta.root.visit(Bm25FullGraphVisitor.forNgrams(rankingParams.fullParams, wordMeta.data, length, ctx));
@ -86,7 +86,11 @@ public class ResultValuator {
// Renormalize to 0...15, where 0 is the best possible score;
// this is a historical artifact of the original ranking function
return normalize(1.5 * tcfOverlap + tcfJaccard + bM25F + bM25P + bM25N + overallPartPositive, overallPartNegative);
return normalize(
tcfOverlap + tcfJaccard
+ bM25F + bM25P + bM25N
+ overallPartPositive,
overallPartNegative);
}
private double calculateQualityPenalty(int size, int quality, ResultRankingParameters rankingParams) {

View File

@ -9,6 +9,10 @@ import nu.marginalia.model.idx.WordMetadata;
*/
public class TermCoherenceFactor {
/** Calculate a factor that rewards the best total position overlap
* between the terms in the query. This is high when all the terms
* found in the same sentences.
*/
public double calculateOverlap(CompiledQueryLong wordMetadataQuery) {
long mask = CompiledQueryAggregates.longBitmaskAggregate(wordMetadataQuery,
score -> score >>> WordMetadata.POSITIONS_SHIFT);
@ -16,31 +20,53 @@ public class TermCoherenceFactor {
return bitsSetFactor(mask);
}
/** Calculate a factor that rewards the best average mutual Jaccard index
* between the terms in the query. This is high when the several terms are frequently
* found in the same sentences.
*/
public double calculateAvgMutualJaccard(CompiledQueryLong wordMetadataQuery, ResultRankingContext ctx) {
double sum = 0;
int cnt = 0;
for (int i = 0; i < wordMetadataQuery.size(); i++) {
if (!ctx.regularMask.get(i)) continue;
// Skip terms that are not in the regular mask
if (!ctx.regularMask.get(i))
continue;
long imask = WordMetadata.decodePositions(wordMetadataQuery.at(i));
// Skip terms that are not in the document
if (imask == 0L)
continue;
for (int j = i + 1; j < wordMetadataQuery.size(); j++) {
if (!ctx.regularMask.get(j)) continue;
// Skip terms that are not in the regular mask
if (!ctx.regularMask.get(j))
continue;
long jmask = WordMetadata.decodePositions(wordMetadataQuery.at(j));
// Skip terms that are not in the document
if (jmask == 0L)
continue;
long quot = Long.bitCount(imask & jmask);
long rem = Long.bitCount(imask | jmask);
if (rem != 0) {
sum += quot/(double) rem;
cnt++;
}
// rem is always > 0 because imask and jmask are not both 0
sum += quot/(double) rem;
cnt++;
}
}
return sum / cnt;
if (cnt != 0) {
return sum / cnt;
} else {
return 0;
}
}
double bitsSetFactor(long mask) {