(index) Clean up jaccard index term code and down-tune the parameter's importance a bit

This commit is contained in:
Viktor Lofgren 2024-04-17 17:40:16 +02:00
parent 2f0b648fad
commit 44b33798f3
2 changed files with 40 additions and 10 deletions

View File

@ -74,8 +74,8 @@ public class ResultValuator {
+ temporalBias + temporalBias
+ flagsPenalty; + flagsPenalty;
double tcfOverlap = rankingParams.tcfWeight * termCoherenceFactor.calculateOverlap(wordMeta); double tcfOverlap = 1.5 * rankingParams.tcfWeight * termCoherenceFactor.calculateOverlap(wordMeta);
double tcfJaccard = rankingParams.tcfWeight * termCoherenceFactor.calculateAvgMutualJaccard(wordMeta, ctx); double tcfJaccard = 0.5 * rankingParams.tcfWeight * termCoherenceFactor.calculateAvgMutualJaccard(wordMeta, ctx);
double bM25F = rankingParams.bm25FullWeight * wordMeta.root.visit(Bm25FullGraphVisitor.forRegular(rankingParams.fullParams, wordMeta.data, length, ctx)); double bM25F = rankingParams.bm25FullWeight * wordMeta.root.visit(Bm25FullGraphVisitor.forRegular(rankingParams.fullParams, wordMeta.data, length, ctx));
double bM25N = 0.25 * rankingParams.bm25FullWeight * wordMeta.root.visit(Bm25FullGraphVisitor.forNgrams(rankingParams.fullParams, wordMeta.data, length, ctx)); double bM25N = 0.25 * rankingParams.bm25FullWeight * wordMeta.root.visit(Bm25FullGraphVisitor.forNgrams(rankingParams.fullParams, wordMeta.data, length, ctx));
@ -86,7 +86,11 @@ public class ResultValuator {
// Renormalize to 0...15, where 0 is the best possible score; // Renormalize to 0...15, where 0 is the best possible score;
// this is a historical artifact of the original ranking function // this is a historical artifact of the original ranking function
return normalize(1.5 * tcfOverlap + tcfJaccard + bM25F + bM25P + bM25N + overallPartPositive, overallPartNegative); return normalize(
tcfOverlap + tcfJaccard
+ bM25F + bM25P + bM25N
+ overallPartPositive,
overallPartNegative);
} }
private double calculateQualityPenalty(int size, int quality, ResultRankingParameters rankingParams) { private double calculateQualityPenalty(int size, int quality, ResultRankingParameters rankingParams) {

View File

@ -9,6 +9,10 @@ import nu.marginalia.model.idx.WordMetadata;
*/ */
public class TermCoherenceFactor { public class TermCoherenceFactor {
/** Calculate a factor that rewards the best total position overlap
* between the terms in the query. This is high when all the terms
* are found in the same sentences.
*/
public double calculateOverlap(CompiledQueryLong wordMetadataQuery) { public double calculateOverlap(CompiledQueryLong wordMetadataQuery) {
long mask = CompiledQueryAggregates.longBitmaskAggregate(wordMetadataQuery, long mask = CompiledQueryAggregates.longBitmaskAggregate(wordMetadataQuery,
score -> score >>> WordMetadata.POSITIONS_SHIFT); score -> score >>> WordMetadata.POSITIONS_SHIFT);
@ -16,31 +20,53 @@ public class TermCoherenceFactor {
return bitsSetFactor(mask); return bitsSetFactor(mask);
} }
/** Calculate a factor that rewards the best average mutual Jaccard index
* between the terms in the query. This is high when several of the terms are frequently
* found in the same sentences.
*/
public double calculateAvgMutualJaccard(CompiledQueryLong wordMetadataQuery, ResultRankingContext ctx) { public double calculateAvgMutualJaccard(CompiledQueryLong wordMetadataQuery, ResultRankingContext ctx) {
double sum = 0; double sum = 0;
int cnt = 0; int cnt = 0;
for (int i = 0; i < wordMetadataQuery.size(); i++) { for (int i = 0; i < wordMetadataQuery.size(); i++) {
if (!ctx.regularMask.get(i)) continue;
// Skip terms that are not in the regular mask
if (!ctx.regularMask.get(i))
continue;
long imask = WordMetadata.decodePositions(wordMetadataQuery.at(i)); long imask = WordMetadata.decodePositions(wordMetadataQuery.at(i));
// Skip terms that are not in the document
if (imask == 0L)
continue;
for (int j = i + 1; j < wordMetadataQuery.size(); j++) { for (int j = i + 1; j < wordMetadataQuery.size(); j++) {
if (!ctx.regularMask.get(j)) continue;
// Skip terms that are not in the regular mask
if (!ctx.regularMask.get(j))
continue;
long jmask = WordMetadata.decodePositions(wordMetadataQuery.at(j)); long jmask = WordMetadata.decodePositions(wordMetadataQuery.at(j));
// Skip terms that are not in the document
if (jmask == 0L)
continue;
long quot = Long.bitCount(imask & jmask); long quot = Long.bitCount(imask & jmask);
long rem = Long.bitCount(imask | jmask); long rem = Long.bitCount(imask | jmask);
if (rem != 0) { // rem is always > 0 because imask and jmask are not both 0
sum += quot/(double) rem;
cnt++; sum += quot/(double) rem;
} cnt++;
} }
} }
return sum / cnt; if (cnt != 0) {
return sum / cnt;
} else {
return 0;
}
} }
double bitsSetFactor(long mask) { double bitsSetFactor(long mask) {