mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(index) Clean up jaccard index term code and down-tune the parameter's importance a bit
This commit is contained in:
parent
2f0b648fad
commit
44b33798f3
@ -74,8 +74,8 @@ public class ResultValuator {
|
||||
+ temporalBias
|
||||
+ flagsPenalty;
|
||||
|
||||
double tcfOverlap = rankingParams.tcfWeight * termCoherenceFactor.calculateOverlap(wordMeta);
|
||||
double tcfJaccard = rankingParams.tcfWeight * termCoherenceFactor.calculateAvgMutualJaccard(wordMeta, ctx);
|
||||
double tcfOverlap = 1.5 * rankingParams.tcfWeight * termCoherenceFactor.calculateOverlap(wordMeta);
|
||||
double tcfJaccard = 0.5 * rankingParams.tcfWeight * termCoherenceFactor.calculateAvgMutualJaccard(wordMeta, ctx);
|
||||
|
||||
double bM25F = rankingParams.bm25FullWeight * wordMeta.root.visit(Bm25FullGraphVisitor.forRegular(rankingParams.fullParams, wordMeta.data, length, ctx));
|
||||
double bM25N = 0.25 * rankingParams.bm25FullWeight * wordMeta.root.visit(Bm25FullGraphVisitor.forNgrams(rankingParams.fullParams, wordMeta.data, length, ctx));
|
||||
@ -86,7 +86,11 @@ public class ResultValuator {
|
||||
|
||||
// Renormalize to 0...15, where 0 is the best possible score;
|
||||
// this is a historical artifact of the original ranking function
|
||||
return normalize(1.5 * tcfOverlap + tcfJaccard + bM25F + bM25P + bM25N + overallPartPositive, overallPartNegative);
|
||||
return normalize(
|
||||
tcfOverlap + tcfJaccard
|
||||
+ bM25F + bM25P + bM25N
|
||||
+ overallPartPositive,
|
||||
overallPartNegative);
|
||||
}
|
||||
|
||||
private double calculateQualityPenalty(int size, int quality, ResultRankingParameters rankingParams) {
|
||||
|
@ -9,6 +9,10 @@ import nu.marginalia.model.idx.WordMetadata;
|
||||
*/
|
||||
public class TermCoherenceFactor {
|
||||
|
||||
/** Calculate a factor that rewards the best total position overlap
|
||||
* between the terms in the query. This is high when all the terms
|
||||
* are found in the same sentences.
|
||||
*/
|
||||
public double calculateOverlap(CompiledQueryLong wordMetadataQuery) {
|
||||
long mask = CompiledQueryAggregates.longBitmaskAggregate(wordMetadataQuery,
|
||||
score -> score >>> WordMetadata.POSITIONS_SHIFT);
|
||||
@ -16,31 +20,53 @@ public class TermCoherenceFactor {
|
||||
return bitsSetFactor(mask);
|
||||
}
|
||||
|
||||
/** Calculate a factor that rewards the best average mutual Jaccard index
|
||||
* between the terms in the query. This is high when several terms are frequently
|
||||
* found in the same sentences.
|
||||
*/
|
||||
public double calculateAvgMutualJaccard(CompiledQueryLong wordMetadataQuery, ResultRankingContext ctx) {
|
||||
double sum = 0;
|
||||
int cnt = 0;
|
||||
|
||||
for (int i = 0; i < wordMetadataQuery.size(); i++) {
|
||||
if (!ctx.regularMask.get(i)) continue;
|
||||
|
||||
// Skip terms that are not in the regular mask
|
||||
if (!ctx.regularMask.get(i))
|
||||
continue;
|
||||
|
||||
long imask = WordMetadata.decodePositions(wordMetadataQuery.at(i));
|
||||
|
||||
// Skip terms that are not in the document
|
||||
if (imask == 0L)
|
||||
continue;
|
||||
|
||||
for (int j = i + 1; j < wordMetadataQuery.size(); j++) {
|
||||
if (!ctx.regularMask.get(j)) continue;
|
||||
|
||||
// Skip terms that are not in the regular mask
|
||||
if (!ctx.regularMask.get(j))
|
||||
continue;
|
||||
|
||||
long jmask = WordMetadata.decodePositions(wordMetadataQuery.at(j));
|
||||
|
||||
// Skip terms that are not in the document
|
||||
if (jmask == 0L)
|
||||
continue;
|
||||
|
||||
long quot = Long.bitCount(imask & jmask);
|
||||
long rem = Long.bitCount(imask | jmask);
|
||||
|
||||
if (rem != 0) {
|
||||
sum += quot/(double) rem;
|
||||
cnt++;
|
||||
}
|
||||
// rem is always > 0 here: terms with an empty position mask were skipped above, so neither imask nor jmask is 0
|
||||
|
||||
sum += quot/(double) rem;
|
||||
cnt++;
|
||||
}
|
||||
}
|
||||
|
||||
return sum / cnt;
|
||||
if (cnt != 0) {
|
||||
return sum / cnt;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
double bitsSetFactor(long mask) {
|
||||
|
Loading…
Reference in New Issue
Block a user