mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
(index) Clean up jaccard index term code and down-tune the parameter's importance a bit
This commit is contained in:
parent
2f0b648fad
commit
44b33798f3
@ -74,8 +74,8 @@ public class ResultValuator {
|
|||||||
+ temporalBias
|
+ temporalBias
|
||||||
+ flagsPenalty;
|
+ flagsPenalty;
|
||||||
|
|
||||||
double tcfOverlap = rankingParams.tcfWeight * termCoherenceFactor.calculateOverlap(wordMeta);
|
double tcfOverlap = 1.5 * rankingParams.tcfWeight * termCoherenceFactor.calculateOverlap(wordMeta);
|
||||||
double tcfJaccard = rankingParams.tcfWeight * termCoherenceFactor.calculateAvgMutualJaccard(wordMeta, ctx);
|
double tcfJaccard = 0.5 * rankingParams.tcfWeight * termCoherenceFactor.calculateAvgMutualJaccard(wordMeta, ctx);
|
||||||
|
|
||||||
double bM25F = rankingParams.bm25FullWeight * wordMeta.root.visit(Bm25FullGraphVisitor.forRegular(rankingParams.fullParams, wordMeta.data, length, ctx));
|
double bM25F = rankingParams.bm25FullWeight * wordMeta.root.visit(Bm25FullGraphVisitor.forRegular(rankingParams.fullParams, wordMeta.data, length, ctx));
|
||||||
double bM25N = 0.25 * rankingParams.bm25FullWeight * wordMeta.root.visit(Bm25FullGraphVisitor.forNgrams(rankingParams.fullParams, wordMeta.data, length, ctx));
|
double bM25N = 0.25 * rankingParams.bm25FullWeight * wordMeta.root.visit(Bm25FullGraphVisitor.forNgrams(rankingParams.fullParams, wordMeta.data, length, ctx));
|
||||||
@ -86,7 +86,11 @@ public class ResultValuator {
|
|||||||
|
|
||||||
// Renormalize to 0...15, where 0 is the best possible score;
|
// Renormalize to 0...15, where 0 is the best possible score;
|
||||||
// this is a historical artifact of the original ranking function
|
// this is a historical artifact of the original ranking function
|
||||||
return normalize(1.5 * tcfOverlap + tcfJaccard + bM25F + bM25P + bM25N + overallPartPositive, overallPartNegative);
|
return normalize(
|
||||||
|
tcfOverlap + tcfJaccard
|
||||||
|
+ bM25F + bM25P + bM25N
|
||||||
|
+ overallPartPositive,
|
||||||
|
overallPartNegative);
|
||||||
}
|
}
|
||||||
|
|
||||||
private double calculateQualityPenalty(int size, int quality, ResultRankingParameters rankingParams) {
|
private double calculateQualityPenalty(int size, int quality, ResultRankingParameters rankingParams) {
|
||||||
|
@ -9,6 +9,10 @@ import nu.marginalia.model.idx.WordMetadata;
|
|||||||
*/
|
*/
|
||||||
public class TermCoherenceFactor {
|
public class TermCoherenceFactor {
|
||||||
|
|
||||||
|
/** Calculate a factor that rewards the best total position overlap
|
||||||
|
* between the terms in the query. This is high when all the terms are
|
||||||
|
* found in the same sentences.
|
||||||
|
*/
|
||||||
public double calculateOverlap(CompiledQueryLong wordMetadataQuery) {
|
public double calculateOverlap(CompiledQueryLong wordMetadataQuery) {
|
||||||
long mask = CompiledQueryAggregates.longBitmaskAggregate(wordMetadataQuery,
|
long mask = CompiledQueryAggregates.longBitmaskAggregate(wordMetadataQuery,
|
||||||
score -> score >>> WordMetadata.POSITIONS_SHIFT);
|
score -> score >>> WordMetadata.POSITIONS_SHIFT);
|
||||||
@ -16,31 +20,53 @@ public class TermCoherenceFactor {
|
|||||||
return bitsSetFactor(mask);
|
return bitsSetFactor(mask);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Calculate a factor that rewards the best average mutual Jaccard index
|
||||||
|
* between the terms in the query. This is high when several of the terms are frequently
|
||||||
|
* found in the same sentences.
|
||||||
|
*/
|
||||||
public double calculateAvgMutualJaccard(CompiledQueryLong wordMetadataQuery, ResultRankingContext ctx) {
|
public double calculateAvgMutualJaccard(CompiledQueryLong wordMetadataQuery, ResultRankingContext ctx) {
|
||||||
double sum = 0;
|
double sum = 0;
|
||||||
int cnt = 0;
|
int cnt = 0;
|
||||||
|
|
||||||
for (int i = 0; i < wordMetadataQuery.size(); i++) {
|
for (int i = 0; i < wordMetadataQuery.size(); i++) {
|
||||||
if (!ctx.regularMask.get(i)) continue;
|
|
||||||
|
// Skip terms that are not in the regular mask
|
||||||
|
if (!ctx.regularMask.get(i))
|
||||||
|
continue;
|
||||||
|
|
||||||
long imask = WordMetadata.decodePositions(wordMetadataQuery.at(i));
|
long imask = WordMetadata.decodePositions(wordMetadataQuery.at(i));
|
||||||
|
|
||||||
|
// Skip terms that are not in the document
|
||||||
|
if (imask == 0L)
|
||||||
|
continue;
|
||||||
|
|
||||||
for (int j = i + 1; j < wordMetadataQuery.size(); j++) {
|
for (int j = i + 1; j < wordMetadataQuery.size(); j++) {
|
||||||
if (!ctx.regularMask.get(j)) continue;
|
|
||||||
|
// Skip terms that are not in the regular mask
|
||||||
|
if (!ctx.regularMask.get(j))
|
||||||
|
continue;
|
||||||
|
|
||||||
long jmask = WordMetadata.decodePositions(wordMetadataQuery.at(j));
|
long jmask = WordMetadata.decodePositions(wordMetadataQuery.at(j));
|
||||||
|
|
||||||
|
// Skip terms that are not in the document
|
||||||
|
if (jmask == 0L)
|
||||||
|
continue;
|
||||||
|
|
||||||
long quot = Long.bitCount(imask & jmask);
|
long quot = Long.bitCount(imask & jmask);
|
||||||
long rem = Long.bitCount(imask | jmask);
|
long rem = Long.bitCount(imask | jmask);
|
||||||
|
|
||||||
if (rem != 0) {
|
// rem is always > 0 because imask and jmask are not both 0
|
||||||
|
|
||||||
sum += quot/(double) rem;
|
sum += quot/(double) rem;
|
||||||
cnt++;
|
cnt++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
|
if (cnt != 0) {
|
||||||
return sum / cnt;
|
return sum / cnt;
|
||||||
|
} else {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
double bitsSetFactor(long mask) {
|
double bitsSetFactor(long mask) {
|
||||||
|
Loading…
Reference in New Issue
Block a user