From 085d985e6178ee236a2e1d06ddcda379320c2e20 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 11 Jan 2023 17:19:57 +0100 Subject: [PATCH] Result selection algorithm tweaks --- .../index/postings/IndexResultValuator.java | 20 ++++++++---- .../valuation/SearchResultValuator.java | 31 +++++++------------ .../templates/edge/parts/search-form.hdb | 2 +- 3 files changed, 26 insertions(+), 27 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/IndexResultValuator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/IndexResultValuator.java index 462e4c7a..d785735f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/IndexResultValuator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/IndexResultValuator.java @@ -131,7 +131,8 @@ public class IndexResultValuator { } private double calculateTermCoherencePenalty(int urlId, TObjectIntHashMap termToId, List termList) { - long maskDirect = ~0; + long maskDirectGenerous = ~0; + long maskDirectRaw = ~0; long maskAdjacent = ~0; final int flagBitMask = EdgePageWordFlags.Title.asBit() @@ -148,21 +149,28 @@ public class IndexResultValuator { positions = EdgePageWordMetadata.decodePositions(meta); - if (!EdgePageWordMetadata.hasAnyFlags(meta, flagBitMask)) { - maskDirect &= positions; - maskAdjacent &= (positions | (positions << 1) | (positions >>> 1)); + maskAdjacent &= (positions | (positions << 1) | (positions >>> 1)); + maskDirectRaw &= positions; + + if (positions == 0 && !EdgePageWordMetadata.hasAnyFlags(meta, flagBitMask)) { + maskDirectGenerous &= positions; } + } if (maskAdjacent == 0) { return 40; } - if (maskDirect == 0) { + if (maskDirectGenerous == 0) { return 20; } - return Long.numberOfTrailingZeros(maskDirect)/5. - Long.bitCount(maskDirect); + if (maskDirectRaw == 0) { + return 2; + } + + return Long.numberOfTrailingZeros(maskDirectGenerous)/5. - Long.bitCount(maskDirectGenerous); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/valuation/SearchResultValuator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/valuation/SearchResultValuator.java index 7ea78619..1e9db506 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/valuation/SearchResultValuator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/valuation/SearchResultValuator.java @@ -86,7 +86,7 @@ public class SearchResultValuator { } - return bestScore * (0.3 + 0.7 * bestAllTermsFactor) * priorityTermBonus; + return bestScore * (0.1 + 0.9 * bestAllTermsFactor) * priorityTermBonus; } private boolean hasPriorityTerm(List rawScores) { @@ -153,6 +153,7 @@ public class SearchResultValuator { private double calculateTermCoherencePenalty(SearchResultsKeywordSet keywordSet, double f) { long maskDirect = ~0; long maskAdjacent = ~0; + byte excludeMask = (byte) (EdgePageWordFlags.Title.asBit() | EdgePageWordFlags.Subjects.asBit() | EdgePageWordFlags.Synthetic.asBit()); for (var keyword : keywordSet.keywords) { @@ -163,28 +164,28 @@ public class SearchResultValuator { return f; } + positions = meta.positions(); - if (!EdgePageWordMetadata.hasAnyFlags(meta.flags(), excludeMask)) - { + maskAdjacent &= (positions | (positions << 1) | (positions >>> 1)); + if (positions != 0 && !EdgePageWordMetadata.hasAnyFlags(meta.flags(), excludeMask)) + { maskDirect &= positions; - maskAdjacent &= (positions | (positions << 1) | (positions >>> 1)); } } if (maskAdjacent == 0) { - return 1.2 * f; + return 2 * f; } if (maskDirect == 0) { - return 1.1 * f; + return 1.25 * f; } - if (maskDirect != ~0L) { - double locationFactor = 0.65 + Math.max(0., - 0.35 * Long.numberOfTrailingZeros(maskDirect) / 16. - - Math.sqrt(Long.bitCount(maskDirect) - 1) / 5. + double locationFactor = 0.5 + Math.max(0., + 0.5 * Long.numberOfTrailingZeros(maskDirect) / 16. + - Math.sqrt(Long.bitCount(maskDirect) - 1) / 3. ); return f * locationFactor; @@ -237,16 +238,6 @@ public class SearchResultValuator { return f; } - private double getLengthPenalty(int length) { - if (length < MIN_LENGTH) { - length = MIN_LENGTH; - } - if (length > AVG_LENGTH) { - length = AVG_LENGTH; - } - return (0.5 + 0.5 * length / AVG_LENGTH); - } - private double[] getTermWeights(EdgeSearchResultKeywordScore[] scores) { double[] weights = new double[scores.length]; diff --git a/marginalia_nu/src/main/resources/templates/edge/parts/search-form.hdb b/marginalia_nu/src/main/resources/templates/edge/parts/search-form.hdb index d2b059fd..2a252e56 100644 --- a/marginalia_nu/src/main/resources/templates/edge/parts/search-form.hdb +++ b/marginalia_nu/src/main/resources/templates/edge/parts/search-form.hdb @@ -11,7 +11,7 @@ - +