diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingParameters.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingParameters.java
index 7a5b7937..68e2b094 100644
--- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingParameters.java
+++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingParameters.java
@@ -50,7 +50,7 @@ public class ResultRankingParameters {
                 .shortSentencePenalty(5)
                 .bm25Weight(1.)
                 .tcfAvgDist(25.)
-                .tcfFirstPosition(1) // FIXME: what's a good default?
+                .tcfFirstPosition(5) // FIXME: what's a good default?
                 .temporalBias(TemporalBias.NONE)
                 .temporalBiasWeight(1. / (5.))
                 .exportDebugData(false)
diff --git a/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java b/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java
index 32f5f78b..1f802c2c 100644
--- a/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java
+++ b/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java
@@ -1,5 +1,6 @@
 package nu.marginalia.index.results;
 
+import it.unimi.dsi.fastutil.ints.IntIterator;
 import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
 import nu.marginalia.api.searchquery.model.compiled.CompiledQueryInt;
 import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
@@ -25,6 +26,8 @@ import nu.marginalia.sequence.SequenceOperations;
 
 import javax.annotation.Nullable;
 import java.lang.foreign.Arena;
+import java.util.ArrayList;
+import java.util.List;
 
 import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.booleanAggregate;
 import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.intMaxMinAggregate;
@@ -221,18 +224,35 @@ public class IndexResultScoreCalculator {
         float[] weightedCounts = new float[compiledQuery.size()];
         int firstPosition = Integer.MAX_VALUE;
 
-        for (int i = 0; i < weightedCounts.length; i++) {
-            if (positions[i] != null) {
-                var iter = positions[i].iterator();
+        float keywordMinDistFac = 0;
+        if (positions.length > 2) {
+            List<IntIterator> iterators = new ArrayList<>(positions.length);
 
-                if (!ctx.regularMask.get(i)) {
-                    continue;
+            for (int i = 0; i < positions.length; i++) {
+                if (positions[i] != null && ctx.regularMask.get(i)) {
+                    iterators.add(positions[i].iterator());
                 }
+            }
+
+            if (iterators.size() > 2) {
+                int minDist = SequenceOperations.minDistance(iterators);
+
+                if (minDist < 32) {
+                    keywordMinDistFac = 2.0f / (1.f + (float) Math.sqrt(minDist));
+                } else {
+                    keywordMinDistFac = -1.0f * (float) Math.sqrt(minDist);
+                }
+            }
+        }
+
+        for (int i = 0; i < weightedCounts.length; i++) {
+            if (positions[i] != null && ctx.regularMask.get(i)) {
+                var iter = positions[i].iterator();
 
                 while (iter.hasNext()) {
                     int pos = iter.nextInt();
 
                     firstPosition = Math.min(firstPosition, pos);
 
                     if (spans.title.containsPosition(pos) || spans.heading.containsPosition(pos))
                         weightedCounts[i] += 2.5f;
@@ -254,10 +274,11 @@ public class IndexResultScoreCalculator {
                 + topologyBonus
                 + temporalBias
                 + flagsPenalty
-                + coherenceScore;
+                + coherenceScore
+                + keywordMinDistFac;
 
         double tcfAvgDist = rankingParams.tcfAvgDist * (1.0 / calculateAvgMinDistance(positionsQuery, ctx));
-        double tcfFirstPosition = rankingParams.tcfFirstPosition * (1.0 / Math.max(1, firstPosition));
+        double tcfFirstPosition = rankingParams.tcfFirstPosition * (1.0 / Math.sqrt(Math.max(1, firstPosition)));
 
         double bM25 = rankingParams.bm25Weight * wordFlagsQuery.root.visit(new Bm25GraphVisitor(rankingParams.bm25Params, weightedCounts, length, ctx));
         double bFlags = rankingParams.bm25Weight * wordFlagsQuery.root.visit(new TermFlagsGraphVisitor(rankingParams.bm25Params, wordFlagsQuery.data, weightedCounts, ctx));
diff --git a/code/libraries/coded-sequence/java/nu/marginalia/sequence/SequenceOperations.java b/code/libraries/coded-sequence/java/nu/marginalia/sequence/SequenceOperations.java
index 11df084e..64ee2b5a 100644
--- a/code/libraries/coded-sequence/java/nu/marginalia/sequence/SequenceOperations.java
+++ b/code/libraries/coded-sequence/java/nu/marginalia/sequence/SequenceOperations.java
@@ -4,6 +4,8 @@ import it.unimi.dsi.fastutil.ints.IntArrayList;
 import it.unimi.dsi.fastutil.ints.IntIterator;
 import it.unimi.dsi.fastutil.ints.IntList;
 
+import java.util.List;
+
 public class SequenceOperations {
 
     /** Return true if the sequences intersect, false otherwise.
@@ -142,4 +144,59 @@ public class SequenceOperations {
 
         return minDistance;
     }
+
+    /** Return the smallest span (max - min) observed over windows that hold one value
+     * from each iterator, or 0 if there are fewer than two iterators or any iterator is empty.
+     * <p></p>
+     * Assumes each iterator yields its values in ascending order; the iterators are advanced
+     * by this call and should not be reused afterwards.
+     */
+    public static int minDistance(List<IntIterator> iterators) {
+        if (iterators.size() <= 1)
+            return 0;
+
+        int[] values = new int[iterators.size()];
+
+        for (int i = 0; i < iterators.size(); i++) {
+            if (iterators.get(i).hasNext())
+                values[i] = iterators.get(i).nextInt();
+            else
+                return 0;
+        }
+
+        int minVal = Integer.MAX_VALUE;
+        int maxVal = Integer.MIN_VALUE;
+
+        for (int val : values) {
+            minVal = Math.min(minVal, val);
+            maxVal = Math.max(maxVal, val);
+        }
+
+        int minDist = maxVal - minVal;
+
+        // Advance whichever iterator holds the current minimum; running out of values
+        // on that iterator is the only way out of this loop.
+        for (int i = 0; ; i = (i + 1) % iterators.size()) {
+            if (values[i] == minVal) {
+                if (!iterators.get(i).hasNext()) {
+                    break;
+                }
+                values[i] = iterators.get(i).nextInt();
+
+                if (values[i] > maxVal) {
+                    maxVal = values[i];
+                }
+                if (values[i] > minVal) {
+                    minVal = Integer.MAX_VALUE;
+                    for (int val : values) {
+                        minVal = Math.min(minVal, val);
+                    }
+                }
+
+                minDist = Math.min(minDist, maxVal - minVal);
+            }
+        }
+
+        return minDist;
+    }
 }
diff --git a/code/libraries/coded-sequence/test/nu/marginalia/sequence/SequenceOperationsTest.java b/code/libraries/coded-sequence/test/nu/marginalia/sequence/SequenceOperationsTest.java
index e77ce0c5..6e235407 100644
--- a/code/libraries/coded-sequence/test/nu/marginalia/sequence/SequenceOperationsTest.java
+++ b/code/libraries/coded-sequence/test/nu/marginalia/sequence/SequenceOperationsTest.java
@@ -4,6 +4,7 @@ import it.unimi.dsi.fastutil.ints.IntList;
 import org.junit.jupiter.api.Test;
 
 import java.nio.ByteBuffer;
+import java.util.List;
 
 import static org.junit.jupiter.api.Assertions.*;
 
@@ -83,4 +84,15 @@ class SequenceOperationsTest {
         assertFalse(SequenceOperations.intersectSequences(seq1.iterator(), seq2.iterator()));
     }
 
+    @Test
+    void testMinDistance() {
+        ByteBuffer wa = ByteBuffer.allocate(1024);
+        GammaCodedSequence seq1 = GammaCodedSequence.generate(wa, 11, 80, 160);
+        GammaCodedSequence seq2 = GammaCodedSequence.generate(wa, 20, 50, 100);
+        GammaCodedSequence seq3 = GammaCodedSequence.generate(wa, 30, 60, 90);
+
+        assertEquals(19, SequenceOperations.minDistance(List.of(seq1.iterator(), seq2.iterator(), seq3.iterator())));
+
+
+    }
 }
\ No newline at end of file