From 965c89798e65a68205bdab96494380bea7f09b3b Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 25 Aug 2024 12:44:33 +0200 Subject: [PATCH] (index) Optimize DocumentSpan --- .../index/forward/spans/DocumentSpan.java | 26 +++++++++++++------ .../results/IndexResultScoreCalculator.java | 6 +++-- .../model/PhraseConstraintGroupList.java | 17 +++--------- .../sequence/SequenceOperationsTest.java | 2 +- 4 files changed, 27 insertions(+), 24 deletions(-) diff --git a/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpan.java b/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpan.java index b66030d2..cd528892 100644 --- a/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpan.java +++ b/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpan.java @@ -68,21 +68,31 @@ public class DocumentSpan { return false; } - public boolean containsRange(int rangeStart, int len) { - if (startsEnds == null) { + public boolean containsRange(IntIterator positionsIter, int len) { + if (null == startsEnds || !positionsIter.hasNext()) { return false; } var iter = startsEnds.iterator(); - while (iter.hasNext()) { - int start = iter.nextInt(); - if (start > rangeStart) { - return false; + int start = -1; + int end = -1; + + while (iter.hasNext() && positionsIter.hasNext()) { + if (start < 0) { + start = iter.nextInt(); + end = iter.nextInt(); } - int end = iter.nextInt(); - if (end > rangeStart + len) { + + int position = positionsIter.nextInt(); + if (position < start) { + continue; + } + + if (position + len < end) { return true; } + + start = -1; } return false; diff --git a/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java b/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java index 84db185e..d5546076 100644 --- a/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java +++ b/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java @@ -398,8 +398,9 @@ public class IndexResultScoreCalculator { float verbatimMatchScore = 0.f; var fullGroup = constraints.getFullGroup(); + IntList fullGroupIntersections = fullGroup.findIntersections(positions); for (var tag : HtmlTag.includedTags) { - if (fullGroup.test(spans.getSpan(tag), positions)) { + if (spans.getSpan(tag).containsRange(fullGroupIntersections.iterator(), fullGroup.size)) { verbatimMatchScore += verbatimMatches.getWeightFull(tag) * fullGroup.size; verbatimMatches.set(tag); } @@ -410,8 +411,9 @@ public class IndexResultScoreCalculator { int groupSize = optionalGroup.size; float sizeScalingFactor = groupSize / (float) largestOptional; + IntList intersections = optionalGroup.findIntersections(positions); for (var tag : HtmlTag.includedTags) { - if (optionalGroup.test(spans.getSpan(tag), positions)) { + if (spans.getSpan(tag).containsRange(intersections.iterator(), groupSize)) { verbatimMatchScore += verbatimMatches.getWeightPartial(tag) * sizeScalingFactor * groupSize; } } diff --git a/code/index/java/nu/marginalia/index/results/model/PhraseConstraintGroupList.java b/code/index/java/nu/marginalia/index/results/model/PhraseConstraintGroupList.java index 2976653b..399ff8ca 100644 --- a/code/index/java/nu/marginalia/index/results/model/PhraseConstraintGroupList.java +++ b/code/index/java/nu/marginalia/index/results/model/PhraseConstraintGroupList.java @@ -2,7 +2,6 @@ package nu.marginalia.index.results.model; import it.unimi.dsi.fastutil.ints.IntIterator; import it.unimi.dsi.fastutil.ints.IntList; -import nu.marginalia.index.forward.spans.DocumentSpan; import nu.marginalia.index.model.SearchTermsUtil; import nu.marginalia.index.results.model.ids.TermIdList; import nu.marginalia.sequence.CodedSequence; @@ -114,7 +113,7 @@ public class PhraseConstraintGroupList { } - public boolean test(DocumentSpan span, IntList[] positions) { + public IntList findIntersections(IntList[] positions) { IntIterator[] sequences = new IntIterator[present.cardinality()]; int[] iterOffsets = new int[sequences.length]; @@ -124,7 +123,7 @@ public class PhraseConstraintGroupList { } int offset = offsets[oi]; if (offset < 0) - return false; + return IntList.of(); // Create iterators that are offset by their relative position in the // sequence. This is done by subtracting the index from the offset, @@ -133,21 +132,13 @@ public class PhraseConstraintGroupList { var posForTerm = positions[offset]; if (posForTerm == null) { - return false; + return IntList.of(); } sequences[si++] = posForTerm.iterator(); iterOffsets[si - 1] = -oi; } - var intersections = SequenceOperations.findIntersections(iterOffsets, sequences); - - for (int idx = 0; idx < intersections.size(); idx++) { - if (span.containsRange(intersections.getInt(idx), sequences.length)) { - return true; - } - } - - return false; + return SequenceOperations.findIntersections(iterOffsets, sequences); } public int minDistance(IntList[] positions) { diff --git a/code/libraries/coded-sequence/test/nu/marginalia/sequence/SequenceOperationsTest.java b/code/libraries/coded-sequence/test/nu/marginalia/sequence/SequenceOperationsTest.java index cf72412d..514eedc9 100644 --- a/code/libraries/coded-sequence/test/nu/marginalia/sequence/SequenceOperationsTest.java +++ b/code/libraries/coded-sequence/test/nu/marginalia/sequence/SequenceOperationsTest.java @@ -71,7 +71,7 @@ class SequenceOperationsTest { GammaCodedSequence seq2 = GammaCodedSequence.generate(wa, 2, 5, 8, 10, 14); GammaCodedSequence seq3 = GammaCodedSequence.generate(wa, 1, 5, 8, 9, 10); - assertEquals(IntList.of(8, 10), SequenceOperations.findIntersections(iterOffsets, seq1.iterator(), seq2.iterator(), seq3.iterator())); + assertEquals(IntList.of(8, 10), SequenceOperations.findIntersections(seq1.iterator(), seq2.iterator(), seq3.iterator())); }