(index) Optimize DocumentSpan

This commit is contained in:
Viktor Lofgren 2024-08-25 12:44:33 +02:00
parent 982b03382b
commit 965c89798e
4 changed files with 27 additions and 24 deletions

View File

@ -68,21 +68,31 @@ public class DocumentSpan {
return false; return false;
} }
public boolean containsRange(int rangeStart, int len) { public boolean containsRange(IntIterator positionsIter, int len) {
if (startsEnds == null) { if (null == startsEnds || !positionsIter.hasNext()) {
return false; return false;
} }
var iter = startsEnds.iterator(); var iter = startsEnds.iterator();
while (iter.hasNext()) { int start = -1;
int start = iter.nextInt(); int end = -1;
if (start > rangeStart) {
return false; while (iter.hasNext() && positionsIter.hasNext()) {
if (start < 0) {
start = iter.nextInt();
end = iter.nextInt();
} }
int end = iter.nextInt();
if (end > rangeStart + len) { int position = positionsIter.nextInt();
if (position < start) {
continue;
}
if (position + len < end) {
return true; return true;
} }
start = -1;
} }
return false; return false;

View File

@ -398,8 +398,9 @@ public class IndexResultScoreCalculator {
float verbatimMatchScore = 0.f; float verbatimMatchScore = 0.f;
var fullGroup = constraints.getFullGroup(); var fullGroup = constraints.getFullGroup();
IntList fullGroupIntersections = fullGroup.findIntersections(positions);
for (var tag : HtmlTag.includedTags) { for (var tag : HtmlTag.includedTags) {
if (fullGroup.test(spans.getSpan(tag), positions)) { if (spans.getSpan(tag).containsRange(fullGroupIntersections.iterator(), fullGroup.size)) {
verbatimMatchScore += verbatimMatches.getWeightFull(tag) * fullGroup.size; verbatimMatchScore += verbatimMatches.getWeightFull(tag) * fullGroup.size;
verbatimMatches.set(tag); verbatimMatches.set(tag);
} }
@ -410,8 +411,9 @@ public class IndexResultScoreCalculator {
int groupSize = optionalGroup.size; int groupSize = optionalGroup.size;
float sizeScalingFactor = groupSize / (float) largestOptional; float sizeScalingFactor = groupSize / (float) largestOptional;
IntList intersections = optionalGroup.findIntersections(positions);
for (var tag : HtmlTag.includedTags) { for (var tag : HtmlTag.includedTags) {
if (optionalGroup.test(spans.getSpan(tag), positions)) { if (spans.getSpan(tag).containsRange(intersections.iterator(), groupSize)) {
verbatimMatchScore += verbatimMatches.getWeightPartial(tag) * sizeScalingFactor * groupSize; verbatimMatchScore += verbatimMatches.getWeightPartial(tag) * sizeScalingFactor * groupSize;
} }
} }

View File

@ -2,7 +2,6 @@ package nu.marginalia.index.results.model;
import it.unimi.dsi.fastutil.ints.IntIterator; import it.unimi.dsi.fastutil.ints.IntIterator;
import it.unimi.dsi.fastutil.ints.IntList; import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.index.forward.spans.DocumentSpan;
import nu.marginalia.index.model.SearchTermsUtil; import nu.marginalia.index.model.SearchTermsUtil;
import nu.marginalia.index.results.model.ids.TermIdList; import nu.marginalia.index.results.model.ids.TermIdList;
import nu.marginalia.sequence.CodedSequence; import nu.marginalia.sequence.CodedSequence;
@ -114,7 +113,7 @@ public class PhraseConstraintGroupList {
} }
public boolean test(DocumentSpan span, IntList[] positions) { public IntList findIntersections(IntList[] positions) {
IntIterator[] sequences = new IntIterator[present.cardinality()]; IntIterator[] sequences = new IntIterator[present.cardinality()];
int[] iterOffsets = new int[sequences.length]; int[] iterOffsets = new int[sequences.length];
@ -124,7 +123,7 @@ public class PhraseConstraintGroupList {
} }
int offset = offsets[oi]; int offset = offsets[oi];
if (offset < 0) if (offset < 0)
return false; return IntList.of();
// Create iterators that are offset by their relative position in the // Create iterators that are offset by their relative position in the
// sequence. This is done by subtracting the index from the offset, // sequence. This is done by subtracting the index from the offset,
@ -133,21 +132,13 @@ public class PhraseConstraintGroupList {
var posForTerm = positions[offset]; var posForTerm = positions[offset];
if (posForTerm == null) { if (posForTerm == null) {
return false; return IntList.of();
} }
sequences[si++] = posForTerm.iterator(); sequences[si++] = posForTerm.iterator();
iterOffsets[si - 1] = -oi; iterOffsets[si - 1] = -oi;
} }
var intersections = SequenceOperations.findIntersections(iterOffsets, sequences); return SequenceOperations.findIntersections(iterOffsets, sequences);
for (int idx = 0; idx < intersections.size(); idx++) {
if (span.containsRange(intersections.getInt(idx), sequences.length)) {
return true;
}
}
return false;
} }
public int minDistance(IntList[] positions) { public int minDistance(IntList[] positions) {

View File

@ -71,7 +71,7 @@ class SequenceOperationsTest {
GammaCodedSequence seq2 = GammaCodedSequence.generate(wa, 2, 5, 8, 10, 14); GammaCodedSequence seq2 = GammaCodedSequence.generate(wa, 2, 5, 8, 10, 14);
GammaCodedSequence seq3 = GammaCodedSequence.generate(wa, 1, 5, 8, 9, 10); GammaCodedSequence seq3 = GammaCodedSequence.generate(wa, 1, 5, 8, 9, 10);
assertEquals(IntList.of(8, 10), SequenceOperations.findIntersections(iterOffsets, seq1.iterator(), seq2.iterator(), seq3.iterator())); assertEquals(IntList.of(8, 10), SequenceOperations.findIntersections(seq1.iterator(), seq2.iterator(), seq3.iterator()));
} }