(index) Optimize DocumentSpan

This commit is contained in:
Viktor Lofgren 2024-08-25 12:44:33 +02:00
parent 982b03382b
commit 965c89798e
4 changed files with 27 additions and 24 deletions

View File

@ -68,21 +68,31 @@ public class DocumentSpan {
return false;
}
public boolean containsRange(int rangeStart, int len) {
if (startsEnds == null) {
public boolean containsRange(IntIterator positionsIter, int len) {
if (null == startsEnds || !positionsIter.hasNext()) {
return false;
}
var iter = startsEnds.iterator();
while (iter.hasNext()) {
int start = iter.nextInt();
if (start > rangeStart) {
return false;
int start = -1;
int end = -1;
while (iter.hasNext() && positionsIter.hasNext()) {
if (start < 0) {
start = iter.nextInt();
end = iter.nextInt();
}
int end = iter.nextInt();
if (end > rangeStart + len) {
int position = positionsIter.nextInt();
if (position < start) {
continue;
}
if (position + len < end) {
return true;
}
start = -1;
}
return false;

View File

@ -398,8 +398,9 @@ public class IndexResultScoreCalculator {
float verbatimMatchScore = 0.f;
var fullGroup = constraints.getFullGroup();
IntList fullGroupIntersections = fullGroup.findIntersections(positions);
for (var tag : HtmlTag.includedTags) {
if (fullGroup.test(spans.getSpan(tag), positions)) {
if (spans.getSpan(tag).containsRange(fullGroupIntersections.iterator(), fullGroup.size)) {
verbatimMatchScore += verbatimMatches.getWeightFull(tag) * fullGroup.size;
verbatimMatches.set(tag);
}
@ -410,8 +411,9 @@ public class IndexResultScoreCalculator {
int groupSize = optionalGroup.size;
float sizeScalingFactor = groupSize / (float) largestOptional;
IntList intersections = optionalGroup.findIntersections(positions);
for (var tag : HtmlTag.includedTags) {
if (optionalGroup.test(spans.getSpan(tag), positions)) {
if (spans.getSpan(tag).containsRange(intersections.iterator(), groupSize)) {
verbatimMatchScore += verbatimMatches.getWeightPartial(tag) * sizeScalingFactor * groupSize;
}
}

View File

@ -2,7 +2,6 @@ package nu.marginalia.index.results.model;
import it.unimi.dsi.fastutil.ints.IntIterator;
import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.index.forward.spans.DocumentSpan;
import nu.marginalia.index.model.SearchTermsUtil;
import nu.marginalia.index.results.model.ids.TermIdList;
import nu.marginalia.sequence.CodedSequence;
@ -114,7 +113,7 @@ public class PhraseConstraintGroupList {
}
public boolean test(DocumentSpan span, IntList[] positions) {
public IntList findIntersections(IntList[] positions) {
IntIterator[] sequences = new IntIterator[present.cardinality()];
int[] iterOffsets = new int[sequences.length];
@ -124,7 +123,7 @@ public class PhraseConstraintGroupList {
}
int offset = offsets[oi];
if (offset < 0)
return false;
return IntList.of();
// Create iterators that are offset by their relative position in the
// sequence. This is done by subtracting the index from the offset,
@ -133,21 +132,13 @@ public class PhraseConstraintGroupList {
var posForTerm = positions[offset];
if (posForTerm == null) {
return false;
return IntList.of();
}
sequences[si++] = posForTerm.iterator();
iterOffsets[si - 1] = -oi;
}
var intersections = SequenceOperations.findIntersections(iterOffsets, sequences);
for (int idx = 0; idx < intersections.size(); idx++) {
if (span.containsRange(intersections.getInt(idx), sequences.length)) {
return true;
}
}
return false;
return SequenceOperations.findIntersections(iterOffsets, sequences);
}
public int minDistance(IntList[] positions) {

View File

@ -71,7 +71,7 @@ class SequenceOperationsTest {
GammaCodedSequence seq2 = GammaCodedSequence.generate(wa, 2, 5, 8, 10, 14);
GammaCodedSequence seq3 = GammaCodedSequence.generate(wa, 1, 5, 8, 9, 10);
assertEquals(IntList.of(8, 10), SequenceOperations.findIntersections(iterOffsets, seq1.iterator(), seq2.iterator(), seq3.iterator()));
assertEquals(IntList.of(8, 10), SequenceOperations.findIntersections(seq1.iterator(), seq2.iterator(), seq3.iterator()));
}