(index) Speed up minDist calculations by excluding large lists

This commit is contained in:
Viktor Lofgren 2024-08-26 13:04:15 +02:00
parent 77efce0673
commit 30bf845c81
2 changed files with 11 additions and 6 deletions

View File

@ -1,5 +1,6 @@
package nu.marginalia.index.results.model; package nu.marginalia.index.results.model;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntIterator; import it.unimi.dsi.fastutil.ints.IntIterator;
import it.unimi.dsi.fastutil.ints.IntList; import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.index.model.SearchTermsUtil; import nu.marginalia.index.model.SearchTermsUtil;
@ -142,8 +143,8 @@ public class PhraseConstraintGroupList {
} }
public int minDistance(IntList[] positions) { public int minDistance(IntList[] positions) {
IntList[] sequences = new IntList[present.cardinality()]; List<IntList> sequences = new ArrayList<>(present.cardinality());
int[] iterOffsets = new int[sequences.length]; IntList iterOffsets = new IntArrayList(present.cardinality());
for (int oi = 0, si = 0; oi < offsets.length; oi++) { for (int oi = 0, si = 0; oi < offsets.length; oi++) {
if (!present.get(oi)) { if (!present.get(oi)) {
@ -162,11 +163,16 @@ public class PhraseConstraintGroupList {
if (posForTerm == null) { if (posForTerm == null) {
return Integer.MAX_VALUE; return Integer.MAX_VALUE;
} }
sequences[si++] = posForTerm;
iterOffsets[si - 1] = -oi; if (posForTerm.size() > 16) { // heuristic to avoid large sequences, which are expensive and not very useful
continue;
}
sequences.add(posForTerm);
iterOffsets.add(-oi);
} }
return SequenceOperations.minDistance(sequences, iterOffsets); return SequenceOperations.minDistance(sequences.toArray(IntList[]::new), iterOffsets.toIntArray());
} }
} }
} }

View File

@ -158,7 +158,6 @@ public class SequenceOperations {
} }
int minDist = Integer.MAX_VALUE; int minDist = Integer.MAX_VALUE;
int maxVal = Integer.MIN_VALUE; int maxVal = Integer.MIN_VALUE;
int maxI = 0; int maxI = 0;