(index) Additional optimization pass

This commit is contained in:
Viktor Lofgren 2024-12-12 18:57:33 +01:00
parent 3f11ca409f
commit dafaab3ef7
7 changed files with 59 additions and 85 deletions

View File

@ -29,7 +29,7 @@ public class IndexClient {
private static final Logger logger = LoggerFactory.getLogger(IndexClient.class);
private final GrpcMultiNodeChannelPool<IndexApiGrpc.IndexApiBlockingStub> channelPool;
private final DomainBlacklistImpl blacklist;
private static final ExecutorService executor = Executors.newFixedThreadPool(32);
private static final ExecutorService executor = Executors.newVirtualThreadPerTaskExecutor();
@Inject
public IndexClient(GrpcChannelPoolFactory channelPoolFactory, DomainBlacklistImpl blacklist) {

View File

@ -5,8 +5,6 @@ import it.unimi.dsi.fastutil.ints.IntIterator;
import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.sequence.CodedSequence;
import java.util.Arrays;
/** A list of the interlaced start and end positions of each span in the document of this type */
public class DocumentSpan {
@ -21,47 +19,29 @@ public class DocumentSpan {
this.startsEnds = null;
}
/** Counts the number of intersections between the spans in the document of this type and the given list of positions */
public int countIntersections(int[] positions) {
if (null == startsEnds || startsEnds.isEmpty() || positions.length == 0) {
public int countIntersections(IntList positions) {
if (null == startsEnds || startsEnds.isEmpty() || positions.size() == 0) {
return 0;
}
int sei = 0;
int pi = 0;
int start = startsEnds.getInt(sei++);
int end = startsEnds.getInt(sei++);
int pos = -1;
int cnt = 0;
if (positions.length < 8) { // for small arrays we can do a linear search
int seis = 0;
for (int pi = 0; pi < positions.length; pi++) {
int position = positions[pi];
// search through the spans until we find an item that is greater than the given position
for (int sei = seis; sei < startsEnds.size(); sei ++) {
if (startsEnds.getInt(sei) > position) {
cnt += sei % 2; // if sei is odd, we are between a start and end position in the spans list
seis = Math.max(seis, sei - 1);
break;
}
}
while (pi < positions.size() && sei < startsEnds.size()) {
if (pos < start) {
pos = positions.getInt(pi++);
}
}
else { // for large arrays we use a binary search
int searchStart = 0;
for (int sei = 0; sei < startsEnds.size() && searchStart < positions.length; ) {
int start = startsEnds.getInt(sei++);
int end = startsEnds.getInt(sei++);
// find the first position that is greater or equal to the start position
int i = Arrays.binarySearch(positions, searchStart, positions.length, start);
if (i < 0) i = -i - 1; // if the position is not found, we get the insertion point
// ... from that point, count the number of positions that smaller than the end position
while (i < positions.length && positions[i] < end) {
cnt++;
i++;
}
searchStart = i;
else if (pos < end) {
cnt++;
pos = positions.getInt(pi++);
}
else {
start = startsEnds.getInt(sei++);
end = startsEnds.getInt(sei++);
}
}

View File

@ -4,6 +4,7 @@ import nu.marginalia.sequence.VarintCodedSequence;
import java.io.IOException;
import java.lang.foreign.Arena;
import java.lang.foreign.MemorySegment;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
@ -12,10 +13,15 @@ import java.nio.file.StandardOpenOption;
@SuppressWarnings("preview")
public class ForwardIndexSpansReader implements AutoCloseable {
private final FileChannel spansFileChannel;
private final Arena arena;
private final MemorySegment spansSegment;
public ForwardIndexSpansReader(Path spansFile) throws IOException {
this.spansFileChannel = (FileChannel) Files.newByteChannel(spansFile, StandardOpenOption.READ);
arena = Arena.ofShared();
try (var channel = (FileChannel) Files.newByteChannel(spansFile, StandardOpenOption.READ)) {
spansSegment = channel.map(FileChannel.MapMode.READ_ONLY, 0, channel.size(), arena);
}
}
public DocumentSpans readSpans(Arena arena, long encodedOffset) throws IOException {
@ -23,13 +29,9 @@ public class ForwardIndexSpansReader implements AutoCloseable {
long size = SpansCodec.decodeSize(encodedOffset);
long offset = SpansCodec.decodeStartOffset(encodedOffset);
// Allocate a buffer from the arena
var buffer = arena.allocate(size).asByteBuffer();
buffer.clear();
while (buffer.hasRemaining()) {
spansFileChannel.read(buffer, offset + buffer.position());
}
buffer.flip();
var segment = spansSegment.asSlice(offset, size);
ByteBuffer buffer = segment.asByteBuffer();
// Read the number of spans in the document
int count = buffer.get();
@ -53,7 +55,7 @@ public class ForwardIndexSpansReader implements AutoCloseable {
@Override
public void close() throws IOException {
spansFileChannel.close();
arena.close();
}
}

View File

@ -5,16 +5,22 @@ import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.lang.foreign.Arena;
import java.lang.foreign.MemorySegment;
import java.nio.channels.FileChannel;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
public class PositionsFileReader implements AutoCloseable {
private final FileChannel positions;
private final Arena arena;
private final MemorySegment positionsSegment;
private static final Logger logger = LoggerFactory.getLogger(PositionsFileReader.class);
public PositionsFileReader(Path positionsFile) throws IOException {
this.positions = FileChannel.open(positionsFile, StandardOpenOption.READ);
arena = Arena.ofShared();
try (var channel = FileChannel.open(positionsFile, StandardOpenOption.READ)) {
positionsSegment = channel.map(FileChannel.MapMode.READ_ONLY, 0, channel.size(), arena);
}
}
/** Get the positions for a term in the index, as pointed out by the encoded offset;
@ -24,20 +30,15 @@ public class PositionsFileReader implements AutoCloseable {
long offset = PositionCodec.decodeOffset(sizeEncodedOffset);
var segment = arena.allocate(length);
var buffer = segment.asByteBuffer();
try {
positions.read(buffer, offset);
} catch (IOException e) {
throw new RuntimeException(e);
}
MemorySegment.copy(positionsSegment, offset, segment, 0, length);
return new TermData(buffer);
return new TermData(segment.asByteBuffer());
}
@Override
public void close() throws IOException {
positions.close();
arena.close();
}
}

View File

@ -256,7 +256,7 @@ public class IndexGrpcService
/** The queue where the results from the index lookup threads are placed,
* pending ranking by the result ranker threads */
private final ArrayBlockingQueue<CombinedDocIdList> resultCandidateQueue
= new ArrayBlockingQueue<>(8);
= new ArrayBlockingQueue<>(64);
private final ResultPriorityQueue resultHeap;
private final ResultRankingContext resultRankingContext;
@ -342,7 +342,7 @@ public class IndexGrpcService
}
private void executeSearch() {
final LongArrayList results = new LongArrayList(4096);
final LongArrayList results = new LongArrayList(16);
// These queries are different indices for one subquery
final LongQueryBuffer buffer = new LongQueryBuffer(4096);
@ -352,31 +352,16 @@ public class IndexGrpcService
buffer.reset();
query.getMoreResults(buffer);
for (int i = 0; i < buffer.end; i++) {
results.add(buffer.data.get(i));
}
if (!results.isEmpty()) {
int stride = 16;
for (int start = 0; start < results.size(); start+=stride) {
int end = Math.min(results.size(), start + stride);
if (end > start) {
long[] data = new long[end-start];
for (int i = 0; i < data.length; i++) {
data[i] = results.getLong(start + i);
}
enqueueResults(new CombinedDocIdList(data));
}
for (int i = 0; i < buffer.end; i+=16) {
for (int j = 0; j < Math.min(buffer.end - i, 16); j++) {
results.add(buffer.data.get(i+j));
}
enqueueResults(new CombinedDocIdList(results));
results.clear();
}
}
buffer.dispose();
if (!results.isEmpty()) {
enqueueResults(new CombinedDocIdList(results));
}
}
private void enqueueResults(CombinedDocIdList resultIds) {

View File

@ -484,9 +484,8 @@ public class IndexResultScoreCalculator {
firstPosition = Math.max(firstPosition, positions[i].getInt(0));
searchableKeywordCount ++;
int[] posArray = positions[i].toIntArray();
for (var tag : HtmlTag.includedTags) {
int cnt = spans.getSpan(tag).countIntersections(posArray);
int cnt = spans.getSpan(tag).countIntersections(positions[i]);
observationsByTag[tag.ordinal()] += cnt;
valuesByWordIdx[i] += cnt * weights[tag.ordinal()];
}

View File

@ -4,11 +4,18 @@ package nu.marginalia.index.query;
/** An execution time budget for index search operations. */
public class IndexSearchBudget {
private final long timeout;
private final long startTime;
public IndexSearchBudget(long limitTime) {
this.timeout = System.currentTimeMillis() + limitTime;
this.startTime = System.nanoTime();
this.timeout = Math.min(limitTime, 10_000) * 1_000_000L;
}
public boolean hasTimeLeft() {
return System.nanoTime() - startTime < timeout;
}
public long timeLeft() {
return 1_000_000 * (timeout - (System.nanoTime() - startTime));
}
public boolean hasTimeLeft() { return System.currentTimeMillis() < timeout; }
public long timeLeft() { return timeout - System.currentTimeMillis(); }
}