mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
(index) Additional optimization pass
This commit is contained in:
parent
3f11ca409f
commit
dafaab3ef7
@ -29,7 +29,7 @@ public class IndexClient {
|
||||
private static final Logger logger = LoggerFactory.getLogger(IndexClient.class);
|
||||
private final GrpcMultiNodeChannelPool<IndexApiGrpc.IndexApiBlockingStub> channelPool;
|
||||
private final DomainBlacklistImpl blacklist;
|
||||
private static final ExecutorService executor = Executors.newFixedThreadPool(32);
|
||||
private static final ExecutorService executor = Executors.newVirtualThreadPerTaskExecutor();
|
||||
|
||||
@Inject
|
||||
public IndexClient(GrpcChannelPoolFactory channelPoolFactory, DomainBlacklistImpl blacklist) {
|
||||
|
@ -5,8 +5,6 @@ import it.unimi.dsi.fastutil.ints.IntIterator;
|
||||
import it.unimi.dsi.fastutil.ints.IntList;
|
||||
import nu.marginalia.sequence.CodedSequence;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/** A list of the interlaced start and end positions of each span in the document of this type */
|
||||
public class DocumentSpan {
|
||||
|
||||
@ -21,47 +19,29 @@ public class DocumentSpan {
|
||||
this.startsEnds = null;
|
||||
}
|
||||
|
||||
/** Counts the number of intersections between the spans in the document of this type and the given list of positions */
|
||||
public int countIntersections(int[] positions) {
|
||||
if (null == startsEnds || startsEnds.isEmpty() || positions.length == 0) {
|
||||
public int countIntersections(IntList positions) {
|
||||
if (null == startsEnds || startsEnds.isEmpty() || positions.size() == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int sei = 0;
|
||||
int pi = 0;
|
||||
int start = startsEnds.getInt(sei++);
|
||||
int end = startsEnds.getInt(sei++);
|
||||
int pos = -1;
|
||||
|
||||
int cnt = 0;
|
||||
|
||||
if (positions.length < 8) { // for small arrays we can do a linear search
|
||||
int seis = 0;
|
||||
|
||||
for (int pi = 0; pi < positions.length; pi++) {
|
||||
int position = positions[pi];
|
||||
|
||||
// search through the spans until we find an item that is greater than the given position
|
||||
for (int sei = seis; sei < startsEnds.size(); sei ++) {
|
||||
if (startsEnds.getInt(sei) > position) {
|
||||
cnt += sei % 2; // if sei is odd, we are between a start and end position in the spans list
|
||||
seis = Math.max(seis, sei - 1);
|
||||
break;
|
||||
}
|
||||
}
|
||||
while (pi < positions.size() && sei < startsEnds.size()) {
|
||||
if (pos < start) {
|
||||
pos = positions.getInt(pi++);
|
||||
}
|
||||
}
|
||||
else { // for large arrays we use a binary search
|
||||
int searchStart = 0;
|
||||
|
||||
for (int sei = 0; sei < startsEnds.size() && searchStart < positions.length; ) {
|
||||
int start = startsEnds.getInt(sei++);
|
||||
int end = startsEnds.getInt(sei++);
|
||||
|
||||
// find the first position that is greater or equal to the start position
|
||||
int i = Arrays.binarySearch(positions, searchStart, positions.length, start);
|
||||
if (i < 0) i = -i - 1; // if the position is not found, we get the insertion point
|
||||
|
||||
// ... from that point, count the number of positions that smaller than the end position
|
||||
while (i < positions.length && positions[i] < end) {
|
||||
cnt++;
|
||||
i++;
|
||||
}
|
||||
searchStart = i;
|
||||
else if (pos < end) {
|
||||
cnt++;
|
||||
pos = positions.getInt(pi++);
|
||||
}
|
||||
else {
|
||||
start = startsEnds.getInt(sei++);
|
||||
end = startsEnds.getInt(sei++);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -4,6 +4,7 @@ import nu.marginalia.sequence.VarintCodedSequence;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.lang.foreign.Arena;
|
||||
import java.lang.foreign.MemorySegment;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.file.Files;
|
||||
@ -12,10 +13,15 @@ import java.nio.file.StandardOpenOption;
|
||||
|
||||
@SuppressWarnings("preview")
|
||||
public class ForwardIndexSpansReader implements AutoCloseable {
|
||||
private final FileChannel spansFileChannel;
|
||||
private final Arena arena;
|
||||
private final MemorySegment spansSegment;
|
||||
|
||||
public ForwardIndexSpansReader(Path spansFile) throws IOException {
|
||||
this.spansFileChannel = (FileChannel) Files.newByteChannel(spansFile, StandardOpenOption.READ);
|
||||
arena = Arena.ofShared();
|
||||
|
||||
try (var channel = (FileChannel) Files.newByteChannel(spansFile, StandardOpenOption.READ)) {
|
||||
spansSegment = channel.map(FileChannel.MapMode.READ_ONLY, 0, channel.size(), arena);
|
||||
}
|
||||
}
|
||||
|
||||
public DocumentSpans readSpans(Arena arena, long encodedOffset) throws IOException {
|
||||
@ -23,13 +29,9 @@ public class ForwardIndexSpansReader implements AutoCloseable {
|
||||
long size = SpansCodec.decodeSize(encodedOffset);
|
||||
long offset = SpansCodec.decodeStartOffset(encodedOffset);
|
||||
|
||||
// Allocate a buffer from the arena
|
||||
var buffer = arena.allocate(size).asByteBuffer();
|
||||
buffer.clear();
|
||||
while (buffer.hasRemaining()) {
|
||||
spansFileChannel.read(buffer, offset + buffer.position());
|
||||
}
|
||||
buffer.flip();
|
||||
var segment = spansSegment.asSlice(offset, size);
|
||||
|
||||
ByteBuffer buffer = segment.asByteBuffer();
|
||||
|
||||
// Read the number of spans in the document
|
||||
int count = buffer.get();
|
||||
@ -53,7 +55,7 @@ public class ForwardIndexSpansReader implements AutoCloseable {
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
spansFileChannel.close();
|
||||
arena.close();
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -5,16 +5,22 @@ import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.lang.foreign.Arena;
|
||||
import java.lang.foreign.MemorySegment;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
|
||||
public class PositionsFileReader implements AutoCloseable {
|
||||
private final FileChannel positions;
|
||||
private final Arena arena;
|
||||
private final MemorySegment positionsSegment;
|
||||
private static final Logger logger = LoggerFactory.getLogger(PositionsFileReader.class);
|
||||
|
||||
public PositionsFileReader(Path positionsFile) throws IOException {
|
||||
this.positions = FileChannel.open(positionsFile, StandardOpenOption.READ);
|
||||
arena = Arena.ofShared();
|
||||
|
||||
try (var channel = FileChannel.open(positionsFile, StandardOpenOption.READ)) {
|
||||
positionsSegment = channel.map(FileChannel.MapMode.READ_ONLY, 0, channel.size(), arena);
|
||||
}
|
||||
}
|
||||
|
||||
/** Get the positions for a term in the index, as pointed out by the encoded offset;
|
||||
@ -24,20 +30,15 @@ public class PositionsFileReader implements AutoCloseable {
|
||||
long offset = PositionCodec.decodeOffset(sizeEncodedOffset);
|
||||
|
||||
var segment = arena.allocate(length);
|
||||
var buffer = segment.asByteBuffer();
|
||||
|
||||
try {
|
||||
positions.read(buffer, offset);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
MemorySegment.copy(positionsSegment, offset, segment, 0, length);
|
||||
|
||||
return new TermData(buffer);
|
||||
return new TermData(segment.asByteBuffer());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
positions.close();
|
||||
arena.close();
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -256,7 +256,7 @@ public class IndexGrpcService
|
||||
/** The queue where the results from the index lookup threads are placed,
|
||||
* pending ranking by the result ranker threads */
|
||||
private final ArrayBlockingQueue<CombinedDocIdList> resultCandidateQueue
|
||||
= new ArrayBlockingQueue<>(8);
|
||||
= new ArrayBlockingQueue<>(64);
|
||||
private final ResultPriorityQueue resultHeap;
|
||||
|
||||
private final ResultRankingContext resultRankingContext;
|
||||
@ -342,7 +342,7 @@ public class IndexGrpcService
|
||||
}
|
||||
|
||||
private void executeSearch() {
|
||||
final LongArrayList results = new LongArrayList(4096);
|
||||
final LongArrayList results = new LongArrayList(16);
|
||||
|
||||
// These queries are different indices for one subquery
|
||||
final LongQueryBuffer buffer = new LongQueryBuffer(4096);
|
||||
@ -352,31 +352,16 @@ public class IndexGrpcService
|
||||
buffer.reset();
|
||||
query.getMoreResults(buffer);
|
||||
|
||||
for (int i = 0; i < buffer.end; i++) {
|
||||
results.add(buffer.data.get(i));
|
||||
}
|
||||
|
||||
if (!results.isEmpty()) {
|
||||
int stride = 16;
|
||||
for (int start = 0; start < results.size(); start+=stride) {
|
||||
int end = Math.min(results.size(), start + stride);
|
||||
if (end > start) {
|
||||
long[] data = new long[end-start];
|
||||
for (int i = 0; i < data.length; i++) {
|
||||
data[i] = results.getLong(start + i);
|
||||
}
|
||||
enqueueResults(new CombinedDocIdList(data));
|
||||
}
|
||||
for (int i = 0; i < buffer.end; i+=16) {
|
||||
for (int j = 0; j < Math.min(buffer.end - i, 16); j++) {
|
||||
results.add(buffer.data.get(i+j));
|
||||
}
|
||||
enqueueResults(new CombinedDocIdList(results));
|
||||
results.clear();
|
||||
}
|
||||
}
|
||||
|
||||
buffer.dispose();
|
||||
|
||||
if (!results.isEmpty()) {
|
||||
enqueueResults(new CombinedDocIdList(results));
|
||||
}
|
||||
}
|
||||
|
||||
private void enqueueResults(CombinedDocIdList resultIds) {
|
||||
|
@ -484,9 +484,8 @@ public class IndexResultScoreCalculator {
|
||||
firstPosition = Math.max(firstPosition, positions[i].getInt(0));
|
||||
searchableKeywordCount ++;
|
||||
|
||||
int[] posArray = positions[i].toIntArray();
|
||||
for (var tag : HtmlTag.includedTags) {
|
||||
int cnt = spans.getSpan(tag).countIntersections(posArray);
|
||||
int cnt = spans.getSpan(tag).countIntersections(positions[i]);
|
||||
observationsByTag[tag.ordinal()] += cnt;
|
||||
valuesByWordIdx[i] += cnt * weights[tag.ordinal()];
|
||||
}
|
||||
|
@ -4,11 +4,18 @@ package nu.marginalia.index.query;
|
||||
/** An execution time budget for index search operations. */
|
||||
public class IndexSearchBudget {
|
||||
private final long timeout;
|
||||
private final long startTime;
|
||||
|
||||
public IndexSearchBudget(long limitTime) {
|
||||
this.timeout = System.currentTimeMillis() + limitTime;
|
||||
this.startTime = System.nanoTime();
|
||||
this.timeout = Math.min(limitTime, 10_000) * 1_000_000L;
|
||||
}
|
||||
|
||||
public boolean hasTimeLeft() {
|
||||
return System.nanoTime() - startTime < timeout;
|
||||
}
|
||||
public long timeLeft() {
|
||||
return 1_000_000 * (timeout - (System.nanoTime() - startTime));
|
||||
}
|
||||
|
||||
public boolean hasTimeLeft() { return System.currentTimeMillis() < timeout; }
|
||||
public long timeLeft() { return timeout - System.currentTimeMillis(); }
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user