(index) Evaluate using mmap reads instead of FileChannel reads during index construction

It's likely that this will be faster, as the reads are on average small and sequential, and can't be buffered easily.
This commit is contained in:
Viktor Lofgren 2024-09-13 16:14:56 +02:00
parent 1cf62f5850
commit a8bec13ed9
6 changed files with 107 additions and 82 deletions

View File

@ -6,14 +6,12 @@ import nu.marginalia.btree.BTreeWriter;
import nu.marginalia.btree.model.BTreeContext; import nu.marginalia.btree.model.BTreeContext;
import java.io.IOException; import java.io.IOException;
import java.nio.channels.FileChannel;
/** Constructs the BTrees in a reverse index */ /** Constructs the BTrees in a reverse index */
public class FullIndexBTreeTransformer implements LongArrayTransformations.LongIOTransformer { public class FullIndexBTreeTransformer implements LongArrayTransformations.LongIOTransformer {
private final BTreeWriter writer; private final BTreeWriter writer;
private final FileChannel intermediateChannel;
private final int entrySize; private final int entrySize;
private final LongArray documentsArray;
long start = 0; long start = 0;
long writeOffset = 0; long writeOffset = 0;
@ -21,10 +19,10 @@ public class FullIndexBTreeTransformer implements LongArrayTransformations.LongI
public FullIndexBTreeTransformer(LongArray urlsFileMap, public FullIndexBTreeTransformer(LongArray urlsFileMap,
int entrySize, int entrySize,
BTreeContext bTreeContext, BTreeContext bTreeContext,
FileChannel intermediateChannel) { LongArray documentsArray) {
this.documentsArray = documentsArray;
this.writer = new BTreeWriter(urlsFileMap, bTreeContext); this.writer = new BTreeWriter(urlsFileMap, bTreeContext);
this.entrySize = entrySize; this.entrySize = entrySize;
this.intermediateChannel = intermediateChannel;
} }
@Override @Override
@ -39,7 +37,7 @@ public class FullIndexBTreeTransformer implements LongArrayTransformations.LongI
final long offsetForBlock = writeOffset; final long offsetForBlock = writeOffset;
writeOffset += writer.write(writeOffset, size, writeOffset += writer.write(writeOffset, size,
mapRegion -> mapRegion.transferFrom(intermediateChannel, start, 0, end - start) mapRegion -> mapRegion.transferFrom(documentsArray, start, 0, end - start)
); );
start = end; start = end;

View File

@ -13,7 +13,6 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.io.IOException; import java.io.IOException;
import java.nio.channels.FileChannel;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.nio.file.StandardOpenOption; import java.nio.file.StandardOpenOption;
@ -87,13 +86,10 @@ public class FullPreindex {
// Write the docs file // Write the docs file
LongArray finalDocs = LongArrayFactory.mmapForWritingConfined(outputFileDocs, sizeEstimator.size); LongArray finalDocs = LongArrayFactory.mmapForWritingConfined(outputFileDocs, sizeEstimator.size);
try (var intermediateDocChannel = documents.createDocumentsFileChannel()) { offsets.transformEachIO(0, offsets.size(),
offsets.transformEachIO(0, offsets.size(), new FullIndexBTreeTransformer(finalDocs, 2,
new FullIndexBTreeTransformer(finalDocs, 2, ReverseIndexParameters.fullDocsBTreeContext,
ReverseIndexParameters.fullDocsBTreeContext, documents.documents));
intermediateDocChannel));
intermediateDocChannel.force(false);
}
LongArray wordIds = segments.wordIds; LongArray wordIds = segments.wordIds;
@ -148,42 +144,36 @@ public class FullPreindex {
leftIter.next(); leftIter.next();
rightIter.next(); rightIter.next();
try (FileChannel leftChannel = left.documents.createDocumentsFileChannel(); while (mergingIter.canPutMore()
FileChannel rightChannel = right.documents.createDocumentsFileChannel()) && leftIter.isPositionBeforeEnd()
&& rightIter.isPositionBeforeEnd())
{ {
final long currentWord = mergingIter.wordId;
while (mergingIter.canPutMore() if (leftIter.wordId == currentWord && rightIter.wordId == currentWord)
&& leftIter.isPositionBeforeEnd()
&& rightIter.isPositionBeforeEnd())
{ {
final long currentWord = mergingIter.wordId; // both inputs have documents for the current word
mergeSegments(leftIter, rightIter,
if (leftIter.wordId == currentWord && rightIter.wordId == currentWord) left.documents, right.documents,
{ mergedDocuments, mergingIter);
// both inputs have documents for the current word
mergeSegments(leftIter, rightIter,
left.documents, right.documents,
mergedDocuments, mergingIter);
}
else if (leftIter.wordId == currentWord) {
if (!copySegment(leftIter, mergedDocuments, leftChannel, mergingIter))
break;
}
else if (rightIter.wordId == currentWord) {
if (!copySegment(rightIter, mergedDocuments, rightChannel, mergingIter))
break;
}
else assert false : "This should never happen"; // the helvetica scenario
} }
else if (leftIter.wordId == currentWord) {
if (leftIter.isPositionBeforeEnd()) { if (!copySegment(leftIter, left.documents, mergingIter, mergedDocuments))
while (copySegment(leftIter, mergedDocuments, leftChannel, mergingIter)); break;
} }
else if (rightIter.wordId == currentWord) {
if (rightIter.isPositionBeforeEnd()) { if (!copySegment(rightIter, right.documents, mergingIter, mergedDocuments))
while (copySegment(rightIter, mergedDocuments, rightChannel, mergingIter)); break;
} }
else assert false : "This should never happen"; // the helvetica scenario
}
if (leftIter.isPositionBeforeEnd()) {
while (copySegment(leftIter, left.documents, mergingIter, mergedDocuments));
}
if (rightIter.isPositionBeforeEnd()) {
while (copySegment(rightIter, right.documents, mergingIter, mergedDocuments));
} }
if (leftIter.isPositionBeforeEnd()) if (leftIter.isPositionBeforeEnd())
@ -284,15 +274,15 @@ public class FullPreindex {
* into the destination segment, and advance the construction iterator. * into the destination segment, and advance the construction iterator.
*/ */
private static boolean copySegment(FullPreindexWordSegments.SegmentIterator sourceIter, private static boolean copySegment(FullPreindexWordSegments.SegmentIterator sourceIter,
LongArray dest, FullPreindexDocuments srcDocuments,
FileChannel sourceChannel, FullPreindexWordSegments.SegmentConstructionIterator mergingIter,
FullPreindexWordSegments.SegmentConstructionIterator mergingIter) throws IOException { LongArray dest) throws IOException {
long size = sourceIter.endOffset - sourceIter.startOffset; long size = sourceIter.endOffset - sourceIter.startOffset;
long start = mergingIter.startOffset; long start = mergingIter.startOffset;
long end = start + size; long end = start + size;
dest.transferFrom(sourceChannel, dest.transferFrom(srcDocuments.documents,
sourceIter.startOffset, sourceIter.startOffset,
mergingIter.startOffset, mergingIter.startOffset,
end); end);

View File

@ -139,44 +139,39 @@ public class PrioPreindex {
leftIter.next(); leftIter.next();
rightIter.next(); rightIter.next();
try (FileChannel leftChannel = left.documents.createDocumentsFileChannel(); while (mergingIter.canPutMore()
FileChannel rightChannel = right.documents.createDocumentsFileChannel()) && leftIter.isPositionBeforeEnd()
&& rightIter.isPositionBeforeEnd())
{ {
final long currentWord = mergingIter.wordId;
while (mergingIter.canPutMore() if (leftIter.wordId == currentWord && rightIter.wordId == currentWord)
&& leftIter.isPositionBeforeEnd()
&& rightIter.isPositionBeforeEnd())
{ {
final long currentWord = mergingIter.wordId; // both inputs have documents for the current word
mergeSegments(leftIter, rightIter,
if (leftIter.wordId == currentWord && rightIter.wordId == currentWord) left.documents, right.documents,
{ mergedDocuments, mergingIter);
// both inputs have documents for the current word
mergeSegments(leftIter, rightIter,
left.documents, right.documents,
mergedDocuments, mergingIter);
}
else if (leftIter.wordId == currentWord) {
if (!copySegment(leftIter, mergedDocuments, leftChannel, mergingIter))
break;
}
else if (rightIter.wordId == currentWord) {
if (!copySegment(rightIter, mergedDocuments, rightChannel, mergingIter))
break;
}
else assert false : "This should never happen"; // the helvetica scenario
} }
else if (leftIter.wordId == currentWord) {
if (leftIter.isPositionBeforeEnd()) { if (!copySegment(leftIter, left.documents, mergingIter, mergedDocuments))
while (copySegment(leftIter, mergedDocuments, leftChannel, mergingIter)); break;
} }
else if (rightIter.wordId == currentWord) {
if (rightIter.isPositionBeforeEnd()) { if (!copySegment(rightIter, right.documents, mergingIter, mergedDocuments))
while (copySegment(rightIter, mergedDocuments, rightChannel, mergingIter)); break;
} }
else assert false : "This should never happen"; // the helvetica scenario
} }
if (leftIter.isPositionBeforeEnd()) {
while (copySegment(leftIter, left.documents, mergingIter, mergedDocuments));
}
if (rightIter.isPositionBeforeEnd()) {
while (copySegment(rightIter, right.documents, mergingIter, mergedDocuments));
}
if (leftIter.isPositionBeforeEnd()) if (leftIter.isPositionBeforeEnd())
throw new IllegalStateException("Left has more to go"); throw new IllegalStateException("Left has more to go");
if (rightIter.isPositionBeforeEnd()) if (rightIter.isPositionBeforeEnd())
@ -270,24 +265,27 @@ public class PrioPreindex {
rightIter.next(); rightIter.next();
} }
/** Copy the data from the source segment at the position and length indicated by sourceIter,
* into the destination segment, and advance the construction iterator.
*/
/** Copy the data from the source segment at the position and length indicated by sourceIter, /** Copy the data from the source segment at the position and length indicated by sourceIter,
* into the destination segment, and advance the construction iterator. * into the destination segment, and advance the construction iterator.
*/ */
private static boolean copySegment(PrioPreindexWordSegments.SegmentIterator sourceIter, private static boolean copySegment(PrioPreindexWordSegments.SegmentIterator sourceIter,
LongArray dest, PrioPreindexDocuments srcDocuments,
FileChannel sourceChannel, PrioPreindexWordSegments.SegmentConstructionIterator mergingIter,
PrioPreindexWordSegments.SegmentConstructionIterator mergingIter) throws IOException { LongArray dest) throws IOException {
long size = sourceIter.endOffset - sourceIter.startOffset; long size = sourceIter.endOffset - sourceIter.startOffset;
long start = mergingIter.startOffset; long start = mergingIter.startOffset;
long end = start + size; long end = start + size;
dest.transferFrom(sourceChannel, dest.transferFrom(srcDocuments.documents,
sourceIter.startOffset, sourceIter.startOffset,
mergingIter.startOffset, mergingIter.startOffset,
end); end);
boolean putNext = mergingIter.putNext(size); boolean putNext = mergingIter.putNext(size / 2);
boolean iterNext = sourceIter.next(); boolean iterNext = sourceIter.next();
if (!putNext && iterNext) if (!putNext && iterNext)

View File

@ -108,4 +108,5 @@ public interface LongArrayBase extends BulkTransferArray<LongBuffer> {
void write(Path file) throws IOException; void write(Path file) throws IOException;
void transferFrom(FileChannel source, long sourceStart, long arrayStart, long arrayEnd) throws IOException; void transferFrom(FileChannel source, long sourceStart, long arrayStart, long arrayEnd) throws IOException;
/** Copies longs from {@code source} into this array.
 * <p>
 * The element at {@code sourceStart + k} in {@code source} is written to index
 * {@code arrayStart + k} of this array, for every {@code k} in
 * {@code [0, arrayEnd - arrayStart)}; i.e. the destination range is half-open,
 * {@code arrayStart} inclusive and {@code arrayEnd} exclusive.
 * <p>
 * The SegmentLongArray and UnsafeLongArray implementations throw
 * {@link IndexOutOfBoundsException} when the requested range does not fit in
 * either array.
 *
 * @param source      array to read from
 * @param sourceStart index in {@code source} of the first element to copy
 * @param arrayStart  first index (inclusive) in this array to write to
 * @param arrayEnd    end index (exclusive) in this array
 * @throws IOException declared for parity with the FileChannel overload
 *                     (NOTE(review): the array-backed implementations do not
 *                     appear to raise it -- confirm whether it is still needed)
 */
void transferFrom(LongArray source, long sourceStart, long arrayStart, long arrayEnd) throws IOException;
} }

View File

@ -167,6 +167,25 @@ public class SegmentLongArray implements LongArray {
} }
} }
/** Copies longs from {@code source} into this array, element by element.
 * <p>
 * The value at {@code sourceStartL + k} in {@code source} is stored at
 * {@code destStartL + k} in this array for each {@code k} in
 * {@code [0, destEndL - destStartL)}.  All arguments are validated before
 * the first element is written, so a rejected call leaves this array untouched.
 *
 * @param source       array to read from
 * @param sourceStartL index in {@code source} of the first element to copy
 * @param destStartL   first index (inclusive) in this array to write to
 * @param destEndL     end index (exclusive) in this array
 * @throws IndexOutOfBoundsException if a start index is negative, if the
 *         destination range is inverted, or if the range does not fit inside
 *         {@code source} or this array
 */
@Override
public void transferFrom(LongArray source,
                         long sourceStartL,
                         long destStartL,
                         long destEndL)
{
    // Reject negative starts up front: besides being invalid indices, they
    // would let the length arithmetic below overflow.
    if (sourceStartL < 0)
        throw new IndexOutOfBoundsException("Source start is negative");
    if (destStartL < 0)
        throw new IndexOutOfBoundsException("Destination start is negative");
    if (destStartL > destEndL)
        throw new IndexOutOfBoundsException("Destination start after end");

    final long length = destEndL - destStartL;

    // Written as a subtraction so that a large sourceStartL cannot wrap
    // around and sneak past the check.
    if (length > source.size() - sourceStartL)
        throw new IndexOutOfBoundsException("Source array too small");
    if (destEndL > size())
        throw new IndexOutOfBoundsException("Destination array too small");

    for (long offset = 0; offset < length; offset++) {
        set(destStartL + offset, source.get(sourceStartL + offset));
    }
}
@Override @Override
public MemorySegment getMemorySegment() { public MemorySegment getMemorySegment() {

View File

@ -269,4 +269,23 @@ public class UnsafeLongArray implements LongArray {
} }
} }
/** Copies longs from {@code source} into this array, element by element.
 * <p>
 * The value at {@code sourceStartL + k} in {@code source} is stored at
 * {@code destStartL + k} in this array for each {@code k} in
 * {@code [0, destEndL - destStartL)}.  All arguments are validated before
 * the first element is written, so a rejected call leaves this array
 * untouched and never depends on how {@code get}/{@code set} treat an
 * out-of-range index.
 *
 * @param source       array to read from
 * @param sourceStartL index in {@code source} of the first element to copy
 * @param destStartL   first index (inclusive) in this array to write to
 * @param destEndL     end index (exclusive) in this array
 * @throws IndexOutOfBoundsException if a start index is negative, if the
 *         destination range is inverted, or if the range does not fit inside
 *         {@code source} or this array
 */
@Override
public void transferFrom(LongArray source,
                         long sourceStartL,
                         long destStartL,
                         long destEndL)
{
    // Reject negative starts up front: besides being invalid indices, they
    // would let the length arithmetic below overflow.
    if (sourceStartL < 0)
        throw new IndexOutOfBoundsException("Source start is negative");
    if (destStartL < 0)
        throw new IndexOutOfBoundsException("Destination start is negative");
    if (destStartL > destEndL)
        throw new IndexOutOfBoundsException("Destination start after end");

    final long length = destEndL - destStartL;

    // Written as a subtraction so that a large sourceStartL cannot wrap
    // around and sneak past the check.
    if (length > source.size() - sourceStartL)
        throw new IndexOutOfBoundsException("Source array too small");
    if (destEndL > size())
        throw new IndexOutOfBoundsException("Destination array too small");

    for (long offset = 0; offset < length; offset++) {
        set(destStartL + offset, source.get(sourceStartL + offset));
    }
}
} }