(index) Evaluate using mmap reads instead of FileChannel reads during index construction

This is likely to be faster, since the reads are on average small and sequential, and cannot easily be buffered.
This commit is contained in:
Viktor Lofgren 2024-09-13 16:14:56 +02:00
parent 1cf62f5850
commit a8bec13ed9
6 changed files with 107 additions and 82 deletions

View File

@ -6,14 +6,12 @@ import nu.marginalia.btree.BTreeWriter;
import nu.marginalia.btree.model.BTreeContext;
import java.io.IOException;
import java.nio.channels.FileChannel;
/** Constructs the BTrees in a reverse index */
public class FullIndexBTreeTransformer implements LongArrayTransformations.LongIOTransformer {
private final BTreeWriter writer;
private final FileChannel intermediateChannel;
private final int entrySize;
private final LongArray documentsArray;
long start = 0;
long writeOffset = 0;
@ -21,10 +19,10 @@ public class FullIndexBTreeTransformer implements LongArrayTransformations.LongI
public FullIndexBTreeTransformer(LongArray urlsFileMap,
int entrySize,
BTreeContext bTreeContext,
FileChannel intermediateChannel) {
LongArray documentsArray) {
this.documentsArray = documentsArray;
this.writer = new BTreeWriter(urlsFileMap, bTreeContext);
this.entrySize = entrySize;
this.intermediateChannel = intermediateChannel;
}
@Override
@ -39,7 +37,7 @@ public class FullIndexBTreeTransformer implements LongArrayTransformations.LongI
final long offsetForBlock = writeOffset;
writeOffset += writer.write(writeOffset, size,
mapRegion -> mapRegion.transferFrom(intermediateChannel, start, 0, end - start)
mapRegion -> mapRegion.transferFrom(documentsArray, start, 0, end - start)
);
start = end;

View File

@ -13,7 +13,6 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
@ -87,13 +86,10 @@ public class FullPreindex {
// Write the docs file
LongArray finalDocs = LongArrayFactory.mmapForWritingConfined(outputFileDocs, sizeEstimator.size);
try (var intermediateDocChannel = documents.createDocumentsFileChannel()) {
offsets.transformEachIO(0, offsets.size(),
new FullIndexBTreeTransformer(finalDocs, 2,
ReverseIndexParameters.fullDocsBTreeContext,
intermediateDocChannel));
intermediateDocChannel.force(false);
}
offsets.transformEachIO(0, offsets.size(),
new FullIndexBTreeTransformer(finalDocs, 2,
ReverseIndexParameters.fullDocsBTreeContext,
documents.documents));
LongArray wordIds = segments.wordIds;
@ -148,42 +144,36 @@ public class FullPreindex {
leftIter.next();
rightIter.next();
try (FileChannel leftChannel = left.documents.createDocumentsFileChannel();
FileChannel rightChannel = right.documents.createDocumentsFileChannel())
while (mergingIter.canPutMore()
&& leftIter.isPositionBeforeEnd()
&& rightIter.isPositionBeforeEnd())
{
final long currentWord = mergingIter.wordId;
while (mergingIter.canPutMore()
&& leftIter.isPositionBeforeEnd()
&& rightIter.isPositionBeforeEnd())
if (leftIter.wordId == currentWord && rightIter.wordId == currentWord)
{
final long currentWord = mergingIter.wordId;
if (leftIter.wordId == currentWord && rightIter.wordId == currentWord)
{
// both inputs have documents for the current word
mergeSegments(leftIter, rightIter,
left.documents, right.documents,
mergedDocuments, mergingIter);
}
else if (leftIter.wordId == currentWord) {
if (!copySegment(leftIter, mergedDocuments, leftChannel, mergingIter))
break;
}
else if (rightIter.wordId == currentWord) {
if (!copySegment(rightIter, mergedDocuments, rightChannel, mergingIter))
break;
}
else assert false : "This should never happen"; // the helvetica scenario
// both inputs have documents for the current word
mergeSegments(leftIter, rightIter,
left.documents, right.documents,
mergedDocuments, mergingIter);
}
if (leftIter.isPositionBeforeEnd()) {
while (copySegment(leftIter, mergedDocuments, leftChannel, mergingIter));
else if (leftIter.wordId == currentWord) {
if (!copySegment(leftIter, left.documents, mergingIter, mergedDocuments))
break;
}
if (rightIter.isPositionBeforeEnd()) {
while (copySegment(rightIter, mergedDocuments, rightChannel, mergingIter));
else if (rightIter.wordId == currentWord) {
if (!copySegment(rightIter, right.documents, mergingIter, mergedDocuments))
break;
}
else assert false : "This should never happen"; // the helvetica scenario
}
if (leftIter.isPositionBeforeEnd()) {
while (copySegment(leftIter, left.documents, mergingIter, mergedDocuments));
}
if (rightIter.isPositionBeforeEnd()) {
while (copySegment(rightIter, right.documents, mergingIter, mergedDocuments));
}
if (leftIter.isPositionBeforeEnd())
@ -284,15 +274,15 @@ public class FullPreindex {
* into the destination segment, and advance the construction iterator.
*/
private static boolean copySegment(FullPreindexWordSegments.SegmentIterator sourceIter,
LongArray dest,
FileChannel sourceChannel,
FullPreindexWordSegments.SegmentConstructionIterator mergingIter) throws IOException {
FullPreindexDocuments srcDocuments,
FullPreindexWordSegments.SegmentConstructionIterator mergingIter,
LongArray dest) throws IOException {
long size = sourceIter.endOffset - sourceIter.startOffset;
long start = mergingIter.startOffset;
long end = start + size;
dest.transferFrom(sourceChannel,
dest.transferFrom(srcDocuments.documents,
sourceIter.startOffset,
mergingIter.startOffset,
end);

View File

@ -139,44 +139,39 @@ public class PrioPreindex {
leftIter.next();
rightIter.next();
try (FileChannel leftChannel = left.documents.createDocumentsFileChannel();
FileChannel rightChannel = right.documents.createDocumentsFileChannel())
while (mergingIter.canPutMore()
&& leftIter.isPositionBeforeEnd()
&& rightIter.isPositionBeforeEnd())
{
final long currentWord = mergingIter.wordId;
while (mergingIter.canPutMore()
&& leftIter.isPositionBeforeEnd()
&& rightIter.isPositionBeforeEnd())
if (leftIter.wordId == currentWord && rightIter.wordId == currentWord)
{
final long currentWord = mergingIter.wordId;
if (leftIter.wordId == currentWord && rightIter.wordId == currentWord)
{
// both inputs have documents for the current word
mergeSegments(leftIter, rightIter,
left.documents, right.documents,
mergedDocuments, mergingIter);
}
else if (leftIter.wordId == currentWord) {
if (!copySegment(leftIter, mergedDocuments, leftChannel, mergingIter))
break;
}
else if (rightIter.wordId == currentWord) {
if (!copySegment(rightIter, mergedDocuments, rightChannel, mergingIter))
break;
}
else assert false : "This should never happen"; // the helvetica scenario
// both inputs have documents for the current word
mergeSegments(leftIter, rightIter,
left.documents, right.documents,
mergedDocuments, mergingIter);
}
if (leftIter.isPositionBeforeEnd()) {
while (copySegment(leftIter, mergedDocuments, leftChannel, mergingIter));
else if (leftIter.wordId == currentWord) {
if (!copySegment(leftIter, left.documents, mergingIter, mergedDocuments))
break;
}
if (rightIter.isPositionBeforeEnd()) {
while (copySegment(rightIter, mergedDocuments, rightChannel, mergingIter));
else if (rightIter.wordId == currentWord) {
if (!copySegment(rightIter, right.documents, mergingIter, mergedDocuments))
break;
}
else assert false : "This should never happen"; // the helvetica scenario
}
if (leftIter.isPositionBeforeEnd()) {
while (copySegment(leftIter, left.documents, mergingIter, mergedDocuments));
}
if (rightIter.isPositionBeforeEnd()) {
while (copySegment(rightIter, right.documents, mergingIter, mergedDocuments));
}
if (leftIter.isPositionBeforeEnd())
throw new IllegalStateException("Left has more to go");
if (rightIter.isPositionBeforeEnd())
@ -270,24 +265,27 @@ public class PrioPreindex {
rightIter.next();
}
/** Copy the data from the source segment at the position and length indicated by sourceIter,
* into the destination segment, and advance the construction iterator.
*/
/** Copy the data from the source segment at the position and length indicated by sourceIter,
* into the destination segment, and advance the construction iterator.
*/
private static boolean copySegment(PrioPreindexWordSegments.SegmentIterator sourceIter,
LongArray dest,
FileChannel sourceChannel,
PrioPreindexWordSegments.SegmentConstructionIterator mergingIter) throws IOException {
PrioPreindexDocuments srcDocuments,
PrioPreindexWordSegments.SegmentConstructionIterator mergingIter,
LongArray dest) throws IOException {
long size = sourceIter.endOffset - sourceIter.startOffset;
long start = mergingIter.startOffset;
long end = start + size;
dest.transferFrom(sourceChannel,
dest.transferFrom(srcDocuments.documents,
sourceIter.startOffset,
mergingIter.startOffset,
end);
boolean putNext = mergingIter.putNext(size);
boolean putNext = mergingIter.putNext(size / 2);
boolean iterNext = sourceIter.next();
if (!putNext && iterNext)

View File

@ -108,4 +108,5 @@ public interface LongArrayBase extends BulkTransferArray<LongBuffer> {
void write(Path file) throws IOException;
void transferFrom(FileChannel source, long sourceStart, long arrayStart, long arrayEnd) throws IOException;
/** Copy a range of values from another LongArray into this array.
 * <p>
 * The destination range is [arrayStart, arrayEnd) in this array; the
 * same number of values is read from source, beginning at sourceStart.
 *
 * @param source      the array to read from
 * @param sourceStart index in source of the first value to copy
 * @param arrayStart  index in this array of the first value to overwrite (inclusive)
 * @param arrayEnd    index in this array at which copying stops (exclusive)
 */
void transferFrom(LongArray source, long sourceStart, long arrayStart, long arrayEnd) throws IOException;
}

View File

@ -167,6 +167,25 @@ public class SegmentLongArray implements LongArray {
}
}
/** Copy a range of values from source into this array.
 * <p>
 * Writes the destination range [destStartL, destEndL) of this array with
 * the values of source found at [sourceStartL, sourceStartL + (destEndL - destStartL)).
 * Values are copied one at a time in ascending index order; overlapping
 * ranges within the same array are not given any special treatment.
 *
 * @param source       the array to read from
 * @param sourceStartL index in source of the first value to copy
 * @param destStartL   index in this array of the first value to overwrite (inclusive)
 * @param destEndL     index in this array at which copying stops (exclusive)
 * @throws IndexOutOfBoundsException if an offset is negative, if the destination
 *         range is inverted, or if either array is too small for the requested range
 */
@Override
public void transferFrom(LongArray source,
                         long sourceStartL,
                         long destStartL,
                         long destEndL)
{
    if (sourceStartL < 0)
        throw new IndexOutOfBoundsException("Negative source start");
    if (destStartL < 0)
        throw new IndexOutOfBoundsException("Negative destination start");
    if (destStartL > destEndL)
        throw new IndexOutOfBoundsException("Destination start after end");

    final long count = destEndL - destStartL;

    // Phrased as a subtraction so that a very large sourceStartL cannot
    // overflow the sum and slip past the bounds check
    if (count > source.size() - sourceStartL)
        throw new IndexOutOfBoundsException("Source array too small");
    if (destEndL > size())
        throw new IndexOutOfBoundsException("Destination array too small");

    for (long i = 0; i < count; i++) {
        set(destStartL + i, source.get(sourceStartL + i));
    }
}
@Override
public MemorySegment getMemorySegment() {

View File

@ -269,4 +269,23 @@ public class UnsafeLongArray implements LongArray {
}
}
/** Copy a range of values from source into this array.
 * <p>
 * Writes the destination range [destStartL, destEndL) of this array with
 * the values of source found at [sourceStartL, sourceStartL + (destEndL - destStartL)).
 * Values are copied one at a time in ascending index order; overlapping
 * ranges within the same array are not given any special treatment.
 *
 * @param source       the array to read from
 * @param sourceStartL index in source of the first value to copy
 * @param destStartL   index in this array of the first value to overwrite (inclusive)
 * @param destEndL     index in this array at which copying stops (exclusive)
 * @throws IndexOutOfBoundsException if an offset is negative, if the destination
 *         range is inverted, or if either array is too small for the requested range
 */
@Override
public void transferFrom(LongArray source,
                         long sourceStartL,
                         long destStartL,
                         long destEndL)
{
    // Validate everything up front, before any value is read or written
    if (sourceStartL < 0)
        throw new IndexOutOfBoundsException("Negative source start");
    if (destStartL < 0)
        throw new IndexOutOfBoundsException("Negative destination start");
    if (destStartL > destEndL)
        throw new IndexOutOfBoundsException("Destination start after end");

    final long count = destEndL - destStartL;

    // Phrased as a subtraction so that a very large sourceStartL cannot
    // overflow the sum and slip past the bounds check
    if (count > source.size() - sourceStartL)
        throw new IndexOutOfBoundsException("Source array too small");
    if (destEndL > size())
        throw new IndexOutOfBoundsException("Destination array too small");

    for (long i = 0; i < count; i++) {
        set(destStartL + i, source.get(sourceStartL + i));
    }
}
}