Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-23 21:18:58 +00:00)
(index) Evaluate using mmap reads during index construction instead of filechannel reads
It's likely that this will be faster, as the reads are on average small and sequential, and can't be buffered easily.
parent 1cf62f5850
commit a8bec13ed9
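
The change in a nutshell: index construction and preindex merging used to pull document data through positioned FileChannel reads; they now read directly from the memory-mapped documents LongArray. The sketch below is illustrative only and is not code from this commit (the SmallReadSketch class and its file handling are invented; only the JDK calls are real): it contrasts a per-value positioned channel read with a read against a memory-mapped buffer, which is the trade-off the commit message describes. The project's own LongArray sits on a MemorySegment, as the SegmentLongArray hunk further down shows.

    // Illustrative sketch only -- not code from this commit.
    import java.io.IOException;
    import java.nio.ByteBuffer;
    import java.nio.MappedByteBuffer;
    import java.nio.channels.FileChannel;
    import java.nio.file.Path;
    import java.nio.file.StandardOpenOption;

    class SmallReadSketch {

        // One positioned read per value: every call goes through the channel,
        // and since the caller decides the positions, the reads are hard to
        // buffer on this side of the API.
        static long readViaChannel(FileChannel channel, long index) throws IOException {
            ByteBuffer buf = ByteBuffer.allocate(Long.BYTES);
            channel.read(buf, index * Long.BYTES);
            return buf.flip().getLong();
        }

        // With the file memory-mapped, the same access is an ordinary memory
        // load and the OS page cache does the buffering.
        static long readViaMmap(MappedByteBuffer mapped, long index) {
            return mapped.getLong(Math.toIntExact(index * Long.BYTES));
        }

        static MappedByteBuffer map(Path file) throws IOException {
            try (FileChannel channel = FileChannel.open(file, StandardOpenOption.READ)) {
                // the mapping stays valid after the channel is closed
                return channel.map(FileChannel.MapMode.READ_ONLY, 0, channel.size());
            }
        }
    }
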
FullIndexBTreeTransformer.java

@@ -6,14 +6,12 @@ import nu.marginalia.btree.BTreeWriter;
 import nu.marginalia.btree.model.BTreeContext;
 
 import java.io.IOException;
-import java.nio.channels.FileChannel;
 
 /** Constructs the BTrees in a reverse index */
 public class FullIndexBTreeTransformer implements LongArrayTransformations.LongIOTransformer {
     private final BTreeWriter writer;
-    private final FileChannel intermediateChannel;
-
     private final int entrySize;
+    private final LongArray documentsArray;
 
     long start = 0;
     long writeOffset = 0;
@@ -21,10 +19,10 @@ public class FullIndexBTreeTransformer implements LongArrayTransformations.LongI
     public FullIndexBTreeTransformer(LongArray urlsFileMap,
                                      int entrySize,
                                      BTreeContext bTreeContext,
-                                     FileChannel intermediateChannel) {
+                                     LongArray documentsArray) {
+        this.documentsArray = documentsArray;
         this.writer = new BTreeWriter(urlsFileMap, bTreeContext);
         this.entrySize = entrySize;
-        this.intermediateChannel = intermediateChannel;
     }
 
     @Override
@@ -39,7 +37,7 @@ public class FullIndexBTreeTransformer implements LongArrayTransformations.LongI
         final long offsetForBlock = writeOffset;
 
         writeOffset += writer.write(writeOffset, size,
-                mapRegion -> mapRegion.transferFrom(intermediateChannel, start, 0, end - start)
+                mapRegion -> mapRegion.transferFrom(documentsArray, start, 0, end - start)
         );
 
         start = end;

FullPreindex.java

@@ -13,7 +13,6 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
-import java.nio.channels.FileChannel;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.StandardOpenOption;
@@ -87,13 +86,10 @@ public class FullPreindex {
 
         // Write the docs file
         LongArray finalDocs = LongArrayFactory.mmapForWritingConfined(outputFileDocs, sizeEstimator.size);
-        try (var intermediateDocChannel = documents.createDocumentsFileChannel()) {
-            offsets.transformEachIO(0, offsets.size(),
-                    new FullIndexBTreeTransformer(finalDocs, 2,
-                            ReverseIndexParameters.fullDocsBTreeContext,
-                            intermediateDocChannel));
-            intermediateDocChannel.force(false);
-        }
+        offsets.transformEachIO(0, offsets.size(),
+                new FullIndexBTreeTransformer(finalDocs, 2,
+                        ReverseIndexParameters.fullDocsBTreeContext,
+                        documents.documents));
 
         LongArray wordIds = segments.wordIds;
 
@@ -148,42 +144,36 @@ public class FullPreindex {
         leftIter.next();
         rightIter.next();
 
-        try (FileChannel leftChannel = left.documents.createDocumentsFileChannel();
-             FileChannel rightChannel = right.documents.createDocumentsFileChannel())
-        {
-
-            while (mergingIter.canPutMore()
-                    && leftIter.isPositionBeforeEnd()
-                    && rightIter.isPositionBeforeEnd())
-            {
-                final long currentWord = mergingIter.wordId;
-
-                if (leftIter.wordId == currentWord && rightIter.wordId == currentWord)
-                {
-                    // both inputs have documents for the current word
-                    mergeSegments(leftIter, rightIter,
-                            left.documents, right.documents,
-                            mergedDocuments, mergingIter);
-                }
-                else if (leftIter.wordId == currentWord) {
-                    if (!copySegment(leftIter, mergedDocuments, leftChannel, mergingIter))
-                        break;
-                }
-                else if (rightIter.wordId == currentWord) {
-                    if (!copySegment(rightIter, mergedDocuments, rightChannel, mergingIter))
-                        break;
-                }
-                else assert false : "This should never happen"; // the helvetica scenario
-            }
-
-            if (leftIter.isPositionBeforeEnd()) {
-                while (copySegment(leftIter, mergedDocuments, leftChannel, mergingIter));
-            }
-
-            if (rightIter.isPositionBeforeEnd()) {
-                while (copySegment(rightIter, mergedDocuments, rightChannel, mergingIter));
-            }
-
-        }
+        while (mergingIter.canPutMore()
+                && leftIter.isPositionBeforeEnd()
+                && rightIter.isPositionBeforeEnd())
+        {
+            final long currentWord = mergingIter.wordId;
+
+            if (leftIter.wordId == currentWord && rightIter.wordId == currentWord)
+            {
+                // both inputs have documents for the current word
+                mergeSegments(leftIter, rightIter,
+                        left.documents, right.documents,
+                        mergedDocuments, mergingIter);
+            }
+            else if (leftIter.wordId == currentWord) {
+                if (!copySegment(leftIter, left.documents, mergingIter, mergedDocuments))
+                    break;
+            }
+            else if (rightIter.wordId == currentWord) {
+                if (!copySegment(rightIter, right.documents, mergingIter, mergedDocuments))
+                    break;
+            }
+            else assert false : "This should never happen"; // the helvetica scenario
+        }
+
+        if (leftIter.isPositionBeforeEnd()) {
+            while (copySegment(leftIter, left.documents, mergingIter, mergedDocuments));
+        }
+
+        if (rightIter.isPositionBeforeEnd()) {
+            while (copySegment(rightIter, right.documents, mergingIter, mergedDocuments));
+        }
 
         if (leftIter.isPositionBeforeEnd())
@@ -284,15 +274,15 @@ public class FullPreindex {
      * into the destination segment, and advance the construction iterator.
      */
     private static boolean copySegment(FullPreindexWordSegments.SegmentIterator sourceIter,
-                                       LongArray dest,
-                                       FileChannel sourceChannel,
-                                       FullPreindexWordSegments.SegmentConstructionIterator mergingIter) throws IOException {
+                                       FullPreindexDocuments srcDocuments,
+                                       FullPreindexWordSegments.SegmentConstructionIterator mergingIter,
+                                       LongArray dest) throws IOException {
 
         long size = sourceIter.endOffset - sourceIter.startOffset;
         long start = mergingIter.startOffset;
         long end = start + size;
 
-        dest.transferFrom(sourceChannel,
+        dest.transferFrom(srcDocuments.documents,
                 sourceIter.startOffset,
                 mergingIter.startOffset,
                 end);

PrioPreindex.java

@@ -139,44 +139,39 @@ public class PrioPreindex {
         leftIter.next();
         rightIter.next();
 
-        try (FileChannel leftChannel = left.documents.createDocumentsFileChannel();
-             FileChannel rightChannel = right.documents.createDocumentsFileChannel())
-        {
-
-            while (mergingIter.canPutMore()
-                    && leftIter.isPositionBeforeEnd()
-                    && rightIter.isPositionBeforeEnd())
-            {
-                final long currentWord = mergingIter.wordId;
-
-                if (leftIter.wordId == currentWord && rightIter.wordId == currentWord)
-                {
-                    // both inputs have documents for the current word
-                    mergeSegments(leftIter, rightIter,
-                            left.documents, right.documents,
-                            mergedDocuments, mergingIter);
-                }
-                else if (leftIter.wordId == currentWord) {
-                    if (!copySegment(leftIter, mergedDocuments, leftChannel, mergingIter))
-                        break;
-                }
-                else if (rightIter.wordId == currentWord) {
-                    if (!copySegment(rightIter, mergedDocuments, rightChannel, mergingIter))
-                        break;
-                }
-                else assert false : "This should never happen"; // the helvetica scenario
-            }
-
-            if (leftIter.isPositionBeforeEnd()) {
-                while (copySegment(leftIter, mergedDocuments, leftChannel, mergingIter));
-            }
-
-            if (rightIter.isPositionBeforeEnd()) {
-                while (copySegment(rightIter, mergedDocuments, rightChannel, mergingIter));
-            }
-        }
+        while (mergingIter.canPutMore()
+                && leftIter.isPositionBeforeEnd()
+                && rightIter.isPositionBeforeEnd())
+        {
+            final long currentWord = mergingIter.wordId;
+
+            if (leftIter.wordId == currentWord && rightIter.wordId == currentWord)
+            {
+                // both inputs have documents for the current word
+                mergeSegments(leftIter, rightIter,
+                        left.documents, right.documents,
+                        mergedDocuments, mergingIter);
+            }
+            else if (leftIter.wordId == currentWord) {
+                if (!copySegment(leftIter, left.documents, mergingIter, mergedDocuments))
+                    break;
+            }
+            else if (rightIter.wordId == currentWord) {
+                if (!copySegment(rightIter, right.documents, mergingIter, mergedDocuments))
+                    break;
+            }
+            else assert false : "This should never happen"; // the helvetica scenario
+        }
+
+        if (leftIter.isPositionBeforeEnd()) {
+            while (copySegment(leftIter, left.documents, mergingIter, mergedDocuments));
+        }
+
+        if (rightIter.isPositionBeforeEnd()) {
+            while (copySegment(rightIter, right.documents, mergingIter, mergedDocuments));
+        }
 
 
         if (leftIter.isPositionBeforeEnd())
             throw new IllegalStateException("Left has more to go");
         if (rightIter.isPositionBeforeEnd())
@@ -270,24 +265,27 @@ public class PrioPreindex {
             rightIter.next();
         }
 
+    /** Copy the data from the source segment at the position and length indicated by sourceIter,
+     * into the destination segment, and advance the construction iterator.
+     */
     /** Copy the data from the source segment at the position and length indicated by sourceIter,
      * into the destination segment, and advance the construction iterator.
      */
     private static boolean copySegment(PrioPreindexWordSegments.SegmentIterator sourceIter,
-                                       LongArray dest,
-                                       FileChannel sourceChannel,
-                                       PrioPreindexWordSegments.SegmentConstructionIterator mergingIter) throws IOException {
+                                       PrioPreindexDocuments srcDocuments,
+                                       PrioPreindexWordSegments.SegmentConstructionIterator mergingIter,
+                                       LongArray dest) throws IOException {
 
         long size = sourceIter.endOffset - sourceIter.startOffset;
         long start = mergingIter.startOffset;
         long end = start + size;
 
-        dest.transferFrom(sourceChannel,
+        dest.transferFrom(srcDocuments.documents,
                 sourceIter.startOffset,
                 mergingIter.startOffset,
                 end);
 
-        boolean putNext = mergingIter.putNext(size);
+        boolean putNext = mergingIter.putNext(size / 2);
         boolean iterNext = sourceIter.next();
 
         if (!putNext && iterNext)

LongArrayBase.java

@@ -108,4 +108,5 @@ public interface LongArrayBase extends BulkTransferArray<LongBuffer> {
     void write(Path file) throws IOException;
 
     void transferFrom(FileChannel source, long sourceStart, long arrayStart, long arrayEnd) throws IOException;
+    void transferFrom(LongArray source, long sourceStart, long arrayStart, long arrayEnd) throws IOException;
 }

SegmentLongArray.java

@@ -167,6 +167,25 @@ public class SegmentLongArray implements LongArray {
         }
 
     }
 
+    @Override
+    public void transferFrom(LongArray source,
+                             long sourceStartL,
+                             long destStartL,
+                             long destEndL)
+    {
+        if (destStartL > destEndL)
+            throw new IndexOutOfBoundsException("Source start after end");
+
+        if (sourceStartL + (destEndL - destStartL) > source.size())
+            throw new IndexOutOfBoundsException("Source array too small");
+        if (destEndL > size())
+            throw new IndexOutOfBoundsException("Destination array too small");
+
+        for (long i = destStartL; i < destEndL; i++) {
+            set(i, source.get(sourceStartL + i - destStartL));
+        }
+    }
+
     @Override
     public MemorySegment getMemorySegment() {

UnsafeLongArray.java

@@ -269,4 +269,23 @@ public class UnsafeLongArray implements LongArray {
         }
     }
 
+    @Override
+    public void transferFrom(LongArray source,
+                             long sourceStartL,
+                             long destStartL,
+                             long destEndL)
+    {
+        if (destStartL > destEndL)
+            throw new IndexOutOfBoundsException("Source start after end");
+
+        if (sourceStartL + (destEndL - destStartL) > source.size())
+            throw new IndexOutOfBoundsException("Source array too small");
+        if (destEndL > size())
+            throw new IndexOutOfBoundsException("Destination array too small");
+
+        for (long i = destStartL; i < destEndL; i++) {
+            set(i, source.get(sourceStartL + i - destStartL));
+        }
+    }
+
 }

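For completeness, a minimal usage sketch of the new LongArray-to-LongArray transfer declared above. This is not code from the commit: the file names, sizes, and offsets are invented, the nu.marginalia.array package names are assumed from the project's naming, and resource cleanup is omitted; only LongArrayFactory.mmapForWritingConfined and the transferFrom signature are taken from the hunks above.

    import java.io.IOException;
    import java.nio.file.Path;

    import nu.marginalia.array.LongArray;        // package assumed
    import nu.marginalia.array.LongArrayFactory; // package assumed

    class TransferFromSketch {
        static void demo() throws IOException {
            // two mmap-backed arrays of 1024 longs each (sizes invented)
            LongArray source = LongArrayFactory.mmapForWritingConfined(Path.of("docs-src.dat"), 1024);
            LongArray dest   = LongArrayFactory.mmapForWritingConfined(Path.of("docs-dst.dat"), 1024);

            // copy longs [0, 256) of source into positions [0, 256) of dest,
            // reading straight off the mapped source array -- the shape of call
            // that copySegment() now makes instead of transferFrom(FileChannel, ...)
            dest.transferFrom(source, 0, 0, 256);
        }
    }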