mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
(index-construction) Gather up preindex writes
Use fewer writes when finalizing the preindex documents.dat file, as this was getting too slow.
This commit is contained in:
parent
9881cac2da
commit
f090f0101b
@ -12,7 +12,7 @@ import java.nio.ByteOrder;
|
||||
import java.nio.channels.FileChannel;
|
||||
|
||||
/** Constructs document ids list priority reverse index */
|
||||
public class PrioDocIdsTransformer implements LongArrayTransformations.LongIOTransformer {
|
||||
public class PrioDocIdsTransformer implements LongArrayTransformations.LongIOTransformer, AutoCloseable {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(PrioDocIdsTransformer.class);
|
||||
|
||||
@ -43,7 +43,6 @@ public class PrioDocIdsTransformer implements LongArrayTransformations.LongIOTra
|
||||
|
||||
readChannel.position(startL * 8);
|
||||
readBuffer.clear();
|
||||
writeBuffer.clear();
|
||||
|
||||
int toBeRead = 8 * (sizeL);
|
||||
|
||||
@ -80,6 +79,13 @@ public class PrioDocIdsTransformer implements LongArrayTransformations.LongIOTra
|
||||
}
|
||||
|
||||
while (readBuffer.hasRemaining()) {
|
||||
if (writeBuffer.remaining() < 16) {
|
||||
writeBuffer.flip();
|
||||
int written = writeChannel.write(writeBuffer, writeOffsetB);
|
||||
writeOffsetB += written;
|
||||
writeBuffer.clear();
|
||||
}
|
||||
|
||||
long nextId = readBuffer.getLong();
|
||||
|
||||
// break down id components
|
||||
@ -111,12 +117,6 @@ public class PrioDocIdsTransformer implements LongArrayTransformations.LongIOTra
|
||||
prevDomainId = domainId;
|
||||
prevRank = rank;
|
||||
|
||||
if (writeBuffer.remaining() < 16) {
|
||||
writeBuffer.flip();
|
||||
int written = writeChannel.write(writeBuffer, writeOffsetB);
|
||||
writeOffsetB += written;
|
||||
writeBuffer.clear();
|
||||
}
|
||||
}
|
||||
|
||||
toBeRead -= readBuffer.limit();
|
||||
@ -128,14 +128,16 @@ public class PrioDocIdsTransformer implements LongArrayTransformations.LongIOTra
|
||||
// ensure any half-written data is flushed to the buffer
|
||||
bitWriter.finishLastByte();
|
||||
|
||||
writeBuffer.flip();
|
||||
while (writeBuffer.hasRemaining()) {
|
||||
int written = writeChannel.write(writeBuffer, writeOffsetB);
|
||||
writeOffsetB += written;
|
||||
}
|
||||
|
||||
// update the start input pointer
|
||||
startL = endL;
|
||||
return startOffsetB;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
writeBuffer.flip();
|
||||
int written = writeChannel.write(writeBuffer, writeOffsetB);
|
||||
writeOffsetB += written;
|
||||
writeBuffer.clear();
|
||||
}
|
||||
}
|
||||
|
@ -82,9 +82,10 @@ public class PrioPreindex {
|
||||
|
||||
// Write the docs file
|
||||
try (var intermediateDocChannel = documents.createDocumentsFileChannel();
|
||||
var destFileChannel = (FileChannel) Files.newByteChannel(outputFileDocs, StandardOpenOption.CREATE_NEW, StandardOpenOption.WRITE)
|
||||
var destFileChannel = (FileChannel) Files.newByteChannel(outputFileDocs, StandardOpenOption.CREATE_NEW, StandardOpenOption.WRITE);
|
||||
var transformer = new PrioDocIdsTransformer(destFileChannel, intermediateDocChannel)
|
||||
) {
|
||||
offsets.transformEachIO(0, offsets.size(), new PrioDocIdsTransformer(destFileChannel, intermediateDocChannel));
|
||||
offsets.transformEachIO(0, offsets.size(), transformer);
|
||||
}
|
||||
|
||||
LongArray wordIds = segments.wordIds;
|
||||
|
@ -10,6 +10,7 @@ import java.io.DataInputStream;
|
||||
import java.io.DataOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.ByteOrder;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
@ -41,19 +42,24 @@ class PrioDocIdsTransformerTest {
|
||||
@Test
|
||||
public void testDomainIdDocOrd() throws IOException {
|
||||
|
||||
// Write 5 longs to the input file as data
|
||||
try (var dos = new DataOutputStream(Files.newOutputStream(inputFile))) {
|
||||
dos.writeLong(UrlIdCodec.encodeId(0, 0));
|
||||
dos.writeLong(UrlIdCodec.encodeId(0, 1));
|
||||
dos.writeLong(UrlIdCodec.encodeId(1, 0));
|
||||
dos.writeLong(UrlIdCodec.encodeId(4, 51) | 0x7000_0000_0000_0000L);
|
||||
|
||||
try (var writeChannel = (FileChannel) Files.newByteChannel(inputFile, StandardOpenOption.WRITE)) {
|
||||
var buffer = ByteBuffer.allocate(128).order(ByteOrder.LITTLE_ENDIAN);
|
||||
|
||||
buffer.putLong(UrlIdCodec.encodeId(0, 0));
|
||||
buffer.putLong(UrlIdCodec.encodeId(0, 1));
|
||||
buffer.putLong(UrlIdCodec.encodeId(1, 0));
|
||||
buffer.putLong(UrlIdCodec.encodeId(4, 51) | 0x7000_0000_0000_0000L);
|
||||
|
||||
writeChannel.write(buffer.flip());
|
||||
}
|
||||
|
||||
try (var writeChannel = (FileChannel) Files.newByteChannel(outputFile, StandardOpenOption.WRITE);
|
||||
var readChannel = (FileChannel) Files.newByteChannel(inputFile))
|
||||
var readChannel = (FileChannel) Files.newByteChannel(inputFile);
|
||||
var transformer = new PrioDocIdsTransformer(writeChannel, readChannel))
|
||||
{
|
||||
// Transform two segments of the input file and write them to the output file with prefixed sizes
|
||||
var transformer = new PrioDocIdsTransformer(writeChannel, readChannel);
|
||||
|
||||
transformer.transform(0, 4);
|
||||
}
|
||||
|
||||
@ -107,7 +113,7 @@ class PrioDocIdsTransformerTest {
|
||||
int code = reader.get(2);
|
||||
assertEquals(2, code); // increment doc ordinal
|
||||
|
||||
int diffRank = reader.getGamma() - 1;
|
||||
int diffRank = reader.getGamma();
|
||||
rank += diffRank;
|
||||
assertEquals(56, rank);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user