(index-construction) Gather up preindex writes

Use fewer writes when finalizing the preindex documents.dat file, as this was getting too slow.
This commit is contained in:
Viktor Lofgren 2024-07-10 23:18:06 +02:00
parent 9881cac2da
commit f090f0101b
3 changed files with 34 additions and 25 deletions

View File

@ -12,7 +12,7 @@ import java.nio.ByteOrder;
import java.nio.channels.FileChannel; import java.nio.channels.FileChannel;
/** Constructs document ids list priority reverse index */ /** Constructs document ids list priority reverse index */
public class PrioDocIdsTransformer implements LongArrayTransformations.LongIOTransformer { public class PrioDocIdsTransformer implements LongArrayTransformations.LongIOTransformer, AutoCloseable {
private static final Logger logger = LoggerFactory.getLogger(PrioDocIdsTransformer.class); private static final Logger logger = LoggerFactory.getLogger(PrioDocIdsTransformer.class);
@ -43,7 +43,6 @@ public class PrioDocIdsTransformer implements LongArrayTransformations.LongIOTra
readChannel.position(startL * 8); readChannel.position(startL * 8);
readBuffer.clear(); readBuffer.clear();
writeBuffer.clear();
int toBeRead = 8 * (sizeL); int toBeRead = 8 * (sizeL);
@ -80,6 +79,13 @@ public class PrioDocIdsTransformer implements LongArrayTransformations.LongIOTra
} }
while (readBuffer.hasRemaining()) { while (readBuffer.hasRemaining()) {
if (writeBuffer.remaining() < 16) {
writeBuffer.flip();
int written = writeChannel.write(writeBuffer, writeOffsetB);
writeOffsetB += written;
writeBuffer.clear();
}
long nextId = readBuffer.getLong(); long nextId = readBuffer.getLong();
// break down id components // break down id components
@ -111,12 +117,6 @@ public class PrioDocIdsTransformer implements LongArrayTransformations.LongIOTra
prevDomainId = domainId; prevDomainId = domainId;
prevRank = rank; prevRank = rank;
if (writeBuffer.remaining() < 16) {
writeBuffer.flip();
int written = writeChannel.write(writeBuffer, writeOffsetB);
writeOffsetB += written;
writeBuffer.clear();
}
} }
toBeRead -= readBuffer.limit(); toBeRead -= readBuffer.limit();
@ -128,14 +128,16 @@ public class PrioDocIdsTransformer implements LongArrayTransformations.LongIOTra
// ensure any half-written data is flushed to the buffer // ensure any half-written data is flushed to the buffer
bitWriter.finishLastByte(); bitWriter.finishLastByte();
writeBuffer.flip();
while (writeBuffer.hasRemaining()) {
int written = writeChannel.write(writeBuffer, writeOffsetB);
writeOffsetB += written;
}
// update the start input pointer // update the start input pointer
startL = endL; startL = endL;
return startOffsetB; return startOffsetB;
} }
/**
 * Flushes any bytes still pending in the write buffer to the write channel,
 * starting at the current write offset, and advances that offset by the
 * number of bytes written.
 *
 * <p>This method only drains the buffer; it does not close the write channel
 * itself. Calling it again with nothing new buffered writes nothing.
 *
 * @throws IOException if writing to the channel fails
 */
@Override
public void close() throws IOException {
    writeBuffer.flip();

    // A single channel write is not guaranteed to consume the whole buffer;
    // keep writing until it is drained so that clear() below cannot discard
    // bytes that never reached the file.
    while (writeBuffer.hasRemaining()) {
        int written = writeChannel.write(writeBuffer, writeOffsetB);
        writeOffsetB += written;
    }

    writeBuffer.clear();
}
} }

View File

@ -82,9 +82,10 @@ public class PrioPreindex {
// Write the docs file // Write the docs file
try (var intermediateDocChannel = documents.createDocumentsFileChannel(); try (var intermediateDocChannel = documents.createDocumentsFileChannel();
var destFileChannel = (FileChannel) Files.newByteChannel(outputFileDocs, StandardOpenOption.CREATE_NEW, StandardOpenOption.WRITE) var destFileChannel = (FileChannel) Files.newByteChannel(outputFileDocs, StandardOpenOption.CREATE_NEW, StandardOpenOption.WRITE);
var transformer = new PrioDocIdsTransformer(destFileChannel, intermediateDocChannel)
) { ) {
offsets.transformEachIO(0, offsets.size(), new PrioDocIdsTransformer(destFileChannel, intermediateDocChannel)); offsets.transformEachIO(0, offsets.size(), transformer);
} }
LongArray wordIds = segments.wordIds; LongArray wordIds = segments.wordIds;

View File

@ -10,6 +10,7 @@ import java.io.DataInputStream;
import java.io.DataOutputStream; import java.io.DataOutputStream;
import java.io.IOException; import java.io.IOException;
import java.nio.ByteBuffer; import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.FileChannel; import java.nio.channels.FileChannel;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
@ -41,19 +42,24 @@ class PrioDocIdsTransformerTest {
@Test @Test
public void testDomainIdDocOrd() throws IOException { public void testDomainIdDocOrd() throws IOException {
// Write 5 longs to the input file as data
try (var dos = new DataOutputStream(Files.newOutputStream(inputFile))) { try (var writeChannel = (FileChannel) Files.newByteChannel(inputFile, StandardOpenOption.WRITE)) {
dos.writeLong(UrlIdCodec.encodeId(0, 0)); var buffer = ByteBuffer.allocate(128).order(ByteOrder.LITTLE_ENDIAN);
dos.writeLong(UrlIdCodec.encodeId(0, 1));
dos.writeLong(UrlIdCodec.encodeId(1, 0)); buffer.putLong(UrlIdCodec.encodeId(0, 0));
dos.writeLong(UrlIdCodec.encodeId(4, 51) | 0x7000_0000_0000_0000L); buffer.putLong(UrlIdCodec.encodeId(0, 1));
buffer.putLong(UrlIdCodec.encodeId(1, 0));
buffer.putLong(UrlIdCodec.encodeId(4, 51) | 0x7000_0000_0000_0000L);
writeChannel.write(buffer.flip());
} }
try (var writeChannel = (FileChannel) Files.newByteChannel(outputFile, StandardOpenOption.WRITE); try (var writeChannel = (FileChannel) Files.newByteChannel(outputFile, StandardOpenOption.WRITE);
var readChannel = (FileChannel) Files.newByteChannel(inputFile)) var readChannel = (FileChannel) Files.newByteChannel(inputFile);
var transformer = new PrioDocIdsTransformer(writeChannel, readChannel))
{ {
// Transform two segments of the input file and write them to the output file with prefixed sizes // Transform two segments of the input file and write them to the output file with prefixed sizes
var transformer = new PrioDocIdsTransformer(writeChannel, readChannel);
transformer.transform(0, 4); transformer.transform(0, 4);
} }
@ -107,7 +113,7 @@ class PrioDocIdsTransformerTest {
int code = reader.get(2); int code = reader.get(2);
assertEquals(2, code); // increment doc ordinal assertEquals(2, code); // increment doc ordinal
int diffRank = reader.getGamma() - 1; int diffRank = reader.getGamma();
rank += diffRank; rank += diffRank;
assertEquals(56, rank); assertEquals(56, rank);