mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
(index-construction) Gather up preindex writes
Use fewer writes when finalizing the preindex documents.dat file, as this was getting too slow.
This commit is contained in:
parent
9881cac2da
commit
f090f0101b
@ -12,7 +12,7 @@ import java.nio.ByteOrder;
|
|||||||
import java.nio.channels.FileChannel;
|
import java.nio.channels.FileChannel;
|
||||||
|
|
||||||
/** Constructs document ids list priority reverse index */
|
/** Constructs document ids list priority reverse index */
|
||||||
public class PrioDocIdsTransformer implements LongArrayTransformations.LongIOTransformer {
|
public class PrioDocIdsTransformer implements LongArrayTransformations.LongIOTransformer, AutoCloseable {
|
||||||
|
|
||||||
private static final Logger logger = LoggerFactory.getLogger(PrioDocIdsTransformer.class);
|
private static final Logger logger = LoggerFactory.getLogger(PrioDocIdsTransformer.class);
|
||||||
|
|
||||||
@ -43,7 +43,6 @@ public class PrioDocIdsTransformer implements LongArrayTransformations.LongIOTra
|
|||||||
|
|
||||||
readChannel.position(startL * 8);
|
readChannel.position(startL * 8);
|
||||||
readBuffer.clear();
|
readBuffer.clear();
|
||||||
writeBuffer.clear();
|
|
||||||
|
|
||||||
int toBeRead = 8 * (sizeL);
|
int toBeRead = 8 * (sizeL);
|
||||||
|
|
||||||
@ -80,6 +79,13 @@ public class PrioDocIdsTransformer implements LongArrayTransformations.LongIOTra
|
|||||||
}
|
}
|
||||||
|
|
||||||
while (readBuffer.hasRemaining()) {
|
while (readBuffer.hasRemaining()) {
|
||||||
|
if (writeBuffer.remaining() < 16) {
|
||||||
|
writeBuffer.flip();
|
||||||
|
int written = writeChannel.write(writeBuffer, writeOffsetB);
|
||||||
|
writeOffsetB += written;
|
||||||
|
writeBuffer.clear();
|
||||||
|
}
|
||||||
|
|
||||||
long nextId = readBuffer.getLong();
|
long nextId = readBuffer.getLong();
|
||||||
|
|
||||||
// break down id components
|
// break down id components
|
||||||
@ -111,12 +117,6 @@ public class PrioDocIdsTransformer implements LongArrayTransformations.LongIOTra
|
|||||||
prevDomainId = domainId;
|
prevDomainId = domainId;
|
||||||
prevRank = rank;
|
prevRank = rank;
|
||||||
|
|
||||||
if (writeBuffer.remaining() < 16) {
|
|
||||||
writeBuffer.flip();
|
|
||||||
int written = writeChannel.write(writeBuffer, writeOffsetB);
|
|
||||||
writeOffsetB += written;
|
|
||||||
writeBuffer.clear();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
toBeRead -= readBuffer.limit();
|
toBeRead -= readBuffer.limit();
|
||||||
@ -128,14 +128,16 @@ public class PrioDocIdsTransformer implements LongArrayTransformations.LongIOTra
|
|||||||
// ensure any half-written data is flushed to the buffer
|
// ensure any half-written data is flushed to the buffer
|
||||||
bitWriter.finishLastByte();
|
bitWriter.finishLastByte();
|
||||||
|
|
||||||
writeBuffer.flip();
|
|
||||||
while (writeBuffer.hasRemaining()) {
|
|
||||||
int written = writeChannel.write(writeBuffer, writeOffsetB);
|
|
||||||
writeOffsetB += written;
|
|
||||||
}
|
|
||||||
|
|
||||||
// update the start input pointer
|
// update the start input pointer
|
||||||
startL = endL;
|
startL = endL;
|
||||||
return startOffsetB;
|
return startOffsetB;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void close() throws IOException {
|
||||||
|
writeBuffer.flip();
|
||||||
|
int written = writeChannel.write(writeBuffer, writeOffsetB);
|
||||||
|
writeOffsetB += written;
|
||||||
|
writeBuffer.clear();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -82,9 +82,10 @@ public class PrioPreindex {
|
|||||||
|
|
||||||
// Write the docs file
|
// Write the docs file
|
||||||
try (var intermediateDocChannel = documents.createDocumentsFileChannel();
|
try (var intermediateDocChannel = documents.createDocumentsFileChannel();
|
||||||
var destFileChannel = (FileChannel) Files.newByteChannel(outputFileDocs, StandardOpenOption.CREATE_NEW, StandardOpenOption.WRITE)
|
var destFileChannel = (FileChannel) Files.newByteChannel(outputFileDocs, StandardOpenOption.CREATE_NEW, StandardOpenOption.WRITE);
|
||||||
|
var transformer = new PrioDocIdsTransformer(destFileChannel, intermediateDocChannel)
|
||||||
) {
|
) {
|
||||||
offsets.transformEachIO(0, offsets.size(), new PrioDocIdsTransformer(destFileChannel, intermediateDocChannel));
|
offsets.transformEachIO(0, offsets.size(), transformer);
|
||||||
}
|
}
|
||||||
|
|
||||||
LongArray wordIds = segments.wordIds;
|
LongArray wordIds = segments.wordIds;
|
||||||
|
@ -10,6 +10,7 @@ import java.io.DataInputStream;
|
|||||||
import java.io.DataOutputStream;
|
import java.io.DataOutputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.ByteBuffer;
|
import java.nio.ByteBuffer;
|
||||||
|
import java.nio.ByteOrder;
|
||||||
import java.nio.channels.FileChannel;
|
import java.nio.channels.FileChannel;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
@ -41,19 +42,24 @@ class PrioDocIdsTransformerTest {
|
|||||||
@Test
|
@Test
|
||||||
public void testDomainIdDocOrd() throws IOException {
|
public void testDomainIdDocOrd() throws IOException {
|
||||||
|
|
||||||
// Write 5 longs to the input file as data
|
|
||||||
try (var dos = new DataOutputStream(Files.newOutputStream(inputFile))) {
|
try (var writeChannel = (FileChannel) Files.newByteChannel(inputFile, StandardOpenOption.WRITE)) {
|
||||||
dos.writeLong(UrlIdCodec.encodeId(0, 0));
|
var buffer = ByteBuffer.allocate(128).order(ByteOrder.LITTLE_ENDIAN);
|
||||||
dos.writeLong(UrlIdCodec.encodeId(0, 1));
|
|
||||||
dos.writeLong(UrlIdCodec.encodeId(1, 0));
|
buffer.putLong(UrlIdCodec.encodeId(0, 0));
|
||||||
dos.writeLong(UrlIdCodec.encodeId(4, 51) | 0x7000_0000_0000_0000L);
|
buffer.putLong(UrlIdCodec.encodeId(0, 1));
|
||||||
|
buffer.putLong(UrlIdCodec.encodeId(1, 0));
|
||||||
|
buffer.putLong(UrlIdCodec.encodeId(4, 51) | 0x7000_0000_0000_0000L);
|
||||||
|
|
||||||
|
writeChannel.write(buffer.flip());
|
||||||
}
|
}
|
||||||
|
|
||||||
try (var writeChannel = (FileChannel) Files.newByteChannel(outputFile, StandardOpenOption.WRITE);
|
try (var writeChannel = (FileChannel) Files.newByteChannel(outputFile, StandardOpenOption.WRITE);
|
||||||
var readChannel = (FileChannel) Files.newByteChannel(inputFile))
|
var readChannel = (FileChannel) Files.newByteChannel(inputFile);
|
||||||
|
var transformer = new PrioDocIdsTransformer(writeChannel, readChannel))
|
||||||
{
|
{
|
||||||
// Transform two segments of the input file and write them to the output file with prefixed sizes
|
// Transform two segments of the input file and write them to the output file with prefixed sizes
|
||||||
var transformer = new PrioDocIdsTransformer(writeChannel, readChannel);
|
|
||||||
transformer.transform(0, 4);
|
transformer.transform(0, 4);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -107,7 +113,7 @@ class PrioDocIdsTransformerTest {
|
|||||||
int code = reader.get(2);
|
int code = reader.get(2);
|
||||||
assertEquals(2, code); // increment doc ordinal
|
assertEquals(2, code); // increment doc ordinal
|
||||||
|
|
||||||
int diffRank = reader.getGamma() - 1;
|
int diffRank = reader.getGamma();
|
||||||
rank += diffRank;
|
rank += diffRank;
|
||||||
assertEquals(56, rank);
|
assertEquals(56, rank);
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user