mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(index-reverse) Added compression to priority index
The priority index documents file can be trivially compressed to a large degree. Compression schema:
```
00b -> diff docord (Elias gamma)
01b -> diff domainid (Elias delta) + (1 + docord) (Elias delta)
10b -> diff rank (Elias gamma) + domainid, docord (raw)
11b -> 30 bit size header, followed by 1 raw doc id (64 bits: 7 rank + 31 domainid + 26 docord)
```
This commit is contained in:
parent
abf7a8d78d
commit
12590d3449
@ -40,6 +40,14 @@ public class UrlIdCodec {
|
||||
return ((long) domainId << 26) | documentOrdinal;
|
||||
}
|
||||
|
||||
/** Encode a URL id with a ranking element */
|
||||
public static long encodeId(int rank, int domainId, int documentOrdinal) {
|
||||
domainId &= 0x7FFF_FFFF;
|
||||
documentOrdinal &= 0x03FF_FFFF;
|
||||
rank &= 0x3F;
|
||||
|
||||
return ((long) rank << 57) | ((long) domainId << 26) | documentOrdinal;
|
||||
}
|
||||
/** Add a ranking element to an existing combined URL id.
|
||||
*
|
||||
* @param rank [0,1] the importance of the domain, low is good
|
||||
|
@ -3,23 +3,32 @@ package nu.marginalia.index;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.array.page.LongQueryBuffer;
|
||||
import nu.marginalia.index.query.EntrySource;
|
||||
import nu.marginalia.sequence.io.BitReader;
|
||||
import nu.marginalia.model.id.UrlIdCodec;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.ByteOrder;
|
||||
import java.nio.channels.FileChannel;
|
||||
|
||||
import static java.lang.Math.min;
|
||||
|
||||
public class PrioIndexEntrySource implements EntrySource {
|
||||
private final String name;
|
||||
|
||||
int posL;
|
||||
int endOffsetL;
|
||||
private final ByteBuffer readData = ByteBuffer.allocate(1024);
|
||||
private final BitReader bitReader = new BitReader(readData);
|
||||
|
||||
private final FileChannel docsFileChannel;
|
||||
private final long dataOffsetStartB;
|
||||
private long dataOffsetStartB;
|
||||
private final long wordId;
|
||||
|
||||
private final int numItems;
|
||||
private int readItems = 0;
|
||||
|
||||
int prevRank = -1;
|
||||
int prevDomainId = -1;
|
||||
int prevDocOrd = -1;
|
||||
|
||||
public PrioIndexEntrySource(String name,
|
||||
int numEntriesL,
|
||||
FileChannel docsFileChannel,
|
||||
long dataOffsetStartB,
|
||||
long wordId)
|
||||
@ -29,41 +38,101 @@ public class PrioIndexEntrySource implements EntrySource {
|
||||
this.dataOffsetStartB = dataOffsetStartB;
|
||||
this.wordId = wordId;
|
||||
|
||||
posL = 0;
|
||||
endOffsetL = posL + numEntriesL;
|
||||
// sneaky read of the header to get item count upfront
|
||||
|
||||
try {
|
||||
readData.limit(4);
|
||||
|
||||
int rb = docsFileChannel.read(readData, dataOffsetStartB);
|
||||
assert rb == 4;
|
||||
readData.flip();
|
||||
numItems = readData.getInt() & 0x3FFF_FFFF;
|
||||
|
||||
readData.position(0);
|
||||
readData.limit(0);
|
||||
}
|
||||
catch (IOException ex) {
|
||||
throw new IllegalStateException("Failed to read index data.", ex);
|
||||
}
|
||||
}
|
||||
|
||||
/** Skipping entries; as rendered here, every call ends in an UnsupportedOperationException. */
@Override
|
||||
public void skip(int n) {
|
||||
// NOTE(review): this page shows removed and added diff lines without +/- markers.
// The increment below looks like the pre-change body and the throw after it like its
// replacement -- confirm against the repository before relying on either.
posL += n;
|
||||
throw new UnsupportedOperationException("Not implemented");
|
||||
}
|
||||
|
||||
/**
 * Decodes entries from the compressed documents data into {@code buffer} as 64-bit ids.
 *
 * <p>Each record starts with a 2-bit code that selects how rank, domain id and document
 * ordinal are stored relative to the previously decoded entry. Decoded ids are written
 * little-endian through {@code buffer.asByteBuffer()}; afterwards {@code buffer.end} is set
 * to the number of longs written and {@code buffer.uniq()} is called.
 *
 * <p>NOTE(review): this page shows removed and added diff lines without +/- markers, so the
 * body below mixes two implementations. The lines working on {@code byteBuffer}, {@code posL}
 * and {@code endOffsetL} look like the pre-change raw-read path -- confirm against the repository.
 */
@Override
|
||||
@SneakyThrows
|
||||
@SuppressWarnings("preview")
|
||||
public void read(LongQueryBuffer buffer) {
|
||||
buffer.reset();
|
||||
// NOTE(review): presumably a leftover of the pre-change implementation (see method comment)
buffer.end = min(buffer.end, endOffsetL - posL);
|
||||
var outputBuffer = buffer.asByteBuffer().order(ByteOrder.LITTLE_ENDIAN);
|
||||
outputBuffer.clear();
|
||||
|
||||
// NOTE(review): the next three statements presumably belong to the pre-change implementation
var byteBuffer = buffer.data.getMemorySegment().asByteBuffer();
|
||||
byteBuffer.clear();
|
||||
byteBuffer.limit(buffer.end * 8);
|
||||
// NOTE(review): `readItems++` is evaluated before `outputBuffer.hasRemaining()`. When items
// remain but the output is full, the counter is still incremented although nothing is decoded,
// so one entry goes missing from the count on the next call. Testing hasRemaining() first
// would avoid that.
while (readItems++ < numItems && outputBuffer.hasRemaining()) {
|
||||
fillReadBuffer();
|
||||
|
||||
// NOTE(review): this inner loop and its `if` are never closed in this view; they look like
// the removed raw-read loop rather than part of the decoder.
while (byteBuffer.hasRemaining()) {
|
||||
int rb = docsFileChannel.read(byteBuffer, dataOffsetStartB + posL * 8L + byteBuffer.position());
|
||||
if (rb == -1) {
|
||||
throw new IllegalStateException("Unexpected end of file while reading index data.");
|
||||
int rank;
|
||||
int domainId;
|
||||
int docOrd;
|
||||
|
||||
// 2-bit record type
int code = bitReader.get(2);
|
||||
if (code == 0b11) {
|
||||
// header
|
||||
bitReader.get(30); // skip 30 bits for the size header
|
||||
|
||||
// the first entry follows the header as raw fields: 7 + 31 + 26 bits
rank = bitReader.get(7);
|
||||
domainId = bitReader.get(31);
|
||||
docOrd = bitReader.get(26);
|
||||
}
|
||||
else if (code == 0b10) {
|
||||
// rank changed: gamma-coded rank difference, then raw domain id and ordinal
rank = prevRank + bitReader.getGamma();
|
||||
domainId = bitReader.get(31);
|
||||
docOrd = bitReader.get(26);
|
||||
}
|
||||
else if (code == 0b01) {
|
||||
// same rank, new domain: delta-coded domain difference, then the ordinal stored biased by one
rank = prevRank;
|
||||
domainId = bitReader.getDelta() + prevDomainId;
|
||||
docOrd = bitReader.getDelta() - 1;
|
||||
}
|
||||
else if (code == 0b00) {
|
||||
// same rank and domain: gamma-coded ordinal difference
rank = prevRank;
|
||||
domainId = prevDomainId;
|
||||
docOrd = prevDocOrd + bitReader.getGamma();
|
||||
}
|
||||
else {
|
||||
// presumably unreachable if get(2) only yields 0..3 -- kept as a guard
throw new IllegalStateException("??? found code " + code);
|
||||
}
|
||||
|
||||
long encodedId = UrlIdCodec.encodeId(rank, domainId, docOrd);
|
||||
|
||||
outputBuffer.putLong(
|
||||
encodedId
|
||||
);
|
||||
|
||||
// remember this entry; the next record is decoded relative to it
prevRank = rank;
|
||||
prevDomainId = domainId;
|
||||
prevDocOrd = docOrd;
|
||||
}
|
||||
|
||||
// NOTE(review): presumably a leftover of the pre-change implementation (see method comment)
posL += buffer.end;
|
||||
// bytes written -> number of longs
buffer.end = outputBuffer.position() / 8;
|
||||
|
||||
buffer.uniq();
|
||||
}
|
||||
|
||||
private void fillReadBuffer() throws IOException {
|
||||
if (readData.remaining() < 8) {
|
||||
readData.compact();
|
||||
int rb = docsFileChannel.read(readData, dataOffsetStartB);
|
||||
if (rb > 0) {
|
||||
dataOffsetStartB += rb;
|
||||
}
|
||||
readData.flip();
|
||||
}
|
||||
}
|
||||
|
||||
/** Reports whether this source has further entries to hand out. */
@Override
|
||||
public boolean hasMore() {
|
||||
// NOTE(review): two return statements appear back to back because this page shows removed
// and added diff lines without +/- markers. The first compares the positional cursor with the
// end offset, the second the decoded-item counter with the item count taken from the header.
// Only one of them can be live code -- confirm which against the repository.
return posL < endOffsetL;
|
||||
return readItems < numItems;
|
||||
}
|
||||
|
||||
|
||||
|
@ -70,20 +70,9 @@ public class PrioReverseIndexReader {
|
||||
if (offset < 0) // No documents
|
||||
return new EmptyEntrySource();
|
||||
|
||||
// Read the number of documents
|
||||
ByteBuffer buffer = ByteBuffer.allocate(8);
|
||||
try {
|
||||
documentsChannel.read(buffer, offset);
|
||||
}
|
||||
catch (IOException e) {
|
||||
logger.error("Failed to read documents channel", e);
|
||||
return new EmptyEntrySource();
|
||||
}
|
||||
|
||||
return new PrioIndexEntrySource(name,
|
||||
(int) buffer.getLong(0),
|
||||
documentsChannel,
|
||||
offset + 8,
|
||||
offset,
|
||||
termId);
|
||||
}
|
||||
|
||||
@ -92,7 +81,7 @@ public class PrioReverseIndexReader {
|
||||
|
||||
long offset = wordOffset(termId);
|
||||
|
||||
ByteBuffer buffer = ByteBuffer.allocate(8);
|
||||
ByteBuffer buffer = ByteBuffer.allocate(4);
|
||||
try {
|
||||
documentsChannel.read(buffer, offset);
|
||||
}
|
||||
@ -101,7 +90,7 @@ public class PrioReverseIndexReader {
|
||||
return 0;
|
||||
}
|
||||
|
||||
return (int) buffer.getLong(0);
|
||||
return buffer.getInt(0) & 0x3FFF_FFFF;
|
||||
|
||||
}
|
||||
|
||||
|
@ -1,17 +1,26 @@
|
||||
package nu.marginalia.index.construction.prio;
|
||||
|
||||
import nu.marginalia.array.algo.LongArrayTransformations;
|
||||
import nu.marginalia.model.id.UrlIdCodec;
|
||||
import nu.marginalia.sequence.io.BitWriter;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.ByteOrder;
|
||||
import java.nio.channels.FileChannel;
|
||||
|
||||
/** Constructs document ids list priority reverse index */
|
||||
public class PrioDocIdsTransformer implements LongArrayTransformations.LongIOTransformer {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(PrioDocIdsTransformer.class);
|
||||
|
||||
private final FileChannel writeChannel;
|
||||
private final FileChannel readChannel;
|
||||
|
||||
private final ByteBuffer buffer = ByteBuffer.allocate(8192);
|
||||
private final ByteBuffer readBuffer = ByteBuffer.allocate(8192).order(ByteOrder.LITTLE_ENDIAN);
|
||||
private final ByteBuffer writeBuffer = ByteBuffer.allocate(8192);
|
||||
|
||||
long startL = 0;
|
||||
long writeOffsetB = 0;
|
||||
@ -33,25 +42,99 @@ public class PrioDocIdsTransformer implements LongArrayTransformations.LongIOTra
|
||||
}
|
||||
|
||||
readChannel.position(startL * 8);
|
||||
readBuffer.clear();
|
||||
writeBuffer.clear();
|
||||
|
||||
buffer.clear();
|
||||
buffer.putLong(sizeL);
|
||||
int toBeRead = 8 * (sizeL);
|
||||
|
||||
var bitWriter = new BitWriter(writeBuffer);
|
||||
|
||||
int prevRank = -1;
|
||||
int prevDomainId = -1;
|
||||
int prevDocOrd = -1;
|
||||
boolean wroteHeader = false;
|
||||
|
||||
int toBeWrittenB = 8 * (1 + sizeL);
|
||||
do {
|
||||
buffer.limit(Math.min(buffer.capacity(), toBeWrittenB));
|
||||
readChannel.read(buffer);
|
||||
buffer.flip();
|
||||
readBuffer.limit(Math.min(readBuffer.capacity(), toBeRead));
|
||||
readChannel.read(readBuffer);
|
||||
readBuffer.flip();
|
||||
|
||||
while (buffer.hasRemaining()) {
|
||||
int written = writeChannel.write(buffer, writeOffsetB);
|
||||
writeOffsetB += written;
|
||||
toBeWrittenB -= written;
|
||||
if (!wroteHeader) {
|
||||
// write 11b header
|
||||
bitWriter.putBits(3, 2);
|
||||
// encode number of items
|
||||
bitWriter.putBits(sizeL, 30);
|
||||
|
||||
|
||||
long firstItem = readBuffer.getLong();
|
||||
|
||||
prevRank = UrlIdCodec.getRank(firstItem);
|
||||
prevDomainId = UrlIdCodec.getDomainId(firstItem);
|
||||
prevDocOrd = UrlIdCodec.getDocumentOrdinal(firstItem);
|
||||
|
||||
bitWriter.putBits(prevRank, 7);
|
||||
bitWriter.putBits(prevDomainId, 31);
|
||||
bitWriter.putBits(prevDocOrd, 26);
|
||||
|
||||
wroteHeader = true;
|
||||
}
|
||||
|
||||
buffer.clear();
|
||||
} while (toBeWrittenB > 0);
|
||||
while (readBuffer.hasRemaining()) {
|
||||
long nextId = readBuffer.getLong();
|
||||
|
||||
// break down id components
|
||||
int rank = UrlIdCodec.getRank(nextId);
|
||||
int domainId = UrlIdCodec.getDomainId(nextId);
|
||||
int docOrd = UrlIdCodec.getDocumentOrdinal(nextId);
|
||||
|
||||
// encode components
|
||||
if (rank != prevRank) {
|
||||
bitWriter.putBits(0b10, 2);
|
||||
bitWriter.putGamma(rank - prevRank);
|
||||
bitWriter.putBits(domainId, 31);
|
||||
bitWriter.putBits(docOrd, 26);
|
||||
}
|
||||
else if (domainId != prevDomainId) {
|
||||
bitWriter.putBits(0b01, 2);
|
||||
bitWriter.putDelta(domainId - prevDomainId);
|
||||
bitWriter.putDelta(1 + docOrd);
|
||||
}
|
||||
else if (docOrd != prevDocOrd) {
|
||||
bitWriter.putBits(0b00, 2);
|
||||
bitWriter.putGamma(docOrd - prevDocOrd);
|
||||
}
|
||||
else {
|
||||
logger.warn("Unexpected duplicate document id: {}", nextId);
|
||||
}
|
||||
|
||||
prevDocOrd = docOrd;
|
||||
prevDomainId = domainId;
|
||||
prevRank = rank;
|
||||
|
||||
if (writeBuffer.remaining() < 16) {
|
||||
writeBuffer.flip();
|
||||
int written = writeChannel.write(writeBuffer, writeOffsetB);
|
||||
writeOffsetB += written;
|
||||
writeBuffer.clear();
|
||||
}
|
||||
}
|
||||
|
||||
toBeRead -= readBuffer.limit();
|
||||
readBuffer.clear();
|
||||
} while (toBeRead > 0);
|
||||
|
||||
// write lingering data
|
||||
|
||||
// ensure any half-written data is flushed to the buffer
|
||||
bitWriter.finishLastByte();
|
||||
|
||||
writeBuffer.flip();
|
||||
while (writeBuffer.hasRemaining()) {
|
||||
int written = writeChannel.write(writeBuffer, writeOffsetB);
|
||||
writeOffsetB += written;
|
||||
}
|
||||
|
||||
// update the start input pointer
|
||||
startL = endL;
|
||||
return startOffsetB;
|
||||
}
|
||||
|
@ -1,5 +1,7 @@
|
||||
package nu.marginalia.index.construction.prio;
|
||||
|
||||
import nu.marginalia.model.id.UrlIdCodec;
|
||||
import nu.marginalia.sequence.io.BitReader;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
@ -7,6 +9,7 @@ import org.junit.jupiter.api.Test;
|
||||
import java.io.DataInputStream;
|
||||
import java.io.DataOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
@ -36,15 +39,14 @@ class PrioDocIdsTransformerTest {
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test() throws IOException {
|
||||
public void testDomainIdDocOrd() throws IOException {
|
||||
|
||||
// Write 5 longs to the input file as data
|
||||
try (var dos = new DataOutputStream(Files.newOutputStream(inputFile))) {
|
||||
dos.writeLong(1);
|
||||
dos.writeLong(2);
|
||||
dos.writeLong(3);
|
||||
dos.writeLong(4);
|
||||
dos.writeLong(5);
|
||||
dos.writeLong(UrlIdCodec.encodeId(0, 0));
|
||||
dos.writeLong(UrlIdCodec.encodeId(0, 1));
|
||||
dos.writeLong(UrlIdCodec.encodeId(1, 0));
|
||||
dos.writeLong(UrlIdCodec.encodeId(4, 51) | 0x7000_0000_0000_0000L);
|
||||
}
|
||||
|
||||
try (var writeChannel = (FileChannel) Files.newByteChannel(outputFile, StandardOpenOption.WRITE);
|
||||
@ -52,19 +54,68 @@ class PrioDocIdsTransformerTest {
|
||||
{
|
||||
// Transform two segments of the input file and write them to the output file with prefixed sizes
|
||||
var transformer = new PrioDocIdsTransformer(writeChannel, readChannel);
|
||||
transformer.transform(0, 3);
|
||||
transformer.transform(1, 5);
|
||||
transformer.transform(0, 4);
|
||||
}
|
||||
|
||||
// Verify the output file
|
||||
try (var dis = new DataInputStream(Files.newInputStream(outputFile))) {
|
||||
assertEquals(3, dis.readLong());
|
||||
assertEquals(1, dis.readLong());
|
||||
assertEquals(2, dis.readLong());
|
||||
assertEquals(3, dis.readLong());
|
||||
assertEquals(2, dis.readLong());
|
||||
assertEquals(4, dis.readLong());
|
||||
assertEquals(5, dis.readLong());
|
||||
byte[] bytes = Files.readAllBytes(outputFile);
|
||||
var buffer = ByteBuffer.wrap(bytes);
|
||||
|
||||
|
||||
BitReader reader = new BitReader(buffer);
|
||||
|
||||
// read the header
|
||||
{
|
||||
int code = reader.get(2);
|
||||
int size = reader.get(30);
|
||||
assertEquals(3, code);
|
||||
assertEquals(4, size);
|
||||
}
|
||||
|
||||
// read first doc id in parts
|
||||
int rank = reader.get(7);
|
||||
int domainId = reader.get(31);
|
||||
int ordinal = reader.get(26);
|
||||
|
||||
assertEquals(0, rank);
|
||||
assertEquals(0, domainId);
|
||||
assertEquals(0, ordinal);
|
||||
|
||||
{
|
||||
int code = reader.get(2);
|
||||
assertEquals(0, code); // increment doc ordinal
|
||||
|
||||
int dord = reader.getGamma();
|
||||
ordinal += dord;
|
||||
|
||||
assertEquals(1, ordinal);
|
||||
}
|
||||
|
||||
{
|
||||
int code = reader.get(2);
|
||||
assertEquals(1, code); // increment doc ordinal
|
||||
|
||||
int diffDomainId = reader.getDelta();
|
||||
domainId += diffDomainId;
|
||||
assertEquals(1, domainId);
|
||||
|
||||
int abs_ord = reader.getDelta();
|
||||
ordinal = abs_ord - 1;
|
||||
assertEquals(0, ordinal);
|
||||
}
|
||||
|
||||
{
|
||||
int code = reader.get(2);
|
||||
assertEquals(2, code); // increment doc ordinal
|
||||
|
||||
int diffRank = reader.getGamma() - 1;
|
||||
rank += diffRank;
|
||||
assertEquals(56, rank);
|
||||
|
||||
domainId = reader.get(31);
|
||||
ordinal = reader.get(26);
|
||||
|
||||
assertEquals(4, domainId);
|
||||
assertEquals(51, ordinal);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -12,6 +12,7 @@ import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import static nu.marginalia.index.construction.full.TestJournalFactory.*;
|
||||
@ -60,7 +61,8 @@ class PrioPreindexTest {
|
||||
public void testFinalizeSimple() throws IOException {
|
||||
var journalReader = journalFactory.createReader(
|
||||
new EntryDataWithWordMeta(100, 101, wm(50, 51)),
|
||||
new EntryDataWithWordMeta(104, 101, wm(50, 52))
|
||||
new EntryDataWithWordMeta(104, 101, wm(50, 52)),
|
||||
new EntryDataWithWordMeta(106, 101, wm(50, 52))
|
||||
);
|
||||
|
||||
var preindex = PrioPreindex.constructPreindex(journalReader, DocIdRewriter.identity(), tempDir);
|
||||
@ -79,9 +81,10 @@ class PrioPreindexTest {
|
||||
var lqb = new LongQueryBuffer(32);
|
||||
entrySource.read(lqb);
|
||||
|
||||
assertEquals(2, lqb.size());
|
||||
assertEquals(3, lqb.size());
|
||||
assertEquals(100, lqb.copyData()[0]);
|
||||
assertEquals(104, lqb.copyData()[1]);
|
||||
assertEquals(106, lqb.copyData()[2]);
|
||||
}
|
||||
|
||||
|
||||
|
@ -6,6 +6,7 @@ import nu.marginalia.array.page.LongQueryBuffer;
|
||||
*/
|
||||
public interface EntrySource {
|
||||
/** Skip n entries. */
|
||||
@Deprecated
|
||||
void skip(int n);
|
||||
|
||||
/** Fill the buffer with entries, updating its data and length appropriately. */
|
||||
|
@ -3,6 +3,7 @@ package nu.marginalia.array.page;
|
||||
import nu.marginalia.array.LongArray;
|
||||
import nu.marginalia.array.LongArrayFactory;
|
||||
|
||||
import java.nio.ByteBuffer;
|
||||
import java.util.Arrays;
|
||||
|
||||
/** A buffer for long values that can be used to filter and manipulate the data.
|
||||
@ -164,6 +165,11 @@ public class LongQueryBuffer {
|
||||
finalizeFiltering();
|
||||
}
|
||||
|
||||
@SuppressWarnings("preview")
|
||||
public ByteBuffer asByteBuffer() {
|
||||
return data.getMemorySegment().asByteBuffer();
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return getClass().getSimpleName() + "[" +
|
||||
"read = " + read +
|
||||
|
@ -120,7 +120,8 @@ public class BitWriter {
|
||||
}
|
||||
|
||||
|
||||
private void finishLastByte() {
|
||||
/** Finish writing any partially written bit fields to the buffer */
|
||||
public void finishLastByte() {
|
||||
// It's possible we have a few bits left over that have yet to be written
|
||||
// to the underlying buffer. We need to write them out now.
|
||||
|
||||
|
@ -324,4 +324,21 @@ class BitWriterTest {
|
||||
assertEquals(2, reader.getDelta());
|
||||
assertEquals(30, reader.getDelta());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testGamma2() {
|
||||
var buffer = ByteBuffer.allocate(8192);
|
||||
var writer = new BitWriter(buffer);
|
||||
writer.putBits(0, 2);
|
||||
writer.putGamma(4);
|
||||
writer.putBits(0, 2);
|
||||
writer.putGamma(2);
|
||||
var ret = writer.finish();
|
||||
|
||||
var reader = new BitReader(ret);
|
||||
reader.get(2);
|
||||
assertEquals(4, reader.getGamma());
|
||||
reader.get(2);
|
||||
assertEquals(2, reader.getGamma());
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user