(index-reverse) Added compression to priority index

The priority index documents file can be trivially compressed to a large degree.

Compression schema:
```
00b -> diff docord (E gamma)
01b -> diff domainid (E delta) + (1 + docord) (E delta)
10b -> diff rank (E gamma) + domainid,docord (raw)
11b -> 30 bit size header, followed by 1 raw doc id (7 bit rank + 31 bit domainid + 26 bit docord = 64 bits)
```
This commit is contained in:
Viktor Lofgren 2024-07-10 18:34:07 +02:00
parent abf7a8d78d
commit 12590d3449
10 changed files with 295 additions and 67 deletions

View File

@ -40,6 +40,14 @@ public class UrlIdCodec {
return ((long) domainId << 26) | documentOrdinal;
}
/** Pack a rank, a domain id and a document ordinal into one combined URL id.
 *
 * <p>Bit layout of the result, low to high: 26 bits of document ordinal,
 * 31 bits of domain id, then 6 bits of rank starting at bit 57.
 * Bits outside each field's width are discarded from the arguments.
 *
 * @param rank            ranking element; only the low 6 bits are kept
 * @param domainId        domain id; only the low 31 bits are kept
 * @param documentOrdinal document ordinal; only the low 26 bits are kept
 * @return the combined id
 */
public static long encodeId(int rank, int domainId, int documentOrdinal) {
    final long rankBits = (long) (rank & 0x3F) << 57;
    final long domainBits = (long) (domainId & 0x7FFF_FFFF) << 26;
    final long ordinalBits = documentOrdinal & 0x03FF_FFFF;

    return rankBits | domainBits | ordinalBits;
}
/** Add a ranking element to an existing combined URL id.
*
* @param rank [0,1] the importance of the domain, low is good

View File

@ -3,23 +3,32 @@ package nu.marginalia.index;
import lombok.SneakyThrows;
import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.index.query.EntrySource;
import nu.marginalia.sequence.io.BitReader;
import nu.marginalia.model.id.UrlIdCodec;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.FileChannel;
import static java.lang.Math.min;
public class PrioIndexEntrySource implements EntrySource {
private final String name;
int posL;
int endOffsetL;
private final ByteBuffer readData = ByteBuffer.allocate(1024);
private final BitReader bitReader = new BitReader(readData);
private final FileChannel docsFileChannel;
private final long dataOffsetStartB;
private long dataOffsetStartB;
private final long wordId;
private final int numItems;
private int readItems = 0;
int prevRank = -1;
int prevDomainId = -1;
int prevDocOrd = -1;
public PrioIndexEntrySource(String name,
int numEntriesL,
FileChannel docsFileChannel,
long dataOffsetStartB,
long wordId)
@ -29,41 +38,101 @@ public class PrioIndexEntrySource implements EntrySource {
this.dataOffsetStartB = dataOffsetStartB;
this.wordId = wordId;
posL = 0;
endOffsetL = posL + numEntriesL;
// sneaky read of the header to get item count upfront
try {
readData.limit(4);
int rb = docsFileChannel.read(readData, dataOffsetStartB);
assert rb == 4;
readData.flip();
numItems = readData.getInt() & 0x3FFF_FFFF;
readData.position(0);
readData.limit(0);
}
catch (IOException ex) {
throw new IllegalStateException("Failed to read index data.", ex);
}
}
/** Skipping is not supported by this entry source.
 *
 * @param n number of entries the caller wants to skip
 * @throws UnsupportedOperationException always
 */
@Override
public void skip(int n) {
// NOTE(review): posL is advanced before the throw; presumably this line is a
// leftover from the uncompressed format and only the throw is intended — confirm.
posL += n;
throw new UnsupportedOperationException("Not implemented");
}
/** Decode compressed document ids from the docs file into {@code buffer}.
 *
 * <p>Every entry begins with a 2-bit code that selects how rank, domain id and
 * document ordinal are reconstructed, in most cases relative to the previously
 * decoded entry (prevRank / prevDomainId / prevDocOrd). Each decoded triple is
 * packed with {@code UrlIdCodec.encodeId} and appended to the buffer as a
 * little-endian long.
 *
 * @param buffer destination; its {@code end} is set to the number of longs written
 */
@Override
@SneakyThrows
@SuppressWarnings("preview")
public void read(LongQueryBuffer buffer) {
buffer.reset();
// NOTE(review): the posL / endOffsetL / byteBuffer statements in this method
// look like remnants of the uncompressed fixed-width read path — confirm which
// of them are still meant to be here.
buffer.end = min(buffer.end, endOffsetL - posL);
var outputBuffer = buffer.asByteBuffer().order(ByteOrder.LITTLE_ENDIAN);
outputBuffer.clear();
var byteBuffer = buffer.data.getMemorySegment().asByteBuffer();
byteBuffer.clear();
byteBuffer.limit(buffer.end * 8);
// NOTE(review): readItems++ is evaluated before outputBuffer.hasRemaining().
// When the output is full but items remain, the counter still advances, so one
// item is counted as read without being decoded. Checking hasRemaining() first
// would avoid that.
while (readItems++ < numItems && outputBuffer.hasRemaining()) {
// make sure there are undecoded bytes available before reading the next entry
fillReadBuffer();
while (byteBuffer.hasRemaining()) {
int rb = docsFileChannel.read(byteBuffer, dataOffsetStartB + posL * 8L + byteBuffer.position());
if (rb == -1) {
throw new IllegalStateException("Unexpected end of file while reading index data.");
int rank;
int domainId;
int docOrd;
// 2-bit entry type code
int code = bitReader.get(2);
if (code == 0b11) {
// 11b: list header followed by the first document id stored raw
bitReader.get(30); // skip the 30 bit item count; it was already read in the constructor
rank = bitReader.get(7);
domainId = bitReader.get(31);
docOrd = bitReader.get(26);
}
else if (code == 0b10) {
// 10b: rank changed; gamma-coded rank difference, then raw domain id and ordinal
rank = prevRank + bitReader.getGamma();
domainId = bitReader.get(31);
docOrd = bitReader.get(26);
}
else if (code == 0b01) {
// 01b: same rank, new domain; delta-coded domain id difference, then the
// absolute ordinal stored as (ordinal + 1)
rank = prevRank;
domainId = bitReader.getDelta() + prevDomainId;
docOrd = bitReader.getDelta() - 1;
}
else if (code == 0b00) {
// 00b: same rank and domain; gamma-coded ordinal difference
rank = prevRank;
domainId = prevDomainId;
docOrd = prevDocOrd + bitReader.getGamma();
}
else {
// presumably unreachable for a 2-bit read; kept as a guard
throw new IllegalStateException("??? found code " + code);
}
long encodedId = UrlIdCodec.encodeId(rank, domainId, docOrd);
outputBuffer.putLong(
encodedId
);
// remember this entry; the next one is decoded relative to it
prevRank = rank;
prevDomainId = domainId;
prevDocOrd = docOrd;
}
posL += buffer.end;
// each decoded id occupies 8 bytes of the output
buffer.end = outputBuffer.position() / 8;
buffer.uniq();
}
/** Top up {@code readData} from the documents file when it is running low.
 *
 * <p>Unread bytes are moved to the front of the buffer, the rest of the buffer
 * is filled from the channel at {@code dataOffsetStartB}, and that offset is
 * advanced by the number of bytes actually read. At end of file the buffer is
 * left holding only what was already in it.
 *
 * @throws IOException if reading from the channel fails
 */
private void fillReadBuffer() throws IOException {
    // A single entry can be wider than 8 bytes: the 11b header entry is
    // 2 + 30 + 7 + 31 + 26 = 96 bits, and a 10b entry is 2 + 31 + 26 bits plus
    // a gamma code. Refill below 16 bytes so a whole entry is available before
    // it is decoded; the writer uses the same 16 byte margin when flushing.
    if (readData.remaining() >= 16) {
        return;
    }

    readData.compact();
    int rb = docsFileChannel.read(readData, dataOffsetStartB);
    if (rb > 0) {
        dataOffsetStartB += rb;
    }
    readData.flip();
}
/** Reports whether there are entries left to read. */
@Override
public boolean hasMore() {
// NOTE(review): two return statements appear here and the second is unreachable
// as written. Presumably only the readItems/numItems comparison is intended for
// the compressed format — confirm.
return posL < endOffsetL;
return readItems < numItems;
}

View File

@ -70,20 +70,9 @@ public class PrioReverseIndexReader {
if (offset < 0) // No documents
return new EmptyEntrySource();
// Read the number of documents
ByteBuffer buffer = ByteBuffer.allocate(8);
try {
documentsChannel.read(buffer, offset);
}
catch (IOException e) {
logger.error("Failed to read documents channel", e);
return new EmptyEntrySource();
}
return new PrioIndexEntrySource(name,
(int) buffer.getLong(0),
documentsChannel,
offset + 8,
offset,
termId);
}
@ -92,7 +81,7 @@ public class PrioReverseIndexReader {
long offset = wordOffset(termId);
ByteBuffer buffer = ByteBuffer.allocate(8);
ByteBuffer buffer = ByteBuffer.allocate(4);
try {
documentsChannel.read(buffer, offset);
}
@ -101,7 +90,7 @@ public class PrioReverseIndexReader {
return 0;
}
return (int) buffer.getLong(0);
return buffer.getInt(0) & 0x3FFF_FFFF;
}

View File

@ -1,17 +1,26 @@
package nu.marginalia.index.construction.prio;
import nu.marginalia.array.algo.LongArrayTransformations;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.sequence.io.BitWriter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.FileChannel;
/** Constructs document ids list priority reverse index */
public class PrioDocIdsTransformer implements LongArrayTransformations.LongIOTransformer {
private static final Logger logger = LoggerFactory.getLogger(PrioDocIdsTransformer.class);
private final FileChannel writeChannel;
private final FileChannel readChannel;
private final ByteBuffer buffer = ByteBuffer.allocate(8192);
private final ByteBuffer readBuffer = ByteBuffer.allocate(8192).order(ByteOrder.LITTLE_ENDIAN);
private final ByteBuffer writeBuffer = ByteBuffer.allocate(8192);
long startL = 0;
long writeOffsetB = 0;
@ -33,25 +42,99 @@ public class PrioDocIdsTransformer implements LongArrayTransformations.LongIOTra
}
readChannel.position(startL * 8);
readBuffer.clear();
writeBuffer.clear();
buffer.clear();
buffer.putLong(sizeL);
int toBeRead = 8 * (sizeL);
var bitWriter = new BitWriter(writeBuffer);
int prevRank = -1;
int prevDomainId = -1;
int prevDocOrd = -1;
boolean wroteHeader = false;
int toBeWrittenB = 8 * (1 + sizeL);
do {
buffer.limit(Math.min(buffer.capacity(), toBeWrittenB));
readChannel.read(buffer);
buffer.flip();
readBuffer.limit(Math.min(readBuffer.capacity(), toBeRead));
readChannel.read(readBuffer);
readBuffer.flip();
while (buffer.hasRemaining()) {
int written = writeChannel.write(buffer, writeOffsetB);
writeOffsetB += written;
toBeWrittenB -= written;
if (!wroteHeader) {
// write 11b header
bitWriter.putBits(3, 2);
// encode number of items
bitWriter.putBits(sizeL, 30);
long firstItem = readBuffer.getLong();
prevRank = UrlIdCodec.getRank(firstItem);
prevDomainId = UrlIdCodec.getDomainId(firstItem);
prevDocOrd = UrlIdCodec.getDocumentOrdinal(firstItem);
bitWriter.putBits(prevRank, 7);
bitWriter.putBits(prevDomainId, 31);
bitWriter.putBits(prevDocOrd, 26);
wroteHeader = true;
}
buffer.clear();
} while (toBeWrittenB > 0);
while (readBuffer.hasRemaining()) {
long nextId = readBuffer.getLong();
// break down id components
int rank = UrlIdCodec.getRank(nextId);
int domainId = UrlIdCodec.getDomainId(nextId);
int docOrd = UrlIdCodec.getDocumentOrdinal(nextId);
// encode components
if (rank != prevRank) {
bitWriter.putBits(0b10, 2);
bitWriter.putGamma(rank - prevRank);
bitWriter.putBits(domainId, 31);
bitWriter.putBits(docOrd, 26);
}
else if (domainId != prevDomainId) {
bitWriter.putBits(0b01, 2);
bitWriter.putDelta(domainId - prevDomainId);
bitWriter.putDelta(1 + docOrd);
}
else if (docOrd != prevDocOrd) {
bitWriter.putBits(0b00, 2);
bitWriter.putGamma(docOrd - prevDocOrd);
}
else {
logger.warn("Unexpected duplicate document id: {}", nextId);
}
prevDocOrd = docOrd;
prevDomainId = domainId;
prevRank = rank;
if (writeBuffer.remaining() < 16) {
writeBuffer.flip();
int written = writeChannel.write(writeBuffer, writeOffsetB);
writeOffsetB += written;
writeBuffer.clear();
}
}
toBeRead -= readBuffer.limit();
readBuffer.clear();
} while (toBeRead > 0);
// write lingering data
// ensure any half-written data is flushed to the buffer
bitWriter.finishLastByte();
writeBuffer.flip();
while (writeBuffer.hasRemaining()) {
int written = writeChannel.write(writeBuffer, writeOffsetB);
writeOffsetB += written;
}
// update the start input pointer
startL = endL;
return startOffsetB;
}

View File

@ -1,5 +1,7 @@
package nu.marginalia.index.construction.prio;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.sequence.io.BitReader;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@ -7,6 +9,7 @@ import org.junit.jupiter.api.Test;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
@ -36,15 +39,14 @@ class PrioDocIdsTransformerTest {
}
@Test
public void test() throws IOException {
public void testDomainIdDocOrd() throws IOException {
// Write 5 longs to the input file as data
try (var dos = new DataOutputStream(Files.newOutputStream(inputFile))) {
dos.writeLong(1);
dos.writeLong(2);
dos.writeLong(3);
dos.writeLong(4);
dos.writeLong(5);
dos.writeLong(UrlIdCodec.encodeId(0, 0));
dos.writeLong(UrlIdCodec.encodeId(0, 1));
dos.writeLong(UrlIdCodec.encodeId(1, 0));
dos.writeLong(UrlIdCodec.encodeId(4, 51) | 0x7000_0000_0000_0000L);
}
try (var writeChannel = (FileChannel) Files.newByteChannel(outputFile, StandardOpenOption.WRITE);
@ -52,19 +54,68 @@ class PrioDocIdsTransformerTest {
{
// Transform two segments of the input file and write them to the output file with prefixed sizes
var transformer = new PrioDocIdsTransformer(writeChannel, readChannel);
transformer.transform(0, 3);
transformer.transform(1, 5);
transformer.transform(0, 4);
}
// Verify the output file
try (var dis = new DataInputStream(Files.newInputStream(outputFile))) {
assertEquals(3, dis.readLong());
assertEquals(1, dis.readLong());
assertEquals(2, dis.readLong());
assertEquals(3, dis.readLong());
assertEquals(2, dis.readLong());
assertEquals(4, dis.readLong());
assertEquals(5, dis.readLong());
byte[] bytes = Files.readAllBytes(outputFile);
var buffer = ByteBuffer.wrap(bytes);
BitReader reader = new BitReader(buffer);
// read the header
{
int code = reader.get(2);
int size = reader.get(30);
assertEquals(3, code);
assertEquals(4, size);
}
// read first doc id in parts
int rank = reader.get(7);
int domainId = reader.get(31);
int ordinal = reader.get(26);
assertEquals(0, rank);
assertEquals(0, domainId);
assertEquals(0, ordinal);
{
int code = reader.get(2);
assertEquals(0, code); // increment doc ordinal
int dord = reader.getGamma();
ordinal += dord;
assertEquals(1, ordinal);
}
{
int code = reader.get(2);
assertEquals(1, code); // increment doc ordinal
int diffDomainId = reader.getDelta();
domainId += diffDomainId;
assertEquals(1, domainId);
int abs_ord = reader.getDelta();
ordinal = abs_ord - 1;
assertEquals(0, ordinal);
}
{
int code = reader.get(2);
assertEquals(2, code); // increment doc ordinal
int diffRank = reader.getGamma() - 1;
rank += diffRank;
assertEquals(56, rank);
domainId = reader.get(31);
ordinal = reader.get(26);
assertEquals(4, domainId);
assertEquals(51, ordinal);
}
}

View File

@ -12,6 +12,7 @@ import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import static nu.marginalia.index.construction.full.TestJournalFactory.*;
@ -60,7 +61,8 @@ class PrioPreindexTest {
public void testFinalizeSimple() throws IOException {
var journalReader = journalFactory.createReader(
new EntryDataWithWordMeta(100, 101, wm(50, 51)),
new EntryDataWithWordMeta(104, 101, wm(50, 52))
new EntryDataWithWordMeta(104, 101, wm(50, 52)),
new EntryDataWithWordMeta(106, 101, wm(50, 52))
);
var preindex = PrioPreindex.constructPreindex(journalReader, DocIdRewriter.identity(), tempDir);
@ -79,9 +81,10 @@ class PrioPreindexTest {
var lqb = new LongQueryBuffer(32);
entrySource.read(lqb);
assertEquals(2, lqb.size());
assertEquals(3, lqb.size());
assertEquals(100, lqb.copyData()[0]);
assertEquals(104, lqb.copyData()[1]);
assertEquals(106, lqb.copyData()[2]);
}

View File

@ -6,6 +6,7 @@ import nu.marginalia.array.page.LongQueryBuffer;
*/
public interface EntrySource {
/** Skip n entries.
 *
 * @deprecated not every implementation supports skipping;
 * PrioIndexEntrySource throws UnsupportedOperationException.
 */
@Deprecated
void skip(int n);
/** Fill the buffer with entries, updating its data and length appropriately. */

View File

@ -3,6 +3,7 @@ package nu.marginalia.array.page;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import java.nio.ByteBuffer;
import java.util.Arrays;
/** A buffer for long values that can be used to filter and manipulate the data.
@ -164,6 +165,11 @@ public class LongQueryBuffer {
finalizeFiltering();
}
/** Returns the ByteBuffer obtained from the memory segment of the backing data array. */
@SuppressWarnings("preview")
public ByteBuffer asByteBuffer() {
    var segment = data.getMemorySegment();
    return segment.asByteBuffer();
}
public String toString() {
return getClass().getSimpleName() + "[" +
"read = " + read +

View File

@ -120,7 +120,8 @@ public class BitWriter {
}
private void finishLastByte() {
/** Finish writing any partially written bit fields to the buffer */
public void finishLastByte() {
// It's possible we have a few bits left over that have yet to be written
// to the underlying buffer. We need to write them out now.

View File

@ -324,4 +324,21 @@ class BitWriterTest {
assertEquals(2, reader.getDelta());
assertEquals(30, reader.getDelta());
}
/** Gamma codes interleaved with 2-bit fields must read back unchanged. */
@Test
void testGamma2() {
    var writer = new BitWriter(ByteBuffer.allocate(8192));

    // two records, each a 2-bit zero field followed by a gamma-coded value
    writer.putBits(0, 2);
    writer.putGamma(4);
    writer.putBits(0, 2);
    writer.putGamma(2);

    var reader = new BitReader(writer.finish());

    reader.get(2); // discard the 2-bit field
    assertEquals(4, reader.getGamma());
    reader.get(2); // discard the 2-bit field
    assertEquals(2, reader.getGamma());
}
}