(index-reverse) Added compression to priority index

The documents file of the priority index compresses very well, so each document list is now stored bit-packed. Every entry starts with a 2-bit code that selects its encoding. Compression schema:

```
00b -> diff docord (E gamma)
01b -> diff domainid (E delta) + (1 + docord) (E delta)
10b -> diff rank (E gamma) + domainid,docord (raw)
11b -> 30 bit size header, followed by 1 raw doc id (64 bits: 7 rank + 31 domainid + 26 docord)
```
2025-02-23 21:18:58 +00:00 · 2024-07-10 18:34:07 +02:00 · 2024-07-10 18:34:07 +02:00 · 12590d3449
commit 12590d3449
parent abf7a8d78d
10 changed files with 295 additions and 67 deletions
--- a/code/common/model/java/nu/marginalia/model/id/UrlIdCodec.java
+++ b/code/common/model/java/nu/marginalia/model/id/UrlIdCodec.java
@ -40,6 +40,14 @@ public class UrlIdCodec {
        return ((long) domainId << 26) | documentOrdinal;
    }

+    /** Pack a rank (low 6 bits kept), a 31-bit domain id and a 26-bit document ordinal into one id: rank from bit 57, domain id from bit 26 */
+    public static long encodeId(int rank, int domainId, int documentOrdinal) {
+        final long rankBits = rank & 0x3F;
+        final long domainBits = domainId & 0x7FFF_FFFF;
+        final long ordinalBits = documentOrdinal & 0x03FF_FFFF;
+
+        return (rankBits << 57) | (domainBits << 26) | ordinalBits;
+    }
    /** Add a ranking element to an existing combined URL id.
     *
     * @param rank [0,1] the importance of the domain, low is good
--- a/code/index/index-reverse/java/nu/marginalia/index/PrioIndexEntrySource.java
+++ b/code/index/index-reverse/java/nu/marginalia/index/PrioIndexEntrySource.java
@ -3,23 +3,32 @@ package nu.marginalia.index;
 import lombok.SneakyThrows;
 import nu.marginalia.array.page.LongQueryBuffer;
 import nu.marginalia.index.query.EntrySource;
+import nu.marginalia.sequence.io.BitReader;
+import nu.marginalia.model.id.UrlIdCodec;

+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
 import java.nio.channels.FileChannel;

-import static java.lang.Math.min;
-
 public class PrioIndexEntrySource implements EntrySource {
    private final String name;

-    int posL;
-    int endOffsetL;
+    private final ByteBuffer readData = ByteBuffer.allocate(1024);
+    private final BitReader bitReader = new BitReader(readData);

    private final FileChannel docsFileChannel;
-    private final long dataOffsetStartB;
+    private long dataOffsetStartB;
    private final long wordId;

+    private final int numItems;
+    private int readItems = 0;
+
+    int prevRank = -1;
+    int prevDomainId = -1;
+    int prevDocOrd = -1;
+
    public PrioIndexEntrySource(String name,
-                                int numEntriesL,
                                FileChannel docsFileChannel,
                                long dataOffsetStartB,
                                long wordId)
@ -29,41 +38,101 @@ public class PrioIndexEntrySource implements EntrySource {
        this.dataOffsetStartB = dataOffsetStartB;
        this.wordId = wordId;

-        posL = 0;
-        endOffsetL = posL + numEntriesL;
+        // sneaky read of the header to get item count upfront
+
+        try {
+            readData.limit(4);
+
+            int rb = docsFileChannel.read(readData, dataOffsetStartB);
+            assert rb == 4;
+            readData.flip();
+            numItems = readData.getInt() & 0x3FFF_FFFF;
+
+            readData.position(0);
+            readData.limit(0);
+        }
+        catch (IOException ex) {
+            throw new IllegalStateException("Failed to read index data.", ex);
+        }
    }

    @Override
    public void skip(int n) {
-        posL += n;
+        throw new UnsupportedOperationException("Not implemented"); // always throws: the posL-based skip was removed and the bit-packed reader has no replacement
    }

    @Override
    @SneakyThrows
    @SuppressWarnings("preview")
    public void read(LongQueryBuffer buffer) {
-        buffer.reset();
-        buffer.end = min(buffer.end, endOffsetL - posL);
+        // Decode entries into the caller's buffer as little-endian longs until it is full or every item has been read
+        var outputBuffer = buffer.asByteBuffer().order(ByteOrder.LITTLE_ENDIAN).clear();

-        var byteBuffer = buffer.data.getMemorySegment().asByteBuffer();
-        byteBuffer.clear();
-        byteBuffer.limit(buffer.end * 8);
+        while (outputBuffer.hasRemaining() && readItems++ < numItems) { // capacity is tested first: a full buffer must not consume an item count
+            fillReadBuffer();

-        while (byteBuffer.hasRemaining()) {
-            int rb = docsFileChannel.read(byteBuffer, dataOffsetStartB + posL * 8L + byteBuffer.position());
-            if (rb == -1) {
-                throw new IllegalStateException("Unexpected end of file while reading index data.");
+            int rank;
+            int domainId;
+            int docOrd;
+            // every entry starts with a 2-bit code selecting how it is encoded relative to the previous entry
+            int code = bitReader.get(2);
+            if (code == 0b11) {
+                // 11b: list header, a 30-bit item count followed by the first id stored raw
+                bitReader.get(30); // skip 30 bits for the size header (the constructor already read it into numItems)
+
+                rank = bitReader.get(7);
+                domainId = bitReader.get(31);
+                docOrd = bitReader.get(26);
            }
+            else if (code == 0b10) { // 10b: rank changed, gamma-coded rank increase then raw domain id and ordinal
+                rank = prevRank + bitReader.getGamma();
+                domainId = bitReader.get(31);
+                docOrd = bitReader.get(26);
+            }
+            else if (code == 0b01) { // 01b: same rank, new domain, delta-coded domain id increase then (1 + ordinal)
+                rank = prevRank;
+                domainId = bitReader.getDelta() + prevDomainId;
+                docOrd = bitReader.getDelta() - 1;
+            }
+            else if (code == 0b00) { // 00b: same rank and domain, gamma-coded ordinal increase
+                rank = prevRank;
+                domainId = prevDomainId;
+                docOrd = prevDocOrd + bitReader.getGamma();
+            }
+            else {
+                throw new IllegalStateException("??? found code " + code);
+            }
+
+            long encodedId = UrlIdCodec.encodeId(rank, domainId, docOrd);
+
+            outputBuffer.putLong(
+                    encodedId
+            );
+            // remember this entry: the next one is decoded relative to it
+            prevRank = rank;
+            prevDomainId = domainId;
+            prevDocOrd = docOrd;
        }

-        posL += buffer.end;
+        buffer.end = outputBuffer.position() / 8; // 8 bytes were written per decoded id
+
        buffer.uniq();
    }

+    /** Tops up readData from the docs file once fewer than 8 unread bytes remain, advancing the file offset by the bytes read. */
+    private void fillReadBuffer() throws IOException {
+        if (readData.remaining() >= 8) { // NOTE(review): the header entry decoded in read() spans 96 bits (12 bytes); confirm this threshold is enough for BitReader
+            return;
+        }
+        readData.compact(); // move the unread tail to the front so new bytes are appended after it
+        final int bytesRead = docsFileChannel.read(readData, dataOffsetStartB);
+        dataOffsetStartB += Math.max(bytesRead, 0); // a non-positive result (e.g. -1 at end of file) must not move the offset
+        readData.flip();
+    }

    @Override
    public boolean hasMore() {
-        return posL < endOffsetL;
+        return readItems < numItems; // numItems is the 30-bit count read from the list header in the constructor; readItems is advanced by read()
    }


--- a/code/index/index-reverse/java/nu/marginalia/index/PrioReverseIndexReader.java
+++ b/code/index/index-reverse/java/nu/marginalia/index/PrioReverseIndexReader.java
@ -70,20 +70,9 @@ public class PrioReverseIndexReader {
        if (offset < 0) // No documents
            return new EmptyEntrySource();

-        // Read the number of documents
-        ByteBuffer buffer = ByteBuffer.allocate(8);
-        try {
-            documentsChannel.read(buffer, offset);
-        }
-        catch (IOException e) {
-            logger.error("Failed to read documents channel", e);
-            return new EmptyEntrySource();
-        }
-
        return new PrioIndexEntrySource(name,
-                (int) buffer.getLong(0),
                documentsChannel,
-                offset + 8,
+                offset,
                termId);
    }

@ -92,7 +81,7 @@ public class PrioReverseIndexReader {

        long offset = wordOffset(termId);

-        ByteBuffer buffer = ByteBuffer.allocate(8);
+        ByteBuffer buffer = ByteBuffer.allocate(4);
        try {
            documentsChannel.read(buffer, offset);
        }
@ -101,7 +90,7 @@ public class PrioReverseIndexReader {
            return 0;
        }

-        return (int) buffer.getLong(0);
+        return buffer.getInt(0) & 0x3FFF_FFFF;

    }

--- a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioDocIdsTransformer.java
+++ b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioDocIdsTransformer.java
@ -1,17 +1,26 @@
 package nu.marginalia.index.construction.prio;

 import nu.marginalia.array.algo.LongArrayTransformations;
+import nu.marginalia.model.id.UrlIdCodec;
+import nu.marginalia.sequence.io.BitWriter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;

 import java.io.IOException;
 import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
 import java.nio.channels.FileChannel;

 /** Constructs document ids list priority reverse index */
 public class PrioDocIdsTransformer implements LongArrayTransformations.LongIOTransformer {
+
+    private static final Logger logger = LoggerFactory.getLogger(PrioDocIdsTransformer.class);
+
    private final FileChannel writeChannel;
    private final FileChannel readChannel;

-    private final ByteBuffer buffer = ByteBuffer.allocate(8192);
+    private final ByteBuffer readBuffer = ByteBuffer.allocate(8192).order(ByteOrder.LITTLE_ENDIAN);
+    private final ByteBuffer writeBuffer = ByteBuffer.allocate(8192);

    long startL = 0;
    long writeOffsetB = 0;
@ -33,25 +42,99 @@ public class PrioDocIdsTransformer implements LongArrayTransformations.LongIOTra
        }

        readChannel.position(startL * 8);
+        readBuffer.clear();
+        writeBuffer.clear();

-        buffer.clear();
-        buffer.putLong(sizeL);
+        int toBeRead = 8 * (sizeL);
+
+        var bitWriter = new BitWriter(writeBuffer);
+
+        int prevRank = -1;
+        int prevDomainId = -1;
+        int prevDocOrd = -1;
+        boolean wroteHeader = false;

-        int toBeWrittenB = 8 * (1 + sizeL);
        do {
-            buffer.limit(Math.min(buffer.capacity(), toBeWrittenB));
-            readChannel.read(buffer);
-            buffer.flip();
+            readBuffer.limit(Math.min(readBuffer.capacity(), toBeRead));
+            readChannel.read(readBuffer);
+            readBuffer.flip();

-            while (buffer.hasRemaining()) {
-                int written = writeChannel.write(buffer, writeOffsetB);
-                writeOffsetB += written;
-                toBeWrittenB -= written;
+            if (!wroteHeader) {
+                // write 11b header
+                bitWriter.putBits(3, 2);
+                // encode number of items
+                bitWriter.putBits(sizeL, 30);
+
+
+                long firstItem = readBuffer.getLong();
+
+                prevRank = UrlIdCodec.getRank(firstItem);
+                prevDomainId = UrlIdCodec.getDomainId(firstItem);
+                prevDocOrd = UrlIdCodec.getDocumentOrdinal(firstItem);
+
+                bitWriter.putBits(prevRank, 7);
+                bitWriter.putBits(prevDomainId, 31);
+                bitWriter.putBits(prevDocOrd, 26);
+
+                wroteHeader = true;
            }

-            buffer.clear();
-        } while (toBeWrittenB > 0);
+            while (readBuffer.hasRemaining()) {
+                long nextId = readBuffer.getLong();

+                // break down id components
+                int rank = UrlIdCodec.getRank(nextId);
+                int domainId = UrlIdCodec.getDomainId(nextId);
+                int docOrd = UrlIdCodec.getDocumentOrdinal(nextId);
+
+                // encode components
+                if (rank != prevRank) {
+                    bitWriter.putBits(0b10, 2);
+                    bitWriter.putGamma(rank - prevRank);
+                    bitWriter.putBits(domainId, 31);
+                    bitWriter.putBits(docOrd, 26);
+                }
+                else if (domainId != prevDomainId) {
+                    bitWriter.putBits(0b01, 2);
+                    bitWriter.putDelta(domainId - prevDomainId);
+                    bitWriter.putDelta(1 + docOrd);
+                }
+                else if (docOrd != prevDocOrd) {
+                    bitWriter.putBits(0b00, 2);
+                    bitWriter.putGamma(docOrd - prevDocOrd);
+                }
+                else {
+                    logger.warn("Unexpected duplicate document id: {}", nextId);
+                }
+
+                prevDocOrd = docOrd;
+                prevDomainId = domainId;
+                prevRank = rank;
+
+                if (writeBuffer.remaining() < 16) {
+                    writeBuffer.flip();
+                    int written = writeChannel.write(writeBuffer, writeOffsetB);
+                    writeOffsetB += written;
+                    writeBuffer.clear();
+                }
+            }
+
+            toBeRead -= readBuffer.limit();
+            readBuffer.clear();
+        } while (toBeRead > 0);
+
+        // write lingering data
+
+        // ensure any half-written data is flushed to the buffer
+        bitWriter.finishLastByte();
+
+        writeBuffer.flip();
+        while (writeBuffer.hasRemaining()) {
+            int written = writeChannel.write(writeBuffer, writeOffsetB);
+            writeOffsetB += written;
+        }
+
+        // update the start input pointer
        startL = endL;
        return startOffsetB;
    }
--- a/code/index/index-reverse/test/nu/marginalia/index/construction/prio/PrioDocIdsTransformerTest.java
+++ b/code/index/index-reverse/test/nu/marginalia/index/construction/prio/PrioDocIdsTransformerTest.java
@ -1,5 +1,7 @@
 package nu.marginalia.index.construction.prio;

+import nu.marginalia.model.id.UrlIdCodec;
+import nu.marginalia.sequence.io.BitReader;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@ -7,6 +9,7 @@ import org.junit.jupiter.api.Test;
 import java.io.DataInputStream;
 import java.io.DataOutputStream;
 import java.io.IOException;
+import java.nio.ByteBuffer;
 import java.nio.channels.FileChannel;
 import java.nio.file.Files;
 import java.nio.file.Path;
@ -36,15 +39,14 @@ class PrioDocIdsTransformerTest {
    }

    @Test
-    public void test() throws IOException {
+    public void testDomainIdDocOrd() throws IOException {

        // Write 5 longs to the input file as data
        try (var dos = new DataOutputStream(Files.newOutputStream(inputFile))) {
-            dos.writeLong(1);
-            dos.writeLong(2);
-            dos.writeLong(3);
-            dos.writeLong(4);
-            dos.writeLong(5);
+            dos.writeLong(UrlIdCodec.encodeId(0, 0));
+            dos.writeLong(UrlIdCodec.encodeId(0, 1));
+            dos.writeLong(UrlIdCodec.encodeId(1, 0));
+            dos.writeLong(UrlIdCodec.encodeId(4, 51) | 0x7000_0000_0000_0000L);
        }

        try (var writeChannel = (FileChannel) Files.newByteChannel(outputFile, StandardOpenOption.WRITE);
@ -52,19 +54,68 @@ class PrioDocIdsTransformerTest {
        {
            // Transform two segments of the input file and write them to the output file with prefixed sizes
            var transformer = new PrioDocIdsTransformer(writeChannel, readChannel);
-            transformer.transform(0, 3);
-            transformer.transform(1, 5);
+            transformer.transform(0, 4);
        }

-        // Verify the output file
-        try (var dis = new DataInputStream(Files.newInputStream(outputFile))) {
-            assertEquals(3, dis.readLong());
-            assertEquals(1, dis.readLong());
-            assertEquals(2, dis.readLong());
-            assertEquals(3, dis.readLong());
-            assertEquals(2, dis.readLong());
-            assertEquals(4, dis.readLong());
-            assertEquals(5, dis.readLong());
+        byte[] bytes = Files.readAllBytes(outputFile);
+        var buffer = ByteBuffer.wrap(bytes);
+
+
+        BitReader reader = new BitReader(buffer);
+
+        // read the header
+        {
+            int code = reader.get(2);
+            int size = reader.get(30);
+            assertEquals(3, code);
+            assertEquals(4, size);
+        }
+
+        // read first doc id in parts
+        int rank = reader.get(7);
+        int domainId = reader.get(31);
+        int ordinal = reader.get(26);
+
+        assertEquals(0, rank);
+        assertEquals(0, domainId);
+        assertEquals(0, ordinal);
+
+        {
+            int code = reader.get(2);
+            assertEquals(0, code); // increment doc ordinal
+
+            int dord = reader.getGamma();
+            ordinal += dord;
+
+            assertEquals(1, ordinal);
+        }
+
+        {
+            int code = reader.get(2);
+            assertEquals(1, code); // domain id changed: delta-coded domain id diff, then (1 + ordinal)
+
+            int diffDomainId = reader.getDelta();
+            domainId += diffDomainId;
+            assertEquals(1, domainId);
+
+            int abs_ord = reader.getDelta();
+            ordinal = abs_ord - 1;
+            assertEquals(0, ordinal);
+        }
+
+        {
+            int code = reader.get(2);
+            assertEquals(2, code); // rank changed: gamma-coded rank diff, then raw domain id and ordinal
+
+            int diffRank = reader.getGamma() - 1;
+            rank += diffRank;
+            assertEquals(56, rank);
+
+            domainId = reader.get(31);
+            ordinal = reader.get(26);
+
+            assertEquals(4, domainId);
+            assertEquals(51, ordinal);
        }
    }

--- a/code/index/index-reverse/test/nu/marginalia/index/construction/prio/PrioPreindexTest.java
+++ b/code/index/index-reverse/test/nu/marginalia/index/construction/prio/PrioPreindexTest.java
@ -12,6 +12,7 @@ import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;

 import static nu.marginalia.index.construction.full.TestJournalFactory.*;
@ -60,7 +61,8 @@ class PrioPreindexTest {
    public void testFinalizeSimple() throws IOException {
        var journalReader = journalFactory.createReader(
                new EntryDataWithWordMeta(100, 101, wm(50, 51)),
-                new EntryDataWithWordMeta(104, 101, wm(50, 52))
+                new EntryDataWithWordMeta(104, 101, wm(50, 52)),
+                new EntryDataWithWordMeta(106, 101, wm(50, 52))
        );

        var preindex = PrioPreindex.constructPreindex(journalReader, DocIdRewriter.identity(), tempDir);
@ -79,9 +81,10 @@ class PrioPreindexTest {
        var lqb = new LongQueryBuffer(32);
        entrySource.read(lqb);

-        assertEquals(2, lqb.size());
+        assertEquals(3, lqb.size());
        assertEquals(100, lqb.copyData()[0]);
        assertEquals(104, lqb.copyData()[1]);
+        assertEquals(106, lqb.copyData()[2]);
    }


--- a/code/index/query/java/nu/marginalia/index/query/EntrySource.java
+++ b/code/index/query/java/nu/marginalia/index/query/EntrySource.java
@ -6,6 +6,7 @@ import nu.marginalia.array.page.LongQueryBuffer;
 */
 public interface EntrySource {
    /** Skip n entries. */
+    @Deprecated
    void skip(int n);

    /** Fill the buffer with entries, updating its data and length appropriately. */
--- a/code/libraries/array/java/nu/marginalia/array/page/LongQueryBuffer.java
+++ b/code/libraries/array/java/nu/marginalia/array/page/LongQueryBuffer.java
@ -3,6 +3,7 @@ package nu.marginalia.array.page;
 import nu.marginalia.array.LongArray;
 import nu.marginalia.array.LongArrayFactory;

+import java.nio.ByteBuffer;
 import java.util.Arrays;

 /** A buffer for long values that can be used to filter and manipulate the data.
@ -164,6 +165,11 @@ public class LongQueryBuffer {
        finalizeFiltering();
    }

+    @SuppressWarnings("preview")
+    public ByteBuffer asByteBuffer() { // lets callers access the buffer's contents through the ByteBuffer API
+        return data.getMemorySegment().asByteBuffer(); // obtained from data's memory segment; no byte order is set here, callers choose their own
+    }
+
    public String toString() {
        return getClass().getSimpleName() + "[" +
            "read = " + read +
--- a/code/libraries/coded-sequence/java/nu/marginalia/sequence/io/BitWriter.java
+++ b/code/libraries/coded-sequence/java/nu/marginalia/sequence/io/BitWriter.java
@ -120,7 +120,8 @@ public class BitWriter {
    }


-    private void finishLastByte() {
+    /** Finish writing any partially written bit fields to the buffer */
+    public void finishLastByte() {
        // It's possible we have a few bits left over that have yet to be written
        // to the underlying buffer. We need to write them out now.

--- a/code/libraries/coded-sequence/test/nu/marginalia/sequence/BitWriterTest.java
+++ b/code/libraries/coded-sequence/test/nu/marginalia/sequence/BitWriterTest.java
@ -324,4 +324,21 @@ class BitWriterTest {
        assertEquals(2, reader.getDelta());
        assertEquals(30, reader.getDelta());
    }
+
+    @Test
+    void testGamma2() {
+        // two gamma-coded values, each preceded by a 2-bit field, must round-trip through writer and reader
+        var bitWriter = new BitWriter(ByteBuffer.allocate(8192));
+        bitWriter.putBits(0, 2);
+        bitWriter.putGamma(4);
+        bitWriter.putBits(0, 2);
+        bitWriter.putGamma(2);
+        var encoded = bitWriter.finish();
+
+        var bitReader = new BitReader(encoded);
+        bitReader.get(2);
+        assertEquals(4, bitReader.getGamma());
+        bitReader.get(2);
+        assertEquals(2, bitReader.getGamma());
+    }
 }