diff --git a/code/common/model/java/nu/marginalia/model/id/UrlIdCodec.java b/code/common/model/java/nu/marginalia/model/id/UrlIdCodec.java
index 26ac847e..a8c9af28 100644
--- a/code/common/model/java/nu/marginalia/model/id/UrlIdCodec.java
+++ b/code/common/model/java/nu/marginalia/model/id/UrlIdCodec.java
@@ -40,6 +40,14 @@ public class UrlIdCodec {
         return ((long) domainId << 26) | documentOrdinal;
     }
 
+    /** Encode a URL id with a ranking element; rank is masked to 6 bits and placed at bit 57 */
+    public static long encodeId(int rank, int domainId, int documentOrdinal) {
+        domainId &= 0x7FFF_FFFF;
+        documentOrdinal &= 0x03FF_FFFF;
+        rank &= 0x3F;
+
+        return ((long) rank << 57) | ((long) domainId << 26) | documentOrdinal;
+    }
     /** Add a ranking element to an existing combined URL id.
      *
      * @param rank [0,1] the importance of the domain, low is good
diff --git a/code/index/index-reverse/java/nu/marginalia/index/PrioIndexEntrySource.java b/code/index/index-reverse/java/nu/marginalia/index/PrioIndexEntrySource.java
index bfb6be14..e55a4235 100644
--- a/code/index/index-reverse/java/nu/marginalia/index/PrioIndexEntrySource.java
+++ b/code/index/index-reverse/java/nu/marginalia/index/PrioIndexEntrySource.java
@@ -3,23 +3,32 @@ package nu.marginalia.index;
 import lombok.SneakyThrows;
 import nu.marginalia.array.page.LongQueryBuffer;
 import nu.marginalia.index.query.EntrySource;
+import nu.marginalia.sequence.io.BitReader;
+import nu.marginalia.model.id.UrlIdCodec;
 
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
 import java.nio.channels.FileChannel;
 
-import static java.lang.Math.min;
-
 public class PrioIndexEntrySource implements EntrySource {
     private final String name;
 
-    int posL;
-    int endOffsetL;
+    private final ByteBuffer readData = ByteBuffer.allocate(1024);
+    private final BitReader bitReader = new BitReader(readData);
 
     private final FileChannel docsFileChannel;
-    private final long dataOffsetStartB;
+    private long dataOffsetStartB;
     private final long wordId;
 
+    private final int numItems;
+    private int readItems = 0;
+
+    int prevRank = -1;
+    int prevDomainId = -1;
+    int prevDocOrd = -1;
+
     public PrioIndexEntrySource(String name,
-                                int numEntriesL,
                                 FileChannel docsFileChannel,
                                 long dataOffsetStartB,
                                 long wordId)
@@ -29,41 +38,103 @@ public class PrioIndexEntrySource implements EntrySource {
         this.dataOffsetStartB = dataOffsetStartB;
         this.wordId = wordId;
 
-        posL = 0;
-        endOffsetL = posL + numEntriesL;
+        // sneaky read of the header to get item count upfront
+
+        try {
+            readData.limit(4);
+
+            int rb = docsFileChannel.read(readData, dataOffsetStartB);
+            assert rb == 4;
+            readData.flip();
+            numItems = readData.getInt() & 0x3FFF_FFFF;
+
+            readData.position(0);
+            readData.limit(0);
+        }
+        catch (IOException ex) {
+            throw new IllegalStateException("Failed to read index data.", ex);
+        }
     }
 
     @Override
     public void skip(int n) {
-        posL += n;
+        throw new UnsupportedOperationException("Not implemented");
     }
 
     @Override
     @SneakyThrows
     @SuppressWarnings("preview")
     public void read(LongQueryBuffer buffer) {
-        buffer.reset();
-        buffer.end = min(buffer.end, endOffsetL - posL);
+        var outputBuffer = buffer.asByteBuffer().order(ByteOrder.LITTLE_ENDIAN);
+        outputBuffer.clear();
 
-        var byteBuffer = buffer.data.getMemorySegment().asByteBuffer();
-        byteBuffer.clear();
-        byteBuffer.limit(buffer.end * 8);
+        // test for output space first: readItems++ must only run when an item will actually be decoded
+        while (outputBuffer.hasRemaining() && readItems++ < numItems) {
+            fillReadBuffer();
 
-        while (byteBuffer.hasRemaining()) {
-            int rb = docsFileChannel.read(byteBuffer, dataOffsetStartB + posL * 8L + byteBuffer.position());
-            if (rb == -1) {
-                throw new IllegalStateException("Unexpected end of file while reading index data.");
+            int rank;
+            int domainId;
+            int docOrd;
+
+            int code = bitReader.get(2);
+            if (code == 0b11) {
+                // header
+                bitReader.get(30); // skip 30 bits for the size header
+
+                rank = bitReader.get(7);
+                domainId = bitReader.get(31);
+                docOrd = bitReader.get(26);
             }
+            else if (code == 0b10) {
+                rank = prevRank + bitReader.getGamma();
+                domainId = bitReader.get(31);
+                docOrd = bitReader.get(26);
+            }
+            else if (code == 0b01) {
+                rank = prevRank;
+                domainId = bitReader.getDelta() + prevDomainId;
+                docOrd = bitReader.getDelta() - 1;
+            }
+            else if (code == 0b00) {
+                rank = prevRank;
+                domainId = prevDomainId;
+                docOrd = prevDocOrd + bitReader.getGamma();
+            }
+            else {
+                throw new IllegalStateException("??? found code " + code);
+            }
+
+            long encodedId = UrlIdCodec.encodeId(rank, domainId, docOrd);
+
+            outputBuffer.putLong(
+                    encodedId
+            );
+
+            prevRank = rank;
+            prevDomainId = domainId;
+            prevDocOrd = docOrd;
         }
 
-        posL += buffer.end;
+        buffer.end = outputBuffer.position() / 8;
+
         buffer.uniq();
     }
 
+    /** Tops up readData from the file when fewer than 8 unread bytes remain, advancing the file offset */
+    private void fillReadBuffer() throws IOException {
+        if (readData.remaining() < 8) {
+            readData.compact();
+            int rb = docsFileChannel.read(readData, dataOffsetStartB);
+            if (rb > 0) {
+                dataOffsetStartB += rb;
+            }
+            readData.flip();
+        }
+    }
 
     @Override
     public boolean hasMore() {
-        return posL < endOffsetL;
+        return readItems < numItems;
     }
 
 
diff --git a/code/index/index-reverse/java/nu/marginalia/index/PrioReverseIndexReader.java b/code/index/index-reverse/java/nu/marginalia/index/PrioReverseIndexReader.java
index 62ab1145..4b6944ae 100644
--- a/code/index/index-reverse/java/nu/marginalia/index/PrioReverseIndexReader.java
+++ b/code/index/index-reverse/java/nu/marginalia/index/PrioReverseIndexReader.java
@@ -70,20 +70,9 @@ public class PrioReverseIndexReader {
         if (offset < 0) // No documents
             return new EmptyEntrySource();
 
-        // Read the number of documents
-        ByteBuffer buffer = ByteBuffer.allocate(8);
-        try {
-            documentsChannel.read(buffer, offset);
-        }
-        catch (IOException e) {
-            logger.error("Failed to read documents channel", e);
-            return new EmptyEntrySource();
-        }
-
         return new PrioIndexEntrySource(name,
-                (int) buffer.getLong(0),
                 documentsChannel,
-                offset + 8,
+                offset,
                 termId);
     }
 
@@ -92,7 +81,7 @@ public class PrioReverseIndexReader {
 
         long offset = wordOffset(termId);
 
-        ByteBuffer buffer = ByteBuffer.allocate(8);
+        ByteBuffer buffer = ByteBuffer.allocate(4);
         try {
             documentsChannel.read(buffer, offset);
         }
@@ -101,7 +90,7 @@ public class PrioReverseIndexReader {
             return 0;
         }
 
-        return (int) buffer.getLong(0);
+        return buffer.getInt(0) & 0x3FFF_FFFF;
     }
 
 
diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioDocIdsTransformer.java b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioDocIdsTransformer.java
index 7a4801b3..01bdcfc2 100644
--- a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioDocIdsTransformer.java
+++ b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioDocIdsTransformer.java
@@ -1,17 +1,26 @@
 package nu.marginalia.index.construction.prio;
 
 import nu.marginalia.array.algo.LongArrayTransformations;
+import nu.marginalia.model.id.UrlIdCodec;
+import nu.marginalia.sequence.io.BitWriter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
 import java.nio.channels.FileChannel;
 
 /** Constructs document ids list priority reverse index */
 public class PrioDocIdsTransformer implements LongArrayTransformations.LongIOTransformer {
+
+    private static final Logger logger = LoggerFactory.getLogger(PrioDocIdsTransformer.class);
+
     private final FileChannel writeChannel;
     private final FileChannel readChannel;
 
-    private final ByteBuffer buffer = ByteBuffer.allocate(8192);
+    private final ByteBuffer readBuffer = ByteBuffer.allocate(8192).order(ByteOrder.LITTLE_ENDIAN);
+    private final ByteBuffer writeBuffer = ByteBuffer.allocate(8192);
 
     long startL = 0;
     long writeOffsetB = 0;
@@ -33,25 +42,104 @@ public class PrioDocIdsTransformer implements LongArrayTransformations.LongIOTra
         }
 
         readChannel.position(startL * 8);
+        readBuffer.clear();
+        writeBuffer.clear();
 
-        buffer.clear();
-        buffer.putLong(sizeL);
+        int toBeRead = 8 * (sizeL);
+
+        var bitWriter = new BitWriter(writeBuffer);
+
+        int prevRank = -1;
+        int prevDomainId = -1;
+        int prevDocOrd = -1;
+        boolean wroteHeader = false;
 
-        int toBeWrittenB = 8 * (1 + sizeL);
         do {
-            buffer.limit(Math.min(buffer.capacity(), toBeWrittenB));
-            readChannel.read(buffer);
-            buffer.flip();
+            readBuffer.limit(Math.min(readBuffer.capacity(), toBeRead));
+            // a negative return means EOF; unchecked, toBeRead would never shrink and this loop would spin forever
+            if (readChannel.read(readBuffer) < 0) {
+                throw new IOException("Unexpected end of file while reading document ids");
+            }
+            readBuffer.flip();
 
-            while (buffer.hasRemaining()) {
-                int written = writeChannel.write(buffer, writeOffsetB);
-                writeOffsetB += written;
-                toBeWrittenB -= written;
+            if (!wroteHeader) {
+                // write 11b header
+                bitWriter.putBits(3, 2);
+                // encode number of items
+                bitWriter.putBits(sizeL, 30);
+
+
+                long firstItem = readBuffer.getLong();
+
+                prevRank = UrlIdCodec.getRank(firstItem);
+                prevDomainId = UrlIdCodec.getDomainId(firstItem);
+                prevDocOrd = UrlIdCodec.getDocumentOrdinal(firstItem);
+
+                bitWriter.putBits(prevRank, 7);
+                bitWriter.putBits(prevDomainId, 31);
+                bitWriter.putBits(prevDocOrd, 26);
+
+                wroteHeader = true;
             }
 
-            buffer.clear();
-        } while (toBeWrittenB > 0);
+            while (readBuffer.hasRemaining()) {
+                long nextId = readBuffer.getLong();
 
+                // break down id components
+                int rank = UrlIdCodec.getRank(nextId);
+                int domainId = UrlIdCodec.getDomainId(nextId);
+                int docOrd = UrlIdCodec.getDocumentOrdinal(nextId);
+
+                // encode components
+                if (rank != prevRank) {
+                    bitWriter.putBits(0b10, 2);
+                    bitWriter.putGamma(rank - prevRank);
+                    bitWriter.putBits(domainId, 31);
+                    bitWriter.putBits(docOrd, 26);
+                }
+                else if (domainId != prevDomainId) {
+                    bitWriter.putBits(0b01, 2);
+                    bitWriter.putDelta(domainId - prevDomainId);
+                    bitWriter.putDelta(1 + docOrd);
+                }
+                else if (docOrd != prevDocOrd) {
+                    bitWriter.putBits(0b00, 2);
+                    bitWriter.putGamma(docOrd - prevDocOrd);
+                }
+                else {
+                    logger.warn("Unexpected duplicate document id: {}", nextId);
+                }
+
+                prevDocOrd = docOrd;
+                prevDomainId = domainId;
+                prevRank = rank;
+
+                if (writeBuffer.remaining() < 16) {
+                    writeBuffer.flip();
+                    // a single write() may be partial; drain fully before clear() discards the rest
+                    while (writeBuffer.hasRemaining()) {
+                        writeOffsetB += writeChannel.write(writeBuffer, writeOffsetB);
+                    }
+                    writeBuffer.clear();
+                }
+            }
+
+            toBeRead -= readBuffer.limit();
+            readBuffer.clear();
+        } while (toBeRead > 0);
+
+        // write lingering data
+
+        // ensure any half-written data is flushed to the buffer
+        bitWriter.finishLastByte();
+
+        writeBuffer.flip();
+        while (writeBuffer.hasRemaining()) {
+            int written = writeChannel.write(writeBuffer, writeOffsetB);
+            writeOffsetB += written;
+        }
+
+        // update the start input pointer
         startL = endL;
         return startOffsetB;
     }
diff --git a/code/index/index-reverse/test/nu/marginalia/index/construction/prio/PrioDocIdsTransformerTest.java b/code/index/index-reverse/test/nu/marginalia/index/construction/prio/PrioDocIdsTransformerTest.java
index f1e976a6..c5116334 100644
--- a/code/index/index-reverse/test/nu/marginalia/index/construction/prio/PrioDocIdsTransformerTest.java
+++ b/code/index/index-reverse/test/nu/marginalia/index/construction/prio/PrioDocIdsTransformerTest.java
@@ -1,5 +1,7 @@
 package nu.marginalia.index.construction.prio;
 
+import nu.marginalia.model.id.UrlIdCodec;
+import nu.marginalia.sequence.io.BitReader;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -7,6 +9,7 @@ import org.junit.jupiter.api.Test;
 import java.io.DataInputStream;
 import java.io.DataOutputStream;
 import java.io.IOException;
+import java.nio.ByteBuffer;
 import java.nio.channels.FileChannel;
 import java.nio.file.Files;
 import java.nio.file.Path;
@@ -36,15 +39,14 @@ class PrioDocIdsTransformerTest {
     }
 
     @Test
-    public void test() throws IOException {
+    public void testDomainIdDocOrd() throws IOException {
 
         // Write 5 longs to the input file as data
         try (var dos = new DataOutputStream(Files.newOutputStream(inputFile))) {
-            dos.writeLong(1);
-            dos.writeLong(2);
-            dos.writeLong(3);
-            dos.writeLong(4);
-            dos.writeLong(5);
+            dos.writeLong(UrlIdCodec.encodeId(0, 0));
+            dos.writeLong(UrlIdCodec.encodeId(0, 1));
+            dos.writeLong(UrlIdCodec.encodeId(1, 0));
+            dos.writeLong(UrlIdCodec.encodeId(4, 51) | 0x7000_0000_0000_0000L);
         }
 
         try (var writeChannel = (FileChannel) Files.newByteChannel(outputFile, StandardOpenOption.WRITE);
@@ -52,19 +54,68 @@ class PrioDocIdsTransformerTest {
         {
             // Transform two segments of the input file and write them to the output file with prefixed sizes
             var transformer = new PrioDocIdsTransformer(writeChannel, readChannel);
-            transformer.transform(0, 3);
-            transformer.transform(1, 5);
+            transformer.transform(0, 4);
         }
 
-        // Verify the output file
-        try (var dis = new DataInputStream(Files.newInputStream(outputFile))) {
-            assertEquals(3, dis.readLong());
-            assertEquals(1, dis.readLong());
-            assertEquals(2, dis.readLong());
-            assertEquals(3, dis.readLong());
-            assertEquals(2, dis.readLong());
-            assertEquals(4, dis.readLong());
-            assertEquals(5, dis.readLong());
+        byte[] bytes = Files.readAllBytes(outputFile);
+        var buffer = ByteBuffer.wrap(bytes);
+
+
+        BitReader reader = new BitReader(buffer);
+
+        // read the header
+        {
+            int code = reader.get(2);
+            int size = reader.get(30);
+            assertEquals(3, code);
+            assertEquals(4, size);
+        }
+
+        // read first doc id in parts
+        int rank = reader.get(7);
+        int domainId = reader.get(31);
+        int ordinal = reader.get(26);
+
+        assertEquals(0, rank);
+        assertEquals(0, domainId);
+        assertEquals(0, ordinal);
+
+        {
+            int code = reader.get(2);
+            assertEquals(0, code); // increment doc ordinal
+
+            int dord = reader.getGamma();
+            ordinal += dord;
+
+            assertEquals(1, ordinal);
+        }
+
+        {
+            int code = reader.get(2);
+            assertEquals(1, code); // new domain id (delta), doc ordinal stored as value + 1
+
+            int diffDomainId = reader.getDelta();
+            domainId += diffDomainId;
+            assertEquals(1, domainId);
+
+            int abs_ord = reader.getDelta();
+            ordinal = abs_ord - 1;
+            assertEquals(0, ordinal);
+        }
+
+        {
+            int code = reader.get(2);
+            assertEquals(2, code); // new rank (gamma), then absolute domain id and doc ordinal
+
+            int diffRank = reader.getGamma() - 1;
+            rank += diffRank;
+            assertEquals(56, rank);
+
+            domainId = reader.get(31);
+            ordinal = reader.get(26);
+
+            assertEquals(4, domainId);
+            assertEquals(51, ordinal);
         }
     }
 
diff --git a/code/index/index-reverse/test/nu/marginalia/index/construction/prio/PrioPreindexTest.java b/code/index/index-reverse/test/nu/marginalia/index/construction/prio/PrioPreindexTest.java
index 8ba5ac7c..2a1a2a6c 100644
--- a/code/index/index-reverse/test/nu/marginalia/index/construction/prio/PrioPreindexTest.java
+++ b/code/index/index-reverse/test/nu/marginalia/index/construction/prio/PrioPreindexTest.java
@@ -12,6 +12,7 @@ import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
 
 import static nu.marginalia.index.construction.full.TestJournalFactory.*;
@@ -60,7 +61,8 @@ class PrioPreindexTest {
     public void testFinalizeSimple() throws IOException {
         var journalReader = journalFactory.createReader(
                 new EntryDataWithWordMeta(100, 101, wm(50, 51)),
-                new EntryDataWithWordMeta(104, 101, wm(50, 52))
+                new EntryDataWithWordMeta(104, 101, wm(50, 52)),
+                new EntryDataWithWordMeta(106, 101, wm(50, 52))
         );
 
         var preindex = PrioPreindex.constructPreindex(journalReader, DocIdRewriter.identity(), tempDir);
@@ -79,9 +81,10 @@ class PrioPreindexTest {
         var lqb = new LongQueryBuffer(32);
         entrySource.read(lqb);
 
-        assertEquals(2, lqb.size());
+        assertEquals(3, lqb.size());
         assertEquals(100, lqb.copyData()[0]);
         assertEquals(104, lqb.copyData()[1]);
+        assertEquals(106, lqb.copyData()[2]);
     }
 
 
diff --git a/code/index/query/java/nu/marginalia/index/query/EntrySource.java b/code/index/query/java/nu/marginalia/index/query/EntrySource.java
index 4b0f6405..166440f0 100644
--- a/code/index/query/java/nu/marginalia/index/query/EntrySource.java
+++ b/code/index/query/java/nu/marginalia/index/query/EntrySource.java
@@ -6,6 +6,7 @@ import nu.marginalia.array.page.LongQueryBuffer;
  */
 public interface EntrySource {
     /** Skip n entries. */
+    @Deprecated
     void skip(int n);
 
     /** Fill the buffer with entries, updating its data and length appropriately. */
diff --git a/code/libraries/array/java/nu/marginalia/array/page/LongQueryBuffer.java b/code/libraries/array/java/nu/marginalia/array/page/LongQueryBuffer.java
index 1a270af7..ba1bd2b3 100644
--- a/code/libraries/array/java/nu/marginalia/array/page/LongQueryBuffer.java
+++ b/code/libraries/array/java/nu/marginalia/array/page/LongQueryBuffer.java
@@ -3,6 +3,7 @@ package nu.marginalia.array.page;
 import nu.marginalia.array.LongArray;
 import nu.marginalia.array.LongArrayFactory;
 
+import java.nio.ByteBuffer;
 import java.util.Arrays;
 
 /** A buffer for long values that can be used to filter and manipulate the data.
@@ -164,6 +165,11 @@ public class LongQueryBuffer {
         finalizeFiltering();
     }
 
+    @SuppressWarnings("preview")
+    public ByteBuffer asByteBuffer() {
+        return data.getMemorySegment().asByteBuffer();
+    }
+
     public String toString() {
         return getClass().getSimpleName() + "[" +
                 "read = " + read +
diff --git a/code/libraries/coded-sequence/java/nu/marginalia/sequence/io/BitWriter.java b/code/libraries/coded-sequence/java/nu/marginalia/sequence/io/BitWriter.java
index 65b90830..598f7594 100644
--- a/code/libraries/coded-sequence/java/nu/marginalia/sequence/io/BitWriter.java
+++ b/code/libraries/coded-sequence/java/nu/marginalia/sequence/io/BitWriter.java
@@ -120,7 +120,8 @@ public class BitWriter {
 
     }
 
-    private void finishLastByte() {
+    /** Finish writing any partially written bit fields to the buffer */
+    public void finishLastByte() {
         // It's possible we have a few bits left over that have yet to be written
         // to the underlying buffer. We need to write them out now.
 
diff --git a/code/libraries/coded-sequence/test/nu/marginalia/sequence/BitWriterTest.java b/code/libraries/coded-sequence/test/nu/marginalia/sequence/BitWriterTest.java
index 5adb5c7e..b5404ceb 100644
--- a/code/libraries/coded-sequence/test/nu/marginalia/sequence/BitWriterTest.java
+++ b/code/libraries/coded-sequence/test/nu/marginalia/sequence/BitWriterTest.java
@@ -324,4 +324,23 @@ class BitWriterTest {
         assertEquals(2, reader.getDelta());
         assertEquals(30, reader.getDelta());
     }
+
+    /** Gamma-coded values must round-trip when each one follows a fixed-width 2-bit field. */
+    @Test
+    void testGamma2() {
+        var bitWriter = new BitWriter(ByteBuffer.allocate(8192));
+
+        // two records, each a 2-bit field followed by a gamma-coded value
+        bitWriter.putBits(0, 2);
+        bitWriter.putGamma(4);
+        bitWriter.putBits(0, 2);
+        bitWriter.putGamma(2);
+
+        var bitReader = new BitReader(bitWriter.finish());
+
+        bitReader.get(2);
+        assertEquals(4, bitReader.getGamma());
+        bitReader.get(2);
+        assertEquals(2, bitReader.getGamma());
+    }
 }
\ No newline at end of file