diff --git a/code/common/model/java/nu/marginalia/model/id/UrlIdCodec.java b/code/common/model/java/nu/marginalia/model/id/UrlIdCodec.java index a8c9af28..f9514c3a 100644 --- a/code/common/model/java/nu/marginalia/model/id/UrlIdCodec.java +++ b/code/common/model/java/nu/marginalia/model/id/UrlIdCodec.java @@ -37,11 +37,18 @@ public class UrlIdCodec { domainId &= 0x7FFF_FFFF; documentOrdinal &= 0x03FF_FFFF; + assert (domainId & 0x7FFF_FFFF) == domainId : "Domain id must be in [0, 2^31-1], was " + domainId; + assert (documentOrdinal & 0x03FF_FFFF) == documentOrdinal : "Document ordinal must be in [0, 2^26-1], was " + documentOrdinal; + return ((long) domainId << 26) | documentOrdinal; } /** Encode a URL id with a ranking element */ public static long encodeId(int rank, int domainId, int documentOrdinal) { + assert (rank & 0x3F) == rank : "Rank must be in [0, 63], was " + rank; + assert (domainId & 0x7FFF_FFFF) == domainId : "Domain id must be in [0, 2^31-1], was " + domainId; + assert (documentOrdinal & 0x03FF_FFFF) == documentOrdinal : "Document ordinal must be in [0, 2^26-1], was " + documentOrdinal; + domainId &= 0x7FFF_FFFF; documentOrdinal &= 0x03FF_FFFF; rank &= 0x3F; @@ -75,7 +82,7 @@ public class UrlIdCodec { /** Extract the document ordinal component from this URL id */ public static int getRank(long combinedId) { - return (int) (combinedId >>> 57); + return (int) (combinedId >>> 57) & 0x3F; } /** Mask out the ranking element from this URL id */ diff --git a/code/index/index-reverse/test/nu/marginalia/index/construction/prio/PrioPreindexTest.java b/code/index/index-reverse/test/nu/marginalia/index/construction/prio/PrioPreindexTest.java index 5b323d73..413b5b8b 100644 --- a/code/index/index-reverse/test/nu/marginalia/index/construction/prio/PrioPreindexTest.java +++ b/code/index/index-reverse/test/nu/marginalia/index/construction/prio/PrioPreindexTest.java @@ -161,11 +161,13 @@ class PrioPreindexTest { System.out.println("Mismatch at position " + (i + pos)); long prevValue = documentIds[i + pos - 1]; - - assertTrue(currValue > prevValue, "Current value is not greater than previous value"); + long expectedValue = documentIds[i + pos]; System.out.println("Prev: " + prevValue + " -> " + UrlIdCodec.getRank(prevValue) + " " + UrlIdCodec.getDomainId(prevValue) + " " + UrlIdCodec.getDocumentOrdinal(prevValue)); System.out.println("Curr: " + currValue + " -> " + UrlIdCodec.getRank(currValue) + " " + UrlIdCodec.getDomainId(currValue) + " " + UrlIdCodec.getDocumentOrdinal(currValue)); + System.out.println("Exp: " + expectedValue + " -> " + UrlIdCodec.getRank(expectedValue) + " " + UrlIdCodec.getDomainId(expectedValue) + " " + UrlIdCodec.getDocumentOrdinal(expectedValue)); + + assertTrue(currValue > prevValue, "Current value is not greater than previous value"); Assertions.fail(); } diff --git a/code/libraries/coded-sequence/java/nu/marginalia/sequence/GammaCodedSequence.java b/code/libraries/coded-sequence/java/nu/marginalia/sequence/GammaCodedSequence.java index 301b1c96..75eba781 100644 --- a/code/libraries/coded-sequence/java/nu/marginalia/sequence/GammaCodedSequence.java +++ b/code/libraries/coded-sequence/java/nu/marginalia/sequence/GammaCodedSequence.java @@ -138,10 +138,6 @@ public class GammaCodedSequence implements BinarySerializable, Iterable if (startPos == startLimit) return 0; - // if the first byte is zero, the sequence is empty and we can skip decoding - if (0 == raw.get(startPos)) - return 0; - return EliasGammaSequenceIterator.readCount(buffer()); } @@ -151,12 +147,9 @@ public class GammaCodedSequence implements BinarySerializable, Iterable * or equal to zero. */ public static ByteBuffer encode(ByteBuffer workArea, IntList sequence) { - if (sequence.isEmpty()) - return ByteBuffer.allocate(0); - var writer = new BitWriter(workArea); - writer.putGamma(sequence.size()); + writer.putGamma(sequence.size() + 1); int last = 0; @@ -216,14 +209,7 @@ public class GammaCodedSequence implements BinarySerializable, Iterable reader = new BitReader(buffer); last = zero; - int bits = 1 + reader.takeWhileZero(); - - if (!reader.hasMore()) { - rem = 0; - } - else { - rem = reader.get(bits); - } + rem = reader.getGamma() - 1; } public EliasGammaSequenceIterator(ByteBuffer buffer) { @@ -233,13 +219,7 @@ public class GammaCodedSequence implements BinarySerializable, Iterable public static int readCount(ByteBuffer buffer) { var reader = new BitReader(buffer); - int bits = 1 + reader.takeWhileZero(); - if (!reader.hasMore()) { - return 0; - } - else { - return reader.get(bits); - } + return reader.getGamma() - 1; } diff --git a/code/libraries/coded-sequence/java/nu/marginalia/sequence/io/BitReader.java b/code/libraries/coded-sequence/java/nu/marginalia/sequence/io/BitReader.java index 524e2a6b..03f553c2 100644 --- a/code/libraries/coded-sequence/java/nu/marginalia/sequence/io/BitReader.java +++ b/code/libraries/coded-sequence/java/nu/marginalia/sequence/io/BitReader.java @@ -49,8 +49,10 @@ public class BitReader { /** Read the next width bits from the buffer */ public int get(int width) { - if (width == 0) + if (width == 0) { return 0; + } + assert width <= 32; if (bitPosition <= 0) { readNext(); @@ -94,9 +96,7 @@ public class BitReader { do { // Ensure we have bits to read if (bitPosition <= 0) { - if (underlying.hasRemaining()) - readNext(); - else break; + readNext(); } // Count the number of leading zeroes in the current value @@ -117,12 +117,24 @@ public class BitReader { public int getGamma() { int bits = takeWhileZero(); - return get(bits + 1); + int ret = get(bits + 1); + + // The highest bit in the gamma coded value must be set, we can use this invariant + // to detect data corruption early + assert (ret & (1 << bits)) != 0 : "Highest bit in gamma coded return value not set"; + + return ret; } public int getDelta() { int bits = getGamma(); - return get(bits); + int ret = get(bits); + + // The highest bit in the delta coded value must be set, we can use this invariant + // to detect data corruption early + assert (ret & (1 << (bits-1))) != 0 : "Highest bit in delta coded return value not set"; + + return ret; } public boolean hasMore() { diff --git a/code/libraries/coded-sequence/java/nu/marginalia/sequence/io/BitWriter.java b/code/libraries/coded-sequence/java/nu/marginalia/sequence/io/BitWriter.java index 598f7594..57455541 100644 --- a/code/libraries/coded-sequence/java/nu/marginalia/sequence/io/BitWriter.java +++ b/code/libraries/coded-sequence/java/nu/marginalia/sequence/io/BitWriter.java @@ -101,6 +101,8 @@ public class BitWriter { int bits = numberOfSignificantBits(value); + assert bits >= 1; // invariant + putGamma(bits); putBits(value, bits); } diff --git a/code/libraries/coded-sequence/test/nu/marginalia/sequence/BitReaderTest.java b/code/libraries/coded-sequence/test/nu/marginalia/sequence/BitReaderTest.java index 9218e269..5488ffb8 100644 --- a/code/libraries/coded-sequence/test/nu/marginalia/sequence/BitReaderTest.java +++ b/code/libraries/coded-sequence/test/nu/marginalia/sequence/BitReaderTest.java @@ -1,6 +1,5 @@ package nu.marginalia.sequence; -import it.unimi.dsi.fastutil.ints.IntList; import nu.marginalia.sequence.io.BitReader; import nu.marginalia.sequence.io.BitWriter; import org.junit.jupiter.api.Test; @@ -11,15 +10,6 @@ import static org.junit.jupiter.api.Assertions.*; class BitReaderTest { - - @Test - void emptySequence() { - var writer = new BitWriter(ByteBuffer.allocate(1024)); - var buffer = writer.finish(); - - assertEquals(IntList.of(), new GammaCodedSequence(buffer).values()); - } - @Test void getBit() { var writer = new BitWriter(ByteBuffer.allocate(1024)); @@ -100,6 +90,25 @@ class BitReaderTest { } } + @Test + void getSevens2() { + // Fuzz test that probes int32 misalignments + var writer = new BitWriter(ByteBuffer.allocate(1024)); + + for (int i = 0; i < 729; i++) { + writer.putBits(73, 7); + } + + var buffer = writer.finish(); + + var reader = new BitReader(buffer); + + for (int i = 0; i < 729; i++) { + int val = reader.get(7); + assertEquals(0b1001001, val); + } + } + @Test public void testTakeWhileZero() { var writer = new BitWriter(ByteBuffer.allocate(1024)); @@ -113,17 +122,6 @@ class BitReaderTest { assertTrue(reader.getBit()); } - @Test - public void testTakeWhileZeroAllZero() { - var writer = new BitWriter(ByteBuffer.allocate(1024)); - writer.putBits(0, 8); - var buffer = writer.finish(); - - var reader = new BitReader(buffer); - int val = reader.takeWhileZero(); - assertEquals(8, val); - } - @Test public void testTakeWhileZeroOverInt64() { var writer = new BitWriter(ByteBuffer.allocate(1024)); diff --git a/code/libraries/coded-sequence/test/nu/marginalia/sequence/EliasGammaSequenceIteratorTest.java b/code/libraries/coded-sequence/test/nu/marginalia/sequence/EliasGammaSequenceIteratorTest.java index 1e48d0ae..518db595 100644 --- a/code/libraries/coded-sequence/test/nu/marginalia/sequence/EliasGammaSequenceIteratorTest.java +++ b/code/libraries/coded-sequence/test/nu/marginalia/sequence/EliasGammaSequenceIteratorTest.java @@ -30,6 +30,20 @@ class EliasGammaSequenceIteratorTest { assertEquals(expected, decoded); } + @Test + public void testCodecEmpty() { + var ret = GammaCodedSequence.encode(work, new int[] { }); + + List decoded = new ArrayList<>(); + List expected = List.of(); + + var sequence = new GammaCodedSequence.EliasGammaSequenceIterator(ret); + while (sequence.hasNext()) { + decoded.add(sequence.nextInt()); + } + + assertEquals(expected, decoded); + } @Test public void valueCount() { var ret = GammaCodedSequence.encode(work, new int[] { 1, 3, 5, 16, 32, 64 });