(index) Fix rare BitReader.takeWhileZero bug

Fix rare bug where the takeWhileZero method would fail to repopulate the underlying buffer.  This caused intermittent de-compression errors if takeWhileZero happened at a 64 bit boundary while the underlying buffer was empty.

The change also alters how sequence-lengths are encoded, to more consistently use the getGamma method instead of adding special significance to a zero first byte.

Finally, assertions are added checking the invariants of the gamma and delta coding logic as well as UrlIdCodec to earlier detect issues.
This commit is contained in:
Viktor Lofgren 2024-07-16 11:03:56 +02:00
parent dfd19b5eb9
commit ae87e41cec
7 changed files with 68 additions and 53 deletions

View File

@ -37,11 +37,18 @@ public class UrlIdCodec {
domainId &= 0x7FFF_FFFF;
documentOrdinal &= 0x03FF_FFFF;
assert (domainId & 0x7FFF_FFFF) == domainId : "Domain id must be in [0, 2^31-1], was " + domainId;
assert (documentOrdinal & 0x03FF_FFFF) == documentOrdinal : "Document ordinal must be in [0, 2^26-1], was " + documentOrdinal;
return ((long) domainId << 26) | documentOrdinal;
}
/** Encode a URL id with a ranking element */
public static long encodeId(int rank, int domainId, int documentOrdinal) {
assert (rank & 0x3F) == rank : "Rank must be in [0, 63], was " + rank;
assert (domainId & 0x7FFF_FFFF) == domainId : "Domain id must be in [0, 2^31-1], was " + domainId;
assert (documentOrdinal & 0x03FF_FFFF) == documentOrdinal : "Document ordinal must be in [0, 2^26-1], was " + documentOrdinal;
domainId &= 0x7FFF_FFFF;
documentOrdinal &= 0x03FF_FFFF;
rank &= 0x3F;
@ -75,7 +82,7 @@ public class UrlIdCodec {
/** Extract the document ordinal component from this URL id */
public static int getRank(long combinedId) {
return (int) (combinedId >>> 57);
return (int) (combinedId >>> 57) & 0x3F;
}
/** Mask out the ranking element from this URL id */

View File

@ -161,11 +161,13 @@ class PrioPreindexTest {
System.out.println("Mismatch at position " + (i + pos));
long prevValue = documentIds[i + pos - 1];
assertTrue(currValue > prevValue, "Current value is not greater than previous value");
long expectedValue = documentIds[i + pos];
System.out.println("Prev: " + prevValue + " -> " + UrlIdCodec.getRank(prevValue) + " " + UrlIdCodec.getDomainId(prevValue) + " " + UrlIdCodec.getDocumentOrdinal(prevValue));
System.out.println("Curr: " + currValue + " -> " + UrlIdCodec.getRank(currValue) + " " + UrlIdCodec.getDomainId(currValue) + " " + UrlIdCodec.getDocumentOrdinal(currValue));
System.out.println("Exp: " + expectedValue + " -> " + UrlIdCodec.getRank(expectedValue) + " " + UrlIdCodec.getDomainId(expectedValue) + " " + UrlIdCodec.getDocumentOrdinal(expectedValue));
assertTrue(currValue > prevValue, "Current value is not greater than previous value");
Assertions.fail();
}

View File

@ -138,10 +138,6 @@ public class GammaCodedSequence implements BinarySerializable, Iterable<Integer>
if (startPos == startLimit)
return 0;
// if the first byte is zero, the sequence is empty and we can skip decoding
if (0 == raw.get(startPos))
return 0;
return EliasGammaSequenceIterator.readCount(buffer());
}
@ -151,12 +147,9 @@ public class GammaCodedSequence implements BinarySerializable, Iterable<Integer>
* or equal to zero.
*/
public static ByteBuffer encode(ByteBuffer workArea, IntList sequence) {
if (sequence.isEmpty())
return ByteBuffer.allocate(0);
var writer = new BitWriter(workArea);
writer.putGamma(sequence.size());
writer.putGamma(sequence.size() + 1);
int last = 0;
@ -216,14 +209,7 @@ public class GammaCodedSequence implements BinarySerializable, Iterable<Integer>
reader = new BitReader(buffer);
last = zero;
int bits = 1 + reader.takeWhileZero();
if (!reader.hasMore()) {
rem = 0;
}
else {
rem = reader.get(bits);
}
rem = reader.getGamma() - 1;
}
public EliasGammaSequenceIterator(ByteBuffer buffer) {
@ -233,13 +219,7 @@ public class GammaCodedSequence implements BinarySerializable, Iterable<Integer>
public static int readCount(ByteBuffer buffer) {
var reader = new BitReader(buffer);
int bits = 1 + reader.takeWhileZero();
if (!reader.hasMore()) {
return 0;
}
else {
return reader.get(bits);
}
return reader.getGamma() - 1;
}

View File

@ -49,8 +49,10 @@ public class BitReader {
/** Read the next width bits from the buffer */
public int get(int width) {
if (width == 0)
if (width == 0) {
return 0;
}
assert width <= 32;
if (bitPosition <= 0) {
readNext();
@ -94,9 +96,7 @@ public class BitReader {
do {
// Ensure we have bits to read
if (bitPosition <= 0) {
if (underlying.hasRemaining())
readNext();
else break;
readNext();
}
// Count the number of leading zeroes in the current value
@ -117,12 +117,24 @@ public class BitReader {
public int getGamma() {
int bits = takeWhileZero();
return get(bits + 1);
int ret = get(bits + 1);
// The highest bit in the gamma coded value must be set, we can use this invariant
// to detect data corruption early
assert (ret & (1 << bits)) != 0 : "Highest bit in gamma coded return value not set";
return ret;
}
public int getDelta() {
int bits = getGamma();
return get(bits);
int ret = get(bits);
// The highest bit in the delta coded value must be set, we can use this invariant
// to detect data corruption early
assert (ret & (1 << (bits-1))) != 0 : "Highest bit in delta coded return value not set";
return ret;
}
public boolean hasMore() {

View File

@ -101,6 +101,8 @@ public class BitWriter {
int bits = numberOfSignificantBits(value);
assert bits >= 1; // invariant
putGamma(bits);
putBits(value, bits);
}

View File

@ -1,6 +1,5 @@
package nu.marginalia.sequence;
import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.sequence.io.BitReader;
import nu.marginalia.sequence.io.BitWriter;
import org.junit.jupiter.api.Test;
@ -11,15 +10,6 @@ import static org.junit.jupiter.api.Assertions.*;
class BitReaderTest {
@Test
void emptySequence() {
var writer = new BitWriter(ByteBuffer.allocate(1024));
var buffer = writer.finish();
assertEquals(IntList.of(), new GammaCodedSequence(buffer).values());
}
@Test
void getBit() {
var writer = new BitWriter(ByteBuffer.allocate(1024));
@ -100,6 +90,25 @@ class BitReaderTest {
}
}
@Test
void getSevens2() {
// Fuzz test that probes int32 misalignments
var writer = new BitWriter(ByteBuffer.allocate(1024));
for (int i = 0; i < 729; i++) {
writer.putBits(73, 7);
}
var buffer = writer.finish();
var reader = new BitReader(buffer);
for (int i = 0; i < 729; i++) {
int val = reader.get(7);
assertEquals(0b1001001, val);
}
}
@Test
public void testTakeWhileZero() {
var writer = new BitWriter(ByteBuffer.allocate(1024));
@ -113,17 +122,6 @@ class BitReaderTest {
assertTrue(reader.getBit());
}
@Test
public void testTakeWhileZeroAllZero() {
var writer = new BitWriter(ByteBuffer.allocate(1024));
writer.putBits(0, 8);
var buffer = writer.finish();
var reader = new BitReader(buffer);
int val = reader.takeWhileZero();
assertEquals(8, val);
}
@Test
public void testTakeWhileZeroOverInt64() {
var writer = new BitWriter(ByteBuffer.allocate(1024));

View File

@ -30,6 +30,20 @@ class EliasGammaSequenceIteratorTest {
assertEquals(expected, decoded);
}
@Test
public void testCodecEmpty() {
var ret = GammaCodedSequence.encode(work, new int[] { });
List<Integer> decoded = new ArrayList<>();
List<Integer> expected = List.of();
var sequence = new GammaCodedSequence.EliasGammaSequenceIterator(ret);
while (sequence.hasNext()) {
decoded.add(sequence.nextInt());
}
assertEquals(expected, decoded);
}
@Test
public void valueCount() {
var ret = GammaCodedSequence.encode(work, new int[] { 1, 3, 5, 16, 32, 64 });