mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(index) Fix rare BitReader.takeWhileZero bug
Fix rare bug where the takeWhileZero method would fail to repopulate the underlying buffer. This caused intermittent de-compression errors if takeWhileZero happened at a 64 bit boundary while the underlying buffer was empty. The change also alters how sequence-lengths are encoded, to more consistently use the getGamma method instead of adding special significance to a zero first byte. Finally, assertions are added checking the invariants of the gamma and delta coding logic as well as UrlIdCodec to earlier detect issues.
This commit is contained in:
parent
dfd19b5eb9
commit
ae87e41cec
@ -37,11 +37,18 @@ public class UrlIdCodec {
|
||||
domainId &= 0x7FFF_FFFF;
|
||||
documentOrdinal &= 0x03FF_FFFF;
|
||||
|
||||
assert (domainId & 0x7FFF_FFFF) == domainId : "Domain id must be in [0, 2^31-1], was " + domainId;
|
||||
assert (documentOrdinal & 0x03FF_FFFF) == documentOrdinal : "Document ordinal must be in [0, 2^26-1], was " + documentOrdinal;
|
||||
|
||||
return ((long) domainId << 26) | documentOrdinal;
|
||||
}
|
||||
|
||||
/** Encode a URL id with a ranking element */
|
||||
public static long encodeId(int rank, int domainId, int documentOrdinal) {
|
||||
assert (rank & 0x3F) == rank : "Rank must be in [0, 63], was " + rank;
|
||||
assert (domainId & 0x7FFF_FFFF) == domainId : "Domain id must be in [0, 2^31-1], was " + domainId;
|
||||
assert (documentOrdinal & 0x03FF_FFFF) == documentOrdinal : "Document ordinal must be in [0, 2^26-1], was " + documentOrdinal;
|
||||
|
||||
domainId &= 0x7FFF_FFFF;
|
||||
documentOrdinal &= 0x03FF_FFFF;
|
||||
rank &= 0x3F;
|
||||
@ -75,7 +82,7 @@ public class UrlIdCodec {
|
||||
|
||||
/** Extract the document ordinal component from this URL id */
|
||||
public static int getRank(long combinedId) {
|
||||
return (int) (combinedId >>> 57);
|
||||
return (int) (combinedId >>> 57) & 0x3F;
|
||||
}
|
||||
|
||||
/** Mask out the ranking element from this URL id */
|
||||
|
@ -161,11 +161,13 @@ class PrioPreindexTest {
|
||||
System.out.println("Mismatch at position " + (i + pos));
|
||||
|
||||
long prevValue = documentIds[i + pos - 1];
|
||||
|
||||
assertTrue(currValue > prevValue, "Current value is not greater than previous value");
|
||||
long expectedValue = documentIds[i + pos];
|
||||
|
||||
System.out.println("Prev: " + prevValue + " -> " + UrlIdCodec.getRank(prevValue) + " " + UrlIdCodec.getDomainId(prevValue) + " " + UrlIdCodec.getDocumentOrdinal(prevValue));
|
||||
System.out.println("Curr: " + currValue + " -> " + UrlIdCodec.getRank(currValue) + " " + UrlIdCodec.getDomainId(currValue) + " " + UrlIdCodec.getDocumentOrdinal(currValue));
|
||||
System.out.println("Exp: " + expectedValue + " -> " + UrlIdCodec.getRank(expectedValue) + " " + UrlIdCodec.getDomainId(expectedValue) + " " + UrlIdCodec.getDocumentOrdinal(expectedValue));
|
||||
|
||||
assertTrue(currValue > prevValue, "Current value is not greater than previous value");
|
||||
|
||||
Assertions.fail();
|
||||
}
|
||||
|
@ -138,10 +138,6 @@ public class GammaCodedSequence implements BinarySerializable, Iterable<Integer>
|
||||
if (startPos == startLimit)
|
||||
return 0;
|
||||
|
||||
// if the first byte is zero, the sequence is empty and we can skip decoding
|
||||
if (0 == raw.get(startPos))
|
||||
return 0;
|
||||
|
||||
return EliasGammaSequenceIterator.readCount(buffer());
|
||||
}
|
||||
|
||||
@ -151,12 +147,9 @@ public class GammaCodedSequence implements BinarySerializable, Iterable<Integer>
|
||||
* or equal to zero.
|
||||
*/
|
||||
public static ByteBuffer encode(ByteBuffer workArea, IntList sequence) {
|
||||
if (sequence.isEmpty())
|
||||
return ByteBuffer.allocate(0);
|
||||
|
||||
var writer = new BitWriter(workArea);
|
||||
|
||||
writer.putGamma(sequence.size());
|
||||
writer.putGamma(sequence.size() + 1);
|
||||
|
||||
int last = 0;
|
||||
|
||||
@ -216,14 +209,7 @@ public class GammaCodedSequence implements BinarySerializable, Iterable<Integer>
|
||||
reader = new BitReader(buffer);
|
||||
|
||||
last = zero;
|
||||
int bits = 1 + reader.takeWhileZero();
|
||||
|
||||
if (!reader.hasMore()) {
|
||||
rem = 0;
|
||||
}
|
||||
else {
|
||||
rem = reader.get(bits);
|
||||
}
|
||||
rem = reader.getGamma() - 1;
|
||||
}
|
||||
|
||||
public EliasGammaSequenceIterator(ByteBuffer buffer) {
|
||||
@ -233,13 +219,7 @@ public class GammaCodedSequence implements BinarySerializable, Iterable<Integer>
|
||||
public static int readCount(ByteBuffer buffer) {
|
||||
var reader = new BitReader(buffer);
|
||||
|
||||
int bits = 1 + reader.takeWhileZero();
|
||||
if (!reader.hasMore()) {
|
||||
return 0;
|
||||
}
|
||||
else {
|
||||
return reader.get(bits);
|
||||
}
|
||||
return reader.getGamma() - 1;
|
||||
}
|
||||
|
||||
|
||||
|
@ -49,8 +49,10 @@ public class BitReader {
|
||||
|
||||
/** Read the next width bits from the buffer */
|
||||
public int get(int width) {
|
||||
if (width == 0)
|
||||
if (width == 0) {
|
||||
return 0;
|
||||
}
|
||||
assert width <= 32;
|
||||
|
||||
if (bitPosition <= 0) {
|
||||
readNext();
|
||||
@ -94,9 +96,7 @@ public class BitReader {
|
||||
do {
|
||||
// Ensure we have bits to read
|
||||
if (bitPosition <= 0) {
|
||||
if (underlying.hasRemaining())
|
||||
readNext();
|
||||
else break;
|
||||
readNext();
|
||||
}
|
||||
|
||||
// Count the number of leading zeroes in the current value
|
||||
@ -117,12 +117,24 @@ public class BitReader {
|
||||
|
||||
public int getGamma() {
|
||||
int bits = takeWhileZero();
|
||||
return get(bits + 1);
|
||||
int ret = get(bits + 1);
|
||||
|
||||
// The highest bit in the gamma coded value must be set, we can use this invariant
|
||||
// to detect data corruption early
|
||||
assert (ret & (1 << bits)) != 0 : "Highest bit in gamma coded return value not set";
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
public int getDelta() {
|
||||
int bits = getGamma();
|
||||
return get(bits);
|
||||
int ret = get(bits);
|
||||
|
||||
// The highest bit in the delta coded value must be set, we can use this invariant
|
||||
// to detect data corruption early
|
||||
assert (ret & (1 << (bits-1))) != 0 : "Highest bit in delta coded return value not set";
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
public boolean hasMore() {
|
||||
|
@ -101,6 +101,8 @@ public class BitWriter {
|
||||
|
||||
int bits = numberOfSignificantBits(value);
|
||||
|
||||
assert bits >= 1; // invariant
|
||||
|
||||
putGamma(bits);
|
||||
putBits(value, bits);
|
||||
}
|
||||
|
@ -1,6 +1,5 @@
|
||||
package nu.marginalia.sequence;
|
||||
|
||||
import it.unimi.dsi.fastutil.ints.IntList;
|
||||
import nu.marginalia.sequence.io.BitReader;
|
||||
import nu.marginalia.sequence.io.BitWriter;
|
||||
import org.junit.jupiter.api.Test;
|
||||
@ -11,15 +10,6 @@ import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
class BitReaderTest {
|
||||
|
||||
|
||||
@Test
|
||||
void emptySequence() {
|
||||
var writer = new BitWriter(ByteBuffer.allocate(1024));
|
||||
var buffer = writer.finish();
|
||||
|
||||
assertEquals(IntList.of(), new GammaCodedSequence(buffer).values());
|
||||
}
|
||||
|
||||
@Test
|
||||
void getBit() {
|
||||
var writer = new BitWriter(ByteBuffer.allocate(1024));
|
||||
@ -100,6 +90,25 @@ class BitReaderTest {
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
void getSevens2() {
|
||||
// Fuzz test that probes int32 misalignments
|
||||
var writer = new BitWriter(ByteBuffer.allocate(1024));
|
||||
|
||||
for (int i = 0; i < 729; i++) {
|
||||
writer.putBits(73, 7);
|
||||
}
|
||||
|
||||
var buffer = writer.finish();
|
||||
|
||||
var reader = new BitReader(buffer);
|
||||
|
||||
for (int i = 0; i < 729; i++) {
|
||||
int val = reader.get(7);
|
||||
assertEquals(0b1001001, val);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTakeWhileZero() {
|
||||
var writer = new BitWriter(ByteBuffer.allocate(1024));
|
||||
@ -113,17 +122,6 @@ class BitReaderTest {
|
||||
assertTrue(reader.getBit());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTakeWhileZeroAllZero() {
|
||||
var writer = new BitWriter(ByteBuffer.allocate(1024));
|
||||
writer.putBits(0, 8);
|
||||
var buffer = writer.finish();
|
||||
|
||||
var reader = new BitReader(buffer);
|
||||
int val = reader.takeWhileZero();
|
||||
assertEquals(8, val);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTakeWhileZeroOverInt64() {
|
||||
var writer = new BitWriter(ByteBuffer.allocate(1024));
|
||||
|
@ -30,6 +30,20 @@ class EliasGammaSequenceIteratorTest {
|
||||
assertEquals(expected, decoded);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCodecEmpty() {
|
||||
var ret = GammaCodedSequence.encode(work, new int[] { });
|
||||
|
||||
List<Integer> decoded = new ArrayList<>();
|
||||
List<Integer> expected = List.of();
|
||||
|
||||
var sequence = new GammaCodedSequence.EliasGammaSequenceIterator(ret);
|
||||
while (sequence.hasNext()) {
|
||||
decoded.add(sequence.nextInt());
|
||||
}
|
||||
|
||||
assertEquals(expected, decoded);
|
||||
}
|
||||
@Test
|
||||
public void valueCount() {
|
||||
var ret = GammaCodedSequence.encode(work, new int[] { 1, 3, 5, 16, 32, 64 });
|
||||
|
Loading…
Reference in New Issue
Block a user