mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
(index) Fix rare BitReader.takeWhileZero bug
Fix a rare bug where the takeWhileZero method would fail to repopulate the underlying buffer. This caused intermittent decompression errors if takeWhileZero was called at a 64-bit boundary while the underlying buffer was empty. The change also alters how sequence lengths are encoded, to more consistently use the getGamma method instead of attaching special significance to a zero first byte. Finally, assertions are added that check the invariants of the gamma and delta coding logic, as well as UrlIdCodec, to detect issues earlier.
This commit is contained in:
parent
dfd19b5eb9
commit
ae87e41cec
@ -37,11 +37,18 @@ public class UrlIdCodec {
|
|||||||
domainId &= 0x7FFF_FFFF;
|
domainId &= 0x7FFF_FFFF;
|
||||||
documentOrdinal &= 0x03FF_FFFF;
|
documentOrdinal &= 0x03FF_FFFF;
|
||||||
|
|
||||||
|
assert (domainId & 0x7FFF_FFFF) == domainId : "Domain id must be in [0, 2^31-1], was " + domainId;
|
||||||
|
assert (documentOrdinal & 0x03FF_FFFF) == documentOrdinal : "Document ordinal must be in [0, 2^26-1], was " + documentOrdinal;
|
||||||
|
|
||||||
return ((long) domainId << 26) | documentOrdinal;
|
return ((long) domainId << 26) | documentOrdinal;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Encode a URL id with a ranking element */
|
/** Encode a URL id with a ranking element */
|
||||||
public static long encodeId(int rank, int domainId, int documentOrdinal) {
|
public static long encodeId(int rank, int domainId, int documentOrdinal) {
|
||||||
|
assert (rank & 0x3F) == rank : "Rank must be in [0, 63], was " + rank;
|
||||||
|
assert (domainId & 0x7FFF_FFFF) == domainId : "Domain id must be in [0, 2^31-1], was " + domainId;
|
||||||
|
assert (documentOrdinal & 0x03FF_FFFF) == documentOrdinal : "Document ordinal must be in [0, 2^26-1], was " + documentOrdinal;
|
||||||
|
|
||||||
domainId &= 0x7FFF_FFFF;
|
domainId &= 0x7FFF_FFFF;
|
||||||
documentOrdinal &= 0x03FF_FFFF;
|
documentOrdinal &= 0x03FF_FFFF;
|
||||||
rank &= 0x3F;
|
rank &= 0x3F;
|
||||||
@ -75,7 +82,7 @@ public class UrlIdCodec {
|
|||||||
|
|
||||||
/** Extract the document ordinal component from this URL id */
|
/** Extract the document ordinal component from this URL id */
|
||||||
public static int getRank(long combinedId) {
|
public static int getRank(long combinedId) {
|
||||||
return (int) (combinedId >>> 57);
|
return (int) (combinedId >>> 57) & 0x3F;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Mask out the ranking element from this URL id */
|
/** Mask out the ranking element from this URL id */
|
||||||
|
@ -161,11 +161,13 @@ class PrioPreindexTest {
|
|||||||
System.out.println("Mismatch at position " + (i + pos));
|
System.out.println("Mismatch at position " + (i + pos));
|
||||||
|
|
||||||
long prevValue = documentIds[i + pos - 1];
|
long prevValue = documentIds[i + pos - 1];
|
||||||
|
long expectedValue = documentIds[i + pos];
|
||||||
assertTrue(currValue > prevValue, "Current value is not greater than previous value");
|
|
||||||
|
|
||||||
System.out.println("Prev: " + prevValue + " -> " + UrlIdCodec.getRank(prevValue) + " " + UrlIdCodec.getDomainId(prevValue) + " " + UrlIdCodec.getDocumentOrdinal(prevValue));
|
System.out.println("Prev: " + prevValue + " -> " + UrlIdCodec.getRank(prevValue) + " " + UrlIdCodec.getDomainId(prevValue) + " " + UrlIdCodec.getDocumentOrdinal(prevValue));
|
||||||
System.out.println("Curr: " + currValue + " -> " + UrlIdCodec.getRank(currValue) + " " + UrlIdCodec.getDomainId(currValue) + " " + UrlIdCodec.getDocumentOrdinal(currValue));
|
System.out.println("Curr: " + currValue + " -> " + UrlIdCodec.getRank(currValue) + " " + UrlIdCodec.getDomainId(currValue) + " " + UrlIdCodec.getDocumentOrdinal(currValue));
|
||||||
|
System.out.println("Exp: " + expectedValue + " -> " + UrlIdCodec.getRank(expectedValue) + " " + UrlIdCodec.getDomainId(expectedValue) + " " + UrlIdCodec.getDocumentOrdinal(expectedValue));
|
||||||
|
|
||||||
|
assertTrue(currValue > prevValue, "Current value is not greater than previous value");
|
||||||
|
|
||||||
Assertions.fail();
|
Assertions.fail();
|
||||||
}
|
}
|
||||||
|
@ -138,10 +138,6 @@ public class GammaCodedSequence implements BinarySerializable, Iterable<Integer>
|
|||||||
if (startPos == startLimit)
|
if (startPos == startLimit)
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
// if the first byte is zero, the sequence is empty and we can skip decoding
|
|
||||||
if (0 == raw.get(startPos))
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
return EliasGammaSequenceIterator.readCount(buffer());
|
return EliasGammaSequenceIterator.readCount(buffer());
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -151,12 +147,9 @@ public class GammaCodedSequence implements BinarySerializable, Iterable<Integer>
|
|||||||
* or equal to zero.
|
* or equal to zero.
|
||||||
*/
|
*/
|
||||||
public static ByteBuffer encode(ByteBuffer workArea, IntList sequence) {
|
public static ByteBuffer encode(ByteBuffer workArea, IntList sequence) {
|
||||||
if (sequence.isEmpty())
|
|
||||||
return ByteBuffer.allocate(0);
|
|
||||||
|
|
||||||
var writer = new BitWriter(workArea);
|
var writer = new BitWriter(workArea);
|
||||||
|
|
||||||
writer.putGamma(sequence.size());
|
writer.putGamma(sequence.size() + 1);
|
||||||
|
|
||||||
int last = 0;
|
int last = 0;
|
||||||
|
|
||||||
@ -216,14 +209,7 @@ public class GammaCodedSequence implements BinarySerializable, Iterable<Integer>
|
|||||||
reader = new BitReader(buffer);
|
reader = new BitReader(buffer);
|
||||||
|
|
||||||
last = zero;
|
last = zero;
|
||||||
int bits = 1 + reader.takeWhileZero();
|
rem = reader.getGamma() - 1;
|
||||||
|
|
||||||
if (!reader.hasMore()) {
|
|
||||||
rem = 0;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
rem = reader.get(bits);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public EliasGammaSequenceIterator(ByteBuffer buffer) {
|
public EliasGammaSequenceIterator(ByteBuffer buffer) {
|
||||||
@ -233,13 +219,7 @@ public class GammaCodedSequence implements BinarySerializable, Iterable<Integer>
|
|||||||
public static int readCount(ByteBuffer buffer) {
|
public static int readCount(ByteBuffer buffer) {
|
||||||
var reader = new BitReader(buffer);
|
var reader = new BitReader(buffer);
|
||||||
|
|
||||||
int bits = 1 + reader.takeWhileZero();
|
return reader.getGamma() - 1;
|
||||||
if (!reader.hasMore()) {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
return reader.get(bits);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -49,8 +49,10 @@ public class BitReader {
|
|||||||
|
|
||||||
/** Read the next width bits from the buffer */
|
/** Read the next width bits from the buffer */
|
||||||
public int get(int width) {
|
public int get(int width) {
|
||||||
if (width == 0)
|
if (width == 0) {
|
||||||
return 0;
|
return 0;
|
||||||
|
}
|
||||||
|
assert width <= 32;
|
||||||
|
|
||||||
if (bitPosition <= 0) {
|
if (bitPosition <= 0) {
|
||||||
readNext();
|
readNext();
|
||||||
@ -94,9 +96,7 @@ public class BitReader {
|
|||||||
do {
|
do {
|
||||||
// Ensure we have bits to read
|
// Ensure we have bits to read
|
||||||
if (bitPosition <= 0) {
|
if (bitPosition <= 0) {
|
||||||
if (underlying.hasRemaining())
|
readNext();
|
||||||
readNext();
|
|
||||||
else break;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Count the number of leading zeroes in the current value
|
// Count the number of leading zeroes in the current value
|
||||||
@ -117,12 +117,24 @@ public class BitReader {
|
|||||||
|
|
||||||
public int getGamma() {
|
public int getGamma() {
|
||||||
int bits = takeWhileZero();
|
int bits = takeWhileZero();
|
||||||
return get(bits + 1);
|
int ret = get(bits + 1);
|
||||||
|
|
||||||
|
// The highest bit in the gamma coded value must be set, we can use this invariant
|
||||||
|
// to detect data corruption early
|
||||||
|
assert (ret & (1 << bits)) != 0 : "Highest bit in gamma coded return value not set";
|
||||||
|
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getDelta() {
|
public int getDelta() {
|
||||||
int bits = getGamma();
|
int bits = getGamma();
|
||||||
return get(bits);
|
int ret = get(bits);
|
||||||
|
|
||||||
|
// The highest bit in the delta coded value must be set, we can use this invariant
|
||||||
|
// to detect data corruption early
|
||||||
|
assert (ret & (1 << (bits-1))) != 0 : "Highest bit in delta coded return value not set";
|
||||||
|
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean hasMore() {
|
public boolean hasMore() {
|
||||||
|
@ -101,6 +101,8 @@ public class BitWriter {
|
|||||||
|
|
||||||
int bits = numberOfSignificantBits(value);
|
int bits = numberOfSignificantBits(value);
|
||||||
|
|
||||||
|
assert bits >= 1; // invariant
|
||||||
|
|
||||||
putGamma(bits);
|
putGamma(bits);
|
||||||
putBits(value, bits);
|
putBits(value, bits);
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,5 @@
|
|||||||
package nu.marginalia.sequence;
|
package nu.marginalia.sequence;
|
||||||
|
|
||||||
import it.unimi.dsi.fastutil.ints.IntList;
|
|
||||||
import nu.marginalia.sequence.io.BitReader;
|
import nu.marginalia.sequence.io.BitReader;
|
||||||
import nu.marginalia.sequence.io.BitWriter;
|
import nu.marginalia.sequence.io.BitWriter;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
@ -11,15 +10,6 @@ import static org.junit.jupiter.api.Assertions.*;
|
|||||||
|
|
||||||
class BitReaderTest {
|
class BitReaderTest {
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
void emptySequence() {
|
|
||||||
var writer = new BitWriter(ByteBuffer.allocate(1024));
|
|
||||||
var buffer = writer.finish();
|
|
||||||
|
|
||||||
assertEquals(IntList.of(), new GammaCodedSequence(buffer).values());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void getBit() {
|
void getBit() {
|
||||||
var writer = new BitWriter(ByteBuffer.allocate(1024));
|
var writer = new BitWriter(ByteBuffer.allocate(1024));
|
||||||
@ -100,6 +90,25 @@ class BitReaderTest {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void getSevens2() {
|
||||||
|
// Fuzz test that probes int32 misalignments
|
||||||
|
var writer = new BitWriter(ByteBuffer.allocate(1024));
|
||||||
|
|
||||||
|
for (int i = 0; i < 729; i++) {
|
||||||
|
writer.putBits(73, 7);
|
||||||
|
}
|
||||||
|
|
||||||
|
var buffer = writer.finish();
|
||||||
|
|
||||||
|
var reader = new BitReader(buffer);
|
||||||
|
|
||||||
|
for (int i = 0; i < 729; i++) {
|
||||||
|
int val = reader.get(7);
|
||||||
|
assertEquals(0b1001001, val);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testTakeWhileZero() {
|
public void testTakeWhileZero() {
|
||||||
var writer = new BitWriter(ByteBuffer.allocate(1024));
|
var writer = new BitWriter(ByteBuffer.allocate(1024));
|
||||||
@ -113,17 +122,6 @@ class BitReaderTest {
|
|||||||
assertTrue(reader.getBit());
|
assertTrue(reader.getBit());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testTakeWhileZeroAllZero() {
|
|
||||||
var writer = new BitWriter(ByteBuffer.allocate(1024));
|
|
||||||
writer.putBits(0, 8);
|
|
||||||
var buffer = writer.finish();
|
|
||||||
|
|
||||||
var reader = new BitReader(buffer);
|
|
||||||
int val = reader.takeWhileZero();
|
|
||||||
assertEquals(8, val);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testTakeWhileZeroOverInt64() {
|
public void testTakeWhileZeroOverInt64() {
|
||||||
var writer = new BitWriter(ByteBuffer.allocate(1024));
|
var writer = new BitWriter(ByteBuffer.allocate(1024));
|
||||||
|
@ -30,6 +30,20 @@ class EliasGammaSequenceIteratorTest {
|
|||||||
assertEquals(expected, decoded);
|
assertEquals(expected, decoded);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testCodecEmpty() {
|
||||||
|
var ret = GammaCodedSequence.encode(work, new int[] { });
|
||||||
|
|
||||||
|
List<Integer> decoded = new ArrayList<>();
|
||||||
|
List<Integer> expected = List.of();
|
||||||
|
|
||||||
|
var sequence = new GammaCodedSequence.EliasGammaSequenceIterator(ret);
|
||||||
|
while (sequence.hasNext()) {
|
||||||
|
decoded.add(sequence.nextInt());
|
||||||
|
}
|
||||||
|
|
||||||
|
assertEquals(expected, decoded);
|
||||||
|
}
|
||||||
@Test
|
@Test
|
||||||
public void valueCount() {
|
public void valueCount() {
|
||||||
var ret = GammaCodedSequence.encode(work, new int[] { 1, 3, 5, 16, 32, 64 });
|
var ret = GammaCodedSequence.encode(work, new int[] { 1, 3, 5, 16, 32, 64 });
|
||||||
|
Loading…
Reference in New Issue
Block a user