(gamma) Implement a small library for Elias gamma coding an integer sequence

This commit is contained in:
Viktor Lofgren 2024-05-30 14:17:23 +02:00
parent 619392edf9
commit 0112ae725c
10 changed files with 971 additions and 0 deletions

View File

@ -0,0 +1,26 @@
// Build script for the coded-sequence library module.
plugins {
id 'java'
}
// Compile with the JVM version configured once on the root project.
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
}
}
// Shared script from the root project; presumably configures the source sets (confirm in srcsets.gradle).
apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation libs.bundles.slf4j
// NOTE(review): presumably the home of blue.strategic.parquet.BinarySerializable, which GammaCodedSequence implements - confirm
implementation project(':third-party:parquet-floor')
// Primitive int collections (IntList, IntIterator, IntArrayList) used by the codec API
implementation libs.fastutil
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
}
test {
useJUnitPlatform()
}

View File

@ -0,0 +1,91 @@
package nu.marginalia.sequence;
import it.unimi.dsi.fastutil.ints.IntIterator;
import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.sequence.io.BitReader;
import nu.marginalia.sequence.io.BitWriter;
import java.nio.ByteBuffer;
/** Implement coding and decoding of sequences of integers using the Elias Gamma code.
 * <p>
 * The sequence is delta coded: each value is stored as the difference to its
 * predecessor (the first value as the difference to zero). For a delta whose
 * highest set bit has index {@code b}, the encoder emits {@code b + 1} zero bits
 * followed by the {@code b + 1} lowest bits of the delta; the decoder counts the
 * zero bits and then reads that many bits back.
 * <p>
 * An instance of this class is the decoding iterator; obtain one via
 * {@link #decode(ByteBuffer)}.
 * <p>
 * https://en.wikipedia.org/wiki/Elias_gamma_coding
 * */
public class EliasGammaCodec implements IntIterator {
    private final BitReader reader;

    /** The most recently decoded value, i.e. the running sum of all deltas read so far */
    private int last = 0;

    /** A decoded value that has not yet been handed out by nextInt();
     * values are always positive, so anything &lt;= 0 means "nothing buffered" */
    private int next = 0;

    private EliasGammaCodec(ByteBuffer buffer) {
        reader = new BitReader(buffer);
    }

    /** Decode a sequence of integers from a ByteBuffer using the Elias Gamma code.
     * The returned iterator reads from the buffer's current position and
     * advances it as values are consumed.
     */
    public static IntIterator decode(ByteBuffer buffer) {
        return new EliasGammaCodec(buffer);
    }

    /** Encode a sequence of integers into a ByteBuffer using the Elias Gamma code.
     * The sequence must be strictly increasing and may not contain values less than
     * or equal to zero.
     *
     * @param workArea scratch buffer the bits are staged in; it is cleared first
     * @param sequence the values to encode
     * @return a new buffer holding exactly the encoded bytes
     * @throws IllegalArgumentException if the sequence is not strictly increasing
     *         or contains a value less than or equal to zero
     */
    public static ByteBuffer encode(ByteBuffer workArea, IntList sequence) {
        var writer = new BitWriter(workArea);

        int last = 0;

        for (var iter = sequence.iterator(); iter.hasNext(); ) {
            int i = iter.nextInt();

            // A delta of zero cannot be represented, and a non-increasing sequence would
            // silently produce an undecodable stream. This is checked on the values rather
            // than on the delta (which can overflow), and always, not only when the JVM
            // runs with assertions enabled.
            if (i <= last) {
                throw new IllegalArgumentException(
                        "Sequence must be strictly increasing and may not contain zeroes or negative values"
                                + " (got " + i + " after " + last + ")");
            }

            int delta = i - last;
            last = i;

            // Index of the highest set bit in delta
            int bits = Integer.numberOfTrailingZeros(Integer.highestOneBit(delta));

            // Length prefix: bits + 1 zeroes, terminated by the leading 1 of the payload
            writer.put(0, bits + 1);
            // Payload: the bits + 1 significant bits of delta
            writer.put(delta, bits + 1);
        }

        return writer.finish();
    }

    /** Encode a sequence of integers into a ByteBuffer using the Elias Gamma code.
     * The sequence must be strictly increasing and may not contain values less than
     * or equal to zero.
     *
     * @throws IllegalArgumentException if the sequence is not strictly increasing
     *         or contains a value less than or equal to zero
     */
    public static ByteBuffer encode(ByteBuffer workArea, int[] sequence) {
        return encode(workArea, IntList.of(sequence));
    }

    /** Decodes the next value ahead of time if one is available.
     * Calling this repeatedly without calling nextInt() does not consume
     * additional data.
     */
    @Override
    public boolean hasNext() {
        if (next > 0)
            return true;
        if (!reader.hasMore())
            return false;

        // The number of zero bits is the width of the value that follows
        int bits = reader.takeWhileZero();

        // If only zero bits remained, they were the padding of the last byte
        // and there is no further value
        if (!reader.hasMore()) {
            return false;
        }

        int delta = reader.get(bits);
        last += delta;
        next = last;

        return true;
    }

    /** Return the next decoded value.
     *
     * @throws ArrayIndexOutOfBoundsException if the sequence is exhausted
     */
    @Override
    public int nextInt() {
        if (hasNext()) {
            int ret = next;
            // Mark the buffered value as consumed
            next = -1;
            return ret;
        }
        throw new ArrayIndexOutOfBoundsException("No more data to read");
    }
}

View File

@ -0,0 +1,97 @@
package nu.marginalia.sequence;
import blue.strategic.parquet.BinarySerializable;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntIterator;
import it.unimi.dsi.fastutil.ints.IntList;
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.StringJoiner;
/** A sequence of integers encoded using the Elias Gamma code,
 * the class wraps a ByteBuffer containing the encoded sequence,
 * and offers convenience methods for decoding and iterating
 * over the data.
 * <p>
 * The entire capacity of the wrapped buffer is treated as encoded data;
 * its position and limit are ignored and are never modified by this class.
 * */
public class GammaCodedSequence implements BinarySerializable, Iterable<Integer> {
    private final ByteBuffer raw;

    /** Create a new GammaCodedSequence from a sequence of integers.
     *
     * The sequence must be strictly increasing and may not contain
     * values less than or equal to zero.
     * */
    public static GammaCodedSequence generate(ByteBuffer workArea, int... values) {
        return new GammaCodedSequence(EliasGammaCodec.encode(workArea, values));
    }

    /** Create a new GammaCodedSequence from a sequence of integers.
     *
     * The sequence must be strictly increasing and may not contain
     * values less than or equal to zero.
     * */
    public static GammaCodedSequence generate(ByteBuffer workArea, IntList values) {
        return new GammaCodedSequence(EliasGammaCodec.encode(workArea, values));
    }

    /** Wrap a buffer of already encoded data. The buffer is not copied. */
    public GammaCodedSequence(ByteBuffer bytes) {
        this.raw = bytes;
    }

    /** Create a sequence from a copy of the given encoded bytes. */
    public GammaCodedSequence(byte[] bytes) {
        raw = ByteBuffer.allocate(bytes.length);
        raw.put(bytes);
        raw.clear();
    }

    /** Returns an independent view spanning the full capacity of the underlying buffer.
     * Reading through the view leaves the position of {@code raw} untouched, so
     * iterators and the equals/hashCode/bytes methods cannot disturb one another.
     */
    private ByteBuffer fullView() {
        ByteBuffer view = raw.duplicate();
        view.clear();
        return view;
    }

    /** Return the raw bytes of the sequence.
     * <p>
     * When the underlying buffer is backed by an array that it spans exactly,
     * that array is returned without copying, so it must not be modified by the caller.
     * */
    @Override
    public byte[] bytes() {
        if (raw.hasArray()
                && raw.arrayOffset() == 0
                && raw.array().length == raw.capacity()) {
            return raw.array();
        }

        // Direct, read-only or sliced buffer: copy out exactly the bytes the iterator would see
        byte[] bytes = new byte[raw.capacity()];
        fullView().get(bytes, 0, bytes.length);
        return bytes;
    }

    /** Return an iterator that decodes the sequence from the start.
     * Each iterator has its own read position.
     */
    @Override
    public IntIterator iterator() {
        return EliasGammaCodec.decode(fullView());
    }

    /** Decode the sequence into an IntList;
     * this is a somewhat slow operation,
     * iterating over the data directly is more performant */
    public IntList decode() {
        IntArrayList ret = new IntArrayList(8);

        var iter = iterator();
        while (iter.hasNext()) {
            ret.add(iter.nextInt());
        }

        return ret;
    }

    /** Hash of the encoded bytes; consistent with {@link #equals(Object)}
     * and unaffected by how far any iterator has progressed. */
    @Override
    public int hashCode() {
        return fullView().hashCode();
    }

    /** Two sequences are equal when their encoded bytes are identical. */
    @Override
    public boolean equals(Object obj) {
        return obj instanceof GammaCodedSequence other && Arrays.equals(bytes(), other.bytes());
    }

    /** Render the decoded values as {@code [a, b, c]}. */
    @Override
    public String toString() {
        StringJoiner sj = new StringJoiner(", ", "[", "]");
        for (var iter = iterator(); iter.hasNext(); ) {
            sj.add(Integer.toString(iter.nextInt()));
        }
        return sj.toString();
    }
}

View File

@ -0,0 +1,134 @@
package nu.marginalia.sequence.io;
import java.nio.ByteBuffer;
/** A utility class for reading bits from a ByteBuffer
 * out of alignment with octet boundaries.
 * <p>
 * Bits are consumed most significant first. Data is fetched from the buffer
 * in chunks of 64, 32, 16 or 8 bits depending on how many bytes remain, using
 * the buffer's relative get methods (so the buffer's position advances).
 */
public class BitReader {
    private final ByteBuffer underlying;

    /** The current value being decoded */
    private long currentValue;

    /** Number of unread bits left in the current value; the next bit to be read
     * is bit number (bitPosition - 1). May drop below zero when takeWhileZero()
     * runs off the end of the data, which is why "empty" is tested as &lt;= 0. */
    private int bitPosition;

    public BitReader(ByteBuffer buffer) {
        this.underlying = buffer;
        this.bitPosition = 0;
        this.currentValue = 0;
    }

    /** Read the next bit from the buffer
     *
     * @throws ArrayIndexOutOfBoundsException if there is no more data
     */
    public boolean getBit() {
        if (bitPosition <= 0) {
            readNext();
        }

        // Return the bit at the current position, then decrement the position
        return (currentValue & (1L << (--bitPosition))) != 0;
    }

    /** Read the next width bits from the buffer
     *
     * @param width number of bits to read, 0 to 32
     * @return the bits read, right aligned in the result
     * @throws ArrayIndexOutOfBoundsException if the data runs out before width bits are read
     */
    public int get(int width) {
        if (width == 0)
            return 0;

        if (bitPosition <= 0) {
            readNext();
        }

        int result = 0;

        while (width > 0) {
            int dw = bitPosition - width;

            if (dw >= 0) { // We have enough bits in the current value to satisfy the request
                // The mask is computed as a long: as an int, (1 << 32) wraps around
                // to 1 and the mask for a full 32 bit read would come out as zero
                result |= (int) ((currentValue >>> dw) & ((1L << width) - 1));

                // Update the bit position
                bitPosition -= width;

                // We've read all the bits we need
                width = 0;
            } else { // We need to split the value between two successive chunks
                // Extract the remaining bits from the current value to the result
                // and shift them to the left to leave room for the bits still to be read
                result |= (int) ((currentValue & ((1L << bitPosition) - 1)) << -dw);

                // Decrement the number of bits left to read by the number of bits read
                // so that we read the remainder as we loop around
                width -= bitPosition;

                // Fetch the next chunk, this resets bitPosition to the chunk's size
                readNext();
            }
        }

        return result;
    }

    /** Read bits until a 1 is encountered, or the data ends.
     * The terminating 1 bit is not consumed.
     *
     * @return the number of zero bits that were read
     * @throws ArrayIndexOutOfBoundsException if there is no data at all left when called
     */
    public int takeWhileZero() {
        if (bitPosition <= 0) {
            readNext();
        }

        int result = 0;

        for (;;) {
            // Ensure we have bits to read
            if (bitPosition <= 0) {
                if (underlying.hasRemaining())
                    readNext();
                else break;
            }

            // Count the number of leading zeroes among the unread bits, by
            // shifting away the bits that have already been consumed
            int zeroes = Long.numberOfLeadingZeros(currentValue << (64 - bitPosition));

            // Add the number of zeroes to the result, but cap it at the
            // current bit position to avoid counting padding bits as zeroes
            result += Math.min(bitPosition, zeroes);

            // Subtract the number of bits read from the current position
            bitPosition -= zeroes;

            // If bitPosition is still positive, we've found a 1 and can stop
            if (bitPosition > 0)
                break;
        }

        return result;
    }

    /** Returns true if there are unread bits, either in the current value or in the buffer */
    public boolean hasMore() {
        return bitPosition > 0 || underlying.hasRemaining();
    }

    /** Load the largest chunk (8, 4, 2 or 1 bytes) that the remaining data permits
     * into currentValue, zero extended, and set bitPosition to its size in bits. */
    private void readNext() {
        int remainingCapacity = underlying.remaining();

        if (remainingCapacity >= 8) {
            currentValue = underlying.getLong();
            bitPosition = 64;
        }
        else if (remainingCapacity >= 4) {
            currentValue = underlying.getInt() & 0xFFFFFFFFL;
            bitPosition = 32;
        }
        else if (remainingCapacity >= 2) {
            currentValue = underlying.getShort() & 0xFFFF;
            bitPosition = 16;
        }
        else if (remainingCapacity == 1) {
            currentValue = underlying.get() & 0xFF;
            bitPosition = 8;
        }
        else { // There's no more data to read!
            throw new ArrayIndexOutOfBoundsException("No more data to read");
        }
    }
}

View File

@ -0,0 +1,112 @@
package nu.marginalia.sequence.io;
import java.nio.ByteBuffer;
/** A utility class for writing bits to a ByteBuffer
 * out of alignment with octet boundaries.
 * <p>
 * Bits are accumulated most significant first in a 64 bit word, which is
 * written to the work buffer each time it fills up. Call one of the finish
 * methods to pad out the last byte with zeroes and obtain the result.
 */
public class BitWriter {
    private final ByteBuffer underlying;

    /** The current value being encoded */
    private long currentValue;

    /** Number of bits accumulated in the current value, always in the range 0 to 63
     * between calls since a full word is flushed right away */
    private int bitPosition;

    /** The total number of significant bytes that have been written to the buffer */
    private int totalMeaningfulBytes;

    /** Create a writer that stages its output in workBuffer. The buffer is cleared,
     * and must be large enough to hold everything that is written. */
    public BitWriter(ByteBuffer workBuffer) {
        this.underlying = workBuffer;
        this.bitPosition = 0;
        this.currentValue = 0;
        this.totalMeaningfulBytes = 0;

        underlying.clear();
    }

    /** Write a single bit to the buffer */
    public void putBit(boolean value) {
        currentValue = (currentValue << 1) | (value ? 1L : 0L);

        // If we've filled the word, write it to the buffer
        // and start over with the next one
        if (++bitPosition == 64) {
            flushWord();
        }
    }

    /** Write the lowest width bits of the value to the buffer.
     * Bits of the value above width are ignored.
     *
     * @param value the bits to write
     * @param width the number of bits to write, 0 to 32
     */
    public void put(int value, int width) {
        assert width <= 32 : "Attempting to write more than 32 bits from a single integer";

        // Discard everything above the requested width up front, so that stray high
        // bits (e.g. from ~0 or a negative value) can never leak into previously
        // written bits. Computed as a long so that a width of 32 works.
        long bits = value & ((1L << width) - 1);

        int rem = (64 - bitPosition);

        if (rem < width) { // The value is split between two words
            // Number of bits that do not fit in the current word
            int spill = width - rem;

            // Top up the current word with the upper part of the value
            currentValue = (currentValue << rem) | (bits >>> spill);

            // Switch to the next word
            flushWord();

            // Write the remaining part to the new word
            currentValue = bits & ((1L << spill) - 1);
            bitPosition = spill;
        }
        else { // The entire value fits in the current word
            currentValue = (currentValue << width) | bits;
            bitPosition += width;

            // Never leave a full word pending; the shifts above and in putBit
            // rely on there being at least one free bit
            if (bitPosition == 64) {
                flushWord();
            }
        }
    }

    /** Complete the last byte and return a newly allocated buffer holding
     * exactly the bytes written, positioned at its start. */
    public ByteBuffer finish() {
        finishLastByte();

        var outBuffer = ByteBuffer.allocate(totalMeaningfulBytes);

        outBuffer.put(writtenBytes());
        outBuffer.position(0);
        outBuffer.limit(totalMeaningfulBytes);

        return outBuffer;
    }

    /** Complete the last byte and copy the bytes written to the start of outBuffer.
     *
     * @param outBuffer destination; its previous position and limit are discarded
     * @return outBuffer, with position 0 and the limit set to the number of bytes written
     * @throws java.nio.BufferOverflowException if outBuffer is too small
     */
    public ByteBuffer finish(ByteBuffer outBuffer) {
        finishLastByte();

        // The result is reported as the range [0, totalMeaningfulBytes),
        // so that is where the data has to go
        outBuffer.clear();
        outBuffer.put(writtenBytes());
        outBuffer.position(0);
        outBuffer.limit(totalMeaningfulBytes);

        return outBuffer;
    }

    /** Write the full current word to the work buffer and start a new one */
    private void flushWord() {
        underlying.putLong(currentValue);
        totalMeaningfulBytes += 8;
        bitPosition = 0;
        currentValue = 0;
    }

    /** A view of the part of the work buffer that holds written data. Going via
     * a view instead of the backing array also works for direct and sliced buffers. */
    private ByteBuffer writtenBytes() {
        ByteBuffer view = underlying.duplicate();
        view.clear();
        view.limit(totalMeaningfulBytes);
        return view;
    }

    private void finishLastByte() {
        // It's possible we have a few bits left over that have yet to be written
        // to the underlying buffer. We need to write them out now.
        if (bitPosition > 0) {
            int trailingBytes = (bitPosition + 7) / 8;

            // Left align the pending bits so that the last byte is zero padded at the end
            long padded = currentValue << (64 - bitPosition);

            // Write only the bytes that carry data, so that a work buffer of
            // exactly the required size does not overflow
            for (int i = 0; i < trailingBytes; i++) {
                underlying.put((byte) (padded >>> (56 - 8 * i)));
            }

            totalMeaningfulBytes += trailingBytes;
        }

        // Reset the state to reflect that we've written the last byte
        bitPosition = 0;
        currentValue = 0;
    }
}

View File

@ -0,0 +1,130 @@
package nu.marginalia.sequence;
import nu.marginalia.sequence.io.BitReader;
import nu.marginalia.sequence.io.BitWriter;
import org.junit.jupiter.api.Test;
import java.nio.ByteBuffer;
import static org.junit.jupiter.api.Assertions.*;
class BitReaderTest {

    /** Creates a writer with a work buffer that is ample for every test in this class */
    private static BitWriter newWriter() {
        return new BitWriter(ByteBuffer.allocate(1024));
    }

    /** Finishes the writer and opens a reader on what it produced */
    private static BitReader readBack(BitWriter writer) {
        return new BitReader(writer.finish());
    }

    /** Writes the bit pattern 10, thirty-two zeroes, then 10 again */
    private static BitWriter writeTwoBitMarkersAroundZeroes() {
        BitWriter out = newWriter();
        out.putBit(true);
        out.putBit(false);
        out.put(0, 32);
        out.putBit(true);
        out.putBit(false);
        return out;
    }

    @Test
    void getBit() {
        BitReader in = readBack(writeTwoBitMarkersAroundZeroes());

        assertTrue(in.getBit());
        assertFalse(in.getBit());

        int zeroBitsLeft = 32;
        while (zeroBitsLeft-- > 0) {
            assertFalse(in.getBit());
        }

        assertTrue(in.getBit());
        assertFalse(in.getBit());
    }

    @Test
    void getInByte() {
        BitWriter out = newWriter();
        out.putBit(true);
        out.putBit(false);

        BitReader in = readBack(out);

        assertEquals(0b10, in.get(2));
    }

    @Test
    void get() {
        BitReader in = readBack(writeTwoBitMarkersAroundZeroes());

        // The leading marker plus the first two zeroes
        int head = in.get(4);
        assertEquals(0b1000, head);

        // The remaining thirty zeroes
        int middle = in.get(30);
        assertEquals(0, middle);

        // The trailing marker
        int tail = in.get(2);
        assertEquals(0b10, tail);
    }

    @Test
    void getSevens() {
        // Groups of seven bits never line up with the boundaries of the values
        // the reader and writer work in, so the reads drift across them
        final boolean[] pattern = { true, false, false, true, false, false, true };
        final int repetitions = 729;

        BitWriter out = newWriter();
        for (int n = 0; n < repetitions; n++) {
            for (boolean bit : pattern) {
                out.putBit(bit);
            }
        }

        BitReader in = readBack(out);

        for (int n = 0; n < repetitions; n++) {
            assertEquals(0b1001001, in.get(7));
        }
    }

    @Test
    public void testTakeWhileZero() {
        BitWriter out = newWriter();
        out.put(0, 4);
        out.putBit(true);

        BitReader in = readBack(out);

        assertEquals(4, in.takeWhileZero());
        // The terminating one must still be there to read
        assertTrue(in.getBit());
    }

    @Test
    public void testTakeWhileZeroAllZero() {
        BitWriter out = newWriter();
        out.put(0, 8);

        BitReader in = readBack(out);

        assertEquals(8, in.takeWhileZero());
    }

    @Test
    public void testTakeWhileZeroOverInt32() {
        BitWriter out = newWriter();
        out.put(0, 32);
        out.put(0, 2);
        out.putBit(true);

        BitReader in = readBack(out);

        assertEquals(34, in.takeWhileZero());
        // The terminating one must still be there to read
        assertTrue(in.getBit());
    }
}

View File

@ -0,0 +1,297 @@
package nu.marginalia.sequence;
import nu.marginalia.sequence.io.BitReader;
import nu.marginalia.sequence.io.BitWriter;
import org.junit.jupiter.api.Test;
import java.nio.ByteBuffer;
import java.util.Random;
import static org.junit.jupiter.api.Assertions.*;
class BitWriterTest {
@Test
public void testPutBitsFullByte() {
var buffer = ByteBuffer.allocate(1024);
var writer = new BitWriter(buffer);
writer.putBit(false);
writer.putBit(true);
writer.putBit(true);
writer.putBit(true);
writer.putBit(true);
writer.putBit(true);
writer.putBit(true);
writer.putBit(false);
var out = writer.finish();
byte actual = out.get(0);
byte expected = (byte) 0b0111_1110;
assertEquals(expected, actual);
assertEquals(1, out.capacity());
}
@Test
public void testPutBitsPartialByte() {
var buffer = ByteBuffer.allocate(1024);
var writer = new BitWriter(buffer);
writer.putBit(true);
writer.putBit(false);
writer.putBit(true);
writer.putBit(true);
writer.putBit(true);
writer.putBit(true);
writer.putBit(true);
var out = writer.finish();
byte actual = out.get(0);
byte expected = (byte) 0b1011_1110;
assertEquals(expected, actual, STR."was \{Integer.toBinaryString(actual & 0xFF)}");
assertEquals(1, out.capacity());
}
@Test
public void testPutBitsOneAndAHalfByte() {
var buffer = ByteBuffer.allocate(1024);
var writer = new BitWriter(buffer);
writer.putBit(true);
writer.putBit(false);
writer.putBit(true);
writer.putBit(true);
writer.putBit(true);
writer.putBit(true);
writer.putBit(true);
writer.putBit(false);
writer.putBit(true);
writer.putBit(true);
var out = writer.finish();
assertEquals(2, out.capacity());
byte actual1 = out.get(0);
byte actual2 = out.get(1);
byte expected1 = (byte) 0b1011_1110;
byte expected2 = (byte) 0b1100_0000;
assertEquals(expected1, actual1, STR."was \{Integer.toBinaryString(actual1 & 0xFF)}");
assertEquals(expected2, actual2, STR."was \{Integer.toBinaryString(actual2 & 0xFF)}");
}
@Test
public void testPutBitsIntOverflow() {
var buffer = ByteBuffer.allocate(1024);
var writer = new BitWriter(buffer);
for (int i = 0; i < 4; i++) {
writer.putBit(true);
writer.putBit(false);
writer.putBit(true);
writer.putBit(true);
writer.putBit(true);
writer.putBit(true);
writer.putBit(true);
writer.putBit(false);
}
writer.putBit(true);
writer.putBit(true);
var out = writer.finish();
assertEquals(5, out.capacity());
for (int i = 0; i < 4; i++) {
byte actual1 = out.get(i);
byte expected1 = (byte) 0b1011_1110;
assertEquals(expected1, actual1, STR."was \{Integer.toBinaryString(actual1 & 0xFF)}");
}
byte actual2 = out.get(4);
byte expected2 = (byte) 0b1100_0000;
assertEquals(expected2, actual2, STR."was \{Integer.toBinaryString(actual2 & 0xFF)}");
}
@Test
public void testPut1() {
var buffer = ByteBuffer.allocate(1024);
var writer = new BitWriter(buffer);
writer.put(1, 1);
var ret = writer.finish();
assertEquals(1, ret.capacity());
assertEquals((byte)0b1000_0000, ret.get(0));
}
@Test
public void testPut4() {
var buffer = ByteBuffer.allocate(1024);
var writer = new BitWriter(buffer);
writer.put(1, 4);
var ret = writer.finish();
assertEquals(1, ret.capacity());
assertEquals((byte)0b0001_0000, ret.get(0));
}
@Test
public void testPut8() {
var buffer = ByteBuffer.allocate(1024);
var writer = new BitWriter(buffer);
writer.put(3, 8);
var ret = writer.finish();
assertEquals(1, ret.capacity());
assertEquals((byte)0b0000_0011, ret.get(0));
}
@Test
public void testPut8_2() {
var buffer = ByteBuffer.allocate(1024);
var writer = new BitWriter(buffer);
writer.put(~0, 8);
var ret = writer.finish();
assertEquals(1, ret.capacity());
assertEquals((byte)0b1111_1111, ret.get(0));
}
@Test
public void testPut8_3() {
var buffer = ByteBuffer.allocate(1024);
var writer = new BitWriter(buffer);
writer.put(~0, 8);
writer.put(0, 8);
writer.put(~0, 8);
writer.put(1, 1);
var ret = writer.finish();
assertEquals(4, ret.capacity());
assertEquals((byte)0b1111_1111, ret.get(0));
assertEquals((byte)0, ret.get(1));
assertEquals((byte)0b1111_1111, ret.get(2));
assertEquals((byte)0b1000_0000, ret.get(3));
}
@Test
public void testIntOverflow() {
var buffer = ByteBuffer.allocate(1024);
var writer = new BitWriter(buffer);
writer.put(~0, 24);
writer.put(0, 16);
writer.put(1, 1);
var ret = writer.finish();
assertEquals(6, ret.capacity());
assertEquals((byte)0b1111_1111, ret.get(0));
assertEquals((byte)0b1111_1111, ret.get(1));
assertEquals((byte)0b1111_1111, ret.get(2));
assertEquals((byte)0, ret.get(3));
assertEquals((byte)0, ret.get(4));
assertEquals((byte)0b1000_0000, ret.get(5));
}
@Test
public void testIntOverflowMisaligned() {
var buffer = ByteBuffer.allocate(1024);
var writer = new BitWriter(buffer);
writer.put(0, 2);
writer.put(~0, 24);
writer.put(0, 16);
writer.put(1, 1);
var ret = writer.finish();
assertEquals(6, ret.capacity());
assertEquals((byte)0b0011_1111, ret.get(0));
assertEquals((byte)0b1111_1111, ret.get(1));
assertEquals((byte)0b1111_1111, ret.get(2));
assertEquals((byte)0b1100_0000, ret.get(3));
assertEquals((byte)0, ret.get(4));
assertEquals((byte)0b0010_0000, ret.get(5));
}
@Test
public void testFuzzCase1() {
var buffer = ByteBuffer.allocate(1024);
var writer = new BitWriter(buffer);
writer.put(1, 6);
writer.put(702, 11);
var ret = writer.finish();
var reader = new BitReader(ret);
int a = reader.get(6);
int b = reader.get(11);
assertEquals(a, 1);
assertEquals(b, 702);
}
@Test
public void testFuzzCase2() {
var buffer = ByteBuffer.allocate(1024);
var writer = new BitWriter(buffer);
writer.put(0, 6);
writer.put(0, 2);
var ret = writer.finish();
assertEquals(1, ret.capacity());
assertEquals(0, ret.get(0));
var reader = new BitReader(ret);
int a = reader.get(6);
int b = reader.get(2);
assertEquals(a, 0);
assertEquals(b, 0);
}
@Test
void fuzz() {
Random r = new Random();
for (int i = 0; i < 1000; i++) {
var buffer = ByteBuffer.allocate(32);
var writer = new BitWriter(buffer);
int aw = r.nextInt(1, 31);
int bw = r.nextInt(1, 31);
int a = r.nextInt(0, 1<<aw - 1);
int b = r.nextInt(0, 1<<bw - 1);
System.out.println(a + "/" + aw + "," + b + "/" + bw);
writer.put(a, aw);
writer.put(b, bw);
var ret = writer.finish();
var reader = new BitReader(ret);
int ra = reader.get(aw);
int rb = reader.get(bw);
assertEquals(a, ra);
assertEquals(b, rb);
System.out.println(a + "," + b);
}
}
}

View File

@ -0,0 +1,78 @@
package nu.marginalia.sequence;
import org.junit.jupiter.api.Test;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;
import java.util.stream.IntStream;
import static org.junit.jupiter.api.Assertions.*;
class EliasGammaCodecTest {

    /** Scratch space for the encoder, ample for every sequence used here */
    ByteBuffer work = ByteBuffer.allocate(65536);

    /** Decodes an encoded buffer in full, collecting the values in order */
    private static List<Integer> decodeAll(ByteBuffer encoded) {
        List<Integer> decoded = new ArrayList<>();

        var sequence = EliasGammaCodec.decode(encoded);
        while (sequence.hasNext()) {
            decoded.add(sequence.nextInt());
        }

        return decoded;
    }

    @Test
    public void testCodec() {
        var ret = EliasGammaCodec.encode(work, new int[] { 1, 3, 5, 16, 32, 64 });

        List<Integer> expected = List.of(1, 3, 5, 16, 32, 64);

        assertEquals(expected, decodeAll(ret));
    }

    @Test
    public void testCodec2() {
        var ret = EliasGammaCodec.encode(work, new int[] { 1, 256 });

        List<Integer> expected = List.of(1, 256);

        assertEquals(expected, decodeAll(ret));
    }

    /** Round trips random two element sequences, varying the width of the second delta */
    @Test
    public void fuzzTestCodec() {
        Random r = new Random();

        for (int i = 0; i < 1000; i++) {
            int[] sequence = new int[2];
            sequence[0] = 1;
            sequence[1] = 1 + r.nextInt(1, 512);

            var ret = EliasGammaCodec.encode(work, sequence);

            List<Integer> decoded = new ArrayList<>();
            List<Integer> expected = IntStream.of(sequence).boxed().toList();

            try {
                var codedData = EliasGammaCodec.decode(ret);
                while (codedData.hasNext()) {
                    decoded.add(codedData.nextInt());
                }
            }
            catch (Exception e) {
                // Pass the exception along as the cause, so that the report shows
                // what went wrong and where, and not only for which input
                fail("Exception thrown for " + Arrays.toString(sequence), e);
            }

            assertEquals(expected, decoded, "Expected " + expected + " but got " + decoded + " for " + Arrays.toString(sequence));
        }
    }
}

View File

@ -37,6 +37,7 @@ include 'code:index:index-reverse'
include 'code:libraries:array'
include 'code:libraries:array:cpp'
include 'code:libraries:coded-sequence'
include 'code:libraries:geo-ip'
include 'code:libraries:btree'
include 'code:libraries:easy-lsh'

View File

@ -0,0 +1,5 @@
package blue.strategic.parquet;
/** Implemented by types that can present themselves as a plain byte array. */
public interface BinarySerializable {
/** Returns this object's content as bytes.
 * NOTE(review): whether callers may modify the returned array is not specified here - confirm with implementations. */
byte[] bytes();
}