diff --git a/code/libraries/coded-sequence/build.gradle b/code/libraries/coded-sequence/build.gradle
index 56f7d6f8..d87ef5a8 100644
--- a/code/libraries/coded-sequence/build.gradle
+++ b/code/libraries/coded-sequence/build.gradle
@@ -1,5 +1,6 @@
 plugins {
     id 'java'
+    id "me.champeau.jmh" version "0.6.6"
 }
 
 java {
@@ -24,3 +25,15 @@ dependencies {
 test {
     useJUnitPlatform()
 }
+
+jmh {
+    jvmArgs = [ "--enable-preview" ]
+}
+tasks.withType(me.champeau.jmh.WithJavaToolchain).configureEach {
+    javaLauncher.set(javaToolchains.launcherFor {
+        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
+    })
+}
+tasks.withType(me.champeau.jmh.JmhBytecodeGeneratorTask).configureEach {
+    jvmArgs = ["--enable-preview"]
+}
\ No newline at end of file
diff --git a/code/libraries/coded-sequence/java/nu/marginalia/sequence/VarintCodedSequence.java b/code/libraries/coded-sequence/java/nu/marginalia/sequence/VarintCodedSequence.java
new file mode 100644
index 00000000..bf49e2b2
--- /dev/null
+++ b/code/libraries/coded-sequence/java/nu/marginalia/sequence/VarintCodedSequence.java
@@ -0,0 +1,252 @@
+package nu.marginalia.sequence;
+
+import it.unimi.dsi.fastutil.ints.IntArrayList;
+import it.unimi.dsi.fastutil.ints.IntIterator;
+import it.unimi.dsi.fastutil.ints.IntList;
+
+import java.nio.ByteBuffer;
+
+/**
+ * A strictly increasing sequence of positive integers, stored as a varint coded
+ * element count followed by the varint coded deltas between consecutive values.
+ * <p>
+ * Each varint is written most significant 7 bit group first; every byte except
+ * the last one has its high bit set as a continuation marker.  Encoded values
+ * are limited to 28 bits (four bytes).
+ * <p>
+ * All accessors operate on the same underlying {@link ByteBuffer} and rewind it
+ * to its original window, so an iterator must be fully consumed before any other
+ * method is called on the same instance.
+ */
+public class VarintCodedSequence implements CodedSequence {
+
+    private final ByteBuffer raw;
+
+    private final int startPos;
+    private final int startLimit;
+
+    /**
+     * Wraps an already encoded sequence.
+     *
+     * @param buffer buffer whose window between position and limit holds the encoded data
+     */
+    public VarintCodedSequence(ByteBuffer buffer) {
+        this.raw = buffer;
+
+        this.startPos = buffer.position();
+        this.startLimit = buffer.limit();
+    }
+
+    /** Calculates the number of bytes needed for the count header and all deltas. */
+    private static int requiredBufferSize(int[] values) {
+        int prev = 0;
+        int size = 0;
+
+        for (int value : values) {
+            size += varintSize(value - prev);
+            prev = value;
+        }
+
+        // the header stores (number of values + 1), not the payload size in bytes
+        return size + varintSize(values.length + 1);
+    }
+
+    /** Number of bytes occupied by value when split into 7 bit groups; zero still takes one byte. */
+    private static int varintSize(int value) {
+        int bits = 32 - Integer.numberOfLeadingZeros(value);
+        return Math.max(1, (bits + 6) / 7);
+    }
+
+    /**
+     * Encodes the provided values into a new sequence.
+     *
+     * @param values strictly increasing positive values, each step smaller than 2^28
+     * @return the encoded sequence
+     * @throws IllegalArgumentException if a delta is negative or does not fit in 28 bits
+     */
+    public static VarintCodedSequence generate(int... values) {
+        int bufferSize = requiredBufferSize(values);
+        ByteBuffer buffer = ByteBuffer.allocate(bufferSize);
+
+        int prev = 0;
+
+        encodeValue(buffer, values.length + 1);
+
+        for (int value : values) {
+            int toEncode = value - prev;
+            assert toEncode > 0 : "Values must be strictly increasing";
+
+            encodeValue(buffer, toEncode);
+
+            prev = value;
+        }
+
+        buffer.flip();
+
+        return new VarintCodedSequence(buffer);
+    }
+
+    /**
+     * Writes value as a varint of one to four bytes, most significant group first.
+     *
+     * @throws IllegalArgumentException if value is negative or needs more than 28 bits
+     */
+    private static void encodeValue(ByteBuffer buffer, int value) {
+        if (value < 0) {
+            throw new IllegalArgumentException("Negative values can not be encoded");
+        }
+        else if (value < 0x80) {
+            buffer.put((byte) value);
+        }
+        else if (value < 0x4_000) {
+            buffer.put((byte) (value >>> (7) | 0x80));
+            buffer.put((byte) (value & 0x7F));
+        }
+        else if (value < 0x20_0000) {
+            buffer.put((byte) (value >>> (14) | 0x80));
+            buffer.put((byte) (value >>> (7) | 0x80));
+            buffer.put((byte) (value & 0x7F));
+        }
+        else if (value < 0x1000_0000) {
+            // same layout as the shorter forms: three continuation bytes carrying
+            // bits 27..7, then a final byte with the low seven bits
+            buffer.put((byte) (value >>> (21) | 0x80));
+            buffer.put((byte) (value >>> (14) | 0x80));
+            buffer.put((byte) (value >>> (7) | 0x80));
+            buffer.put((byte) (value & 0x7F));
+        }
+        else {
+            throw new IllegalArgumentException("Value too large to encode");
+        }
+    }
+
+    /** Returns the backing array of the underlying buffer; requires an array backed buffer. */
+    @Override
+    public byte[] bytes() {
+        return raw.array();
+    }
+
+    /** Iterates over the decoded values. */
+    @Override
+    public IntIterator iterator() {
+        return new VarintSequenceIterator(buffer());
+    }
+
+    /** Iterates over the decoded values with offset added to each of them. */
+    @Override
+    public IntIterator offsetIterator(int offset) {
+        return new VarintSequenceIterator(buffer(), offset);
+    }
+
+    /** Decodes the entire sequence into a list. */
+    @Override
+    public IntList values() {
+        var buffer = buffer();
+
+        int val = 0;
+        int count = decodeValue(buffer) - 1;
+
+        IntArrayList list = new IntArrayList(count);
+
+        while (buffer.hasRemaining()) {
+            val += decodeValue(buffer);
+            list.add(val);
+        }
+
+        return list;
+    }
+
+    /** Rewinds the underlying buffer to its original window and returns it (not a copy). */
+    @Override
+    public ByteBuffer buffer() {
+        raw.position(startPos);
+        raw.limit(startLimit);
+
+        return raw;
+    }
+
+    /** Returns the capacity of the underlying buffer. */
+    @Override
+    public int bufferSize() {
+        return raw.capacity();
+    }
+
+    /** Reads the number of values from the header without decoding them. */
+    @Override
+    public int valueCount() {
+        var buffer = buffer();
+        return decodeValue(buffer) - 1;
+    }
+
+    /** Reads a single varint from the buffer's current position. */
+    private static int decodeValue(ByteBuffer buffer) {
+        // most common case gets a fast path, this is a fairly large performance win
+        // on average, something like 10-20% faster than not having this check
+        byte b = buffer.get();
+        if ((b & 0x80) == 0) {
+            return b;
+        }
+
+        // strip the continuation bit; a plain widening of the (negative) byte
+        // would sign extend and corrupt every multi byte value
+        int value = b & 0x7F;
+        do {
+            b = buffer.get();
+            value = value << 7 | (b & 0x7F);
+        } while ((b & 0x80) != 0);
+
+        return value;
+    }
+
+    /** Streaming decoder that turns the stored deltas back into absolute values. */
+    public static class VarintSequenceIterator implements IntIterator {
+
+        private final ByteBuffer buffer;
+        int rem = 0;
+        private int last;
+        private int next = Integer.MIN_VALUE;
+
+        /**
+         * @param buffer buffer positioned at the count header of an encoded sequence
+         * @param zero   value the first delta is added to
+         */
+        public VarintSequenceIterator(ByteBuffer buffer, int zero) {
+            this.buffer = buffer;
+            if (zero == Integer.MIN_VALUE) {
+                throw new IllegalArgumentException("Integer.MIN_VALUE is a reserved offset that may not be used as zero point");
+            }
+
+            last = zero;
+            rem = decodeValue(buffer) - 1;
+        }
+
+        public VarintSequenceIterator(ByteBuffer buffer) {
+            this(buffer, 0);
+        }
+
+        // Decodes the next value ahead of time; Integer.MIN_VALUE marks "nothing buffered"
+        @Override
+        public boolean hasNext() {
+            if (next != Integer.MIN_VALUE) return true;
+            if (--rem < 0) return false;
+
+            int delta = decodeValue(buffer);
+
+            last += delta;
+            next = last;
+
+            return true;
+        }
+
+        @Override
+        public int nextInt() {
+            if (hasNext()) {
+                int ret = next;
+                next = Integer.MIN_VALUE;
+                return ret;
+            }
+            throw new ArrayIndexOutOfBoundsException("No more data to read");
+        }
+
+    }
+}
diff --git a/code/libraries/coded-sequence/src/jmh/java/nu/marginalia/bench/SequenceBenchmarks.java b/code/libraries/coded-sequence/src/jmh/java/nu/marginalia/bench/SequenceBenchmarks.java
new file mode 100644
index 00000000..f09e82bb
--- /dev/null
+++ b/code/libraries/coded-sequence/src/jmh/java/nu/marginalia/bench/SequenceBenchmarks.java
@@ -0,0 +1,72 @@
+package nu.marginalia.bench;
+
+import nu.marginalia.sequence.GammaCodedSequence;
+import nu.marginalia.sequence.VarintCodedSequence;
+import org.openjdk.jmh.annotations.*;
+
+import java.nio.ByteBuffer;
+
+/** JMH benchmarks for the coded sequences; only the varint decode benchmark is currently enabled. */
+public class SequenceBenchmarks {
+
+    @State(Scope.Benchmark)
+    public static class SequenceState {
+        VarintCodedSequence vcs;
+        GammaCodedSequence gcs;
+        ByteBuffer workArea;
+        int[] valueBuffer;
+        public SequenceState()
+        {
+            valueBuffer = new int[128];
+
+            workArea = ByteBuffer.allocate(65536);
+            vcs = VarintCodedSequence.generate(1, 3, 5, 16, 1024, 2048);
+            gcs = GammaCodedSequence.generate(workArea, 1, 3, 5, 16, 1024, 2048);
+        }
+    }
+
+    @Fork(value = 5, warmups = 5)
+    @Warmup(iterations = 5)
+    @Benchmark
+    @BenchmarkMode(Mode.Throughput)
+    public int vcsDecode(SequenceState state) {
+        var iter = state.vcs.iterator();
+        int sum = 0;
+        while (iter.hasNext()) {
+            sum += iter.nextInt();
+        }
+        return sum;
+    }
+
+
+//    @Fork(value = 1, warmups = 1)
+//    @Warmup(iterations = 1)
+//    @Benchmark
+//    @BenchmarkMode(Mode.Throughput)
+//    public int gcsDecode(SequenceState state) {
+//        var iter = state.gcs.iterator();
+//        int sum = 0;
+//        while (iter.hasNext()) {
+//            sum += iter.nextInt();
+//        }
+//        return sum;
+//    }
+
+//    @Fork(value = 1, warmups = 1)
+//    @Warmup(iterations = 1)
+//    @Benchmark
+//    @BenchmarkMode(Mode.Throughput)
+//    public VarintCodedSequence vcsEncode(SequenceState state) {
+//        return VarintCodedSequence.generate(1, 3, 5, 16, 1024, 2048, 4096, 4098, 4100);
+//    }
+
+//    @Fork(value = 1, warmups = 1)
+//    @Warmup(iterations = 1)
+//    @Benchmark
+//    @BenchmarkMode(Mode.Throughput)
+//    public GammaCodedSequence gcsEncode(SequenceState state) {
+//        return GammaCodedSequence.generate(state.workArea, 1, 3, 5, 16, 1024, 2048, 4096, 4098, 4100);
+//    }
+
+
+}
diff --git a/code/libraries/coded-sequence/test/nu/marginalia/sequence/VarintCodedSequenceTest.java b/code/libraries/coded-sequence/test/nu/marginalia/sequence/VarintCodedSequenceTest.java
new file mode 100644
index 00000000..67554b04
--- /dev/null
+++ b/code/libraries/coded-sequence/test/nu/marginalia/sequence/VarintCodedSequenceTest.java
@@ -0,0 +1,49 @@
+package nu.marginalia.sequence;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+class VarintCodedSequenceTest {
+    @Test
+    public void testSimple() {
+        // the deltas between these values cover the 1, 2, 3 and 4 byte varint forms
+        var sequence = VarintCodedSequence.generate(1, 3, 5, 16, 1024, 2048, 40000, 268435446);
+
+        assertEquals(8, sequence.valueCount());
+
+        var values = sequence.values();
+        assertEquals(1, values.getInt(0));
+        assertEquals(3, values.getInt(1));
+        assertEquals(5, values.getInt(2));
+        assertEquals(16, values.getInt(3));
+        assertEquals(1024, values.getInt(4));
+        assertEquals(2048, values.getInt(5));
+        assertEquals(40000, values.getInt(6));
+        assertEquals(268435446, values.getInt(7));
+
+        var iter = sequence.iterator();
+        assertEquals(1, iter.nextInt());
+        assertEquals(3, iter.nextInt());
+        assertEquals(5, iter.nextInt());
+        assertEquals(16, iter.nextInt());
+        assertEquals(1024, iter.nextInt());
+        assertEquals(2048, iter.nextInt());
+        assertEquals(40000, iter.nextInt());
+        assertEquals(268435446, iter.nextInt());
+        assertFalse(iter.hasNext());
+    }
+
+    @Test
+    public void testEmpty() {
+        var sequence = VarintCodedSequence.generate();
+
+        assertEquals(0, sequence.valueCount());
+
+        var values = sequence.values();
+        assertTrue(values.isEmpty());
+
+        var iter = sequence.iterator();
+        assertFalse(iter.hasNext());
+    }
+}
\ No newline at end of file