mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(coded-sequence) Varint sequence
This commit is contained in:
parent
4430a39120
commit
57929ff242
@ -1,5 +1,6 @@
|
||||
// Build plugins for this module: plain Java plus the JMH benchmark plugin.
plugins {
    id 'java'
    id 'me.champeau.jmh' version '0.6.6'
}
|
||||
|
||||
java {
|
||||
@ -24,3 +25,15 @@ dependencies {
|
||||
// Run this module's tests on the JUnit Platform.
test {
    useJUnitPlatform()
}
|
||||
|
||||
// JVM arguments configured on the jmh extension: switch on Java preview features.
jmh {
    jvmArgs = ['--enable-preview']
}

// Launch every toolchain-aware JMH task with the Java version configured in
// rootProject.ext.jvmVersion.
tasks.withType(me.champeau.jmh.WithJavaToolchain).configureEach {
    javaLauncher = javaToolchains.launcherFor {
        languageVersion = JavaLanguageVersion.of(rootProject.ext.jvmVersion)
    }
}

// The bytecode generator task carries its own JVM arguments; enable preview
// features there as well.
tasks.withType(me.champeau.jmh.JmhBytecodeGeneratorTask).configureEach {
    jvmArgs = ['--enable-preview']
}
|
@ -0,0 +1,198 @@
|
||||
package nu.marginalia.sequence;
|
||||
|
||||
import it.unimi.dsi.fastutil.ints.IntArrayList;
|
||||
import it.unimi.dsi.fastutil.ints.IntIterator;
|
||||
import it.unimi.dsi.fastutil.ints.IntList;
|
||||
|
||||
import java.nio.ByteBuffer;
|
||||
|
||||
public class VarintCodedSequence implements CodedSequence {
|
||||
|
||||
private final ByteBuffer raw;
|
||||
|
||||
private final int startPos;
|
||||
private final int startLimit;
|
||||
|
||||
public VarintCodedSequence(ByteBuffer buffer) {
|
||||
this.raw = buffer;
|
||||
|
||||
this.startPos = buffer.position();
|
||||
this.startLimit = buffer.limit();
|
||||
}
|
||||
|
||||
private static int requiredBufferSize(int[] values) {
|
||||
int prev = 0;
|
||||
int size = 0;
|
||||
|
||||
for (int value : values) {
|
||||
size += varintSize(value - prev);
|
||||
prev = value;
|
||||
}
|
||||
|
||||
return size + varintSize(size + 1);
|
||||
}
|
||||
|
||||
private static int varintSize(int value) {
|
||||
int bits = 32 - Integer.numberOfLeadingZeros(value);
|
||||
return (bits + 6) / 7;
|
||||
}
|
||||
|
||||
public static VarintCodedSequence generate(int... values) {
|
||||
int bufferSize = requiredBufferSize(values);
|
||||
ByteBuffer buffer = ByteBuffer.allocate(bufferSize);
|
||||
|
||||
int prev = 0;
|
||||
|
||||
encodeValue(buffer, values.length + 1);
|
||||
|
||||
for (int value : values) {
|
||||
int toEncode = value - prev;
|
||||
assert toEncode > 0 : "Values must be strictly increasing";
|
||||
|
||||
encodeValue(buffer, toEncode);
|
||||
|
||||
prev = value;
|
||||
}
|
||||
|
||||
buffer.flip();
|
||||
|
||||
return new VarintCodedSequence(buffer);
|
||||
}
|
||||
|
||||
private static void encodeValue(ByteBuffer buffer, int value) {
|
||||
if (value < 0x80) {
|
||||
buffer.put((byte) value);
|
||||
}
|
||||
else if (value < 0x4_000) {
|
||||
buffer.put((byte) (value >>> (7) | 0x80));
|
||||
buffer.put((byte) (value & 0x7F));
|
||||
}
|
||||
else if (value < 0x20_0000) {
|
||||
buffer.put((byte) (value >>> (14) | 0x80));
|
||||
buffer.put((byte) (value >>> (7) | 0x80));
|
||||
buffer.put((byte) (value & 0x7F));
|
||||
}
|
||||
else if (value < 0x1000_0000) {
|
||||
buffer.putInt(Integer.expand(value, 0x00808080) | 0x80808000);
|
||||
}
|
||||
else {
|
||||
throw new IllegalArgumentException("Value too large to encode");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public byte[] bytes() {
|
||||
return raw.array();
|
||||
}
|
||||
|
||||
@Override
|
||||
public IntIterator iterator() {
|
||||
return new VarintSequenceIterator(buffer());
|
||||
}
|
||||
|
||||
@Override
|
||||
public IntIterator offsetIterator(int offset) {
|
||||
return new VarintSequenceIterator(buffer(), offset);
|
||||
}
|
||||
|
||||
@Override
|
||||
public IntList values() {
|
||||
var buffer = buffer();
|
||||
|
||||
int val = 0;
|
||||
int count = decodeValue(buffer) - 1;
|
||||
|
||||
IntArrayList list = new IntArrayList(count);
|
||||
|
||||
while (buffer.hasRemaining()) {
|
||||
val += decodeValue(buffer);
|
||||
list.add(val);
|
||||
}
|
||||
|
||||
return list;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ByteBuffer buffer() {
|
||||
raw.position(startPos);
|
||||
raw.limit(startLimit);
|
||||
|
||||
return raw;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int bufferSize() {
|
||||
return raw.capacity();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int valueCount() {
|
||||
var buffer = buffer();
|
||||
return decodeValue(buffer) - 1;
|
||||
}
|
||||
|
||||
private static int decodeValue(ByteBuffer buffer) {
|
||||
// most common case gets a fast path, this is a fairly large performance win
|
||||
// on average, something like 10-20% faster than not having this check
|
||||
byte b = buffer.get();
|
||||
if ((b & 0x80) == 0) {
|
||||
return b;
|
||||
}
|
||||
|
||||
int value = b;
|
||||
do {
|
||||
b = buffer.get();
|
||||
value = value << 7 | (b & 0x7F);
|
||||
} while ((b & 0x80) != 0);
|
||||
|
||||
return value;
|
||||
}
|
||||
|
||||
public static class VarintSequenceIterator implements IntIterator {
|
||||
|
||||
private final ByteBuffer buffer;
|
||||
int rem = 0;
|
||||
private int last;
|
||||
private int next = Integer.MIN_VALUE;
|
||||
|
||||
public VarintSequenceIterator(ByteBuffer buffer, int zero) {
|
||||
this.buffer = buffer;
|
||||
if (zero == Integer.MIN_VALUE) {
|
||||
throw new IllegalArgumentException("Integer.MIN_VALUE is a reserved offset that may not be used as zero point");
|
||||
}
|
||||
|
||||
last = zero;
|
||||
rem = decodeValue(buffer) - 1;
|
||||
}
|
||||
|
||||
public VarintSequenceIterator(ByteBuffer buffer) {
|
||||
this(buffer, 0);
|
||||
}
|
||||
|
||||
// This is BitWriter.getGamma with more checks in place for streaming iteration
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
if (next != Integer.MIN_VALUE) return true;
|
||||
if (--rem < 0) return false;
|
||||
|
||||
int delta = decodeValue(buffer);
|
||||
|
||||
last += delta;
|
||||
next = last;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int nextInt() {
|
||||
if (hasNext()) {
|
||||
int ret = next;
|
||||
next = Integer.MIN_VALUE;
|
||||
return ret;
|
||||
}
|
||||
throw new ArrayIndexOutOfBoundsException("No more data to read");
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
@ -0,0 +1,71 @@
|
||||
package nu.marginalia.bench;
|
||||
|
||||
import nu.marginalia.sequence.GammaCodedSequence;
|
||||
import nu.marginalia.sequence.VarintCodedSequence;
|
||||
import org.openjdk.jmh.annotations.*;
|
||||
|
||||
import java.nio.ByteBuffer;
|
||||
|
||||
public class SequenceBenchmarks {
|
||||
|
||||
@State(Scope.Benchmark)
|
||||
public static class SequenceState {
|
||||
VarintCodedSequence vcs;
|
||||
GammaCodedSequence gcs;
|
||||
ByteBuffer workArea;
|
||||
int[] valueBuffer;
|
||||
public SequenceState()
|
||||
{
|
||||
valueBuffer = new int[128];
|
||||
|
||||
workArea = ByteBuffer.allocate(65536);
|
||||
vcs = VarintCodedSequence.generate(1, 3, 5, 16, 1024, 2048);
|
||||
gcs = GammaCodedSequence.generate(workArea, 1, 3, 5, 16, 1024, 2048);
|
||||
}
|
||||
}
|
||||
|
||||
@Fork(value = 5, warmups = 5)
|
||||
@Warmup(iterations = 5)
|
||||
@Benchmark
|
||||
@BenchmarkMode(Mode.Throughput)
|
||||
public int vcsDecode(SequenceState state) {
|
||||
var iter = state.vcs.iterator();
|
||||
int sum = 0;
|
||||
while (iter.hasNext()) {
|
||||
sum += iter.nextInt();
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
|
||||
// @Fork(value = 1, warmups = 1)
|
||||
// @Warmup(iterations = 1)
|
||||
// @Benchmark
|
||||
// @BenchmarkMode(Mode.Throughput)
|
||||
// public int gcsDecode(SequenceState state) {
|
||||
// var iter = state.gcs.iterator();
|
||||
// int sum = 0;
|
||||
// while (iter.hasNext()) {
|
||||
// sum += iter.nextInt();
|
||||
// }
|
||||
// return sum;
|
||||
// }
|
||||
|
||||
// @Fork(value = 1, warmups = 1)
|
||||
// @Warmup(iterations = 1)
|
||||
// @Benchmark
|
||||
// @BenchmarkMode(Mode.Throughput)
|
||||
// public VarintCodedSequence vcsEncode(SequenceState state) {
|
||||
// return VarintCodedSequence.generate(1, 3, 5, 16, 1024, 2048, 4096, 4098, 4100);
|
||||
// }
|
||||
|
||||
// @Fork(value = 1, warmups = 1)
|
||||
// @Warmup(iterations = 1)
|
||||
// @Benchmark
|
||||
// @BenchmarkMode(Mode.Throughput)
|
||||
// public GammaCodedSequence gcsEncode(SequenceState state) {
|
||||
// return GammaCodedSequence.generate(state.workArea, 1, 3, 5, 16, 1024, 2048, 4096, 4098, 4100);
|
||||
// }
|
||||
|
||||
|
||||
}
|
@ -0,0 +1,50 @@
|
||||
package nu.marginalia.sequence;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
class VarintCodedSequenceTest {
|
||||
@Test
|
||||
public void testSimple() {
|
||||
var sequence = VarintCodedSequence.generate(1, 3, 5, 16, 1024, 2048, 40000, 268435446);
|
||||
|
||||
assertEquals(8, sequence.valueCount());
|
||||
|
||||
var values = sequence.values();
|
||||
System.out.println(values);
|
||||
assertEquals(1, values.getInt(0));
|
||||
assertEquals(3, values.getInt(1));
|
||||
assertEquals(5, values.getInt(2));
|
||||
assertEquals(16, values.getInt(3));
|
||||
assertEquals(1024, values.getInt(4));
|
||||
assertEquals(2048, values.getInt(5));
|
||||
assertEquals(40000, values.getInt(6));
|
||||
assertEquals(268435446, values.getInt(7));
|
||||
|
||||
|
||||
var iter = sequence.iterator();
|
||||
assertEquals(1, iter.nextInt());
|
||||
assertEquals(3, iter.nextInt());
|
||||
assertEquals(5, iter.nextInt());
|
||||
assertEquals(16, iter.nextInt());
|
||||
assertEquals(1024, iter.nextInt());
|
||||
assertEquals(2048, iter.nextInt());
|
||||
assertEquals(40000, iter.nextInt());
|
||||
assertEquals(268435446, iter.nextInt());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testEmpty() {
|
||||
var sequence = VarintCodedSequence.generate();
|
||||
|
||||
assertEquals(0, sequence.valueCount());
|
||||
|
||||
var values = sequence.values();
|
||||
assertTrue(values.isEmpty());
|
||||
|
||||
var iter = sequence.iterator();
|
||||
assertFalse(iter.hasNext());
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user