(index, EXPERIMENTAL) Evaluate using Varint instead of GCS for position data

This commit is contained in:
Viktor Lofgren 2024-08-26 14:20:39 +02:00
parent 30bf845c81
commit abab5bdc8a
23 changed files with 478 additions and 113 deletions

View File

@ -1,6 +1,6 @@
package nu.marginalia.model.idx;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.sequence.VarintCodedSequence;
public record CodedWordSpan(byte code, GammaCodedSequence spans) {
public record CodedWordSpan(byte code, VarintCodedSequence spans) {
}

View File

@ -120,7 +120,6 @@ public class ForwardIndexConverter {
for (int i = 0; i < spansCodes.length; i++) {
spansWriter.writeSpan(spansCodes[i], spans.get(i));
}
long encodedSpansOffset = spansWriter.endRecord();

View File

@ -6,6 +6,7 @@ import nu.marginalia.sequence.CodedSequence;
import java.util.Arrays;
/** A list of the interlaced start and end positions of each span in the document of this type */
public class DocumentSpan {
/** A list of the interlaced start and end positions of each span in the document of this type */
@ -19,6 +20,7 @@ public class DocumentSpan {
this.startsEnds = null;
}
/** Counts the number of intersections between the spans in the document of this type and the given list of positions */
public int countIntersections(int[] positions) {
if (null == startsEnds || startsEnds.isEmpty() || positions.length == 0) {
return 0;
@ -26,37 +28,39 @@ public class DocumentSpan {
int cnt = 0;
if (positions.length < 8) {
if (positions.length < 8) { // for small arrays we can do a linear search
int seis = 0;
for (int pi = 0; pi < positions.length; pi++) {
int position = positions[pi];
// search through the spans until we find an item that is greater than the given position
for (int sei = seis; sei < startsEnds.size(); sei ++) {
if (startsEnds.getInt(sei) > position) {
cnt += sei % 2;
cnt += sei % 2; // if sei is odd, we are between a start and end position in the spans list
seis = Math.max(seis, sei - 1);
break;
}
}
}
}
else {
int ss = 0;
else { // for large arrays we use a binary search
int searchStart = 0;
for (int sei = 0; sei < startsEnds.size() && ss < positions.length; ) {
for (int sei = 0; sei < startsEnds.size() && searchStart < positions.length; ) {
int start = startsEnds.getInt(sei++);
int end = startsEnds.getInt(sei++);
int i = Arrays.binarySearch(positions, ss, positions.length, start);
if (i < 0) {
i = -i - 1;
}
// find the first position that is greater or equal to the start position
int i = Arrays.binarySearch(positions, searchStart, positions.length, start);
if (i < 0) i = -i - 1; // if the position is not found, we get the insertion point
// ... from that point, count the number of positions that are smaller than the end position
while (i < positions.length && positions[i] < end) {
cnt++;
i++;
}
ss = i;
searchStart = i;
}
}
@ -83,6 +87,8 @@ public class DocumentSpan {
return false;
}
/** Returns true if for any position in the list, there exists a range
* (position[i], position[i]+len] that is overlapped by a span */
public boolean containsRange(IntList positions, int len) {
if (null == startsEnds || startsEnds.size() < 2 || positions.isEmpty()) {
return false;

View File

@ -3,6 +3,10 @@ package nu.marginalia.index.forward.spans;
import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.sequence.CodedSequence;
/** All spans associated with a document
* <p></p>
* A span is a list of document positions that are associated with a particular tag in the document.
* */
public class DocumentSpans {
private static final DocumentSpan EMPTY_SPAN = new DocumentSpan();

View File

@ -1,9 +1,10 @@
package nu.marginalia.index.forward.spans;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.sequence.VarintCodedSequence;
import java.io.IOException;
import java.lang.foreign.Arena;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
@ -18,9 +19,11 @@ public class ForwardIndexSpansReader implements AutoCloseable {
}
public DocumentSpans readSpans(Arena arena, long encodedOffset) throws IOException {
// Decode the size and offset from the encoded offset
long size = SpansCodec.decodeSize(encodedOffset);
long offset = SpansCodec.decodeStartOffset(encodedOffset);
// Allocate a buffer from the arena
var buffer = arena.allocate(size).asByteBuffer();
buffer.clear();
while (buffer.hasRemaining()) {
@ -28,15 +31,18 @@ public class ForwardIndexSpansReader implements AutoCloseable {
}
buffer.flip();
// Read the number of spans in the document
int count = buffer.get();
DocumentSpans ret = new DocumentSpans();
// Decode each span
while (count-- > 0) {
byte code = buffer.get();
short len = buffer.getShort();
ret.accept(code, new GammaCodedSequence(buffer.slice(buffer.position(), len)));
ByteBuffer data = buffer.slice(buffer.position(), len);
ret.accept(code, new VarintCodedSequence(data));
// Reset the buffer position to the end of the span
buffer.position(buffer.position() + len);

View File

@ -1,6 +1,6 @@
package nu.marginalia.index.journal;
import nu.marginalia.sequence.slop.GammaCodedSequenceArrayColumn;
import nu.marginalia.sequence.slop.VarintCodedSequenceArrayColumn;
import nu.marginalia.slop.SlopTable;
import nu.marginalia.slop.column.array.ByteArrayColumn;
import nu.marginalia.slop.column.array.LongArrayColumn;
@ -19,10 +19,10 @@ public record IndexJournalPage(Path baseDir, int page) {
public static LongArrayColumn termIds = new LongArrayColumn("termIds", StorageType.ZSTD);
public static ByteArrayColumn termMeta = new ByteArrayColumn("termMetadata", StorageType.ZSTD);
public static GammaCodedSequenceArrayColumn positions = new GammaCodedSequenceArrayColumn("termPositions", StorageType.ZSTD);
public static VarintCodedSequenceArrayColumn positions = new VarintCodedSequenceArrayColumn("termPositions", StorageType.ZSTD);
public static ByteArrayColumn spanCodes = new ByteArrayColumn("spanCodes", StorageType.ZSTD);
public static GammaCodedSequenceArrayColumn spans = new GammaCodedSequenceArrayColumn("spans", StorageType.ZSTD);
public static VarintCodedSequenceArrayColumn spans = new VarintCodedSequenceArrayColumn("spans", StorageType.ZSTD);
public IndexJournalPage {
if (!baseDir.toFile().isDirectory()) {
@ -55,11 +55,11 @@ public record IndexJournalPage(Path baseDir, int page) {
return termMeta.open(table);
}
public GammaCodedSequenceArrayColumn.Reader openTermPositions(SlopTable table) throws IOException {
public VarintCodedSequenceArrayColumn.Reader openTermPositions(SlopTable table) throws IOException {
return positions.open(table);
}
public GammaCodedSequenceArrayColumn.Reader openSpans(SlopTable table) throws IOException {
public VarintCodedSequenceArrayColumn.Reader openSpans(SlopTable table) throws IOException {
return spans.open(table);
}

View File

@ -3,7 +3,7 @@ package nu.marginalia.index.journal;
import lombok.SneakyThrows;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.model.processed.SlopDocumentRecord;
import nu.marginalia.sequence.slop.GammaCodedSequenceArrayColumn;
import nu.marginalia.sequence.slop.VarintCodedSequenceArrayColumn;
import nu.marginalia.slop.SlopTable;
import nu.marginalia.slop.column.array.ByteArrayColumn;
import nu.marginalia.slop.column.array.LongArrayColumn;
@ -24,9 +24,9 @@ public class IndexJournalSlopWriter extends SlopTable {
private final LongArrayColumn.Writer termIdsWriter;
private final ByteArrayColumn.Writer termMetadataWriter;
private final GammaCodedSequenceArrayColumn.Writer termPositionsWriter;
private final VarintCodedSequenceArrayColumn.Writer termPositionsWriter;
private final GammaCodedSequenceArrayColumn.Writer spansWriter;
private final VarintCodedSequenceArrayColumn.Writer spansWriter;
private final ByteArrayColumn.Writer spanCodesWriter;
private static final MurmurHash3_128 hash = new MurmurHash3_128();

View File

@ -1,7 +1,7 @@
package nu.marginalia.index.positions;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.sequence.VarintCodedSequence;
import java.nio.ByteBuffer;
@ -17,6 +17,6 @@ public class TermData {
}
public CodedSequence positions() {
return new GammaCodedSequence(buffer, 1, buffer.capacity());
return new VarintCodedSequence(buffer, 1, buffer.capacity());
}
}

View File

@ -3,11 +3,10 @@ package nu.marginalia.index.construction.full;
import nu.marginalia.index.journal.IndexJournalPage;
import nu.marginalia.index.journal.IndexJournalSlopWriter;
import nu.marginalia.model.processed.SlopDocumentRecord;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.sequence.VarintCodedSequence;
import nu.marginalia.test.TestUtil;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
@ -46,14 +45,14 @@ public class TestJournalFactory {
'}';
}
}
public record WordWithMeta(String wordId, byte meta, GammaCodedSequence gcs) {
public WordWithMeta(long wordId, byte meta, GammaCodedSequence gcs) {
public record WordWithMeta(String wordId, byte meta, VarintCodedSequence gcs) {
public WordWithMeta(long wordId, byte meta, VarintCodedSequence gcs) {
this(String.valueOf(wordId), meta, gcs);
}
}
public static WordWithMeta wm(long wordId, int meta, int... positions) {
return new WordWithMeta(wordId, (byte) meta, GammaCodedSequence.generate(ByteBuffer.allocate(128), positions));
return new WordWithMeta(wordId, (byte) meta, VarintCodedSequence.generate(positions));
}
public IndexJournalPage createReader(EntryData... entries) throws IOException {
@ -64,11 +63,11 @@ public class TestJournalFactory {
String[] termIds = new String[entry.wordIds.length];
byte[] meta = new byte[entry.wordIds.length];
GammaCodedSequence[] positions = new GammaCodedSequence[entry.wordIds.length];
VarintCodedSequence[] positions = new VarintCodedSequence[entry.wordIds.length];
for (int i = 0; i < entry.wordIds.length; i++) {
termIds[i] = entry.wordIds[i];
meta[i] = 0;
positions[i] = new GammaCodedSequence(new byte[1]);
positions[i] = VarintCodedSequence.generate();
}
writer.put(
@ -100,11 +99,11 @@ public class TestJournalFactory {
String[] termIds = new String[entry.wordIds.length];
byte[] meta = new byte[entry.wordIds.length];
GammaCodedSequence[] positions = new GammaCodedSequence[entry.wordIds.length];
VarintCodedSequence[] positions = new VarintCodedSequence[entry.wordIds.length];
for (int i = 0; i < entry.wordIds.length; i++) {
termIds[i] = entry.wordIds[i].wordId;
meta[i] = entry.wordIds[i].meta;
positions[i] = Objects.requireNonNullElseGet(entry.wordIds[i].gcs, () -> new GammaCodedSequence(new byte[1]));
positions[i] = Objects.requireNonNullElseGet(entry.wordIds[i].gcs, VarintCodedSequence::generate);
}
writer.put(

View File

@ -28,7 +28,7 @@ import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.processed.SlopDocumentRecord;
import nu.marginalia.process.control.FakeProcessHeartbeat;
import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.sequence.VarintCodedSequence;
import nu.marginalia.service.server.Initialization;
import nu.marginalia.storage.FileStorageService;
import org.junit.jupiter.api.AfterEach;
@ -39,7 +39,6 @@ import org.junit.jupiter.api.parallel.Execution;
import java.io.IOException;
import java.lang.foreign.Arena;
import java.net.URISyntaxException;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
@ -321,7 +320,7 @@ public class CombinedIndexReaderTest {
for (int i = 0; i < words.size(); i++) {
metadata[i] = words.get(i).termMetadata;
}
var positions = words.stream().map(w -> w.positions).map(pos -> GammaCodedSequence.generate(ByteBuffer.allocate(1024), pos.toIntArray())).toList();
var positions = words.stream().map(w -> w.positions).map(pos -> VarintCodedSequence.generate(pos.toIntArray())).toList();
indexJournalWriter.put(doc,
new SlopDocumentRecord.KeywordsProjection(

View File

@ -31,7 +31,7 @@ import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.processed.SlopDocumentRecord;
import nu.marginalia.process.control.FakeProcessHeartbeat;
import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.sequence.VarintCodedSequence;
import nu.marginalia.service.control.ServiceHeartbeat;
import nu.marginalia.service.server.Initialization;
import nu.marginalia.storage.FileStorageService;
@ -377,11 +377,11 @@ public class IndexQueryServiceIntegrationSmokeTest {
metadata[i] = WordFlags.Title.asBit();
}
List<GammaCodedSequence> positions = new ArrayList<>();
List<VarintCodedSequence> positions = new ArrayList<>();
ByteBuffer wa = ByteBuffer.allocate(32);
for (int i = 0; i < factors.length; i++) {
positions.add(GammaCodedSequence.generate(wa, factors));
positions.add(VarintCodedSequence.generate(factors));
}
indexJournalWriter.put(fullId,
@ -417,11 +417,11 @@ public class IndexQueryServiceIntegrationSmokeTest {
metadata[i] = WordFlags.Title.asBit();
}
List<GammaCodedSequence> positions = new ArrayList<>();
List<VarintCodedSequence> positions = new ArrayList<>();
ByteBuffer wa = ByteBuffer.allocate(32);
for (int i = 0; i < factors.length; i++) {
positions.add(GammaCodedSequence.generate(wa, i + 1));
positions.add(VarintCodedSequence.generate(i + 1));
}
indexJournalWriter.put(fullId,

View File

@ -33,7 +33,7 @@ import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.processed.SlopDocumentRecord;
import nu.marginalia.process.control.FakeProcessHeartbeat;
import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.sequence.VarintCodedSequence;
import nu.marginalia.service.control.ServiceHeartbeat;
import nu.marginalia.service.server.Initialization;
import nu.marginalia.storage.FileStorageService;
@ -46,7 +46,6 @@ import org.junit.jupiter.api.parallel.Execution;
import javax.annotation.CheckReturnValue;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
@ -544,10 +543,9 @@ public class IndexQueryServiceIntegrationTest {
metadata[i] = (byte) words.get(i).termMetadata;
}
List<GammaCodedSequence> positions = new ArrayList<>();
ByteBuffer workBuffer = ByteBuffer.allocate(8192);
List<VarintCodedSequence> positions = new ArrayList<>();
for (int i = 0; i < words.size(); i++) {
positions.add(GammaCodedSequence.generate(workBuffer, words.get(i).positions));
positions.add(VarintCodedSequence.generate(words.get(i).positions));
}
indexJournalWriter.put(doc,

View File

@ -20,6 +20,13 @@ public class VarintCodedSequence implements CodedSequence {
this.startLimit = buffer.limit();
}
/** Creates a sequence backed by {@code buffer}, recording an explicit start position and limit
 * instead of taking them from the buffer itself.
 *
 * @param buffer     the backing buffer; it is stored as-is, not copied
 * @param startPos   value stored as the sequence's start position
 *                   (presumably the offset of the first encoded byte -- confirm against the iterator)
 * @param startLimit value stored as the sequence's start limit
 *                   (presumably the offset just past the last encoded byte -- confirm)
 */
public VarintCodedSequence(ByteBuffer buffer, int startPos, int startLimit) {
    this.startPos = startPos;
    this.startLimit = startLimit;
    this.raw = buffer;
}
private static int requiredBufferSize(int[] values) {
int prev = 0;
int size = 0;
@ -32,11 +39,47 @@ public class VarintCodedSequence implements CodedSequence {
return size + varintSize(size + 1);
}
/** Computes how many bytes to allocate for encoding {@code values}.
 * <p>
 * The result is the sum of the varint sizes of the differences between consecutive
 * values (the first value is differenced against 0), plus the varint size of that
 * byte total plus one, which reserves room for the leading header value.
 *
 * @param values the values to be encoded
 * @return the buffer size in bytes
 */
private static int requiredBufferSize(IntList values) {
    int payloadBytes = 0;
    int previous = 0;

    for (int idx = 0, n = values.size(); idx < n; idx++) {
        final int current = values.getInt(idx);
        payloadBytes += varintSize(current - previous);
        previous = current;
    }

    final int headerBytes = varintSize(payloadBytes + 1);
    return payloadBytes + headerBytes;
}
/** Returns the number of bytes needed to write {@code value} as a varint that
 * carries 7 payload bits per byte.
 * <p>
 * Zero has no significant bits but still occupies one byte when written
 * ({@code encodeValue} emits a single byte for every value below {@code 1<<7}),
 * so the result is never less than 1. Negative values have all 32 bits
 * significant and are therefore sized as 5 bytes.
 *
 * @param value the value to be sized
 * @return the size in bytes, at least 1
 */
private static int varintSize(int value) {
    int bits = 32 - Integer.numberOfLeadingZeros(value);
    // Round up to whole 7-bit groups; clamp so that 0 is counted as one byte
    return Math.max(1, (bits + 6) / 7);
}
/** Encodes {@code values} into a new {@link VarintCodedSequence}.
 * <p>
 * The buffer starts with a header holding {@code values.size() + 1}, followed by
 * the difference between each value and its predecessor (the first value is
 * differenced against 0). The buffer is flipped before being wrapped.
 *
 * @param values the values to encode; they must be strictly increasing and
 *               positive, which is only checked when assertions are enabled
 * @return the encoded sequence
 */
public static VarintCodedSequence generate(IntList values) {
    final ByteBuffer out = ByteBuffer.allocate(requiredBufferSize(values));
    final int count = values.size();

    // Header first
    encodeValue(out, count + 1);

    // ... then the deltas
    int last = 0;
    for (int idx = 0; idx < count; idx++) {
        final int current = values.getInt(idx);
        final int delta = current - last;

        assert delta > 0 : "Values must be strictly increasing";

        encodeValue(out, delta);
        last = current;
    }

    out.flip();
    return new VarintCodedSequence(out);
}
public static VarintCodedSequence generate(int... values) {
int bufferSize = requiredBufferSize(values);
ByteBuffer buffer = ByteBuffer.allocate(bufferSize);
@ -60,20 +103,23 @@ public class VarintCodedSequence implements CodedSequence {
}
private static void encodeValue(ByteBuffer buffer, int value) {
if (value < 0x80) {
if (value < (1<<7)) {
buffer.put((byte) value);
}
else if (value < 0x4_000) {
else if (value < (1<<14)) {
buffer.put((byte) (value >>> (7) | 0x80));
buffer.put((byte) (value & 0x7F));
}
else if (value < 0x20_0000) {
else if (value < (1<<21)) {
buffer.put((byte) (value >>> (14) | 0x80));
buffer.put((byte) (value >>> (7) | 0x80));
buffer.put((byte) (value & 0x7F));
}
else if (value < 0x1000_0000) {
buffer.putInt(Integer.expand(value, 0x00808080) | 0x80808000);
else if (value < (1<<28)) {
buffer.put((byte) ((value >>> 21) | 0x80));
buffer.put((byte) ((value >>> 14) | 0x80));
buffer.put((byte) ((value >>> 7) | 0x80));
buffer.put((byte) (value & 0x7F));
}
else {
throw new IllegalArgumentException("Value too large to encode");
@ -139,12 +185,13 @@ public class VarintCodedSequence implements CodedSequence {
return b;
}
int value = b;
int value = b & 0x7F;
do {
b = buffer.get();
value = value << 7 | (b & 0x7F);
value = (value << 7) | (b & 0x7F);
} while ((b & 0x80) != 0);
return value;
}

View File

@ -49,20 +49,22 @@ public class BitReader {
/** Read the next width bits from the buffer */
public int get(int width) {
if (width == 0) {
return 0;
// Fast path for reading a full integer from the current value
if (bitPosition >= width) {
// We have enough bits in the current value to satisfy the request
int result = (int)(currentValue >>> (bitPosition - width)) & ~-(1<<width);
// Update the bit position
bitPosition -= width;
return result;
}
assert width <= 32;
if (bitPosition <= 0) {
readNext();
}
int result = 0;
while (width > 0) {
do {
int dw = bitPosition - width;
if (dw >= 0) { // We have enough bits in the current value to satisfy the request
result |= ((int)(currentValue >>> dw)) & ~-(1<<width);
@ -85,6 +87,7 @@ public class BitReader {
readNext(); // implicitly: bitPosition = 0 here
}
}
while (width > 0);
return result;
}

View File

@ -0,0 +1,154 @@
package nu.marginalia.sequence.slop;
import nu.marginalia.sequence.VarintCodedSequence;
import nu.marginalia.slop.column.AbstractColumn;
import nu.marginalia.slop.column.AbstractObjectColumn;
import nu.marginalia.slop.column.ObjectColumnReader;
import nu.marginalia.slop.column.ObjectColumnWriter;
import nu.marginalia.slop.column.dynamic.VarintColumn;
import nu.marginalia.slop.desc.ColumnFunction;
import nu.marginalia.slop.desc.StorageType;
import java.io.IOException;
import java.net.URI;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
/** Slop column extension for storing lists of {@link VarintCodedSequence} objects.
 * <p>
 * Each record is a list of sequences. The number of sequences in the record is
 * written to a varint group-length column, and the sequences themselves are
 * written one after another to a {@link VarintCodedSequenceColumn}.
 */
public class VarintCodedSequenceArrayColumn extends AbstractObjectColumn<List<VarintCodedSequence>, VarintCodedSequenceArrayColumn.Reader, VarintCodedSequenceArrayColumn.Writer> {
    private final VarintColumn groupsColumn;
    private final VarintCodedSequenceColumn dataColumn;

    /** Creates the column with {@link StorageType#PLAIN} storage. */
    public VarintCodedSequenceArrayColumn(String name) {
        this(name, StorageType.PLAIN);
    }

    /** Creates the column with the given name and storage type. */
    public VarintCodedSequenceArrayColumn(String name, StorageType storageType) {
        super(name,
                "vcs[]",
                ByteOrder.nativeOrder(),
                ColumnFunction.DATA,
                storageType);

        groupsColumn = new VarintColumn(name, ColumnFunction.GROUP_LENGTH, storageType);
        // NOTE(review): the data column is created through its single-argument constructor,
        // so storageType is not forwarded to it -- confirm this is intended
        dataColumn = new VarintCodedSequenceColumn(name);
    }

    /** Creates a writer over the data column and the group-length column for the given page. */
    public Writer createUnregistered(Path path, int page) throws IOException {
        return new Writer(
                dataColumn.createUnregistered(path, page),
                groupsColumn.createUnregistered(path, page)
        );
    }

    /** Opens a reader over the data column and the group-length column for the given page. */
    public Reader openUnregistered(URI uri, int page) throws IOException {
        return new Reader(
                dataColumn.openUnregistered(uri, page),
                groupsColumn.openUnregistered(uri, page)
        );
    }

    /** Writes records, each being a list of sequences. */
    public class Writer implements ObjectColumnWriter<List<VarintCodedSequence>> {
        private final VarintColumn.Writer groupsWriter;
        private final VarintCodedSequenceColumn.Writer dataWriter;

        Writer(VarintCodedSequenceColumn.Writer dataWriter, VarintColumn.Writer groupsWriter)
        {
            this.groupsWriter = groupsWriter;
            this.dataWriter = dataWriter;
        }

        @Override
        public AbstractColumn<?, ?> columnDesc() {
            return VarintCodedSequenceArrayColumn.this;
        }

        /** Writes the list size to the group-length column, then each sequence to the data column. */
        @Override
        public void put(List<VarintCodedSequence> sequences) throws IOException {
            groupsWriter.put(sequences.size());
            for (VarintCodedSequence sequence : sequences) {
                dataWriter.put(sequence);
            }
        }

        /** Returns the position of the group-length writer. */
        public long position() {
            return groupsWriter.position();
        }

        /** Closes both underlying writers; the groups writer is closed even if closing the data writer fails. */
        public void close() throws IOException {
            try {
                dataWriter.close();
            }
            finally {
                groupsWriter.close();
            }
        }
    }

    /** Reads records, each being a list of sequences. */
    public class Reader implements ObjectColumnReader<List<VarintCodedSequence>> {
        private final VarintCodedSequenceColumn.Reader dataReader;
        private final VarintColumn.Reader groupsReader;

        public Reader(VarintCodedSequenceColumn.Reader dataReader, VarintColumn.Reader groupsReader) {
            this.dataReader = dataReader;
            this.groupsReader = groupsReader;
        }

        @Override
        public AbstractColumn<?, ?> columnDesc() {
            return VarintCodedSequenceArrayColumn.this;
        }

        /** Skips {@code positions} records: sums their group lengths and skips that many sequences in the data column. */
        @Override
        public void skip(long positions) throws IOException {
            // Accumulate in a long, matching the long count accepted by dataReader.skip()
            long toSkip = 0;
            for (long i = 0; i < positions; i++) {
                toSkip += groupsReader.get();
            }
            dataReader.skip(toSkip);
        }

        @Override
        public boolean hasRemaining() throws IOException {
            return groupsReader.hasRemaining();
        }

        /** Returns the position of the group-length reader. */
        public long position() throws IOException {
            return groupsReader.position();
        }

        /** Reads the next record: a group length followed by that many sequences. */
        @Override
        public List<VarintCodedSequence> get() throws IOException {
            int count = groupsReader.get();
            var ret = new ArrayList<VarintCodedSequence>(count);

            for (int i = 0; i < count; i++) {
                ret.add(dataReader.get());
            }

            return ret;
        }

        /** Reads the next record as raw bytes without constructing sequence objects.
         * <p>
         * Each sequence is read into {@code workArea} at its current position, and the
         * returned buffers are slices of {@code workArea} covering the bytes of each
         * sequence, so they share its content.
         *
         * @param workArea destination buffer; its position is advanced past the data read
         * @return one slice per sequence in the record
         */
        public List<ByteBuffer> getData(ByteBuffer workArea) throws IOException {
            int count = groupsReader.get();
            var ret = new ArrayList<ByteBuffer>(count);

            for (int i = 0; i < count; i++) {
                int start = workArea.position();
                dataReader.getData(workArea);
                var slice = workArea.slice(start, workArea.position() - start);
                ret.add(slice);
            }

            return ret;
        }

        /** Closes both underlying readers; the groups reader is closed even if closing the data reader fails. */
        public void close() throws IOException {
            try {
                dataReader.close();
            }
            finally {
                groupsReader.close();
            }
        }
    }
}

View File

@ -0,0 +1,148 @@
package nu.marginalia.sequence.slop;
import nu.marginalia.sequence.VarintCodedSequence;
import nu.marginalia.slop.column.AbstractColumn;
import nu.marginalia.slop.column.AbstractObjectColumn;
import nu.marginalia.slop.column.ObjectColumnReader;
import nu.marginalia.slop.column.ObjectColumnWriter;
import nu.marginalia.slop.column.dynamic.VarintColumn;
import nu.marginalia.slop.desc.ColumnFunction;
import nu.marginalia.slop.desc.StorageType;
import nu.marginalia.slop.storage.Storage;
import nu.marginalia.slop.storage.StorageReader;
import nu.marginalia.slop.storage.StorageWriter;
import java.io.IOException;
import java.net.URI;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.file.Path;
/** Slop column extension for storing {@link VarintCodedSequence} objects.
 * <p>
 * The bytes of each sequence are written to the column's storage, and the byte
 * length of each sequence is written to a separate varint column.
 */
public class VarintCodedSequenceColumn extends AbstractObjectColumn<VarintCodedSequence, VarintCodedSequenceColumn.Reader, VarintCodedSequenceColumn.Writer> {
    private final VarintColumn indexColumn;

    /** Creates the column with {@link StorageType#PLAIN} storage. */
    public VarintCodedSequenceColumn(String name) {
        this(name, StorageType.PLAIN);
    }

    /** Creates the column with the given name and storage type. */
    public VarintCodedSequenceColumn(String name, StorageType storageType) {
        super(name,
                "vcs",
                ByteOrder.nativeOrder(),
                ColumnFunction.DATA,
                storageType);

        // NOTE(review): the length column always uses PLAIN storage regardless of storageType -- confirm intended
        indexColumn = new VarintColumn(name, ColumnFunction.DATA_LEN, StorageType.PLAIN);
    }

    /** Creates a writer over this column's storage and its length column for the given page. */
    public Writer createUnregistered(Path path, int page) throws IOException {
        return new Writer(
                Storage.writer(path, this, page),
                indexColumn.createUnregistered(path, page)
        );
    }

    /** Opens a reader over this column's storage and its length column for the given page. */
    public Reader openUnregistered(URI uri, int page) throws IOException {
        return new Reader(
                Storage.reader(uri, this, page, false),
                indexColumn.openUnregistered(uri, page)
        );
    }

    /** Writes sequences as a length entry plus the sequence's bytes. */
    public class Writer implements ObjectColumnWriter<VarintCodedSequence> {
        private final VarintColumn.Writer indexWriter;
        private final StorageWriter storage;

        public Writer(StorageWriter storage,
                      VarintColumn.Writer indexWriter)
        {
            this.storage = storage;
            this.indexWriter = indexWriter;
        }

        @Override
        public AbstractColumn<?, ?> columnDesc() {
            return VarintCodedSequenceColumn.this;
        }

        /** Writes the number of remaining bytes in the sequence's buffer to the length column, then the bytes to storage. */
        @Override
        public void put(VarintCodedSequence sequence) throws IOException {
            var buffer = sequence.buffer();
            int length = buffer.remaining();

            indexWriter.put(length);
            storage.putBytes(buffer);
        }

        /** Returns the position of the length writer. */
        public long position() {
            return indexWriter.position();
        }

        /** Closes the length writer and the storage; the storage is closed even if closing the length writer fails. */
        public void close() throws IOException {
            try {
                indexWriter.close();
            }
            finally {
                storage.close();
            }
        }
    }

    /** Reads sequences written by {@link Writer}. */
    public class Reader implements ObjectColumnReader<VarintCodedSequence> {
        private final VarintColumn.Reader indexReader;
        private final StorageReader storage;

        Reader(StorageReader reader, VarintColumn.Reader indexReader) throws IOException {
            this.storage = reader;
            this.indexReader = indexReader;
        }

        @Override
        public AbstractColumn<?, ?> columnDesc() {
            return VarintCodedSequenceColumn.this;
        }

        /** Skips {@code positions} sequences by reading each length and skipping that much of the storage. */
        @Override
        public void skip(long positions) throws IOException {
            for (long i = 0; i < positions; i++) {
                int size = indexReader.get();
                // presumably skip(count, elementSize) with one-byte elements -- confirm against StorageReader
                storage.skip(size, 1);
            }
        }

        @Override
        public boolean hasRemaining() throws IOException {
            return indexReader.hasRemaining();
        }

        /** Returns the position of the length reader. */
        public long position() throws IOException {
            return indexReader.position();
        }

        /** Reads the next sequence into a freshly allocated buffer sized from the length column. */
        @Override
        public VarintCodedSequence get() throws IOException {
            int size = indexReader.get();

            ByteBuffer dest = ByteBuffer.allocate(size);
            storage.getBytes(dest);
            dest.flip();

            return new VarintCodedSequence(dest);
        }

        /** Reads the bytes of the next sequence into {@code workArea} at its current position.
         * <p>
         * The limit of {@code workArea} is temporarily narrowed to the end of the
         * sequence while reading, and is restored afterwards even if the read fails.
         *
         * @param workArea destination buffer; needs room for the sequence after its current position
         */
        public void getData(ByteBuffer workArea) throws IOException {
            int size = indexReader.get();

            int oldLimit = workArea.limit();
            workArea.limit(workArea.position() + size);
            try {
                storage.getBytes(workArea);
            }
            finally {
                // Never leave the caller's buffer with a narrowed limit
                workArea.limit(oldLimit);
            }
        }

        /** Closes the length reader and the storage; the storage is closed even if closing the length reader fails. */
        public void close() throws IOException {
            try {
                indexReader.close();
            }
            finally {
                storage.close();
            }
        }
    }
}

View File

@ -25,51 +25,51 @@ public class SequenceBenchmarks {
workArea = ByteBuffer.allocate(65536);
arrayValues = new int[] { 1, 3, 5, 16, 1024, 2048, 4096, 4098, 4100 };
list = new IntArrayList(arrayValues);
vcs = VarintCodedSequence.generate(1, 3, 5, 16, 1024, 2048);
gcs = GammaCodedSequence.generate(workArea, 1, 3, 5, 16, 1024, 2048);
vcs = VarintCodedSequence.generate(16,21,24,28,66,71,76,83,87,98,101,106,113,115,119,122,143,148,159,164,167,177,182,211,223,242,245,250,273,275,280,289,292,300,307,322,330,338,345,371,397,402,411,420,427,430,433,437,440,448,451,481,490,513,522,555,571,573,585,597,606,613,634,638,640,644,656,660,666,683,689,692,696,709,712,718,727,731,735,738);
gcs = GammaCodedSequence.generate(workArea, 16,21,24,28,66,71,76,83,87,98,101,106,113,115,119,122,143,148,159,164,167,177,182,211,223,242,245,250,273,275,280,289,292,300,307,322,330,338,345,371,397,402,411,420,427,430,433,437,440,448,451,481,490,513,522,555,571,573,585,597,606,613,634,638,640,644,656,660,666,683,689,692,696,709,712,718,727,731,735,738);
}
}
/** Benchmarks iterating over the varint coded sequence held in {@code state},
 * summing every decoded value. The sum is returned, presumably so that the
 * decoding work is not optimized away.
 */
@Fork(value = 1, warmups = 1)
@Warmup(iterations = 1)
@Benchmark
@BenchmarkMode(Mode.Throughput)
public int vcsDecode(SequenceState state) {
    int total = 0;
    for (var it = state.vcs.iterator(); it.hasNext(); ) {
        total += it.nextInt();
    }
    return total;
}
//
// @Fork(value = 5, warmups = 5)
// @Warmup(iterations = 5)
// @Benchmark
// @BenchmarkMode(Mode.Throughput)
// public int vcsDecode(SequenceState state) {
// var iter = state.vcs.iterator();
// public int listDecode2(SequenceState state) {
// var list = state.arrayValues;
// int sum = 0;
// while (iter.hasNext()) {
// sum += iter.nextInt();
// for (int i = 0; i < list.length; i++) {
// sum += list[i];
// }
// return sum;
// }
@Fork(value = 5, warmups = 5)
@Warmup(iterations = 5)
@Fork(value = 1, warmups = 1)
@Warmup(iterations = 1)
@Benchmark
@BenchmarkMode(Mode.Throughput)
public int listDecode2(SequenceState state) {
var list = state.arrayValues;
public int gcsDecode(SequenceState state) {
var iter = state.gcs.iterator();
int sum = 0;
for (int i = 0; i < list.length; i++) {
sum += list[i];
while (iter.hasNext()) {
sum += iter.nextInt();
}
return sum;
}
// @Fork(value = 1, warmups = 1)
// @Warmup(iterations = 1)
// @Benchmark
// @BenchmarkMode(Mode.Throughput)
// public int gcsDecode(SequenceState state) {
// var iter = state.gcs.iterator();
// int sum = 0;
// while (iter.hasNext()) {
// sum += iter.nextInt();
// }
// return sum;
// }
// @Fork(value = 1, warmups = 1)
// @Warmup(iterations = 1)
// @Benchmark

View File

@ -63,6 +63,8 @@ class SequenceOperationsTest {
assertTrue(SequenceOperations.intersectSequences(seq1.iterator(), seq2.iterator(), seq3.iterator()));
}
@Test
void intersectSequencesDeepMatch3findIntersections() {
ByteBuffer wa = ByteBuffer.allocate(1024);

View File

@ -1,7 +1,7 @@
package nu.marginalia.keyword.model;
import nu.marginalia.model.idx.CodedWordSpan;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.sequence.VarintCodedSequence;
import java.util.List;
@ -9,12 +9,12 @@ public final class DocumentKeywords {
public final List<String> keywords;
public final byte[] metadata;
public final List<GammaCodedSequence> positions;
public final List<VarintCodedSequence> positions;
public final List<CodedWordSpan> spans;
public DocumentKeywords(List<String> keywords,
byte[] metadata,
List<GammaCodedSequence> positions,
List<VarintCodedSequence> positions,
List<CodedWordSpan> spans)
{
this.keywords = keywords;

View File

@ -8,7 +8,7 @@ import lombok.Getter;
import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.model.idx.CodedWordSpan;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.sequence.VarintCodedSequence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -39,7 +39,7 @@ public class DocumentKeywordsBuilder {
public DocumentKeywords build(ByteBuffer workArea) {
final List<String> wordArray = new ArrayList<>(wordToMeta.size());
final TByteArrayList meta = new TByteArrayList(wordToMeta.size());
final List<GammaCodedSequence> positions = new ArrayList<>(wordToMeta.size());
final List<VarintCodedSequence> positions = new ArrayList<>(wordToMeta.size());
var iter = wordToMeta.object2ByteEntrySet().fastIterator();
@ -49,13 +49,13 @@ public class DocumentKeywordsBuilder {
meta.add(entry.getByteValue());
wordArray.add(entry.getKey());
var posList = wordToPos.getOrDefault(entry.getKey(), IntList.of());
IntList posList = wordToPos.getOrDefault(entry.getKey(), IntList.of());
if (posList.size() > MAX_POSITIONS_PER_WORD) {
posList.subList(MAX_POSITIONS_PER_WORD, posList.size()).clear();
}
positions.add(GammaCodedSequence.generate(workArea, posList));
positions.add(VarintCodedSequence.generate(posList));
}
// Encode spans
@ -70,7 +70,7 @@ public class DocumentKeywordsBuilder {
positionsForTag.add(span.end());
}
spans.add(new CodedWordSpan((byte) tag.charValue(), GammaCodedSequence.generate(workArea, positionsForTag)));
spans.add(new CodedWordSpan((byte) tag.charValue(), VarintCodedSequence.generate(positionsForTag)));
});
return new DocumentKeywords(wordArray, meta.toArray(), positions, spans);

View File

@ -12,7 +12,7 @@ import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.processed.SlopDocumentRecord;
import nu.marginalia.model.processed.SlopDomainLinkRecord;
import nu.marginalia.model.processed.SlopDomainRecord;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.sequence.VarintCodedSequence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -96,7 +96,7 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter
var wb = document.words.build(workArea);
List<GammaCodedSequence> spanSequences = new ArrayList<>(wb.spans.size());
List<VarintCodedSequence> spanSequences = new ArrayList<>(wb.spans.size());
byte[] spanCodes = new byte[wb.spans.size()];
for (int i = 0; i < wb.spans.size(); i++) {

View File

@ -1,8 +1,8 @@
package nu.marginalia.model.processed;
import lombok.Builder;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.sequence.slop.GammaCodedSequenceArrayColumn;
import nu.marginalia.sequence.VarintCodedSequence;
import nu.marginalia.sequence.slop.VarintCodedSequenceArrayColumn;
import nu.marginalia.slop.SlopTable;
import nu.marginalia.slop.column.array.ByteArrayColumn;
import nu.marginalia.slop.column.array.ObjectArrayColumn;
@ -39,9 +39,9 @@ public record SlopDocumentRecord(
Integer pubYear,
List<String> words,
byte[] metas,
List<GammaCodedSequence> positions,
List<VarintCodedSequence> positions,
byte[] spanCodes,
List<GammaCodedSequence> spans
List<VarintCodedSequence> spans
) {
public SlopDocumentRecord {
@ -60,9 +60,9 @@ public record SlopDocumentRecord(
int length,
List<String> words,
byte[] metas,
List<GammaCodedSequence> positions,
List<VarintCodedSequence> positions,
byte[] spanCodes,
List<GammaCodedSequence> spans)
List<VarintCodedSequence> spans)
{
// Override the equals method, since the equals generated for records compares array fields by reference rather than by content
@Override
@ -127,12 +127,12 @@ public record SlopDocumentRecord(
private static final ObjectArrayColumn<String> keywordsColumn = new StringColumn("keywords", StorageType.ZSTD).asArray();
private static final ByteArrayColumn termMetaColumn = new ByteArrayColumn("termMetadata", StorageType.ZSTD);
private static final GammaCodedSequenceArrayColumn termPositionsColumn = new GammaCodedSequenceArrayColumn("termPositions", StorageType.ZSTD);
private static final VarintCodedSequenceArrayColumn termPositionsColumn = new VarintCodedSequenceArrayColumn("termPositions", StorageType.ZSTD);
// Spans columns
private static final ByteArrayColumn spanCodesColumn = new ByteArrayColumn("spanCodes", StorageType.ZSTD);
private static final GammaCodedSequenceArrayColumn spansColumn = new GammaCodedSequenceArrayColumn("spans", StorageType.ZSTD);
private static final VarintCodedSequenceArrayColumn spansColumn = new VarintCodedSequenceArrayColumn("spans", StorageType.ZSTD);
public static class KeywordsProjectionReader extends SlopTable {
private final TxtStringColumn.Reader domainsReader;
@ -143,10 +143,10 @@ public record SlopDocumentRecord(
private final ObjectArrayColumn<String>.Reader keywordsReader;
private final ByteArrayColumn.Reader termMetaReader;
private final GammaCodedSequenceArrayColumn.Reader termPositionsReader;
private final VarintCodedSequenceArrayColumn.Reader termPositionsReader;
private final ByteArrayColumn.Reader spanCodesReader;
private final GammaCodedSequenceArrayColumn.Reader spansReader;
private final VarintCodedSequenceArrayColumn.Reader spansReader;
public KeywordsProjectionReader(SlopTable.Ref<SlopDocumentRecord> pageRef) throws IOException {
super(pageRef);
@ -177,10 +177,10 @@ public record SlopDocumentRecord(
int length = lengthsReader.get();
List<String> words = keywordsReader.get();
List<GammaCodedSequence> positions = termPositionsReader.get();
List<VarintCodedSequence> positions = termPositionsReader.get();
byte[] metas = termMetaReader.get();
byte[] spanCodes = spanCodesReader.get();
List<GammaCodedSequence> spans = spansReader.get();
List<VarintCodedSequence> spans = spansReader.get();
return new KeywordsProjection(
domain,
@ -272,9 +272,9 @@ public record SlopDocumentRecord(
private final IntColumn.Writer pubYearWriter;
private final ObjectArrayColumn<String>.Writer keywordsWriter;
private final ByteArrayColumn.Writer termMetaWriter;
private final GammaCodedSequenceArrayColumn.Writer termPositionsWriter;
private final VarintCodedSequenceArrayColumn.Writer termPositionsWriter;
private final ByteArrayColumn.Writer spansCodesWriter;
private final GammaCodedSequenceArrayColumn.Writer spansWriter;
private final VarintCodedSequenceArrayColumn.Writer spansWriter;
public Writer(Path baseDir, int page) throws IOException {
super(baseDir, page);

View File

@ -1,6 +1,6 @@
package nu.marginalia.model.processed;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.sequence.VarintCodedSequence;
import nu.marginalia.slop.SlopTable;
import nu.marginalia.test.TestUtil;
import org.junit.jupiter.api.AfterEach;
@ -46,9 +46,9 @@ public class SlopDocumentRecordTest {
null,
List.of("test1", "test2"),
new byte[] { 2, 3},
List.of(GammaCodedSequence.generate(workArea, 1, 3, 5), GammaCodedSequence.generate(workArea, 2, 4, 6)),
List.of(VarintCodedSequence.generate(1, 3, 5), VarintCodedSequence.generate(2, 4, 6)),
new byte[] { 'a', 'b' },
List.of(GammaCodedSequence.generate(workArea, 2, 3, 5), GammaCodedSequence.generate(workArea, 3, 4, 6))
List.of(VarintCodedSequence.generate(2, 3, 5), VarintCodedSequence.generate(3, 4, 6))
);
try (var writer = new SlopDocumentRecord.Writer(testDir, 0)) {