(index, EXPERIMENTAL) Evaluate using Varint instead of GCS for position data

This commit is contained in:
Viktor Lofgren 2024-08-26 14:20:39 +02:00
parent 30bf845c81
commit abab5bdc8a
23 changed files with 478 additions and 113 deletions

View File

@ -1,6 +1,6 @@
package nu.marginalia.model.idx; package nu.marginalia.model.idx;
import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.sequence.VarintCodedSequence;
public record CodedWordSpan(byte code, GammaCodedSequence spans) { public record CodedWordSpan(byte code, VarintCodedSequence spans) {
} }

View File

@ -120,7 +120,6 @@ public class ForwardIndexConverter {
for (int i = 0; i < spansCodes.length; i++) { for (int i = 0; i < spansCodes.length; i++) {
spansWriter.writeSpan(spansCodes[i], spans.get(i)); spansWriter.writeSpan(spansCodes[i], spans.get(i));
} }
long encodedSpansOffset = spansWriter.endRecord(); long encodedSpansOffset = spansWriter.endRecord();

View File

@ -6,6 +6,7 @@ import nu.marginalia.sequence.CodedSequence;
import java.util.Arrays; import java.util.Arrays;
/** A list of the interlaced start and end positions of each span in the document of this type */
public class DocumentSpan { public class DocumentSpan {
/** A list of the interlaced start and end positions of each span in the document of this type */ /** A list of the interlaced start and end positions of each span in the document of this type */
@ -19,6 +20,7 @@ public class DocumentSpan {
this.startsEnds = null; this.startsEnds = null;
} }
/** Counts the number of intersections between the spans in the document of this type and the given list of positions */
public int countIntersections(int[] positions) { public int countIntersections(int[] positions) {
if (null == startsEnds || startsEnds.isEmpty() || positions.length == 0) { if (null == startsEnds || startsEnds.isEmpty() || positions.length == 0) {
return 0; return 0;
@ -26,37 +28,39 @@ public class DocumentSpan {
int cnt = 0; int cnt = 0;
if (positions.length < 8) { if (positions.length < 8) { // for small arrays we can do a linear search
int seis = 0; int seis = 0;
for (int pi = 0; pi < positions.length; pi++) { for (int pi = 0; pi < positions.length; pi++) {
int position = positions[pi]; int position = positions[pi];
// search through the spans until we find an item that is greater than the given position
for (int sei = seis; sei < startsEnds.size(); sei ++) { for (int sei = seis; sei < startsEnds.size(); sei ++) {
if (startsEnds.getInt(sei) > position) { if (startsEnds.getInt(sei) > position) {
cnt += sei % 2; cnt += sei % 2; // if sei is odd, we are between a start and end position in the spans list
seis = Math.max(seis, sei - 1); seis = Math.max(seis, sei - 1);
break; break;
} }
} }
} }
} }
else { else { // for large arrays we use a binary search
int ss = 0; int searchStart = 0;
for (int sei = 0; sei < startsEnds.size() && ss < positions.length; ) { for (int sei = 0; sei < startsEnds.size() && searchStart < positions.length; ) {
int start = startsEnds.getInt(sei++); int start = startsEnds.getInt(sei++);
int end = startsEnds.getInt(sei++); int end = startsEnds.getInt(sei++);
int i = Arrays.binarySearch(positions, ss, positions.length, start); // find the first position that is greater or equal to the start position
if (i < 0) { int i = Arrays.binarySearch(positions, searchStart, positions.length, start);
i = -i - 1; if (i < 0) i = -i - 1; // if the position is not found, we get the insertion point
}
// ... from that point, count the number of positions that are smaller than the end position
while (i < positions.length && positions[i] < end) { while (i < positions.length && positions[i] < end) {
cnt++; cnt++;
i++; i++;
} }
ss = i; searchStart = i;
} }
} }
@ -83,6 +87,8 @@ public class DocumentSpan {
return false; return false;
} }
/** Returns true if for any position in the list, there exists a range
* (position[i], position[i]+len] that is overlapped by a span */
public boolean containsRange(IntList positions, int len) { public boolean containsRange(IntList positions, int len) {
if (null == startsEnds || startsEnds.size() < 2 || positions.isEmpty()) { if (null == startsEnds || startsEnds.size() < 2 || positions.isEmpty()) {
return false; return false;

View File

@ -3,6 +3,10 @@ package nu.marginalia.index.forward.spans;
import nu.marginalia.language.sentence.tag.HtmlTag; import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.sequence.CodedSequence; import nu.marginalia.sequence.CodedSequence;
/** All spans associated with a document
* <p></p>
* A span is a list of document positions that are associated with a particular tag in the document.
* */
public class DocumentSpans { public class DocumentSpans {
private static final DocumentSpan EMPTY_SPAN = new DocumentSpan(); private static final DocumentSpan EMPTY_SPAN = new DocumentSpan();

View File

@ -1,9 +1,10 @@
package nu.marginalia.index.forward.spans; package nu.marginalia.index.forward.spans;
import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.sequence.VarintCodedSequence;
import java.io.IOException; import java.io.IOException;
import java.lang.foreign.Arena; import java.lang.foreign.Arena;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel; import java.nio.channels.FileChannel;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
@ -18,9 +19,11 @@ public class ForwardIndexSpansReader implements AutoCloseable {
} }
public DocumentSpans readSpans(Arena arena, long encodedOffset) throws IOException { public DocumentSpans readSpans(Arena arena, long encodedOffset) throws IOException {
// Decode the size and offset from the encoded offset
long size = SpansCodec.decodeSize(encodedOffset); long size = SpansCodec.decodeSize(encodedOffset);
long offset = SpansCodec.decodeStartOffset(encodedOffset); long offset = SpansCodec.decodeStartOffset(encodedOffset);
// Allocate a buffer from the arena
var buffer = arena.allocate(size).asByteBuffer(); var buffer = arena.allocate(size).asByteBuffer();
buffer.clear(); buffer.clear();
while (buffer.hasRemaining()) { while (buffer.hasRemaining()) {
@ -28,15 +31,18 @@ public class ForwardIndexSpansReader implements AutoCloseable {
} }
buffer.flip(); buffer.flip();
// Read the number of spans in the document
int count = buffer.get(); int count = buffer.get();
DocumentSpans ret = new DocumentSpans(); DocumentSpans ret = new DocumentSpans();
// Decode each span
while (count-- > 0) { while (count-- > 0) {
byte code = buffer.get(); byte code = buffer.get();
short len = buffer.getShort(); short len = buffer.getShort();
ret.accept(code, new GammaCodedSequence(buffer.slice(buffer.position(), len))); ByteBuffer data = buffer.slice(buffer.position(), len);
ret.accept(code, new VarintCodedSequence(data));
// Reset the buffer position to the end of the span // Reset the buffer position to the end of the span
buffer.position(buffer.position() + len); buffer.position(buffer.position() + len);

View File

@ -1,6 +1,6 @@
package nu.marginalia.index.journal; package nu.marginalia.index.journal;
import nu.marginalia.sequence.slop.GammaCodedSequenceArrayColumn; import nu.marginalia.sequence.slop.VarintCodedSequenceArrayColumn;
import nu.marginalia.slop.SlopTable; import nu.marginalia.slop.SlopTable;
import nu.marginalia.slop.column.array.ByteArrayColumn; import nu.marginalia.slop.column.array.ByteArrayColumn;
import nu.marginalia.slop.column.array.LongArrayColumn; import nu.marginalia.slop.column.array.LongArrayColumn;
@ -19,10 +19,10 @@ public record IndexJournalPage(Path baseDir, int page) {
public static LongArrayColumn termIds = new LongArrayColumn("termIds", StorageType.ZSTD); public static LongArrayColumn termIds = new LongArrayColumn("termIds", StorageType.ZSTD);
public static ByteArrayColumn termMeta = new ByteArrayColumn("termMetadata", StorageType.ZSTD); public static ByteArrayColumn termMeta = new ByteArrayColumn("termMetadata", StorageType.ZSTD);
public static GammaCodedSequenceArrayColumn positions = new GammaCodedSequenceArrayColumn("termPositions", StorageType.ZSTD); public static VarintCodedSequenceArrayColumn positions = new VarintCodedSequenceArrayColumn("termPositions", StorageType.ZSTD);
public static ByteArrayColumn spanCodes = new ByteArrayColumn("spanCodes", StorageType.ZSTD); public static ByteArrayColumn spanCodes = new ByteArrayColumn("spanCodes", StorageType.ZSTD);
public static GammaCodedSequenceArrayColumn spans = new GammaCodedSequenceArrayColumn("spans", StorageType.ZSTD); public static VarintCodedSequenceArrayColumn spans = new VarintCodedSequenceArrayColumn("spans", StorageType.ZSTD);
public IndexJournalPage { public IndexJournalPage {
if (!baseDir.toFile().isDirectory()) { if (!baseDir.toFile().isDirectory()) {
@ -55,11 +55,11 @@ public record IndexJournalPage(Path baseDir, int page) {
return termMeta.open(table); return termMeta.open(table);
} }
public GammaCodedSequenceArrayColumn.Reader openTermPositions(SlopTable table) throws IOException { public VarintCodedSequenceArrayColumn.Reader openTermPositions(SlopTable table) throws IOException {
return positions.open(table); return positions.open(table);
} }
public GammaCodedSequenceArrayColumn.Reader openSpans(SlopTable table) throws IOException { public VarintCodedSequenceArrayColumn.Reader openSpans(SlopTable table) throws IOException {
return spans.open(table); return spans.open(table);
} }

View File

@ -3,7 +3,7 @@ package nu.marginalia.index.journal;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.hash.MurmurHash3_128; import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.model.processed.SlopDocumentRecord; import nu.marginalia.model.processed.SlopDocumentRecord;
import nu.marginalia.sequence.slop.GammaCodedSequenceArrayColumn; import nu.marginalia.sequence.slop.VarintCodedSequenceArrayColumn;
import nu.marginalia.slop.SlopTable; import nu.marginalia.slop.SlopTable;
import nu.marginalia.slop.column.array.ByteArrayColumn; import nu.marginalia.slop.column.array.ByteArrayColumn;
import nu.marginalia.slop.column.array.LongArrayColumn; import nu.marginalia.slop.column.array.LongArrayColumn;
@ -24,9 +24,9 @@ public class IndexJournalSlopWriter extends SlopTable {
private final LongArrayColumn.Writer termIdsWriter; private final LongArrayColumn.Writer termIdsWriter;
private final ByteArrayColumn.Writer termMetadataWriter; private final ByteArrayColumn.Writer termMetadataWriter;
private final GammaCodedSequenceArrayColumn.Writer termPositionsWriter; private final VarintCodedSequenceArrayColumn.Writer termPositionsWriter;
private final GammaCodedSequenceArrayColumn.Writer spansWriter; private final VarintCodedSequenceArrayColumn.Writer spansWriter;
private final ByteArrayColumn.Writer spanCodesWriter; private final ByteArrayColumn.Writer spanCodesWriter;
private static final MurmurHash3_128 hash = new MurmurHash3_128(); private static final MurmurHash3_128 hash = new MurmurHash3_128();

View File

@ -1,7 +1,7 @@
package nu.marginalia.index.positions; package nu.marginalia.index.positions;
import nu.marginalia.sequence.CodedSequence; import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.sequence.VarintCodedSequence;
import java.nio.ByteBuffer; import java.nio.ByteBuffer;
@ -17,6 +17,6 @@ public class TermData {
} }
public CodedSequence positions() { public CodedSequence positions() {
return new GammaCodedSequence(buffer, 1, buffer.capacity()); return new VarintCodedSequence(buffer, 1, buffer.capacity());
} }
} }

View File

@ -3,11 +3,10 @@ package nu.marginalia.index.construction.full;
import nu.marginalia.index.journal.IndexJournalPage; import nu.marginalia.index.journal.IndexJournalPage;
import nu.marginalia.index.journal.IndexJournalSlopWriter; import nu.marginalia.index.journal.IndexJournalSlopWriter;
import nu.marginalia.model.processed.SlopDocumentRecord; import nu.marginalia.model.processed.SlopDocumentRecord;
import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.sequence.VarintCodedSequence;
import nu.marginalia.test.TestUtil; import nu.marginalia.test.TestUtil;
import java.io.IOException; import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.Arrays; import java.util.Arrays;
@ -46,14 +45,14 @@ public class TestJournalFactory {
'}'; '}';
} }
} }
public record WordWithMeta(String wordId, byte meta, GammaCodedSequence gcs) { public record WordWithMeta(String wordId, byte meta, VarintCodedSequence gcs) {
public WordWithMeta(long wordId, byte meta, GammaCodedSequence gcs) { public WordWithMeta(long wordId, byte meta, VarintCodedSequence gcs) {
this(String.valueOf(wordId), meta, gcs); this(String.valueOf(wordId), meta, gcs);
} }
} }
public static WordWithMeta wm(long wordId, int meta, int... positions) { public static WordWithMeta wm(long wordId, int meta, int... positions) {
return new WordWithMeta(wordId, (byte) meta, GammaCodedSequence.generate(ByteBuffer.allocate(128), positions)); return new WordWithMeta(wordId, (byte) meta, VarintCodedSequence.generate(positions));
} }
public IndexJournalPage createReader(EntryData... entries) throws IOException { public IndexJournalPage createReader(EntryData... entries) throws IOException {
@ -64,11 +63,11 @@ public class TestJournalFactory {
String[] termIds = new String[entry.wordIds.length]; String[] termIds = new String[entry.wordIds.length];
byte[] meta = new byte[entry.wordIds.length]; byte[] meta = new byte[entry.wordIds.length];
GammaCodedSequence[] positions = new GammaCodedSequence[entry.wordIds.length]; VarintCodedSequence[] positions = new VarintCodedSequence[entry.wordIds.length];
for (int i = 0; i < entry.wordIds.length; i++) { for (int i = 0; i < entry.wordIds.length; i++) {
termIds[i] = entry.wordIds[i]; termIds[i] = entry.wordIds[i];
meta[i] = 0; meta[i] = 0;
positions[i] = new GammaCodedSequence(new byte[1]); positions[i] = VarintCodedSequence.generate();
} }
writer.put( writer.put(
@ -100,11 +99,11 @@ public class TestJournalFactory {
String[] termIds = new String[entry.wordIds.length]; String[] termIds = new String[entry.wordIds.length];
byte[] meta = new byte[entry.wordIds.length]; byte[] meta = new byte[entry.wordIds.length];
GammaCodedSequence[] positions = new GammaCodedSequence[entry.wordIds.length]; VarintCodedSequence[] positions = new VarintCodedSequence[entry.wordIds.length];
for (int i = 0; i < entry.wordIds.length; i++) { for (int i = 0; i < entry.wordIds.length; i++) {
termIds[i] = entry.wordIds[i].wordId; termIds[i] = entry.wordIds[i].wordId;
meta[i] = entry.wordIds[i].meta; meta[i] = entry.wordIds[i].meta;
positions[i] = Objects.requireNonNullElseGet(entry.wordIds[i].gcs, () -> new GammaCodedSequence(new byte[1])); positions[i] = Objects.requireNonNullElseGet(entry.wordIds[i].gcs, VarintCodedSequence::generate);
} }
writer.put( writer.put(

View File

@ -28,7 +28,7 @@ import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.processed.SlopDocumentRecord; import nu.marginalia.model.processed.SlopDocumentRecord;
import nu.marginalia.process.control.FakeProcessHeartbeat; import nu.marginalia.process.control.FakeProcessHeartbeat;
import nu.marginalia.process.control.ProcessHeartbeat; import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.sequence.VarintCodedSequence;
import nu.marginalia.service.server.Initialization; import nu.marginalia.service.server.Initialization;
import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.FileStorageService;
import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.AfterEach;
@ -39,7 +39,6 @@ import org.junit.jupiter.api.parallel.Execution;
import java.io.IOException; import java.io.IOException;
import java.lang.foreign.Arena; import java.lang.foreign.Arena;
import java.net.URISyntaxException; import java.net.URISyntaxException;
import java.nio.ByteBuffer;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.sql.SQLException; import java.sql.SQLException;
@ -321,7 +320,7 @@ public class CombinedIndexReaderTest {
for (int i = 0; i < words.size(); i++) { for (int i = 0; i < words.size(); i++) {
metadata[i] = words.get(i).termMetadata; metadata[i] = words.get(i).termMetadata;
} }
var positions = words.stream().map(w -> w.positions).map(pos -> GammaCodedSequence.generate(ByteBuffer.allocate(1024), pos.toIntArray())).toList(); var positions = words.stream().map(w -> w.positions).map(pos -> VarintCodedSequence.generate(pos.toIntArray())).toList();
indexJournalWriter.put(doc, indexJournalWriter.put(doc,
new SlopDocumentRecord.KeywordsProjection( new SlopDocumentRecord.KeywordsProjection(

View File

@ -31,7 +31,7 @@ import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.processed.SlopDocumentRecord; import nu.marginalia.model.processed.SlopDocumentRecord;
import nu.marginalia.process.control.FakeProcessHeartbeat; import nu.marginalia.process.control.FakeProcessHeartbeat;
import nu.marginalia.process.control.ProcessHeartbeat; import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.sequence.VarintCodedSequence;
import nu.marginalia.service.control.ServiceHeartbeat; import nu.marginalia.service.control.ServiceHeartbeat;
import nu.marginalia.service.server.Initialization; import nu.marginalia.service.server.Initialization;
import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.FileStorageService;
@ -377,11 +377,11 @@ public class IndexQueryServiceIntegrationSmokeTest {
metadata[i] = WordFlags.Title.asBit(); metadata[i] = WordFlags.Title.asBit();
} }
List<GammaCodedSequence> positions = new ArrayList<>(); List<VarintCodedSequence> positions = new ArrayList<>();
ByteBuffer wa = ByteBuffer.allocate(32); ByteBuffer wa = ByteBuffer.allocate(32);
for (int i = 0; i < factors.length; i++) { for (int i = 0; i < factors.length; i++) {
positions.add(GammaCodedSequence.generate(wa, factors)); positions.add(VarintCodedSequence.generate(factors));
} }
indexJournalWriter.put(fullId, indexJournalWriter.put(fullId,
@ -417,11 +417,11 @@ public class IndexQueryServiceIntegrationSmokeTest {
metadata[i] = WordFlags.Title.asBit(); metadata[i] = WordFlags.Title.asBit();
} }
List<GammaCodedSequence> positions = new ArrayList<>(); List<VarintCodedSequence> positions = new ArrayList<>();
ByteBuffer wa = ByteBuffer.allocate(32); ByteBuffer wa = ByteBuffer.allocate(32);
for (int i = 0; i < factors.length; i++) { for (int i = 0; i < factors.length; i++) {
positions.add(GammaCodedSequence.generate(wa, i + 1)); positions.add(VarintCodedSequence.generate(i + 1));
} }
indexJournalWriter.put(fullId, indexJournalWriter.put(fullId,

View File

@ -33,7 +33,7 @@ import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.processed.SlopDocumentRecord; import nu.marginalia.model.processed.SlopDocumentRecord;
import nu.marginalia.process.control.FakeProcessHeartbeat; import nu.marginalia.process.control.FakeProcessHeartbeat;
import nu.marginalia.process.control.ProcessHeartbeat; import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.sequence.VarintCodedSequence;
import nu.marginalia.service.control.ServiceHeartbeat; import nu.marginalia.service.control.ServiceHeartbeat;
import nu.marginalia.service.server.Initialization; import nu.marginalia.service.server.Initialization;
import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.FileStorageService;
@ -46,7 +46,6 @@ import org.junit.jupiter.api.parallel.Execution;
import javax.annotation.CheckReturnValue; import javax.annotation.CheckReturnValue;
import java.io.IOException; import java.io.IOException;
import java.net.URISyntaxException; import java.net.URISyntaxException;
import java.nio.ByteBuffer;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.sql.SQLException; import java.sql.SQLException;
@ -544,10 +543,9 @@ public class IndexQueryServiceIntegrationTest {
metadata[i] = (byte) words.get(i).termMetadata; metadata[i] = (byte) words.get(i).termMetadata;
} }
List<GammaCodedSequence> positions = new ArrayList<>(); List<VarintCodedSequence> positions = new ArrayList<>();
ByteBuffer workBuffer = ByteBuffer.allocate(8192);
for (int i = 0; i < words.size(); i++) { for (int i = 0; i < words.size(); i++) {
positions.add(GammaCodedSequence.generate(workBuffer, words.get(i).positions)); positions.add(VarintCodedSequence.generate(words.get(i).positions));
} }
indexJournalWriter.put(doc, indexJournalWriter.put(doc,

View File

@ -20,6 +20,13 @@ public class VarintCodedSequence implements CodedSequence {
this.startLimit = buffer.limit(); this.startLimit = buffer.limit();
} }
/** Creates a sequence over an explicit window of the given buffer.
 * <p></p>
 * The buffer is stored as-is (it is not copied or sliced here); the
 * supplied bounds are recorded as the start position and start limit
 * of the sequence.
 *
 * @param buffer     the buffer holding the encoded data
 * @param startPos   the value recorded as the sequence's start position
 * @param startLimit the value recorded as the sequence's start limit
 */
public VarintCodedSequence(ByteBuffer buffer, int startPos, int startLimit) {
    this.startPos = startPos;
    this.startLimit = startLimit;
    this.raw = buffer;
}
private static int requiredBufferSize(int[] values) { private static int requiredBufferSize(int[] values) {
int prev = 0; int prev = 0;
int size = 0; int size = 0;
@ -32,11 +39,47 @@ public class VarintCodedSequence implements CodedSequence {
return size + varintSize(size + 1); return size + varintSize(size + 1);
} }
/** Calculates how many bytes to allocate for encoding the given values.
 * <p></p>
 * The result is the sum of the varint sizes of the successive differences
 * between the values (the first difference is taken against zero), plus
 * the varint size of that sum incremented by one, which reserves room for
 * the header.
 *
 * @param values the values that are to be encoded
 * @return the number of bytes to allocate
 */
private static int requiredBufferSize(IntList values) {
    int payloadBytes = 0;
    int previous = 0;

    int index = 0;
    while (index < values.size()) {
        final int current = values.getInt(index++);
        payloadBytes += varintSize(current - previous);
        previous = current;
    }

    final int headerBytes = varintSize(payloadBytes + 1);
    return payloadBytes + headerBytes;
}
private static int varintSize(int value) { private static int varintSize(int value) {
int bits = 32 - Integer.numberOfLeadingZeros(value); int bits = 32 - Integer.numberOfLeadingZeros(value);
return (bits + 6) / 7; return (bits + 6) / 7;
} }
/** Encodes the given values into a new VarintCodedSequence.
 * <p></p>
 * The output starts with the number of values plus one, followed by the
 * difference between each value and its predecessor (the first value is
 * compared against zero).
 *
 * @param values the values to encode; when assertions are enabled, each
 *               difference is checked to be greater than zero
 * @return a sequence backed by a freshly allocated heap buffer
 */
public static VarintCodedSequence generate(IntList values) {
    final ByteBuffer out = ByteBuffer.allocate(requiredBufferSize(values));
    final int count = values.size();

    // header: element count, offset by one
    encodeValue(out, count + 1);

    int last = 0;
    for (int idx = 0; idx < count; idx++) {
        final int current = values.getInt(idx);
        final int delta = current - last;

        assert delta > 0 : "Values must be strictly increasing";

        encodeValue(out, delta);
        last = current;
    }

    // switch the buffer from writing to reading before handing it over
    out.flip();
    return new VarintCodedSequence(out);
}
public static VarintCodedSequence generate(int... values) { public static VarintCodedSequence generate(int... values) {
int bufferSize = requiredBufferSize(values); int bufferSize = requiredBufferSize(values);
ByteBuffer buffer = ByteBuffer.allocate(bufferSize); ByteBuffer buffer = ByteBuffer.allocate(bufferSize);
@ -60,20 +103,23 @@ public class VarintCodedSequence implements CodedSequence {
} }
private static void encodeValue(ByteBuffer buffer, int value) { private static void encodeValue(ByteBuffer buffer, int value) {
if (value < 0x80) { if (value < (1<<7)) {
buffer.put((byte) value); buffer.put((byte) value);
} }
else if (value < 0x4_000) { else if (value < (1<<14)) {
buffer.put((byte) (value >>> (7) | 0x80)); buffer.put((byte) (value >>> (7) | 0x80));
buffer.put((byte) (value & 0x7F)); buffer.put((byte) (value & 0x7F));
} }
else if (value < 0x20_0000) { else if (value < (1<<21)) {
buffer.put((byte) (value >>> (14) | 0x80)); buffer.put((byte) (value >>> (14) | 0x80));
buffer.put((byte) (value >>> (7) | 0x80)); buffer.put((byte) (value >>> (7) | 0x80));
buffer.put((byte) (value & 0x7F)); buffer.put((byte) (value & 0x7F));
} }
else if (value < 0x1000_0000) { else if (value < (1<<28)) {
buffer.putInt(Integer.expand(value, 0x00808080) | 0x80808000); buffer.put((byte) ((value >>> 21) | 0x80));
buffer.put((byte) ((value >>> 14) | 0x80));
buffer.put((byte) ((value >>> 7) | 0x80));
buffer.put((byte) (value & 0x7F));
} }
else { else {
throw new IllegalArgumentException("Value too large to encode"); throw new IllegalArgumentException("Value too large to encode");
@ -139,12 +185,13 @@ public class VarintCodedSequence implements CodedSequence {
return b; return b;
} }
int value = b; int value = b & 0x7F;
do { do {
b = buffer.get(); b = buffer.get();
value = value << 7 | (b & 0x7F); value = (value << 7) | (b & 0x7F);
} while ((b & 0x80) != 0); } while ((b & 0x80) != 0);
return value; return value;
} }

View File

@ -49,20 +49,22 @@ public class BitReader {
/** Read the next width bits from the buffer */ /** Read the next width bits from the buffer */
public int get(int width) { public int get(int width) {
if (width == 0) { // Fast path for reading a full integer from the current value
return 0; if (bitPosition >= width) {
// We have enough bits in the current value to satisfy the request
int result = (int)(currentValue >>> (bitPosition - width)) & ~-(1<<width);
// Update the bit position
bitPosition -= width;
return result;
} }
assert width <= 32;
if (bitPosition <= 0) { if (bitPosition <= 0) {
readNext(); readNext();
} }
int result = 0; int result = 0;
do {
while (width > 0) {
int dw = bitPosition - width; int dw = bitPosition - width;
if (dw >= 0) { // We have enough bits in the current value to satisfy the request if (dw >= 0) { // We have enough bits in the current value to satisfy the request
result |= ((int)(currentValue >>> dw)) & ~-(1<<width); result |= ((int)(currentValue >>> dw)) & ~-(1<<width);
@ -85,6 +87,7 @@ public class BitReader {
readNext(); // implicitly: bitPosition = 0 here readNext(); // implicitly: bitPosition = 0 here
} }
} }
while (width > 0);
return result; return result;
} }

View File

@ -0,0 +1,154 @@
package nu.marginalia.sequence.slop;
import nu.marginalia.sequence.VarintCodedSequence;
import nu.marginalia.slop.column.AbstractColumn;
import nu.marginalia.slop.column.AbstractObjectColumn;
import nu.marginalia.slop.column.ObjectColumnReader;
import nu.marginalia.slop.column.ObjectColumnWriter;
import nu.marginalia.slop.column.dynamic.VarintColumn;
import nu.marginalia.slop.desc.ColumnFunction;
import nu.marginalia.slop.desc.StorageType;
import java.io.IOException;
import java.net.URI;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
/** Slop column extension for storing lists of {@link VarintCodedSequence} objects.
 * <p></p>
 * Each record is a list of sequences.  The number of sequences in each record
 * is stored in a varint "group length" column, and the sequences themselves
 * are stored back-to-back in a {@link VarintCodedSequenceColumn}.
 */
public class VarintCodedSequenceArrayColumn extends AbstractObjectColumn<List<VarintCodedSequence>, VarintCodedSequenceArrayColumn.Reader, VarintCodedSequenceArrayColumn.Writer> {
    /** Holds the number of sequences in each record */
    private final VarintColumn groupsColumn;
    /** Holds the individual sequences of all records, in order */
    private final VarintCodedSequenceColumn dataColumn;

    /** Creates the column with {@link StorageType#PLAIN} storage. */
    public VarintCodedSequenceArrayColumn(String name) {
        this(name, StorageType.PLAIN);
    }

    /** Creates the column with the given name and storage type.
     *
     * @param name        the column name, shared by the sub-columns
     * @param storageType the storage type of this column and of the group length column
     */
    public VarintCodedSequenceArrayColumn(String name, StorageType storageType) {
        super(name,
                "vcs[]",
                ByteOrder.nativeOrder(),
                ColumnFunction.DATA,
                storageType);

        groupsColumn = new VarintColumn(name, ColumnFunction.GROUP_LENGTH, storageType);

        // NOTE(review): the data column is created through the single-argument
        // constructor and therefore uses PLAIN storage regardless of storageType.
        // Left as-is since changing it alters the on-disk layout -- confirm intent.
        dataColumn = new VarintCodedSequenceColumn(name);
    }

    /** Creates a writer for the given page, backed by writers for both sub-columns. */
    public Writer createUnregistered(Path path, int page) throws IOException {
        return new Writer(
                dataColumn.createUnregistered(path, page),
                groupsColumn.createUnregistered(path, page)
        );
    }

    /** Opens a reader for the given page, backed by readers for both sub-columns. */
    public Reader openUnregistered(URI uri, int page) throws IOException {
        return new Reader(
                dataColumn.openUnregistered(uri, page),
                groupsColumn.openUnregistered(uri, page)
        );
    }

    /** Writes records consisting of lists of sequences. */
    public class Writer implements ObjectColumnWriter<List<VarintCodedSequence>> {
        private final VarintColumn.Writer groupsWriter;
        private final VarintCodedSequenceColumn.Writer dataWriter;

        Writer(VarintCodedSequenceColumn.Writer dataWriter, VarintColumn.Writer groupsWriter)
        {
            this.groupsWriter = groupsWriter;
            this.dataWriter = dataWriter;
        }

        @Override
        public AbstractColumn<?, ?> columnDesc() {
            return VarintCodedSequenceArrayColumn.this;
        }

        /** Writes one record: the list size goes to the group length column,
         * then each sequence is appended to the data column in list order. */
        @Override
        public void put(List<VarintCodedSequence> sequences) throws IOException {
            groupsWriter.put(sequences.size());
            for (VarintCodedSequence sequence : sequences) {
                dataWriter.put(sequence);
            }
        }

        /** Returns the position reported by the group length writer. */
        public long position() {
            return groupsWriter.position();
        }

        /** Closes both underlying writers; the group length writer is closed
         * even if closing the data writer fails. */
        public void close() throws IOException {
            try {
                dataWriter.close();
            }
            finally {
                groupsWriter.close();
            }
        }
    }

    /** Reads records consisting of lists of sequences. */
    public class Reader implements ObjectColumnReader<List<VarintCodedSequence>> {
        private final VarintCodedSequenceColumn.Reader dataReader;
        private final VarintColumn.Reader groupsReader;

        public Reader(VarintCodedSequenceColumn.Reader dataReader, VarintColumn.Reader groupsReader) {
            this.dataReader = dataReader;
            this.groupsReader = groupsReader;
        }

        @Override
        public AbstractColumn<?, ?> columnDesc() {
            return VarintCodedSequenceArrayColumn.this;
        }

        /** Skips the given number of records.
         * <p></p>
         * The group lengths of the skipped records are summed up, and that many
         * sequences are then skipped in the data column.
         */
        @Override
        public void skip(long positions) throws IOException {
            // Accumulate in a long (and count with a long) so that neither the
            // loop counter nor the running total can wrap around
            long toSkip = 0;
            for (long i = 0; i < positions; i++) {
                toSkip += groupsReader.get();
            }

            dataReader.skip(toSkip);
        }

        @Override
        public boolean hasRemaining() throws IOException {
            return groupsReader.hasRemaining();
        }

        /** Returns the position reported by the group length reader. */
        public long position() throws IOException {
            return groupsReader.position();
        }

        /** Reads the next record as a list of decoded sequence objects. */
        @Override
        public List<VarintCodedSequence> get() throws IOException {
            int count = groupsReader.get();
            var ret = new ArrayList<VarintCodedSequence>(count);

            for (int i = 0; i < count; i++) {
                ret.add(dataReader.get());
            }

            return ret;
        }

        /** Reads the next record as raw data into the provided work area.
         * <p></p>
         * Each returned buffer is a slice of workArea covering the region between
         * the work area's position before and after the corresponding read, so the
         * slices share storage with workArea and remain valid only as long as that
         * region is not overwritten.
         *
         * @param workArea destination buffer; presumably needs enough remaining
         *                 space for every sequence in the record -- verify against callers
         * @return one slice per sequence in the record, in stored order
         */
        public List<ByteBuffer> getData(ByteBuffer workArea) throws IOException {
            int count = groupsReader.get();
            var ret = new ArrayList<ByteBuffer>(count);

            for (int i = 0; i < count; i++) {
                int start = workArea.position();
                dataReader.getData(workArea);
                var slice = workArea.slice(start, workArea.position() - start);
                ret.add(slice);
            }

            return ret;
        }

        /** Closes both underlying readers; the group length reader is closed
         * even if closing the data reader fails. */
        public void close() throws IOException {
            try {
                dataReader.close();
            }
            finally {
                groupsReader.close();
            }
        }
    }
}

View File

@ -0,0 +1,148 @@
package nu.marginalia.sequence.slop;
import nu.marginalia.sequence.VarintCodedSequence;
import nu.marginalia.slop.column.AbstractColumn;
import nu.marginalia.slop.column.AbstractObjectColumn;
import nu.marginalia.slop.column.ObjectColumnReader;
import nu.marginalia.slop.column.ObjectColumnWriter;
import nu.marginalia.slop.column.dynamic.VarintColumn;
import nu.marginalia.slop.desc.ColumnFunction;
import nu.marginalia.slop.desc.StorageType;
import nu.marginalia.slop.storage.Storage;
import nu.marginalia.slop.storage.StorageReader;
import nu.marginalia.slop.storage.StorageWriter;
import java.io.IOException;
import java.net.URI;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.file.Path;
/** Slop column extension for storing {@link VarintCodedSequence} objects.
 * <p></p>
 * The encoded bytes of each sequence are written to this column's storage,
 * and the byte length of each sequence is written to a separate varint
 * "data length" column, which is what allows individual sequences to be
 * located and skipped.
 */
public class VarintCodedSequenceColumn extends AbstractObjectColumn<VarintCodedSequence, VarintCodedSequenceColumn.Reader, VarintCodedSequenceColumn.Writer> {
    /** Holds the byte length of each stored sequence */
    private final VarintColumn indexColumn;

    /** Creates the column with {@link StorageType#PLAIN} storage. */
    public VarintCodedSequenceColumn(String name) {
        this(name, StorageType.PLAIN);
    }

    /** Creates the column with the given name and storage type.
     *
     * @param name        the column name, shared by the length column
     * @param storageType the storage type of the sequence data
     */
    public VarintCodedSequenceColumn(String name, StorageType storageType) {
        super(name,
                "vcs",
                ByteOrder.nativeOrder(),
                ColumnFunction.DATA,
                storageType);

        // NOTE(review): the length column is always PLAIN, independent of
        // storageType -- looks deliberate, but confirm.
        indexColumn = new VarintColumn(name, ColumnFunction.DATA_LEN, StorageType.PLAIN);
    }

    /** Creates a writer for the given page, together with a writer for the length column. */
    public Writer createUnregistered(Path path, int page) throws IOException {
        return new Writer(
                Storage.writer(path, this, page),
                indexColumn.createUnregistered(path, page)
        );
    }

    /** Opens a reader for the given page, together with a reader for the length column. */
    public Reader openUnregistered(URI uri, int page) throws IOException {
        return new Reader(
                Storage.reader(uri, this, page, false),
                indexColumn.openUnregistered(uri, page)
        );
    }

    /** Writes individual sequences. */
    public class Writer implements ObjectColumnWriter<VarintCodedSequence> {
        private final VarintColumn.Writer indexWriter;
        private final StorageWriter storage;

        public Writer(StorageWriter storage,
                      VarintColumn.Writer indexWriter)
        {
            this.storage = storage;
            this.indexWriter = indexWriter;
        }

        @Override
        public AbstractColumn<?, ?> columnDesc() {
            return VarintCodedSequenceColumn.this;
        }

        /** Writes one sequence: the number of remaining bytes in the sequence's
         * buffer goes to the length column, then the bytes themselves are stored. */
        @Override
        public void put(VarintCodedSequence sequence) throws IOException {
            var buffer = sequence.buffer();
            int length = buffer.remaining();

            indexWriter.put(length);
            storage.putBytes(buffer);
        }

        /** Returns the position reported by the length writer. */
        public long position() {
            return indexWriter.position();
        }

        /** Closes both underlying writers; the storage is closed even if
         * closing the length writer fails. */
        public void close() throws IOException {
            try {
                indexWriter.close();
            }
            finally {
                storage.close();
            }
        }
    }

    /** Reads individual sequences. */
    public class Reader implements ObjectColumnReader<VarintCodedSequence> {
        private final VarintColumn.Reader indexReader;
        private final StorageReader storage;

        Reader(StorageReader reader, VarintColumn.Reader indexReader) throws IOException {
            this.storage = reader;
            this.indexReader = indexReader;
        }

        @Override
        public AbstractColumn<?, ?> columnDesc() {
            return VarintCodedSequenceColumn.this;
        }

        /** Skips the given number of sequences by reading each one's length
         * from the length column and skipping that many bytes in the storage. */
        @Override
        public void skip(long positions) throws IOException {
            // long counter: an int counter could never reach a bound above Integer.MAX_VALUE
            for (long i = 0; i < positions; i++) {
                int size = indexReader.get();
                storage.skip(size, 1);
            }
        }

        @Override
        public boolean hasRemaining() throws IOException {
            return indexReader.hasRemaining();
        }

        /** Returns the position reported by the length reader. */
        public long position() throws IOException {
            return indexReader.position();
        }

        /** Reads the next sequence into a newly allocated heap buffer of exactly
         * the stored length, and wraps it in a VarintCodedSequence. */
        @Override
        public VarintCodedSequence get() throws IOException {
            int size = indexReader.get();

            ByteBuffer dest = ByteBuffer.allocate(size);
            storage.getBytes(dest);
            dest.flip();

            return new VarintCodedSequence(dest);
        }

        /** Reads the raw bytes of the next sequence into workArea.
         * <p></p>
         * The work area's limit is temporarily narrowed to its current position
         * plus the stored length while reading, and restored afterwards, also
         * when the read fails.
         *
         * @param workArea destination buffer; must have capacity for the
         *                 sequence beyond its current position, since setting
         *                 a limit past the capacity is rejected by ByteBuffer
         */
        public void getData(ByteBuffer workArea) throws IOException {
            int size = indexReader.get();

            int oldLimit = workArea.limit();
            workArea.limit(workArea.position() + size);
            try {
                storage.getBytes(workArea);
            }
            finally {
                // never leave the caller's buffer with the narrowed limit
                workArea.limit(oldLimit);
            }
        }

        /** Closes both underlying readers; the storage is closed even if
         * closing the length reader fails. */
        public void close() throws IOException {
            try {
                indexReader.close();
            }
            finally {
                storage.close();
            }
        }
    }
}

View File

@ -25,51 +25,51 @@ public class SequenceBenchmarks {
workArea = ByteBuffer.allocate(65536); workArea = ByteBuffer.allocate(65536);
arrayValues = new int[] { 1, 3, 5, 16, 1024, 2048, 4096, 4098, 4100 }; arrayValues = new int[] { 1, 3, 5, 16, 1024, 2048, 4096, 4098, 4100 };
list = new IntArrayList(arrayValues); list = new IntArrayList(arrayValues);
vcs = VarintCodedSequence.generate(1, 3, 5, 16, 1024, 2048); vcs = VarintCodedSequence.generate(16,21,24,28,66,71,76,83,87,98,101,106,113,115,119,122,143,148,159,164,167,177,182,211,223,242,245,250,273,275,280,289,292,300,307,322,330,338,345,371,397,402,411,420,427,430,433,437,440,448,451,481,490,513,522,555,571,573,585,597,606,613,634,638,640,644,656,660,666,683,689,692,696,709,712,718,727,731,735,738);
gcs = GammaCodedSequence.generate(workArea, 1, 3, 5, 16, 1024, 2048); gcs = GammaCodedSequence.generate(workArea, 16,21,24,28,66,71,76,83,87,98,101,106,113,115,119,122,143,148,159,164,167,177,182,211,223,242,245,250,273,275,280,289,292,300,307,322,330,338,345,371,397,402,411,420,427,430,433,437,440,448,451,481,490,513,522,555,571,573,585,597,606,613,634,638,640,644,656,660,666,683,689,692,696,709,712,718,727,731,735,738);
} }
} }
/** Benchmark: walks the varint-coded sequence through its iterator, summing every decoded value. */
@Fork(value = 1, warmups = 1)
@Warmup(iterations = 1)
@Benchmark
@BenchmarkMode(Mode.Throughput)
public int vcsDecode(SequenceState state) {
    int total = 0;

    // Returning the sum gives the decoded values a use
    // (presumably to keep the decode loop from being optimized away -- confirm)
    for (var values = state.vcs.iterator(); values.hasNext(); ) {
        total += values.nextInt();
    }

    return total;
}
//
// @Fork(value = 5, warmups = 5) // @Fork(value = 5, warmups = 5)
// @Warmup(iterations = 5) // @Warmup(iterations = 5)
// @Benchmark // @Benchmark
// @BenchmarkMode(Mode.Throughput) // @BenchmarkMode(Mode.Throughput)
// public int vcsDecode(SequenceState state) { // public int listDecode2(SequenceState state) {
// var iter = state.vcs.iterator(); // var list = state.arrayValues;
// int sum = 0; // int sum = 0;
// while (iter.hasNext()) { // for (int i = 0; i < list.length; i++) {
// sum += iter.nextInt(); // sum += list[i];
// } // }
// return sum; // return sum;
// } // }
@Fork(value = 5, warmups = 5)
@Warmup(iterations = 5) @Fork(value = 1, warmups = 1)
@Warmup(iterations = 1)
@Benchmark @Benchmark
@BenchmarkMode(Mode.Throughput) @BenchmarkMode(Mode.Throughput)
public int listDecode2(SequenceState state) { public int gcsDecode(SequenceState state) {
var list = state.arrayValues; var iter = state.gcs.iterator();
int sum = 0; int sum = 0;
for (int i = 0; i < list.length; i++) { while (iter.hasNext()) {
sum += list[i]; sum += iter.nextInt();
} }
return sum; return sum;
} }
// @Fork(value = 1, warmups = 1)
// @Warmup(iterations = 1)
// @Benchmark
// @BenchmarkMode(Mode.Throughput)
// public int gcsDecode(SequenceState state) {
// var iter = state.gcs.iterator();
// int sum = 0;
// while (iter.hasNext()) {
// sum += iter.nextInt();
// }
// return sum;
// }
// @Fork(value = 1, warmups = 1) // @Fork(value = 1, warmups = 1)
// @Warmup(iterations = 1) // @Warmup(iterations = 1)
// @Benchmark // @Benchmark

View File

@ -63,6 +63,8 @@ class SequenceOperationsTest {
assertTrue(SequenceOperations.intersectSequences(seq1.iterator(), seq2.iterator(), seq3.iterator())); assertTrue(SequenceOperations.intersectSequences(seq1.iterator(), seq2.iterator(), seq3.iterator()));
} }
@Test @Test
void intersectSequencesDeepMatch3findIntersections() { void intersectSequencesDeepMatch3findIntersections() {
ByteBuffer wa = ByteBuffer.allocate(1024); ByteBuffer wa = ByteBuffer.allocate(1024);

View File

@ -1,7 +1,7 @@
package nu.marginalia.keyword.model; package nu.marginalia.keyword.model;
import nu.marginalia.model.idx.CodedWordSpan; import nu.marginalia.model.idx.CodedWordSpan;
import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.sequence.VarintCodedSequence;
import java.util.List; import java.util.List;
@ -9,12 +9,12 @@ public final class DocumentKeywords {
public final List<String> keywords; public final List<String> keywords;
public final byte[] metadata; public final byte[] metadata;
public final List<GammaCodedSequence> positions; public final List<VarintCodedSequence> positions;
public final List<CodedWordSpan> spans; public final List<CodedWordSpan> spans;
public DocumentKeywords(List<String> keywords, public DocumentKeywords(List<String> keywords,
byte[] metadata, byte[] metadata,
List<GammaCodedSequence> positions, List<VarintCodedSequence> positions,
List<CodedWordSpan> spans) List<CodedWordSpan> spans)
{ {
this.keywords = keywords; this.keywords = keywords;

View File

@ -8,7 +8,7 @@ import lombok.Getter;
import nu.marginalia.language.sentence.tag.HtmlTag; import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.model.idx.CodedWordSpan; import nu.marginalia.model.idx.CodedWordSpan;
import nu.marginalia.model.idx.WordFlags; import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.sequence.VarintCodedSequence;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -39,7 +39,7 @@ public class DocumentKeywordsBuilder {
public DocumentKeywords build(ByteBuffer workArea) { public DocumentKeywords build(ByteBuffer workArea) {
final List<String> wordArray = new ArrayList<>(wordToMeta.size()); final List<String> wordArray = new ArrayList<>(wordToMeta.size());
final TByteArrayList meta = new TByteArrayList(wordToMeta.size()); final TByteArrayList meta = new TByteArrayList(wordToMeta.size());
final List<GammaCodedSequence> positions = new ArrayList<>(wordToMeta.size()); final List<VarintCodedSequence> positions = new ArrayList<>(wordToMeta.size());
var iter = wordToMeta.object2ByteEntrySet().fastIterator(); var iter = wordToMeta.object2ByteEntrySet().fastIterator();
@ -49,13 +49,13 @@ public class DocumentKeywordsBuilder {
meta.add(entry.getByteValue()); meta.add(entry.getByteValue());
wordArray.add(entry.getKey()); wordArray.add(entry.getKey());
var posList = wordToPos.getOrDefault(entry.getKey(), IntList.of()); IntList posList = wordToPos.getOrDefault(entry.getKey(), IntList.of());
if (posList.size() > MAX_POSITIONS_PER_WORD) { if (posList.size() > MAX_POSITIONS_PER_WORD) {
posList.subList(MAX_POSITIONS_PER_WORD, posList.size()).clear(); posList.subList(MAX_POSITIONS_PER_WORD, posList.size()).clear();
} }
positions.add(GammaCodedSequence.generate(workArea, posList)); positions.add(VarintCodedSequence.generate(posList));
} }
// Encode spans // Encode spans
@ -70,7 +70,7 @@ public class DocumentKeywordsBuilder {
positionsForTag.add(span.end()); positionsForTag.add(span.end());
} }
spans.add(new CodedWordSpan((byte) tag.charValue(), GammaCodedSequence.generate(workArea, positionsForTag))); spans.add(new CodedWordSpan((byte) tag.charValue(), VarintCodedSequence.generate(positionsForTag)));
}); });
return new DocumentKeywords(wordArray, meta.toArray(), positions, spans); return new DocumentKeywords(wordArray, meta.toArray(), positions, spans);

View File

@ -12,7 +12,7 @@ import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.processed.SlopDocumentRecord; import nu.marginalia.model.processed.SlopDocumentRecord;
import nu.marginalia.model.processed.SlopDomainLinkRecord; import nu.marginalia.model.processed.SlopDomainLinkRecord;
import nu.marginalia.model.processed.SlopDomainRecord; import nu.marginalia.model.processed.SlopDomainRecord;
import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.sequence.VarintCodedSequence;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -96,7 +96,7 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter
var wb = document.words.build(workArea); var wb = document.words.build(workArea);
List<GammaCodedSequence> spanSequences = new ArrayList<>(wb.spans.size()); List<VarintCodedSequence> spanSequences = new ArrayList<>(wb.spans.size());
byte[] spanCodes = new byte[wb.spans.size()]; byte[] spanCodes = new byte[wb.spans.size()];
for (int i = 0; i < wb.spans.size(); i++) { for (int i = 0; i < wb.spans.size(); i++) {

View File

@ -1,8 +1,8 @@
package nu.marginalia.model.processed; package nu.marginalia.model.processed;
import lombok.Builder; import lombok.Builder;
import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.sequence.VarintCodedSequence;
import nu.marginalia.sequence.slop.GammaCodedSequenceArrayColumn; import nu.marginalia.sequence.slop.VarintCodedSequenceArrayColumn;
import nu.marginalia.slop.SlopTable; import nu.marginalia.slop.SlopTable;
import nu.marginalia.slop.column.array.ByteArrayColumn; import nu.marginalia.slop.column.array.ByteArrayColumn;
import nu.marginalia.slop.column.array.ObjectArrayColumn; import nu.marginalia.slop.column.array.ObjectArrayColumn;
@ -39,9 +39,9 @@ public record SlopDocumentRecord(
Integer pubYear, Integer pubYear,
List<String> words, List<String> words,
byte[] metas, byte[] metas,
List<GammaCodedSequence> positions, List<VarintCodedSequence> positions,
byte[] spanCodes, byte[] spanCodes,
List<GammaCodedSequence> spans List<VarintCodedSequence> spans
) { ) {
public SlopDocumentRecord { public SlopDocumentRecord {
@ -60,9 +60,9 @@ public record SlopDocumentRecord(
int length, int length,
List<String> words, List<String> words,
byte[] metas, byte[] metas,
List<GammaCodedSequence> positions, List<VarintCodedSequence> positions,
byte[] spanCodes, byte[] spanCodes,
List<GammaCodedSequence> spans) List<VarintCodedSequence> spans)
{ {
// Override the equals method since records don't generate default equals that deal with array fields properly // Override the equals method since records don't generate default equals that deal with array fields properly
@Override @Override
@ -127,12 +127,12 @@ public record SlopDocumentRecord(
private static final ObjectArrayColumn<String> keywordsColumn = new StringColumn("keywords", StorageType.ZSTD).asArray(); private static final ObjectArrayColumn<String> keywordsColumn = new StringColumn("keywords", StorageType.ZSTD).asArray();
private static final ByteArrayColumn termMetaColumn = new ByteArrayColumn("termMetadata", StorageType.ZSTD); private static final ByteArrayColumn termMetaColumn = new ByteArrayColumn("termMetadata", StorageType.ZSTD);
private static final GammaCodedSequenceArrayColumn termPositionsColumn = new GammaCodedSequenceArrayColumn("termPositions", StorageType.ZSTD); private static final VarintCodedSequenceArrayColumn termPositionsColumn = new VarintCodedSequenceArrayColumn("termPositions", StorageType.ZSTD);
// Spans columns // Spans columns
private static final ByteArrayColumn spanCodesColumn = new ByteArrayColumn("spanCodes", StorageType.ZSTD); private static final ByteArrayColumn spanCodesColumn = new ByteArrayColumn("spanCodes", StorageType.ZSTD);
private static final GammaCodedSequenceArrayColumn spansColumn = new GammaCodedSequenceArrayColumn("spans", StorageType.ZSTD); private static final VarintCodedSequenceArrayColumn spansColumn = new VarintCodedSequenceArrayColumn("spans", StorageType.ZSTD);
public static class KeywordsProjectionReader extends SlopTable { public static class KeywordsProjectionReader extends SlopTable {
private final TxtStringColumn.Reader domainsReader; private final TxtStringColumn.Reader domainsReader;
@ -143,10 +143,10 @@ public record SlopDocumentRecord(
private final ObjectArrayColumn<String>.Reader keywordsReader; private final ObjectArrayColumn<String>.Reader keywordsReader;
private final ByteArrayColumn.Reader termMetaReader; private final ByteArrayColumn.Reader termMetaReader;
private final GammaCodedSequenceArrayColumn.Reader termPositionsReader; private final VarintCodedSequenceArrayColumn.Reader termPositionsReader;
private final ByteArrayColumn.Reader spanCodesReader; private final ByteArrayColumn.Reader spanCodesReader;
private final GammaCodedSequenceArrayColumn.Reader spansReader; private final VarintCodedSequenceArrayColumn.Reader spansReader;
public KeywordsProjectionReader(SlopTable.Ref<SlopDocumentRecord> pageRef) throws IOException { public KeywordsProjectionReader(SlopTable.Ref<SlopDocumentRecord> pageRef) throws IOException {
super(pageRef); super(pageRef);
@ -177,10 +177,10 @@ public record SlopDocumentRecord(
int length = lengthsReader.get(); int length = lengthsReader.get();
List<String> words = keywordsReader.get(); List<String> words = keywordsReader.get();
List<GammaCodedSequence> positions = termPositionsReader.get(); List<VarintCodedSequence> positions = termPositionsReader.get();
byte[] metas = termMetaReader.get(); byte[] metas = termMetaReader.get();
byte[] spanCodes = spanCodesReader.get(); byte[] spanCodes = spanCodesReader.get();
List<GammaCodedSequence> spans = spansReader.get(); List<VarintCodedSequence> spans = spansReader.get();
return new KeywordsProjection( return new KeywordsProjection(
domain, domain,
@ -272,9 +272,9 @@ public record SlopDocumentRecord(
private final IntColumn.Writer pubYearWriter; private final IntColumn.Writer pubYearWriter;
private final ObjectArrayColumn<String>.Writer keywordsWriter; private final ObjectArrayColumn<String>.Writer keywordsWriter;
private final ByteArrayColumn.Writer termMetaWriter; private final ByteArrayColumn.Writer termMetaWriter;
private final GammaCodedSequenceArrayColumn.Writer termPositionsWriter; private final VarintCodedSequenceArrayColumn.Writer termPositionsWriter;
private final ByteArrayColumn.Writer spansCodesWriter; private final ByteArrayColumn.Writer spansCodesWriter;
private final GammaCodedSequenceArrayColumn.Writer spansWriter; private final VarintCodedSequenceArrayColumn.Writer spansWriter;
public Writer(Path baseDir, int page) throws IOException { public Writer(Path baseDir, int page) throws IOException {
super(baseDir, page); super(baseDir, page);

View File

@ -1,6 +1,6 @@
package nu.marginalia.model.processed; package nu.marginalia.model.processed;
import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.sequence.VarintCodedSequence;
import nu.marginalia.slop.SlopTable; import nu.marginalia.slop.SlopTable;
import nu.marginalia.test.TestUtil; import nu.marginalia.test.TestUtil;
import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.AfterEach;
@ -46,9 +46,9 @@ public class SlopDocumentRecordTest {
null, null,
List.of("test1", "test2"), List.of("test1", "test2"),
new byte[] { 2, 3}, new byte[] { 2, 3},
List.of(GammaCodedSequence.generate(workArea, 1, 3, 5), GammaCodedSequence.generate(workArea, 2, 4, 6)), List.of(VarintCodedSequence.generate(1, 3, 5), VarintCodedSequence.generate(2, 4, 6)),
new byte[] { 'a', 'b' }, new byte[] { 'a', 'b' },
List.of(GammaCodedSequence.generate(workArea, 2, 3, 5), GammaCodedSequence.generate(workArea, 3, 4, 6)) List.of(VarintCodedSequence.generate(2, 3, 5), VarintCodedSequence.generate(3, 4, 6))
); );
try (var writer = new SlopDocumentRecord.Writer(testDir, 0)) { try (var writer = new SlopDocumentRecord.Writer(testDir, 0)) {