Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git

(index, EXPERIMENTAL) Evaluate using Varint instead of GCS for position data

parent 30bf845c81
commit abab5bdc8a
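
As context for the change below: position and span lists are stored as the gaps between consecutive (strictly increasing) positions, and each gap is written as a variable-length integer rather than a gamma-coded sequence (GCS). The following is a minimal illustrative sketch of that scheme, not code from this commit (class and method names are invented); it uses the same byte layout as the new encodeValue/decodeValue pair, i.e. most-significant 7-bit group first, with the high bit set on every byte except the last.

```java
// Sketch only: delta + varint coding of an increasing position list.
import java.nio.ByteBuffer;

class VarintPositionsSketch {
    static void writeVarint(ByteBuffer out, int value) {
        if (value < (1 << 7)) {
            out.put((byte) value);
        } else if (value < (1 << 14)) {
            out.put((byte) ((value >>> 7) | 0x80));
            out.put((byte) (value & 0x7F));
        } else { // wider values continue the same pattern, as in encodeValue below
            out.put((byte) ((value >>> 14) | 0x80));
            out.put((byte) ((value >>> 7) | 0x80));
            out.put((byte) (value & 0x7F));
        }
    }

    static int readVarint(ByteBuffer in) {
        byte b = in.get();
        if ((b & 0x80) == 0) return b;
        int value = b & 0x7F;
        do {
            b = in.get();
            value = (value << 7) | (b & 0x7F);
        } while ((b & 0x80) != 0);
        return value;
    }

    public static void main(String[] args) {
        int[] positions = { 16, 21, 24, 28, 66, 1024 }; // strictly increasing
        ByteBuffer buf = ByteBuffer.allocate(64);

        int prev = 0;
        for (int p : positions) {      // encode gaps, not absolute values
            writeVarint(buf, p - prev);
            prev = p;
        }
        buf.flip();

        prev = 0;
        while (buf.hasRemaining()) {   // decode back to absolute positions
            prev += readVarint(buf);
            System.out.print(prev + " "); // prints: 16 21 24 28 66 1024
        }
    }
}
```
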
@@ -1,6 +1,6 @@
 package nu.marginalia.model.idx;
 
-import nu.marginalia.sequence.GammaCodedSequence;
+import nu.marginalia.sequence.VarintCodedSequence;
 
-public record CodedWordSpan(byte code, GammaCodedSequence spans) {
+public record CodedWordSpan(byte code, VarintCodedSequence spans) {
 }

@@ -120,7 +120,6 @@ public class ForwardIndexConverter {
 for (int i = 0; i < spansCodes.length; i++) {
 spansWriter.writeSpan(spansCodes[i], spans.get(i));
 }
 
 long encodedSpansOffset = spansWriter.endRecord();

@@ -6,6 +6,7 @@ import nu.marginalia.sequence.CodedSequence;
 
 import java.util.Arrays;
 
+/** A list of the interlaced start and end positions of each span in the document of this type */
 public class DocumentSpan {
 
 /** A list of the interlaced start and end positions of each span in the document of this type */

@@ -19,6 +20,7 @@ public class DocumentSpan {
 this.startsEnds = null;
 }
 
+/** Counts the number of intersections between the spans in the document of this type and the given list of positions */
 public int countIntersections(int[] positions) {
 if (null == startsEnds || startsEnds.isEmpty() || positions.length == 0) {
 return 0;

@@ -26,37 +28,39 @@
 int cnt = 0;
 
-if (positions.length < 8) {
+if (positions.length < 8) { // for small arrays we can do a linear search
 int seis = 0;
 
 for (int pi = 0; pi < positions.length; pi++) {
 int position = positions[pi];
 
+// search through the spans until we find an item that is greater than the given position
 for (int sei = seis; sei < startsEnds.size(); sei ++) {
 if (startsEnds.getInt(sei) > position) {
-cnt += sei % 2;
+cnt += sei % 2; // if sei is odd, we are between a start and end position in the spans list
 seis = Math.max(seis, sei - 1);
 break;
 }
 }
 }
 }
-else {
-int ss = 0;
+else { // for large arrays we use a binary search
+int searchStart = 0;
 
-for (int sei = 0; sei < startsEnds.size() && ss < positions.length; ) {
+for (int sei = 0; sei < startsEnds.size() && searchStart < positions.length; ) {
 int start = startsEnds.getInt(sei++);
 int end = startsEnds.getInt(sei++);
 
-int i = Arrays.binarySearch(positions, ss, positions.length, start);
-if (i < 0) {
-i = -i - 1;
-}
+// find the first position that is greater or equal to the start position
+int i = Arrays.binarySearch(positions, searchStart, positions.length, start);
+if (i < 0) i = -i - 1; // if the position is not found, we get the insertion point
+
+// ... from that point, count the number of positions that smaller than the end position
 while (i < positions.length && positions[i] < end) {
 cnt++;
 i++;
 }
-ss = i;
+searchStart = i;
 }
 }

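A note on the data layout used above: DocumentSpan keeps its spans as one interlaced list [start1, end1, start2, end2, ...], so a position lies inside a span exactly when the first interlaced value greater than it sits at an odd index, which is what the cnt += sei % 2 trick exploits. A small illustrative sketch of that invariant (hypothetical class, mirroring only the linear-search branch):

```java
// Sketch: count positions that fall inside interlaced [start, end) spans.
import it.unimi.dsi.fastutil.ints.IntList;

class SpanIntersectionSketch {
    static int countIntersections(IntList startsEnds, int[] positions) {
        int cnt = 0;
        for (int position : positions) {
            // scan for the first interlaced value greater than the position;
            // an odd index means we are between a start and its matching end
            for (int sei = 0; sei < startsEnds.size(); sei++) {
                if (startsEnds.getInt(sei) > position) {
                    cnt += sei % 2;
                    break;
                }
            }
        }
        return cnt;
    }

    public static void main(String[] args) {
        IntList spans = IntList.of(5, 10, 20, 25); // spans [5,10) and [20,25)
        int[] positions = { 3, 6, 10, 22 };
        System.out.println(countIntersections(spans, positions)); // prints 2
    }
}
```
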
@@ -83,6 +87,8 @@ public class DocumentSpan {
 return false;
 }
 
+/** Returns true if for any position in the list, there exists a range
+ * (position[i], position[i]+len] that is overlapped by a span */
 public boolean containsRange(IntList positions, int len) {
 if (null == startsEnds || startsEnds.size() < 2 || positions.isEmpty()) {
 return false;

@@ -3,6 +3,10 @@ package nu.marginalia.index.forward.spans;
 import nu.marginalia.language.sentence.tag.HtmlTag;
 import nu.marginalia.sequence.CodedSequence;
 
+/** All spans associated with a document
+ * <p></p>
+ * A span is a list of document positions that are associated with a particular tag in the document.
+ * */
 public class DocumentSpans {
 private static final DocumentSpan EMPTY_SPAN = new DocumentSpan();

@@ -1,9 +1,10 @@
 package nu.marginalia.index.forward.spans;
 
-import nu.marginalia.sequence.GammaCodedSequence;
+import nu.marginalia.sequence.VarintCodedSequence;
 
 import java.io.IOException;
 import java.lang.foreign.Arena;
+import java.nio.ByteBuffer;
 import java.nio.channels.FileChannel;
 import java.nio.file.Files;
 import java.nio.file.Path;

@@ -18,9 +19,11 @@ public class ForwardIndexSpansReader implements AutoCloseable {
 }
 
 public DocumentSpans readSpans(Arena arena, long encodedOffset) throws IOException {
+// Decode the size and offset from the encoded offset
 long size = SpansCodec.decodeSize(encodedOffset);
 long offset = SpansCodec.decodeStartOffset(encodedOffset);
 
+// Allocate a buffer from the arena
 var buffer = arena.allocate(size).asByteBuffer();
 buffer.clear();
 while (buffer.hasRemaining()) {

@@ -28,15 +31,18 @@ public class ForwardIndexSpansReader implements AutoCloseable {
 }
 buffer.flip();
 
 // Read the number of spans in the document
 int count = buffer.get();
 
 DocumentSpans ret = new DocumentSpans();
 
 // Decode each span
 while (count-- > 0) {
 byte code = buffer.get();
 short len = buffer.getShort();
 
-ret.accept(code, new GammaCodedSequence(buffer.slice(buffer.position(), len)));
+ByteBuffer data = buffer.slice(buffer.position(), len);
+ret.accept(code, new VarintCodedSequence(data));
 
 // Reset the buffer position to the end of the span
 buffer.position(buffer.position() + len);

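For reference, the record being decoded here is framed as: one byte with the number of spans, then for each span a one-byte tag code, a two-byte length, and that many bytes of coded positions. A simplified, hypothetical sketch of just that framing (the real reader also unpacks the size/offset pair via SpansCodec and fills an Arena-backed buffer from a FileChannel):

```java
// Sketch of the per-document span record framing read by readSpans above:
// [count: byte] then per span [code: byte][len: short][len bytes of payload].
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;

class SpanRecordSketch {
    record Span(byte code, ByteBuffer payload) {}

    static List<Span> decode(ByteBuffer buffer) {
        List<Span> spans = new ArrayList<>();
        int count = buffer.get();
        while (count-- > 0) {
            byte code = buffer.get();
            short len = buffer.getShort();
            ByteBuffer payload = buffer.slice(buffer.position(), len);
            spans.add(new Span(code, payload));
            buffer.position(buffer.position() + len); // step past this span's payload
        }
        return spans;
    }
}
```
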
@@ -1,6 +1,6 @@
 package nu.marginalia.index.journal;
 
-import nu.marginalia.sequence.slop.GammaCodedSequenceArrayColumn;
+import nu.marginalia.sequence.slop.VarintCodedSequenceArrayColumn;
 import nu.marginalia.slop.SlopTable;
 import nu.marginalia.slop.column.array.ByteArrayColumn;
 import nu.marginalia.slop.column.array.LongArrayColumn;

@@ -19,10 +19,10 @@ public record IndexJournalPage(Path baseDir, int page) {
 
 public static LongArrayColumn termIds = new LongArrayColumn("termIds", StorageType.ZSTD);
 public static ByteArrayColumn termMeta = new ByteArrayColumn("termMetadata", StorageType.ZSTD);
-public static GammaCodedSequenceArrayColumn positions = new GammaCodedSequenceArrayColumn("termPositions", StorageType.ZSTD);
+public static VarintCodedSequenceArrayColumn positions = new VarintCodedSequenceArrayColumn("termPositions", StorageType.ZSTD);
 
 public static ByteArrayColumn spanCodes = new ByteArrayColumn("spanCodes", StorageType.ZSTD);
-public static GammaCodedSequenceArrayColumn spans = new GammaCodedSequenceArrayColumn("spans", StorageType.ZSTD);
+public static VarintCodedSequenceArrayColumn spans = new VarintCodedSequenceArrayColumn("spans", StorageType.ZSTD);
 
 public IndexJournalPage {
 if (!baseDir.toFile().isDirectory()) {

@@ -55,11 +55,11 @@ public record IndexJournalPage(Path baseDir, int page) {
 return termMeta.open(table);
 }
 
-public GammaCodedSequenceArrayColumn.Reader openTermPositions(SlopTable table) throws IOException {
+public VarintCodedSequenceArrayColumn.Reader openTermPositions(SlopTable table) throws IOException {
 return positions.open(table);
 }
 
-public GammaCodedSequenceArrayColumn.Reader openSpans(SlopTable table) throws IOException {
+public VarintCodedSequenceArrayColumn.Reader openSpans(SlopTable table) throws IOException {
 return spans.open(table);
 }

@@ -3,7 +3,7 @@ package nu.marginalia.index.journal;
 import lombok.SneakyThrows;
 import nu.marginalia.hash.MurmurHash3_128;
 import nu.marginalia.model.processed.SlopDocumentRecord;
-import nu.marginalia.sequence.slop.GammaCodedSequenceArrayColumn;
+import nu.marginalia.sequence.slop.VarintCodedSequenceArrayColumn;
 import nu.marginalia.slop.SlopTable;
 import nu.marginalia.slop.column.array.ByteArrayColumn;
 import nu.marginalia.slop.column.array.LongArrayColumn;

@@ -24,9 +24,9 @@ public class IndexJournalSlopWriter extends SlopTable {
 
 private final LongArrayColumn.Writer termIdsWriter;
 private final ByteArrayColumn.Writer termMetadataWriter;
-private final GammaCodedSequenceArrayColumn.Writer termPositionsWriter;
+private final VarintCodedSequenceArrayColumn.Writer termPositionsWriter;
 
-private final GammaCodedSequenceArrayColumn.Writer spansWriter;
+private final VarintCodedSequenceArrayColumn.Writer spansWriter;
 private final ByteArrayColumn.Writer spanCodesWriter;
 
 private static final MurmurHash3_128 hash = new MurmurHash3_128();

@@ -1,7 +1,7 @@
 package nu.marginalia.index.positions;
 
 import nu.marginalia.sequence.CodedSequence;
-import nu.marginalia.sequence.GammaCodedSequence;
+import nu.marginalia.sequence.VarintCodedSequence;
 
 import java.nio.ByteBuffer;

@@ -17,6 +17,6 @@ public class TermData {
 }
 
 public CodedSequence positions() {
-return new GammaCodedSequence(buffer, 1, buffer.capacity());
+return new VarintCodedSequence(buffer, 1, buffer.capacity());
 }
 }

@@ -3,11 +3,10 @@ package nu.marginalia.index.construction.full;
 import nu.marginalia.index.journal.IndexJournalPage;
 import nu.marginalia.index.journal.IndexJournalSlopWriter;
 import nu.marginalia.model.processed.SlopDocumentRecord;
-import nu.marginalia.sequence.GammaCodedSequence;
+import nu.marginalia.sequence.VarintCodedSequence;
 import nu.marginalia.test.TestUtil;
 
 import java.io.IOException;
-import java.nio.ByteBuffer;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.Arrays;

@@ -46,14 +45,14 @@ public class TestJournalFactory {
 '}';
 }
 }
-public record WordWithMeta(String wordId, byte meta, GammaCodedSequence gcs) {
-public WordWithMeta(long wordId, byte meta, GammaCodedSequence gcs) {
+public record WordWithMeta(String wordId, byte meta, VarintCodedSequence gcs) {
+public WordWithMeta(long wordId, byte meta, VarintCodedSequence gcs) {
 this(String.valueOf(wordId), meta, gcs);
 }
 }
 
 public static WordWithMeta wm(long wordId, int meta, int... positions) {
-return new WordWithMeta(wordId, (byte) meta, GammaCodedSequence.generate(ByteBuffer.allocate(128), positions));
+return new WordWithMeta(wordId, (byte) meta, VarintCodedSequence.generate(positions));
 }
 
 public IndexJournalPage createReader(EntryData... entries) throws IOException {

@@ -64,11 +63,11 @@ public class TestJournalFactory {
 String[] termIds = new String[entry.wordIds.length];
 byte[] meta = new byte[entry.wordIds.length];
 
-GammaCodedSequence[] positions = new GammaCodedSequence[entry.wordIds.length];
+VarintCodedSequence[] positions = new VarintCodedSequence[entry.wordIds.length];
 for (int i = 0; i < entry.wordIds.length; i++) {
 termIds[i] = entry.wordIds[i];
 meta[i] = 0;
-positions[i] = new GammaCodedSequence(new byte[1]);
+positions[i] = VarintCodedSequence.generate();
 }
 
 writer.put(

@@ -100,11 +99,11 @@ public class TestJournalFactory {
 
 String[] termIds = new String[entry.wordIds.length];
 byte[] meta = new byte[entry.wordIds.length];
-GammaCodedSequence[] positions = new GammaCodedSequence[entry.wordIds.length];
+VarintCodedSequence[] positions = new VarintCodedSequence[entry.wordIds.length];
 for (int i = 0; i < entry.wordIds.length; i++) {
 termIds[i] = entry.wordIds[i].wordId;
 meta[i] = entry.wordIds[i].meta;
-positions[i] = Objects.requireNonNullElseGet(entry.wordIds[i].gcs, () -> new GammaCodedSequence(new byte[1]));
+positions[i] = Objects.requireNonNullElseGet(entry.wordIds[i].gcs, VarintCodedSequence::generate);
 }
 
 writer.put(

@@ -28,7 +28,7 @@ import nu.marginalia.model.idx.WordFlags;
 import nu.marginalia.model.processed.SlopDocumentRecord;
 import nu.marginalia.process.control.FakeProcessHeartbeat;
 import nu.marginalia.process.control.ProcessHeartbeat;
-import nu.marginalia.sequence.GammaCodedSequence;
+import nu.marginalia.sequence.VarintCodedSequence;
 import nu.marginalia.service.server.Initialization;
 import nu.marginalia.storage.FileStorageService;
 import org.junit.jupiter.api.AfterEach;

@@ -39,7 +39,6 @@ import org.junit.jupiter.api.parallel.Execution;
 import java.io.IOException;
 import java.lang.foreign.Arena;
 import java.net.URISyntaxException;
-import java.nio.ByteBuffer;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.sql.SQLException;

@@ -321,7 +320,7 @@ public class CombinedIndexReaderTest {
 for (int i = 0; i < words.size(); i++) {
 metadata[i] = words.get(i).termMetadata;
 }
-var positions = words.stream().map(w -> w.positions).map(pos -> GammaCodedSequence.generate(ByteBuffer.allocate(1024), pos.toIntArray())).toList();
+var positions = words.stream().map(w -> w.positions).map(pos -> VarintCodedSequence.generate(pos.toIntArray())).toList();
 
 indexJournalWriter.put(doc,
 new SlopDocumentRecord.KeywordsProjection(

@@ -31,7 +31,7 @@ import nu.marginalia.model.idx.WordFlags;
 import nu.marginalia.model.processed.SlopDocumentRecord;
 import nu.marginalia.process.control.FakeProcessHeartbeat;
 import nu.marginalia.process.control.ProcessHeartbeat;
-import nu.marginalia.sequence.GammaCodedSequence;
+import nu.marginalia.sequence.VarintCodedSequence;
 import nu.marginalia.service.control.ServiceHeartbeat;
 import nu.marginalia.service.server.Initialization;
 import nu.marginalia.storage.FileStorageService;

@@ -377,11 +377,11 @@ public class IndexQueryServiceIntegrationSmokeTest {
 metadata[i] = WordFlags.Title.asBit();
 }
 
-List<GammaCodedSequence> positions = new ArrayList<>();
+List<VarintCodedSequence> positions = new ArrayList<>();
 
 ByteBuffer wa = ByteBuffer.allocate(32);
 for (int i = 0; i < factors.length; i++) {
-positions.add(GammaCodedSequence.generate(wa, factors));
+positions.add(VarintCodedSequence.generate(factors));
 }
 
 indexJournalWriter.put(fullId,

@@ -417,11 +417,11 @@ public class IndexQueryServiceIntegrationSmokeTest {
 metadata[i] = WordFlags.Title.asBit();
 }
 
-List<GammaCodedSequence> positions = new ArrayList<>();
+List<VarintCodedSequence> positions = new ArrayList<>();
 
 ByteBuffer wa = ByteBuffer.allocate(32);
 for (int i = 0; i < factors.length; i++) {
-positions.add(GammaCodedSequence.generate(wa, i + 1));
+positions.add(VarintCodedSequence.generate(i + 1));
 }
 
 indexJournalWriter.put(fullId,

@@ -33,7 +33,7 @@ import nu.marginalia.model.idx.WordFlags;
 import nu.marginalia.model.processed.SlopDocumentRecord;
 import nu.marginalia.process.control.FakeProcessHeartbeat;
 import nu.marginalia.process.control.ProcessHeartbeat;
-import nu.marginalia.sequence.GammaCodedSequence;
+import nu.marginalia.sequence.VarintCodedSequence;
 import nu.marginalia.service.control.ServiceHeartbeat;
 import nu.marginalia.service.server.Initialization;
 import nu.marginalia.storage.FileStorageService;

@@ -46,7 +46,6 @@ import org.junit.jupiter.api.parallel.Execution;
 import javax.annotation.CheckReturnValue;
 import java.io.IOException;
 import java.net.URISyntaxException;
-import java.nio.ByteBuffer;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.sql.SQLException;

@@ -544,10 +543,9 @@ public class IndexQueryServiceIntegrationTest {
 metadata[i] = (byte) words.get(i).termMetadata;
 }
 
-List<GammaCodedSequence> positions = new ArrayList<>();
-ByteBuffer workBuffer = ByteBuffer.allocate(8192);
+List<VarintCodedSequence> positions = new ArrayList<>();
 for (int i = 0; i < words.size(); i++) {
-positions.add(GammaCodedSequence.generate(workBuffer, words.get(i).positions));
+positions.add(VarintCodedSequence.generate(words.get(i).positions));
 }
 
 indexJournalWriter.put(doc,

@@ -20,6 +20,13 @@ public class VarintCodedSequence implements CodedSequence {
 this.startLimit = buffer.limit();
 }
 
+public VarintCodedSequence(ByteBuffer buffer, int startPos, int startLimit) {
+this.raw = buffer;
+
+this.startPos = startPos;
+this.startLimit = startLimit;
+}
+
 private static int requiredBufferSize(int[] values) {
 int prev = 0;
 int size = 0;

@@ -32,11 +39,47 @@ public class VarintCodedSequence implements CodedSequence {
 return size + varintSize(size + 1);
 }
 
+private static int requiredBufferSize(IntList values) {
+int prev = 0;
+int size = 0;
+
+for (int i = 0; i < values.size(); i++) {
+int value = values.getInt(i);
+size += varintSize(value - prev);
+prev = value;
+}
+
+return size + varintSize(size + 1);
+}
+
 private static int varintSize(int value) {
 int bits = 32 - Integer.numberOfLeadingZeros(value);
 return (bits + 6) / 7;
 }
 
+public static VarintCodedSequence generate(IntList values) {
+int bufferSize = requiredBufferSize(values);
+ByteBuffer buffer = ByteBuffer.allocate(bufferSize);
+
+int prev = 0;
+
+encodeValue(buffer, values.size() + 1);
+
+for (int i = 0; i < values.size(); i++) {
+int value = values.getInt(i);
+int toEncode = value - prev;
+assert toEncode > 0 : "Values must be strictly increasing";
+
+encodeValue(buffer, toEncode);
+
+prev = value;
+}
+
+buffer.flip();
+
+return new VarintCodedSequence(buffer);
+}
+
 public static VarintCodedSequence generate(int... values) {
 int bufferSize = requiredBufferSize(values);
 ByteBuffer buffer = ByteBuffer.allocate(bufferSize);

@@ -60,20 +103,23 @@ public class VarintCodedSequence implements CodedSequence {
 }
 
 private static void encodeValue(ByteBuffer buffer, int value) {
-if (value < 0x80) {
+if (value < (1<<7)) {
 buffer.put((byte) value);
 }
-else if (value < 0x4_000) {
+else if (value < (1<<14)) {
 buffer.put((byte) (value >>> (7) | 0x80));
 buffer.put((byte) (value & 0x7F));
 }
-else if (value < 0x20_0000) {
+else if (value < (1<<21)) {
 buffer.put((byte) (value >>> (14) | 0x80));
 buffer.put((byte) (value >>> (7) | 0x80));
 buffer.put((byte) (value & 0x7F));
 }
-else if (value < 0x1000_0000) {
-buffer.putInt(Integer.expand(value, 0x00808080) | 0x80808000);
+else if (value < (1<<28)) {
+buffer.put((byte) ((value >>> 21) | 0x80));
+buffer.put((byte) ((value >>> 14) | 0x80));
+buffer.put((byte) ((value >>> 7) | 0x80));
+buffer.put((byte) (value & 0x7F));
 }
 else {
 throw new IllegalArgumentException("Value too large to encode");

@@ -139,12 +185,13 @@ public class VarintCodedSequence implements CodedSequence {
 return b;
 }
 
-int value = b;
+int value = b & 0x7F;
 do {
 b = buffer.get();
-value = value << 7 | (b & 0x7F);
+value = (value << 7) | (b & 0x7F);
 } while ((b & 0x80) != 0);
 
 return value;
 }

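A hedged usage sketch of the factory methods added above. It assumes the sequence exposes an IntIterator over the decoded values, which is how SequenceBenchmarks further down consumes it; the accessor name is taken from that benchmark.

```java
// Round trip with the new varint-coded sequence: positions in, positions out.
import nu.marginalia.sequence.VarintCodedSequence;

class VarintRoundTripSketch {
    public static void main(String[] args) {
        // values must be strictly increasing; they are stored as varint-coded gaps
        var seq = VarintCodedSequence.generate(16, 21, 24, 28, 66, 71);

        var iter = seq.iterator();
        while (iter.hasNext()) {
            System.out.print(iter.nextInt() + " "); // 16 21 24 28 66 71
        }
    }
}
```
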
@@ -49,20 +49,22 @@ public class BitReader {
 
 /** Read the next width bits from the buffer */
 public int get(int width) {
-if (width == 0) {
-return 0;
+// Fast path for reading a full integer from the current value
+if (bitPosition >= width) {
+// We have enough bits in the current value to satisfy the request
+int result = (int)(currentValue >>> (bitPosition - width)) & ~-(1<<width);
+// Update the bit position
+bitPosition -= width;
+return result;
 }
 assert width <= 32;
 
 if (bitPosition <= 0) {
 readNext();
 }
 
 int result = 0;
 
-while (width > 0) {
+do {
 int dw = bitPosition - width;
 
 if (dw >= 0) { // We have enough bits in the current value to satisfy the request
 result |= ((int)(currentValue >>> dw)) & ~-(1<<width);

@@ -85,6 +87,7 @@ public class BitReader {
 readNext(); // implicitly: bitPosition = 0 here
 }
 }
+while (width > 0);
 
 return result;
 }

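One detail worth spelling out in the fast path above: ~-(1 << width) equals (1 << width) - 1, i.e. a mask of the low width bits, because negating in two's complement and then inverting subtracts one. A tiny illustrative check (not part of the commit):

```java
// Check of the bit-mask identity used in BitReader.get().
class MaskIdentitySketch {
    public static void main(String[] args) {
        for (int width = 1; width < 32; width++) {
            if (~-(1 << width) != (1 << width) - 1)
                throw new AssertionError("identity fails at width " + width);
        }
        System.out.println(Integer.toBinaryString(~-(1 << 5))); // prints 11111
    }
}
```
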
@@ -0,0 +1,154 @@
+package nu.marginalia.sequence.slop;
+
+import nu.marginalia.sequence.VarintCodedSequence;
+import nu.marginalia.slop.column.AbstractColumn;
+import nu.marginalia.slop.column.AbstractObjectColumn;
+import nu.marginalia.slop.column.ObjectColumnReader;
+import nu.marginalia.slop.column.ObjectColumnWriter;
+import nu.marginalia.slop.column.dynamic.VarintColumn;
+import nu.marginalia.slop.desc.ColumnFunction;
+import nu.marginalia.slop.desc.StorageType;
+
+import java.io.IOException;
+import java.net.URI;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
+
+/** Slop column extension for storing GammaCodedSequence objects. */
+public class VarintCodedSequenceArrayColumn extends AbstractObjectColumn<List<VarintCodedSequence>, VarintCodedSequenceArrayColumn.Reader, VarintCodedSequenceArrayColumn.Writer> {
+
+private final VarintColumn groupsColumn;
+private final VarintCodedSequenceColumn dataColumn;
+
+public VarintCodedSequenceArrayColumn(String name) {
+this(name, StorageType.PLAIN);
+}
+
+public VarintCodedSequenceArrayColumn(String name, StorageType storageType) {
+super(name,
+"vcs[]",
+ByteOrder.nativeOrder(),
+ColumnFunction.DATA,
+storageType);
+
+groupsColumn = new VarintColumn(name, ColumnFunction.GROUP_LENGTH, storageType);
+dataColumn = new VarintCodedSequenceColumn(name);
+}
+
+public Writer createUnregistered(Path path, int page) throws IOException {
+return new Writer(
+dataColumn.createUnregistered(path, page),
+groupsColumn.createUnregistered(path, page)
+);
+}
+
+public Reader openUnregistered(URI uri, int page) throws IOException {
+return new Reader(
+dataColumn.openUnregistered(uri, page),
+groupsColumn.openUnregistered(uri, page)
+);
+}
+
+
+public class Writer implements ObjectColumnWriter<List<VarintCodedSequence>> {
+private final VarintColumn.Writer groupsWriter;
+private final VarintCodedSequenceColumn.Writer dataWriter;
+
+Writer(VarintCodedSequenceColumn.Writer dataWriter, VarintColumn.Writer groupsWriter)
+{
+this.groupsWriter = groupsWriter;
+this.dataWriter = dataWriter;
+}
+
+@Override
+public AbstractColumn<?, ?> columnDesc() {
+return VarintCodedSequenceArrayColumn.this;
+}
+
+@Override
+public void put(List<VarintCodedSequence> sequences) throws IOException {
+groupsWriter.put(sequences.size());
+for (VarintCodedSequence sequence : sequences) {
+dataWriter.put(sequence);
+}
+}
+
+public long position() {
+return groupsWriter.position();
+}
+
+public void close() throws IOException {
+dataWriter.close();
+groupsWriter.close();
+}
+}
+
+public class Reader implements ObjectColumnReader<List<VarintCodedSequence>> {
+private final VarintCodedSequenceColumn.Reader dataReader;
+private final VarintColumn.Reader groupsReader;
+
+public Reader(VarintCodedSequenceColumn.Reader dataReader, VarintColumn.Reader groupsReader) {
+this.dataReader = dataReader;
+this.groupsReader = groupsReader;
+}
+
+@Override
+public AbstractColumn<?, ?> columnDesc() {
+return VarintCodedSequenceArrayColumn.this;
+}
+
+@Override
+public void skip(long positions) throws IOException {
+int toSkip = 0;
+for (int i = 0; i < positions; i++) {
+toSkip += groupsReader.get();
+}
+dataReader.skip(toSkip);
+}
+
+@Override
+public boolean hasRemaining() throws IOException {
+return groupsReader.hasRemaining();
+}
+
+public long position() throws IOException {
+return groupsReader.position();
+}
+
+@Override
+public List<VarintCodedSequence> get() throws IOException {
+int count = groupsReader.get();
+var ret = new ArrayList<VarintCodedSequence>(count);
+
+for (int i = 0; i < count; i++) {
+ret.add(dataReader.get());
+}
+
+return ret;
+}
+
+public List<ByteBuffer> getData(ByteBuffer workArea) throws IOException {
+int count = groupsReader.get();
+var ret = new ArrayList<ByteBuffer>(count);
+
+for (int i = 0; i < count; i++) {
+int start = workArea.position();
+dataReader.getData(workArea);
+var slice = workArea.slice(start, workArea.position() - start);
+ret.add(slice);
+}
+
+return ret;
+}
+
+
+public void close() throws IOException {
+dataReader.close();
+groupsReader.close();
+}
+
+}
+}

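The array column above encodes a list of sequences per row by writing the list size to a separate group-length column and appending the sequences themselves, flattened, to the data column; the reader consumes the two streams in lockstep. A rough sketch of that layout with plain in-memory collections (hypothetical generic class, not the Slop API):

```java
// Sketch of the "group length + flattened data" layout used by
// VarintCodedSequenceArrayColumn: one stream of row sizes, one stream of items.
import java.util.ArrayList;
import java.util.List;

class GroupedColumnSketch<T> {
    private final List<Integer> groupLengths = new ArrayList<>(); // one entry per row
    private final List<T> data = new ArrayList<>();               // all items, flattened
    private int groupCursor = 0;
    private int dataCursor = 0;

    void put(List<T> row) {
        groupLengths.add(row.size());
        data.addAll(row);
    }

    List<T> get() {
        int count = groupLengths.get(groupCursor++);
        List<T> row = new ArrayList<>(data.subList(dataCursor, dataCursor + count));
        dataCursor += count;
        return row;
    }
}
```
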
@@ -0,0 +1,148 @@
+package nu.marginalia.sequence.slop;
+
+import nu.marginalia.sequence.VarintCodedSequence;
+import nu.marginalia.slop.column.AbstractColumn;
+import nu.marginalia.slop.column.AbstractObjectColumn;
+import nu.marginalia.slop.column.ObjectColumnReader;
+import nu.marginalia.slop.column.ObjectColumnWriter;
+import nu.marginalia.slop.column.dynamic.VarintColumn;
+import nu.marginalia.slop.desc.ColumnFunction;
+import nu.marginalia.slop.desc.StorageType;
+import nu.marginalia.slop.storage.Storage;
+import nu.marginalia.slop.storage.StorageReader;
+import nu.marginalia.slop.storage.StorageWriter;
+
+import java.io.IOException;
+import java.net.URI;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.file.Path;
+
+/** Slop column extension for storing GammaCodedSequence objects. */
+public class VarintCodedSequenceColumn extends AbstractObjectColumn<VarintCodedSequence, VarintCodedSequenceColumn.Reader, VarintCodedSequenceColumn.Writer> {
+
+private final VarintColumn indexColumn;
+
+public VarintCodedSequenceColumn(String name) {
+this(name, StorageType.PLAIN);
+}
+
+public VarintCodedSequenceColumn(String name, StorageType storageType) {
+super(name,
+"vcs",
+ByteOrder.nativeOrder(),
+ColumnFunction.DATA,
+storageType);
+
+indexColumn = new VarintColumn(name, ColumnFunction.DATA_LEN, StorageType.PLAIN);
+}
+
+public Writer createUnregistered(Path path, int page) throws IOException {
+return new Writer(
+Storage.writer(path, this, page),
+indexColumn.createUnregistered(path, page)
+);
+}
+
+public Reader openUnregistered(URI uri, int page) throws IOException {
+return new Reader(
+Storage.reader(uri, this, page, false),
+indexColumn.openUnregistered(uri, page)
+);
+}
+
+public class Writer implements ObjectColumnWriter<VarintCodedSequence> {
+private final VarintColumn.Writer indexWriter;
+private final StorageWriter storage;
+
+public Writer(StorageWriter storage,
+VarintColumn.Writer indexWriter)
+{
+this.storage = storage;
+
+this.indexWriter = indexWriter;
+}
+
+@Override
+public AbstractColumn<?, ?> columnDesc() {
+return VarintCodedSequenceColumn.this;
+}
+
+@Override
+public void put(VarintCodedSequence sequence) throws IOException {
+var buffer = sequence.buffer();
+int length = buffer.remaining();
+
+indexWriter.put(length);
+storage.putBytes(buffer);
+}
+
+public long position() {
+return indexWriter.position();
+}
+
+public void close() throws IOException {
+indexWriter.close();
+storage.close();
+}
+}
+
+public class Reader implements ObjectColumnReader<VarintCodedSequence> {
+private final VarintColumn.Reader indexReader;
+private final StorageReader storage;
+
+Reader(StorageReader reader, VarintColumn.Reader indexReader) throws IOException {
+this.storage = reader;
+this.indexReader = indexReader;
+}
+
+@Override
+public AbstractColumn<?, ?> columnDesc() {
+return VarintCodedSequenceColumn.this;
+}
+
+@Override
+public void skip(long positions) throws IOException {
+for (int i = 0; i < positions; i++) {
+int size = indexReader.get();
+storage.skip(size, 1);
+}
+}
+
+@Override
+public boolean hasRemaining() throws IOException {
+return indexReader.hasRemaining();
+}
+
+public long position() throws IOException {
+return indexReader.position();
+}
+
+@Override
+public VarintCodedSequence get() throws IOException {
+int size = indexReader.get();
+
+ByteBuffer dest = ByteBuffer.allocate(size);
+storage.getBytes(dest);
+dest.flip();
+
+return new VarintCodedSequence(dest);
+}
+
+public void getData(ByteBuffer workArea) throws IOException {
+int size = indexReader.get();
+
+int oldLimit = workArea.limit();
+workArea.limit(workArea.position() + size);
+storage.getBytes(workArea);
+workArea.limit(oldLimit);
+}
+
+
+public void close() throws IOException {
+indexReader.close();
+storage.close();
+}
+
+}
+}

@@ -25,51 +25,51 @@ public class SequenceBenchmarks {
 workArea = ByteBuffer.allocate(65536);
 arrayValues = new int[] { 1, 3, 5, 16, 1024, 2048, 4096, 4098, 4100 };
 list = new IntArrayList(arrayValues);
-vcs = VarintCodedSequence.generate(1, 3, 5, 16, 1024, 2048);
-gcs = GammaCodedSequence.generate(workArea, 1, 3, 5, 16, 1024, 2048);
+vcs = VarintCodedSequence.generate(16,21,24,28,66,71,76,83,87,98,101,106,113,115,119,122,143,148,159,164,167,177,182,211,223,242,245,250,273,275,280,289,292,300,307,322,330,338,345,371,397,402,411,420,427,430,433,437,440,448,451,481,490,513,522,555,571,573,585,597,606,613,634,638,640,644,656,660,666,683,689,692,696,709,712,718,727,731,735,738);
+gcs = GammaCodedSequence.generate(workArea, 16,21,24,28,66,71,76,83,87,98,101,106,113,115,119,122,143,148,159,164,167,177,182,211,223,242,245,250,273,275,280,289,292,300,307,322,330,338,345,371,397,402,411,420,427,430,433,437,440,448,451,481,490,513,522,555,571,573,585,597,606,613,634,638,640,644,656,660,666,683,689,692,696,709,712,718,727,731,735,738);
 }
 }
 
 @Fork(value = 1, warmups = 1)
 @Warmup(iterations = 1)
 @Benchmark
 @BenchmarkMode(Mode.Throughput)
 public int vcsDecode(SequenceState state) {
 var iter = state.vcs.iterator();
 int sum = 0;
 while (iter.hasNext()) {
 sum += iter.nextInt();
 }
 return sum;
 }
 //
 // @Fork(value = 5, warmups = 5)
 // @Warmup(iterations = 5)
 // @Benchmark
 // @BenchmarkMode(Mode.Throughput)
-// public int vcsDecode(SequenceState state) {
-// var iter = state.vcs.iterator();
+// public int listDecode2(SequenceState state) {
+// var list = state.arrayValues;
 // int sum = 0;
-// while (iter.hasNext()) {
-// sum += iter.nextInt();
+// for (int i = 0; i < list.length; i++) {
+// sum += list[i];
 // }
 // return sum;
 // }
 
-@Fork(value = 5, warmups = 5)
-@Warmup(iterations = 5)
 
+@Fork(value = 1, warmups = 1)
+@Warmup(iterations = 1)
 @Benchmark
 @BenchmarkMode(Mode.Throughput)
-public int listDecode2(SequenceState state) {
-var list = state.arrayValues;
+public int gcsDecode(SequenceState state) {
+var iter = state.gcs.iterator();
 int sum = 0;
-for (int i = 0; i < list.length; i++) {
-sum += list[i];
+while (iter.hasNext()) {
+sum += iter.nextInt();
 }
 return sum;
 }
 
 
 // @Fork(value = 1, warmups = 1)
 // @Warmup(iterations = 1)
 // @Benchmark
 // @BenchmarkMode(Mode.Throughput)
 // public int gcsDecode(SequenceState state) {
 // var iter = state.gcs.iterator();
 // int sum = 0;
 // while (iter.hasNext()) {
 // sum += iter.nextInt();
 // }
 // return sum;
 // }
 
 // @Fork(value = 1, warmups = 1)
 // @Warmup(iterations = 1)
 // @Benchmark

@@ -63,6 +63,8 @@ class SequenceOperationsTest {
 assertTrue(SequenceOperations.intersectSequences(seq1.iterator(), seq2.iterator(), seq3.iterator()));
 }
 
+
+
 @Test
 void intersectSequencesDeepMatch3findIntersections() {
 ByteBuffer wa = ByteBuffer.allocate(1024);

@@ -1,7 +1,7 @@
 package nu.marginalia.keyword.model;
 
 import nu.marginalia.model.idx.CodedWordSpan;
-import nu.marginalia.sequence.GammaCodedSequence;
+import nu.marginalia.sequence.VarintCodedSequence;
 
 import java.util.List;

@@ -9,12 +9,12 @@ public final class DocumentKeywords {
 
 public final List<String> keywords;
 public final byte[] metadata;
-public final List<GammaCodedSequence> positions;
+public final List<VarintCodedSequence> positions;
 public final List<CodedWordSpan> spans;
 
 public DocumentKeywords(List<String> keywords,
 byte[] metadata,
-List<GammaCodedSequence> positions,
+List<VarintCodedSequence> positions,
 List<CodedWordSpan> spans)
 {
 this.keywords = keywords;

@@ -8,7 +8,7 @@ import lombok.Getter;
 import nu.marginalia.language.sentence.tag.HtmlTag;
 import nu.marginalia.model.idx.CodedWordSpan;
 import nu.marginalia.model.idx.WordFlags;
-import nu.marginalia.sequence.GammaCodedSequence;
+import nu.marginalia.sequence.VarintCodedSequence;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 

@@ -39,7 +39,7 @@ public class DocumentKeywordsBuilder {
 public DocumentKeywords build(ByteBuffer workArea) {
 final List<String> wordArray = new ArrayList<>(wordToMeta.size());
 final TByteArrayList meta = new TByteArrayList(wordToMeta.size());
-final List<GammaCodedSequence> positions = new ArrayList<>(wordToMeta.size());
+final List<VarintCodedSequence> positions = new ArrayList<>(wordToMeta.size());
 
 var iter = wordToMeta.object2ByteEntrySet().fastIterator();
 

@@ -49,13 +49,13 @@ public class DocumentKeywordsBuilder {
 meta.add(entry.getByteValue());
 wordArray.add(entry.getKey());
 
-var posList = wordToPos.getOrDefault(entry.getKey(), IntList.of());
+IntList posList = wordToPos.getOrDefault(entry.getKey(), IntList.of());
 
 if (posList.size() > MAX_POSITIONS_PER_WORD) {
 posList.subList(MAX_POSITIONS_PER_WORD, posList.size()).clear();
 }
 
-positions.add(GammaCodedSequence.generate(workArea, posList));
+positions.add(VarintCodedSequence.generate(posList));
 }
 
 // Encode spans

@@ -70,7 +70,7 @@ public class DocumentKeywordsBuilder {
 positionsForTag.add(span.end());
 }
 
-spans.add(new CodedWordSpan((byte) tag.charValue(), GammaCodedSequence.generate(workArea, positionsForTag)));
+spans.add(new CodedWordSpan((byte) tag.charValue(), VarintCodedSequence.generate(positionsForTag)));
 });
 
 return new DocumentKeywords(wordArray, meta.toArray(), positions, spans);

@@ -12,7 +12,7 @@ import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.model.processed.SlopDocumentRecord;
 import nu.marginalia.model.processed.SlopDomainLinkRecord;
 import nu.marginalia.model.processed.SlopDomainRecord;
-import nu.marginalia.sequence.GammaCodedSequence;
+import nu.marginalia.sequence.VarintCodedSequence;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 

@@ -96,7 +96,7 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter
 
 var wb = document.words.build(workArea);
 
-List<GammaCodedSequence> spanSequences = new ArrayList<>(wb.spans.size());
+List<VarintCodedSequence> spanSequences = new ArrayList<>(wb.spans.size());
 byte[] spanCodes = new byte[wb.spans.size()];
 
 for (int i = 0; i < wb.spans.size(); i++) {

@@ -1,8 +1,8 @@
 package nu.marginalia.model.processed;
 
 import lombok.Builder;
-import nu.marginalia.sequence.GammaCodedSequence;
-import nu.marginalia.sequence.slop.GammaCodedSequenceArrayColumn;
+import nu.marginalia.sequence.VarintCodedSequence;
+import nu.marginalia.sequence.slop.VarintCodedSequenceArrayColumn;
 import nu.marginalia.slop.SlopTable;
 import nu.marginalia.slop.column.array.ByteArrayColumn;
 import nu.marginalia.slop.column.array.ObjectArrayColumn;

@@ -39,9 +39,9 @@ public record SlopDocumentRecord(
 Integer pubYear,
 List<String> words,
 byte[] metas,
-List<GammaCodedSequence> positions,
+List<VarintCodedSequence> positions,
 byte[] spanCodes,
-List<GammaCodedSequence> spans
+List<VarintCodedSequence> spans
 ) {
 
 public SlopDocumentRecord {

@@ -60,9 +60,9 @@ public record SlopDocumentRecord(
 int length,
 List<String> words,
 byte[] metas,
-List<GammaCodedSequence> positions,
+List<VarintCodedSequence> positions,
 byte[] spanCodes,
-List<GammaCodedSequence> spans)
+List<VarintCodedSequence> spans)
 {
 // Override the equals method since records don't generate default equals that deal with array fields properly
 @Override

@@ -127,12 +127,12 @@ public record SlopDocumentRecord(
 
 private static final ObjectArrayColumn<String> keywordsColumn = new StringColumn("keywords", StorageType.ZSTD).asArray();
 private static final ByteArrayColumn termMetaColumn = new ByteArrayColumn("termMetadata", StorageType.ZSTD);
-private static final GammaCodedSequenceArrayColumn termPositionsColumn = new GammaCodedSequenceArrayColumn("termPositions", StorageType.ZSTD);
+private static final VarintCodedSequenceArrayColumn termPositionsColumn = new VarintCodedSequenceArrayColumn("termPositions", StorageType.ZSTD);
 
 // Spans columns
 
 private static final ByteArrayColumn spanCodesColumn = new ByteArrayColumn("spanCodes", StorageType.ZSTD);
-private static final GammaCodedSequenceArrayColumn spansColumn = new GammaCodedSequenceArrayColumn("spans", StorageType.ZSTD);
+private static final VarintCodedSequenceArrayColumn spansColumn = new VarintCodedSequenceArrayColumn("spans", StorageType.ZSTD);
 
 public static class KeywordsProjectionReader extends SlopTable {
 private final TxtStringColumn.Reader domainsReader;

@@ -143,10 +143,10 @@ public record SlopDocumentRecord(
 
 private final ObjectArrayColumn<String>.Reader keywordsReader;
 private final ByteArrayColumn.Reader termMetaReader;
-private final GammaCodedSequenceArrayColumn.Reader termPositionsReader;
+private final VarintCodedSequenceArrayColumn.Reader termPositionsReader;
 
 private final ByteArrayColumn.Reader spanCodesReader;
-private final GammaCodedSequenceArrayColumn.Reader spansReader;
+private final VarintCodedSequenceArrayColumn.Reader spansReader;
 
 public KeywordsProjectionReader(SlopTable.Ref<SlopDocumentRecord> pageRef) throws IOException {
 super(pageRef);

@@ -177,10 +177,10 @@ public record SlopDocumentRecord(
 int length = lengthsReader.get();
 
 List<String> words = keywordsReader.get();
-List<GammaCodedSequence> positions = termPositionsReader.get();
+List<VarintCodedSequence> positions = termPositionsReader.get();
 byte[] metas = termMetaReader.get();
 byte[] spanCodes = spanCodesReader.get();
-List<GammaCodedSequence> spans = spansReader.get();
+List<VarintCodedSequence> spans = spansReader.get();
 
 return new KeywordsProjection(
 domain,

@@ -272,9 +272,9 @@ public record SlopDocumentRecord(
 private final IntColumn.Writer pubYearWriter;
 private final ObjectArrayColumn<String>.Writer keywordsWriter;
 private final ByteArrayColumn.Writer termMetaWriter;
-private final GammaCodedSequenceArrayColumn.Writer termPositionsWriter;
+private final VarintCodedSequenceArrayColumn.Writer termPositionsWriter;
 private final ByteArrayColumn.Writer spansCodesWriter;
-private final GammaCodedSequenceArrayColumn.Writer spansWriter;
+private final VarintCodedSequenceArrayColumn.Writer spansWriter;
 
 public Writer(Path baseDir, int page) throws IOException {
 super(baseDir, page);

@@ -1,6 +1,6 @@
 package nu.marginalia.model.processed;
 
-import nu.marginalia.sequence.GammaCodedSequence;
+import nu.marginalia.sequence.VarintCodedSequence;
 import nu.marginalia.slop.SlopTable;
 import nu.marginalia.test.TestUtil;
 import org.junit.jupiter.api.AfterEach;

@@ -46,9 +46,9 @@ public class SlopDocumentRecordTest {
 null,
 List.of("test1", "test2"),
 new byte[] { 2, 3},
-List.of(GammaCodedSequence.generate(workArea, 1, 3, 5), GammaCodedSequence.generate(workArea, 2, 4, 6)),
+List.of(VarintCodedSequence.generate(1, 3, 5), VarintCodedSequence.generate(2, 4, 6)),
 new byte[] { 'a', 'b' },
-List.of(GammaCodedSequence.generate(workArea, 2, 3, 5), GammaCodedSequence.generate(workArea, 3, 4, 6))
+List.of(VarintCodedSequence.generate(2, 3, 5), VarintCodedSequence.generate(3, 4, 6))
 );
 
 try (var writer = new SlopDocumentRecord.Writer(testDir, 0)) {