mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(index, EXPERIMENTAL) Evaluate using Varint instead of GCS for position data
This commit is contained in:
parent
30bf845c81
commit
abab5bdc8a
@ -1,6 +1,6 @@
|
|||||||
package nu.marginalia.model.idx;
|
package nu.marginalia.model.idx;
|
||||||
|
|
||||||
import nu.marginalia.sequence.GammaCodedSequence;
|
import nu.marginalia.sequence.VarintCodedSequence;
|
||||||
|
|
||||||
public record CodedWordSpan(byte code, GammaCodedSequence spans) {
|
public record CodedWordSpan(byte code, VarintCodedSequence spans) {
|
||||||
}
|
}
|
||||||
|
@ -120,7 +120,6 @@ public class ForwardIndexConverter {
|
|||||||
for (int i = 0; i < spansCodes.length; i++) {
|
for (int i = 0; i < spansCodes.length; i++) {
|
||||||
spansWriter.writeSpan(spansCodes[i], spans.get(i));
|
spansWriter.writeSpan(spansCodes[i], spans.get(i));
|
||||||
}
|
}
|
||||||
|
|
||||||
long encodedSpansOffset = spansWriter.endRecord();
|
long encodedSpansOffset = spansWriter.endRecord();
|
||||||
|
|
||||||
|
|
||||||
|
@ -6,6 +6,7 @@ import nu.marginalia.sequence.CodedSequence;
|
|||||||
|
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
|
||||||
|
/** A list of the interlaced start and end positions of each span in the document of this type */
|
||||||
public class DocumentSpan {
|
public class DocumentSpan {
|
||||||
|
|
||||||
/** A list of the interlaced start and end positions of each span in the document of this type */
|
/** A list of the interlaced start and end positions of each span in the document of this type */
|
||||||
@ -19,6 +20,7 @@ public class DocumentSpan {
|
|||||||
this.startsEnds = null;
|
this.startsEnds = null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Counts the number of intersections between the spans in the document of this type and the given list of positions */
|
||||||
public int countIntersections(int[] positions) {
|
public int countIntersections(int[] positions) {
|
||||||
if (null == startsEnds || startsEnds.isEmpty() || positions.length == 0) {
|
if (null == startsEnds || startsEnds.isEmpty() || positions.length == 0) {
|
||||||
return 0;
|
return 0;
|
||||||
@ -26,37 +28,39 @@ public class DocumentSpan {
|
|||||||
|
|
||||||
int cnt = 0;
|
int cnt = 0;
|
||||||
|
|
||||||
if (positions.length < 8) {
|
if (positions.length < 8) { // for small arrays we can do a linear search
|
||||||
int seis = 0;
|
int seis = 0;
|
||||||
|
|
||||||
for (int pi = 0; pi < positions.length; pi++) {
|
for (int pi = 0; pi < positions.length; pi++) {
|
||||||
int position = positions[pi];
|
int position = positions[pi];
|
||||||
|
|
||||||
|
// search through the spans until we find an item that is greater than the given position
|
||||||
for (int sei = seis; sei < startsEnds.size(); sei ++) {
|
for (int sei = seis; sei < startsEnds.size(); sei ++) {
|
||||||
if (startsEnds.getInt(sei) > position) {
|
if (startsEnds.getInt(sei) > position) {
|
||||||
cnt += sei % 2;
|
cnt += sei % 2; // if sei is odd, we are between a start and end position in the spans list
|
||||||
seis = Math.max(seis, sei - 1);
|
seis = Math.max(seis, sei - 1);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else {
|
else { // for large arrays we use a binary search
|
||||||
int ss = 0;
|
int searchStart = 0;
|
||||||
|
|
||||||
for (int sei = 0; sei < startsEnds.size() && ss < positions.length; ) {
|
for (int sei = 0; sei < startsEnds.size() && searchStart < positions.length; ) {
|
||||||
int start = startsEnds.getInt(sei++);
|
int start = startsEnds.getInt(sei++);
|
||||||
int end = startsEnds.getInt(sei++);
|
int end = startsEnds.getInt(sei++);
|
||||||
|
|
||||||
int i = Arrays.binarySearch(positions, ss, positions.length, start);
|
// find the first position that is greater or equal to the start position
|
||||||
if (i < 0) {
|
int i = Arrays.binarySearch(positions, searchStart, positions.length, start);
|
||||||
i = -i - 1;
|
if (i < 0) i = -i - 1; // if the position is not found, we get the insertion point
|
||||||
}
|
|
||||||
|
// ... from that point, count the number of positions that are smaller than the end position
|
||||||
while (i < positions.length && positions[i] < end) {
|
while (i < positions.length && positions[i] < end) {
|
||||||
cnt++;
|
cnt++;
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
ss = i;
|
searchStart = i;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -83,6 +87,8 @@ public class DocumentSpan {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Returns true if for any position in the list, there exists a range
|
||||||
|
* (position[i], position[i]+len] that is overlapped by a span */
|
||||||
public boolean containsRange(IntList positions, int len) {
|
public boolean containsRange(IntList positions, int len) {
|
||||||
if (null == startsEnds || startsEnds.size() < 2 || positions.isEmpty()) {
|
if (null == startsEnds || startsEnds.size() < 2 || positions.isEmpty()) {
|
||||||
return false;
|
return false;
|
||||||
|
@ -3,6 +3,10 @@ package nu.marginalia.index.forward.spans;
|
|||||||
import nu.marginalia.language.sentence.tag.HtmlTag;
|
import nu.marginalia.language.sentence.tag.HtmlTag;
|
||||||
import nu.marginalia.sequence.CodedSequence;
|
import nu.marginalia.sequence.CodedSequence;
|
||||||
|
|
||||||
|
/** All spans associated with a document
|
||||||
|
* <p></p>
|
||||||
|
* A span is a list of document positions that are associated with a particular tag in the document.
|
||||||
|
* */
|
||||||
public class DocumentSpans {
|
public class DocumentSpans {
|
||||||
private static final DocumentSpan EMPTY_SPAN = new DocumentSpan();
|
private static final DocumentSpan EMPTY_SPAN = new DocumentSpan();
|
||||||
|
|
||||||
|
@ -1,9 +1,10 @@
|
|||||||
package nu.marginalia.index.forward.spans;
|
package nu.marginalia.index.forward.spans;
|
||||||
|
|
||||||
import nu.marginalia.sequence.GammaCodedSequence;
|
import nu.marginalia.sequence.VarintCodedSequence;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.lang.foreign.Arena;
|
import java.lang.foreign.Arena;
|
||||||
|
import java.nio.ByteBuffer;
|
||||||
import java.nio.channels.FileChannel;
|
import java.nio.channels.FileChannel;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
@ -18,9 +19,11 @@ public class ForwardIndexSpansReader implements AutoCloseable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public DocumentSpans readSpans(Arena arena, long encodedOffset) throws IOException {
|
public DocumentSpans readSpans(Arena arena, long encodedOffset) throws IOException {
|
||||||
|
// Decode the size and offset from the encoded offset
|
||||||
long size = SpansCodec.decodeSize(encodedOffset);
|
long size = SpansCodec.decodeSize(encodedOffset);
|
||||||
long offset = SpansCodec.decodeStartOffset(encodedOffset);
|
long offset = SpansCodec.decodeStartOffset(encodedOffset);
|
||||||
|
|
||||||
|
// Allocate a buffer from the arena
|
||||||
var buffer = arena.allocate(size).asByteBuffer();
|
var buffer = arena.allocate(size).asByteBuffer();
|
||||||
buffer.clear();
|
buffer.clear();
|
||||||
while (buffer.hasRemaining()) {
|
while (buffer.hasRemaining()) {
|
||||||
@ -28,15 +31,18 @@ public class ForwardIndexSpansReader implements AutoCloseable {
|
|||||||
}
|
}
|
||||||
buffer.flip();
|
buffer.flip();
|
||||||
|
|
||||||
|
// Read the number of spans in the document
|
||||||
int count = buffer.get();
|
int count = buffer.get();
|
||||||
|
|
||||||
DocumentSpans ret = new DocumentSpans();
|
DocumentSpans ret = new DocumentSpans();
|
||||||
|
|
||||||
|
// Decode each span
|
||||||
while (count-- > 0) {
|
while (count-- > 0) {
|
||||||
byte code = buffer.get();
|
byte code = buffer.get();
|
||||||
short len = buffer.getShort();
|
short len = buffer.getShort();
|
||||||
|
|
||||||
ret.accept(code, new GammaCodedSequence(buffer.slice(buffer.position(), len)));
|
ByteBuffer data = buffer.slice(buffer.position(), len);
|
||||||
|
ret.accept(code, new VarintCodedSequence(data));
|
||||||
|
|
||||||
// Reset the buffer position to the end of the span
|
// Reset the buffer position to the end of the span
|
||||||
buffer.position(buffer.position() + len);
|
buffer.position(buffer.position() + len);
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
package nu.marginalia.index.journal;
|
package nu.marginalia.index.journal;
|
||||||
|
|
||||||
import nu.marginalia.sequence.slop.GammaCodedSequenceArrayColumn;
|
import nu.marginalia.sequence.slop.VarintCodedSequenceArrayColumn;
|
||||||
import nu.marginalia.slop.SlopTable;
|
import nu.marginalia.slop.SlopTable;
|
||||||
import nu.marginalia.slop.column.array.ByteArrayColumn;
|
import nu.marginalia.slop.column.array.ByteArrayColumn;
|
||||||
import nu.marginalia.slop.column.array.LongArrayColumn;
|
import nu.marginalia.slop.column.array.LongArrayColumn;
|
||||||
@ -19,10 +19,10 @@ public record IndexJournalPage(Path baseDir, int page) {
|
|||||||
|
|
||||||
public static LongArrayColumn termIds = new LongArrayColumn("termIds", StorageType.ZSTD);
|
public static LongArrayColumn termIds = new LongArrayColumn("termIds", StorageType.ZSTD);
|
||||||
public static ByteArrayColumn termMeta = new ByteArrayColumn("termMetadata", StorageType.ZSTD);
|
public static ByteArrayColumn termMeta = new ByteArrayColumn("termMetadata", StorageType.ZSTD);
|
||||||
public static GammaCodedSequenceArrayColumn positions = new GammaCodedSequenceArrayColumn("termPositions", StorageType.ZSTD);
|
public static VarintCodedSequenceArrayColumn positions = new VarintCodedSequenceArrayColumn("termPositions", StorageType.ZSTD);
|
||||||
|
|
||||||
public static ByteArrayColumn spanCodes = new ByteArrayColumn("spanCodes", StorageType.ZSTD);
|
public static ByteArrayColumn spanCodes = new ByteArrayColumn("spanCodes", StorageType.ZSTD);
|
||||||
public static GammaCodedSequenceArrayColumn spans = new GammaCodedSequenceArrayColumn("spans", StorageType.ZSTD);
|
public static VarintCodedSequenceArrayColumn spans = new VarintCodedSequenceArrayColumn("spans", StorageType.ZSTD);
|
||||||
|
|
||||||
public IndexJournalPage {
|
public IndexJournalPage {
|
||||||
if (!baseDir.toFile().isDirectory()) {
|
if (!baseDir.toFile().isDirectory()) {
|
||||||
@ -55,11 +55,11 @@ public record IndexJournalPage(Path baseDir, int page) {
|
|||||||
return termMeta.open(table);
|
return termMeta.open(table);
|
||||||
}
|
}
|
||||||
|
|
||||||
public GammaCodedSequenceArrayColumn.Reader openTermPositions(SlopTable table) throws IOException {
|
public VarintCodedSequenceArrayColumn.Reader openTermPositions(SlopTable table) throws IOException {
|
||||||
return positions.open(table);
|
return positions.open(table);
|
||||||
}
|
}
|
||||||
|
|
||||||
public GammaCodedSequenceArrayColumn.Reader openSpans(SlopTable table) throws IOException {
|
public VarintCodedSequenceArrayColumn.Reader openSpans(SlopTable table) throws IOException {
|
||||||
return spans.open(table);
|
return spans.open(table);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3,7 +3,7 @@ package nu.marginalia.index.journal;
|
|||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import nu.marginalia.hash.MurmurHash3_128;
|
import nu.marginalia.hash.MurmurHash3_128;
|
||||||
import nu.marginalia.model.processed.SlopDocumentRecord;
|
import nu.marginalia.model.processed.SlopDocumentRecord;
|
||||||
import nu.marginalia.sequence.slop.GammaCodedSequenceArrayColumn;
|
import nu.marginalia.sequence.slop.VarintCodedSequenceArrayColumn;
|
||||||
import nu.marginalia.slop.SlopTable;
|
import nu.marginalia.slop.SlopTable;
|
||||||
import nu.marginalia.slop.column.array.ByteArrayColumn;
|
import nu.marginalia.slop.column.array.ByteArrayColumn;
|
||||||
import nu.marginalia.slop.column.array.LongArrayColumn;
|
import nu.marginalia.slop.column.array.LongArrayColumn;
|
||||||
@ -24,9 +24,9 @@ public class IndexJournalSlopWriter extends SlopTable {
|
|||||||
|
|
||||||
private final LongArrayColumn.Writer termIdsWriter;
|
private final LongArrayColumn.Writer termIdsWriter;
|
||||||
private final ByteArrayColumn.Writer termMetadataWriter;
|
private final ByteArrayColumn.Writer termMetadataWriter;
|
||||||
private final GammaCodedSequenceArrayColumn.Writer termPositionsWriter;
|
private final VarintCodedSequenceArrayColumn.Writer termPositionsWriter;
|
||||||
|
|
||||||
private final GammaCodedSequenceArrayColumn.Writer spansWriter;
|
private final VarintCodedSequenceArrayColumn.Writer spansWriter;
|
||||||
private final ByteArrayColumn.Writer spanCodesWriter;
|
private final ByteArrayColumn.Writer spanCodesWriter;
|
||||||
|
|
||||||
private static final MurmurHash3_128 hash = new MurmurHash3_128();
|
private static final MurmurHash3_128 hash = new MurmurHash3_128();
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
package nu.marginalia.index.positions;
|
package nu.marginalia.index.positions;
|
||||||
|
|
||||||
import nu.marginalia.sequence.CodedSequence;
|
import nu.marginalia.sequence.CodedSequence;
|
||||||
import nu.marginalia.sequence.GammaCodedSequence;
|
import nu.marginalia.sequence.VarintCodedSequence;
|
||||||
|
|
||||||
import java.nio.ByteBuffer;
|
import java.nio.ByteBuffer;
|
||||||
|
|
||||||
@ -17,6 +17,6 @@ public class TermData {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public CodedSequence positions() {
|
public CodedSequence positions() {
|
||||||
return new GammaCodedSequence(buffer, 1, buffer.capacity());
|
return new VarintCodedSequence(buffer, 1, buffer.capacity());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -3,11 +3,10 @@ package nu.marginalia.index.construction.full;
|
|||||||
import nu.marginalia.index.journal.IndexJournalPage;
|
import nu.marginalia.index.journal.IndexJournalPage;
|
||||||
import nu.marginalia.index.journal.IndexJournalSlopWriter;
|
import nu.marginalia.index.journal.IndexJournalSlopWriter;
|
||||||
import nu.marginalia.model.processed.SlopDocumentRecord;
|
import nu.marginalia.model.processed.SlopDocumentRecord;
|
||||||
import nu.marginalia.sequence.GammaCodedSequence;
|
import nu.marginalia.sequence.VarintCodedSequence;
|
||||||
import nu.marginalia.test.TestUtil;
|
import nu.marginalia.test.TestUtil;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.ByteBuffer;
|
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
@ -46,14 +45,14 @@ public class TestJournalFactory {
|
|||||||
'}';
|
'}';
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
public record WordWithMeta(String wordId, byte meta, GammaCodedSequence gcs) {
|
public record WordWithMeta(String wordId, byte meta, VarintCodedSequence gcs) {
|
||||||
public WordWithMeta(long wordId, byte meta, GammaCodedSequence gcs) {
|
public WordWithMeta(long wordId, byte meta, VarintCodedSequence gcs) {
|
||||||
this(String.valueOf(wordId), meta, gcs);
|
this(String.valueOf(wordId), meta, gcs);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public static WordWithMeta wm(long wordId, int meta, int... positions) {
|
public static WordWithMeta wm(long wordId, int meta, int... positions) {
|
||||||
return new WordWithMeta(wordId, (byte) meta, GammaCodedSequence.generate(ByteBuffer.allocate(128), positions));
|
return new WordWithMeta(wordId, (byte) meta, VarintCodedSequence.generate(positions));
|
||||||
}
|
}
|
||||||
|
|
||||||
public IndexJournalPage createReader(EntryData... entries) throws IOException {
|
public IndexJournalPage createReader(EntryData... entries) throws IOException {
|
||||||
@ -64,11 +63,11 @@ public class TestJournalFactory {
|
|||||||
String[] termIds = new String[entry.wordIds.length];
|
String[] termIds = new String[entry.wordIds.length];
|
||||||
byte[] meta = new byte[entry.wordIds.length];
|
byte[] meta = new byte[entry.wordIds.length];
|
||||||
|
|
||||||
GammaCodedSequence[] positions = new GammaCodedSequence[entry.wordIds.length];
|
VarintCodedSequence[] positions = new VarintCodedSequence[entry.wordIds.length];
|
||||||
for (int i = 0; i < entry.wordIds.length; i++) {
|
for (int i = 0; i < entry.wordIds.length; i++) {
|
||||||
termIds[i] = entry.wordIds[i];
|
termIds[i] = entry.wordIds[i];
|
||||||
meta[i] = 0;
|
meta[i] = 0;
|
||||||
positions[i] = new GammaCodedSequence(new byte[1]);
|
positions[i] = VarintCodedSequence.generate();
|
||||||
}
|
}
|
||||||
|
|
||||||
writer.put(
|
writer.put(
|
||||||
@ -100,11 +99,11 @@ public class TestJournalFactory {
|
|||||||
|
|
||||||
String[] termIds = new String[entry.wordIds.length];
|
String[] termIds = new String[entry.wordIds.length];
|
||||||
byte[] meta = new byte[entry.wordIds.length];
|
byte[] meta = new byte[entry.wordIds.length];
|
||||||
GammaCodedSequence[] positions = new GammaCodedSequence[entry.wordIds.length];
|
VarintCodedSequence[] positions = new VarintCodedSequence[entry.wordIds.length];
|
||||||
for (int i = 0; i < entry.wordIds.length; i++) {
|
for (int i = 0; i < entry.wordIds.length; i++) {
|
||||||
termIds[i] = entry.wordIds[i].wordId;
|
termIds[i] = entry.wordIds[i].wordId;
|
||||||
meta[i] = entry.wordIds[i].meta;
|
meta[i] = entry.wordIds[i].meta;
|
||||||
positions[i] = Objects.requireNonNullElseGet(entry.wordIds[i].gcs, () -> new GammaCodedSequence(new byte[1]));
|
positions[i] = Objects.requireNonNullElseGet(entry.wordIds[i].gcs, VarintCodedSequence::generate);
|
||||||
}
|
}
|
||||||
|
|
||||||
writer.put(
|
writer.put(
|
||||||
|
@ -28,7 +28,7 @@ import nu.marginalia.model.idx.WordFlags;
|
|||||||
import nu.marginalia.model.processed.SlopDocumentRecord;
|
import nu.marginalia.model.processed.SlopDocumentRecord;
|
||||||
import nu.marginalia.process.control.FakeProcessHeartbeat;
|
import nu.marginalia.process.control.FakeProcessHeartbeat;
|
||||||
import nu.marginalia.process.control.ProcessHeartbeat;
|
import nu.marginalia.process.control.ProcessHeartbeat;
|
||||||
import nu.marginalia.sequence.GammaCodedSequence;
|
import nu.marginalia.sequence.VarintCodedSequence;
|
||||||
import nu.marginalia.service.server.Initialization;
|
import nu.marginalia.service.server.Initialization;
|
||||||
import nu.marginalia.storage.FileStorageService;
|
import nu.marginalia.storage.FileStorageService;
|
||||||
import org.junit.jupiter.api.AfterEach;
|
import org.junit.jupiter.api.AfterEach;
|
||||||
@ -39,7 +39,6 @@ import org.junit.jupiter.api.parallel.Execution;
|
|||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.lang.foreign.Arena;
|
import java.lang.foreign.Arena;
|
||||||
import java.net.URISyntaxException;
|
import java.net.URISyntaxException;
|
||||||
import java.nio.ByteBuffer;
|
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.sql.SQLException;
|
import java.sql.SQLException;
|
||||||
@ -321,7 +320,7 @@ public class CombinedIndexReaderTest {
|
|||||||
for (int i = 0; i < words.size(); i++) {
|
for (int i = 0; i < words.size(); i++) {
|
||||||
metadata[i] = words.get(i).termMetadata;
|
metadata[i] = words.get(i).termMetadata;
|
||||||
}
|
}
|
||||||
var positions = words.stream().map(w -> w.positions).map(pos -> GammaCodedSequence.generate(ByteBuffer.allocate(1024), pos.toIntArray())).toList();
|
var positions = words.stream().map(w -> w.positions).map(pos -> VarintCodedSequence.generate(pos.toIntArray())).toList();
|
||||||
|
|
||||||
indexJournalWriter.put(doc,
|
indexJournalWriter.put(doc,
|
||||||
new SlopDocumentRecord.KeywordsProjection(
|
new SlopDocumentRecord.KeywordsProjection(
|
||||||
|
@ -31,7 +31,7 @@ import nu.marginalia.model.idx.WordFlags;
|
|||||||
import nu.marginalia.model.processed.SlopDocumentRecord;
|
import nu.marginalia.model.processed.SlopDocumentRecord;
|
||||||
import nu.marginalia.process.control.FakeProcessHeartbeat;
|
import nu.marginalia.process.control.FakeProcessHeartbeat;
|
||||||
import nu.marginalia.process.control.ProcessHeartbeat;
|
import nu.marginalia.process.control.ProcessHeartbeat;
|
||||||
import nu.marginalia.sequence.GammaCodedSequence;
|
import nu.marginalia.sequence.VarintCodedSequence;
|
||||||
import nu.marginalia.service.control.ServiceHeartbeat;
|
import nu.marginalia.service.control.ServiceHeartbeat;
|
||||||
import nu.marginalia.service.server.Initialization;
|
import nu.marginalia.service.server.Initialization;
|
||||||
import nu.marginalia.storage.FileStorageService;
|
import nu.marginalia.storage.FileStorageService;
|
||||||
@ -377,11 +377,11 @@ public class IndexQueryServiceIntegrationSmokeTest {
|
|||||||
metadata[i] = WordFlags.Title.asBit();
|
metadata[i] = WordFlags.Title.asBit();
|
||||||
}
|
}
|
||||||
|
|
||||||
List<GammaCodedSequence> positions = new ArrayList<>();
|
List<VarintCodedSequence> positions = new ArrayList<>();
|
||||||
|
|
||||||
ByteBuffer wa = ByteBuffer.allocate(32);
|
ByteBuffer wa = ByteBuffer.allocate(32);
|
||||||
for (int i = 0; i < factors.length; i++) {
|
for (int i = 0; i < factors.length; i++) {
|
||||||
positions.add(GammaCodedSequence.generate(wa, factors));
|
positions.add(VarintCodedSequence.generate(factors));
|
||||||
}
|
}
|
||||||
|
|
||||||
indexJournalWriter.put(fullId,
|
indexJournalWriter.put(fullId,
|
||||||
@ -417,11 +417,11 @@ public class IndexQueryServiceIntegrationSmokeTest {
|
|||||||
metadata[i] = WordFlags.Title.asBit();
|
metadata[i] = WordFlags.Title.asBit();
|
||||||
}
|
}
|
||||||
|
|
||||||
List<GammaCodedSequence> positions = new ArrayList<>();
|
List<VarintCodedSequence> positions = new ArrayList<>();
|
||||||
|
|
||||||
ByteBuffer wa = ByteBuffer.allocate(32);
|
ByteBuffer wa = ByteBuffer.allocate(32);
|
||||||
for (int i = 0; i < factors.length; i++) {
|
for (int i = 0; i < factors.length; i++) {
|
||||||
positions.add(GammaCodedSequence.generate(wa, i + 1));
|
positions.add(VarintCodedSequence.generate(i + 1));
|
||||||
}
|
}
|
||||||
|
|
||||||
indexJournalWriter.put(fullId,
|
indexJournalWriter.put(fullId,
|
||||||
|
@ -33,7 +33,7 @@ import nu.marginalia.model.idx.WordFlags;
|
|||||||
import nu.marginalia.model.processed.SlopDocumentRecord;
|
import nu.marginalia.model.processed.SlopDocumentRecord;
|
||||||
import nu.marginalia.process.control.FakeProcessHeartbeat;
|
import nu.marginalia.process.control.FakeProcessHeartbeat;
|
||||||
import nu.marginalia.process.control.ProcessHeartbeat;
|
import nu.marginalia.process.control.ProcessHeartbeat;
|
||||||
import nu.marginalia.sequence.GammaCodedSequence;
|
import nu.marginalia.sequence.VarintCodedSequence;
|
||||||
import nu.marginalia.service.control.ServiceHeartbeat;
|
import nu.marginalia.service.control.ServiceHeartbeat;
|
||||||
import nu.marginalia.service.server.Initialization;
|
import nu.marginalia.service.server.Initialization;
|
||||||
import nu.marginalia.storage.FileStorageService;
|
import nu.marginalia.storage.FileStorageService;
|
||||||
@ -46,7 +46,6 @@ import org.junit.jupiter.api.parallel.Execution;
|
|||||||
import javax.annotation.CheckReturnValue;
|
import javax.annotation.CheckReturnValue;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.net.URISyntaxException;
|
import java.net.URISyntaxException;
|
||||||
import java.nio.ByteBuffer;
|
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.sql.SQLException;
|
import java.sql.SQLException;
|
||||||
@ -544,10 +543,9 @@ public class IndexQueryServiceIntegrationTest {
|
|||||||
metadata[i] = (byte) words.get(i).termMetadata;
|
metadata[i] = (byte) words.get(i).termMetadata;
|
||||||
}
|
}
|
||||||
|
|
||||||
List<GammaCodedSequence> positions = new ArrayList<>();
|
List<VarintCodedSequence> positions = new ArrayList<>();
|
||||||
ByteBuffer workBuffer = ByteBuffer.allocate(8192);
|
|
||||||
for (int i = 0; i < words.size(); i++) {
|
for (int i = 0; i < words.size(); i++) {
|
||||||
positions.add(GammaCodedSequence.generate(workBuffer, words.get(i).positions));
|
positions.add(VarintCodedSequence.generate(words.get(i).positions));
|
||||||
}
|
}
|
||||||
|
|
||||||
indexJournalWriter.put(doc,
|
indexJournalWriter.put(doc,
|
||||||
|
@ -20,6 +20,13 @@ public class VarintCodedSequence implements CodedSequence {
|
|||||||
this.startLimit = buffer.limit();
|
this.startLimit = buffer.limit();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Creates a sequence backed by the given buffer with an explicit start position and limit.
 * The buffer reference is stored as-is (it is not copied here).
 *
 * @param buffer     buffer holding the encoded data
 * @param startPos   value stored as this sequence's start position
 * @param startLimit value stored as this sequence's start limit
 */
public VarintCodedSequence(ByteBuffer buffer, int startPos, int startLimit) {
    this.startPos = startPos;
    this.startLimit = startLimit;
    this.raw = buffer;
}
|
||||||
|
|
||||||
private static int requiredBufferSize(int[] values) {
|
private static int requiredBufferSize(int[] values) {
|
||||||
int prev = 0;
|
int prev = 0;
|
||||||
int size = 0;
|
int size = 0;
|
||||||
@ -32,11 +39,47 @@ public class VarintCodedSequence implements CodedSequence {
|
|||||||
return size + varintSize(size + 1);
|
return size + varintSize(size + 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static int requiredBufferSize(IntList values) {
|
||||||
|
int prev = 0;
|
||||||
|
int size = 0;
|
||||||
|
|
||||||
|
for (int i = 0; i < values.size(); i++) {
|
||||||
|
int value = values.getInt(i);
|
||||||
|
size += varintSize(value - prev);
|
||||||
|
prev = value;
|
||||||
|
}
|
||||||
|
|
||||||
|
return size + varintSize(size + 1);
|
||||||
|
}
|
||||||
|
|
||||||
private static int varintSize(int value) {
|
private static int varintSize(int value) {
|
||||||
int bits = 32 - Integer.numberOfLeadingZeros(value);
|
int bits = 32 - Integer.numberOfLeadingZeros(value);
|
||||||
return (bits + 6) / 7;
|
return (bits + 6) / 7;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static VarintCodedSequence generate(IntList values) {
|
||||||
|
int bufferSize = requiredBufferSize(values);
|
||||||
|
ByteBuffer buffer = ByteBuffer.allocate(bufferSize);
|
||||||
|
|
||||||
|
int prev = 0;
|
||||||
|
|
||||||
|
encodeValue(buffer, values.size() + 1);
|
||||||
|
|
||||||
|
for (int i = 0; i < values.size(); i++) {
|
||||||
|
int value = values.getInt(i);
|
||||||
|
int toEncode = value - prev;
|
||||||
|
assert toEncode > 0 : "Values must be strictly increasing";
|
||||||
|
|
||||||
|
encodeValue(buffer, toEncode);
|
||||||
|
|
||||||
|
prev = value;
|
||||||
|
}
|
||||||
|
|
||||||
|
buffer.flip();
|
||||||
|
|
||||||
|
return new VarintCodedSequence(buffer);
|
||||||
|
}
|
||||||
|
|
||||||
public static VarintCodedSequence generate(int... values) {
|
public static VarintCodedSequence generate(int... values) {
|
||||||
int bufferSize = requiredBufferSize(values);
|
int bufferSize = requiredBufferSize(values);
|
||||||
ByteBuffer buffer = ByteBuffer.allocate(bufferSize);
|
ByteBuffer buffer = ByteBuffer.allocate(bufferSize);
|
||||||
@ -60,20 +103,23 @@ public class VarintCodedSequence implements CodedSequence {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private static void encodeValue(ByteBuffer buffer, int value) {
|
private static void encodeValue(ByteBuffer buffer, int value) {
|
||||||
if (value < 0x80) {
|
if (value < (1<<7)) {
|
||||||
buffer.put((byte) value);
|
buffer.put((byte) value);
|
||||||
}
|
}
|
||||||
else if (value < 0x4_000) {
|
else if (value < (1<<14)) {
|
||||||
buffer.put((byte) (value >>> (7) | 0x80));
|
buffer.put((byte) (value >>> (7) | 0x80));
|
||||||
buffer.put((byte) (value & 0x7F));
|
buffer.put((byte) (value & 0x7F));
|
||||||
}
|
}
|
||||||
else if (value < 0x20_0000) {
|
else if (value < (1<<21)) {
|
||||||
buffer.put((byte) (value >>> (14) | 0x80));
|
buffer.put((byte) (value >>> (14) | 0x80));
|
||||||
buffer.put((byte) (value >>> (7) | 0x80));
|
buffer.put((byte) (value >>> (7) | 0x80));
|
||||||
buffer.put((byte) (value & 0x7F));
|
buffer.put((byte) (value & 0x7F));
|
||||||
}
|
}
|
||||||
else if (value < 0x1000_0000) {
|
else if (value < (1<<28)) {
|
||||||
buffer.putInt(Integer.expand(value, 0x00808080) | 0x80808000);
|
buffer.put((byte) ((value >>> 21) | 0x80));
|
||||||
|
buffer.put((byte) ((value >>> 14) | 0x80));
|
||||||
|
buffer.put((byte) ((value >>> 7) | 0x80));
|
||||||
|
buffer.put((byte) (value & 0x7F));
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
throw new IllegalArgumentException("Value too large to encode");
|
throw new IllegalArgumentException("Value too large to encode");
|
||||||
@ -139,12 +185,13 @@ public class VarintCodedSequence implements CodedSequence {
|
|||||||
return b;
|
return b;
|
||||||
}
|
}
|
||||||
|
|
||||||
int value = b;
|
int value = b & 0x7F;
|
||||||
do {
|
do {
|
||||||
b = buffer.get();
|
b = buffer.get();
|
||||||
value = value << 7 | (b & 0x7F);
|
value = (value << 7) | (b & 0x7F);
|
||||||
} while ((b & 0x80) != 0);
|
} while ((b & 0x80) != 0);
|
||||||
|
|
||||||
|
|
||||||
return value;
|
return value;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -49,20 +49,22 @@ public class BitReader {
|
|||||||
|
|
||||||
/** Read the next width bits from the buffer */
|
/** Read the next width bits from the buffer */
|
||||||
public int get(int width) {
|
public int get(int width) {
|
||||||
if (width == 0) {
|
// Fast path for reading a full integer from the current value
|
||||||
return 0;
|
if (bitPosition >= width) {
|
||||||
|
// We have enough bits in the current value to satisfy the request
|
||||||
|
int result = (int)(currentValue >>> (bitPosition - width)) & ~-(1<<width);
|
||||||
|
// Update the bit position
|
||||||
|
bitPosition -= width;
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
assert width <= 32;
|
|
||||||
|
|
||||||
if (bitPosition <= 0) {
|
if (bitPosition <= 0) {
|
||||||
readNext();
|
readNext();
|
||||||
}
|
}
|
||||||
|
|
||||||
int result = 0;
|
int result = 0;
|
||||||
|
do {
|
||||||
while (width > 0) {
|
|
||||||
int dw = bitPosition - width;
|
int dw = bitPosition - width;
|
||||||
|
|
||||||
if (dw >= 0) { // We have enough bits in the current value to satisfy the request
|
if (dw >= 0) { // We have enough bits in the current value to satisfy the request
|
||||||
result |= ((int)(currentValue >>> dw)) & ~-(1<<width);
|
result |= ((int)(currentValue >>> dw)) & ~-(1<<width);
|
||||||
|
|
||||||
@ -85,6 +87,7 @@ public class BitReader {
|
|||||||
readNext(); // implicitly: bitPosition = 0 here
|
readNext(); // implicitly: bitPosition = 0 here
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
while (width > 0);
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,154 @@
|
|||||||
|
package nu.marginalia.sequence.slop;
|
||||||
|
|
||||||
|
import nu.marginalia.sequence.VarintCodedSequence;
|
||||||
|
import nu.marginalia.slop.column.AbstractColumn;
|
||||||
|
import nu.marginalia.slop.column.AbstractObjectColumn;
|
||||||
|
import nu.marginalia.slop.column.ObjectColumnReader;
|
||||||
|
import nu.marginalia.slop.column.ObjectColumnWriter;
|
||||||
|
import nu.marginalia.slop.column.dynamic.VarintColumn;
|
||||||
|
import nu.marginalia.slop.desc.ColumnFunction;
|
||||||
|
import nu.marginalia.slop.desc.StorageType;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.net.URI;
|
||||||
|
import java.nio.ByteBuffer;
|
||||||
|
import java.nio.ByteOrder;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/** Slop column extension for storing lists of VarintCodedSequence objects. */
|
||||||
|
public class VarintCodedSequenceArrayColumn extends AbstractObjectColumn<List<VarintCodedSequence>, VarintCodedSequenceArrayColumn.Reader, VarintCodedSequenceArrayColumn.Writer> {
|
||||||
|
|
||||||
|
private final VarintColumn groupsColumn;
|
||||||
|
private final VarintCodedSequenceColumn dataColumn;
|
||||||
|
|
||||||
|
/**
 * Creates the column with {@link StorageType#PLAIN} storage.
 *
 * @param name the column name
 */
public VarintCodedSequenceArrayColumn(String name) {
    this(name, StorageType.PLAIN);
}
|
||||||
|
|
||||||
|
public VarintCodedSequenceArrayColumn(String name, StorageType storageType) {
|
||||||
|
super(name,
|
||||||
|
"vcs[]",
|
||||||
|
ByteOrder.nativeOrder(),
|
||||||
|
ColumnFunction.DATA,
|
||||||
|
storageType);
|
||||||
|
|
||||||
|
groupsColumn = new VarintColumn(name, ColumnFunction.GROUP_LENGTH, storageType);
|
||||||
|
dataColumn = new VarintCodedSequenceColumn(name);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Writer createUnregistered(Path path, int page) throws IOException {
|
||||||
|
return new Writer(
|
||||||
|
dataColumn.createUnregistered(path, page),
|
||||||
|
groupsColumn.createUnregistered(path, page)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Reader openUnregistered(URI uri, int page) throws IOException {
|
||||||
|
return new Reader(
|
||||||
|
dataColumn.openUnregistered(uri, page),
|
||||||
|
groupsColumn.openUnregistered(uri, page)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public class Writer implements ObjectColumnWriter<List<VarintCodedSequence>> {
|
||||||
|
private final VarintColumn.Writer groupsWriter;
|
||||||
|
private final VarintCodedSequenceColumn.Writer dataWriter;
|
||||||
|
|
||||||
|
Writer(VarintCodedSequenceColumn.Writer dataWriter, VarintColumn.Writer groupsWriter)
|
||||||
|
{
|
||||||
|
this.groupsWriter = groupsWriter;
|
||||||
|
this.dataWriter = dataWriter;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public AbstractColumn<?, ?> columnDesc() {
|
||||||
|
return VarintCodedSequenceArrayColumn.this;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void put(List<VarintCodedSequence> sequences) throws IOException {
|
||||||
|
groupsWriter.put(sequences.size());
|
||||||
|
for (VarintCodedSequence sequence : sequences) {
|
||||||
|
dataWriter.put(sequence);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public long position() {
|
||||||
|
return groupsWriter.position();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void close() throws IOException {
|
||||||
|
dataWriter.close();
|
||||||
|
groupsWriter.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public class Reader implements ObjectColumnReader<List<VarintCodedSequence>> {
|
||||||
|
private final VarintCodedSequenceColumn.Reader dataReader;
|
||||||
|
private final VarintColumn.Reader groupsReader;
|
||||||
|
|
||||||
|
public Reader(VarintCodedSequenceColumn.Reader dataReader, VarintColumn.Reader groupsReader) {
|
||||||
|
this.dataReader = dataReader;
|
||||||
|
this.groupsReader = groupsReader;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public AbstractColumn<?, ?> columnDesc() {
|
||||||
|
return VarintCodedSequenceArrayColumn.this;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void skip(long positions) throws IOException {
|
||||||
|
int toSkip = 0;
|
||||||
|
for (int i = 0; i < positions; i++) {
|
||||||
|
toSkip += groupsReader.get();
|
||||||
|
}
|
||||||
|
dataReader.skip(toSkip);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean hasRemaining() throws IOException {
|
||||||
|
return groupsReader.hasRemaining();
|
||||||
|
}
|
||||||
|
|
||||||
|
public long position() throws IOException {
|
||||||
|
return groupsReader.position();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<VarintCodedSequence> get() throws IOException {
|
||||||
|
int count = groupsReader.get();
|
||||||
|
var ret = new ArrayList<VarintCodedSequence>(count);
|
||||||
|
|
||||||
|
for (int i = 0; i < count; i++) {
|
||||||
|
ret.add(dataReader.get());
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<ByteBuffer> getData(ByteBuffer workArea) throws IOException {
|
||||||
|
int count = groupsReader.get();
|
||||||
|
var ret = new ArrayList<ByteBuffer>(count);
|
||||||
|
|
||||||
|
for (int i = 0; i < count; i++) {
|
||||||
|
int start = workArea.position();
|
||||||
|
dataReader.getData(workArea);
|
||||||
|
var slice = workArea.slice(start, workArea.position() - start);
|
||||||
|
ret.add(slice);
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void close() throws IOException {
|
||||||
|
dataReader.close();
|
||||||
|
groupsReader.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,148 @@
|
|||||||
|
package nu.marginalia.sequence.slop;
|
||||||
|
|
||||||
|
import nu.marginalia.sequence.VarintCodedSequence;
|
||||||
|
import nu.marginalia.slop.column.AbstractColumn;
|
||||||
|
import nu.marginalia.slop.column.AbstractObjectColumn;
|
||||||
|
import nu.marginalia.slop.column.ObjectColumnReader;
|
||||||
|
import nu.marginalia.slop.column.ObjectColumnWriter;
|
||||||
|
import nu.marginalia.slop.column.dynamic.VarintColumn;
|
||||||
|
import nu.marginalia.slop.desc.ColumnFunction;
|
||||||
|
import nu.marginalia.slop.desc.StorageType;
|
||||||
|
import nu.marginalia.slop.storage.Storage;
|
||||||
|
import nu.marginalia.slop.storage.StorageReader;
|
||||||
|
import nu.marginalia.slop.storage.StorageWriter;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.net.URI;
|
||||||
|
import java.nio.ByteBuffer;
|
||||||
|
import java.nio.ByteOrder;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
|
||||||
|
/** Slop column extension for storing VarintCodedSequence objects. */
|
||||||
|
public class VarintCodedSequenceColumn extends AbstractObjectColumn<VarintCodedSequence, VarintCodedSequenceColumn.Reader, VarintCodedSequenceColumn.Writer> {
|
||||||
|
|
||||||
|
private final VarintColumn indexColumn;
|
||||||
|
|
||||||
|
public VarintCodedSequenceColumn(String name) {
|
||||||
|
this(name, StorageType.PLAIN);
|
||||||
|
}
|
||||||
|
|
||||||
|
public VarintCodedSequenceColumn(String name, StorageType storageType) {
|
||||||
|
super(name,
|
||||||
|
"vcs",
|
||||||
|
ByteOrder.nativeOrder(),
|
||||||
|
ColumnFunction.DATA,
|
||||||
|
storageType);
|
||||||
|
|
||||||
|
indexColumn = new VarintColumn(name, ColumnFunction.DATA_LEN, StorageType.PLAIN);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Writer createUnregistered(Path path, int page) throws IOException {
|
||||||
|
return new Writer(
|
||||||
|
Storage.writer(path, this, page),
|
||||||
|
indexColumn.createUnregistered(path, page)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Reader openUnregistered(URI uri, int page) throws IOException {
|
||||||
|
return new Reader(
|
||||||
|
Storage.reader(uri, this, page, false),
|
||||||
|
indexColumn.openUnregistered(uri, page)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
public class Writer implements ObjectColumnWriter<VarintCodedSequence> {
|
||||||
|
private final VarintColumn.Writer indexWriter;
|
||||||
|
private final StorageWriter storage;
|
||||||
|
|
||||||
|
public Writer(StorageWriter storage,
|
||||||
|
VarintColumn.Writer indexWriter)
|
||||||
|
{
|
||||||
|
this.storage = storage;
|
||||||
|
|
||||||
|
this.indexWriter = indexWriter;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public AbstractColumn<?, ?> columnDesc() {
|
||||||
|
return VarintCodedSequenceColumn.this;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void put(VarintCodedSequence sequence) throws IOException {
|
||||||
|
var buffer = sequence.buffer();
|
||||||
|
int length = buffer.remaining();
|
||||||
|
|
||||||
|
indexWriter.put(length);
|
||||||
|
storage.putBytes(buffer);
|
||||||
|
}
|
||||||
|
|
||||||
|
public long position() {
|
||||||
|
return indexWriter.position();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void close() throws IOException {
|
||||||
|
indexWriter.close();
|
||||||
|
storage.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public class Reader implements ObjectColumnReader<VarintCodedSequence> {
|
||||||
|
private final VarintColumn.Reader indexReader;
|
||||||
|
private final StorageReader storage;
|
||||||
|
|
||||||
|
Reader(StorageReader reader, VarintColumn.Reader indexReader) throws IOException {
|
||||||
|
this.storage = reader;
|
||||||
|
this.indexReader = indexReader;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public AbstractColumn<?, ?> columnDesc() {
|
||||||
|
return VarintCodedSequenceColumn.this;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void skip(long positions) throws IOException {
|
||||||
|
for (int i = 0; i < positions; i++) {
|
||||||
|
int size = indexReader.get();
|
||||||
|
storage.skip(size, 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean hasRemaining() throws IOException {
|
||||||
|
return indexReader.hasRemaining();
|
||||||
|
}
|
||||||
|
|
||||||
|
public long position() throws IOException {
|
||||||
|
return indexReader.position();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public VarintCodedSequence get() throws IOException {
|
||||||
|
int size = indexReader.get();
|
||||||
|
|
||||||
|
ByteBuffer dest = ByteBuffer.allocate(size);
|
||||||
|
storage.getBytes(dest);
|
||||||
|
dest.flip();
|
||||||
|
|
||||||
|
return new VarintCodedSequence(dest);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void getData(ByteBuffer workArea) throws IOException {
|
||||||
|
int size = indexReader.get();
|
||||||
|
|
||||||
|
int oldLimit = workArea.limit();
|
||||||
|
workArea.limit(workArea.position() + size);
|
||||||
|
storage.getBytes(workArea);
|
||||||
|
workArea.limit(oldLimit);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void close() throws IOException {
|
||||||
|
indexReader.close();
|
||||||
|
storage.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
@ -25,51 +25,51 @@ public class SequenceBenchmarks {
|
|||||||
workArea = ByteBuffer.allocate(65536);
|
workArea = ByteBuffer.allocate(65536);
|
||||||
arrayValues = new int[] { 1, 3, 5, 16, 1024, 2048, 4096, 4098, 4100 };
|
arrayValues = new int[] { 1, 3, 5, 16, 1024, 2048, 4096, 4098, 4100 };
|
||||||
list = new IntArrayList(arrayValues);
|
list = new IntArrayList(arrayValues);
|
||||||
vcs = VarintCodedSequence.generate(1, 3, 5, 16, 1024, 2048);
|
vcs = VarintCodedSequence.generate(16,21,24,28,66,71,76,83,87,98,101,106,113,115,119,122,143,148,159,164,167,177,182,211,223,242,245,250,273,275,280,289,292,300,307,322,330,338,345,371,397,402,411,420,427,430,433,437,440,448,451,481,490,513,522,555,571,573,585,597,606,613,634,638,640,644,656,660,666,683,689,692,696,709,712,718,727,731,735,738);
|
||||||
gcs = GammaCodedSequence.generate(workArea, 1, 3, 5, 16, 1024, 2048);
|
gcs = GammaCodedSequence.generate(workArea, 16,21,24,28,66,71,76,83,87,98,101,106,113,115,119,122,143,148,159,164,167,177,182,211,223,242,245,250,273,275,280,289,292,300,307,322,330,338,345,371,397,402,411,420,427,430,433,437,440,448,451,481,490,513,522,555,571,573,585,597,606,613,634,638,640,644,656,660,666,683,689,692,696,709,712,718,727,731,735,738);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Fork(value = 1, warmups = 1)
|
||||||
|
@Warmup(iterations = 1)
|
||||||
|
@Benchmark
|
||||||
|
@BenchmarkMode(Mode.Throughput)
|
||||||
|
public int vcsDecode(SequenceState state) {
|
||||||
|
var iter = state.vcs.iterator();
|
||||||
|
int sum = 0;
|
||||||
|
while (iter.hasNext()) {
|
||||||
|
sum += iter.nextInt();
|
||||||
|
}
|
||||||
|
return sum;
|
||||||
|
}
|
||||||
|
//
|
||||||
// @Fork(value = 5, warmups = 5)
|
// @Fork(value = 5, warmups = 5)
|
||||||
// @Warmup(iterations = 5)
|
// @Warmup(iterations = 5)
|
||||||
// @Benchmark
|
// @Benchmark
|
||||||
// @BenchmarkMode(Mode.Throughput)
|
// @BenchmarkMode(Mode.Throughput)
|
||||||
// public int vcsDecode(SequenceState state) {
|
// public int listDecode2(SequenceState state) {
|
||||||
// var iter = state.vcs.iterator();
|
// var list = state.arrayValues;
|
||||||
// int sum = 0;
|
// int sum = 0;
|
||||||
// while (iter.hasNext()) {
|
// for (int i = 0; i < list.length; i++) {
|
||||||
// sum += iter.nextInt();
|
// sum += list[i];
|
||||||
// }
|
// }
|
||||||
// return sum;
|
// return sum;
|
||||||
// }
|
// }
|
||||||
|
|
||||||
@Fork(value = 5, warmups = 5)
|
|
||||||
@Warmup(iterations = 5)
|
@Fork(value = 1, warmups = 1)
|
||||||
|
@Warmup(iterations = 1)
|
||||||
@Benchmark
|
@Benchmark
|
||||||
@BenchmarkMode(Mode.Throughput)
|
@BenchmarkMode(Mode.Throughput)
|
||||||
public int listDecode2(SequenceState state) {
|
public int gcsDecode(SequenceState state) {
|
||||||
var list = state.arrayValues;
|
var iter = state.gcs.iterator();
|
||||||
int sum = 0;
|
int sum = 0;
|
||||||
for (int i = 0; i < list.length; i++) {
|
while (iter.hasNext()) {
|
||||||
sum += list[i];
|
sum += iter.nextInt();
|
||||||
}
|
}
|
||||||
return sum;
|
return sum;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// @Fork(value = 1, warmups = 1)
|
|
||||||
// @Warmup(iterations = 1)
|
|
||||||
// @Benchmark
|
|
||||||
// @BenchmarkMode(Mode.Throughput)
|
|
||||||
// public int gcsDecode(SequenceState state) {
|
|
||||||
// var iter = state.gcs.iterator();
|
|
||||||
// int sum = 0;
|
|
||||||
// while (iter.hasNext()) {
|
|
||||||
// sum += iter.nextInt();
|
|
||||||
// }
|
|
||||||
// return sum;
|
|
||||||
// }
|
|
||||||
|
|
||||||
// @Fork(value = 1, warmups = 1)
|
// @Fork(value = 1, warmups = 1)
|
||||||
// @Warmup(iterations = 1)
|
// @Warmup(iterations = 1)
|
||||||
// @Benchmark
|
// @Benchmark
|
||||||
|
@ -63,6 +63,8 @@ class SequenceOperationsTest {
|
|||||||
assertTrue(SequenceOperations.intersectSequences(seq1.iterator(), seq2.iterator(), seq3.iterator()));
|
assertTrue(SequenceOperations.intersectSequences(seq1.iterator(), seq2.iterator(), seq3.iterator()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void intersectSequencesDeepMatch3findIntersections() {
|
void intersectSequencesDeepMatch3findIntersections() {
|
||||||
ByteBuffer wa = ByteBuffer.allocate(1024);
|
ByteBuffer wa = ByteBuffer.allocate(1024);
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
package nu.marginalia.keyword.model;
|
package nu.marginalia.keyword.model;
|
||||||
|
|
||||||
import nu.marginalia.model.idx.CodedWordSpan;
|
import nu.marginalia.model.idx.CodedWordSpan;
|
||||||
import nu.marginalia.sequence.GammaCodedSequence;
|
import nu.marginalia.sequence.VarintCodedSequence;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
@ -9,12 +9,12 @@ public final class DocumentKeywords {
|
|||||||
|
|
||||||
public final List<String> keywords;
|
public final List<String> keywords;
|
||||||
public final byte[] metadata;
|
public final byte[] metadata;
|
||||||
public final List<GammaCodedSequence> positions;
|
public final List<VarintCodedSequence> positions;
|
||||||
public final List<CodedWordSpan> spans;
|
public final List<CodedWordSpan> spans;
|
||||||
|
|
||||||
public DocumentKeywords(List<String> keywords,
|
public DocumentKeywords(List<String> keywords,
|
||||||
byte[] metadata,
|
byte[] metadata,
|
||||||
List<GammaCodedSequence> positions,
|
List<VarintCodedSequence> positions,
|
||||||
List<CodedWordSpan> spans)
|
List<CodedWordSpan> spans)
|
||||||
{
|
{
|
||||||
this.keywords = keywords;
|
this.keywords = keywords;
|
||||||
|
@ -8,7 +8,7 @@ import lombok.Getter;
|
|||||||
import nu.marginalia.language.sentence.tag.HtmlTag;
|
import nu.marginalia.language.sentence.tag.HtmlTag;
|
||||||
import nu.marginalia.model.idx.CodedWordSpan;
|
import nu.marginalia.model.idx.CodedWordSpan;
|
||||||
import nu.marginalia.model.idx.WordFlags;
|
import nu.marginalia.model.idx.WordFlags;
|
||||||
import nu.marginalia.sequence.GammaCodedSequence;
|
import nu.marginalia.sequence.VarintCodedSequence;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
@ -39,7 +39,7 @@ public class DocumentKeywordsBuilder {
|
|||||||
public DocumentKeywords build(ByteBuffer workArea) {
|
public DocumentKeywords build(ByteBuffer workArea) {
|
||||||
final List<String> wordArray = new ArrayList<>(wordToMeta.size());
|
final List<String> wordArray = new ArrayList<>(wordToMeta.size());
|
||||||
final TByteArrayList meta = new TByteArrayList(wordToMeta.size());
|
final TByteArrayList meta = new TByteArrayList(wordToMeta.size());
|
||||||
final List<GammaCodedSequence> positions = new ArrayList<>(wordToMeta.size());
|
final List<VarintCodedSequence> positions = new ArrayList<>(wordToMeta.size());
|
||||||
|
|
||||||
var iter = wordToMeta.object2ByteEntrySet().fastIterator();
|
var iter = wordToMeta.object2ByteEntrySet().fastIterator();
|
||||||
|
|
||||||
@ -49,13 +49,13 @@ public class DocumentKeywordsBuilder {
|
|||||||
meta.add(entry.getByteValue());
|
meta.add(entry.getByteValue());
|
||||||
wordArray.add(entry.getKey());
|
wordArray.add(entry.getKey());
|
||||||
|
|
||||||
var posList = wordToPos.getOrDefault(entry.getKey(), IntList.of());
|
IntList posList = wordToPos.getOrDefault(entry.getKey(), IntList.of());
|
||||||
|
|
||||||
if (posList.size() > MAX_POSITIONS_PER_WORD) {
|
if (posList.size() > MAX_POSITIONS_PER_WORD) {
|
||||||
posList.subList(MAX_POSITIONS_PER_WORD, posList.size()).clear();
|
posList.subList(MAX_POSITIONS_PER_WORD, posList.size()).clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
positions.add(GammaCodedSequence.generate(workArea, posList));
|
positions.add(VarintCodedSequence.generate(posList));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Encode spans
|
// Encode spans
|
||||||
@ -70,7 +70,7 @@ public class DocumentKeywordsBuilder {
|
|||||||
positionsForTag.add(span.end());
|
positionsForTag.add(span.end());
|
||||||
}
|
}
|
||||||
|
|
||||||
spans.add(new CodedWordSpan((byte) tag.charValue(), GammaCodedSequence.generate(workArea, positionsForTag)));
|
spans.add(new CodedWordSpan((byte) tag.charValue(), VarintCodedSequence.generate(positionsForTag)));
|
||||||
});
|
});
|
||||||
|
|
||||||
return new DocumentKeywords(wordArray, meta.toArray(), positions, spans);
|
return new DocumentKeywords(wordArray, meta.toArray(), positions, spans);
|
||||||
|
@ -12,7 +12,7 @@ import nu.marginalia.model.crawl.HtmlFeature;
|
|||||||
import nu.marginalia.model.processed.SlopDocumentRecord;
|
import nu.marginalia.model.processed.SlopDocumentRecord;
|
||||||
import nu.marginalia.model.processed.SlopDomainLinkRecord;
|
import nu.marginalia.model.processed.SlopDomainLinkRecord;
|
||||||
import nu.marginalia.model.processed.SlopDomainRecord;
|
import nu.marginalia.model.processed.SlopDomainRecord;
|
||||||
import nu.marginalia.sequence.GammaCodedSequence;
|
import nu.marginalia.sequence.VarintCodedSequence;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
@ -96,7 +96,7 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter
|
|||||||
|
|
||||||
var wb = document.words.build(workArea);
|
var wb = document.words.build(workArea);
|
||||||
|
|
||||||
List<GammaCodedSequence> spanSequences = new ArrayList<>(wb.spans.size());
|
List<VarintCodedSequence> spanSequences = new ArrayList<>(wb.spans.size());
|
||||||
byte[] spanCodes = new byte[wb.spans.size()];
|
byte[] spanCodes = new byte[wb.spans.size()];
|
||||||
|
|
||||||
for (int i = 0; i < wb.spans.size(); i++) {
|
for (int i = 0; i < wb.spans.size(); i++) {
|
||||||
|
@ -1,8 +1,8 @@
|
|||||||
package nu.marginalia.model.processed;
|
package nu.marginalia.model.processed;
|
||||||
|
|
||||||
import lombok.Builder;
|
import lombok.Builder;
|
||||||
import nu.marginalia.sequence.GammaCodedSequence;
|
import nu.marginalia.sequence.VarintCodedSequence;
|
||||||
import nu.marginalia.sequence.slop.GammaCodedSequenceArrayColumn;
|
import nu.marginalia.sequence.slop.VarintCodedSequenceArrayColumn;
|
||||||
import nu.marginalia.slop.SlopTable;
|
import nu.marginalia.slop.SlopTable;
|
||||||
import nu.marginalia.slop.column.array.ByteArrayColumn;
|
import nu.marginalia.slop.column.array.ByteArrayColumn;
|
||||||
import nu.marginalia.slop.column.array.ObjectArrayColumn;
|
import nu.marginalia.slop.column.array.ObjectArrayColumn;
|
||||||
@ -39,9 +39,9 @@ public record SlopDocumentRecord(
|
|||||||
Integer pubYear,
|
Integer pubYear,
|
||||||
List<String> words,
|
List<String> words,
|
||||||
byte[] metas,
|
byte[] metas,
|
||||||
List<GammaCodedSequence> positions,
|
List<VarintCodedSequence> positions,
|
||||||
byte[] spanCodes,
|
byte[] spanCodes,
|
||||||
List<GammaCodedSequence> spans
|
List<VarintCodedSequence> spans
|
||||||
) {
|
) {
|
||||||
|
|
||||||
public SlopDocumentRecord {
|
public SlopDocumentRecord {
|
||||||
@ -60,9 +60,9 @@ public record SlopDocumentRecord(
|
|||||||
int length,
|
int length,
|
||||||
List<String> words,
|
List<String> words,
|
||||||
byte[] metas,
|
byte[] metas,
|
||||||
List<GammaCodedSequence> positions,
|
List<VarintCodedSequence> positions,
|
||||||
byte[] spanCodes,
|
byte[] spanCodes,
|
||||||
List<GammaCodedSequence> spans)
|
List<VarintCodedSequence> spans)
|
||||||
{
|
{
|
||||||
// Override the equals method since records don't generate default equals that deal with array fields properly
|
// Override the equals method since records don't generate default equals that deal with array fields properly
|
||||||
@Override
|
@Override
|
||||||
@ -127,12 +127,12 @@ public record SlopDocumentRecord(
|
|||||||
|
|
||||||
private static final ObjectArrayColumn<String> keywordsColumn = new StringColumn("keywords", StorageType.ZSTD).asArray();
|
private static final ObjectArrayColumn<String> keywordsColumn = new StringColumn("keywords", StorageType.ZSTD).asArray();
|
||||||
private static final ByteArrayColumn termMetaColumn = new ByteArrayColumn("termMetadata", StorageType.ZSTD);
|
private static final ByteArrayColumn termMetaColumn = new ByteArrayColumn("termMetadata", StorageType.ZSTD);
|
||||||
private static final GammaCodedSequenceArrayColumn termPositionsColumn = new GammaCodedSequenceArrayColumn("termPositions", StorageType.ZSTD);
|
private static final VarintCodedSequenceArrayColumn termPositionsColumn = new VarintCodedSequenceArrayColumn("termPositions", StorageType.ZSTD);
|
||||||
|
|
||||||
// Spans columns
|
// Spans columns
|
||||||
|
|
||||||
private static final ByteArrayColumn spanCodesColumn = new ByteArrayColumn("spanCodes", StorageType.ZSTD);
|
private static final ByteArrayColumn spanCodesColumn = new ByteArrayColumn("spanCodes", StorageType.ZSTD);
|
||||||
private static final GammaCodedSequenceArrayColumn spansColumn = new GammaCodedSequenceArrayColumn("spans", StorageType.ZSTD);
|
private static final VarintCodedSequenceArrayColumn spansColumn = new VarintCodedSequenceArrayColumn("spans", StorageType.ZSTD);
|
||||||
|
|
||||||
public static class KeywordsProjectionReader extends SlopTable {
|
public static class KeywordsProjectionReader extends SlopTable {
|
||||||
private final TxtStringColumn.Reader domainsReader;
|
private final TxtStringColumn.Reader domainsReader;
|
||||||
@ -143,10 +143,10 @@ public record SlopDocumentRecord(
|
|||||||
|
|
||||||
private final ObjectArrayColumn<String>.Reader keywordsReader;
|
private final ObjectArrayColumn<String>.Reader keywordsReader;
|
||||||
private final ByteArrayColumn.Reader termMetaReader;
|
private final ByteArrayColumn.Reader termMetaReader;
|
||||||
private final GammaCodedSequenceArrayColumn.Reader termPositionsReader;
|
private final VarintCodedSequenceArrayColumn.Reader termPositionsReader;
|
||||||
|
|
||||||
private final ByteArrayColumn.Reader spanCodesReader;
|
private final ByteArrayColumn.Reader spanCodesReader;
|
||||||
private final GammaCodedSequenceArrayColumn.Reader spansReader;
|
private final VarintCodedSequenceArrayColumn.Reader spansReader;
|
||||||
|
|
||||||
public KeywordsProjectionReader(SlopTable.Ref<SlopDocumentRecord> pageRef) throws IOException {
|
public KeywordsProjectionReader(SlopTable.Ref<SlopDocumentRecord> pageRef) throws IOException {
|
||||||
super(pageRef);
|
super(pageRef);
|
||||||
@ -177,10 +177,10 @@ public record SlopDocumentRecord(
|
|||||||
int length = lengthsReader.get();
|
int length = lengthsReader.get();
|
||||||
|
|
||||||
List<String> words = keywordsReader.get();
|
List<String> words = keywordsReader.get();
|
||||||
List<GammaCodedSequence> positions = termPositionsReader.get();
|
List<VarintCodedSequence> positions = termPositionsReader.get();
|
||||||
byte[] metas = termMetaReader.get();
|
byte[] metas = termMetaReader.get();
|
||||||
byte[] spanCodes = spanCodesReader.get();
|
byte[] spanCodes = spanCodesReader.get();
|
||||||
List<GammaCodedSequence> spans = spansReader.get();
|
List<VarintCodedSequence> spans = spansReader.get();
|
||||||
|
|
||||||
return new KeywordsProjection(
|
return new KeywordsProjection(
|
||||||
domain,
|
domain,
|
||||||
@ -272,9 +272,9 @@ public record SlopDocumentRecord(
|
|||||||
private final IntColumn.Writer pubYearWriter;
|
private final IntColumn.Writer pubYearWriter;
|
||||||
private final ObjectArrayColumn<String>.Writer keywordsWriter;
|
private final ObjectArrayColumn<String>.Writer keywordsWriter;
|
||||||
private final ByteArrayColumn.Writer termMetaWriter;
|
private final ByteArrayColumn.Writer termMetaWriter;
|
||||||
private final GammaCodedSequenceArrayColumn.Writer termPositionsWriter;
|
private final VarintCodedSequenceArrayColumn.Writer termPositionsWriter;
|
||||||
private final ByteArrayColumn.Writer spansCodesWriter;
|
private final ByteArrayColumn.Writer spansCodesWriter;
|
||||||
private final GammaCodedSequenceArrayColumn.Writer spansWriter;
|
private final VarintCodedSequenceArrayColumn.Writer spansWriter;
|
||||||
|
|
||||||
public Writer(Path baseDir, int page) throws IOException {
|
public Writer(Path baseDir, int page) throws IOException {
|
||||||
super(baseDir, page);
|
super(baseDir, page);
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
package nu.marginalia.model.processed;
|
package nu.marginalia.model.processed;
|
||||||
|
|
||||||
import nu.marginalia.sequence.GammaCodedSequence;
|
import nu.marginalia.sequence.VarintCodedSequence;
|
||||||
import nu.marginalia.slop.SlopTable;
|
import nu.marginalia.slop.SlopTable;
|
||||||
import nu.marginalia.test.TestUtil;
|
import nu.marginalia.test.TestUtil;
|
||||||
import org.junit.jupiter.api.AfterEach;
|
import org.junit.jupiter.api.AfterEach;
|
||||||
@ -46,9 +46,9 @@ public class SlopDocumentRecordTest {
|
|||||||
null,
|
null,
|
||||||
List.of("test1", "test2"),
|
List.of("test1", "test2"),
|
||||||
new byte[] { 2, 3},
|
new byte[] { 2, 3},
|
||||||
List.of(GammaCodedSequence.generate(workArea, 1, 3, 5), GammaCodedSequence.generate(workArea, 2, 4, 6)),
|
List.of(VarintCodedSequence.generate(1, 3, 5), VarintCodedSequence.generate(2, 4, 6)),
|
||||||
new byte[] { 'a', 'b' },
|
new byte[] { 'a', 'b' },
|
||||||
List.of(GammaCodedSequence.generate(workArea, 2, 3, 5), GammaCodedSequence.generate(workArea, 3, 4, 6))
|
List.of(VarintCodedSequence.generate(2, 3, 5), VarintCodedSequence.generate(3, 4, 6))
|
||||||
);
|
);
|
||||||
|
|
||||||
try (var writer = new SlopDocumentRecord.Writer(testDir, 0)) {
|
try (var writer = new SlopDocumentRecord.Writer(testDir, 0)) {
|
||||||
|
Loading…
Reference in New Issue
Block a user