(slop) Migrate to latest Slop version

This commit is contained in:
Viktor Lofgren 2024-08-14 11:44:35 +02:00
parent 2ad93ad41a
commit 75b0888032
18 changed files with 240 additions and 341 deletions

View File

@ -10,8 +10,8 @@ import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.slop.column.primitive.LongColumnReader;
import nu.marginalia.slop.desc.SlopTable;
import nu.marginalia.slop.SlopTable;
import nu.marginalia.slop.column.primitive.LongColumn;
import org.roaringbitmap.longlong.LongConsumer;
import org.roaringbitmap.longlong.Roaring64Bitmap;
import org.slf4j.Logger;
@ -153,7 +153,7 @@ public class ForwardIndexConverter {
for (var instance : journalReader.pages()) {
try (var slopTable = new SlopTable(instance.page())) {
LongColumnReader idReader = instance.openCombinedId(slopTable);
LongColumn.Reader idReader = instance.openCombinedId(slopTable);
while (idReader.hasRemaining()) {
rbm.add(idReader.get());

View File

@ -1,6 +1,6 @@
package nu.marginalia.index.journal;
import nu.marginalia.slop.desc.SlopTable;
import nu.marginalia.slop.SlopTable;
import java.nio.file.Files;
import java.nio.file.Path;

View File

@ -1,36 +1,28 @@
package nu.marginalia.index.journal;
import nu.marginalia.sequence.slop.GammaCodedSequenceArrayColumn;
import nu.marginalia.sequence.slop.GammaCodedSequenceArrayReader;
import nu.marginalia.sequence.slop.GammaCodedSequenceArrayWriter;
import nu.marginalia.slop.ColumnTypes;
import nu.marginalia.slop.column.array.ByteArrayColumnReader;
import nu.marginalia.slop.column.array.ByteArrayColumnWriter;
import nu.marginalia.slop.column.array.LongArrayColumnReader;
import nu.marginalia.slop.column.array.LongArrayColumnWriter;
import nu.marginalia.slop.column.primitive.IntColumnReader;
import nu.marginalia.slop.column.primitive.IntColumnWriter;
import nu.marginalia.slop.column.primitive.LongColumnReader;
import nu.marginalia.slop.column.primitive.LongColumnWriter;
import nu.marginalia.slop.desc.ColumnDesc;
import nu.marginalia.slop.desc.SlopTable;
import nu.marginalia.slop.SlopTable;
import nu.marginalia.slop.column.array.ByteArrayColumn;
import nu.marginalia.slop.column.array.LongArrayColumn;
import nu.marginalia.slop.column.primitive.IntColumn;
import nu.marginalia.slop.column.primitive.LongColumn;
import nu.marginalia.slop.desc.StorageType;
import java.io.IOException;
import java.nio.file.Path;
public record IndexJournalPage(Path baseDir, int page) {
public static final ColumnDesc<IntColumnReader, IntColumnWriter> features = new ColumnDesc<>("features", ColumnTypes.INT_LE, StorageType.PLAIN);
public static final ColumnDesc<IntColumnReader, IntColumnWriter> size = new ColumnDesc<>("size", ColumnTypes.INT_LE, StorageType.PLAIN);
public static final ColumnDesc<LongColumnReader, LongColumnWriter> combinedId = new ColumnDesc<>("combinedId", ColumnTypes.LONG_LE, StorageType.PLAIN);
public static final ColumnDesc<LongColumnReader, LongColumnWriter> documentMeta = new ColumnDesc<>("documentMeta", ColumnTypes.LONG_LE, StorageType.PLAIN);
public static IntColumn features = new IntColumn("features", StorageType.PLAIN);
public static IntColumn size = new IntColumn("size", StorageType.PLAIN);
public static LongColumn combinedId = new LongColumn("combinedId", StorageType.PLAIN);
public static LongColumn documentMeta = new LongColumn("documentMeta", StorageType.PLAIN);
public static final ColumnDesc<LongArrayColumnReader, LongArrayColumnWriter> termIds = new ColumnDesc<>("termIds", ColumnTypes.LONG_ARRAY_LE, StorageType.ZSTD);
public static final ColumnDesc<ByteArrayColumnReader, ByteArrayColumnWriter> termMeta = new ColumnDesc<>("termMetadata", ColumnTypes.BYTE_ARRAY, StorageType.ZSTD);
public static final ColumnDesc<GammaCodedSequenceArrayReader, GammaCodedSequenceArrayWriter> positions = new ColumnDesc<>("termPositions", GammaCodedSequenceArrayColumn.TYPE, StorageType.ZSTD);
public static LongArrayColumn termIds = new LongArrayColumn("termIds", StorageType.ZSTD);
public static ByteArrayColumn termMeta = new ByteArrayColumn("termMetadata", StorageType.ZSTD);
public static GammaCodedSequenceArrayColumn positions = new GammaCodedSequenceArrayColumn("termPositions", StorageType.ZSTD);
public static final ColumnDesc<ByteArrayColumnReader, ByteArrayColumnWriter> spanCodes = new ColumnDesc<>("spanCodes", ColumnTypes.BYTE_ARRAY, StorageType.ZSTD);
public static final ColumnDesc<GammaCodedSequenceArrayReader, GammaCodedSequenceArrayWriter> spans = new ColumnDesc<>("spans", GammaCodedSequenceArrayColumn.TYPE, StorageType.ZSTD);
public static ByteArrayColumn spanCodes = new ByteArrayColumn("spanCodes", StorageType.ZSTD);
public static GammaCodedSequenceArrayColumn spans = new GammaCodedSequenceArrayColumn("spans", StorageType.ZSTD);
public IndexJournalPage {
if (!baseDir.toFile().isDirectory()) {
@ -38,40 +30,40 @@ public record IndexJournalPage(Path baseDir, int page) {
}
}
public LongColumnReader openCombinedId(SlopTable table) throws IOException {
public LongColumn.Reader openCombinedId(SlopTable table) throws IOException {
return combinedId.open(table, baseDir);
}
public LongColumnReader openDocumentMeta(SlopTable table) throws IOException {
public LongColumn.Reader openDocumentMeta(SlopTable table) throws IOException {
return documentMeta.open(table, baseDir);
}
public IntColumnReader openFeatures(SlopTable table) throws IOException {
public IntColumn.Reader openFeatures(SlopTable table) throws IOException {
return features.open(table, baseDir);
}
public IntColumnReader openSize(SlopTable table) throws IOException {
public IntColumn.Reader openSize(SlopTable table) throws IOException {
return size.open(table, baseDir);
}
public LongArrayColumnReader openTermIds(SlopTable table) throws IOException {
public LongArrayColumn.Reader openTermIds(SlopTable table) throws IOException {
return termIds.open(table, baseDir);
}
public ByteArrayColumnReader openTermMetadata(SlopTable table) throws IOException {
public ByteArrayColumn.Reader openTermMetadata(SlopTable table) throws IOException {
return termMeta.open(table, baseDir);
}
public GammaCodedSequenceArrayReader openTermPositions(SlopTable table) throws IOException {
public GammaCodedSequenceArrayColumn.Reader openTermPositions(SlopTable table) throws IOException {
return positions.open(table, baseDir);
}
public GammaCodedSequenceArrayReader openSpans(SlopTable table) throws IOException {
public GammaCodedSequenceArrayColumn.Reader openSpans(SlopTable table) throws IOException {
return spans.open(table, baseDir);
}
public ByteArrayColumnReader openSpanCodes(SlopTable table) throws IOException {
public ByteArrayColumn.Reader openSpanCodes(SlopTable table) throws IOException {
return spanCodes.open(table, baseDir);
}
}

View File

@ -3,12 +3,12 @@ package nu.marginalia.index.journal;
import lombok.SneakyThrows;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.model.processed.SlopDocumentRecord;
import nu.marginalia.sequence.slop.GammaCodedSequenceArrayWriter;
import nu.marginalia.slop.column.array.ByteArrayColumnWriter;
import nu.marginalia.slop.column.array.LongArrayColumnWriter;
import nu.marginalia.slop.column.primitive.IntColumnWriter;
import nu.marginalia.slop.column.primitive.LongColumnWriter;
import nu.marginalia.slop.desc.SlopTable;
import nu.marginalia.sequence.slop.GammaCodedSequenceArrayColumn;
import nu.marginalia.slop.SlopTable;
import nu.marginalia.slop.column.array.ByteArrayColumn;
import nu.marginalia.slop.column.array.LongArrayColumn;
import nu.marginalia.slop.column.primitive.IntColumn;
import nu.marginalia.slop.column.primitive.LongColumn;
import java.io.IOException;
import java.nio.file.Files;
@ -17,17 +17,17 @@ import java.util.List;
public class IndexJournalSlopWriter extends SlopTable {
private final IntColumnWriter featuresWriter;
private final IntColumnWriter sizeWriter;
private final LongColumnWriter combinedIdWriter;
private final LongColumnWriter documentMetaWriter;
private final IntColumn.Writer featuresWriter;
private final IntColumn.Writer sizeWriter;
private final LongColumn.Writer combinedIdWriter;
private final LongColumn.Writer documentMetaWriter;
private final LongArrayColumnWriter termIdsWriter;
private final ByteArrayColumnWriter termMetadataWriter;
private final GammaCodedSequenceArrayWriter termPositionsWriter;
private final LongArrayColumn.Writer termIdsWriter;
private final ByteArrayColumn.Writer termMetadataWriter;
private final GammaCodedSequenceArrayColumn.Writer termPositionsWriter;
private final GammaCodedSequenceArrayWriter spansWriter;
private final ByteArrayColumnWriter spanCodesWriter;
private final GammaCodedSequenceArrayColumn.Writer spansWriter;
private final ByteArrayColumn.Writer spanCodesWriter;
private static final MurmurHash3_128 hash = new MurmurHash3_128();

View File

@ -7,7 +7,7 @@ import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.PositionsFileConstructor;
import nu.marginalia.index.journal.IndexJournalPage;
import nu.marginalia.rwf.RandomFileAssembler;
import nu.marginalia.slop.desc.SlopTable;
import nu.marginalia.slop.SlopTable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

View File

@ -6,7 +6,7 @@ import it.unimi.dsi.fastutil.longs.LongIterator;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.index.journal.IndexJournalPage;
import nu.marginalia.slop.desc.SlopTable;
import nu.marginalia.slop.SlopTable;
import java.io.IOException;
import java.nio.file.Files;

View File

@ -6,7 +6,7 @@ import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.journal.IndexJournalPage;
import nu.marginalia.rwf.RandomFileAssembler;
import nu.marginalia.slop.desc.SlopTable;
import nu.marginalia.slop.SlopTable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

View File

@ -6,7 +6,7 @@ import it.unimi.dsi.fastutil.longs.LongIterator;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.index.journal.IndexJournalPage;
import nu.marginalia.slop.desc.SlopTable;
import nu.marginalia.slop.SlopTable;
import java.io.IOException;
import java.nio.file.Files;

View File

@ -1,13 +1,12 @@
package nu.marginalia.sequence.slop;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.slop.ColumnTypes;
import nu.marginalia.slop.column.AbstractColumn;
import nu.marginalia.slop.column.AbstractObjectColumn;
import nu.marginalia.slop.column.ObjectColumnReader;
import nu.marginalia.slop.column.ObjectColumnWriter;
import nu.marginalia.slop.column.dynamic.VarintColumn;
import nu.marginalia.slop.column.dynamic.VarintColumnReader;
import nu.marginalia.slop.column.dynamic.VarintColumnWriter;
import nu.marginalia.slop.desc.ColumnDesc;
import nu.marginalia.slop.desc.ColumnFunction;
import nu.marginalia.slop.desc.ColumnType;
import nu.marginalia.slop.desc.StorageType;
import java.io.IOException;
@ -18,45 +17,54 @@ import java.util.ArrayList;
import java.util.List;
/** Slop column extension for storing GammaCodedSequence objects. */
public class GammaCodedSequenceArrayColumn {
public class GammaCodedSequenceArrayColumn extends AbstractObjectColumn<List<GammaCodedSequence>, GammaCodedSequenceArrayColumn.Reader, GammaCodedSequenceArrayColumn.Writer> {
public static ColumnType<GammaCodedSequenceArrayReader, GammaCodedSequenceArrayWriter> TYPE = ColumnTypes.register("s8[]+gcs[]", ByteOrder.nativeOrder(), GammaCodedSequenceArrayColumn::open, GammaCodedSequenceArrayColumn::create);
private final VarintColumn groupsColumn;
private final GammaCodedSequenceColumn dataColumn;
public static GammaCodedSequenceArrayReader open(Path path, ColumnDesc columnDesc) throws IOException {
return new Reader(columnDesc,
GammaCodedSequenceColumn.open(path, columnDesc),
VarintColumn.open(path, columnDesc.createSupplementaryColumn(ColumnFunction.GROUP_LENGTH,
ColumnTypes.VARINT_LE,
StorageType.PLAIN)
)
public GammaCodedSequenceArrayColumn(String name) {
this(name, StorageType.PLAIN);
}
public GammaCodedSequenceArrayColumn(String name, StorageType storageType) {
super(name,
"gcs[]",
ByteOrder.nativeOrder(),
ColumnFunction.DATA,
storageType);
groupsColumn = new VarintColumn(name, ColumnFunction.GROUP_LENGTH, storageType);
dataColumn = new GammaCodedSequenceColumn(name);
}
public Writer createUnregistered(Path path, int page) throws IOException {
return new Writer(
dataColumn.createUnregistered(path, page),
groupsColumn.createUnregistered(path, page)
);
}
public static GammaCodedSequenceArrayWriter create(Path path, ColumnDesc columnDesc) throws IOException {
return new Writer(columnDesc,
GammaCodedSequenceColumn.create(path, columnDesc),
VarintColumn.create(path, columnDesc.createSupplementaryColumn(ColumnFunction.GROUP_LENGTH,
ColumnTypes.VARINT_LE,
StorageType.PLAIN)
)
public Reader openUnregistered(Path path, int page) throws IOException {
return new Reader(
dataColumn.openUnregistered(path, page),
groupsColumn.openUnregistered(path, page)
);
}
private static class Writer implements GammaCodedSequenceArrayWriter {
private final VarintColumnWriter groupsWriter;
private final GammaCodedSequenceWriter dataWriter;
private final ColumnDesc<?, ?> columnDesc;
public Writer(ColumnDesc<?, ?> columnDesc, GammaCodedSequenceWriter dataWriter, VarintColumnWriter groupsWriter)
public class Writer implements ObjectColumnWriter<List<GammaCodedSequence>> {
private final VarintColumn.Writer groupsWriter;
private final GammaCodedSequenceColumn.Writer dataWriter;
Writer(GammaCodedSequenceColumn.Writer dataWriter, VarintColumn.Writer groupsWriter)
{
this.groupsWriter = groupsWriter;
this.dataWriter = dataWriter;
this.columnDesc = columnDesc;
}
@Override
public ColumnDesc<?, ?> columnDesc() {
return columnDesc;
public AbstractColumn<?, ?> columnDesc() {
return GammaCodedSequenceArrayColumn.this;
}
@Override
@ -77,20 +85,18 @@ public class GammaCodedSequenceArrayColumn {
}
}
private static class Reader implements GammaCodedSequenceArrayReader {
private final GammaCodedSequenceReader dataReader;
private final VarintColumnReader groupsReader;
private final ColumnDesc<?, ?> columnDesc;
public class Reader implements ObjectColumnReader<List<GammaCodedSequence>> {
private final GammaCodedSequenceColumn.Reader dataReader;
private final VarintColumn.Reader groupsReader;
public Reader(ColumnDesc<?, ?> columnDesc, GammaCodedSequenceReader dataReader, VarintColumnReader groupsReader) throws IOException {
public Reader(GammaCodedSequenceColumn.Reader dataReader, VarintColumn.Reader groupsReader) {
this.dataReader = dataReader;
this.groupsReader = groupsReader;
this.columnDesc = columnDesc;
}
@Override
public ColumnDesc<?, ?> columnDesc() {
return columnDesc;
public AbstractColumn<?, ?> columnDesc() {
return GammaCodedSequenceArrayColumn.this;
}
@Override
@ -123,7 +129,6 @@ public class GammaCodedSequenceArrayColumn {
return ret;
}
@Override
public List<ByteBuffer> getData(ByteBuffer workArea) throws IOException {
int count = groupsReader.get();
var ret = new ArrayList<ByteBuffer>(count);

View File

@ -1,32 +0,0 @@
package nu.marginalia.sequence.slop;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.slop.column.ColumnReader;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.List;
/** Column reader whose entries are lists of {@link GammaCodedSequence} objects. */
public interface GammaCodedSequenceArrayReader extends AutoCloseable, ColumnReader {
/** Read the next entry from the column as a list of decoded gamma-coded sequences.
* This method takes no caller-supplied buffer; see {@link #getData(ByteBuffer)}
* for the variant that reads the undecoded data into a reusable work area.
*
* @return The gamma-coded sequences making up the next entry.
* @throws IOException declared by the signature; presumably raised when the underlying read fails
*/
List<GammaCodedSequence> get() throws IOException;
/** Read just the data portion of the next entry from the column, without
* decoding the values. This method is useful when the caller is only interested
* in the data portion of the sequences.
* <p>
* Unlike most other readers, this method requires an intermediate buffer to use
* for reading. As this buffer typically needs to be fairly large to accommodate
* the largest possible sequence, it is not practical to allocate a new buffer
* for each call to this method. Instead, the caller should allocate a buffer
* once and reuse it for each call to this method.
*
* @param workArea A buffer to use for reading the data.
* @return slices of the work buffer containing the data.
* @throws IOException declared by the signature; presumably raised when the underlying read fails
*/
List<ByteBuffer> getData(ByteBuffer workArea) throws IOException;
/** Close the reader. Redeclared here so that callers only need to handle
* {@link IOException} rather than the broader {@code Exception} declared by
* {@link AutoCloseable#close()}.
*/
void close() throws IOException;
}

View File

@ -1,12 +0,0 @@
package nu.marginalia.sequence.slop;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.slop.column.ColumnWriter;
import java.io.IOException;
import java.util.List;
/** Column writer whose entries are lists of {@link GammaCodedSequence} objects. */
public interface GammaCodedSequenceArrayWriter extends AutoCloseable, ColumnWriter {
/** Put a list of gamma-coded sequences into the column.
*
* @param sequence the sequences to put; note that despite the singular name this is a list
* @throws IOException declared by the signature; presumably raised when the underlying write fails
*/
void put(List<GammaCodedSequence> sequence) throws IOException;
/** Close the writer. Redeclared here so that callers only need to handle
* {@link IOException} rather than the broader {@code Exception} declared by
* {@link AutoCloseable#close()}.
*/
void close() throws IOException;
}

View File

@ -1,13 +1,12 @@
package nu.marginalia.sequence.slop;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.slop.ColumnTypes;
import nu.marginalia.slop.column.AbstractColumn;
import nu.marginalia.slop.column.AbstractObjectColumn;
import nu.marginalia.slop.column.ObjectColumnReader;
import nu.marginalia.slop.column.ObjectColumnWriter;
import nu.marginalia.slop.column.dynamic.VarintColumn;
import nu.marginalia.slop.column.dynamic.VarintColumnReader;
import nu.marginalia.slop.column.dynamic.VarintColumnWriter;
import nu.marginalia.slop.desc.ColumnDesc;
import nu.marginalia.slop.desc.ColumnFunction;
import nu.marginalia.slop.desc.ColumnType;
import nu.marginalia.slop.desc.StorageType;
import nu.marginalia.slop.storage.Storage;
import nu.marginalia.slop.storage.StorageReader;
@ -19,48 +18,53 @@ import java.nio.ByteOrder;
import java.nio.file.Path;
/** Slop column extension for storing GammaCodedSequence objects. */
public class GammaCodedSequenceColumn {
public class GammaCodedSequenceColumn extends AbstractObjectColumn<GammaCodedSequence, GammaCodedSequenceColumn.Reader, GammaCodedSequenceColumn.Writer> {
public static ColumnType<GammaCodedSequenceReader, GammaCodedSequenceWriter> TYPE = ColumnTypes.register("s8[]+gcs", ByteOrder.nativeOrder(), GammaCodedSequenceColumn::open, GammaCodedSequenceColumn::create);
private final VarintColumn indexColumn;
public static GammaCodedSequenceReader open(Path path, ColumnDesc columnDesc) throws IOException {
return new Reader(columnDesc,
Storage.reader(path, columnDesc, false), // note we must never pass aligned=true here, as the data is not guaranteed alignment
VarintColumn.open(path, columnDesc.createSupplementaryColumn(ColumnFunction.DATA_LEN,
ColumnTypes.VARINT_LE,
StorageType.PLAIN)
)
public GammaCodedSequenceColumn(String name) {
this(name, StorageType.PLAIN);
}
public GammaCodedSequenceColumn(String name, StorageType storageType) {
super(name,
"gamma",
ByteOrder.nativeOrder(),
ColumnFunction.DATA,
storageType);
indexColumn = new VarintColumn(name, ColumnFunction.DATA_LEN, StorageType.PLAIN);
}
public Writer createUnregistered(Path path, int page) throws IOException {
return new Writer(
Storage.writer(path, this, page),
indexColumn.createUnregistered(path, page)
);
}
public static GammaCodedSequenceWriter create(Path path, ColumnDesc columnDesc) throws IOException {
return new Writer(columnDesc,
Storage.writer(path, columnDesc),
VarintColumn.create(path, columnDesc.createSupplementaryColumn(ColumnFunction.DATA_LEN,
ColumnTypes.VARINT_LE,
StorageType.PLAIN)
)
public Reader openUnregistered(Path path, int page) throws IOException {
return new Reader(
Storage.reader(path, this, page, false),
indexColumn.openUnregistered(path, page)
);
}
private static class Writer implements GammaCodedSequenceWriter {
private final VarintColumnWriter indexWriter;
private final ColumnDesc<?, ?> columnDesc;
public class Writer implements ObjectColumnWriter<GammaCodedSequence> {
private final VarintColumn.Writer indexWriter;
private final StorageWriter storage;
public Writer(ColumnDesc<?, ?> columnDesc,
StorageWriter storage,
VarintColumnWriter indexWriter)
public Writer(StorageWriter storage,
VarintColumn.Writer indexWriter)
{
this.columnDesc = columnDesc;
this.storage = storage;
this.indexWriter = indexWriter;
}
@Override
public ColumnDesc<?, ?> columnDesc() {
return columnDesc;
public AbstractColumn<?, ?> columnDesc() {
return GammaCodedSequenceColumn.this;
}
@Override
@ -82,20 +86,18 @@ public class GammaCodedSequenceColumn {
}
}
private static class Reader implements GammaCodedSequenceReader {
private final VarintColumnReader indexReader;
private final ColumnDesc<?, ?> columnDesc;
public class Reader implements ObjectColumnReader<GammaCodedSequence> {
private final VarintColumn.Reader indexReader;
private final StorageReader storage;
public Reader(ColumnDesc<?, ?> columnDesc, StorageReader reader, VarintColumnReader indexReader) throws IOException {
this.columnDesc = columnDesc;
Reader(StorageReader reader, VarintColumn.Reader indexReader) throws IOException {
this.storage = reader;
this.indexReader = indexReader;
}
@Override
public ColumnDesc<?, ?> columnDesc() {
return columnDesc;
public AbstractColumn<?, ?> columnDesc() {
return GammaCodedSequenceColumn.this;
}
@Override
@ -126,7 +128,6 @@ public class GammaCodedSequenceColumn {
return new GammaCodedSequence(dest);
}
@Override
public void getData(ByteBuffer workArea) throws IOException {
int size = indexReader.get();

View File

@ -1,33 +0,0 @@
package nu.marginalia.sequence.slop;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.slop.column.ColumnReader;
import java.io.IOException;
import java.nio.ByteBuffer;
/** Column reader whose entries are individual {@link GammaCodedSequence} objects. */
public interface GammaCodedSequenceReader extends AutoCloseable, ColumnReader {
/** Read the next gamma-coded sequence from the column.
* This method takes no caller-supplied buffer; see {@link #getData(ByteBuffer)}
* for the variant that reads the undecoded data into a reusable work area.
*
* @return The next gamma-coded sequence.
* @throws IOException declared by the signature; presumably raised when the underlying read fails
*/
GammaCodedSequence get() throws IOException;
/** Read just the data portion of the next gamma-coded sequence from the column.
* This method is useful when the caller is only interested in the data portion
* of the sequence and does not want to decode the values.
* <p>
* Unlike most other readers, this method requires an intermediate buffer to use
* for reading the sequence. As this buffer typically needs to be fairly large to
* accommodate the largest possible sequence, it is not practical to allocate a
* new buffer for each call to this method. Instead, the caller should allocate
* a buffer once and reuse it for each call to this method.
* <p>
* The position of the buffer is advanced to the end of the data that has just been read,
* and the limit remains the same.
*
* @param workArea A buffer to use for reading the data.
* @throws IOException declared by the signature; presumably raised when the underlying read fails
*/
void getData(ByteBuffer workArea) throws IOException;
/** Close the reader. Redeclared here so that callers only need to handle
* {@link IOException} rather than the broader {@code Exception} declared by
* {@link AutoCloseable#close()}.
*/
void close() throws IOException;
}

View File

@ -1,11 +0,0 @@
package nu.marginalia.sequence.slop;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.slop.column.ColumnWriter;
import java.io.IOException;
/** Column writer whose entries are individual {@link GammaCodedSequence} objects. */
public interface GammaCodedSequenceWriter extends AutoCloseable, ColumnWriter {
/** Put a gamma-coded sequence into the column.
*
* @param sequence the sequence to put
* @throws IOException declared by the signature; presumably raised when the underlying write fails
*/
void put(GammaCodedSequence sequence) throws IOException;
/** Close the writer. Redeclared here so that callers only need to handle
* {@link IOException} rather than the broader {@code Exception} declared by
* {@link AutoCloseable#close()}.
*/
void close() throws IOException;
}

View File

@ -3,21 +3,16 @@ package nu.marginalia.model.processed;
import lombok.Builder;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.sequence.slop.GammaCodedSequenceArrayColumn;
import nu.marginalia.sequence.slop.GammaCodedSequenceArrayReader;
import nu.marginalia.sequence.slop.GammaCodedSequenceArrayWriter;
import nu.marginalia.slop.ColumnTypes;
import nu.marginalia.slop.column.array.ByteArrayColumnReader;
import nu.marginalia.slop.column.array.ByteArrayColumnWriter;
import nu.marginalia.slop.column.array.ObjectArrayColumnReader;
import nu.marginalia.slop.column.array.ObjectArrayColumnWriter;
import nu.marginalia.slop.column.dynamic.VarintColumnReader;
import nu.marginalia.slop.column.dynamic.VarintColumnWriter;
import nu.marginalia.slop.column.primitive.*;
import nu.marginalia.slop.column.string.EnumColumnReader;
import nu.marginalia.slop.column.string.StringColumnReader;
import nu.marginalia.slop.column.string.StringColumnWriter;
import nu.marginalia.slop.desc.ColumnDesc;
import nu.marginalia.slop.desc.SlopTable;
import nu.marginalia.slop.SlopTable;
import nu.marginalia.slop.column.array.ByteArrayColumn;
import nu.marginalia.slop.column.array.ObjectArrayColumn;
import nu.marginalia.slop.column.dynamic.VarintColumn;
import nu.marginalia.slop.column.primitive.FloatColumn;
import nu.marginalia.slop.column.primitive.IntColumn;
import nu.marginalia.slop.column.primitive.LongColumn;
import nu.marginalia.slop.column.string.EnumColumn;
import nu.marginalia.slop.column.string.StringColumn;
import nu.marginalia.slop.column.string.TxtStringColumn;
import nu.marginalia.slop.desc.StorageType;
import org.jetbrains.annotations.Nullable;
@ -111,45 +106,47 @@ public record SlopDocumentRecord(
}
// Basic information
private static final ColumnDesc<StringColumnReader, StringColumnWriter> domainsColumn = new ColumnDesc<>("domain", ColumnTypes.TXTSTRING, StorageType.GZIP);
private static final ColumnDesc<StringColumnReader, StringColumnWriter> urlsColumn = new ColumnDesc<>("url", ColumnTypes.TXTSTRING, StorageType.GZIP);
private static final ColumnDesc<VarintColumnReader, VarintColumnWriter> ordinalsColumn = new ColumnDesc<>("ordinal", ColumnTypes.VARINT_LE, StorageType.PLAIN);
private static final ColumnDesc<EnumColumnReader, StringColumnWriter> statesColumn = new ColumnDesc<>("state", ColumnTypes.ENUM_LE, StorageType.PLAIN);
private static final ColumnDesc<StringColumnReader, StringColumnWriter> stateReasonsColumn = new ColumnDesc<>("stateReason", ColumnTypes.TXTSTRING, StorageType.GZIP);
private static final TxtStringColumn domainsColumn = new TxtStringColumn("domain", StorageType.GZIP);
private static final TxtStringColumn urlsColumn = new TxtStringColumn("url", StorageType.GZIP);
private static final VarintColumn ordinalsColumn = new VarintColumn("ordinal", StorageType.PLAIN);
private static final EnumColumn statesColumn = new EnumColumn("state", StorageType.PLAIN);
private static final StringColumn stateReasonsColumn = new StringColumn("stateReason", StorageType.GZIP);
// Document metadata
private static final ColumnDesc<StringColumnReader, StringColumnWriter> titlesColumn = new ColumnDesc<>("title", ColumnTypes.STRING, StorageType.GZIP);
private static final ColumnDesc<StringColumnReader, StringColumnWriter> descriptionsColumn = new ColumnDesc<>("description", ColumnTypes.STRING, StorageType.GZIP);
private static final ColumnDesc<EnumColumnReader, StringColumnWriter> htmlStandardsColumn = new ColumnDesc<>("htmlStandard", ColumnTypes.ENUM_LE, StorageType.GZIP);
private static final ColumnDesc<IntColumnReader, IntColumnWriter> htmlFeaturesColumn = new ColumnDesc<>("htmlFeatures", ColumnTypes.INT_LE, StorageType.PLAIN);
private static final ColumnDesc<IntColumnReader, IntColumnWriter> lengthsColumn = new ColumnDesc<>("length", ColumnTypes.INT_LE, StorageType.PLAIN);
private static final ColumnDesc<IntColumnReader, IntColumnWriter> pubYearColumn = new ColumnDesc<>("pubYear", ColumnTypes.INT_LE, StorageType.PLAIN);
private static final ColumnDesc<LongColumnReader, LongColumnWriter> hashesColumn = new ColumnDesc<>("hash", ColumnTypes.LONG_LE, StorageType.PLAIN);
private static final ColumnDesc<FloatColumnReader, FloatColumnWriter> qualitiesColumn = new ColumnDesc<>("quality", ColumnTypes.FLOAT_LE, StorageType.PLAIN);
private static final ColumnDesc<LongColumnReader, LongColumnWriter> domainMetadata = new ColumnDesc<>("domainMetadata", ColumnTypes.LONG_LE, StorageType.PLAIN);
private static final StringColumn titlesColumn = new StringColumn("title", StorageType.GZIP);
private static final StringColumn descriptionsColumn = new StringColumn("description", StorageType.GZIP);
private static final EnumColumn htmlStandardsColumn = new EnumColumn("htmlStandard", StorageType.PLAIN);
private static final IntColumn htmlFeaturesColumn = new IntColumn("htmlFeatures", StorageType.PLAIN);
private static final IntColumn lengthsColumn = new IntColumn("length", StorageType.PLAIN);
private static final IntColumn pubYearColumn = new IntColumn("pubYear", StorageType.PLAIN);
private static final LongColumn hashesColumn = new LongColumn("hash", StorageType.PLAIN);
private static final FloatColumn qualitiesColumn = new FloatColumn("quality", StorageType.PLAIN);
private static final LongColumn domainMetadata = new LongColumn("domainMetadata", StorageType.PLAIN);
// Keyword-level columns, these are enumerated by the counts column
private static final ColumnDesc<ObjectArrayColumnReader<String>, ObjectArrayColumnWriter<String>> keywordsColumn = new ColumnDesc<>("keywords", ColumnTypes.STRING_ARRAY, StorageType.ZSTD);
private static final ColumnDesc<ByteArrayColumnReader, ByteArrayColumnWriter> termMetaColumn = new ColumnDesc<>("termMetadata", ColumnTypes.BYTE_ARRAY, StorageType.ZSTD);
private static final ColumnDesc<GammaCodedSequenceArrayReader, GammaCodedSequenceArrayWriter> termPositionsColumn = new ColumnDesc<>("termPositions", GammaCodedSequenceArrayColumn.TYPE, StorageType.ZSTD);
private static final ObjectArrayColumn<String> keywordsColumn = new StringColumn("keywords", StorageType.ZSTD).asArray();
private static final ByteArrayColumn termMetaColumn = new ByteArrayColumn("termMetadata", StorageType.ZSTD);
private static final GammaCodedSequenceArrayColumn termPositionsColumn = new GammaCodedSequenceArrayColumn("termPositions", StorageType.ZSTD);
// Spans columns
private static final ColumnDesc<ByteArrayColumnReader, ByteArrayColumnWriter> spanCodesColumn = new ColumnDesc<>("spanCodes", ColumnTypes.BYTE_ARRAY, StorageType.ZSTD);
private static final ColumnDesc<GammaCodedSequenceArrayReader, GammaCodedSequenceArrayWriter> spansColumn = new ColumnDesc<>("spans", GammaCodedSequenceArrayColumn.TYPE, StorageType.ZSTD);
private static final ByteArrayColumn spanCodesColumn = new ByteArrayColumn("spanCodes", StorageType.ZSTD);
private static final GammaCodedSequenceArrayColumn spansColumn = new GammaCodedSequenceArrayColumn("spans", StorageType.ZSTD);
public static class KeywordsProjectionReader extends SlopTable {
private final StringColumnReader domainsReader;
private final VarintColumnReader ordinalsReader;
private final IntColumnReader htmlFeaturesReader;
private final LongColumnReader domainMetadataReader;
private final IntColumnReader lengthsReader;
private final TxtStringColumn.Reader domainsReader;
private final VarintColumn.Reader ordinalsReader;
private final IntColumn.Reader htmlFeaturesReader;
private final LongColumn.Reader domainMetadataReader;
private final IntColumn.Reader lengthsReader;
private final ObjectArrayColumnReader<String> keywordsReader;
private final ByteArrayColumnReader termMetaReader;
private final GammaCodedSequenceArrayReader termPositionsReader;
private final ObjectArrayColumn<String>.Reader keywordsReader;
private final ByteArrayColumn.Reader termMetaReader;
private final GammaCodedSequenceArrayColumn.Reader termPositionsReader;
private final ByteArrayColumnReader spanCodesReader;
private final GammaCodedSequenceArrayReader spansReader;
private final ByteArrayColumn.Reader spanCodesReader;
private final GammaCodedSequenceArrayColumn.Reader spansReader;
public KeywordsProjectionReader(SlopPageRef<SlopDocumentRecord> pageRef) throws IOException {
this(pageRef.baseDir(), pageRef.page());
@ -206,18 +203,18 @@ public record SlopDocumentRecord(
}
public static class MetadataReader extends SlopTable {
private final StringColumnReader domainsReader;
private final StringColumnReader urlsReader;
private final VarintColumnReader ordinalsReader;
private final StringColumnReader titlesReader;
private final StringColumnReader descriptionsReader;
private final TxtStringColumn.Reader domainsReader;
private final TxtStringColumn.Reader urlsReader;
private final VarintColumn.Reader ordinalsReader;
private final StringColumn.Reader titlesReader;
private final StringColumn.Reader descriptionsReader;
private final IntColumnReader htmlFeaturesReader;
private final StringColumnReader htmlStandardsReader;
private final IntColumnReader lengthsReader;
private final LongColumnReader hashesReader;
private final FloatColumnReader qualitiesReader;
private final IntColumnReader pubYearReader;
private final IntColumn.Reader htmlFeaturesReader;
private final EnumColumn.Reader htmlStandardsReader;
private final IntColumn.Reader lengthsReader;
private final LongColumn.Reader hashesReader;
private final FloatColumn.Reader qualitiesReader;
private final IntColumn.Reader pubYearReader;
public MetadataReader(SlopPageRef<SlopDocumentRecord> pageRef) throws IOException{
this(pageRef.baseDir(), pageRef.page());
@ -263,25 +260,25 @@ public record SlopDocumentRecord(
}
public static class Writer extends SlopTable {
private final StringColumnWriter domainsWriter;
private final StringColumnWriter urlsWriter;
private final VarintColumnWriter ordinalsWriter;
private final StringColumnWriter statesWriter;
private final StringColumnWriter stateReasonsWriter;
private final StringColumnWriter titlesWriter;
private final StringColumnWriter descriptionsWriter;
private final IntColumnWriter htmlFeaturesWriter;
private final StringColumnWriter htmlStandardsWriter;
private final IntColumnWriter lengthsWriter;
private final LongColumnWriter hashesWriter;
private final FloatColumnWriter qualitiesWriter;
private final LongColumnWriter domainMetadataWriter;
private final IntColumnWriter pubYearWriter;
private final ObjectArrayColumnWriter<String> keywordsWriter;
private final ByteArrayColumnWriter termMetaWriter;
private final GammaCodedSequenceArrayWriter termPositionsWriter;
private final ByteArrayColumnWriter spansCodesWriter;
private final GammaCodedSequenceArrayWriter spansWriter;
private final TxtStringColumn.Writer domainsWriter;
private final TxtStringColumn.Writer urlsWriter;
private final VarintColumn.Writer ordinalsWriter;
private final EnumColumn.Writer statesWriter;
private final StringColumn.Writer stateReasonsWriter;
private final StringColumn.Writer titlesWriter;
private final StringColumn.Writer descriptionsWriter;
private final IntColumn.Writer htmlFeaturesWriter;
private final EnumColumn.Writer htmlStandardsWriter;
private final IntColumn.Writer lengthsWriter;
private final LongColumn.Writer hashesWriter;
private final FloatColumn.Writer qualitiesWriter;
private final LongColumn.Writer domainMetadataWriter;
private final IntColumn.Writer pubYearWriter;
private final ObjectArrayColumn<String>.Writer keywordsWriter;
private final ByteArrayColumn.Writer termMetaWriter;
private final GammaCodedSequenceArrayColumn.Writer termPositionsWriter;
private final ByteArrayColumn.Writer spansCodesWriter;
private final GammaCodedSequenceArrayColumn.Writer spansWriter;
public Writer(Path baseDir, int page) throws IOException {
super(page);

View File

@ -1,10 +1,7 @@
package nu.marginalia.model.processed;
import nu.marginalia.slop.ColumnTypes;
import nu.marginalia.slop.column.string.StringColumnReader;
import nu.marginalia.slop.column.string.StringColumnWriter;
import nu.marginalia.slop.desc.ColumnDesc;
import nu.marginalia.slop.desc.SlopTable;
import nu.marginalia.slop.SlopTable;
import nu.marginalia.slop.column.string.TxtStringColumn;
import nu.marginalia.slop.desc.StorageType;
import java.io.IOException;
@ -15,16 +12,16 @@ public record SlopDomainLinkRecord(
String source,
String dest)
{
private static final ColumnDesc<StringColumnReader, StringColumnWriter> sourcesColumn = new ColumnDesc<>("source", ColumnTypes.TXTSTRING, StorageType.GZIP);
private static final ColumnDesc<StringColumnReader, StringColumnWriter> destsColumn = new ColumnDesc<>("dest", ColumnTypes.TXTSTRING, StorageType.GZIP);
private static final TxtStringColumn sourcesColumn = new TxtStringColumn("source", StorageType.GZIP);
private static final TxtStringColumn destsColumn = new TxtStringColumn("dest", StorageType.GZIP);
/**
 * Creates a {@link Reader} for one page of this table.
 * <p>
 * This is a thin convenience factory: it forwards both arguments unchanged to
 * the {@code Reader(Path, int)} constructor and returns the new instance.
 *
 * @param baseDir directory passed through to the {@code Reader} constructor
 * @param page    page number passed through to the {@code Reader} constructor
 * @return a newly constructed {@code Reader}
 * @throws IOException if the {@code Reader} constructor throws it
 */
public static Reader reader(Path baseDir, int page) throws IOException {
return new Reader(baseDir, page);
}
public static class Reader extends SlopTable {
private final StringColumnReader sourcesReader;
private final StringColumnReader destsReader;
private final TxtStringColumn.Reader sourcesReader;
private final TxtStringColumn.Reader destsReader;
public Reader(SlopPageRef<SlopDomainLinkRecord> page) throws IOException {
this(page.baseDir(), page.page());
@ -57,8 +54,8 @@ public record SlopDomainLinkRecord(
}
public static class Writer extends SlopTable {
private final StringColumnWriter sourcesWriter;
private final StringColumnWriter destsWriter;
private final TxtStringColumn.Writer sourcesWriter;
private final TxtStringColumn.Writer destsWriter;
public Writer(Path baseDir, int page) throws IOException {
super(page);

View File

@ -1,15 +1,10 @@
package nu.marginalia.model.processed;
import nu.marginalia.slop.ColumnTypes;
import nu.marginalia.slop.column.array.ObjectArrayColumnReader;
import nu.marginalia.slop.column.array.ObjectArrayColumnWriter;
import nu.marginalia.slop.column.primitive.IntColumnReader;
import nu.marginalia.slop.column.primitive.IntColumnWriter;
import nu.marginalia.slop.column.string.EnumColumnReader;
import nu.marginalia.slop.column.string.StringColumnReader;
import nu.marginalia.slop.column.string.StringColumnWriter;
import nu.marginalia.slop.desc.ColumnDesc;
import nu.marginalia.slop.desc.SlopTable;
import nu.marginalia.slop.SlopTable;
import nu.marginalia.slop.column.array.ObjectArrayColumn;
import nu.marginalia.slop.column.primitive.IntColumn;
import nu.marginalia.slop.column.string.EnumColumn;
import nu.marginalia.slop.column.string.TxtStringColumn;
import nu.marginalia.slop.desc.StorageType;
import java.io.IOException;
@ -33,20 +28,20 @@ public record SlopDomainRecord(
String ip)
{}
private static final ColumnDesc<StringColumnReader, StringColumnWriter> domainsColumn = new ColumnDesc<>("domain", ColumnTypes.TXTSTRING, StorageType.GZIP);
private static final ColumnDesc<EnumColumnReader, StringColumnWriter> statesColumn = new ColumnDesc<>("state", ColumnTypes.ENUM_LE, StorageType.PLAIN);
private static final ColumnDesc<StringColumnReader, StringColumnWriter> redirectDomainsColumn = new ColumnDesc<>("redirectDomain", ColumnTypes.TXTSTRING, StorageType.GZIP);
private static final ColumnDesc<StringColumnReader, StringColumnWriter> ipColumn = new ColumnDesc<>("ip", ColumnTypes.TXTSTRING, StorageType.GZIP);
private static final TxtStringColumn domainsColumn = new TxtStringColumn("domain", StorageType.GZIP);
private static final EnumColumn statesColumn = new EnumColumn("state", StorageType.PLAIN);
private static final TxtStringColumn redirectDomainsColumn = new TxtStringColumn("redirectDomain", StorageType.GZIP);
private static final TxtStringColumn ipColumn = new TxtStringColumn("ip", StorageType.GZIP);
private static final ColumnDesc<IntColumnReader, IntColumnWriter> knownUrlsColumn = new ColumnDesc<>("knownUrls", ColumnTypes.INT_LE, StorageType.PLAIN);
private static final ColumnDesc<IntColumnReader, IntColumnWriter> goodUrlsColumn = new ColumnDesc<>("goodUrls", ColumnTypes.INT_LE, StorageType.PLAIN);
private static final ColumnDesc<IntColumnReader, IntColumnWriter> visitedUrlsColumn = new ColumnDesc<>("visitedUrls", ColumnTypes.INT_LE, StorageType.PLAIN);
private static final IntColumn knownUrlsColumn = new IntColumn("knownUrls", StorageType.PLAIN);
private static final IntColumn goodUrlsColumn = new IntColumn("goodUrls", StorageType.PLAIN);
private static final IntColumn visitedUrlsColumn = new IntColumn("visitedUrls", StorageType.PLAIN);
private static final ColumnDesc<ObjectArrayColumnReader<String>, ObjectArrayColumnWriter<String>> rssFeedsColumn = new ColumnDesc<>("rssFeeds", ColumnTypes.TXTSTRING_ARRAY, StorageType.GZIP);
private static final ObjectArrayColumn<String> rssFeedsColumn = new TxtStringColumn("rssFeeds", StorageType.GZIP).asArray();
public static class DomainNameReader extends SlopTable {
private final StringColumnReader domainsReader;
private final TxtStringColumn.Reader domainsReader;
public DomainNameReader(SlopPageRef<SlopDomainRecord> page) throws IOException {
this(page.baseDir(), page.page());
@ -68,8 +63,8 @@ public record SlopDomainRecord(
}
public static class DomainWithIpReader extends SlopTable {
private final StringColumnReader domainsReader;
private final StringColumnReader ipReader;
private final TxtStringColumn.Reader domainsReader;
private final TxtStringColumn.Reader ipReader;
public DomainWithIpReader(SlopPageRef<SlopDomainRecord> page) throws IOException {
this(page.baseDir(), page.page());
@ -96,16 +91,16 @@ public record SlopDomainRecord(
}
public static class Reader extends SlopTable {
private final StringColumnReader domainsReader;
private final StringColumnReader statesReader;
private final StringColumnReader redirectReader;
private final StringColumnReader ipReader;
private final TxtStringColumn.Reader domainsReader;
private final EnumColumn.Reader statesReader;
private final TxtStringColumn.Reader redirectReader;
private final TxtStringColumn.Reader ipReader;
private final IntColumnReader knownUrlsReader;
private final IntColumnReader goodUrlsReader;
private final IntColumnReader visitedUrlsReader;
private final IntColumn.Reader knownUrlsReader;
private final IntColumn.Reader goodUrlsReader;
private final IntColumn.Reader visitedUrlsReader;
private final ObjectArrayColumnReader<String> rssFeedsReader;
private final ObjectArrayColumn<String>.Reader rssFeedsReader;
public Reader(SlopPageRef<SlopDomainRecord> page) throws IOException {
this(page.baseDir(), page.page());
@ -151,16 +146,16 @@ public record SlopDomainRecord(
}
public static class Writer extends SlopTable {
private final StringColumnWriter domainsWriter;
private final StringColumnWriter statesWriter;
private final StringColumnWriter redirectWriter;
private final StringColumnWriter ipWriter;
private final TxtStringColumn.Writer domainsWriter;
private final EnumColumn.Writer statesWriter;
private final TxtStringColumn.Writer redirectWriter;
private final TxtStringColumn.Writer ipWriter;
private final IntColumnWriter knownUrlsWriter;
private final IntColumnWriter goodUrlsWriter;
private final IntColumnWriter visitedUrlsWriter;
private final IntColumn.Writer knownUrlsWriter;
private final IntColumn.Writer goodUrlsWriter;
private final IntColumn.Writer visitedUrlsWriter;
private final ObjectArrayColumnWriter<String> rssFeedsWriter;
private final ObjectArrayColumn<String>.Writer rssFeedsWriter;
public Writer(Path baseDir, int page) throws IOException {
super(page);

View File

@ -226,7 +226,7 @@ dependencyResolutionManagement {
library('jetty-util','org.eclipse.jetty','jetty-util').version('9.4.54.v20240208')
library('jetty-servlet','org.eclipse.jetty','jetty-servlet').version('9.4.54.v20240208')
library('slop', 'nu.marginalia', 'slop').version('0.0.1-SNAPSHOT')
library('slop', 'nu.marginalia', 'slop').version('0.0.3-SNAPSHOT')
bundle('jetty', ['jetty-server', 'jetty-util', 'jetty-servlet'])