mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
(slop) Migrate to latest Slop version
This commit is contained in:
parent
2ad93ad41a
commit
75b0888032
@ -10,8 +10,8 @@ import nu.marginalia.index.journal.IndexJournal;
|
|||||||
import nu.marginalia.model.id.UrlIdCodec;
|
import nu.marginalia.model.id.UrlIdCodec;
|
||||||
import nu.marginalia.model.idx.DocumentMetadata;
|
import nu.marginalia.model.idx.DocumentMetadata;
|
||||||
import nu.marginalia.process.control.ProcessHeartbeat;
|
import nu.marginalia.process.control.ProcessHeartbeat;
|
||||||
import nu.marginalia.slop.column.primitive.LongColumnReader;
|
import nu.marginalia.slop.SlopTable;
|
||||||
import nu.marginalia.slop.desc.SlopTable;
|
import nu.marginalia.slop.column.primitive.LongColumn;
|
||||||
import org.roaringbitmap.longlong.LongConsumer;
|
import org.roaringbitmap.longlong.LongConsumer;
|
||||||
import org.roaringbitmap.longlong.Roaring64Bitmap;
|
import org.roaringbitmap.longlong.Roaring64Bitmap;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
@ -153,7 +153,7 @@ public class ForwardIndexConverter {
|
|||||||
|
|
||||||
for (var instance : journalReader.pages()) {
|
for (var instance : journalReader.pages()) {
|
||||||
try (var slopTable = new SlopTable(instance.page())) {
|
try (var slopTable = new SlopTable(instance.page())) {
|
||||||
LongColumnReader idReader = instance.openCombinedId(slopTable);
|
LongColumn.Reader idReader = instance.openCombinedId(slopTable);
|
||||||
|
|
||||||
while (idReader.hasRemaining()) {
|
while (idReader.hasRemaining()) {
|
||||||
rbm.add(idReader.get());
|
rbm.add(idReader.get());
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
package nu.marginalia.index.journal;
|
package nu.marginalia.index.journal;
|
||||||
|
|
||||||
import nu.marginalia.slop.desc.SlopTable;
|
import nu.marginalia.slop.SlopTable;
|
||||||
|
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
|
@ -1,36 +1,28 @@
|
|||||||
package nu.marginalia.index.journal;
|
package nu.marginalia.index.journal;
|
||||||
|
|
||||||
import nu.marginalia.sequence.slop.GammaCodedSequenceArrayColumn;
|
import nu.marginalia.sequence.slop.GammaCodedSequenceArrayColumn;
|
||||||
import nu.marginalia.sequence.slop.GammaCodedSequenceArrayReader;
|
import nu.marginalia.slop.SlopTable;
|
||||||
import nu.marginalia.sequence.slop.GammaCodedSequenceArrayWriter;
|
import nu.marginalia.slop.column.array.ByteArrayColumn;
|
||||||
import nu.marginalia.slop.ColumnTypes;
|
import nu.marginalia.slop.column.array.LongArrayColumn;
|
||||||
import nu.marginalia.slop.column.array.ByteArrayColumnReader;
|
import nu.marginalia.slop.column.primitive.IntColumn;
|
||||||
import nu.marginalia.slop.column.array.ByteArrayColumnWriter;
|
import nu.marginalia.slop.column.primitive.LongColumn;
|
||||||
import nu.marginalia.slop.column.array.LongArrayColumnReader;
|
|
||||||
import nu.marginalia.slop.column.array.LongArrayColumnWriter;
|
|
||||||
import nu.marginalia.slop.column.primitive.IntColumnReader;
|
|
||||||
import nu.marginalia.slop.column.primitive.IntColumnWriter;
|
|
||||||
import nu.marginalia.slop.column.primitive.LongColumnReader;
|
|
||||||
import nu.marginalia.slop.column.primitive.LongColumnWriter;
|
|
||||||
import nu.marginalia.slop.desc.ColumnDesc;
|
|
||||||
import nu.marginalia.slop.desc.SlopTable;
|
|
||||||
import nu.marginalia.slop.desc.StorageType;
|
import nu.marginalia.slop.desc.StorageType;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
|
|
||||||
public record IndexJournalPage(Path baseDir, int page) {
|
public record IndexJournalPage(Path baseDir, int page) {
|
||||||
public static final ColumnDesc<IntColumnReader, IntColumnWriter> features = new ColumnDesc<>("features", ColumnTypes.INT_LE, StorageType.PLAIN);
|
/** Column definition for the int column named "features" (StorageType.PLAIN); read via {@link #openFeatures}. Declared final so the shared definition cannot be reassigned. */
public static final IntColumn features = new IntColumn("features", StorageType.PLAIN);
|
||||||
public static final ColumnDesc<IntColumnReader, IntColumnWriter> size = new ColumnDesc<>("size", ColumnTypes.INT_LE, StorageType.PLAIN);
|
/** Column definition for the int column named "size" (StorageType.PLAIN); read via {@link #openSize}. Declared final so the shared definition cannot be reassigned. */
public static final IntColumn size = new IntColumn("size", StorageType.PLAIN);
|
||||||
public static final ColumnDesc<LongColumnReader, LongColumnWriter> combinedId = new ColumnDesc<>("combinedId", ColumnTypes.LONG_LE, StorageType.PLAIN);
|
/** Column definition for the long column named "combinedId" (StorageType.PLAIN); read via {@link #openCombinedId}. Declared final so the shared definition cannot be reassigned. */
public static final LongColumn combinedId = new LongColumn("combinedId", StorageType.PLAIN);
|
||||||
public static final ColumnDesc<LongColumnReader, LongColumnWriter> documentMeta = new ColumnDesc<>("documentMeta", ColumnTypes.LONG_LE, StorageType.PLAIN);
|
/** Column definition for the long column named "documentMeta" (StorageType.PLAIN); read via {@link #openDocumentMeta}. Declared final so the shared definition cannot be reassigned. */
public static final LongColumn documentMeta = new LongColumn("documentMeta", StorageType.PLAIN);
|
||||||
|
|
||||||
public static final ColumnDesc<LongArrayColumnReader, LongArrayColumnWriter> termIds = new ColumnDesc<>("termIds", ColumnTypes.LONG_ARRAY_LE, StorageType.ZSTD);
|
/** Column definition for the long-array column named "termIds" (StorageType.ZSTD); read via {@link #openTermIds}. Declared final so the shared definition cannot be reassigned. */
public static final LongArrayColumn termIds = new LongArrayColumn("termIds", StorageType.ZSTD);
|
||||||
public static final ColumnDesc<ByteArrayColumnReader, ByteArrayColumnWriter> termMeta = new ColumnDesc<>("termMetadata", ColumnTypes.BYTE_ARRAY, StorageType.ZSTD);
|
/** Column definition for the byte-array column stored under the name "termMetadata" (StorageType.ZSTD); read via {@link #openTermMetadata}. Note the on-disk name differs from the field name. Declared final so the shared definition cannot be reassigned. */
public static final ByteArrayColumn termMeta = new ByteArrayColumn("termMetadata", StorageType.ZSTD);
|
||||||
public static final ColumnDesc<GammaCodedSequenceArrayReader, GammaCodedSequenceArrayWriter> positions = new ColumnDesc<>("termPositions", GammaCodedSequenceArrayColumn.TYPE, StorageType.ZSTD);
|
/** Column definition for the gamma-coded sequence array column stored under the name "termPositions" (StorageType.ZSTD); read via {@link #openTermPositions}. Note the on-disk name differs from the field name. Declared final so the shared definition cannot be reassigned. */
public static final GammaCodedSequenceArrayColumn positions = new GammaCodedSequenceArrayColumn("termPositions", StorageType.ZSTD);
|
||||||
|
|
||||||
public static final ColumnDesc<ByteArrayColumnReader, ByteArrayColumnWriter> spanCodes = new ColumnDesc<>("spanCodes", ColumnTypes.BYTE_ARRAY, StorageType.ZSTD);
|
/** Column definition for the byte-array column named "spanCodes" (StorageType.ZSTD); read via {@link #openSpanCodes}. Declared final so the shared definition cannot be reassigned. */
public static final ByteArrayColumn spanCodes = new ByteArrayColumn("spanCodes", StorageType.ZSTD);
|
||||||
public static final ColumnDesc<GammaCodedSequenceArrayReader, GammaCodedSequenceArrayWriter> spans = new ColumnDesc<>("spans", GammaCodedSequenceArrayColumn.TYPE, StorageType.ZSTD);
|
/** Column definition for the gamma-coded sequence array column named "spans" (StorageType.ZSTD); read via {@link #openSpans}. Declared final so the shared definition cannot be reassigned. */
public static final GammaCodedSequenceArrayColumn spans = new GammaCodedSequenceArrayColumn("spans", StorageType.ZSTD);
|
||||||
|
|
||||||
public IndexJournalPage {
|
public IndexJournalPage {
|
||||||
if (!baseDir.toFile().isDirectory()) {
|
if (!baseDir.toFile().isDirectory()) {
|
||||||
@ -38,40 +30,40 @@ public record IndexJournalPage(Path baseDir, int page) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public LongColumnReader openCombinedId(SlopTable table) throws IOException {
|
public LongColumn.Reader openCombinedId(SlopTable table) throws IOException {
|
||||||
return combinedId.open(table, baseDir);
|
return combinedId.open(table, baseDir);
|
||||||
}
|
}
|
||||||
|
|
||||||
public LongColumnReader openDocumentMeta(SlopTable table) throws IOException {
|
public LongColumn.Reader openDocumentMeta(SlopTable table) throws IOException {
|
||||||
return documentMeta.open(table, baseDir);
|
return documentMeta.open(table, baseDir);
|
||||||
}
|
}
|
||||||
|
|
||||||
public IntColumnReader openFeatures(SlopTable table) throws IOException {
|
public IntColumn.Reader openFeatures(SlopTable table) throws IOException {
|
||||||
return features.open(table, baseDir);
|
return features.open(table, baseDir);
|
||||||
}
|
}
|
||||||
|
|
||||||
public IntColumnReader openSize(SlopTable table) throws IOException {
|
public IntColumn.Reader openSize(SlopTable table) throws IOException {
|
||||||
return size.open(table, baseDir);
|
return size.open(table, baseDir);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public LongArrayColumnReader openTermIds(SlopTable table) throws IOException {
|
public LongArrayColumn.Reader openTermIds(SlopTable table) throws IOException {
|
||||||
return termIds.open(table, baseDir);
|
return termIds.open(table, baseDir);
|
||||||
}
|
}
|
||||||
|
|
||||||
public ByteArrayColumnReader openTermMetadata(SlopTable table) throws IOException {
|
public ByteArrayColumn.Reader openTermMetadata(SlopTable table) throws IOException {
|
||||||
return termMeta.open(table, baseDir);
|
return termMeta.open(table, baseDir);
|
||||||
}
|
}
|
||||||
|
|
||||||
public GammaCodedSequenceArrayReader openTermPositions(SlopTable table) throws IOException {
|
public GammaCodedSequenceArrayColumn.Reader openTermPositions(SlopTable table) throws IOException {
|
||||||
return positions.open(table, baseDir);
|
return positions.open(table, baseDir);
|
||||||
}
|
}
|
||||||
|
|
||||||
public GammaCodedSequenceArrayReader openSpans(SlopTable table) throws IOException {
|
public GammaCodedSequenceArrayColumn.Reader openSpans(SlopTable table) throws IOException {
|
||||||
return spans.open(table, baseDir);
|
return spans.open(table, baseDir);
|
||||||
}
|
}
|
||||||
|
|
||||||
public ByteArrayColumnReader openSpanCodes(SlopTable table) throws IOException {
|
public ByteArrayColumn.Reader openSpanCodes(SlopTable table) throws IOException {
|
||||||
return spanCodes.open(table, baseDir);
|
return spanCodes.open(table, baseDir);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -3,12 +3,12 @@ package nu.marginalia.index.journal;
|
|||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import nu.marginalia.hash.MurmurHash3_128;
|
import nu.marginalia.hash.MurmurHash3_128;
|
||||||
import nu.marginalia.model.processed.SlopDocumentRecord;
|
import nu.marginalia.model.processed.SlopDocumentRecord;
|
||||||
import nu.marginalia.sequence.slop.GammaCodedSequenceArrayWriter;
|
import nu.marginalia.sequence.slop.GammaCodedSequenceArrayColumn;
|
||||||
import nu.marginalia.slop.column.array.ByteArrayColumnWriter;
|
import nu.marginalia.slop.SlopTable;
|
||||||
import nu.marginalia.slop.column.array.LongArrayColumnWriter;
|
import nu.marginalia.slop.column.array.ByteArrayColumn;
|
||||||
import nu.marginalia.slop.column.primitive.IntColumnWriter;
|
import nu.marginalia.slop.column.array.LongArrayColumn;
|
||||||
import nu.marginalia.slop.column.primitive.LongColumnWriter;
|
import nu.marginalia.slop.column.primitive.IntColumn;
|
||||||
import nu.marginalia.slop.desc.SlopTable;
|
import nu.marginalia.slop.column.primitive.LongColumn;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
@ -17,17 +17,17 @@ import java.util.List;
|
|||||||
|
|
||||||
public class IndexJournalSlopWriter extends SlopTable {
|
public class IndexJournalSlopWriter extends SlopTable {
|
||||||
|
|
||||||
private final IntColumnWriter featuresWriter;
|
private final IntColumn.Writer featuresWriter;
|
||||||
private final IntColumnWriter sizeWriter;
|
private final IntColumn.Writer sizeWriter;
|
||||||
private final LongColumnWriter combinedIdWriter;
|
private final LongColumn.Writer combinedIdWriter;
|
||||||
private final LongColumnWriter documentMetaWriter;
|
private final LongColumn.Writer documentMetaWriter;
|
||||||
|
|
||||||
private final LongArrayColumnWriter termIdsWriter;
|
private final LongArrayColumn.Writer termIdsWriter;
|
||||||
private final ByteArrayColumnWriter termMetadataWriter;
|
private final ByteArrayColumn.Writer termMetadataWriter;
|
||||||
private final GammaCodedSequenceArrayWriter termPositionsWriter;
|
private final GammaCodedSequenceArrayColumn.Writer termPositionsWriter;
|
||||||
|
|
||||||
private final GammaCodedSequenceArrayWriter spansWriter;
|
private final GammaCodedSequenceArrayColumn.Writer spansWriter;
|
||||||
private final ByteArrayColumnWriter spanCodesWriter;
|
private final ByteArrayColumn.Writer spanCodesWriter;
|
||||||
|
|
||||||
private static final MurmurHash3_128 hash = new MurmurHash3_128();
|
private static final MurmurHash3_128 hash = new MurmurHash3_128();
|
||||||
|
|
||||||
|
@ -7,7 +7,7 @@ import nu.marginalia.index.construction.DocIdRewriter;
|
|||||||
import nu.marginalia.index.construction.PositionsFileConstructor;
|
import nu.marginalia.index.construction.PositionsFileConstructor;
|
||||||
import nu.marginalia.index.journal.IndexJournalPage;
|
import nu.marginalia.index.journal.IndexJournalPage;
|
||||||
import nu.marginalia.rwf.RandomFileAssembler;
|
import nu.marginalia.rwf.RandomFileAssembler;
|
||||||
import nu.marginalia.slop.desc.SlopTable;
|
import nu.marginalia.slop.SlopTable;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
@ -6,7 +6,7 @@ import it.unimi.dsi.fastutil.longs.LongIterator;
|
|||||||
import nu.marginalia.array.LongArray;
|
import nu.marginalia.array.LongArray;
|
||||||
import nu.marginalia.array.LongArrayFactory;
|
import nu.marginalia.array.LongArrayFactory;
|
||||||
import nu.marginalia.index.journal.IndexJournalPage;
|
import nu.marginalia.index.journal.IndexJournalPage;
|
||||||
import nu.marginalia.slop.desc.SlopTable;
|
import nu.marginalia.slop.SlopTable;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
|
@ -6,7 +6,7 @@ import nu.marginalia.array.LongArrayFactory;
|
|||||||
import nu.marginalia.index.construction.DocIdRewriter;
|
import nu.marginalia.index.construction.DocIdRewriter;
|
||||||
import nu.marginalia.index.journal.IndexJournalPage;
|
import nu.marginalia.index.journal.IndexJournalPage;
|
||||||
import nu.marginalia.rwf.RandomFileAssembler;
|
import nu.marginalia.rwf.RandomFileAssembler;
|
||||||
import nu.marginalia.slop.desc.SlopTable;
|
import nu.marginalia.slop.SlopTable;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
@ -6,7 +6,7 @@ import it.unimi.dsi.fastutil.longs.LongIterator;
|
|||||||
import nu.marginalia.array.LongArray;
|
import nu.marginalia.array.LongArray;
|
||||||
import nu.marginalia.array.LongArrayFactory;
|
import nu.marginalia.array.LongArrayFactory;
|
||||||
import nu.marginalia.index.journal.IndexJournalPage;
|
import nu.marginalia.index.journal.IndexJournalPage;
|
||||||
import nu.marginalia.slop.desc.SlopTable;
|
import nu.marginalia.slop.SlopTable;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
|
@ -1,13 +1,12 @@
|
|||||||
package nu.marginalia.sequence.slop;
|
package nu.marginalia.sequence.slop;
|
||||||
|
|
||||||
import nu.marginalia.sequence.GammaCodedSequence;
|
import nu.marginalia.sequence.GammaCodedSequence;
|
||||||
import nu.marginalia.slop.ColumnTypes;
|
import nu.marginalia.slop.column.AbstractColumn;
|
||||||
|
import nu.marginalia.slop.column.AbstractObjectColumn;
|
||||||
|
import nu.marginalia.slop.column.ObjectColumnReader;
|
||||||
|
import nu.marginalia.slop.column.ObjectColumnWriter;
|
||||||
import nu.marginalia.slop.column.dynamic.VarintColumn;
|
import nu.marginalia.slop.column.dynamic.VarintColumn;
|
||||||
import nu.marginalia.slop.column.dynamic.VarintColumnReader;
|
|
||||||
import nu.marginalia.slop.column.dynamic.VarintColumnWriter;
|
|
||||||
import nu.marginalia.slop.desc.ColumnDesc;
|
|
||||||
import nu.marginalia.slop.desc.ColumnFunction;
|
import nu.marginalia.slop.desc.ColumnFunction;
|
||||||
import nu.marginalia.slop.desc.ColumnType;
|
|
||||||
import nu.marginalia.slop.desc.StorageType;
|
import nu.marginalia.slop.desc.StorageType;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
@ -18,45 +17,54 @@ import java.util.ArrayList;
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
/** Slop column extension for storing GammaCodedSequence objects. */
|
/** Slop column extension for storing lists of GammaCodedSequence objects. */
|
||||||
public class GammaCodedSequenceArrayColumn {
|
public class GammaCodedSequenceArrayColumn extends AbstractObjectColumn<List<GammaCodedSequence>, GammaCodedSequenceArrayColumn.Reader, GammaCodedSequenceArrayColumn.Writer> {
|
||||||
|
|
||||||
public static ColumnType<GammaCodedSequenceArrayReader, GammaCodedSequenceArrayWriter> TYPE = ColumnTypes.register("s8[]+gcs[]", ByteOrder.nativeOrder(), GammaCodedSequenceArrayColumn::open, GammaCodedSequenceArrayColumn::create);
|
private final VarintColumn groupsColumn;
|
||||||
|
private final GammaCodedSequenceColumn dataColumn;
|
||||||
|
|
||||||
public static GammaCodedSequenceArrayReader open(Path path, ColumnDesc columnDesc) throws IOException {
|
public GammaCodedSequenceArrayColumn(String name) {
|
||||||
return new Reader(columnDesc,
|
this(name, StorageType.PLAIN);
|
||||||
GammaCodedSequenceColumn.open(path, columnDesc),
|
}
|
||||||
VarintColumn.open(path, columnDesc.createSupplementaryColumn(ColumnFunction.GROUP_LENGTH,
|
|
||||||
ColumnTypes.VARINT_LE,
|
public GammaCodedSequenceArrayColumn(String name, StorageType storageType) {
|
||||||
StorageType.PLAIN)
|
super(name,
|
||||||
)
|
"gcs[]",
|
||||||
|
ByteOrder.nativeOrder(),
|
||||||
|
ColumnFunction.DATA,
|
||||||
|
storageType);
|
||||||
|
|
||||||
|
groupsColumn = new VarintColumn(name, ColumnFunction.GROUP_LENGTH, storageType);
|
||||||
|
dataColumn = new GammaCodedSequenceColumn(name);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Writer createUnregistered(Path path, int page) throws IOException {
|
||||||
|
return new Writer(
|
||||||
|
dataColumn.createUnregistered(path, page),
|
||||||
|
groupsColumn.createUnregistered(path, page)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static GammaCodedSequenceArrayWriter create(Path path, ColumnDesc columnDesc) throws IOException {
|
public Reader openUnregistered(Path path, int page) throws IOException {
|
||||||
return new Writer(columnDesc,
|
return new Reader(
|
||||||
GammaCodedSequenceColumn.create(path, columnDesc),
|
dataColumn.openUnregistered(path, page),
|
||||||
VarintColumn.create(path, columnDesc.createSupplementaryColumn(ColumnFunction.GROUP_LENGTH,
|
groupsColumn.openUnregistered(path, page)
|
||||||
ColumnTypes.VARINT_LE,
|
|
||||||
StorageType.PLAIN)
|
|
||||||
)
|
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static class Writer implements GammaCodedSequenceArrayWriter {
|
|
||||||
private final VarintColumnWriter groupsWriter;
|
|
||||||
private final GammaCodedSequenceWriter dataWriter;
|
|
||||||
private final ColumnDesc<?, ?> columnDesc;
|
|
||||||
|
|
||||||
public Writer(ColumnDesc<?, ?> columnDesc, GammaCodedSequenceWriter dataWriter, VarintColumnWriter groupsWriter)
|
public class Writer implements ObjectColumnWriter<List<GammaCodedSequence>> {
|
||||||
|
private final VarintColumn.Writer groupsWriter;
|
||||||
|
private final GammaCodedSequenceColumn.Writer dataWriter;
|
||||||
|
|
||||||
|
Writer(GammaCodedSequenceColumn.Writer dataWriter, VarintColumn.Writer groupsWriter)
|
||||||
{
|
{
|
||||||
this.groupsWriter = groupsWriter;
|
this.groupsWriter = groupsWriter;
|
||||||
this.dataWriter = dataWriter;
|
this.dataWriter = dataWriter;
|
||||||
this.columnDesc = columnDesc;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public ColumnDesc<?, ?> columnDesc() {
|
public AbstractColumn<?, ?> columnDesc() {
|
||||||
return columnDesc;
|
return GammaCodedSequenceArrayColumn.this;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@ -77,20 +85,18 @@ public class GammaCodedSequenceArrayColumn {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static class Reader implements GammaCodedSequenceArrayReader {
|
public class Reader implements ObjectColumnReader<List<GammaCodedSequence>> {
|
||||||
private final GammaCodedSequenceReader dataReader;
|
private final GammaCodedSequenceColumn.Reader dataReader;
|
||||||
private final VarintColumnReader groupsReader;
|
private final VarintColumn.Reader groupsReader;
|
||||||
private final ColumnDesc<?, ?> columnDesc;
|
|
||||||
|
|
||||||
public Reader(ColumnDesc<?, ?> columnDesc, GammaCodedSequenceReader dataReader, VarintColumnReader groupsReader) throws IOException {
|
public Reader(GammaCodedSequenceColumn.Reader dataReader, VarintColumn.Reader groupsReader) {
|
||||||
this.dataReader = dataReader;
|
this.dataReader = dataReader;
|
||||||
this.groupsReader = groupsReader;
|
this.groupsReader = groupsReader;
|
||||||
this.columnDesc = columnDesc;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public ColumnDesc<?, ?> columnDesc() {
|
public AbstractColumn<?, ?> columnDesc() {
|
||||||
return columnDesc;
|
return GammaCodedSequenceArrayColumn.this;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@ -123,7 +129,6 @@ public class GammaCodedSequenceArrayColumn {
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public List<ByteBuffer> getData(ByteBuffer workArea) throws IOException {
|
public List<ByteBuffer> getData(ByteBuffer workArea) throws IOException {
|
||||||
int count = groupsReader.get();
|
int count = groupsReader.get();
|
||||||
var ret = new ArrayList<ByteBuffer>(count);
|
var ret = new ArrayList<ByteBuffer>(count);
|
||||||
|
@ -1,32 +0,0 @@
|
|||||||
package nu.marginalia.sequence.slop;
|
|
||||||
|
|
||||||
import nu.marginalia.sequence.GammaCodedSequence;
|
|
||||||
import nu.marginalia.slop.column.ColumnReader;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.nio.ByteBuffer;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
public interface GammaCodedSequenceArrayReader extends AutoCloseable, ColumnReader {
|
|
||||||
/** Read the next gamma-coded sequence from the column. Unlike most other
|
|
||||||
* readers, this method requires an intermediate buffer to use for reading
|
|
||||||
* the sequence. As this buffer typically needs to be fairly large to accommodate
|
|
||||||
* the largest possible sequence, it is not practical to allocate a new buffer
|
|
||||||
* for each call to this method. Instead, the caller should allocate a buffer
|
|
||||||
* once and reuse it for each call to this method.
|
|
||||||
*
|
|
||||||
* @return The next gamma-coded sequence.
|
|
||||||
*/
|
|
||||||
List<GammaCodedSequence> get() throws IOException;
|
|
||||||
|
|
||||||
/** Read just the data portion of the next gamma-coded sequence from the column.
|
|
||||||
* This method is useful when the caller is only interested in the data portion
|
|
||||||
* of the sequence and does not want to decode the values.
|
|
||||||
*
|
|
||||||
* @param workArea A buffer to use for reading the data.
|
|
||||||
* @return slices of the work buffer containing the data.
|
|
||||||
*/
|
|
||||||
List<ByteBuffer> getData(ByteBuffer workArea) throws IOException;
|
|
||||||
|
|
||||||
void close() throws IOException;
|
|
||||||
}
|
|
@ -1,12 +0,0 @@
|
|||||||
package nu.marginalia.sequence.slop;
|
|
||||||
|
|
||||||
import nu.marginalia.sequence.GammaCodedSequence;
|
|
||||||
import nu.marginalia.slop.column.ColumnWriter;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
public interface GammaCodedSequenceArrayWriter extends AutoCloseable, ColumnWriter {
|
|
||||||
void put(List<GammaCodedSequence> sequence) throws IOException;
|
|
||||||
void close() throws IOException;
|
|
||||||
}
|
|
@ -1,13 +1,12 @@
|
|||||||
package nu.marginalia.sequence.slop;
|
package nu.marginalia.sequence.slop;
|
||||||
|
|
||||||
import nu.marginalia.sequence.GammaCodedSequence;
|
import nu.marginalia.sequence.GammaCodedSequence;
|
||||||
import nu.marginalia.slop.ColumnTypes;
|
import nu.marginalia.slop.column.AbstractColumn;
|
||||||
|
import nu.marginalia.slop.column.AbstractObjectColumn;
|
||||||
|
import nu.marginalia.slop.column.ObjectColumnReader;
|
||||||
|
import nu.marginalia.slop.column.ObjectColumnWriter;
|
||||||
import nu.marginalia.slop.column.dynamic.VarintColumn;
|
import nu.marginalia.slop.column.dynamic.VarintColumn;
|
||||||
import nu.marginalia.slop.column.dynamic.VarintColumnReader;
|
|
||||||
import nu.marginalia.slop.column.dynamic.VarintColumnWriter;
|
|
||||||
import nu.marginalia.slop.desc.ColumnDesc;
|
|
||||||
import nu.marginalia.slop.desc.ColumnFunction;
|
import nu.marginalia.slop.desc.ColumnFunction;
|
||||||
import nu.marginalia.slop.desc.ColumnType;
|
|
||||||
import nu.marginalia.slop.desc.StorageType;
|
import nu.marginalia.slop.desc.StorageType;
|
||||||
import nu.marginalia.slop.storage.Storage;
|
import nu.marginalia.slop.storage.Storage;
|
||||||
import nu.marginalia.slop.storage.StorageReader;
|
import nu.marginalia.slop.storage.StorageReader;
|
||||||
@ -19,48 +18,53 @@ import java.nio.ByteOrder;
|
|||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
|
|
||||||
/** Slop column extension for storing GammaCodedSequence objects. */
|
/** Slop column extension for storing GammaCodedSequence objects. */
|
||||||
public class GammaCodedSequenceColumn {
|
public class GammaCodedSequenceColumn extends AbstractObjectColumn<GammaCodedSequence, GammaCodedSequenceColumn.Reader, GammaCodedSequenceColumn.Writer> {
|
||||||
|
|
||||||
public static ColumnType<GammaCodedSequenceReader, GammaCodedSequenceWriter> TYPE = ColumnTypes.register("s8[]+gcs", ByteOrder.nativeOrder(), GammaCodedSequenceColumn::open, GammaCodedSequenceColumn::create);
|
private final VarintColumn indexColumn;
|
||||||
|
|
||||||
public static GammaCodedSequenceReader open(Path path, ColumnDesc columnDesc) throws IOException {
|
public GammaCodedSequenceColumn(String name) {
|
||||||
return new Reader(columnDesc,
|
this(name, StorageType.PLAIN);
|
||||||
Storage.reader(path, columnDesc, false), // note we must never pass aligned=true here, as the data is not guaranteed alignment
|
}
|
||||||
VarintColumn.open(path, columnDesc.createSupplementaryColumn(ColumnFunction.DATA_LEN,
|
|
||||||
ColumnTypes.VARINT_LE,
|
public GammaCodedSequenceColumn(String name, StorageType storageType) {
|
||||||
StorageType.PLAIN)
|
super(name,
|
||||||
)
|
"gamma",
|
||||||
|
ByteOrder.nativeOrder(),
|
||||||
|
ColumnFunction.DATA,
|
||||||
|
storageType);
|
||||||
|
|
||||||
|
indexColumn = new VarintColumn(name, ColumnFunction.DATA_LEN, StorageType.PLAIN);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Writer createUnregistered(Path path, int page) throws IOException {
|
||||||
|
return new Writer(
|
||||||
|
Storage.writer(path, this, page),
|
||||||
|
indexColumn.createUnregistered(path, page)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static GammaCodedSequenceWriter create(Path path, ColumnDesc columnDesc) throws IOException {
|
public Reader openUnregistered(Path path, int page) throws IOException {
|
||||||
return new Writer(columnDesc,
|
return new Reader(
|
||||||
Storage.writer(path, columnDesc),
|
Storage.reader(path, this, page, false),
|
||||||
VarintColumn.create(path, columnDesc.createSupplementaryColumn(ColumnFunction.DATA_LEN,
|
indexColumn.openUnregistered(path, page)
|
||||||
ColumnTypes.VARINT_LE,
|
|
||||||
StorageType.PLAIN)
|
|
||||||
)
|
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static class Writer implements GammaCodedSequenceWriter {
|
public class Writer implements ObjectColumnWriter<GammaCodedSequence> {
|
||||||
private final VarintColumnWriter indexWriter;
|
private final VarintColumn.Writer indexWriter;
|
||||||
private final ColumnDesc<?, ?> columnDesc;
|
|
||||||
private final StorageWriter storage;
|
private final StorageWriter storage;
|
||||||
|
|
||||||
public Writer(ColumnDesc<?, ?> columnDesc,
|
public Writer(StorageWriter storage,
|
||||||
StorageWriter storage,
|
VarintColumn.Writer indexWriter)
|
||||||
VarintColumnWriter indexWriter)
|
|
||||||
{
|
{
|
||||||
this.columnDesc = columnDesc;
|
|
||||||
this.storage = storage;
|
this.storage = storage;
|
||||||
|
|
||||||
this.indexWriter = indexWriter;
|
this.indexWriter = indexWriter;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public ColumnDesc<?, ?> columnDesc() {
|
public AbstractColumn<?, ?> columnDesc() {
|
||||||
return columnDesc;
|
return GammaCodedSequenceColumn.this;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@ -82,20 +86,18 @@ public class GammaCodedSequenceColumn {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static class Reader implements GammaCodedSequenceReader {
|
public class Reader implements ObjectColumnReader<GammaCodedSequence> {
|
||||||
private final VarintColumnReader indexReader;
|
private final VarintColumn.Reader indexReader;
|
||||||
private final ColumnDesc<?, ?> columnDesc;
|
|
||||||
private final StorageReader storage;
|
private final StorageReader storage;
|
||||||
|
|
||||||
public Reader(ColumnDesc<?, ?> columnDesc, StorageReader reader, VarintColumnReader indexReader) throws IOException {
|
Reader(StorageReader reader, VarintColumn.Reader indexReader) throws IOException {
|
||||||
this.columnDesc = columnDesc;
|
|
||||||
this.storage = reader;
|
this.storage = reader;
|
||||||
this.indexReader = indexReader;
|
this.indexReader = indexReader;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public ColumnDesc<?, ?> columnDesc() {
|
public AbstractColumn<?, ?> columnDesc() {
|
||||||
return columnDesc;
|
return GammaCodedSequenceColumn.this;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@ -126,7 +128,6 @@ public class GammaCodedSequenceColumn {
|
|||||||
return new GammaCodedSequence(dest);
|
return new GammaCodedSequence(dest);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public void getData(ByteBuffer workArea) throws IOException {
|
public void getData(ByteBuffer workArea) throws IOException {
|
||||||
int size = indexReader.get();
|
int size = indexReader.get();
|
||||||
|
|
||||||
|
@ -1,33 +0,0 @@
|
|||||||
package nu.marginalia.sequence.slop;
|
|
||||||
|
|
||||||
import nu.marginalia.sequence.GammaCodedSequence;
|
|
||||||
import nu.marginalia.slop.column.ColumnReader;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.nio.ByteBuffer;
|
|
||||||
|
|
||||||
public interface GammaCodedSequenceReader extends AutoCloseable, ColumnReader {
|
|
||||||
/** Read the next gamma-coded sequence from the column. Unlike most other
|
|
||||||
* readers, this method requires an intermediate buffer to use for reading
|
|
||||||
* the sequence. As this buffer typically needs to be fairly large to accommodate
|
|
||||||
* the largest possible sequence, it is not practical to allocate a new buffer
|
|
||||||
* for each call to this method. Instead, the caller should allocate a buffer
|
|
||||||
* once and reuse it for each call to this method.
|
|
||||||
*
|
|
||||||
* @return The next gamma-coded sequence.
|
|
||||||
*/
|
|
||||||
GammaCodedSequence get() throws IOException;
|
|
||||||
|
|
||||||
/** Read just the data portion of the next gamma-coded sequence from the column.
|
|
||||||
* This method is useful when the caller is only interested in the data portion
|
|
||||||
* of the sequence and does not want to decode the values.
|
|
||||||
*
|
|
||||||
* The position of the buffer is advanced to the end of the data that has just been read,
|
|
||||||
* and the limit remains the same.
|
|
||||||
*
|
|
||||||
* @param workArea A buffer to use for reading the data.
|
|
||||||
*/
|
|
||||||
void getData(ByteBuffer workArea) throws IOException;
|
|
||||||
|
|
||||||
void close() throws IOException;
|
|
||||||
}
|
|
@ -1,11 +0,0 @@
|
|||||||
package nu.marginalia.sequence.slop;
|
|
||||||
|
|
||||||
import nu.marginalia.sequence.GammaCodedSequence;
|
|
||||||
import nu.marginalia.slop.column.ColumnWriter;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
public interface GammaCodedSequenceWriter extends AutoCloseable, ColumnWriter {
|
|
||||||
void put(GammaCodedSequence sequence) throws IOException;
|
|
||||||
void close() throws IOException;
|
|
||||||
}
|
|
@ -3,21 +3,16 @@ package nu.marginalia.model.processed;
|
|||||||
import lombok.Builder;
|
import lombok.Builder;
|
||||||
import nu.marginalia.sequence.GammaCodedSequence;
|
import nu.marginalia.sequence.GammaCodedSequence;
|
||||||
import nu.marginalia.sequence.slop.GammaCodedSequenceArrayColumn;
|
import nu.marginalia.sequence.slop.GammaCodedSequenceArrayColumn;
|
||||||
import nu.marginalia.sequence.slop.GammaCodedSequenceArrayReader;
|
import nu.marginalia.slop.SlopTable;
|
||||||
import nu.marginalia.sequence.slop.GammaCodedSequenceArrayWriter;
|
import nu.marginalia.slop.column.array.ByteArrayColumn;
|
||||||
import nu.marginalia.slop.ColumnTypes;
|
import nu.marginalia.slop.column.array.ObjectArrayColumn;
|
||||||
import nu.marginalia.slop.column.array.ByteArrayColumnReader;
|
import nu.marginalia.slop.column.dynamic.VarintColumn;
|
||||||
import nu.marginalia.slop.column.array.ByteArrayColumnWriter;
|
import nu.marginalia.slop.column.primitive.FloatColumn;
|
||||||
import nu.marginalia.slop.column.array.ObjectArrayColumnReader;
|
import nu.marginalia.slop.column.primitive.IntColumn;
|
||||||
import nu.marginalia.slop.column.array.ObjectArrayColumnWriter;
|
import nu.marginalia.slop.column.primitive.LongColumn;
|
||||||
import nu.marginalia.slop.column.dynamic.VarintColumnReader;
|
import nu.marginalia.slop.column.string.EnumColumn;
|
||||||
import nu.marginalia.slop.column.dynamic.VarintColumnWriter;
|
import nu.marginalia.slop.column.string.StringColumn;
|
||||||
import nu.marginalia.slop.column.primitive.*;
|
import nu.marginalia.slop.column.string.TxtStringColumn;
|
||||||
import nu.marginalia.slop.column.string.EnumColumnReader;
|
|
||||||
import nu.marginalia.slop.column.string.StringColumnReader;
|
|
||||||
import nu.marginalia.slop.column.string.StringColumnWriter;
|
|
||||||
import nu.marginalia.slop.desc.ColumnDesc;
|
|
||||||
import nu.marginalia.slop.desc.SlopTable;
|
|
||||||
import nu.marginalia.slop.desc.StorageType;
|
import nu.marginalia.slop.desc.StorageType;
|
||||||
import org.jetbrains.annotations.Nullable;
|
import org.jetbrains.annotations.Nullable;
|
||||||
|
|
||||||
@ -111,45 +106,47 @@ public record SlopDocumentRecord(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Basic information
|
// Basic information
|
||||||
private static final ColumnDesc<StringColumnReader, StringColumnWriter> domainsColumn = new ColumnDesc<>("domain", ColumnTypes.TXTSTRING, StorageType.GZIP);
|
private static final TxtStringColumn domainsColumn = new TxtStringColumn("domain", StorageType.GZIP);
|
||||||
private static final ColumnDesc<StringColumnReader, StringColumnWriter> urlsColumn = new ColumnDesc<>("url", ColumnTypes.TXTSTRING, StorageType.GZIP);
|
private static final TxtStringColumn urlsColumn = new TxtStringColumn("url", StorageType.GZIP);
|
||||||
private static final ColumnDesc<VarintColumnReader, VarintColumnWriter> ordinalsColumn = new ColumnDesc<>("ordinal", ColumnTypes.VARINT_LE, StorageType.PLAIN);
|
private static final VarintColumn ordinalsColumn = new VarintColumn("ordinal", StorageType.PLAIN);
|
||||||
private static final ColumnDesc<EnumColumnReader, StringColumnWriter> statesColumn = new ColumnDesc<>("state", ColumnTypes.ENUM_LE, StorageType.PLAIN);
|
private static final EnumColumn statesColumn = new EnumColumn("state", StorageType.PLAIN);
|
||||||
private static final ColumnDesc<StringColumnReader, StringColumnWriter> stateReasonsColumn = new ColumnDesc<>("stateReason", ColumnTypes.TXTSTRING, StorageType.GZIP);
|
private static final StringColumn stateReasonsColumn = new StringColumn("stateReason", StorageType.GZIP);
|
||||||
|
|
||||||
// Document metadata
|
// Document metadata
|
||||||
private static final ColumnDesc<StringColumnReader, StringColumnWriter> titlesColumn = new ColumnDesc<>("title", ColumnTypes.STRING, StorageType.GZIP);
|
private static final StringColumn titlesColumn = new StringColumn("title", StorageType.GZIP);
|
||||||
private static final ColumnDesc<StringColumnReader, StringColumnWriter> descriptionsColumn = new ColumnDesc<>("description", ColumnTypes.STRING, StorageType.GZIP);
|
private static final StringColumn descriptionsColumn = new StringColumn("description", StorageType.GZIP);
|
||||||
private static final ColumnDesc<EnumColumnReader, StringColumnWriter> htmlStandardsColumn = new ColumnDesc<>("htmlStandard", ColumnTypes.ENUM_LE, StorageType.GZIP);
|
private static final EnumColumn htmlStandardsColumn = new EnumColumn("htmlStandard", StorageType.PLAIN);
|
||||||
private static final ColumnDesc<IntColumnReader, IntColumnWriter> htmlFeaturesColumn = new ColumnDesc<>("htmlFeatures", ColumnTypes.INT_LE, StorageType.PLAIN);
|
private static final IntColumn htmlFeaturesColumn = new IntColumn("htmlFeatures", StorageType.PLAIN);
|
||||||
private static final ColumnDesc<IntColumnReader, IntColumnWriter> lengthsColumn = new ColumnDesc<>("length", ColumnTypes.INT_LE, StorageType.PLAIN);
|
private static final IntColumn lengthsColumn = new IntColumn("length", StorageType.PLAIN);
|
||||||
private static final ColumnDesc<IntColumnReader, IntColumnWriter> pubYearColumn = new ColumnDesc<>("pubYear", ColumnTypes.INT_LE, StorageType.PLAIN);
|
private static final IntColumn pubYearColumn = new IntColumn("pubYear", StorageType.PLAIN);
|
||||||
private static final ColumnDesc<LongColumnReader, LongColumnWriter> hashesColumn = new ColumnDesc<>("hash", ColumnTypes.LONG_LE, StorageType.PLAIN);
|
private static final LongColumn hashesColumn = new LongColumn("hash", StorageType.PLAIN);
|
||||||
private static final ColumnDesc<FloatColumnReader, FloatColumnWriter> qualitiesColumn = new ColumnDesc<>("quality", ColumnTypes.FLOAT_LE, StorageType.PLAIN);
|
private static final FloatColumn qualitiesColumn = new FloatColumn("quality", StorageType.PLAIN);
|
||||||
private static final ColumnDesc<LongColumnReader, LongColumnWriter> domainMetadata = new ColumnDesc<>("domainMetadata", ColumnTypes.LONG_LE, StorageType.PLAIN);
|
private static final LongColumn domainMetadata = new LongColumn("domainMetadata", StorageType.PLAIN);
|
||||||
|
|
||||||
// Keyword-level columns, these are enumerated by the counts column
|
// Keyword-level columns, these are enumerated by the counts column
|
||||||
private static final ColumnDesc<ObjectArrayColumnReader<String>, ObjectArrayColumnWriter<String>> keywordsColumn = new ColumnDesc<>("keywords", ColumnTypes.STRING_ARRAY, StorageType.ZSTD);
|
|
||||||
private static final ColumnDesc<ByteArrayColumnReader, ByteArrayColumnWriter> termMetaColumn = new ColumnDesc<>("termMetadata", ColumnTypes.BYTE_ARRAY, StorageType.ZSTD);
|
private static final ObjectArrayColumn<String> keywordsColumn = new StringColumn("keywords", StorageType.ZSTD).asArray();
|
||||||
private static final ColumnDesc<GammaCodedSequenceArrayReader, GammaCodedSequenceArrayWriter> termPositionsColumn = new ColumnDesc<>("termPositions", GammaCodedSequenceArrayColumn.TYPE, StorageType.ZSTD);
|
private static final ByteArrayColumn termMetaColumn = new ByteArrayColumn("termMetadata", StorageType.ZSTD);
|
||||||
|
private static final GammaCodedSequenceArrayColumn termPositionsColumn = new GammaCodedSequenceArrayColumn("termPositions", StorageType.ZSTD);
|
||||||
|
|
||||||
// Spans columns
|
// Spans columns
|
||||||
private static final ColumnDesc<ByteArrayColumnReader, ByteArrayColumnWriter> spanCodesColumn = new ColumnDesc<>("spanCodes", ColumnTypes.BYTE_ARRAY, StorageType.ZSTD);
|
|
||||||
private static final ColumnDesc<GammaCodedSequenceArrayReader, GammaCodedSequenceArrayWriter> spansColumn = new ColumnDesc<>("spans", GammaCodedSequenceArrayColumn.TYPE, StorageType.ZSTD);
|
private static final ByteArrayColumn spanCodesColumn = new ByteArrayColumn("spanCodes", StorageType.ZSTD);
|
||||||
|
private static final GammaCodedSequenceArrayColumn spansColumn = new GammaCodedSequenceArrayColumn("spans", StorageType.ZSTD);
|
||||||
|
|
||||||
public static class KeywordsProjectionReader extends SlopTable {
|
public static class KeywordsProjectionReader extends SlopTable {
|
||||||
private final StringColumnReader domainsReader;
|
private final TxtStringColumn.Reader domainsReader;
|
||||||
private final VarintColumnReader ordinalsReader;
|
private final VarintColumn.Reader ordinalsReader;
|
||||||
private final IntColumnReader htmlFeaturesReader;
|
private final IntColumn.Reader htmlFeaturesReader;
|
||||||
private final LongColumnReader domainMetadataReader;
|
private final LongColumn.Reader domainMetadataReader;
|
||||||
private final IntColumnReader lengthsReader;
|
private final IntColumn.Reader lengthsReader;
|
||||||
|
|
||||||
private final ObjectArrayColumnReader<String> keywordsReader;
|
private final ObjectArrayColumn<String>.Reader keywordsReader;
|
||||||
private final ByteArrayColumnReader termMetaReader;
|
private final ByteArrayColumn.Reader termMetaReader;
|
||||||
private final GammaCodedSequenceArrayReader termPositionsReader;
|
private final GammaCodedSequenceArrayColumn.Reader termPositionsReader;
|
||||||
|
|
||||||
private final ByteArrayColumnReader spanCodesReader;
|
private final ByteArrayColumn.Reader spanCodesReader;
|
||||||
private final GammaCodedSequenceArrayReader spansReader;
|
private final GammaCodedSequenceArrayColumn.Reader spansReader;
|
||||||
|
|
||||||
public KeywordsProjectionReader(SlopPageRef<SlopDocumentRecord> pageRef) throws IOException {
|
public KeywordsProjectionReader(SlopPageRef<SlopDocumentRecord> pageRef) throws IOException {
|
||||||
this(pageRef.baseDir(), pageRef.page());
|
this(pageRef.baseDir(), pageRef.page());
|
||||||
@ -206,18 +203,18 @@ public record SlopDocumentRecord(
|
|||||||
}
|
}
|
||||||
|
|
||||||
public static class MetadataReader extends SlopTable {
|
public static class MetadataReader extends SlopTable {
|
||||||
private final StringColumnReader domainsReader;
|
private final TxtStringColumn.Reader domainsReader;
|
||||||
private final StringColumnReader urlsReader;
|
private final TxtStringColumn.Reader urlsReader;
|
||||||
private final VarintColumnReader ordinalsReader;
|
private final VarintColumn.Reader ordinalsReader;
|
||||||
private final StringColumnReader titlesReader;
|
private final StringColumn.Reader titlesReader;
|
||||||
private final StringColumnReader descriptionsReader;
|
private final StringColumn.Reader descriptionsReader;
|
||||||
|
|
||||||
private final IntColumnReader htmlFeaturesReader;
|
private final IntColumn.Reader htmlFeaturesReader;
|
||||||
private final StringColumnReader htmlStandardsReader;
|
private final EnumColumn.Reader htmlStandardsReader;
|
||||||
private final IntColumnReader lengthsReader;
|
private final IntColumn.Reader lengthsReader;
|
||||||
private final LongColumnReader hashesReader;
|
private final LongColumn.Reader hashesReader;
|
||||||
private final FloatColumnReader qualitiesReader;
|
private final FloatColumn.Reader qualitiesReader;
|
||||||
private final IntColumnReader pubYearReader;
|
private final IntColumn.Reader pubYearReader;
|
||||||
|
|
||||||
public MetadataReader(SlopPageRef<SlopDocumentRecord> pageRef) throws IOException{
|
public MetadataReader(SlopPageRef<SlopDocumentRecord> pageRef) throws IOException{
|
||||||
this(pageRef.baseDir(), pageRef.page());
|
this(pageRef.baseDir(), pageRef.page());
|
||||||
@ -263,25 +260,25 @@ public record SlopDocumentRecord(
|
|||||||
}
|
}
|
||||||
|
|
||||||
public static class Writer extends SlopTable {
|
public static class Writer extends SlopTable {
|
||||||
private final StringColumnWriter domainsWriter;
|
private final TxtStringColumn.Writer domainsWriter;
|
||||||
private final StringColumnWriter urlsWriter;
|
private final TxtStringColumn.Writer urlsWriter;
|
||||||
private final VarintColumnWriter ordinalsWriter;
|
private final VarintColumn.Writer ordinalsWriter;
|
||||||
private final StringColumnWriter statesWriter;
|
private final EnumColumn.Writer statesWriter;
|
||||||
private final StringColumnWriter stateReasonsWriter;
|
private final StringColumn.Writer stateReasonsWriter;
|
||||||
private final StringColumnWriter titlesWriter;
|
private final StringColumn.Writer titlesWriter;
|
||||||
private final StringColumnWriter descriptionsWriter;
|
private final StringColumn.Writer descriptionsWriter;
|
||||||
private final IntColumnWriter htmlFeaturesWriter;
|
private final IntColumn.Writer htmlFeaturesWriter;
|
||||||
private final StringColumnWriter htmlStandardsWriter;
|
private final EnumColumn.Writer htmlStandardsWriter;
|
||||||
private final IntColumnWriter lengthsWriter;
|
private final IntColumn.Writer lengthsWriter;
|
||||||
private final LongColumnWriter hashesWriter;
|
private final LongColumn.Writer hashesWriter;
|
||||||
private final FloatColumnWriter qualitiesWriter;
|
private final FloatColumn.Writer qualitiesWriter;
|
||||||
private final LongColumnWriter domainMetadataWriter;
|
private final LongColumn.Writer domainMetadataWriter;
|
||||||
private final IntColumnWriter pubYearWriter;
|
private final IntColumn.Writer pubYearWriter;
|
||||||
private final ObjectArrayColumnWriter<String> keywordsWriter;
|
private final ObjectArrayColumn<String>.Writer keywordsWriter;
|
||||||
private final ByteArrayColumnWriter termMetaWriter;
|
private final ByteArrayColumn.Writer termMetaWriter;
|
||||||
private final GammaCodedSequenceArrayWriter termPositionsWriter;
|
private final GammaCodedSequenceArrayColumn.Writer termPositionsWriter;
|
||||||
private final ByteArrayColumnWriter spansCodesWriter;
|
private final ByteArrayColumn.Writer spansCodesWriter;
|
||||||
private final GammaCodedSequenceArrayWriter spansWriter;
|
private final GammaCodedSequenceArrayColumn.Writer spansWriter;
|
||||||
|
|
||||||
public Writer(Path baseDir, int page) throws IOException {
|
public Writer(Path baseDir, int page) throws IOException {
|
||||||
super(page);
|
super(page);
|
||||||
|
@ -1,10 +1,7 @@
|
|||||||
package nu.marginalia.model.processed;
|
package nu.marginalia.model.processed;
|
||||||
|
|
||||||
import nu.marginalia.slop.ColumnTypes;
|
import nu.marginalia.slop.SlopTable;
|
||||||
import nu.marginalia.slop.column.string.StringColumnReader;
|
import nu.marginalia.slop.column.string.TxtStringColumn;
|
||||||
import nu.marginalia.slop.column.string.StringColumnWriter;
|
|
||||||
import nu.marginalia.slop.desc.ColumnDesc;
|
|
||||||
import nu.marginalia.slop.desc.SlopTable;
|
|
||||||
import nu.marginalia.slop.desc.StorageType;
|
import nu.marginalia.slop.desc.StorageType;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
@ -15,16 +12,16 @@ public record SlopDomainLinkRecord(
|
|||||||
String source,
|
String source,
|
||||||
String dest)
|
String dest)
|
||||||
{
|
{
|
||||||
private static final ColumnDesc<StringColumnReader, StringColumnWriter> sourcesColumn = new ColumnDesc<>("source", ColumnTypes.TXTSTRING, StorageType.GZIP);
|
private static final TxtStringColumn sourcesColumn = new TxtStringColumn("source", StorageType.GZIP);
|
||||||
private static final ColumnDesc<StringColumnReader, StringColumnWriter> destsColumn = new ColumnDesc<>("dest", ColumnTypes.TXTSTRING, StorageType.GZIP);
|
private static final TxtStringColumn destsColumn = new TxtStringColumn("dest", StorageType.GZIP);
|
||||||
|
|
||||||
public static Reader reader(Path baseDir, int page) throws IOException {
|
public static Reader reader(Path baseDir, int page) throws IOException {
|
||||||
return new Reader(baseDir, page);
|
return new Reader(baseDir, page);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static class Reader extends SlopTable {
|
public static class Reader extends SlopTable {
|
||||||
private final StringColumnReader sourcesReader;
|
private final TxtStringColumn.Reader sourcesReader;
|
||||||
private final StringColumnReader destsReader;
|
private final TxtStringColumn.Reader destsReader;
|
||||||
|
|
||||||
public Reader(SlopPageRef<SlopDomainLinkRecord> page) throws IOException {
|
public Reader(SlopPageRef<SlopDomainLinkRecord> page) throws IOException {
|
||||||
this(page.baseDir(), page.page());
|
this(page.baseDir(), page.page());
|
||||||
@ -57,8 +54,8 @@ public record SlopDomainLinkRecord(
|
|||||||
}
|
}
|
||||||
|
|
||||||
public static class Writer extends SlopTable {
|
public static class Writer extends SlopTable {
|
||||||
private final StringColumnWriter sourcesWriter;
|
private final TxtStringColumn.Writer sourcesWriter;
|
||||||
private final StringColumnWriter destsWriter;
|
private final TxtStringColumn.Writer destsWriter;
|
||||||
|
|
||||||
public Writer(Path baseDir, int page) throws IOException {
|
public Writer(Path baseDir, int page) throws IOException {
|
||||||
super(page);
|
super(page);
|
||||||
|
@ -1,15 +1,10 @@
|
|||||||
package nu.marginalia.model.processed;
|
package nu.marginalia.model.processed;
|
||||||
|
|
||||||
import nu.marginalia.slop.ColumnTypes;
|
import nu.marginalia.slop.SlopTable;
|
||||||
import nu.marginalia.slop.column.array.ObjectArrayColumnReader;
|
import nu.marginalia.slop.column.array.ObjectArrayColumn;
|
||||||
import nu.marginalia.slop.column.array.ObjectArrayColumnWriter;
|
import nu.marginalia.slop.column.primitive.IntColumn;
|
||||||
import nu.marginalia.slop.column.primitive.IntColumnReader;
|
import nu.marginalia.slop.column.string.EnumColumn;
|
||||||
import nu.marginalia.slop.column.primitive.IntColumnWriter;
|
import nu.marginalia.slop.column.string.TxtStringColumn;
|
||||||
import nu.marginalia.slop.column.string.EnumColumnReader;
|
|
||||||
import nu.marginalia.slop.column.string.StringColumnReader;
|
|
||||||
import nu.marginalia.slop.column.string.StringColumnWriter;
|
|
||||||
import nu.marginalia.slop.desc.ColumnDesc;
|
|
||||||
import nu.marginalia.slop.desc.SlopTable;
|
|
||||||
import nu.marginalia.slop.desc.StorageType;
|
import nu.marginalia.slop.desc.StorageType;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
@ -33,20 +28,20 @@ public record SlopDomainRecord(
|
|||||||
String ip)
|
String ip)
|
||||||
{}
|
{}
|
||||||
|
|
||||||
private static final ColumnDesc<StringColumnReader, StringColumnWriter> domainsColumn = new ColumnDesc<>("domain", ColumnTypes.TXTSTRING, StorageType.GZIP);
|
private static final TxtStringColumn domainsColumn = new TxtStringColumn("domain", StorageType.GZIP);
|
||||||
private static final ColumnDesc<EnumColumnReader, StringColumnWriter> statesColumn = new ColumnDesc<>("state", ColumnTypes.ENUM_LE, StorageType.PLAIN);
|
private static final EnumColumn statesColumn = new EnumColumn("state", StorageType.PLAIN);
|
||||||
private static final ColumnDesc<StringColumnReader, StringColumnWriter> redirectDomainsColumn = new ColumnDesc<>("redirectDomain", ColumnTypes.TXTSTRING, StorageType.GZIP);
|
private static final TxtStringColumn redirectDomainsColumn = new TxtStringColumn("redirectDomain", StorageType.GZIP);
|
||||||
private static final ColumnDesc<StringColumnReader, StringColumnWriter> ipColumn = new ColumnDesc<>("ip", ColumnTypes.TXTSTRING, StorageType.GZIP);
|
private static final TxtStringColumn ipColumn = new TxtStringColumn("ip", StorageType.GZIP);
|
||||||
|
|
||||||
private static final ColumnDesc<IntColumnReader, IntColumnWriter> knownUrlsColumn = new ColumnDesc<>("knownUrls", ColumnTypes.INT_LE, StorageType.PLAIN);
|
private static final IntColumn knownUrlsColumn = new IntColumn("knownUrls", StorageType.PLAIN);
|
||||||
private static final ColumnDesc<IntColumnReader, IntColumnWriter> goodUrlsColumn = new ColumnDesc<>("goodUrls", ColumnTypes.INT_LE, StorageType.PLAIN);
|
private static final IntColumn goodUrlsColumn = new IntColumn("goodUrls", StorageType.PLAIN);
|
||||||
private static final ColumnDesc<IntColumnReader, IntColumnWriter> visitedUrlsColumn = new ColumnDesc<>("visitedUrls", ColumnTypes.INT_LE, StorageType.PLAIN);
|
private static final IntColumn visitedUrlsColumn = new IntColumn("visitedUrls", StorageType.PLAIN);
|
||||||
|
|
||||||
private static final ColumnDesc<ObjectArrayColumnReader<String>, ObjectArrayColumnWriter<String>> rssFeedsColumn = new ColumnDesc<>("rssFeeds", ColumnTypes.TXTSTRING_ARRAY, StorageType.GZIP);
|
private static final ObjectArrayColumn<String> rssFeedsColumn = new TxtStringColumn("rssFeeds", StorageType.GZIP).asArray();
|
||||||
|
|
||||||
|
|
||||||
public static class DomainNameReader extends SlopTable {
|
public static class DomainNameReader extends SlopTable {
|
||||||
private final StringColumnReader domainsReader;
|
private final TxtStringColumn.Reader domainsReader;
|
||||||
|
|
||||||
public DomainNameReader(SlopPageRef<SlopDomainRecord> page) throws IOException {
|
public DomainNameReader(SlopPageRef<SlopDomainRecord> page) throws IOException {
|
||||||
this(page.baseDir(), page.page());
|
this(page.baseDir(), page.page());
|
||||||
@ -68,8 +63,8 @@ public record SlopDomainRecord(
|
|||||||
}
|
}
|
||||||
|
|
||||||
public static class DomainWithIpReader extends SlopTable {
|
public static class DomainWithIpReader extends SlopTable {
|
||||||
private final StringColumnReader domainsReader;
|
private final TxtStringColumn.Reader domainsReader;
|
||||||
private final StringColumnReader ipReader;
|
private final TxtStringColumn.Reader ipReader;
|
||||||
|
|
||||||
public DomainWithIpReader(SlopPageRef<SlopDomainRecord> page) throws IOException {
|
public DomainWithIpReader(SlopPageRef<SlopDomainRecord> page) throws IOException {
|
||||||
this(page.baseDir(), page.page());
|
this(page.baseDir(), page.page());
|
||||||
@ -96,16 +91,16 @@ public record SlopDomainRecord(
|
|||||||
}
|
}
|
||||||
|
|
||||||
public static class Reader extends SlopTable {
|
public static class Reader extends SlopTable {
|
||||||
private final StringColumnReader domainsReader;
|
private final TxtStringColumn.Reader domainsReader;
|
||||||
private final StringColumnReader statesReader;
|
private final EnumColumn.Reader statesReader;
|
||||||
private final StringColumnReader redirectReader;
|
private final TxtStringColumn.Reader redirectReader;
|
||||||
private final StringColumnReader ipReader;
|
private final TxtStringColumn.Reader ipReader;
|
||||||
|
|
||||||
private final IntColumnReader knownUrlsReader;
|
private final IntColumn.Reader knownUrlsReader;
|
||||||
private final IntColumnReader goodUrlsReader;
|
private final IntColumn.Reader goodUrlsReader;
|
||||||
private final IntColumnReader visitedUrlsReader;
|
private final IntColumn.Reader visitedUrlsReader;
|
||||||
|
|
||||||
private final ObjectArrayColumnReader<String> rssFeedsReader;
|
private final ObjectArrayColumn<String>.Reader rssFeedsReader;
|
||||||
|
|
||||||
public Reader(SlopPageRef<SlopDomainRecord> page) throws IOException {
|
public Reader(SlopPageRef<SlopDomainRecord> page) throws IOException {
|
||||||
this(page.baseDir(), page.page());
|
this(page.baseDir(), page.page());
|
||||||
@ -151,16 +146,16 @@ public record SlopDomainRecord(
|
|||||||
}
|
}
|
||||||
|
|
||||||
public static class Writer extends SlopTable {
|
public static class Writer extends SlopTable {
|
||||||
private final StringColumnWriter domainsWriter;
|
private final TxtStringColumn.Writer domainsWriter;
|
||||||
private final StringColumnWriter statesWriter;
|
private final EnumColumn.Writer statesWriter;
|
||||||
private final StringColumnWriter redirectWriter;
|
private final TxtStringColumn.Writer redirectWriter;
|
||||||
private final StringColumnWriter ipWriter;
|
private final TxtStringColumn.Writer ipWriter;
|
||||||
|
|
||||||
private final IntColumnWriter knownUrlsWriter;
|
private final IntColumn.Writer knownUrlsWriter;
|
||||||
private final IntColumnWriter goodUrlsWriter;
|
private final IntColumn.Writer goodUrlsWriter;
|
||||||
private final IntColumnWriter visitedUrlsWriter;
|
private final IntColumn.Writer visitedUrlsWriter;
|
||||||
|
|
||||||
private final ObjectArrayColumnWriter<String> rssFeedsWriter;
|
private final ObjectArrayColumn<String>.Writer rssFeedsWriter;
|
||||||
|
|
||||||
public Writer(Path baseDir, int page) throws IOException {
|
public Writer(Path baseDir, int page) throws IOException {
|
||||||
super(page);
|
super(page);
|
||||||
|
@ -226,7 +226,7 @@ dependencyResolutionManagement {
|
|||||||
library('jetty-util','org.eclipse.jetty','jetty-util').version('9.4.54.v20240208')
|
library('jetty-util','org.eclipse.jetty','jetty-util').version('9.4.54.v20240208')
|
||||||
library('jetty-servlet','org.eclipse.jetty','jetty-servlet').version('9.4.54.v20240208')
|
library('jetty-servlet','org.eclipse.jetty','jetty-servlet').version('9.4.54.v20240208')
|
||||||
|
|
||||||
library('slop', 'nu.marginalia', 'slop').version('0.0.1-SNAPSHOT')
|
library('slop', 'nu.marginalia', 'slop').version('0.0.3-SNAPSHOT')
|
||||||
|
|
||||||
bundle('jetty', ['jetty-server', 'jetty-util', 'jetty-servlet'])
|
bundle('jetty', ['jetty-server', 'jetty-util', 'jetty-servlet'])
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user