mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(slop) Introduce table concept to keep track of positions and simplify closing
The most common error when dealing with Slop columns is that they can fall out of sync with each other if the programmer accidentally does a conditional read and forgets to skip. The second most common error is forgetting to close one of the columns in a reader or writer. To deal with both cases, a new class SlopTable is added that keeps track of the lifecycle of all slop columns and performs a check when closing them that they are in sync.
This commit is contained in:
parent
aebb2652e8
commit
dcb43a3308
@ -22,6 +22,7 @@ dependencies {
|
||||
|
||||
implementation project(':code:libraries:array')
|
||||
implementation project(':code:libraries:btree')
|
||||
implementation project(':code:libraries:slop')
|
||||
implementation project(':code:libraries:coded-sequence')
|
||||
|
||||
implementation project(':code:common:db')
|
||||
|
@ -9,6 +9,7 @@ import nu.marginalia.model.id.UrlIdCodec;
|
||||
import nu.marginalia.model.idx.DocumentMetadata;
|
||||
import nu.marginalia.process.control.ProcessHeartbeat;
|
||||
import nu.marginalia.slop.column.primitive.LongColumnReader;
|
||||
import nu.marginalia.slop.desc.SlopTable;
|
||||
import org.roaringbitmap.longlong.LongConsumer;
|
||||
import org.roaringbitmap.longlong.Roaring64Bitmap;
|
||||
import org.slf4j.Logger;
|
||||
@ -80,16 +81,15 @@ public class ForwardIndexConverter {
|
||||
|
||||
ByteBuffer workArea = ByteBuffer.allocate(65536);
|
||||
for (var instance : journal.pages()) {
|
||||
try (var docIdReader = instance.openCombinedId();
|
||||
var metaReader = instance.openDocumentMeta();
|
||||
var featuresReader = instance.openFeatures();
|
||||
var sizeReader = instance.openSize();
|
||||
|
||||
var spansCodesReader = instance.openSpanCodes();
|
||||
var spansSeqReader = instance.openSpans();
|
||||
var spansWriter = new ForwardIndexSpansWriter(outputFileSpansData)
|
||||
)
|
||||
try (var slopTable = new SlopTable(); var spansWriter = new ForwardIndexSpansWriter(outputFileSpansData))
|
||||
{
|
||||
var docIdReader = instance.openCombinedId(slopTable);
|
||||
var metaReader = instance.openDocumentMeta(slopTable);
|
||||
var featuresReader = instance.openFeatures(slopTable);
|
||||
var sizeReader = instance.openSize(slopTable);
|
||||
var spansCodesReader = instance.openSpanCodes(slopTable);
|
||||
var spansSeqReader = instance.openSpans(slopTable);
|
||||
|
||||
while (docIdReader.hasRemaining()) {
|
||||
long docId = docIdReader.get();
|
||||
int domainId = UrlIdCodec.getDomainId(docId);
|
||||
@ -148,7 +148,9 @@ public class ForwardIndexConverter {
|
||||
Roaring64Bitmap rbm = new Roaring64Bitmap();
|
||||
|
||||
for (var instance : journalReader.pages()) {
|
||||
try (LongColumnReader idReader = instance.openCombinedId()) {
|
||||
try (var slopTable = new SlopTable()) {
|
||||
LongColumnReader idReader = instance.openCombinedId(slopTable);
|
||||
|
||||
while (idReader.hasRemaining()) {
|
||||
rbm.add(idReader.get());
|
||||
}
|
||||
|
@ -9,6 +9,7 @@ import nu.marginalia.slop.column.dynamic.VarintColumnWriter;
|
||||
import nu.marginalia.slop.column.primitive.*;
|
||||
import nu.marginalia.slop.desc.ColumnDesc;
|
||||
import nu.marginalia.slop.desc.ColumnType;
|
||||
import nu.marginalia.slop.desc.SlopTable;
|
||||
import nu.marginalia.slop.desc.StorageType;
|
||||
|
||||
import java.io.IOException;
|
||||
@ -34,43 +35,43 @@ public record IndexJournalPage(Path baseDir, int page) {
|
||||
}
|
||||
}
|
||||
|
||||
public LongColumnReader openCombinedId() throws IOException {
|
||||
return combinedId.forPage(page).open(baseDir);
|
||||
public LongColumnReader openCombinedId(SlopTable table) throws IOException {
|
||||
return combinedId.forPage(page).open(table, baseDir);
|
||||
}
|
||||
|
||||
public LongColumnReader openDocumentMeta() throws IOException {
|
||||
return documentMeta.forPage(page).open(baseDir);
|
||||
public LongColumnReader openDocumentMeta(SlopTable table) throws IOException {
|
||||
return documentMeta.forPage(page).open(table, baseDir);
|
||||
}
|
||||
|
||||
public IntColumnReader openFeatures() throws IOException {
|
||||
return features.forPage(page).open(baseDir);
|
||||
public IntColumnReader openFeatures(SlopTable table) throws IOException {
|
||||
return features.forPage(page).open(table, baseDir);
|
||||
}
|
||||
|
||||
public IntColumnReader openSize() throws IOException {
|
||||
return size.forPage(page).open(baseDir);
|
||||
public IntColumnReader openSize(SlopTable table) throws IOException {
|
||||
return size.forPage(page).open(table, baseDir);
|
||||
}
|
||||
|
||||
public LongColumnReader openTermCounts() throws IOException {
|
||||
return termCounts.forPage(page).open(baseDir);
|
||||
public LongColumnReader openTermCounts(SlopTable table) throws IOException {
|
||||
return termCounts.forPage(page).open(table, baseDir);
|
||||
}
|
||||
|
||||
public LongColumnReader openTermIds() throws IOException {
|
||||
return termIds.forPage(page).open(baseDir);
|
||||
public LongColumnReader openTermIds(SlopTable table) throws IOException {
|
||||
return termIds.forPage(page).open(table.columnGroup("keywords"), baseDir);
|
||||
}
|
||||
|
||||
public ByteColumnReader openTermMetadata() throws IOException {
|
||||
return termMeta.forPage(page).open(baseDir);
|
||||
public ByteColumnReader openTermMetadata(SlopTable table) throws IOException {
|
||||
return termMeta.forPage(page).open(table.columnGroup("keywords"), baseDir);
|
||||
}
|
||||
|
||||
public GammaCodedSequenceReader openTermPositions() throws IOException {
|
||||
return positions.forPage(page).open(baseDir);
|
||||
public GammaCodedSequenceReader openTermPositions(SlopTable table) throws IOException {
|
||||
return positions.forPage(page).open(table.columnGroup("keywords"), baseDir);
|
||||
}
|
||||
|
||||
public GammaCodedSequenceReader openSpans() throws IOException {
|
||||
return spans.forPage(page).open(baseDir);
|
||||
public GammaCodedSequenceReader openSpans(SlopTable table) throws IOException {
|
||||
return spans.forPage(page).open(table.columnGroup("spans"), baseDir);
|
||||
}
|
||||
|
||||
public ByteArrayColumnReader openSpanCodes() throws IOException {
|
||||
return spanCodes.forPage(page).open(baseDir);
|
||||
public ByteArrayColumnReader openSpanCodes(SlopTable table) throws IOException {
|
||||
return spanCodes.forPage(page).open(table.columnGroup("spans"), baseDir);
|
||||
}
|
||||
}
|
||||
|
@ -10,13 +10,14 @@ import nu.marginalia.slop.column.dynamic.GammaCodedSequenceWriter;
|
||||
import nu.marginalia.slop.column.primitive.ByteColumnWriter;
|
||||
import nu.marginalia.slop.column.primitive.IntColumnWriter;
|
||||
import nu.marginalia.slop.column.primitive.LongColumnWriter;
|
||||
import nu.marginalia.slop.desc.SlopTable;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
|
||||
public class IndexJournalSlopWriter implements AutoCloseable {
|
||||
public class IndexJournalSlopWriter extends SlopTable {
|
||||
|
||||
private final IntColumnWriter featuresWriter;
|
||||
private final IntColumnWriter sizeWriter;
|
||||
@ -39,19 +40,20 @@ public class IndexJournalSlopWriter implements AutoCloseable {
|
||||
}
|
||||
|
||||
|
||||
featuresWriter = IndexJournalPage.features.forPage(page).create(dir);
|
||||
sizeWriter = IndexJournalPage.size.forPage(page).create(dir);
|
||||
featuresWriter = IndexJournalPage.features.forPage(page).create(this, dir);
|
||||
sizeWriter = IndexJournalPage.size.forPage(page).create(this, dir);
|
||||
|
||||
combinedIdWriter = IndexJournalPage.combinedId.forPage(page).create(dir);
|
||||
documentMetaWriter = IndexJournalPage.documentMeta.forPage(page).create(dir);
|
||||
combinedIdWriter = IndexJournalPage.combinedId.forPage(page).create(this, dir);
|
||||
documentMetaWriter = IndexJournalPage.documentMeta.forPage(page).create(this, dir);
|
||||
|
||||
termCountsWriter = IndexJournalPage.termCounts.forPage(page).create(dir);
|
||||
termIdsWriter = IndexJournalPage.termIds.forPage(page).create(dir);
|
||||
termMetadataWriter = IndexJournalPage.termMeta.forPage(page).create(dir);
|
||||
termPositionsWriter = IndexJournalPage.positions.forPage(page).create(dir);
|
||||
termCountsWriter = IndexJournalPage.termCounts.forPage(page).create(this, dir);
|
||||
|
||||
spansWriter = IndexJournalPage.spans.forPage(page).create(dir);
|
||||
spanCodesWriter = IndexJournalPage.spanCodes.forPage(page).create(dir);
|
||||
termIdsWriter = IndexJournalPage.termIds.forPage(page).create(this.columnGroup("keywords"), dir);
|
||||
termMetadataWriter = IndexJournalPage.termMeta.forPage(page).create(this.columnGroup("keywords"), dir);
|
||||
termPositionsWriter = IndexJournalPage.positions.forPage(page).create(this.columnGroup("keywords"), dir);
|
||||
|
||||
spansWriter = IndexJournalPage.spans.forPage(page).create(this.columnGroup("spans"), dir);
|
||||
spanCodesWriter = IndexJournalPage.spanCodes.forPage(page).create(this.columnGroup("spans"), dir);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
|
@ -7,6 +7,7 @@ import nu.marginalia.index.construction.DocIdRewriter;
|
||||
import nu.marginalia.index.construction.PositionsFileConstructor;
|
||||
import nu.marginalia.index.journal.IndexJournalPage;
|
||||
import nu.marginalia.rwf.RandomFileAssembler;
|
||||
import nu.marginalia.slop.desc.SlopTable;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@ -78,12 +79,13 @@ public class FullPreindexDocuments {
|
||||
final ByteBuffer tempBuffer = ByteBuffer.allocate(65536);
|
||||
|
||||
try (var assembly = RandomFileAssembler.create(workDir, fileSizeLongs);
|
||||
var docIds = journalInstance.openCombinedId();
|
||||
var termCounts = journalInstance.openTermCounts();
|
||||
var termIds = journalInstance.openTermIds();
|
||||
var termMeta = journalInstance.openTermMetadata();
|
||||
var positions = journalInstance.openTermPositions())
|
||||
var slopTable = new SlopTable())
|
||||
{
|
||||
var docIds = journalInstance.openCombinedId(slopTable);
|
||||
var termCounts = journalInstance.openTermCounts(slopTable);
|
||||
var termIds = journalInstance.openTermIds(slopTable);
|
||||
var termMeta = journalInstance.openTermMetadata(slopTable);
|
||||
var positions = journalInstance.openTermPositions(slopTable);
|
||||
|
||||
var offsetMap = segments.asMap(RECORD_SIZE_LONGS);
|
||||
offsetMap.defaultReturnValue(0);
|
||||
|
@ -6,6 +6,7 @@ import it.unimi.dsi.fastutil.longs.LongIterator;
|
||||
import nu.marginalia.array.LongArray;
|
||||
import nu.marginalia.array.LongArrayFactory;
|
||||
import nu.marginalia.index.journal.IndexJournalPage;
|
||||
import nu.marginalia.slop.desc.SlopTable;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
@ -59,7 +60,8 @@ public class FullPreindexWordSegments {
|
||||
Long2IntOpenHashMap countsMap = new Long2IntOpenHashMap(100_000, 0.75f);
|
||||
countsMap.defaultReturnValue(0);
|
||||
|
||||
try (var termIds = journalInstance.openTermIds()) {
|
||||
try (var slopTable = new SlopTable()) {
|
||||
var termIds = journalInstance.openTermIds(slopTable);
|
||||
while (termIds.hasRemaining()) {
|
||||
countsMap.addTo(termIds.get(), 1);
|
||||
}
|
||||
|
@ -6,6 +6,7 @@ import nu.marginalia.array.LongArrayFactory;
|
||||
import nu.marginalia.index.construction.DocIdRewriter;
|
||||
import nu.marginalia.index.journal.IndexJournalPage;
|
||||
import nu.marginalia.rwf.RandomFileAssembler;
|
||||
import nu.marginalia.slop.desc.SlopTable;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@ -67,11 +68,12 @@ public class PrioPreindexDocuments {
|
||||
long fileSizeLongs = RECORD_SIZE_LONGS * segments.totalSize();
|
||||
|
||||
try (var assembly = RandomFileAssembler.create(workDir, fileSizeLongs);
|
||||
var docIds = journalInstance.openCombinedId();
|
||||
var termIdsCounts = journalInstance.openTermCounts();
|
||||
var termIds = journalInstance.openTermIds();
|
||||
var termMeta = journalInstance.openTermMetadata())
|
||||
var slopTable = new SlopTable())
|
||||
{
|
||||
var docIds = journalInstance.openCombinedId(slopTable);
|
||||
var termIdsCounts = journalInstance.openTermCounts(slopTable);
|
||||
var termIds = journalInstance.openTermIds(slopTable);
|
||||
var termMeta = journalInstance.openTermMetadata(slopTable);
|
||||
|
||||
var offsetMap = segments.asMap(RECORD_SIZE_LONGS);
|
||||
offsetMap.defaultReturnValue(0);
|
||||
|
@ -6,6 +6,7 @@ import it.unimi.dsi.fastutil.longs.LongIterator;
|
||||
import nu.marginalia.array.LongArray;
|
||||
import nu.marginalia.array.LongArrayFactory;
|
||||
import nu.marginalia.index.journal.IndexJournalPage;
|
||||
import nu.marginalia.slop.desc.SlopTable;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
@ -59,8 +60,9 @@ public class PrioPreindexWordSegments {
|
||||
Long2IntOpenHashMap countsMap = new Long2IntOpenHashMap(100_000, 0.75f);
|
||||
countsMap.defaultReturnValue(0);
|
||||
|
||||
try (var termIds = journalInstance.openTermIds();
|
||||
var termMetas = journalInstance.openTermMetadata()) {
|
||||
try (var slopTable = new SlopTable()) {
|
||||
var termIds = journalInstance.openTermIds(slopTable);
|
||||
var termMetas = journalInstance.openTermMetadata(slopTable);
|
||||
|
||||
while (termIds.hasRemaining()) {
|
||||
long data = termIds.get();
|
||||
|
@ -11,4 +11,6 @@ public interface ColumnReader {
|
||||
}
|
||||
|
||||
boolean hasRemaining() throws IOException;
|
||||
|
||||
void close() throws IOException;
|
||||
}
|
||||
|
@ -1,4 +1,10 @@
|
||||
package nu.marginalia.slop.column;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
public interface ColumnWriter {
|
||||
/** Return the current record index in the column */
|
||||
long position();
|
||||
|
||||
void close() throws IOException;
|
||||
}
|
||||
|
@ -19,7 +19,7 @@ public class ByteArrayColumn {
|
||||
return new Reader(
|
||||
Storage.reader(path, name, true),
|
||||
VarintColumn.open(path,
|
||||
name.createDerivative(name.function().lengthsTable(),
|
||||
name.createSupplementaryColumn(name.function().lengthsTable(),
|
||||
ColumnType.VARINT_LE,
|
||||
StorageType.PLAIN)
|
||||
)
|
||||
@ -30,7 +30,7 @@ public class ByteArrayColumn {
|
||||
return new Writer(
|
||||
Storage.writer(path, name),
|
||||
VarintColumn.create(path,
|
||||
name.createDerivative(name.function().lengthsTable(),
|
||||
name.createSupplementaryColumn(name.function().lengthsTable(),
|
||||
ColumnType.VARINT_LE,
|
||||
StorageType.PLAIN)
|
||||
)
|
||||
@ -41,16 +41,23 @@ public class ByteArrayColumn {
|
||||
private final StorageWriter storage;
|
||||
private final VarintColumnWriter lengthsWriter;
|
||||
|
||||
private long position = 0;
|
||||
|
||||
public Writer(StorageWriter storage, VarintColumnWriter lengthsWriter) throws IOException {
|
||||
this.storage = storage;
|
||||
this.lengthsWriter = lengthsWriter;
|
||||
}
|
||||
|
||||
public void put(byte[] value) throws IOException {
|
||||
position ++;
|
||||
storage.putBytes(value);
|
||||
lengthsWriter.put(value.length);
|
||||
}
|
||||
|
||||
public long position() {
|
||||
return position;
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
storage.close();
|
||||
lengthsWriter.close();
|
||||
|
@ -17,7 +17,7 @@ public class IntArrayColumn {
|
||||
|
||||
public static IntArrayColumnReader open(Path path, ColumnDesc name) throws IOException {
|
||||
return new Reader(Storage.reader(path, name, true),
|
||||
VarintColumn.open(path, name.createDerivative(name.function().lengthsTable(),
|
||||
VarintColumn.open(path, name.createSupplementaryColumn(name.function().lengthsTable(),
|
||||
ColumnType.VARINT_LE,
|
||||
StorageType.PLAIN)
|
||||
)
|
||||
@ -26,7 +26,7 @@ public class IntArrayColumn {
|
||||
|
||||
public static IntArrayColumnWriter create(Path path, ColumnDesc name) throws IOException {
|
||||
return new Writer(Storage.writer(path, name),
|
||||
VarintColumn.create(path, name.createDerivative(name.function().lengthsTable(),
|
||||
VarintColumn.create(path, name.createSupplementaryColumn(name.function().lengthsTable(),
|
||||
ColumnType.VARINT_LE,
|
||||
StorageType.PLAIN)
|
||||
)
|
||||
@ -47,6 +47,10 @@ public class IntArrayColumn {
|
||||
lengthsWriter.put(value.length);
|
||||
}
|
||||
|
||||
public long position() {
|
||||
return lengthsWriter.position();
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
storage.close();
|
||||
lengthsWriter.close();
|
||||
|
@ -17,7 +17,7 @@ public class LongArrayColumn {
|
||||
|
||||
public static LongArrayColumnReader open(Path path, ColumnDesc name) throws IOException {
|
||||
return new LongArrayColumn.Reader(Storage.reader(path, name, true),
|
||||
VarintColumn.open(path, name.createDerivative(name.function().lengthsTable(),
|
||||
VarintColumn.open(path, name.createSupplementaryColumn(name.function().lengthsTable(),
|
||||
ColumnType.VARINT_LE,
|
||||
StorageType.PLAIN)
|
||||
)
|
||||
@ -26,7 +26,7 @@ public class LongArrayColumn {
|
||||
|
||||
public static LongArrayColumnWriter create(Path path, ColumnDesc name) throws IOException {
|
||||
return new LongArrayColumn.Writer(Storage.writer(path, name),
|
||||
VarintColumn.create(path, name.createDerivative(name.function().lengthsTable(),
|
||||
VarintColumn.create(path, name.createSupplementaryColumn(name.function().lengthsTable(),
|
||||
ColumnType.VARINT_LE,
|
||||
StorageType.PLAIN)
|
||||
)
|
||||
@ -47,6 +47,10 @@ public class LongArrayColumn {
|
||||
lengthsWriter.put(value.length);
|
||||
}
|
||||
|
||||
public long position() {
|
||||
return lengthsWriter.position();
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
storage.close();
|
||||
lengthsWriter.close();
|
||||
|
@ -16,7 +16,7 @@ public class CustomBinaryColumn {
|
||||
public static CustomBinaryColumnReader open(Path path, ColumnDesc name) throws IOException {
|
||||
return new Reader(
|
||||
Storage.reader(path, name, false), // note we must never pass aligned=true here, as the data is not guaranteed alignment
|
||||
VarintColumn.open(path, name.createDerivative(ColumnFunction.DATA_LEN,
|
||||
VarintColumn.open(path, name.createSupplementaryColumn(ColumnFunction.DATA_LEN,
|
||||
ColumnType.VARINT_LE,
|
||||
StorageType.PLAIN)
|
||||
)
|
||||
@ -26,7 +26,7 @@ public class CustomBinaryColumn {
|
||||
public static CustomBinaryColumnWriter create(Path path, ColumnDesc name) throws IOException {
|
||||
return new Writer(
|
||||
Storage.writer(path, name),
|
||||
VarintColumn.create(path, name.createDerivative(ColumnFunction.DATA_LEN,
|
||||
VarintColumn.create(path, name.createSupplementaryColumn(ColumnFunction.DATA_LEN,
|
||||
ColumnType.VARINT_LE,
|
||||
StorageType.PLAIN)
|
||||
)
|
||||
@ -62,6 +62,10 @@ public class CustomBinaryColumn {
|
||||
};
|
||||
}
|
||||
|
||||
public long position() {
|
||||
return indexWriter.position();
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
indexWriter.close();
|
||||
storage.close();
|
||||
|
@ -18,7 +18,7 @@ public class GammaCodedSequenceColumn {
|
||||
public static GammaCodedSequenceReader open(Path path, ColumnDesc name) throws IOException {
|
||||
return new Reader(
|
||||
Storage.reader(path, name, false), // note we must never pass aligned=true here, as the data is not guaranteed alignment
|
||||
VarintColumn.open(path, name.createDerivative(ColumnFunction.DATA_LEN,
|
||||
VarintColumn.open(path, name.createSupplementaryColumn(ColumnFunction.DATA_LEN,
|
||||
ColumnType.VARINT_LE,
|
||||
StorageType.PLAIN)
|
||||
)
|
||||
@ -28,7 +28,7 @@ public class GammaCodedSequenceColumn {
|
||||
public static GammaCodedSequenceWriter create(Path path, ColumnDesc name) throws IOException {
|
||||
return new Writer(
|
||||
Storage.writer(path, name),
|
||||
VarintColumn.create(path, name.createDerivative(ColumnFunction.DATA_LEN,
|
||||
VarintColumn.create(path, name.createSupplementaryColumn(ColumnFunction.DATA_LEN,
|
||||
ColumnType.VARINT_LE,
|
||||
StorageType.PLAIN)
|
||||
)
|
||||
@ -57,6 +57,10 @@ public class GammaCodedSequenceColumn {
|
||||
storage.putBytes(buffer);
|
||||
}
|
||||
|
||||
public long position() {
|
||||
return indexWriter.position();
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
indexWriter.close();
|
||||
storage.close();
|
||||
|
@ -21,12 +21,15 @@ public class VarintColumn {
|
||||
|
||||
private static class Writer implements VarintColumnWriter {
|
||||
private final StorageWriter writer;
|
||||
private long position = 0;
|
||||
|
||||
public Writer(StorageWriter writer) throws IOException {
|
||||
this.writer = writer;
|
||||
}
|
||||
|
||||
public void put(long value) throws IOException {
|
||||
position++;
|
||||
|
||||
while ((value & ~0x7F) != 0) {
|
||||
writer.putByte((byte) (0x80 | (value & 0x7F)));
|
||||
value >>>= 7;
|
||||
@ -40,6 +43,10 @@ public class VarintColumn {
|
||||
}
|
||||
}
|
||||
|
||||
public long position() {
|
||||
return position;
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
writer.close();
|
||||
}
|
||||
|
@ -20,6 +20,7 @@ public class ByteColumn {
|
||||
|
||||
private static class Writer implements ByteColumnWriter {
|
||||
private final StorageWriter storage;
|
||||
private long position = 0;
|
||||
|
||||
public Writer(StorageWriter storageWriter) throws IOException {
|
||||
this.storage = storageWriter;
|
||||
@ -27,6 +28,11 @@ public class ByteColumn {
|
||||
|
||||
public void put(byte value) throws IOException {
|
||||
storage.putByte(value);
|
||||
position++;
|
||||
}
|
||||
|
||||
public long position() {
|
||||
return position;
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
|
@ -20,6 +20,7 @@ public class CharColumn {
|
||||
|
||||
private static class Writer implements CharColumnWriter {
|
||||
private final StorageWriter storage;
|
||||
private long position = 0;
|
||||
|
||||
public Writer(StorageWriter storageWriter) throws IOException {
|
||||
this.storage = storageWriter;
|
||||
@ -27,6 +28,11 @@ public class CharColumn {
|
||||
|
||||
public void put(char value) throws IOException {
|
||||
storage.putChar(value);
|
||||
position++;
|
||||
}
|
||||
|
||||
public long position() {
|
||||
return position / Character.BYTES;
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
|
@ -20,6 +20,7 @@ public class DoubleColumn {
|
||||
|
||||
private static class Writer implements DoubleColumnWriter {
|
||||
private final StorageWriter storage;
|
||||
private long position = 0;
|
||||
|
||||
public Writer(StorageWriter storageWriter) throws IOException {
|
||||
this.storage = storageWriter;
|
||||
@ -27,6 +28,11 @@ public class DoubleColumn {
|
||||
|
||||
public void put(double value) throws IOException {
|
||||
storage.putDouble(value);
|
||||
position++;
|
||||
}
|
||||
|
||||
public long position() {
|
||||
return position / Double.BYTES;
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
|
@ -21,6 +21,7 @@ public class FloatColumn {
|
||||
|
||||
private static class Writer implements FloatColumnWriter {
|
||||
private final StorageWriter storage;
|
||||
private long position = 0;
|
||||
|
||||
public Writer(StorageWriter storageWriter) throws IOException {
|
||||
this.storage = storageWriter;
|
||||
@ -28,6 +29,11 @@ public class FloatColumn {
|
||||
|
||||
public void put(float value) throws IOException {
|
||||
storage.putFloat(value);
|
||||
position++;
|
||||
}
|
||||
|
||||
public long position() {
|
||||
return position;
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
@ -48,7 +54,7 @@ public class FloatColumn {
|
||||
|
||||
@Override
|
||||
public long position() throws IOException {
|
||||
return storage.position();
|
||||
return storage.position() / Float.BYTES;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -20,6 +20,7 @@ public class IntColumn {
|
||||
|
||||
private static class Writer implements IntColumnWriter {
|
||||
private final StorageWriter storage;
|
||||
private long position = 0;
|
||||
|
||||
public Writer(StorageWriter storageWriter) throws IOException {
|
||||
this.storage = storageWriter;
|
||||
@ -29,10 +30,16 @@ public class IntColumn {
|
||||
for (int value : values) {
|
||||
storage.putInt(value);
|
||||
}
|
||||
position+=values.length;
|
||||
}
|
||||
|
||||
public void put(int value) throws IOException {
|
||||
storage.putInt(value);
|
||||
position++;
|
||||
}
|
||||
|
||||
public long position() {
|
||||
return position / Integer.BYTES;
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
@ -53,7 +60,7 @@ public class IntColumn {
|
||||
|
||||
@Override
|
||||
public long position() throws IOException {
|
||||
return storage.position();
|
||||
return storage.position() / Integer.BYTES;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -20,6 +20,7 @@ public class LongColumn {
|
||||
|
||||
private static class Writer implements LongColumnWriter {
|
||||
private final StorageWriter storage;
|
||||
private long position = 0;
|
||||
|
||||
public Writer(StorageWriter storageWriter) {
|
||||
this.storage = storageWriter;
|
||||
@ -27,6 +28,11 @@ public class LongColumn {
|
||||
|
||||
public void put(long value) throws IOException {
|
||||
storage.putLong(value);
|
||||
position++;
|
||||
}
|
||||
|
||||
public long position() {
|
||||
return position;
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
@ -47,7 +53,7 @@ public class LongColumn {
|
||||
|
||||
@Override
|
||||
public long position() throws IOException {
|
||||
return storage.position();
|
||||
return storage.position() / Long.BYTES;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -19,13 +19,13 @@ public class EnumColumn {
|
||||
public static StringColumnReader open(Path path, ColumnDesc name) throws IOException {
|
||||
return new Reader(
|
||||
StringColumn.open(path,
|
||||
name.createDerivative(
|
||||
name.createSupplementaryColumn(
|
||||
ColumnFunction.DICT,
|
||||
ColumnType.TXTSTRING,
|
||||
StorageType.PLAIN)
|
||||
),
|
||||
VarintColumn.open(path,
|
||||
name.createDerivative(
|
||||
name.createSupplementaryColumn(
|
||||
ColumnFunction.DATA,
|
||||
ColumnType.ENUM_LE,
|
||||
StorageType.PLAIN
|
||||
@ -36,8 +36,8 @@ public class EnumColumn {
|
||||
|
||||
public static StringColumnWriter create(Path path, ColumnDesc name) throws IOException {
|
||||
return new Writer(
|
||||
StringColumn.create(path, name.createDerivative(ColumnFunction.DICT, ColumnType.TXTSTRING, StorageType.PLAIN)),
|
||||
VarintColumn.create(path, name.createDerivative(ColumnFunction.DATA, ColumnType.ENUM_LE, StorageType.PLAIN))
|
||||
StringColumn.create(path, name.createSupplementaryColumn(ColumnFunction.DICT, ColumnType.TXTSTRING, StorageType.PLAIN)),
|
||||
VarintColumn.create(path, name.createSupplementaryColumn(ColumnFunction.DATA, ColumnType.ENUM_LE, StorageType.PLAIN))
|
||||
);
|
||||
}
|
||||
|
||||
@ -64,6 +64,10 @@ public class EnumColumn {
|
||||
dataColumn.put(index);
|
||||
}
|
||||
|
||||
public long position() {
|
||||
return dataColumn.position();
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
dataColumn.close();
|
||||
dicionaryColumn.close();
|
||||
|
@ -51,6 +51,10 @@ public class StringColumn {
|
||||
backingColumn.put(value.getBytes());
|
||||
}
|
||||
|
||||
public long position() {
|
||||
return backingColumn.position();
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
backingColumn.close();
|
||||
}
|
||||
@ -92,6 +96,8 @@ public class StringColumn {
|
||||
private static class CStringWriter implements StringColumnWriter {
|
||||
private final StorageWriter storageWriter;
|
||||
|
||||
private long position = 0;
|
||||
|
||||
public CStringWriter(StorageWriter storageWriter) throws IOException {
|
||||
this.storageWriter = storageWriter;
|
||||
}
|
||||
@ -100,10 +106,14 @@ public class StringColumn {
|
||||
if (null == value) {
|
||||
value = "";
|
||||
}
|
||||
|
||||
assert value.indexOf('\0') == -1 : "Null byte not allowed in cstring";
|
||||
storageWriter.putBytes(value.getBytes());
|
||||
storageWriter.putByte((byte) 0);
|
||||
position++;
|
||||
}
|
||||
|
||||
public long position() {
|
||||
return position;
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
@ -113,6 +123,7 @@ public class StringColumn {
|
||||
|
||||
private static class CStringReader implements StringColumnReader {
|
||||
private final StorageReader storageReader;
|
||||
private long position = 0;
|
||||
|
||||
public CStringReader(StorageReader storageReader) throws IOException {
|
||||
this.storageReader = storageReader;
|
||||
@ -124,12 +135,13 @@ public class StringColumn {
|
||||
while (storageReader.hasRemaining() && (b = storageReader.getByte()) != 0) {
|
||||
sb.append((char) b);
|
||||
}
|
||||
position++;
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public long position() throws IOException {
|
||||
return storageReader.position();
|
||||
return position;
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -141,6 +153,7 @@ public class StringColumn {
|
||||
i++;
|
||||
}
|
||||
}
|
||||
position += positions;
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -157,6 +170,7 @@ public class StringColumn {
|
||||
|
||||
private static class TxtStringWriter implements StringColumnWriter {
|
||||
private final StorageWriter storageWriter;
|
||||
private long position = 0;
|
||||
|
||||
public TxtStringWriter(StorageWriter storageWriter) throws IOException {
|
||||
this.storageWriter = storageWriter;
|
||||
@ -171,6 +185,11 @@ public class StringColumn {
|
||||
|
||||
storageWriter.putBytes(value.getBytes());
|
||||
storageWriter.putByte((byte) '\n');
|
||||
position++;
|
||||
}
|
||||
|
||||
public long position() {
|
||||
return position;
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
@ -180,6 +199,7 @@ public class StringColumn {
|
||||
|
||||
private static class TxtStringReader implements StringColumnReader {
|
||||
private final StorageReader storageReader;
|
||||
private long position = 0;
|
||||
|
||||
public TxtStringReader(StorageReader storageReader) throws IOException {
|
||||
this.storageReader = storageReader;
|
||||
@ -197,18 +217,21 @@ public class StringColumn {
|
||||
sb.append((char) b);
|
||||
}
|
||||
}
|
||||
position++;
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public long position() throws IOException {
|
||||
return storageReader.position();
|
||||
return position;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void skip(long positions) throws IOException {
|
||||
int i = 0;
|
||||
|
||||
position+=positions;
|
||||
|
||||
while (i < positions && storageReader.hasRemaining()) {
|
||||
if (storageReader.getByte() == '\n') {
|
||||
i++;
|
||||
|
@ -36,20 +36,35 @@ public record ColumnDesc<R extends ColumnReader,
|
||||
this(name, 0, ColumnFunction.DATA, type, storageType);
|
||||
}
|
||||
|
||||
public R open(Path path) throws IOException {
|
||||
return type.open(path, this);
|
||||
/** Open a column reader for this column.
|
||||
*
|
||||
* @param table the table to register the reader with
|
||||
* @param path the path to the file to read from
|
||||
* */
|
||||
public R open(SlopTable table, Path path) throws IOException {
|
||||
var reader = type.open(path, this);
|
||||
table.register(reader);
|
||||
return reader;
|
||||
}
|
||||
|
||||
public W create(Path path) throws IOException {
|
||||
return type.register(path, this);
|
||||
/** Create a new column writer for this column.
|
||||
*
|
||||
* @param table the table to register the writer with
|
||||
* @param path the path to the file to write to
|
||||
* */
|
||||
public W create(SlopTable table, Path path) throws IOException {
|
||||
var writer = type.register(path, this);
|
||||
table.register(writer);
|
||||
return writer;
|
||||
}
|
||||
|
||||
public ColumnDesc createDerivative(
|
||||
public <R2 extends ColumnReader, W2 extends ColumnWriter >
|
||||
ColumnDesc<R2, W2> createSupplementaryColumn(
|
||||
ColumnFunction function,
|
||||
ColumnType type,
|
||||
ColumnType<R2, W2> type,
|
||||
StorageType storageType)
|
||||
{
|
||||
return new ColumnDesc(name, page, function, type, storageType);
|
||||
return new ColumnDesc<>(name, page, function, type, storageType);
|
||||
}
|
||||
|
||||
public ByteOrder byteOrder() {
|
||||
@ -57,7 +72,7 @@ public record ColumnDesc<R extends ColumnReader,
|
||||
}
|
||||
|
||||
public ColumnDesc<R, W> forPage(int page) {
|
||||
return new ColumnDesc(name, page, function, type, storageType);
|
||||
return new ColumnDesc<>(name, page, function, type, storageType);
|
||||
}
|
||||
|
||||
public boolean exists(Path base) {
|
||||
|
@ -0,0 +1,87 @@
|
||||
package nu.marginalia.slop.desc;
|
||||
|
||||
import nu.marginalia.slop.column.ColumnReader;
|
||||
import nu.marginalia.slop.column.ColumnWriter;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
/** SlopTable is a utility class for managing a group of columns that are
|
||||
* read and written together. It is used to ensure that the reader and writer
|
||||
* positions are maintained correctly between the columns, and to ensure that
|
||||
* the columns are closed correctly.
|
||||
* <p></p>
|
||||
* To deal with the fact that some columns may not be expected to have the same
|
||||
* number of rows, SlopTable supports the concept of column groups. Each column
|
||||
* group is a separate SlopTable instance, and the columns in the group are
|
||||
* managed together.
|
||||
* <p></p>
|
||||
* It is often a good idea to let the reader or writer class for a particular
|
||||
* table inherit from SlopTable, so that the table is automatically closed when
|
||||
* the reader or writer is closed.
|
||||
*/
|
||||
|
||||
public class SlopTable implements AutoCloseable {
|
||||
private final List<ColumnReader> readerList = new ArrayList<>();
|
||||
private final List<ColumnWriter> writerList = new ArrayList<>();
|
||||
|
||||
private final Map<String, SlopTable> columnGroups = new HashMap<>();
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(SlopTable.class);
|
||||
|
||||
/** Create a SlopTable corresponding to a grouping of columns that have their own
|
||||
* internal consistency check. This is needed e.g. for grouped values. The table is
|
||||
* closed automatically by the current instance.
|
||||
*/
|
||||
public SlopTable columnGroup(String name) {
|
||||
return columnGroups.computeIfAbsent(name, k -> new SlopTable());
|
||||
}
|
||||
|
||||
/** Register a column reader with this table. This is called from ColumnDesc. */
|
||||
void register(ColumnReader reader) {
|
||||
readerList.add(reader);
|
||||
}
|
||||
|
||||
/** Register a column reader with this table. This is called from ColumnDesc. */
|
||||
void register(ColumnWriter writer) {
|
||||
writerList.add(writer);
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
|
||||
Set<Long> positions = new HashSet<>();
|
||||
|
||||
for (ColumnReader reader : readerList) {
|
||||
positions.add(reader.position());
|
||||
reader.close();
|
||||
}
|
||||
for (ColumnWriter writer : writerList) {
|
||||
positions.add(writer.position());
|
||||
writer.close();
|
||||
}
|
||||
|
||||
|
||||
// Check for the scenario where we have multiple positions
|
||||
// and one of the positions is zero, indicating that we haven't
|
||||
// read or written to one of the columns. This is likely a bug,
|
||||
// but not necessarily a severe one, so we just log a warning.
|
||||
|
||||
if (positions.remove(0L) && !positions.isEmpty()) {
|
||||
logger.warn("Zero position found in one of the tables, this is likely development debris");
|
||||
}
|
||||
|
||||
// If there are more than one position and several are non-zero, then we haven't maintained the
|
||||
// position correctly between the columns. This is a disaster, so we throw an exception.
|
||||
if (positions.size() > 1) {
|
||||
throw new IllegalStateException("Expected only one reader position, was " + positions);
|
||||
}
|
||||
|
||||
for (var table : columnGroups.values()) {
|
||||
table.close();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
@ -1,9 +1,6 @@
|
||||
package nu.marginalia.slop.column;
|
||||
|
||||
import nu.marginalia.slop.desc.ColumnDesc;
|
||||
import nu.marginalia.slop.desc.ColumnFunction;
|
||||
import nu.marginalia.slop.desc.ColumnType;
|
||||
import nu.marginalia.slop.desc.StorageType;
|
||||
import nu.marginalia.slop.desc.*;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
@ -61,11 +58,15 @@ class StringColumnTest {
|
||||
ColumnType.STRING,
|
||||
StorageType.GZIP);
|
||||
|
||||
try (var column = name.create(tempDir)) {
|
||||
try (var table = new SlopTable()) {
|
||||
var column = name.create(table, tempDir);
|
||||
|
||||
column.put("Lorem");
|
||||
column.put("Ipsum");
|
||||
}
|
||||
try (var column = name.open(tempDir)) {
|
||||
try (var table = new SlopTable()) {
|
||||
var column = name.open(table, tempDir);
|
||||
|
||||
assertEquals("Lorem", column.get());
|
||||
assertEquals("Ipsum", column.get());
|
||||
assertFalse(column.hasRemaining());
|
||||
@ -80,11 +81,13 @@ class StringColumnTest {
|
||||
ColumnType.CSTRING,
|
||||
StorageType.GZIP);
|
||||
|
||||
try (var column = name.create(tempDir)) {
|
||||
try (var table = new SlopTable()) {
|
||||
var column = name.create(table, tempDir);
|
||||
column.put("Lorem");
|
||||
column.put("Ipsum");
|
||||
}
|
||||
try (var column = name.open(tempDir)) {
|
||||
try (var table = new SlopTable()) {
|
||||
var column = name.open(table, tempDir);
|
||||
assertEquals("Lorem", column.get());
|
||||
assertEquals("Ipsum", column.get());
|
||||
assertFalse(column.hasRemaining());
|
||||
@ -99,11 +102,13 @@ class StringColumnTest {
|
||||
ColumnType.TXTSTRING,
|
||||
StorageType.GZIP);
|
||||
|
||||
try (var column = name.create(tempDir)) {
|
||||
try (var table = new SlopTable()) {
|
||||
var column = name.create(table, tempDir);
|
||||
column.put("Lorem");
|
||||
column.put("Ipsum");
|
||||
}
|
||||
try (var column = name.open(tempDir)) {
|
||||
try (var table = new SlopTable()) {
|
||||
var column = name.open(table, tempDir);
|
||||
assertEquals("Lorem", column.get());
|
||||
assertEquals("Ipsum", column.get());
|
||||
assertFalse(column.hasRemaining());
|
||||
|
@ -36,6 +36,7 @@ dependencies {
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:libraries:message-queue')
|
||||
implementation project(':code:libraries:blocking-thread-pool')
|
||||
implementation project(':code:libraries:slop')
|
||||
|
||||
implementation project(':code:libraries:guarded-regex')
|
||||
implementation project(':code:libraries:easy-lsh')
|
||||
|
@ -14,6 +14,7 @@ import nu.marginalia.slop.column.string.StringColumnReader;
|
||||
import nu.marginalia.slop.column.string.StringColumnWriter;
|
||||
import nu.marginalia.slop.desc.ColumnDesc;
|
||||
import nu.marginalia.slop.desc.ColumnType;
|
||||
import nu.marginalia.slop.desc.SlopTable;
|
||||
import nu.marginalia.slop.desc.StorageType;
|
||||
|
||||
import java.io.IOException;
|
||||
@ -119,7 +120,7 @@ public record SlopDocumentRecord(
|
||||
private static final ColumnDesc<ByteArrayColumnReader, ByteArrayColumnWriter> spanCodesColumn = new ColumnDesc<>("spanCodes", ColumnType.BYTE_ARRAY, StorageType.ZSTD);
|
||||
private static final ColumnDesc<GammaCodedSequenceReader, GammaCodedSequenceWriter> spansColumn = new ColumnDesc<>("spans", ColumnType.BYTE_ARRAY_GCS, StorageType.ZSTD);
|
||||
|
||||
public static class KeywordsProjectionReader implements AutoCloseable {
|
||||
public static class KeywordsProjectionReader extends SlopTable {
|
||||
private final StringColumnReader domainsReader;
|
||||
private final VarintColumnReader ordinalsReader;
|
||||
private final IntColumnReader htmlFeaturesReader;
|
||||
@ -140,17 +141,19 @@ public record SlopDocumentRecord(
|
||||
}
|
||||
|
||||
public KeywordsProjectionReader(Path baseDir, int page) throws IOException {
|
||||
domainsReader = domainsColumn.forPage(page).open(baseDir);
|
||||
ordinalsReader = ordinalsColumn.forPage(page).open(baseDir);
|
||||
htmlFeaturesReader = htmlFeaturesColumn.forPage(page).open(baseDir);
|
||||
domainMetadataReader = domainMetadata.forPage(page).open(baseDir);
|
||||
lengthsReader = lengthsColumn.forPage(page).open(baseDir);
|
||||
keywordsReader = keywordsColumn.forPage(page).open(baseDir);
|
||||
termCountsReader = termCountsColumn.forPage(page).open(baseDir);
|
||||
termMetaReader = termMetaColumn.forPage(page).open(baseDir);
|
||||
termPositionsReader = termPositionsColumn.forPage(page).open(baseDir);
|
||||
spanCodesReader = spanCodesColumn.forPage(page).open(baseDir);
|
||||
spansReader = spansColumn.forPage(page).open(baseDir);
|
||||
domainsReader = domainsColumn.forPage(page).open(this, baseDir);
|
||||
ordinalsReader = ordinalsColumn.forPage(page).open(this, baseDir);
|
||||
htmlFeaturesReader = htmlFeaturesColumn.forPage(page).open(this, baseDir);
|
||||
domainMetadataReader = domainMetadata.forPage(page).open(this, baseDir);
|
||||
lengthsReader = lengthsColumn.forPage(page).open(this, baseDir);
|
||||
termCountsReader = termCountsColumn.forPage(page).open(this, baseDir);
|
||||
|
||||
keywordsReader = keywordsColumn.forPage(page).open(this.columnGroup("keywords"), baseDir);
|
||||
termMetaReader = termMetaColumn.forPage(page).open(this.columnGroup("keywords"), baseDir);
|
||||
termPositionsReader = termPositionsColumn.forPage(page).open(this.columnGroup("keywords"), baseDir);
|
||||
|
||||
spanCodesReader = spanCodesColumn.forPage(page).open(this.columnGroup("spans"), baseDir);
|
||||
spansReader = spansColumn.forPage(page).open(this.columnGroup("spans"), baseDir);
|
||||
}
|
||||
|
||||
public boolean hasMore() throws IOException {
|
||||
@ -197,22 +200,9 @@ public record SlopDocumentRecord(
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
public void close() throws IOException {
|
||||
domainsReader.close();
|
||||
ordinalsReader.close();
|
||||
htmlFeaturesReader.close();
|
||||
domainMetadataReader.close();
|
||||
lengthsReader.close();
|
||||
keywordsReader.close();
|
||||
termMetaReader.close();
|
||||
termPositionsReader.close();
|
||||
spanCodesReader.close();
|
||||
spansReader.close();
|
||||
}
|
||||
}
|
||||
|
||||
public static class MetadataReader implements AutoCloseable {
|
||||
public static class MetadataReader extends SlopTable {
|
||||
private final StringColumnReader domainsReader;
|
||||
private final StringColumnReader urlsReader;
|
||||
private final VarintColumnReader ordinalsReader;
|
||||
@ -230,17 +220,17 @@ public record SlopDocumentRecord(
|
||||
}
|
||||
|
||||
public MetadataReader(Path baseDir, int page) throws IOException {
|
||||
this.domainsReader = domainsColumn.forPage(page).open(baseDir);
|
||||
this.urlsReader = urlsColumn.forPage(page).open(baseDir);
|
||||
this.ordinalsReader = ordinalsColumn.forPage(page).open(baseDir);
|
||||
this.titlesReader = titlesColumn.forPage(page).open(baseDir);
|
||||
this.descriptionsReader = descriptionsColumn.forPage(page).open(baseDir);
|
||||
this.htmlFeaturesReader = htmlFeaturesColumn.forPage(page).open(baseDir);
|
||||
this.htmlStandardsReader = htmlStandardsColumn.forPage(page).open(baseDir);
|
||||
this.lengthsReader = lengthsColumn.forPage(page).open(baseDir);
|
||||
this.hashesReader = hashesColumn.forPage(page).open(baseDir);
|
||||
this.qualitiesReader = qualitiesColumn.forPage(page).open(baseDir);
|
||||
this.pubYearReader = pubYearColumn.forPage(page).open(baseDir);
|
||||
this.domainsReader = domainsColumn.forPage(page).open(this, baseDir);
|
||||
this.urlsReader = urlsColumn.forPage(page).open(this, baseDir);
|
||||
this.ordinalsReader = ordinalsColumn.forPage(page).open(this, baseDir);
|
||||
this.titlesReader = titlesColumn.forPage(page).open(this, baseDir);
|
||||
this.descriptionsReader = descriptionsColumn.forPage(page).open(this, baseDir);
|
||||
this.htmlFeaturesReader = htmlFeaturesColumn.forPage(page).open(this, baseDir);
|
||||
this.htmlStandardsReader = htmlStandardsColumn.forPage(page).open(this, baseDir);
|
||||
this.lengthsReader = lengthsColumn.forPage(page).open(this, baseDir);
|
||||
this.hashesReader = hashesColumn.forPage(page).open(this, baseDir);
|
||||
this.qualitiesReader = qualitiesColumn.forPage(page).open(this, baseDir);
|
||||
this.pubYearReader = pubYearColumn.forPage(page).open(this, baseDir);
|
||||
}
|
||||
|
||||
public MetadataProjection next() throws IOException {
|
||||
@ -264,22 +254,9 @@ public record SlopDocumentRecord(
|
||||
return domainsReader.hasRemaining();
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
domainsReader.close();
|
||||
urlsReader.close();
|
||||
ordinalsReader.close();
|
||||
titlesReader.close();
|
||||
descriptionsReader.close();
|
||||
htmlFeaturesReader.close();
|
||||
htmlStandardsReader.close();
|
||||
lengthsReader.close();
|
||||
hashesReader.close();
|
||||
qualitiesReader.close();
|
||||
pubYearReader.close();
|
||||
}
|
||||
}
|
||||
|
||||
public static class Writer implements AutoCloseable {
|
||||
public static class Writer extends SlopTable {
|
||||
private final StringColumnWriter domainsWriter;
|
||||
private final StringColumnWriter urlsWriter;
|
||||
private final VarintColumnWriter ordinalsWriter;
|
||||
@ -302,27 +279,28 @@ public record SlopDocumentRecord(
|
||||
private final GammaCodedSequenceWriter spansWriter;
|
||||
|
||||
public Writer(Path baseDir, int page) throws IOException {
|
||||
domainsWriter = domainsColumn.forPage(page).create(baseDir);
|
||||
urlsWriter = urlsColumn.forPage(page).create(baseDir);
|
||||
ordinalsWriter = ordinalsColumn.forPage(page).create(baseDir);
|
||||
statesWriter = statesColumn.forPage(page).create(baseDir);
|
||||
stateReasonsWriter = stateReasonsColumn.forPage(page).create(baseDir);
|
||||
titlesWriter = titlesColumn.forPage(page).create(baseDir);
|
||||
descriptionsWriter = descriptionsColumn.forPage(page).create(baseDir);
|
||||
htmlFeaturesWriter = htmlFeaturesColumn.forPage(page).create(baseDir);
|
||||
htmlStandardsWriter = htmlStandardsColumn.forPage(page).create(baseDir);
|
||||
lengthsWriter = lengthsColumn.forPage(page).create(baseDir);
|
||||
hashesWriter = hashesColumn.forPage(page).create(baseDir);
|
||||
qualitiesWriter = qualitiesColumn.forPage(page).create(baseDir);
|
||||
domainMetadataWriter = domainMetadata.forPage(page).create(baseDir);
|
||||
pubYearWriter = pubYearColumn.forPage(page).create(baseDir);
|
||||
termCountsWriter = termCountsColumn.forPage(page).create(baseDir);
|
||||
keywordsWriter = keywordsColumn.forPage(page).create(baseDir);
|
||||
termMetaWriter = termMetaColumn.forPage(page).create(baseDir);
|
||||
termPositionsWriter = termPositionsColumn.forPage(page).create(baseDir);
|
||||
domainsWriter = domainsColumn.forPage(page).create(this, baseDir);
|
||||
urlsWriter = urlsColumn.forPage(page).create(this, baseDir);
|
||||
ordinalsWriter = ordinalsColumn.forPage(page).create(this, baseDir);
|
||||
statesWriter = statesColumn.forPage(page).create(this, baseDir);
|
||||
stateReasonsWriter = stateReasonsColumn.forPage(page).create(this, baseDir);
|
||||
titlesWriter = titlesColumn.forPage(page).create(this, baseDir);
|
||||
descriptionsWriter = descriptionsColumn.forPage(page).create(this, baseDir);
|
||||
htmlFeaturesWriter = htmlFeaturesColumn.forPage(page).create(this, baseDir);
|
||||
htmlStandardsWriter = htmlStandardsColumn.forPage(page).create(this, baseDir);
|
||||
lengthsWriter = lengthsColumn.forPage(page).create(this, baseDir);
|
||||
hashesWriter = hashesColumn.forPage(page).create(this, baseDir);
|
||||
qualitiesWriter = qualitiesColumn.forPage(page).create(this, baseDir);
|
||||
domainMetadataWriter = domainMetadata.forPage(page).create(this, baseDir);
|
||||
pubYearWriter = pubYearColumn.forPage(page).create(this, baseDir);
|
||||
termCountsWriter = termCountsColumn.forPage(page).create(this, baseDir);
|
||||
|
||||
spansWriter = spansColumn.forPage(page).create(baseDir);
|
||||
spansCodesWriter = spanCodesColumn.forPage(page).create(baseDir);
|
||||
keywordsWriter = keywordsColumn.forPage(page).create(this.columnGroup("keywords"), baseDir);
|
||||
termMetaWriter = termMetaColumn.forPage(page).create(this.columnGroup("keywords"), baseDir);
|
||||
termPositionsWriter = termPositionsColumn.forPage(page).create(this.columnGroup("keywords"), baseDir);
|
||||
|
||||
spansWriter = spansColumn.forPage(page).create(this.columnGroup("spans"), baseDir);
|
||||
spansCodesWriter = spanCodesColumn.forPage(page).create(this.columnGroup("spans"), baseDir);
|
||||
}
|
||||
|
||||
public void write(SlopDocumentRecord record) throws IOException {
|
||||
@ -367,29 +345,5 @@ public record SlopDocumentRecord(
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
domainsWriter.close();
|
||||
urlsWriter.close();
|
||||
ordinalsWriter.close();
|
||||
statesWriter.close();
|
||||
stateReasonsWriter.close();
|
||||
titlesWriter.close();
|
||||
descriptionsWriter.close();
|
||||
htmlFeaturesWriter.close();
|
||||
htmlStandardsWriter.close();
|
||||
lengthsWriter.close();
|
||||
hashesWriter.close();
|
||||
qualitiesWriter.close();
|
||||
domainMetadataWriter.close();
|
||||
pubYearWriter.close();
|
||||
termCountsWriter.close();
|
||||
keywordsWriter.close();
|
||||
termMetaWriter.close();
|
||||
termPositionsWriter.close();
|
||||
|
||||
spansCodesWriter.close();
|
||||
spansWriter.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -4,6 +4,7 @@ import nu.marginalia.slop.column.string.StringColumnReader;
|
||||
import nu.marginalia.slop.column.string.StringColumnWriter;
|
||||
import nu.marginalia.slop.desc.ColumnDesc;
|
||||
import nu.marginalia.slop.desc.ColumnType;
|
||||
import nu.marginalia.slop.desc.SlopTable;
|
||||
import nu.marginalia.slop.desc.StorageType;
|
||||
|
||||
import java.io.IOException;
|
||||
@ -21,7 +22,7 @@ public record SlopDomainLinkRecord(
|
||||
return new Reader(baseDir, page);
|
||||
}
|
||||
|
||||
public static class Reader implements AutoCloseable {
|
||||
public static class Reader extends SlopTable {
|
||||
private final StringColumnReader sourcesReader;
|
||||
private final StringColumnReader destsReader;
|
||||
|
||||
@ -30,15 +31,8 @@ public record SlopDomainLinkRecord(
|
||||
}
|
||||
|
||||
public Reader(Path baseDir, int page) throws IOException {
|
||||
sourcesReader = sourcesColumn.forPage(page).open(baseDir);
|
||||
destsReader = destsColumn.forPage(page).open(baseDir);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
sourcesReader.close();
|
||||
destsReader.close();
|
||||
sourcesReader = sourcesColumn.forPage(page).open(this, baseDir);
|
||||
destsReader = destsColumn.forPage(page).open(this, baseDir);
|
||||
}
|
||||
|
||||
public boolean hasMore() throws IOException {
|
||||
@ -60,13 +54,13 @@ public record SlopDomainLinkRecord(
|
||||
}
|
||||
}
|
||||
|
||||
public static class Writer implements AutoCloseable {
|
||||
public static class Writer extends SlopTable {
|
||||
private final StringColumnWriter sourcesWriter;
|
||||
private final StringColumnWriter destsWriter;
|
||||
|
||||
public Writer(Path baseDir, int page) throws IOException {
|
||||
sourcesWriter = sourcesColumn.forPage(page).create(baseDir);
|
||||
destsWriter = destsColumn.forPage(page).create(baseDir);
|
||||
sourcesWriter = sourcesColumn.forPage(page).create(this, baseDir);
|
||||
destsWriter = destsColumn.forPage(page).create(this, baseDir);
|
||||
}
|
||||
|
||||
public void write(SlopDomainLinkRecord record) throws IOException {
|
||||
|
@ -6,6 +6,7 @@ import nu.marginalia.slop.column.string.StringColumnReader;
|
||||
import nu.marginalia.slop.column.string.StringColumnWriter;
|
||||
import nu.marginalia.slop.desc.ColumnDesc;
|
||||
import nu.marginalia.slop.desc.ColumnType;
|
||||
import nu.marginalia.slop.desc.SlopTable;
|
||||
import nu.marginalia.slop.desc.StorageType;
|
||||
|
||||
import java.io.IOException;
|
||||
@ -43,7 +44,7 @@ public record SlopDomainRecord(
|
||||
private static final ColumnDesc<StringColumnReader, StringColumnWriter> rssFeedsColumn = new ColumnDesc<>("rssFeeds", ColumnType.TXTSTRING, StorageType.GZIP);
|
||||
|
||||
|
||||
public static class DomainNameReader implements AutoCloseable {
|
||||
public static class DomainNameReader extends SlopTable {
|
||||
private final StringColumnReader domainsReader;
|
||||
|
||||
public DomainNameReader(SlopPageRef<SlopDomainRecord> page) throws IOException {
|
||||
@ -51,13 +52,7 @@ public record SlopDomainRecord(
|
||||
}
|
||||
|
||||
public DomainNameReader(Path baseDir, int page) throws IOException {
|
||||
domainsReader = domainsColumn.forPage(page).open(baseDir);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
domainsReader.close();
|
||||
domainsReader = domainsColumn.forPage(page).open(this, baseDir);
|
||||
}
|
||||
|
||||
public boolean hasMore() throws IOException {
|
||||
@ -69,7 +64,7 @@ public record SlopDomainRecord(
|
||||
}
|
||||
}
|
||||
|
||||
public static class DomainWithIpReader implements AutoCloseable {
|
||||
public static class DomainWithIpReader extends SlopTable {
|
||||
private final StringColumnReader domainsReader;
|
||||
private final StringColumnReader ipReader;
|
||||
|
||||
@ -78,15 +73,8 @@ public record SlopDomainRecord(
|
||||
}
|
||||
|
||||
public DomainWithIpReader(Path baseDir, int page) throws IOException {
|
||||
domainsReader = domainsColumn.forPage(page).open(baseDir);
|
||||
ipReader = ipColumn.forPage(page).open(baseDir);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
domainsReader.close();
|
||||
ipReader.close();
|
||||
domainsReader = domainsColumn.forPage(page).open(this, baseDir);
|
||||
ipReader = ipColumn.forPage(page).open(this, baseDir);
|
||||
}
|
||||
|
||||
public boolean hasMore() throws IOException {
|
||||
@ -102,7 +90,7 @@ public record SlopDomainRecord(
|
||||
}
|
||||
}
|
||||
|
||||
public static class Reader implements AutoCloseable {
|
||||
public static class Reader extends SlopTable {
|
||||
private final StringColumnReader domainsReader;
|
||||
private final StringColumnReader statesReader;
|
||||
private final StringColumnReader redirectReader;
|
||||
@ -120,33 +108,17 @@ public record SlopDomainRecord(
|
||||
}
|
||||
|
||||
public Reader(Path baseDir, int page) throws IOException {
|
||||
domainsReader = domainsColumn.forPage(page).open(baseDir);
|
||||
statesReader = statesColumn.forPage(page).open(baseDir);
|
||||
redirectReader = redirectDomainsColumn.forPage(page).open(baseDir);
|
||||
ipReader = ipColumn.forPage(page).open(baseDir);
|
||||
domainsReader = domainsColumn.forPage(page).open(this, baseDir);
|
||||
statesReader = statesColumn.forPage(page).open(this, baseDir);
|
||||
redirectReader = redirectDomainsColumn.forPage(page).open(this, baseDir);
|
||||
ipReader = ipColumn.forPage(page).open(this, baseDir);
|
||||
|
||||
knownUrlsReader = knownUrlsColumn.forPage(page).open(baseDir);
|
||||
goodUrlsReader = goodUrlsColumn.forPage(page).open(baseDir);
|
||||
visitedUrlsReader = visitedUrlsColumn.forPage(page).open(baseDir);
|
||||
knownUrlsReader = knownUrlsColumn.forPage(page).open(this, baseDir);
|
||||
goodUrlsReader = goodUrlsColumn.forPage(page).open(this, baseDir);
|
||||
visitedUrlsReader = visitedUrlsColumn.forPage(page).open(this, baseDir);
|
||||
|
||||
rssFeedsCountReader = rssFeedsCountColumn.forPage(page).open(baseDir);
|
||||
rssFeedsReader = rssFeedsColumn.forPage(page).open(baseDir);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
domainsReader.close();
|
||||
statesReader.close();
|
||||
redirectReader.close();
|
||||
ipReader.close();
|
||||
|
||||
knownUrlsReader.close();
|
||||
goodUrlsReader.close();
|
||||
visitedUrlsReader.close();
|
||||
|
||||
rssFeedsCountReader.close();
|
||||
rssFeedsReader.close();
|
||||
rssFeedsCountReader = rssFeedsCountColumn.forPage(page).open(this, baseDir);
|
||||
rssFeedsReader = rssFeedsColumn.forPage(page).open(this, baseDir);
|
||||
}
|
||||
|
||||
public boolean hasMore() throws IOException {
|
||||
@ -179,7 +151,7 @@ public record SlopDomainRecord(
|
||||
}
|
||||
}
|
||||
|
||||
public static class Writer implements AutoCloseable {
|
||||
public static class Writer extends SlopTable {
|
||||
private final StringColumnWriter domainsWriter;
|
||||
private final StringColumnWriter statesWriter;
|
||||
private final StringColumnWriter redirectWriter;
|
||||
@ -193,17 +165,17 @@ public record SlopDomainRecord(
|
||||
private final StringColumnWriter rssFeedsWriter;
|
||||
|
||||
public Writer(Path baseDir, int page) throws IOException {
|
||||
domainsWriter = domainsColumn.forPage(page).create(baseDir);
|
||||
statesWriter = statesColumn.forPage(page).create(baseDir);
|
||||
redirectWriter = redirectDomainsColumn.forPage(page).create(baseDir);
|
||||
ipWriter = ipColumn.forPage(page).create(baseDir);
|
||||
domainsWriter = domainsColumn.forPage(page).create(this, baseDir);
|
||||
statesWriter = statesColumn.forPage(page).create(this, baseDir);
|
||||
redirectWriter = redirectDomainsColumn.forPage(page).create(this, baseDir);
|
||||
ipWriter = ipColumn.forPage(page).create(this, baseDir);
|
||||
|
||||
knownUrlsWriter = knownUrlsColumn.forPage(page).create(baseDir);
|
||||
goodUrlsWriter = goodUrlsColumn.forPage(page).create(baseDir);
|
||||
visitedUrlsWriter = visitedUrlsColumn.forPage(page).create(baseDir);
|
||||
knownUrlsWriter = knownUrlsColumn.forPage(page).create(this, baseDir);
|
||||
goodUrlsWriter = goodUrlsColumn.forPage(page).create(this, baseDir);
|
||||
visitedUrlsWriter = visitedUrlsColumn.forPage(page).create(this, baseDir);
|
||||
|
||||
rssFeedsCountWriter = rssFeedsCountColumn.forPage(page).create(baseDir);
|
||||
rssFeedsWriter = rssFeedsColumn.forPage(page).create(baseDir);
|
||||
rssFeedsCountWriter = rssFeedsCountColumn.forPage(page).create(this, baseDir);
|
||||
rssFeedsWriter = rssFeedsColumn.forPage(page).create(this.columnGroup("rss-feeds"), baseDir);
|
||||
}
|
||||
|
||||
public void write(SlopDomainRecord record) throws IOException {
|
||||
@ -221,20 +193,5 @@ public record SlopDomainRecord(
|
||||
rssFeedsWriter.put(rssFeed);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
domainsWriter.close();
|
||||
statesWriter.close();
|
||||
redirectWriter.close();
|
||||
ipWriter.close();
|
||||
|
||||
knownUrlsWriter.close();
|
||||
goodUrlsWriter.close();
|
||||
visitedUrlsWriter.close();
|
||||
|
||||
rssFeedsCountWriter.close();
|
||||
rssFeedsWriter.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -32,6 +32,7 @@ dependencies {
|
||||
implementation project(':code:libraries:message-queue')
|
||||
implementation project(':code:libraries:language-processing')
|
||||
implementation project(':code:libraries:coded-sequence')
|
||||
implementation project(':code:libraries:slop')
|
||||
implementation project(':third-party:commons-codec')
|
||||
implementation project(':third-party:parquet-floor')
|
||||
testImplementation project(':code:services-application:search-service')
|
||||
|
Loading…
Reference in New Issue
Block a user