diff --git a/code/index/build.gradle b/code/index/build.gradle index db4dab20..bf50a507 100644 --- a/code/index/build.gradle +++ b/code/index/build.gradle @@ -22,6 +22,7 @@ dependencies { implementation project(':code:libraries:array') implementation project(':code:libraries:btree') + implementation project(':code:libraries:slop') implementation project(':code:libraries:coded-sequence') implementation project(':code:common:db') diff --git a/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexConverter.java b/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexConverter.java index 2edc283f..66f45736 100644 --- a/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexConverter.java +++ b/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexConverter.java @@ -9,6 +9,7 @@ import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.process.control.ProcessHeartbeat; import nu.marginalia.slop.column.primitive.LongColumnReader; +import nu.marginalia.slop.desc.SlopTable; import org.roaringbitmap.longlong.LongConsumer; import org.roaringbitmap.longlong.Roaring64Bitmap; import org.slf4j.Logger; @@ -80,16 +81,15 @@ public class ForwardIndexConverter { ByteBuffer workArea = ByteBuffer.allocate(65536); for (var instance : journal.pages()) { - try (var docIdReader = instance.openCombinedId(); - var metaReader = instance.openDocumentMeta(); - var featuresReader = instance.openFeatures(); - var sizeReader = instance.openSize(); - - var spansCodesReader = instance.openSpanCodes(); - var spansSeqReader = instance.openSpans(); - var spansWriter = new ForwardIndexSpansWriter(outputFileSpansData) - ) + try (var slopTable = new SlopTable(); var spansWriter = new ForwardIndexSpansWriter(outputFileSpansData)) { + var docIdReader = instance.openCombinedId(slopTable); + var metaReader = instance.openDocumentMeta(slopTable); + var featuresReader = instance.openFeatures(slopTable); + var sizeReader = instance.openSize(slopTable); + var spansCodesReader = instance.openSpanCodes(slopTable); + var spansSeqReader = instance.openSpans(slopTable); + while (docIdReader.hasRemaining()) { long docId = docIdReader.get(); int domainId = UrlIdCodec.getDomainId(docId); @@ -148,7 +148,9 @@ public class ForwardIndexConverter { Roaring64Bitmap rbm = new Roaring64Bitmap(); for (var instance : journalReader.pages()) { - try (LongColumnReader idReader = instance.openCombinedId()) { + try (var slopTable = new SlopTable()) { + LongColumnReader idReader = instance.openCombinedId(slopTable); + while (idReader.hasRemaining()) { rbm.add(idReader.get()); } diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/IndexJournalPage.java b/code/index/index-journal/java/nu/marginalia/index/journal/IndexJournalPage.java index 8b8d7c2e..ee5c1be7 100644 --- a/code/index/index-journal/java/nu/marginalia/index/journal/IndexJournalPage.java +++ b/code/index/index-journal/java/nu/marginalia/index/journal/IndexJournalPage.java @@ -9,6 +9,7 @@ import nu.marginalia.slop.column.dynamic.VarintColumnWriter; import nu.marginalia.slop.column.primitive.*; import nu.marginalia.slop.desc.ColumnDesc; import nu.marginalia.slop.desc.ColumnType; +import nu.marginalia.slop.desc.SlopTable; import nu.marginalia.slop.desc.StorageType; import java.io.IOException; @@ -34,43 +35,43 @@ public record IndexJournalPage(Path baseDir, int page) { } } - public LongColumnReader openCombinedId() throws IOException { - return combinedId.forPage(page).open(baseDir); + public LongColumnReader openCombinedId(SlopTable table) throws IOException { + return combinedId.forPage(page).open(table, baseDir); } - public LongColumnReader openDocumentMeta() throws IOException { - return documentMeta.forPage(page).open(baseDir); + public LongColumnReader openDocumentMeta(SlopTable table) throws IOException { + return documentMeta.forPage(page).open(table, baseDir); } - public IntColumnReader openFeatures() throws IOException { - return features.forPage(page).open(baseDir); + public IntColumnReader openFeatures(SlopTable table) throws IOException { + return features.forPage(page).open(table, baseDir); } - public IntColumnReader openSize() throws IOException { - return size.forPage(page).open(baseDir); + public IntColumnReader openSize(SlopTable table) throws IOException { + return size.forPage(page).open(table, baseDir); } - public LongColumnReader openTermCounts() throws IOException { - return termCounts.forPage(page).open(baseDir); + public LongColumnReader openTermCounts(SlopTable table) throws IOException { + return termCounts.forPage(page).open(table, baseDir); } - public LongColumnReader openTermIds() throws IOException { - return termIds.forPage(page).open(baseDir); + public LongColumnReader openTermIds(SlopTable table) throws IOException { + return termIds.forPage(page).open(table.columnGroup("keywords"), baseDir); } - public ByteColumnReader openTermMetadata() throws IOException { - return termMeta.forPage(page).open(baseDir); + public ByteColumnReader openTermMetadata(SlopTable table) throws IOException { + return termMeta.forPage(page).open(table.columnGroup("keywords"), baseDir); } - public GammaCodedSequenceReader openTermPositions() throws IOException { - return positions.forPage(page).open(baseDir); + public GammaCodedSequenceReader openTermPositions(SlopTable table) throws IOException { + return positions.forPage(page).open(table.columnGroup("keywords"), baseDir); } - public GammaCodedSequenceReader openSpans() throws IOException { - return spans.forPage(page).open(baseDir); + public GammaCodedSequenceReader openSpans(SlopTable table) throws IOException { + return spans.forPage(page).open(table.columnGroup("spans"), baseDir); } - public ByteArrayColumnReader openSpanCodes() throws IOException { - return spanCodes.forPage(page).open(baseDir); + public ByteArrayColumnReader openSpanCodes(SlopTable table) throws IOException { + return spanCodes.forPage(page).open(table.columnGroup("spans"), baseDir); } } diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/IndexJournalSlopWriter.java b/code/index/index-journal/java/nu/marginalia/index/journal/IndexJournalSlopWriter.java index 10e4edd6..492fd605 100644 --- a/code/index/index-journal/java/nu/marginalia/index/journal/IndexJournalSlopWriter.java +++ b/code/index/index-journal/java/nu/marginalia/index/journal/IndexJournalSlopWriter.java @@ -10,13 +10,14 @@ import nu.marginalia.slop.column.dynamic.GammaCodedSequenceWriter; import nu.marginalia.slop.column.primitive.ByteColumnWriter; import nu.marginalia.slop.column.primitive.IntColumnWriter; import nu.marginalia.slop.column.primitive.LongColumnWriter; +import nu.marginalia.slop.desc.SlopTable; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.util.List; -public class IndexJournalSlopWriter implements AutoCloseable { +public class IndexJournalSlopWriter extends SlopTable { private final IntColumnWriter featuresWriter; private final IntColumnWriter sizeWriter; @@ -39,19 +40,20 @@ public class IndexJournalSlopWriter implements AutoCloseable { } - featuresWriter = IndexJournalPage.features.forPage(page).create(dir); - sizeWriter = IndexJournalPage.size.forPage(page).create(dir); + featuresWriter = IndexJournalPage.features.forPage(page).create(this, dir); + sizeWriter = IndexJournalPage.size.forPage(page).create(this, dir); - combinedIdWriter = IndexJournalPage.combinedId.forPage(page).create(dir); - documentMetaWriter = IndexJournalPage.documentMeta.forPage(page).create(dir); + combinedIdWriter = IndexJournalPage.combinedId.forPage(page).create(this, dir); + documentMetaWriter = IndexJournalPage.documentMeta.forPage(page).create(this, dir); - termCountsWriter = IndexJournalPage.termCounts.forPage(page).create(dir); - termIdsWriter = IndexJournalPage.termIds.forPage(page).create(dir); - termMetadataWriter = IndexJournalPage.termMeta.forPage(page).create(dir); - termPositionsWriter = IndexJournalPage.positions.forPage(page).create(dir); + termCountsWriter = IndexJournalPage.termCounts.forPage(page).create(this, dir); - spansWriter = IndexJournalPage.spans.forPage(page).create(dir); - spanCodesWriter = IndexJournalPage.spanCodes.forPage(page).create(dir); + termIdsWriter = IndexJournalPage.termIds.forPage(page).create(this.columnGroup("keywords"), dir); + termMetadataWriter = IndexJournalPage.termMeta.forPage(page).create(this.columnGroup("keywords"), dir); + termPositionsWriter = IndexJournalPage.positions.forPage(page).create(this.columnGroup("keywords"), dir); + + spansWriter = IndexJournalPage.spans.forPage(page).create(this.columnGroup("spans"), dir); + spanCodesWriter = IndexJournalPage.spanCodes.forPage(page).create(this.columnGroup("spans"), dir); } @SneakyThrows diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexDocuments.java b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexDocuments.java index 9cadeb41..7418f92a 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexDocuments.java +++ b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexDocuments.java @@ -7,6 +7,7 @@ import nu.marginalia.index.construction.DocIdRewriter; import nu.marginalia.index.construction.PositionsFileConstructor; import nu.marginalia.index.journal.IndexJournalPage; import nu.marginalia.rwf.RandomFileAssembler; +import nu.marginalia.slop.desc.SlopTable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -78,12 +79,13 @@ public class FullPreindexDocuments { final ByteBuffer tempBuffer = ByteBuffer.allocate(65536); try (var assembly = RandomFileAssembler.create(workDir, fileSizeLongs); - var docIds = journalInstance.openCombinedId(); - var termCounts = journalInstance.openTermCounts(); - var termIds = journalInstance.openTermIds(); - var termMeta = journalInstance.openTermMetadata(); - var positions = journalInstance.openTermPositions()) + var slopTable = new SlopTable()) { + var docIds = journalInstance.openCombinedId(slopTable); + var termCounts = journalInstance.openTermCounts(slopTable); + var termIds = journalInstance.openTermIds(slopTable); + var termMeta = journalInstance.openTermMetadata(slopTable); + var positions = journalInstance.openTermPositions(slopTable); var offsetMap = segments.asMap(RECORD_SIZE_LONGS); offsetMap.defaultReturnValue(0); diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexWordSegments.java b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexWordSegments.java index 120b1326..51987a36 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexWordSegments.java +++ b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexWordSegments.java @@ -6,6 +6,7 @@ import it.unimi.dsi.fastutil.longs.LongIterator; import nu.marginalia.array.LongArray; import nu.marginalia.array.LongArrayFactory; import nu.marginalia.index.journal.IndexJournalPage; +import nu.marginalia.slop.desc.SlopTable; import java.io.IOException; import java.nio.file.Files; @@ -59,7 +60,8 @@ public class FullPreindexWordSegments { Long2IntOpenHashMap countsMap = new Long2IntOpenHashMap(100_000, 0.75f); countsMap.defaultReturnValue(0); - try (var termIds = journalInstance.openTermIds()) { + try (var slopTable = new SlopTable()) { + var termIds = journalInstance.openTermIds(slopTable); while (termIds.hasRemaining()) { countsMap.addTo(termIds.get(), 1); } diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexDocuments.java b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexDocuments.java index bdda5a4f..e5ab2409 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexDocuments.java +++ b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexDocuments.java @@ -6,6 +6,7 @@ import nu.marginalia.array.LongArrayFactory; import nu.marginalia.index.construction.DocIdRewriter; import nu.marginalia.index.journal.IndexJournalPage; import nu.marginalia.rwf.RandomFileAssembler; +import nu.marginalia.slop.desc.SlopTable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -67,11 +68,12 @@ public class PrioPreindexDocuments { long fileSizeLongs = RECORD_SIZE_LONGS * segments.totalSize(); try (var assembly = RandomFileAssembler.create(workDir, fileSizeLongs); - var docIds = journalInstance.openCombinedId(); - var termIdsCounts = journalInstance.openTermCounts(); - var termIds = journalInstance.openTermIds(); - var termMeta = journalInstance.openTermMetadata()) + var slopTable = new SlopTable()) { + var docIds = journalInstance.openCombinedId(slopTable); + var termIdsCounts = journalInstance.openTermCounts(slopTable); + var termIds = journalInstance.openTermIds(slopTable); + var termMeta = journalInstance.openTermMetadata(slopTable); var offsetMap = segments.asMap(RECORD_SIZE_LONGS); offsetMap.defaultReturnValue(0); diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexWordSegments.java b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexWordSegments.java index c2fe2e96..a30d8a5f 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexWordSegments.java +++ b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexWordSegments.java @@ -6,6 +6,7 @@ import it.unimi.dsi.fastutil.longs.LongIterator; import nu.marginalia.array.LongArray; import nu.marginalia.array.LongArrayFactory; import nu.marginalia.index.journal.IndexJournalPage; +import nu.marginalia.slop.desc.SlopTable; import java.io.IOException; import java.nio.file.Files; @@ -59,8 +60,9 @@ public class PrioPreindexWordSegments { Long2IntOpenHashMap countsMap = new Long2IntOpenHashMap(100_000, 0.75f); countsMap.defaultReturnValue(0); - try (var termIds = journalInstance.openTermIds(); - var termMetas = journalInstance.openTermMetadata()) { + try (var slopTable = new SlopTable()) { + var termIds = journalInstance.openTermIds(slopTable); + var termMetas = journalInstance.openTermMetadata(slopTable); while (termIds.hasRemaining()) { long data = termIds.get(); diff --git a/code/libraries/slop/java/nu/marginalia/slop/column/ColumnReader.java b/code/libraries/slop/java/nu/marginalia/slop/column/ColumnReader.java index 89a87740..644ee788 100644 --- a/code/libraries/slop/java/nu/marginalia/slop/column/ColumnReader.java +++ b/code/libraries/slop/java/nu/marginalia/slop/column/ColumnReader.java @@ -11,4 +11,6 @@ public interface ColumnReader { } boolean hasRemaining() throws IOException; + + void close() throws IOException; } diff --git a/code/libraries/slop/java/nu/marginalia/slop/column/ColumnWriter.java b/code/libraries/slop/java/nu/marginalia/slop/column/ColumnWriter.java index 00e06ae2..661a4021 100644 --- a/code/libraries/slop/java/nu/marginalia/slop/column/ColumnWriter.java +++ b/code/libraries/slop/java/nu/marginalia/slop/column/ColumnWriter.java @@ -1,4 +1,10 @@ package nu.marginalia.slop.column; +import java.io.IOException; + public interface ColumnWriter { + /** Return the current record index in the column */ + long position(); + + void close() throws IOException; } diff --git a/code/libraries/slop/java/nu/marginalia/slop/column/array/ByteArrayColumn.java b/code/libraries/slop/java/nu/marginalia/slop/column/array/ByteArrayColumn.java index 24165be4..f641de3f 100644 --- a/code/libraries/slop/java/nu/marginalia/slop/column/array/ByteArrayColumn.java +++ b/code/libraries/slop/java/nu/marginalia/slop/column/array/ByteArrayColumn.java @@ -19,7 +19,7 @@ public class ByteArrayColumn { return new Reader( Storage.reader(path, name, true), VarintColumn.open(path, - name.createDerivative(name.function().lengthsTable(), + name.createSupplementaryColumn(name.function().lengthsTable(), ColumnType.VARINT_LE, StorageType.PLAIN) ) @@ -30,7 +30,7 @@ public class ByteArrayColumn { return new Writer( Storage.writer(path, name), VarintColumn.create(path, - name.createDerivative(name.function().lengthsTable(), + name.createSupplementaryColumn(name.function().lengthsTable(), ColumnType.VARINT_LE, StorageType.PLAIN) ) @@ -41,16 +41,23 @@ public class ByteArrayColumn { private final StorageWriter storage; private final VarintColumnWriter lengthsWriter; + private long position = 0; + public Writer(StorageWriter storage, VarintColumnWriter lengthsWriter) throws IOException { this.storage = storage; this.lengthsWriter = lengthsWriter; } public void put(byte[] value) throws IOException { + position ++; storage.putBytes(value); lengthsWriter.put(value.length); } + public long position() { + return position; + } + public void close() throws IOException { storage.close(); lengthsWriter.close(); diff --git a/code/libraries/slop/java/nu/marginalia/slop/column/array/IntArrayColumn.java b/code/libraries/slop/java/nu/marginalia/slop/column/array/IntArrayColumn.java index 4aeb1fcf..c5a1421c 100644 --- a/code/libraries/slop/java/nu/marginalia/slop/column/array/IntArrayColumn.java +++ b/code/libraries/slop/java/nu/marginalia/slop/column/array/IntArrayColumn.java @@ -17,7 +17,7 @@ public class IntArrayColumn { public static IntArrayColumnReader open(Path path, ColumnDesc name) throws IOException { return new Reader(Storage.reader(path, name, true), - VarintColumn.open(path, name.createDerivative(name.function().lengthsTable(), + VarintColumn.open(path, name.createSupplementaryColumn(name.function().lengthsTable(), ColumnType.VARINT_LE, StorageType.PLAIN) ) @@ -26,7 +26,7 @@ public class IntArrayColumn { public static IntArrayColumnWriter create(Path path, ColumnDesc name) throws IOException { return new Writer(Storage.writer(path, name), - VarintColumn.create(path, name.createDerivative(name.function().lengthsTable(), + VarintColumn.create(path, name.createSupplementaryColumn(name.function().lengthsTable(), ColumnType.VARINT_LE, StorageType.PLAIN) ) @@ -47,6 +47,10 @@ public class IntArrayColumn { lengthsWriter.put(value.length); } + public long position() { + return lengthsWriter.position(); + } + public void close() throws IOException { storage.close(); lengthsWriter.close(); diff --git a/code/libraries/slop/java/nu/marginalia/slop/column/array/LongArrayColumn.java b/code/libraries/slop/java/nu/marginalia/slop/column/array/LongArrayColumn.java index abe96f6e..b805a085 100644 --- a/code/libraries/slop/java/nu/marginalia/slop/column/array/LongArrayColumn.java +++ b/code/libraries/slop/java/nu/marginalia/slop/column/array/LongArrayColumn.java @@ -17,7 +17,7 @@ public class LongArrayColumn { public static LongArrayColumnReader open(Path path, ColumnDesc name) throws IOException { return new LongArrayColumn.Reader(Storage.reader(path, name, true), - VarintColumn.open(path, name.createDerivative(name.function().lengthsTable(), + VarintColumn.open(path, name.createSupplementaryColumn(name.function().lengthsTable(), ColumnType.VARINT_LE, StorageType.PLAIN) ) @@ -26,7 +26,7 @@ public class LongArrayColumn { public static LongArrayColumnWriter create(Path path, ColumnDesc name) throws IOException { return new LongArrayColumn.Writer(Storage.writer(path, name), - VarintColumn.create(path, name.createDerivative(name.function().lengthsTable(), + VarintColumn.create(path, name.createSupplementaryColumn(name.function().lengthsTable(), ColumnType.VARINT_LE, StorageType.PLAIN) ) @@ -47,6 +47,10 @@ public class LongArrayColumn { lengthsWriter.put(value.length); } + public long position() { + return lengthsWriter.position(); + } + public void close() throws IOException { storage.close(); lengthsWriter.close(); diff --git a/code/libraries/slop/java/nu/marginalia/slop/column/dynamic/CustomBinaryColumn.java b/code/libraries/slop/java/nu/marginalia/slop/column/dynamic/CustomBinaryColumn.java index 1bc8d350..910a02a2 100644 --- a/code/libraries/slop/java/nu/marginalia/slop/column/dynamic/CustomBinaryColumn.java +++ b/code/libraries/slop/java/nu/marginalia/slop/column/dynamic/CustomBinaryColumn.java @@ -16,7 +16,7 @@ public class CustomBinaryColumn { public static CustomBinaryColumnReader open(Path path, ColumnDesc name) throws IOException { return new Reader( Storage.reader(path, name, false), // note we must never pass aligned=true here, as the data is not guaranteed alignment - VarintColumn.open(path, name.createDerivative(ColumnFunction.DATA_LEN, + VarintColumn.open(path, name.createSupplementaryColumn(ColumnFunction.DATA_LEN, ColumnType.VARINT_LE, StorageType.PLAIN) ) @@ -26,7 +26,7 @@ public class CustomBinaryColumn { public static CustomBinaryColumnWriter create(Path path, ColumnDesc name) throws IOException { return new Writer( Storage.writer(path, name), - VarintColumn.create(path, name.createDerivative(ColumnFunction.DATA_LEN, + VarintColumn.create(path, name.createSupplementaryColumn(ColumnFunction.DATA_LEN, ColumnType.VARINT_LE, StorageType.PLAIN) ) @@ -62,6 +62,10 @@ public class CustomBinaryColumn { }; } + public long position() { + return indexWriter.position(); + } + public void close() throws IOException { indexWriter.close(); storage.close(); diff --git a/code/libraries/slop/java/nu/marginalia/slop/column/dynamic/GammaCodedSequenceColumn.java b/code/libraries/slop/java/nu/marginalia/slop/column/dynamic/GammaCodedSequenceColumn.java index 55e19f80..cead27b6 100644 --- a/code/libraries/slop/java/nu/marginalia/slop/column/dynamic/GammaCodedSequenceColumn.java +++ b/code/libraries/slop/java/nu/marginalia/slop/column/dynamic/GammaCodedSequenceColumn.java @@ -18,7 +18,7 @@ public class GammaCodedSequenceColumn { public static GammaCodedSequenceReader open(Path path, ColumnDesc name) throws IOException { return new Reader( Storage.reader(path, name, false), // note we must never pass aligned=true here, as the data is not guaranteed alignment - VarintColumn.open(path, name.createDerivative(ColumnFunction.DATA_LEN, + VarintColumn.open(path, name.createSupplementaryColumn(ColumnFunction.DATA_LEN, ColumnType.VARINT_LE, StorageType.PLAIN) ) @@ -28,7 +28,7 @@ public class GammaCodedSequenceColumn { public static GammaCodedSequenceWriter create(Path path, ColumnDesc name) throws IOException { return new Writer( Storage.writer(path, name), - VarintColumn.create(path, name.createDerivative(ColumnFunction.DATA_LEN, + VarintColumn.create(path, name.createSupplementaryColumn(ColumnFunction.DATA_LEN, ColumnType.VARINT_LE, StorageType.PLAIN) ) @@ -57,6 +57,10 @@ public class GammaCodedSequenceColumn { storage.putBytes(buffer); } + public long position() { + return indexWriter.position(); + } + public void close() throws IOException { indexWriter.close(); storage.close(); diff --git a/code/libraries/slop/java/nu/marginalia/slop/column/dynamic/VarintColumn.java b/code/libraries/slop/java/nu/marginalia/slop/column/dynamic/VarintColumn.java index c0236028..aee6409b 100644 --- a/code/libraries/slop/java/nu/marginalia/slop/column/dynamic/VarintColumn.java +++ b/code/libraries/slop/java/nu/marginalia/slop/column/dynamic/VarintColumn.java @@ -21,12 +21,15 @@ public class VarintColumn { private static class Writer implements VarintColumnWriter { private final StorageWriter writer; + private long position = 0; public Writer(StorageWriter writer) throws IOException { this.writer = writer; } public void put(long value) throws IOException { + position++; + while ((value & ~0x7F) != 0) { writer.putByte((byte) (0x80 | (value & 0x7F))); value >>>= 7; @@ -40,6 +43,10 @@ public class VarintColumn { } } + public long position() { + return position; + } + public void close() throws IOException { writer.close(); } diff --git a/code/libraries/slop/java/nu/marginalia/slop/column/primitive/ByteColumn.java b/code/libraries/slop/java/nu/marginalia/slop/column/primitive/ByteColumn.java index 28c481f0..3bb116f5 100644 --- a/code/libraries/slop/java/nu/marginalia/slop/column/primitive/ByteColumn.java +++ b/code/libraries/slop/java/nu/marginalia/slop/column/primitive/ByteColumn.java @@ -20,6 +20,7 @@ public class ByteColumn { private static class Writer implements ByteColumnWriter { private final StorageWriter storage; + private long position = 0; public Writer(StorageWriter storageWriter) throws IOException { this.storage = storageWriter; @@ -27,6 +28,11 @@ public class ByteColumn { public void put(byte value) throws IOException { storage.putByte(value); + position++; + } + + public long position() { + return position; } public void close() throws IOException { diff --git a/code/libraries/slop/java/nu/marginalia/slop/column/primitive/CharColumn.java b/code/libraries/slop/java/nu/marginalia/slop/column/primitive/CharColumn.java index f46fd783..a200e5b4 100644 --- a/code/libraries/slop/java/nu/marginalia/slop/column/primitive/CharColumn.java +++ b/code/libraries/slop/java/nu/marginalia/slop/column/primitive/CharColumn.java @@ -20,6 +20,7 @@ public class CharColumn { private static class Writer implements CharColumnWriter { private final StorageWriter storage; + private long position = 0; public Writer(StorageWriter storageWriter) throws IOException { this.storage = storageWriter; @@ -27,6 +28,11 @@ public class CharColumn { public void put(char value) throws IOException { storage.putChar(value); + position++; + } + + public long position() { + return position / Character.BYTES; } public void close() throws IOException { diff --git a/code/libraries/slop/java/nu/marginalia/slop/column/primitive/DoubleColumn.java b/code/libraries/slop/java/nu/marginalia/slop/column/primitive/DoubleColumn.java index 3faeaf09..1389e1c7 100644 --- a/code/libraries/slop/java/nu/marginalia/slop/column/primitive/DoubleColumn.java +++ b/code/libraries/slop/java/nu/marginalia/slop/column/primitive/DoubleColumn.java @@ -20,6 +20,7 @@ public class DoubleColumn { private static class Writer implements DoubleColumnWriter { private final StorageWriter storage; + private long position = 0; public Writer(StorageWriter storageWriter) throws IOException { this.storage = storageWriter; @@ -27,6 +28,11 @@ public class DoubleColumn { public void put(double value) throws IOException { storage.putDouble(value); + position++; + } + + public long position() { + return position / Double.BYTES; } public void close() throws IOException { diff --git a/code/libraries/slop/java/nu/marginalia/slop/column/primitive/FloatColumn.java b/code/libraries/slop/java/nu/marginalia/slop/column/primitive/FloatColumn.java index 7a18f752..fa5351d9 100644 --- a/code/libraries/slop/java/nu/marginalia/slop/column/primitive/FloatColumn.java +++ b/code/libraries/slop/java/nu/marginalia/slop/column/primitive/FloatColumn.java @@ -21,6 +21,7 @@ public class FloatColumn { private static class Writer implements FloatColumnWriter { private final StorageWriter storage; + private long position = 0; public Writer(StorageWriter storageWriter) throws IOException { this.storage = storageWriter; @@ -28,6 +29,11 @@ public class FloatColumn { public void put(float value) throws IOException { storage.putFloat(value); + position++; + } + + public long position() { + return position; } public void close() throws IOException { @@ -48,7 +54,7 @@ public class FloatColumn { @Override public long position() throws IOException { - return storage.position(); + return storage.position() / Float.BYTES; } @Override diff --git a/code/libraries/slop/java/nu/marginalia/slop/column/primitive/IntColumn.java b/code/libraries/slop/java/nu/marginalia/slop/column/primitive/IntColumn.java index 4920c978..97a446db 100644 --- a/code/libraries/slop/java/nu/marginalia/slop/column/primitive/IntColumn.java +++ b/code/libraries/slop/java/nu/marginalia/slop/column/primitive/IntColumn.java @@ -20,6 +20,7 @@ public class IntColumn { private static class Writer implements IntColumnWriter { private final StorageWriter storage; + private long position = 0; public Writer(StorageWriter storageWriter) throws IOException { this.storage = storageWriter; @@ -29,10 +30,16 @@ public class IntColumn { for (int value : values) { storage.putInt(value); } + position+=values.length; } public void put(int value) throws IOException { storage.putInt(value); + position++; + } + + public long position() { + return position / Integer.BYTES; } public void close() throws IOException { @@ -53,7 +60,7 @@ public class IntColumn { @Override public long position() throws IOException { - return storage.position(); + return storage.position() / Integer.BYTES; } @Override diff --git a/code/libraries/slop/java/nu/marginalia/slop/column/primitive/LongColumn.java b/code/libraries/slop/java/nu/marginalia/slop/column/primitive/LongColumn.java index e2eac930..ac1e72f7 100644 --- a/code/libraries/slop/java/nu/marginalia/slop/column/primitive/LongColumn.java +++ b/code/libraries/slop/java/nu/marginalia/slop/column/primitive/LongColumn.java @@ -20,6 +20,7 @@ public class LongColumn { private static class Writer implements LongColumnWriter { private final StorageWriter storage; + private long position = 0; public Writer(StorageWriter storageWriter) { this.storage = storageWriter; @@ -27,6 +28,11 @@ public class LongColumn { public void put(long value) throws IOException { storage.putLong(value); + position++; + } + + public long position() { + return position; } public void close() throws IOException { @@ -47,7 +53,7 @@ public class LongColumn { @Override public long position() throws IOException { - return storage.position(); + return storage.position() / Long.BYTES; } @Override diff --git a/code/libraries/slop/java/nu/marginalia/slop/column/string/EnumColumn.java b/code/libraries/slop/java/nu/marginalia/slop/column/string/EnumColumn.java index 0a4f2845..c8383a7e 100644 --- a/code/libraries/slop/java/nu/marginalia/slop/column/string/EnumColumn.java +++ b/code/libraries/slop/java/nu/marginalia/slop/column/string/EnumColumn.java @@ -19,13 +19,13 @@ public class EnumColumn { public static StringColumnReader open(Path path, ColumnDesc name) throws IOException { return new Reader( StringColumn.open(path, - name.createDerivative( + name.createSupplementaryColumn( ColumnFunction.DICT, ColumnType.TXTSTRING, StorageType.PLAIN) ), VarintColumn.open(path, - name.createDerivative( + name.createSupplementaryColumn( ColumnFunction.DATA, ColumnType.ENUM_LE, StorageType.PLAIN @@ -36,8 +36,8 @@ public class EnumColumn { public static StringColumnWriter create(Path path, ColumnDesc name) throws IOException { return new Writer( - StringColumn.create(path, name.createDerivative(ColumnFunction.DICT, ColumnType.TXTSTRING, StorageType.PLAIN)), - VarintColumn.create(path, name.createDerivative(ColumnFunction.DATA, ColumnType.ENUM_LE, StorageType.PLAIN)) + StringColumn.create(path, name.createSupplementaryColumn(ColumnFunction.DICT, ColumnType.TXTSTRING, StorageType.PLAIN)), + VarintColumn.create(path, name.createSupplementaryColumn(ColumnFunction.DATA, ColumnType.ENUM_LE, StorageType.PLAIN)) ); } @@ -64,6 +64,10 @@ public class EnumColumn { dataColumn.put(index); } + public long position() { + return dataColumn.position(); + } + public void close() throws IOException { dataColumn.close(); dicionaryColumn.close(); diff --git a/code/libraries/slop/java/nu/marginalia/slop/column/string/StringColumn.java b/code/libraries/slop/java/nu/marginalia/slop/column/string/StringColumn.java index 14424f71..4daaa308 100644 --- a/code/libraries/slop/java/nu/marginalia/slop/column/string/StringColumn.java +++ b/code/libraries/slop/java/nu/marginalia/slop/column/string/StringColumn.java @@ -51,6 +51,10 @@ public class StringColumn { backingColumn.put(value.getBytes()); } + public long position() { + return backingColumn.position(); + } + public void close() throws IOException { backingColumn.close(); } @@ -92,6 +96,8 @@ public class StringColumn { private static class CStringWriter implements StringColumnWriter { private final StorageWriter storageWriter; + private long position = 0; + public CStringWriter(StorageWriter storageWriter) throws IOException { this.storageWriter = storageWriter; } @@ -100,10 +106,14 @@ public class StringColumn { if (null == value) { value = ""; } - assert value.indexOf('\0') == -1 : "Null byte not allowed in cstring"; storageWriter.putBytes(value.getBytes()); storageWriter.putByte((byte) 0); + position++; + } + + public long position() { + return position; } public void close() throws IOException { @@ -113,6 +123,7 @@ public class StringColumn { private static class CStringReader implements StringColumnReader { private final StorageReader storageReader; + private long position = 0; public CStringReader(StorageReader storageReader) throws IOException { this.storageReader = storageReader; @@ -124,12 +135,13 @@ public class StringColumn { while (storageReader.hasRemaining() && (b = storageReader.getByte()) != 0) { sb.append((char) b); } + position++; return sb.toString(); } @Override public long position() throws IOException { - return storageReader.position(); + return position; } @Override @@ -141,6 +153,7 @@ public class StringColumn { i++; } } + position += positions; } @Override @@ -157,6 +170,7 @@ public class StringColumn { private static class TxtStringWriter implements StringColumnWriter { private final StorageWriter storageWriter; + private long position = 0; public TxtStringWriter(StorageWriter storageWriter) throws IOException { this.storageWriter = storageWriter; @@ -171,6 +185,11 @@ public class StringColumn { storageWriter.putBytes(value.getBytes()); storageWriter.putByte((byte) '\n'); + position++; + } + + public long position() { + return position; } public void close() throws IOException { @@ -180,6 +199,7 @@ public class StringColumn { private static class TxtStringReader implements StringColumnReader { private final StorageReader storageReader; + private long position = 0; public TxtStringReader(StorageReader storageReader) throws IOException { this.storageReader = storageReader; @@ -197,18 +217,21 @@ public class StringColumn { sb.append((char) b); } } + position++; return sb.toString(); } @Override public long position() throws IOException { - return storageReader.position(); + return position; } @Override public void skip(long positions) throws IOException { int i = 0; + position+=positions; + while (i < positions && storageReader.hasRemaining()) { if (storageReader.getByte() == '\n') { i++; diff --git a/code/libraries/slop/java/nu/marginalia/slop/desc/ColumnDesc.java b/code/libraries/slop/java/nu/marginalia/slop/desc/ColumnDesc.java index 93d31a54..e5120fbd 100644 --- a/code/libraries/slop/java/nu/marginalia/slop/desc/ColumnDesc.java +++ b/code/libraries/slop/java/nu/marginalia/slop/desc/ColumnDesc.java @@ -36,20 +36,35 @@ public record ColumnDesc + ColumnDesc createSupplementaryColumn( ColumnFunction function, - ColumnType type, + ColumnType type, StorageType storageType) { - return new ColumnDesc(name, page, function, type, storageType); + return new ColumnDesc<>(name, page, function, type, storageType); } public ByteOrder byteOrder() { @@ -57,7 +72,7 @@ public record ColumnDesc forPage(int page) { - return new ColumnDesc(name, page, function, type, storageType); + return new ColumnDesc<>(name, page, function, type, storageType); } public boolean exists(Path base) { diff --git a/code/libraries/slop/java/nu/marginalia/slop/desc/SlopTable.java b/code/libraries/slop/java/nu/marginalia/slop/desc/SlopTable.java new file mode 100644 index 00000000..3d018eca --- /dev/null +++ b/code/libraries/slop/java/nu/marginalia/slop/desc/SlopTable.java @@ -0,0 +1,87 @@ +package nu.marginalia.slop.desc; + +import nu.marginalia.slop.column.ColumnReader; +import nu.marginalia.slop.column.ColumnWriter; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.*; + +/** SlopTable is a utility class for managing a group of columns that are + * read and written together. It is used to ensure that the reader and writer + * positions are maintained correctly between the columns, and to ensure that + * the columns are closed correctly. + *

+ * To deal with the fact that some columns may not be expected to have the same + * number of rows, SlopTable supports the concept of column groups. Each column + * group is a separate SlopTable instance, and the columns in the group are + * managed together. + *

+ * It is often a good idea to let the reader or writer class for a particular + * table inherit from SlopTable, so that the table is automatically closed when + * the reader or writer is closed. + */ + +public class SlopTable implements AutoCloseable { + private final List readerList = new ArrayList<>(); + private final List writerList = new ArrayList<>(); + + private final Map columnGroups = new HashMap<>(); + + private static final Logger logger = LoggerFactory.getLogger(SlopTable.class); + + /** Create a SlopTable corresponding to a grouping of columns that have their own + * internal consistency check. This is needed e.g. for grouped values. The table is + * closed automatically by the current instance. + */ + public SlopTable columnGroup(String name) { + return columnGroups.computeIfAbsent(name, k -> new SlopTable()); + } + + /** Register a column reader with this table. This is called from ColumnDesc. */ + void register(ColumnReader reader) { + readerList.add(reader); + } + + /** Register a column reader with this table. This is called from ColumnDesc. */ + void register(ColumnWriter writer) { + writerList.add(writer); + } + + public void close() throws IOException { + + Set positions = new HashSet<>(); + + for (ColumnReader reader : readerList) { + positions.add(reader.position()); + reader.close(); + } + for (ColumnWriter writer : writerList) { + positions.add(writer.position()); + writer.close(); + } + + + // Check for the scenario where we have multiple positions + // and one of the positions is zero, indicating that we haven't + // read or written to one of the columns. This is likely a bug, + // but not necessarily a severe one, so we just log a warning. + + if (positions.remove(0L) && !positions.isEmpty()) { + logger.warn("Zero position found in one of the tables, this is likely development debris"); + } + + // If there are more than one position and several are non-zero, then we haven't maintained the + // position correctly between the columns. This is a disaster, so we throw an exception. + if (positions.size() > 1) { + throw new IllegalStateException("Expected only one reader position, was " + positions); + } + + for (var table : columnGroups.values()) { + table.close(); + } + + } + +} diff --git a/code/libraries/slop/test/nu/marginalia/slop/column/StringColumnTest.java b/code/libraries/slop/test/nu/marginalia/slop/column/StringColumnTest.java index 486bc191..800c93eb 100644 --- a/code/libraries/slop/test/nu/marginalia/slop/column/StringColumnTest.java +++ b/code/libraries/slop/test/nu/marginalia/slop/column/StringColumnTest.java @@ -1,9 +1,6 @@ package nu.marginalia.slop.column; -import nu.marginalia.slop.desc.ColumnDesc; -import nu.marginalia.slop.desc.ColumnFunction; -import nu.marginalia.slop.desc.ColumnType; -import nu.marginalia.slop.desc.StorageType; +import nu.marginalia.slop.desc.*; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -61,11 +58,15 @@ class StringColumnTest { ColumnType.STRING, StorageType.GZIP); - try (var column = name.create(tempDir)) { + try (var table = new SlopTable()) { + var column = name.create(table, tempDir); + column.put("Lorem"); column.put("Ipsum"); } - try (var column = name.open(tempDir)) { + try (var table = new SlopTable()) { + var column = name.open(table, tempDir); + assertEquals("Lorem", column.get()); assertEquals("Ipsum", column.get()); assertFalse(column.hasRemaining()); @@ -80,11 +81,13 @@ class StringColumnTest { ColumnType.CSTRING, StorageType.GZIP); - try (var column = name.create(tempDir)) { + try (var table = new SlopTable()) { + var column = name.create(table, tempDir); column.put("Lorem"); column.put("Ipsum"); } - try (var column = name.open(tempDir)) { + try (var table = new SlopTable()) { + var column = name.open(table, tempDir); assertEquals("Lorem", column.get()); assertEquals("Ipsum", column.get()); assertFalse(column.hasRemaining()); @@ -99,11 +102,13 @@ class StringColumnTest { ColumnType.TXTSTRING, StorageType.GZIP); - try (var column = name.create(tempDir)) { + try (var table = new SlopTable()) { + var column = name.create(table, tempDir); column.put("Lorem"); column.put("Ipsum"); } - try (var column = name.open(tempDir)) { + try (var table = new SlopTable()) { + var column = name.open(table, tempDir); assertEquals("Lorem", column.get()); assertEquals("Ipsum", column.get()); assertFalse(column.hasRemaining()); diff --git a/code/processes/converting-process/build.gradle b/code/processes/converting-process/build.gradle index 942c8acd..1dd1edb9 100644 --- a/code/processes/converting-process/build.gradle +++ b/code/processes/converting-process/build.gradle @@ -36,6 +36,7 @@ dependencies { implementation project(':code:common:config') implementation project(':code:libraries:message-queue') implementation project(':code:libraries:blocking-thread-pool') + implementation project(':code:libraries:slop') implementation project(':code:libraries:guarded-regex') implementation project(':code:libraries:easy-lsh') diff --git a/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDocumentRecord.java b/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDocumentRecord.java index 177eaf9a..a654af5d 100644 --- a/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDocumentRecord.java +++ b/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDocumentRecord.java @@ -14,6 +14,7 @@ import nu.marginalia.slop.column.string.StringColumnReader; import nu.marginalia.slop.column.string.StringColumnWriter; import nu.marginalia.slop.desc.ColumnDesc; import nu.marginalia.slop.desc.ColumnType; +import nu.marginalia.slop.desc.SlopTable; import nu.marginalia.slop.desc.StorageType; import java.io.IOException; @@ -119,7 +120,7 @@ public record SlopDocumentRecord( private static final ColumnDesc spanCodesColumn = new ColumnDesc<>("spanCodes", ColumnType.BYTE_ARRAY, StorageType.ZSTD); private static final ColumnDesc spansColumn = new ColumnDesc<>("spans", ColumnType.BYTE_ARRAY_GCS, StorageType.ZSTD); - public static class KeywordsProjectionReader implements AutoCloseable { + public static class KeywordsProjectionReader extends SlopTable { private final StringColumnReader domainsReader; private final VarintColumnReader ordinalsReader; private final IntColumnReader htmlFeaturesReader; @@ -140,17 +141,19 @@ public record SlopDocumentRecord( } public KeywordsProjectionReader(Path baseDir, int page) throws IOException { - domainsReader = domainsColumn.forPage(page).open(baseDir); - ordinalsReader = ordinalsColumn.forPage(page).open(baseDir); - htmlFeaturesReader = htmlFeaturesColumn.forPage(page).open(baseDir); - domainMetadataReader = domainMetadata.forPage(page).open(baseDir); - lengthsReader = lengthsColumn.forPage(page).open(baseDir); - keywordsReader = keywordsColumn.forPage(page).open(baseDir); - termCountsReader = termCountsColumn.forPage(page).open(baseDir); - termMetaReader = termMetaColumn.forPage(page).open(baseDir); - termPositionsReader = termPositionsColumn.forPage(page).open(baseDir); - spanCodesReader = spanCodesColumn.forPage(page).open(baseDir); - spansReader = spansColumn.forPage(page).open(baseDir); + domainsReader = domainsColumn.forPage(page).open(this, baseDir); + ordinalsReader = ordinalsColumn.forPage(page).open(this, baseDir); + htmlFeaturesReader = htmlFeaturesColumn.forPage(page).open(this, baseDir); + domainMetadataReader = domainMetadata.forPage(page).open(this, baseDir); + lengthsReader = lengthsColumn.forPage(page).open(this, baseDir); + termCountsReader = termCountsColumn.forPage(page).open(this, baseDir); + + keywordsReader = keywordsColumn.forPage(page).open(this.columnGroup("keywords"), baseDir); + termMetaReader = termMetaColumn.forPage(page).open(this.columnGroup("keywords"), baseDir); + termPositionsReader = termPositionsColumn.forPage(page).open(this.columnGroup("keywords"), baseDir); + + spanCodesReader = spanCodesColumn.forPage(page).open(this.columnGroup("spans"), baseDir); + spansReader = spansColumn.forPage(page).open(this.columnGroup("spans"), baseDir); } public boolean hasMore() throws IOException { @@ -197,22 +200,9 @@ public record SlopDocumentRecord( ); } - - public void close() throws IOException { - domainsReader.close(); - ordinalsReader.close(); - htmlFeaturesReader.close(); - domainMetadataReader.close(); - lengthsReader.close(); - keywordsReader.close(); - termMetaReader.close(); - termPositionsReader.close(); - spanCodesReader.close(); - spansReader.close(); - } } - public static class MetadataReader implements AutoCloseable { + public static class MetadataReader extends SlopTable { private final StringColumnReader domainsReader; private final StringColumnReader urlsReader; private final VarintColumnReader ordinalsReader; @@ -230,17 +220,17 @@ public record SlopDocumentRecord( } public MetadataReader(Path baseDir, int page) throws IOException { - this.domainsReader = domainsColumn.forPage(page).open(baseDir); - this.urlsReader = urlsColumn.forPage(page).open(baseDir); - this.ordinalsReader = ordinalsColumn.forPage(page).open(baseDir); - this.titlesReader = titlesColumn.forPage(page).open(baseDir); - this.descriptionsReader = descriptionsColumn.forPage(page).open(baseDir); - this.htmlFeaturesReader = htmlFeaturesColumn.forPage(page).open(baseDir); - this.htmlStandardsReader = htmlStandardsColumn.forPage(page).open(baseDir); - this.lengthsReader = lengthsColumn.forPage(page).open(baseDir); - this.hashesReader = hashesColumn.forPage(page).open(baseDir); - this.qualitiesReader = qualitiesColumn.forPage(page).open(baseDir); - this.pubYearReader = pubYearColumn.forPage(page).open(baseDir); + this.domainsReader = domainsColumn.forPage(page).open(this, baseDir); + this.urlsReader = urlsColumn.forPage(page).open(this, baseDir); + this.ordinalsReader = ordinalsColumn.forPage(page).open(this, baseDir); + this.titlesReader = titlesColumn.forPage(page).open(this, baseDir); + this.descriptionsReader = descriptionsColumn.forPage(page).open(this, baseDir); + this.htmlFeaturesReader = htmlFeaturesColumn.forPage(page).open(this, baseDir); + this.htmlStandardsReader = htmlStandardsColumn.forPage(page).open(this, baseDir); + this.lengthsReader = lengthsColumn.forPage(page).open(this, baseDir); + this.hashesReader = hashesColumn.forPage(page).open(this, baseDir); + this.qualitiesReader = qualitiesColumn.forPage(page).open(this, baseDir); + this.pubYearReader = pubYearColumn.forPage(page).open(this, baseDir); } public MetadataProjection next() throws IOException { @@ -264,22 +254,9 @@ public record SlopDocumentRecord( return domainsReader.hasRemaining(); } - public void close() throws IOException { - domainsReader.close(); - urlsReader.close(); - ordinalsReader.close(); - titlesReader.close(); - descriptionsReader.close(); - htmlFeaturesReader.close(); - htmlStandardsReader.close(); - lengthsReader.close(); - hashesReader.close(); - qualitiesReader.close(); - pubYearReader.close(); - } } - public static class Writer implements AutoCloseable { + public static class Writer extends SlopTable { private final StringColumnWriter domainsWriter; private final StringColumnWriter urlsWriter; private final VarintColumnWriter ordinalsWriter; @@ -302,27 +279,28 @@ public record SlopDocumentRecord( private final GammaCodedSequenceWriter spansWriter; public Writer(Path baseDir, int page) throws IOException { - domainsWriter = domainsColumn.forPage(page).create(baseDir); - urlsWriter = urlsColumn.forPage(page).create(baseDir); - ordinalsWriter = ordinalsColumn.forPage(page).create(baseDir); - statesWriter = statesColumn.forPage(page).create(baseDir); - stateReasonsWriter = stateReasonsColumn.forPage(page).create(baseDir); - titlesWriter = titlesColumn.forPage(page).create(baseDir); - descriptionsWriter = descriptionsColumn.forPage(page).create(baseDir); - htmlFeaturesWriter = htmlFeaturesColumn.forPage(page).create(baseDir); - htmlStandardsWriter = htmlStandardsColumn.forPage(page).create(baseDir); - lengthsWriter = lengthsColumn.forPage(page).create(baseDir); - hashesWriter = hashesColumn.forPage(page).create(baseDir); - qualitiesWriter = qualitiesColumn.forPage(page).create(baseDir); - domainMetadataWriter = domainMetadata.forPage(page).create(baseDir); - pubYearWriter = pubYearColumn.forPage(page).create(baseDir); - termCountsWriter = termCountsColumn.forPage(page).create(baseDir); - keywordsWriter = keywordsColumn.forPage(page).create(baseDir); - termMetaWriter = termMetaColumn.forPage(page).create(baseDir); - termPositionsWriter = termPositionsColumn.forPage(page).create(baseDir); + domainsWriter = domainsColumn.forPage(page).create(this, baseDir); + urlsWriter = urlsColumn.forPage(page).create(this, baseDir); + ordinalsWriter = ordinalsColumn.forPage(page).create(this, baseDir); + statesWriter = statesColumn.forPage(page).create(this, baseDir); + stateReasonsWriter = stateReasonsColumn.forPage(page).create(this, baseDir); + titlesWriter = titlesColumn.forPage(page).create(this, baseDir); + descriptionsWriter = descriptionsColumn.forPage(page).create(this, baseDir); + htmlFeaturesWriter = htmlFeaturesColumn.forPage(page).create(this, baseDir); + htmlStandardsWriter = htmlStandardsColumn.forPage(page).create(this, baseDir); + lengthsWriter = lengthsColumn.forPage(page).create(this, baseDir); + hashesWriter = hashesColumn.forPage(page).create(this, baseDir); + qualitiesWriter = qualitiesColumn.forPage(page).create(this, baseDir); + domainMetadataWriter = domainMetadata.forPage(page).create(this, baseDir); + pubYearWriter = pubYearColumn.forPage(page).create(this, baseDir); + termCountsWriter = termCountsColumn.forPage(page).create(this, baseDir); - spansWriter = spansColumn.forPage(page).create(baseDir); - spansCodesWriter = spanCodesColumn.forPage(page).create(baseDir); + keywordsWriter = keywordsColumn.forPage(page).create(this.columnGroup("keywords"), baseDir); + termMetaWriter = termMetaColumn.forPage(page).create(this.columnGroup("keywords"), baseDir); + termPositionsWriter = termPositionsColumn.forPage(page).create(this.columnGroup("keywords"), baseDir); + + spansWriter = spansColumn.forPage(page).create(this.columnGroup("spans"), baseDir); + spansCodesWriter = spanCodesColumn.forPage(page).create(this.columnGroup("spans"), baseDir); } public void write(SlopDocumentRecord record) throws IOException { @@ -367,29 +345,5 @@ public record SlopDocumentRecord( } } - - public void close() throws IOException { - domainsWriter.close(); - urlsWriter.close(); - ordinalsWriter.close(); - statesWriter.close(); - stateReasonsWriter.close(); - titlesWriter.close(); - descriptionsWriter.close(); - htmlFeaturesWriter.close(); - htmlStandardsWriter.close(); - lengthsWriter.close(); - hashesWriter.close(); - qualitiesWriter.close(); - domainMetadataWriter.close(); - pubYearWriter.close(); - termCountsWriter.close(); - keywordsWriter.close(); - termMetaWriter.close(); - termPositionsWriter.close(); - - spansCodesWriter.close(); - spansWriter.close(); - } } } diff --git a/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainLinkRecord.java b/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainLinkRecord.java index d0b3c6d6..7cb3b7df 100644 --- a/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainLinkRecord.java +++ b/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainLinkRecord.java @@ -4,6 +4,7 @@ import nu.marginalia.slop.column.string.StringColumnReader; import nu.marginalia.slop.column.string.StringColumnWriter; import nu.marginalia.slop.desc.ColumnDesc; import nu.marginalia.slop.desc.ColumnType; +import nu.marginalia.slop.desc.SlopTable; import nu.marginalia.slop.desc.StorageType; import java.io.IOException; @@ -21,7 +22,7 @@ public record SlopDomainLinkRecord( return new Reader(baseDir, page); } - public static class Reader implements AutoCloseable { + public static class Reader extends SlopTable { private final StringColumnReader sourcesReader; private final StringColumnReader destsReader; @@ -30,15 +31,8 @@ public record SlopDomainLinkRecord( } public Reader(Path baseDir, int page) throws IOException { - sourcesReader = sourcesColumn.forPage(page).open(baseDir); - destsReader = destsColumn.forPage(page).open(baseDir); - } - - - @Override - public void close() throws IOException { - sourcesReader.close(); - destsReader.close(); + sourcesReader = sourcesColumn.forPage(page).open(this, baseDir); + destsReader = destsColumn.forPage(page).open(this, baseDir); } public boolean hasMore() throws IOException { @@ -60,13 +54,13 @@ public record SlopDomainLinkRecord( } } - public static class Writer implements AutoCloseable { + public static class Writer extends SlopTable { private final StringColumnWriter sourcesWriter; private final StringColumnWriter destsWriter; public Writer(Path baseDir, int page) throws IOException { - sourcesWriter = sourcesColumn.forPage(page).create(baseDir); - destsWriter = destsColumn.forPage(page).create(baseDir); + sourcesWriter = sourcesColumn.forPage(page).create(this, baseDir); + destsWriter = destsColumn.forPage(page).create(this, baseDir); } public void write(SlopDomainLinkRecord record) throws IOException { diff --git a/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainRecord.java b/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainRecord.java index 059a6e81..b1c6533b 100644 --- a/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainRecord.java +++ b/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainRecord.java @@ -6,6 +6,7 @@ import nu.marginalia.slop.column.string.StringColumnReader; import nu.marginalia.slop.column.string.StringColumnWriter; import nu.marginalia.slop.desc.ColumnDesc; import nu.marginalia.slop.desc.ColumnType; +import nu.marginalia.slop.desc.SlopTable; import nu.marginalia.slop.desc.StorageType; import java.io.IOException; @@ -43,7 +44,7 @@ public record SlopDomainRecord( private static final ColumnDesc rssFeedsColumn = new ColumnDesc<>("rssFeeds", ColumnType.TXTSTRING, StorageType.GZIP); - public static class DomainNameReader implements AutoCloseable { + public static class DomainNameReader extends SlopTable { private final StringColumnReader domainsReader; public DomainNameReader(SlopPageRef page) throws IOException { @@ -51,13 +52,7 @@ public record SlopDomainRecord( } public DomainNameReader(Path baseDir, int page) throws IOException { - domainsReader = domainsColumn.forPage(page).open(baseDir); - } - - - @Override - public void close() throws IOException { - domainsReader.close(); + domainsReader = domainsColumn.forPage(page).open(this, baseDir); } public boolean hasMore() throws IOException { @@ -69,7 +64,7 @@ public record SlopDomainRecord( } } - public static class DomainWithIpReader implements AutoCloseable { + public static class DomainWithIpReader extends SlopTable { private final StringColumnReader domainsReader; private final StringColumnReader ipReader; @@ -78,15 +73,8 @@ public record SlopDomainRecord( } public DomainWithIpReader(Path baseDir, int page) throws IOException { - domainsReader = domainsColumn.forPage(page).open(baseDir); - ipReader = ipColumn.forPage(page).open(baseDir); - } - - - @Override - public void close() throws IOException { - domainsReader.close(); - ipReader.close(); + domainsReader = domainsColumn.forPage(page).open(this, baseDir); + ipReader = ipColumn.forPage(page).open(this, baseDir); } public boolean hasMore() throws IOException { @@ -102,7 +90,7 @@ public record SlopDomainRecord( } } - public static class Reader implements AutoCloseable { + public static class Reader extends SlopTable { private final StringColumnReader domainsReader; private final StringColumnReader statesReader; private final StringColumnReader redirectReader; @@ -120,33 +108,17 @@ public record SlopDomainRecord( } public Reader(Path baseDir, int page) throws IOException { - domainsReader = domainsColumn.forPage(page).open(baseDir); - statesReader = statesColumn.forPage(page).open(baseDir); - redirectReader = redirectDomainsColumn.forPage(page).open(baseDir); - ipReader = ipColumn.forPage(page).open(baseDir); + domainsReader = domainsColumn.forPage(page).open(this, baseDir); + statesReader = statesColumn.forPage(page).open(this, baseDir); + redirectReader = redirectDomainsColumn.forPage(page).open(this, baseDir); + ipReader = ipColumn.forPage(page).open(this, baseDir); - knownUrlsReader = knownUrlsColumn.forPage(page).open(baseDir); - goodUrlsReader = goodUrlsColumn.forPage(page).open(baseDir); - visitedUrlsReader = visitedUrlsColumn.forPage(page).open(baseDir); + knownUrlsReader = knownUrlsColumn.forPage(page).open(this, baseDir); + goodUrlsReader = goodUrlsColumn.forPage(page).open(this, baseDir); + visitedUrlsReader = visitedUrlsColumn.forPage(page).open(this, baseDir); - rssFeedsCountReader = rssFeedsCountColumn.forPage(page).open(baseDir); - rssFeedsReader = rssFeedsColumn.forPage(page).open(baseDir); - } - - - @Override - public void close() throws IOException { - domainsReader.close(); - statesReader.close(); - redirectReader.close(); - ipReader.close(); - - knownUrlsReader.close(); - goodUrlsReader.close(); - visitedUrlsReader.close(); - - rssFeedsCountReader.close(); - rssFeedsReader.close(); + rssFeedsCountReader = rssFeedsCountColumn.forPage(page).open(this, baseDir); + rssFeedsReader = rssFeedsColumn.forPage(page).open(this, baseDir); } public boolean hasMore() throws IOException { @@ -179,7 +151,7 @@ public record SlopDomainRecord( } } - public static class Writer implements AutoCloseable { + public static class Writer extends SlopTable { private final StringColumnWriter domainsWriter; private final StringColumnWriter statesWriter; private final StringColumnWriter redirectWriter; @@ -193,17 +165,17 @@ public record SlopDomainRecord( private final StringColumnWriter rssFeedsWriter; public Writer(Path baseDir, int page) throws IOException { - domainsWriter = domainsColumn.forPage(page).create(baseDir); - statesWriter = statesColumn.forPage(page).create(baseDir); - redirectWriter = redirectDomainsColumn.forPage(page).create(baseDir); - ipWriter = ipColumn.forPage(page).create(baseDir); + domainsWriter = domainsColumn.forPage(page).create(this, baseDir); + statesWriter = statesColumn.forPage(page).create(this, baseDir); + redirectWriter = redirectDomainsColumn.forPage(page).create(this, baseDir); + ipWriter = ipColumn.forPage(page).create(this, baseDir); - knownUrlsWriter = knownUrlsColumn.forPage(page).create(baseDir); - goodUrlsWriter = goodUrlsColumn.forPage(page).create(baseDir); - visitedUrlsWriter = visitedUrlsColumn.forPage(page).create(baseDir); + knownUrlsWriter = knownUrlsColumn.forPage(page).create(this, baseDir); + goodUrlsWriter = goodUrlsColumn.forPage(page).create(this, baseDir); + visitedUrlsWriter = visitedUrlsColumn.forPage(page).create(this, baseDir); - rssFeedsCountWriter = rssFeedsCountColumn.forPage(page).create(baseDir); - rssFeedsWriter = rssFeedsColumn.forPage(page).create(baseDir); + rssFeedsCountWriter = rssFeedsCountColumn.forPage(page).create(this, baseDir); + rssFeedsWriter = rssFeedsColumn.forPage(page).create(this.columnGroup("rss-feeds"), baseDir); } public void write(SlopDomainRecord record) throws IOException { @@ -221,20 +193,5 @@ public record SlopDomainRecord( rssFeedsWriter.put(rssFeed); } } - - @Override - public void close() throws IOException { - domainsWriter.close(); - statesWriter.close(); - redirectWriter.close(); - ipWriter.close(); - - knownUrlsWriter.close(); - goodUrlsWriter.close(); - visitedUrlsWriter.close(); - - rssFeedsCountWriter.close(); - rssFeedsWriter.close(); - } } } diff --git a/code/processes/loading-process/build.gradle b/code/processes/loading-process/build.gradle index 5e49ed30..57bf8eaf 100644 --- a/code/processes/loading-process/build.gradle +++ b/code/processes/loading-process/build.gradle @@ -32,6 +32,7 @@ dependencies { implementation project(':code:libraries:message-queue') implementation project(':code:libraries:language-processing') implementation project(':code:libraries:coded-sequence') + implementation project(':code:libraries:slop') implementation project(':third-party:commons-codec') implementation project(':third-party:parquet-floor') testImplementation project(':code:services-application:search-service')