From dcb43a330808932b25bc3c682467ce414ca81621 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 27 Jul 2024 13:47:47 +0200 Subject: [PATCH] (slop) Introduce table concept to keep track of positions and simplify closing The most common error when dealing with Slop columns is that they can fall out of sync with each other if the programmer accidentally does a conditional read and forgets to skip. The second most common error is forgetting to close one of the columns in a reader or writer. To deal with both cases, a new class SlopTable is added that keeps track of the lifecycle of all slop columns and performs a check when closing them that they are in sync. --- code/index/build.gradle | 1 + .../index/forward/ForwardIndexConverter.java | 22 +-- .../index/journal/IndexJournalPage.java | 41 ++--- .../index/journal/IndexJournalSlopWriter.java | 24 +-- .../full/FullPreindexDocuments.java | 12 +- .../full/FullPreindexWordSegments.java | 4 +- .../prio/PrioPreindexDocuments.java | 10 +- .../prio/PrioPreindexWordSegments.java | 6 +- .../marginalia/slop/column/ColumnReader.java | 2 + .../marginalia/slop/column/ColumnWriter.java | 6 + .../slop/column/array/ByteArrayColumn.java | 11 +- .../slop/column/array/IntArrayColumn.java | 8 +- .../slop/column/array/LongArrayColumn.java | 8 +- .../column/dynamic/CustomBinaryColumn.java | 8 +- .../dynamic/GammaCodedSequenceColumn.java | 8 +- .../slop/column/dynamic/VarintColumn.java | 7 + .../slop/column/primitive/ByteColumn.java | 6 + .../slop/column/primitive/CharColumn.java | 6 + .../slop/column/primitive/DoubleColumn.java | 6 + .../slop/column/primitive/FloatColumn.java | 8 +- .../slop/column/primitive/IntColumn.java | 9 +- .../slop/column/primitive/LongColumn.java | 8 +- .../slop/column/string/EnumColumn.java | 12 +- .../slop/column/string/StringColumn.java | 29 +++- .../nu/marginalia/slop/desc/ColumnDesc.java | 31 +++- .../nu/marginalia/slop/desc/SlopTable.java | 87 +++++++++++ .../slop/column/StringColumnTest.java | 25 +-- .../processes/converting-process/build.gradle | 1 + .../model/processed/SlopDocumentRecord.java | 144 ++++++------------ .../model/processed/SlopDomainLinkRecord.java | 20 +-- .../model/processed/SlopDomainRecord.java | 95 ++++-------- code/processes/loading-process/build.gradle | 1 + 32 files changed, 398 insertions(+), 268 deletions(-) create mode 100644 code/libraries/slop/java/nu/marginalia/slop/desc/SlopTable.java diff --git a/code/index/build.gradle b/code/index/build.gradle index db4dab20..bf50a507 100644 --- a/code/index/build.gradle +++ b/code/index/build.gradle @@ -22,6 +22,7 @@ dependencies { implementation project(':code:libraries:array') implementation project(':code:libraries:btree') + implementation project(':code:libraries:slop') implementation project(':code:libraries:coded-sequence') implementation project(':code:common:db') diff --git a/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexConverter.java b/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexConverter.java index 2edc283f..66f45736 100644 --- a/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexConverter.java +++ b/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexConverter.java @@ -9,6 +9,7 @@ import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.process.control.ProcessHeartbeat; import nu.marginalia.slop.column.primitive.LongColumnReader; +import nu.marginalia.slop.desc.SlopTable; import org.roaringbitmap.longlong.LongConsumer; import org.roaringbitmap.longlong.Roaring64Bitmap; import org.slf4j.Logger; @@ -80,16 +81,15 @@ public class ForwardIndexConverter { ByteBuffer workArea = ByteBuffer.allocate(65536); for (var instance : journal.pages()) { - try (var docIdReader = instance.openCombinedId(); - var metaReader = instance.openDocumentMeta(); - var featuresReader = instance.openFeatures(); - var sizeReader = instance.openSize(); - - var spansCodesReader = instance.openSpanCodes(); - var spansSeqReader = instance.openSpans(); - var spansWriter = new ForwardIndexSpansWriter(outputFileSpansData) - ) + try (var slopTable = new SlopTable(); var spansWriter = new ForwardIndexSpansWriter(outputFileSpansData)) { + var docIdReader = instance.openCombinedId(slopTable); + var metaReader = instance.openDocumentMeta(slopTable); + var featuresReader = instance.openFeatures(slopTable); + var sizeReader = instance.openSize(slopTable); + var spansCodesReader = instance.openSpanCodes(slopTable); + var spansSeqReader = instance.openSpans(slopTable); + while (docIdReader.hasRemaining()) { long docId = docIdReader.get(); int domainId = UrlIdCodec.getDomainId(docId); @@ -148,7 +148,9 @@ public class ForwardIndexConverter { Roaring64Bitmap rbm = new Roaring64Bitmap(); for (var instance : journalReader.pages()) { - try (LongColumnReader idReader = instance.openCombinedId()) { + try (var slopTable = new SlopTable()) { + LongColumnReader idReader = instance.openCombinedId(slopTable); + while (idReader.hasRemaining()) { rbm.add(idReader.get()); } diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/IndexJournalPage.java b/code/index/index-journal/java/nu/marginalia/index/journal/IndexJournalPage.java index 8b8d7c2e..ee5c1be7 100644 --- a/code/index/index-journal/java/nu/marginalia/index/journal/IndexJournalPage.java +++ b/code/index/index-journal/java/nu/marginalia/index/journal/IndexJournalPage.java @@ -9,6 +9,7 @@ import nu.marginalia.slop.column.dynamic.VarintColumnWriter; import nu.marginalia.slop.column.primitive.*; import nu.marginalia.slop.desc.ColumnDesc; import nu.marginalia.slop.desc.ColumnType; +import nu.marginalia.slop.desc.SlopTable; import nu.marginalia.slop.desc.StorageType; import java.io.IOException; @@ -34,43 +35,43 @@ public record IndexJournalPage(Path baseDir, int page) { } } - public LongColumnReader openCombinedId() throws IOException { - return combinedId.forPage(page).open(baseDir); + public LongColumnReader openCombinedId(SlopTable table) throws IOException { + return combinedId.forPage(page).open(table, baseDir); } - public LongColumnReader openDocumentMeta() throws IOException { - return documentMeta.forPage(page).open(baseDir); + public LongColumnReader openDocumentMeta(SlopTable table) throws IOException { + return documentMeta.forPage(page).open(table, baseDir); } - public IntColumnReader openFeatures() throws IOException { - return features.forPage(page).open(baseDir); + public IntColumnReader openFeatures(SlopTable table) throws IOException { + return features.forPage(page).open(table, baseDir); } - public IntColumnReader openSize() throws IOException { - return size.forPage(page).open(baseDir); + public IntColumnReader openSize(SlopTable table) throws IOException { + return size.forPage(page).open(table, baseDir); } - public LongColumnReader openTermCounts() throws IOException { - return termCounts.forPage(page).open(baseDir); + public LongColumnReader openTermCounts(SlopTable table) throws IOException { + return termCounts.forPage(page).open(table, baseDir); } - public LongColumnReader openTermIds() throws IOException { - return termIds.forPage(page).open(baseDir); + public LongColumnReader openTermIds(SlopTable table) throws IOException { + return termIds.forPage(page).open(table.columnGroup("keywords"), baseDir); } - public ByteColumnReader openTermMetadata() throws IOException { - return termMeta.forPage(page).open(baseDir); + public ByteColumnReader openTermMetadata(SlopTable table) throws IOException { + return termMeta.forPage(page).open(table.columnGroup("keywords"), baseDir); } - public GammaCodedSequenceReader openTermPositions() throws IOException { - return positions.forPage(page).open(baseDir); + public GammaCodedSequenceReader openTermPositions(SlopTable table) throws IOException { + return positions.forPage(page).open(table.columnGroup("keywords"), baseDir); } - public GammaCodedSequenceReader openSpans() throws IOException { - return spans.forPage(page).open(baseDir); + public GammaCodedSequenceReader openSpans(SlopTable table) throws IOException { + return spans.forPage(page).open(table.columnGroup("spans"), baseDir); } - public ByteArrayColumnReader openSpanCodes() throws IOException { - return spanCodes.forPage(page).open(baseDir); + public ByteArrayColumnReader openSpanCodes(SlopTable table) throws IOException { + return spanCodes.forPage(page).open(table.columnGroup("spans"), baseDir); } } diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/IndexJournalSlopWriter.java b/code/index/index-journal/java/nu/marginalia/index/journal/IndexJournalSlopWriter.java index 10e4edd6..492fd605 100644 --- a/code/index/index-journal/java/nu/marginalia/index/journal/IndexJournalSlopWriter.java +++ b/code/index/index-journal/java/nu/marginalia/index/journal/IndexJournalSlopWriter.java @@ -10,13 +10,14 @@ import nu.marginalia.slop.column.dynamic.GammaCodedSequenceWriter; import nu.marginalia.slop.column.primitive.ByteColumnWriter; import nu.marginalia.slop.column.primitive.IntColumnWriter; import nu.marginalia.slop.column.primitive.LongColumnWriter; +import nu.marginalia.slop.desc.SlopTable; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.util.List; -public class IndexJournalSlopWriter implements AutoCloseable { +public class IndexJournalSlopWriter extends SlopTable { private final IntColumnWriter featuresWriter; private final IntColumnWriter sizeWriter; @@ -39,19 +40,20 @@ public class IndexJournalSlopWriter implements AutoCloseable { } - featuresWriter = IndexJournalPage.features.forPage(page).create(dir); - sizeWriter = IndexJournalPage.size.forPage(page).create(dir); + featuresWriter = IndexJournalPage.features.forPage(page).create(this, dir); + sizeWriter = IndexJournalPage.size.forPage(page).create(this, dir); - combinedIdWriter = IndexJournalPage.combinedId.forPage(page).create(dir); - documentMetaWriter = IndexJournalPage.documentMeta.forPage(page).create(dir); + combinedIdWriter = IndexJournalPage.combinedId.forPage(page).create(this, dir); + documentMetaWriter = IndexJournalPage.documentMeta.forPage(page).create(this, dir); - termCountsWriter = IndexJournalPage.termCounts.forPage(page).create(dir); - termIdsWriter = IndexJournalPage.termIds.forPage(page).create(dir); - termMetadataWriter = IndexJournalPage.termMeta.forPage(page).create(dir); - termPositionsWriter = IndexJournalPage.positions.forPage(page).create(dir); + termCountsWriter = IndexJournalPage.termCounts.forPage(page).create(this, dir); - spansWriter = IndexJournalPage.spans.forPage(page).create(dir); - spanCodesWriter = IndexJournalPage.spanCodes.forPage(page).create(dir); + termIdsWriter = IndexJournalPage.termIds.forPage(page).create(this.columnGroup("keywords"), dir); + termMetadataWriter = IndexJournalPage.termMeta.forPage(page).create(this.columnGroup("keywords"), dir); + termPositionsWriter = IndexJournalPage.positions.forPage(page).create(this.columnGroup("keywords"), dir); + + spansWriter = IndexJournalPage.spans.forPage(page).create(this.columnGroup("spans"), dir); + spanCodesWriter = IndexJournalPage.spanCodes.forPage(page).create(this.columnGroup("spans"), dir); } @SneakyThrows diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexDocuments.java b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexDocuments.java index 9cadeb41..7418f92a 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexDocuments.java +++ b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexDocuments.java @@ -7,6 +7,7 @@ import nu.marginalia.index.construction.DocIdRewriter; import nu.marginalia.index.construction.PositionsFileConstructor; import nu.marginalia.index.journal.IndexJournalPage; import nu.marginalia.rwf.RandomFileAssembler; +import nu.marginalia.slop.desc.SlopTable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -78,12 +79,13 @@ public class FullPreindexDocuments { final ByteBuffer tempBuffer = ByteBuffer.allocate(65536); try (var assembly = RandomFileAssembler.create(workDir, fileSizeLongs); - var docIds = journalInstance.openCombinedId(); - var termCounts = journalInstance.openTermCounts(); - var termIds = journalInstance.openTermIds(); - var termMeta = journalInstance.openTermMetadata(); - var positions = journalInstance.openTermPositions()) + var slopTable = new SlopTable()) { + var docIds = journalInstance.openCombinedId(slopTable); + var termCounts = journalInstance.openTermCounts(slopTable); + var termIds = journalInstance.openTermIds(slopTable); + var termMeta = journalInstance.openTermMetadata(slopTable); + var positions = journalInstance.openTermPositions(slopTable); var offsetMap = segments.asMap(RECORD_SIZE_LONGS); offsetMap.defaultReturnValue(0); diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexWordSegments.java b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexWordSegments.java index 120b1326..51987a36 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexWordSegments.java +++ b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexWordSegments.java @@ -6,6 +6,7 @@ import it.unimi.dsi.fastutil.longs.LongIterator; import nu.marginalia.array.LongArray; import nu.marginalia.array.LongArrayFactory; import nu.marginalia.index.journal.IndexJournalPage; +import nu.marginalia.slop.desc.SlopTable; import java.io.IOException; import java.nio.file.Files; @@ -59,7 +60,8 @@ public class FullPreindexWordSegments { Long2IntOpenHashMap countsMap = new Long2IntOpenHashMap(100_000, 0.75f); countsMap.defaultReturnValue(0); - try (var termIds = journalInstance.openTermIds()) { + try (var slopTable = new SlopTable()) { + var termIds = journalInstance.openTermIds(slopTable); while (termIds.hasRemaining()) { countsMap.addTo(termIds.get(), 1); } diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexDocuments.java b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexDocuments.java index bdda5a4f..e5ab2409 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexDocuments.java +++ b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexDocuments.java @@ -6,6 +6,7 @@ import nu.marginalia.array.LongArrayFactory; import nu.marginalia.index.construction.DocIdRewriter; import nu.marginalia.index.journal.IndexJournalPage; import nu.marginalia.rwf.RandomFileAssembler; +import nu.marginalia.slop.desc.SlopTable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -67,11 +68,12 @@ public class PrioPreindexDocuments { long fileSizeLongs = RECORD_SIZE_LONGS * segments.totalSize(); try (var assembly = RandomFileAssembler.create(workDir, fileSizeLongs); - var docIds = journalInstance.openCombinedId(); - var termIdsCounts = journalInstance.openTermCounts(); - var termIds = journalInstance.openTermIds(); - var termMeta = journalInstance.openTermMetadata()) + var slopTable = new SlopTable()) { + var docIds = journalInstance.openCombinedId(slopTable); + var termIdsCounts = journalInstance.openTermCounts(slopTable); + var termIds = journalInstance.openTermIds(slopTable); + var termMeta = journalInstance.openTermMetadata(slopTable); var offsetMap = segments.asMap(RECORD_SIZE_LONGS); offsetMap.defaultReturnValue(0); diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexWordSegments.java b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexWordSegments.java index c2fe2e96..a30d8a5f 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexWordSegments.java +++ b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexWordSegments.java @@ -6,6 +6,7 @@ import it.unimi.dsi.fastutil.longs.LongIterator; import nu.marginalia.array.LongArray; import nu.marginalia.array.LongArrayFactory; import nu.marginalia.index.journal.IndexJournalPage; +import nu.marginalia.slop.desc.SlopTable; import java.io.IOException; import java.nio.file.Files; @@ -59,8 +60,9 @@ public class PrioPreindexWordSegments { Long2IntOpenHashMap countsMap = new Long2IntOpenHashMap(100_000, 0.75f); countsMap.defaultReturnValue(0); - try (var termIds = journalInstance.openTermIds(); - var termMetas = journalInstance.openTermMetadata()) { + try (var slopTable = new SlopTable()) { + var termIds = journalInstance.openTermIds(slopTable); + var termMetas = journalInstance.openTermMetadata(slopTable); while (termIds.hasRemaining()) { long data = termIds.get(); diff --git a/code/libraries/slop/java/nu/marginalia/slop/column/ColumnReader.java b/code/libraries/slop/java/nu/marginalia/slop/column/ColumnReader.java index 89a87740..644ee788 100644 --- a/code/libraries/slop/java/nu/marginalia/slop/column/ColumnReader.java +++ b/code/libraries/slop/java/nu/marginalia/slop/column/ColumnReader.java @@ -11,4 +11,6 @@ public interface ColumnReader { } boolean hasRemaining() throws IOException; + + void close() throws IOException; } diff --git a/code/libraries/slop/java/nu/marginalia/slop/column/ColumnWriter.java b/code/libraries/slop/java/nu/marginalia/slop/column/ColumnWriter.java index 00e06ae2..661a4021 100644 --- a/code/libraries/slop/java/nu/marginalia/slop/column/ColumnWriter.java +++ b/code/libraries/slop/java/nu/marginalia/slop/column/ColumnWriter.java @@ -1,4 +1,10 @@ package nu.marginalia.slop.column; +import java.io.IOException; + public interface ColumnWriter { + /** Return the current record index in the column */ + long position(); + + void close() throws IOException; } diff --git a/code/libraries/slop/java/nu/marginalia/slop/column/array/ByteArrayColumn.java b/code/libraries/slop/java/nu/marginalia/slop/column/array/ByteArrayColumn.java index 24165be4..f641de3f 100644 --- a/code/libraries/slop/java/nu/marginalia/slop/column/array/ByteArrayColumn.java +++ b/code/libraries/slop/java/nu/marginalia/slop/column/array/ByteArrayColumn.java @@ -19,7 +19,7 @@ public class ByteArrayColumn { return new Reader( Storage.reader(path, name, true), VarintColumn.open(path, - name.createDerivative(name.function().lengthsTable(), + name.createSupplementaryColumn(name.function().lengthsTable(), ColumnType.VARINT_LE, StorageType.PLAIN) ) @@ -30,7 +30,7 @@ public class ByteArrayColumn { return new Writer( Storage.writer(path, name), VarintColumn.create(path, - name.createDerivative(name.function().lengthsTable(), + name.createSupplementaryColumn(name.function().lengthsTable(), ColumnType.VARINT_LE, StorageType.PLAIN) ) @@ -41,16 +41,23 @@ public class ByteArrayColumn { private final StorageWriter storage; private final VarintColumnWriter lengthsWriter; + private long position = 0; + public Writer(StorageWriter storage, VarintColumnWriter lengthsWriter) throws IOException { this.storage = storage; this.lengthsWriter = lengthsWriter; } public void put(byte[] value) throws IOException { + position ++; storage.putBytes(value); lengthsWriter.put(value.length); } + public long position() { + return position; + } + public void close() throws IOException { storage.close(); lengthsWriter.close(); diff --git a/code/libraries/slop/java/nu/marginalia/slop/column/array/IntArrayColumn.java b/code/libraries/slop/java/nu/marginalia/slop/column/array/IntArrayColumn.java index 4aeb1fcf..c5a1421c 100644 --- a/code/libraries/slop/java/nu/marginalia/slop/column/array/IntArrayColumn.java +++ b/code/libraries/slop/java/nu/marginalia/slop/column/array/IntArrayColumn.java @@ -17,7 +17,7 @@ public class IntArrayColumn { public static IntArrayColumnReader open(Path path, ColumnDesc name) throws IOException { return new Reader(Storage.reader(path, name, true), - VarintColumn.open(path, name.createDerivative(name.function().lengthsTable(), + VarintColumn.open(path, name.createSupplementaryColumn(name.function().lengthsTable(), ColumnType.VARINT_LE, StorageType.PLAIN) ) @@ -26,7 +26,7 @@ public class IntArrayColumn { public static IntArrayColumnWriter create(Path path, ColumnDesc name) throws IOException { return new Writer(Storage.writer(path, name), - VarintColumn.create(path, name.createDerivative(name.function().lengthsTable(), + VarintColumn.create(path, name.createSupplementaryColumn(name.function().lengthsTable(), ColumnType.VARINT_LE, StorageType.PLAIN) ) @@ -47,6 +47,10 @@ public class IntArrayColumn { lengthsWriter.put(value.length); } + public long position() { + return lengthsWriter.position(); + } + public void close() throws IOException { storage.close(); lengthsWriter.close(); diff --git a/code/libraries/slop/java/nu/marginalia/slop/column/array/LongArrayColumn.java b/code/libraries/slop/java/nu/marginalia/slop/column/array/LongArrayColumn.java index abe96f6e..b805a085 100644 --- a/code/libraries/slop/java/nu/marginalia/slop/column/array/LongArrayColumn.java +++ b/code/libraries/slop/java/nu/marginalia/slop/column/array/LongArrayColumn.java @@ -17,7 +17,7 @@ public class LongArrayColumn { public static LongArrayColumnReader open(Path path, ColumnDesc name) throws IOException { return new LongArrayColumn.Reader(Storage.reader(path, name, true), - VarintColumn.open(path, name.createDerivative(name.function().lengthsTable(), + VarintColumn.open(path, name.createSupplementaryColumn(name.function().lengthsTable(), ColumnType.VARINT_LE, StorageType.PLAIN) ) @@ -26,7 +26,7 @@ public class LongArrayColumn { public static LongArrayColumnWriter create(Path path, ColumnDesc name) throws IOException { return new LongArrayColumn.Writer(Storage.writer(path, name), - VarintColumn.create(path, name.createDerivative(name.function().lengthsTable(), + VarintColumn.create(path, name.createSupplementaryColumn(name.function().lengthsTable(), ColumnType.VARINT_LE, StorageType.PLAIN) ) @@ -47,6 +47,10 @@ public class LongArrayColumn { lengthsWriter.put(value.length); } + public long position() { + return lengthsWriter.position(); + } + public void close() throws IOException { storage.close(); lengthsWriter.close(); diff --git a/code/libraries/slop/java/nu/marginalia/slop/column/dynamic/CustomBinaryColumn.java b/code/libraries/slop/java/nu/marginalia/slop/column/dynamic/CustomBinaryColumn.java index 1bc8d350..910a02a2 100644 --- a/code/libraries/slop/java/nu/marginalia/slop/column/dynamic/CustomBinaryColumn.java +++ b/code/libraries/slop/java/nu/marginalia/slop/column/dynamic/CustomBinaryColumn.java @@ -16,7 +16,7 @@ public class CustomBinaryColumn { public static CustomBinaryColumnReader open(Path path, ColumnDesc name) throws IOException { return new Reader( Storage.reader(path, name, false), // note we must never pass aligned=true here, as the data is not guaranteed alignment - VarintColumn.open(path, name.createDerivative(ColumnFunction.DATA_LEN, + VarintColumn.open(path, name.createSupplementaryColumn(ColumnFunction.DATA_LEN, ColumnType.VARINT_LE, StorageType.PLAIN) ) @@ -26,7 +26,7 @@ public class CustomBinaryColumn { public static CustomBinaryColumnWriter create(Path path, ColumnDesc name) throws IOException { return new Writer( Storage.writer(path, name), - VarintColumn.create(path, name.createDerivative(ColumnFunction.DATA_LEN, + VarintColumn.create(path, name.createSupplementaryColumn(ColumnFunction.DATA_LEN, ColumnType.VARINT_LE, StorageType.PLAIN) ) @@ -62,6 +62,10 @@ public class CustomBinaryColumn { }; } + public long position() { + return indexWriter.position(); + } + public void close() throws IOException { indexWriter.close(); storage.close(); diff --git a/code/libraries/slop/java/nu/marginalia/slop/column/dynamic/GammaCodedSequenceColumn.java b/code/libraries/slop/java/nu/marginalia/slop/column/dynamic/GammaCodedSequenceColumn.java index 55e19f80..cead27b6 100644 --- a/code/libraries/slop/java/nu/marginalia/slop/column/dynamic/GammaCodedSequenceColumn.java +++ b/code/libraries/slop/java/nu/marginalia/slop/column/dynamic/GammaCodedSequenceColumn.java @@ -18,7 +18,7 @@ public class GammaCodedSequenceColumn { public static GammaCodedSequenceReader open(Path path, ColumnDesc name) throws IOException { return new Reader( Storage.reader(path, name, false), // note we must never pass aligned=true here, as the data is not guaranteed alignment - VarintColumn.open(path, name.createDerivative(ColumnFunction.DATA_LEN, + VarintColumn.open(path, name.createSupplementaryColumn(ColumnFunction.DATA_LEN, ColumnType.VARINT_LE, StorageType.PLAIN) ) @@ -28,7 +28,7 @@ public class GammaCodedSequenceColumn { public static GammaCodedSequenceWriter create(Path path, ColumnDesc name) throws IOException { return new Writer( Storage.writer(path, name), - VarintColumn.create(path, name.createDerivative(ColumnFunction.DATA_LEN, + VarintColumn.create(path, name.createSupplementaryColumn(ColumnFunction.DATA_LEN, ColumnType.VARINT_LE, StorageType.PLAIN) ) @@ -57,6 +57,10 @@ public class GammaCodedSequenceColumn { storage.putBytes(buffer); } + public long position() { + return indexWriter.position(); + } + public void close() throws IOException { indexWriter.close(); storage.close(); diff --git a/code/libraries/slop/java/nu/marginalia/slop/column/dynamic/VarintColumn.java b/code/libraries/slop/java/nu/marginalia/slop/column/dynamic/VarintColumn.java index c0236028..aee6409b 100644 --- a/code/libraries/slop/java/nu/marginalia/slop/column/dynamic/VarintColumn.java +++ b/code/libraries/slop/java/nu/marginalia/slop/column/dynamic/VarintColumn.java @@ -21,12 +21,15 @@ public class VarintColumn { private static class Writer implements VarintColumnWriter { private final StorageWriter writer; + private long position = 0; public Writer(StorageWriter writer) throws IOException { this.writer = writer; } public void put(long value) throws IOException { + position++; + while ((value & ~0x7F) != 0) { writer.putByte((byte) (0x80 | (value & 0x7F))); value >>>= 7; @@ -40,6 +43,10 @@ public class VarintColumn { } } + public long position() { + return position; + } + public void close() throws IOException { writer.close(); } diff --git a/code/libraries/slop/java/nu/marginalia/slop/column/primitive/ByteColumn.java b/code/libraries/slop/java/nu/marginalia/slop/column/primitive/ByteColumn.java index 28c481f0..3bb116f5 100644 --- a/code/libraries/slop/java/nu/marginalia/slop/column/primitive/ByteColumn.java +++ b/code/libraries/slop/java/nu/marginalia/slop/column/primitive/ByteColumn.java @@ -20,6 +20,7 @@ public class ByteColumn { private static class Writer implements ByteColumnWriter { private final StorageWriter storage; + private long position = 0; public Writer(StorageWriter storageWriter) throws IOException { this.storage = storageWriter; @@ -27,6 +28,11 @@ public class ByteColumn { public void put(byte value) throws IOException { storage.putByte(value); + position++; + } + + public long position() { + return position; } public void close() throws IOException { diff --git a/code/libraries/slop/java/nu/marginalia/slop/column/primitive/CharColumn.java b/code/libraries/slop/java/nu/marginalia/slop/column/primitive/CharColumn.java index f46fd783..a200e5b4 100644 --- a/code/libraries/slop/java/nu/marginalia/slop/column/primitive/CharColumn.java +++ b/code/libraries/slop/java/nu/marginalia/slop/column/primitive/CharColumn.java @@ -20,6 +20,7 @@ public class CharColumn { private static class Writer implements CharColumnWriter { private final StorageWriter storage; + private long position = 0; public Writer(StorageWriter storageWriter) throws IOException { this.storage = storageWriter; @@ -27,6 +28,11 @@ public class CharColumn { public void put(char value) throws IOException { storage.putChar(value); + position++; + } + + public long position() { + return position / Character.BYTES; } public void close() throws IOException { diff --git a/code/libraries/slop/java/nu/marginalia/slop/column/primitive/DoubleColumn.java b/code/libraries/slop/java/nu/marginalia/slop/column/primitive/DoubleColumn.java index 3faeaf09..1389e1c7 100644 --- a/code/libraries/slop/java/nu/marginalia/slop/column/primitive/DoubleColumn.java +++ b/code/libraries/slop/java/nu/marginalia/slop/column/primitive/DoubleColumn.java @@ -20,6 +20,7 @@ public class DoubleColumn { private static class Writer implements DoubleColumnWriter { private final StorageWriter storage; + private long position = 0; public Writer(StorageWriter storageWriter) throws IOException { this.storage = storageWriter; @@ -27,6 +28,11 @@ public class DoubleColumn { public void put(double value) throws IOException { storage.putDouble(value); + position++; + } + + public long position() { + return position / Double.BYTES; } public void close() throws IOException { diff --git a/code/libraries/slop/java/nu/marginalia/slop/column/primitive/FloatColumn.java b/code/libraries/slop/java/nu/marginalia/slop/column/primitive/FloatColumn.java index 7a18f752..fa5351d9 100644 --- a/code/libraries/slop/java/nu/marginalia/slop/column/primitive/FloatColumn.java +++ b/code/libraries/slop/java/nu/marginalia/slop/column/primitive/FloatColumn.java @@ -21,6 +21,7 @@ public class FloatColumn { private static class Writer implements FloatColumnWriter { private final StorageWriter storage; + private long position = 0; public Writer(StorageWriter storageWriter) throws IOException { this.storage = storageWriter; @@ -28,6 +29,11 @@ public class FloatColumn { public void put(float value) throws IOException { storage.putFloat(value); + position++; + } + + public long position() { + return position; } public void close() throws IOException { @@ -48,7 +54,7 @@ public class FloatColumn { @Override public long position() throws IOException { - return storage.position(); + return storage.position() / Float.BYTES; } @Override diff --git a/code/libraries/slop/java/nu/marginalia/slop/column/primitive/IntColumn.java b/code/libraries/slop/java/nu/marginalia/slop/column/primitive/IntColumn.java index 4920c978..97a446db 100644 --- a/code/libraries/slop/java/nu/marginalia/slop/column/primitive/IntColumn.java +++ b/code/libraries/slop/java/nu/marginalia/slop/column/primitive/IntColumn.java @@ -20,6 +20,7 @@ public class IntColumn { private static class Writer implements IntColumnWriter { private final StorageWriter storage; + private long position = 0; public Writer(StorageWriter storageWriter) throws IOException { this.storage = storageWriter; @@ -29,10 +30,16 @@ public class IntColumn { for (int value : values) { storage.putInt(value); } + position+=values.length; } public void put(int value) throws IOException { storage.putInt(value); + position++; + } + + public long position() { + return position / Integer.BYTES; } public void close() throws IOException { @@ -53,7 +60,7 @@ public class IntColumn { @Override public long position() throws IOException { - return storage.position(); + return storage.position() / Integer.BYTES; } @Override diff --git a/code/libraries/slop/java/nu/marginalia/slop/column/primitive/LongColumn.java b/code/libraries/slop/java/nu/marginalia/slop/column/primitive/LongColumn.java index e2eac930..ac1e72f7 100644 --- a/code/libraries/slop/java/nu/marginalia/slop/column/primitive/LongColumn.java +++ b/code/libraries/slop/java/nu/marginalia/slop/column/primitive/LongColumn.java @@ -20,6 +20,7 @@ public class LongColumn { private static class Writer implements LongColumnWriter { private final StorageWriter storage; + private long position = 0; public Writer(StorageWriter storageWriter) { this.storage = storageWriter; @@ -27,6 +28,11 @@ public class LongColumn { public void put(long value) throws IOException { storage.putLong(value); + position++; + } + + public long position() { + return position; } public void close() throws IOException { @@ -47,7 +53,7 @@ public class LongColumn { @Override public long position() throws IOException { - return storage.position(); + return storage.position() / Long.BYTES; } @Override diff --git a/code/libraries/slop/java/nu/marginalia/slop/column/string/EnumColumn.java b/code/libraries/slop/java/nu/marginalia/slop/column/string/EnumColumn.java index 0a4f2845..c8383a7e 100644 --- a/code/libraries/slop/java/nu/marginalia/slop/column/string/EnumColumn.java +++ b/code/libraries/slop/java/nu/marginalia/slop/column/string/EnumColumn.java @@ -19,13 +19,13 @@ public class EnumColumn { public static StringColumnReader open(Path path, ColumnDesc name) throws IOException { return new Reader( StringColumn.open(path, - name.createDerivative( + name.createSupplementaryColumn( ColumnFunction.DICT, ColumnType.TXTSTRING, StorageType.PLAIN) ), VarintColumn.open(path, - name.createDerivative( + name.createSupplementaryColumn( ColumnFunction.DATA, ColumnType.ENUM_LE, StorageType.PLAIN @@ -36,8 +36,8 @@ public class EnumColumn { public static StringColumnWriter create(Path path, ColumnDesc name) throws IOException { return new Writer( - StringColumn.create(path, name.createDerivative(ColumnFunction.DICT, ColumnType.TXTSTRING, StorageType.PLAIN)), - VarintColumn.create(path, name.createDerivative(ColumnFunction.DATA, ColumnType.ENUM_LE, StorageType.PLAIN)) + StringColumn.create(path, name.createSupplementaryColumn(ColumnFunction.DICT, ColumnType.TXTSTRING, StorageType.PLAIN)), + VarintColumn.create(path, name.createSupplementaryColumn(ColumnFunction.DATA, ColumnType.ENUM_LE, StorageType.PLAIN)) ); } @@ -64,6 +64,10 @@ public class EnumColumn { dataColumn.put(index); } + public long position() { + return dataColumn.position(); + } + public void close() throws IOException { dataColumn.close(); dicionaryColumn.close(); diff --git a/code/libraries/slop/java/nu/marginalia/slop/column/string/StringColumn.java b/code/libraries/slop/java/nu/marginalia/slop/column/string/StringColumn.java index 14424f71..4daaa308 100644 --- a/code/libraries/slop/java/nu/marginalia/slop/column/string/StringColumn.java +++ b/code/libraries/slop/java/nu/marginalia/slop/column/string/StringColumn.java @@ -51,6 +51,10 @@ public class StringColumn { backingColumn.put(value.getBytes()); } + public long position() { + return backingColumn.position(); + } + public void close() throws IOException { backingColumn.close(); } @@ -92,6 +96,8 @@ public class StringColumn { private static class CStringWriter implements StringColumnWriter { private final StorageWriter storageWriter; + private long position = 0; + public CStringWriter(StorageWriter storageWriter) throws IOException { this.storageWriter = storageWriter; } @@ -100,10 +106,14 @@ public class StringColumn { if (null == value) { value = ""; } - assert value.indexOf('\0') == -1 : "Null byte not allowed in cstring"; storageWriter.putBytes(value.getBytes()); storageWriter.putByte((byte) 0); + position++; + } + + public long position() { + return position; } public void close() throws IOException { @@ -113,6 +123,7 @@ public class StringColumn { private static class CStringReader implements StringColumnReader { private final StorageReader storageReader; + private long position = 0; public CStringReader(StorageReader storageReader) throws IOException { this.storageReader = storageReader; @@ -124,12 +135,13 @@ public class StringColumn { while (storageReader.hasRemaining() && (b = storageReader.getByte()) != 0) { sb.append((char) b); } + position++; return sb.toString(); } @Override public long position() throws IOException { - return storageReader.position(); + return position; } @Override @@ -141,6 +153,7 @@ public class StringColumn { i++; } } + position += positions; } @Override @@ -157,6 +170,7 @@ public class StringColumn { private static class TxtStringWriter implements StringColumnWriter { private final StorageWriter storageWriter; + private long position = 0; public TxtStringWriter(StorageWriter storageWriter) throws IOException { this.storageWriter = storageWriter; @@ -171,6 +185,11 @@ public class StringColumn { storageWriter.putBytes(value.getBytes()); storageWriter.putByte((byte) '\n'); + position++; + } + + public long position() { + return position; } public void close() throws IOException { @@ -180,6 +199,7 @@ public class StringColumn { private static class TxtStringReader implements StringColumnReader { private final StorageReader storageReader; + private long position = 0; public TxtStringReader(StorageReader storageReader) throws IOException { this.storageReader = storageReader; @@ -197,18 +217,21 @@ public class StringColumn { sb.append((char) b); } } + position++; return sb.toString(); } @Override public long position() throws IOException { - return storageReader.position(); + return position; } @Override public void skip(long positions) throws IOException { int i = 0; + position+=positions; + while (i < positions && storageReader.hasRemaining()) { if (storageReader.getByte() == '\n') { i++; diff --git a/code/libraries/slop/java/nu/marginalia/slop/desc/ColumnDesc.java b/code/libraries/slop/java/nu/marginalia/slop/desc/ColumnDesc.java index 93d31a54..e5120fbd 100644 --- a/code/libraries/slop/java/nu/marginalia/slop/desc/ColumnDesc.java +++ b/code/libraries/slop/java/nu/marginalia/slop/desc/ColumnDesc.java @@ -36,20 +36,35 @@ public record ColumnDesc + ColumnDesc createSupplementaryColumn( ColumnFunction function, - ColumnType type, + ColumnType type, StorageType storageType) { - return new ColumnDesc(name, page, function, type, storageType); + return new ColumnDesc<>(name, page, function, type, storageType); } public ByteOrder byteOrder() { @@ -57,7 +72,7 @@ public record ColumnDesc forPage(int page) { - return new ColumnDesc(name, page, function, type, storageType); + return new ColumnDesc<>(name, page, function, type, storageType); } public boolean exists(Path base) { diff --git a/code/libraries/slop/java/nu/marginalia/slop/desc/SlopTable.java b/code/libraries/slop/java/nu/marginalia/slop/desc/SlopTable.java new file mode 100644 index 00000000..3d018eca --- /dev/null +++ b/code/libraries/slop/java/nu/marginalia/slop/desc/SlopTable.java @@ -0,0 +1,87 @@ +package nu.marginalia.slop.desc; + +import nu.marginalia.slop.column.ColumnReader; +import nu.marginalia.slop.column.ColumnWriter; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.*; + +/** SlopTable is a utility class for managing a group of columns that are + * read and written together. It is used to ensure that the reader and writer + * positions are maintained correctly between the columns, and to ensure that + * the columns are closed correctly. + *

+ * To deal with the fact that some columns may not be expected to have the same + * number of rows, SlopTable supports the concept of column groups. Each column + * group is a separate SlopTable instance, and the columns in the group are + * managed together. + *

+ * It is often a good idea to let the reader or writer class for a particular + * table inherit from SlopTable, so that the table is automatically closed when + * the reader or writer is closed. + */ + +public class SlopTable implements AutoCloseable { + private final List readerList = new ArrayList<>(); + private final List writerList = new ArrayList<>(); + + private final Map columnGroups = new HashMap<>(); + + private static final Logger logger = LoggerFactory.getLogger(SlopTable.class); + + /** Create a SlopTable corresponding to a grouping of columns that have their own + * internal consistency check. This is needed e.g. for grouped values. The table is + * closed automatically by the current instance. + */ + public SlopTable columnGroup(String name) { + return columnGroups.computeIfAbsent(name, k -> new SlopTable()); + } + + /** Register a column reader with this table. This is called from ColumnDesc. */ + void register(ColumnReader reader) { + readerList.add(reader); + } + + /** Register a column reader with this table. This is called from ColumnDesc. */ + void register(ColumnWriter writer) { + writerList.add(writer); + } + + public void close() throws IOException { + + Set positions = new HashSet<>(); + + for (ColumnReader reader : readerList) { + positions.add(reader.position()); + reader.close(); + } + for (ColumnWriter writer : writerList) { + positions.add(writer.position()); + writer.close(); + } + + + // Check for the scenario where we have multiple positions + // and one of the positions is zero, indicating that we haven't + // read or written to one of the columns. This is likely a bug, + // but not necessarily a severe one, so we just log a warning. + + if (positions.remove(0L) && !positions.isEmpty()) { + logger.warn("Zero position found in one of the tables, this is likely development debris"); + } + + // If there are more than one position and several are non-zero, then we haven't maintained the + // position correctly between the columns. This is a disaster, so we throw an exception. + if (positions.size() > 1) { + throw new IllegalStateException("Expected only one reader position, was " + positions); + } + + for (var table : columnGroups.values()) { + table.close(); + } + + } + +} diff --git a/code/libraries/slop/test/nu/marginalia/slop/column/StringColumnTest.java b/code/libraries/slop/test/nu/marginalia/slop/column/StringColumnTest.java index 486bc191..800c93eb 100644 --- a/code/libraries/slop/test/nu/marginalia/slop/column/StringColumnTest.java +++ b/code/libraries/slop/test/nu/marginalia/slop/column/StringColumnTest.java @@ -1,9 +1,6 @@ package nu.marginalia.slop.column; -import nu.marginalia.slop.desc.ColumnDesc; -import nu.marginalia.slop.desc.ColumnFunction; -import nu.marginalia.slop.desc.ColumnType; -import nu.marginalia.slop.desc.StorageType; +import nu.marginalia.slop.desc.*; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -61,11 +58,15 @@ class StringColumnTest { ColumnType.STRING, StorageType.GZIP); - try (var column = name.create(tempDir)) { + try (var table = new SlopTable()) { + var column = name.create(table, tempDir); + column.put("Lorem"); column.put("Ipsum"); } - try (var column = name.open(tempDir)) { + try (var table = new SlopTable()) { + var column = name.open(table, tempDir); + assertEquals("Lorem", column.get()); assertEquals("Ipsum", column.get()); assertFalse(column.hasRemaining()); @@ -80,11 +81,13 @@ class StringColumnTest { ColumnType.CSTRING, StorageType.GZIP); - try (var column = name.create(tempDir)) { + try (var table = new SlopTable()) { + var column = name.create(table, tempDir); column.put("Lorem"); column.put("Ipsum"); } - try (var column = name.open(tempDir)) { + try (var table = new SlopTable()) { + var column = name.open(table, tempDir); assertEquals("Lorem", column.get()); assertEquals("Ipsum", column.get()); assertFalse(column.hasRemaining()); @@ -99,11 +102,13 @@ class StringColumnTest { ColumnType.TXTSTRING, StorageType.GZIP); - try (var column = name.create(tempDir)) { + try (var table = new SlopTable()) { + var column = name.create(table, tempDir); column.put("Lorem"); column.put("Ipsum"); } - try (var column = name.open(tempDir)) { + try (var table = new SlopTable()) { + var column = name.open(table, tempDir); assertEquals("Lorem", column.get()); assertEquals("Ipsum", column.get()); assertFalse(column.hasRemaining()); diff --git a/code/processes/converting-process/build.gradle b/code/processes/converting-process/build.gradle index 942c8acd..1dd1edb9 100644 --- a/code/processes/converting-process/build.gradle +++ b/code/processes/converting-process/build.gradle @@ -36,6 +36,7 @@ dependencies { implementation project(':code:common:config') implementation project(':code:libraries:message-queue') implementation project(':code:libraries:blocking-thread-pool') + implementation project(':code:libraries:slop') implementation project(':code:libraries:guarded-regex') implementation project(':code:libraries:easy-lsh') diff --git a/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDocumentRecord.java b/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDocumentRecord.java index 177eaf9a..a654af5d 100644 --- a/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDocumentRecord.java +++ b/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDocumentRecord.java @@ -14,6 +14,7 @@ import nu.marginalia.slop.column.string.StringColumnReader; import nu.marginalia.slop.column.string.StringColumnWriter; import nu.marginalia.slop.desc.ColumnDesc; import nu.marginalia.slop.desc.ColumnType; +import nu.marginalia.slop.desc.SlopTable; import nu.marginalia.slop.desc.StorageType; import java.io.IOException; @@ -119,7 +120,7 @@ public record SlopDocumentRecord( private static final ColumnDesc spanCodesColumn = new ColumnDesc<>("spanCodes", ColumnType.BYTE_ARRAY, StorageType.ZSTD); private static final ColumnDesc spansColumn = new ColumnDesc<>("spans", ColumnType.BYTE_ARRAY_GCS, StorageType.ZSTD); - public static class KeywordsProjectionReader implements AutoCloseable { + public static class KeywordsProjectionReader extends SlopTable { private final StringColumnReader domainsReader; private final VarintColumnReader ordinalsReader; private final IntColumnReader htmlFeaturesReader; @@ -140,17 +141,19 @@ public record SlopDocumentRecord( } public KeywordsProjectionReader(Path baseDir, int page) throws IOException { - domainsReader = domainsColumn.forPage(page).open(baseDir); - ordinalsReader = ordinalsColumn.forPage(page).open(baseDir); - htmlFeaturesReader = htmlFeaturesColumn.forPage(page).open(baseDir); - domainMetadataReader = domainMetadata.forPage(page).open(baseDir); - lengthsReader = lengthsColumn.forPage(page).open(baseDir); - keywordsReader = keywordsColumn.forPage(page).open(baseDir); - termCountsReader = termCountsColumn.forPage(page).open(baseDir); - termMetaReader = termMetaColumn.forPage(page).open(baseDir); - termPositionsReader = termPositionsColumn.forPage(page).open(baseDir); - spanCodesReader = spanCodesColumn.forPage(page).open(baseDir); - spansReader = spansColumn.forPage(page).open(baseDir); + domainsReader = domainsColumn.forPage(page).open(this, baseDir); + ordinalsReader = ordinalsColumn.forPage(page).open(this, baseDir); + htmlFeaturesReader = htmlFeaturesColumn.forPage(page).open(this, baseDir); + domainMetadataReader = domainMetadata.forPage(page).open(this, baseDir); + lengthsReader = lengthsColumn.forPage(page).open(this, baseDir); + termCountsReader = termCountsColumn.forPage(page).open(this, baseDir); + + keywordsReader = keywordsColumn.forPage(page).open(this.columnGroup("keywords"), baseDir); + termMetaReader = termMetaColumn.forPage(page).open(this.columnGroup("keywords"), baseDir); + termPositionsReader = termPositionsColumn.forPage(page).open(this.columnGroup("keywords"), baseDir); + + spanCodesReader = spanCodesColumn.forPage(page).open(this.columnGroup("spans"), baseDir); + spansReader = spansColumn.forPage(page).open(this.columnGroup("spans"), baseDir); } public boolean hasMore() throws IOException { @@ -197,22 +200,9 @@ public record SlopDocumentRecord( ); } - - public void close() throws IOException { - domainsReader.close(); - ordinalsReader.close(); - htmlFeaturesReader.close(); - domainMetadataReader.close(); - lengthsReader.close(); - keywordsReader.close(); - termMetaReader.close(); - termPositionsReader.close(); - spanCodesReader.close(); - spansReader.close(); - } } - public static class MetadataReader implements AutoCloseable { + public static class MetadataReader extends SlopTable { private final StringColumnReader domainsReader; private final StringColumnReader urlsReader; private final VarintColumnReader ordinalsReader; @@ -230,17 +220,17 @@ public record SlopDocumentRecord( } public MetadataReader(Path baseDir, int page) throws IOException { - this.domainsReader = domainsColumn.forPage(page).open(baseDir); - this.urlsReader = urlsColumn.forPage(page).open(baseDir); - this.ordinalsReader = ordinalsColumn.forPage(page).open(baseDir); - this.titlesReader = titlesColumn.forPage(page).open(baseDir); - this.descriptionsReader = descriptionsColumn.forPage(page).open(baseDir); - this.htmlFeaturesReader = htmlFeaturesColumn.forPage(page).open(baseDir); - this.htmlStandardsReader = htmlStandardsColumn.forPage(page).open(baseDir); - this.lengthsReader = lengthsColumn.forPage(page).open(baseDir); - this.hashesReader = hashesColumn.forPage(page).open(baseDir); - this.qualitiesReader = qualitiesColumn.forPage(page).open(baseDir); - this.pubYearReader = pubYearColumn.forPage(page).open(baseDir); + this.domainsReader = domainsColumn.forPage(page).open(this, baseDir); + this.urlsReader = urlsColumn.forPage(page).open(this, baseDir); + this.ordinalsReader = ordinalsColumn.forPage(page).open(this, baseDir); + this.titlesReader = titlesColumn.forPage(page).open(this, baseDir); + this.descriptionsReader = descriptionsColumn.forPage(page).open(this, baseDir); + this.htmlFeaturesReader = htmlFeaturesColumn.forPage(page).open(this, baseDir); + this.htmlStandardsReader = htmlStandardsColumn.forPage(page).open(this, baseDir); + this.lengthsReader = lengthsColumn.forPage(page).open(this, baseDir); + this.hashesReader = hashesColumn.forPage(page).open(this, baseDir); + this.qualitiesReader = qualitiesColumn.forPage(page).open(this, baseDir); + this.pubYearReader = pubYearColumn.forPage(page).open(this, baseDir); } public MetadataProjection next() throws IOException { @@ -264,22 +254,9 @@ public record SlopDocumentRecord( return domainsReader.hasRemaining(); } - public void close() throws IOException { - domainsReader.close(); - urlsReader.close(); - ordinalsReader.close(); - titlesReader.close(); - descriptionsReader.close(); - htmlFeaturesReader.close(); - htmlStandardsReader.close(); - lengthsReader.close(); - hashesReader.close(); - qualitiesReader.close(); - pubYearReader.close(); - } } - public static class Writer implements AutoCloseable { + public static class Writer extends SlopTable { private final StringColumnWriter domainsWriter; private final StringColumnWriter urlsWriter; private final VarintColumnWriter ordinalsWriter; @@ -302,27 +279,28 @@ public record SlopDocumentRecord( private final GammaCodedSequenceWriter spansWriter; public Writer(Path baseDir, int page) throws IOException { - domainsWriter = domainsColumn.forPage(page).create(baseDir); - urlsWriter = urlsColumn.forPage(page).create(baseDir); - ordinalsWriter = ordinalsColumn.forPage(page).create(baseDir); - statesWriter = statesColumn.forPage(page).create(baseDir); - stateReasonsWriter = stateReasonsColumn.forPage(page).create(baseDir); - titlesWriter = titlesColumn.forPage(page).create(baseDir); - descriptionsWriter = descriptionsColumn.forPage(page).create(baseDir); - htmlFeaturesWriter = htmlFeaturesColumn.forPage(page).create(baseDir); - htmlStandardsWriter = htmlStandardsColumn.forPage(page).create(baseDir); - lengthsWriter = lengthsColumn.forPage(page).create(baseDir); - hashesWriter = hashesColumn.forPage(page).create(baseDir); - qualitiesWriter = qualitiesColumn.forPage(page).create(baseDir); - domainMetadataWriter = domainMetadata.forPage(page).create(baseDir); - pubYearWriter = pubYearColumn.forPage(page).create(baseDir); - termCountsWriter = termCountsColumn.forPage(page).create(baseDir); - keywordsWriter = keywordsColumn.forPage(page).create(baseDir); - termMetaWriter = termMetaColumn.forPage(page).create(baseDir); - termPositionsWriter = termPositionsColumn.forPage(page).create(baseDir); + domainsWriter = domainsColumn.forPage(page).create(this, baseDir); + urlsWriter = urlsColumn.forPage(page).create(this, baseDir); + ordinalsWriter = ordinalsColumn.forPage(page).create(this, baseDir); + statesWriter = statesColumn.forPage(page).create(this, baseDir); + stateReasonsWriter = stateReasonsColumn.forPage(page).create(this, baseDir); + titlesWriter = titlesColumn.forPage(page).create(this, baseDir); + descriptionsWriter = descriptionsColumn.forPage(page).create(this, baseDir); + htmlFeaturesWriter = htmlFeaturesColumn.forPage(page).create(this, baseDir); + htmlStandardsWriter = htmlStandardsColumn.forPage(page).create(this, baseDir); + lengthsWriter = lengthsColumn.forPage(page).create(this, baseDir); + hashesWriter = hashesColumn.forPage(page).create(this, baseDir); + qualitiesWriter = qualitiesColumn.forPage(page).create(this, baseDir); + domainMetadataWriter = domainMetadata.forPage(page).create(this, baseDir); + pubYearWriter = pubYearColumn.forPage(page).create(this, baseDir); + termCountsWriter = termCountsColumn.forPage(page).create(this, baseDir); - spansWriter = spansColumn.forPage(page).create(baseDir); - spansCodesWriter = spanCodesColumn.forPage(page).create(baseDir); + keywordsWriter = keywordsColumn.forPage(page).create(this.columnGroup("keywords"), baseDir); + termMetaWriter = termMetaColumn.forPage(page).create(this.columnGroup("keywords"), baseDir); + termPositionsWriter = termPositionsColumn.forPage(page).create(this.columnGroup("keywords"), baseDir); + + spansWriter = spansColumn.forPage(page).create(this.columnGroup("spans"), baseDir); + spansCodesWriter = spanCodesColumn.forPage(page).create(this.columnGroup("spans"), baseDir); } public void write(SlopDocumentRecord record) throws IOException { @@ -367,29 +345,5 @@ public record SlopDocumentRecord( } } - - public void close() throws IOException { - domainsWriter.close(); - urlsWriter.close(); - ordinalsWriter.close(); - statesWriter.close(); - stateReasonsWriter.close(); - titlesWriter.close(); - descriptionsWriter.close(); - htmlFeaturesWriter.close(); - htmlStandardsWriter.close(); - lengthsWriter.close(); - hashesWriter.close(); - qualitiesWriter.close(); - domainMetadataWriter.close(); - pubYearWriter.close(); - termCountsWriter.close(); - keywordsWriter.close(); - termMetaWriter.close(); - termPositionsWriter.close(); - - spansCodesWriter.close(); - spansWriter.close(); - } } } diff --git a/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainLinkRecord.java b/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainLinkRecord.java index d0b3c6d6..7cb3b7df 100644 --- a/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainLinkRecord.java +++ b/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainLinkRecord.java @@ -4,6 +4,7 @@ import nu.marginalia.slop.column.string.StringColumnReader; import nu.marginalia.slop.column.string.StringColumnWriter; import nu.marginalia.slop.desc.ColumnDesc; import nu.marginalia.slop.desc.ColumnType; +import nu.marginalia.slop.desc.SlopTable; import nu.marginalia.slop.desc.StorageType; import java.io.IOException; @@ -21,7 +22,7 @@ public record SlopDomainLinkRecord( return new Reader(baseDir, page); } - public static class Reader implements AutoCloseable { + public static class Reader extends SlopTable { private final StringColumnReader sourcesReader; private final StringColumnReader destsReader; @@ -30,15 +31,8 @@ public record SlopDomainLinkRecord( } public Reader(Path baseDir, int page) throws IOException { - sourcesReader = sourcesColumn.forPage(page).open(baseDir); - destsReader = destsColumn.forPage(page).open(baseDir); - } - - - @Override - public void close() throws IOException { - sourcesReader.close(); - destsReader.close(); + sourcesReader = sourcesColumn.forPage(page).open(this, baseDir); + destsReader = destsColumn.forPage(page).open(this, baseDir); } public boolean hasMore() throws IOException { @@ -60,13 +54,13 @@ public record SlopDomainLinkRecord( } } - public static class Writer implements AutoCloseable { + public static class Writer extends SlopTable { private final StringColumnWriter sourcesWriter; private final StringColumnWriter destsWriter; public Writer(Path baseDir, int page) throws IOException { - sourcesWriter = sourcesColumn.forPage(page).create(baseDir); - destsWriter = destsColumn.forPage(page).create(baseDir); + sourcesWriter = sourcesColumn.forPage(page).create(this, baseDir); + destsWriter = destsColumn.forPage(page).create(this, baseDir); } public void write(SlopDomainLinkRecord record) throws IOException { diff --git a/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainRecord.java b/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainRecord.java index 059a6e81..b1c6533b 100644 --- a/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainRecord.java +++ b/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainRecord.java @@ -6,6 +6,7 @@ import nu.marginalia.slop.column.string.StringColumnReader; import nu.marginalia.slop.column.string.StringColumnWriter; import nu.marginalia.slop.desc.ColumnDesc; import nu.marginalia.slop.desc.ColumnType; +import nu.marginalia.slop.desc.SlopTable; import nu.marginalia.slop.desc.StorageType; import java.io.IOException; @@ -43,7 +44,7 @@ public record SlopDomainRecord( private static final ColumnDesc rssFeedsColumn = new ColumnDesc<>("rssFeeds", ColumnType.TXTSTRING, StorageType.GZIP); - public static class DomainNameReader implements AutoCloseable { + public static class DomainNameReader extends SlopTable { private final StringColumnReader domainsReader; public DomainNameReader(SlopPageRef page) throws IOException { @@ -51,13 +52,7 @@ public record SlopDomainRecord( } public DomainNameReader(Path baseDir, int page) throws IOException { - domainsReader = domainsColumn.forPage(page).open(baseDir); - } - - - @Override - public void close() throws IOException { - domainsReader.close(); + domainsReader = domainsColumn.forPage(page).open(this, baseDir); } public boolean hasMore() throws IOException { @@ -69,7 +64,7 @@ public record SlopDomainRecord( } } - public static class DomainWithIpReader implements AutoCloseable { + public static class DomainWithIpReader extends SlopTable { private final StringColumnReader domainsReader; private final StringColumnReader ipReader; @@ -78,15 +73,8 @@ public record SlopDomainRecord( } public DomainWithIpReader(Path baseDir, int page) throws IOException { - domainsReader = domainsColumn.forPage(page).open(baseDir); - ipReader = ipColumn.forPage(page).open(baseDir); - } - - - @Override - public void close() throws IOException { - domainsReader.close(); - ipReader.close(); + domainsReader = domainsColumn.forPage(page).open(this, baseDir); + ipReader = ipColumn.forPage(page).open(this, baseDir); } public boolean hasMore() throws IOException { @@ -102,7 +90,7 @@ public record SlopDomainRecord( } } - public static class Reader implements AutoCloseable { + public static class Reader extends SlopTable { private final StringColumnReader domainsReader; private final StringColumnReader statesReader; private final StringColumnReader redirectReader; @@ -120,33 +108,17 @@ public record SlopDomainRecord( } public Reader(Path baseDir, int page) throws IOException { - domainsReader = domainsColumn.forPage(page).open(baseDir); - statesReader = statesColumn.forPage(page).open(baseDir); - redirectReader = redirectDomainsColumn.forPage(page).open(baseDir); - ipReader = ipColumn.forPage(page).open(baseDir); + domainsReader = domainsColumn.forPage(page).open(this, baseDir); + statesReader = statesColumn.forPage(page).open(this, baseDir); + redirectReader = redirectDomainsColumn.forPage(page).open(this, baseDir); + ipReader = ipColumn.forPage(page).open(this, baseDir); - knownUrlsReader = knownUrlsColumn.forPage(page).open(baseDir); - goodUrlsReader = goodUrlsColumn.forPage(page).open(baseDir); - visitedUrlsReader = visitedUrlsColumn.forPage(page).open(baseDir); + knownUrlsReader = knownUrlsColumn.forPage(page).open(this, baseDir); + goodUrlsReader = goodUrlsColumn.forPage(page).open(this, baseDir); + visitedUrlsReader = visitedUrlsColumn.forPage(page).open(this, baseDir); - rssFeedsCountReader = rssFeedsCountColumn.forPage(page).open(baseDir); - rssFeedsReader = rssFeedsColumn.forPage(page).open(baseDir); - } - - - @Override - public void close() throws IOException { - domainsReader.close(); - statesReader.close(); - redirectReader.close(); - ipReader.close(); - - knownUrlsReader.close(); - goodUrlsReader.close(); - visitedUrlsReader.close(); - - rssFeedsCountReader.close(); - rssFeedsReader.close(); + rssFeedsCountReader = rssFeedsCountColumn.forPage(page).open(this, baseDir); + rssFeedsReader = rssFeedsColumn.forPage(page).open(this, baseDir); } public boolean hasMore() throws IOException { @@ -179,7 +151,7 @@ public record SlopDomainRecord( } } - public static class Writer implements AutoCloseable { + public static class Writer extends SlopTable { private final StringColumnWriter domainsWriter; private final StringColumnWriter statesWriter; private final StringColumnWriter redirectWriter; @@ -193,17 +165,17 @@ public record SlopDomainRecord( private final StringColumnWriter rssFeedsWriter; public Writer(Path baseDir, int page) throws IOException { - domainsWriter = domainsColumn.forPage(page).create(baseDir); - statesWriter = statesColumn.forPage(page).create(baseDir); - redirectWriter = redirectDomainsColumn.forPage(page).create(baseDir); - ipWriter = ipColumn.forPage(page).create(baseDir); + domainsWriter = domainsColumn.forPage(page).create(this, baseDir); + statesWriter = statesColumn.forPage(page).create(this, baseDir); + redirectWriter = redirectDomainsColumn.forPage(page).create(this, baseDir); + ipWriter = ipColumn.forPage(page).create(this, baseDir); - knownUrlsWriter = knownUrlsColumn.forPage(page).create(baseDir); - goodUrlsWriter = goodUrlsColumn.forPage(page).create(baseDir); - visitedUrlsWriter = visitedUrlsColumn.forPage(page).create(baseDir); + knownUrlsWriter = knownUrlsColumn.forPage(page).create(this, baseDir); + goodUrlsWriter = goodUrlsColumn.forPage(page).create(this, baseDir); + visitedUrlsWriter = visitedUrlsColumn.forPage(page).create(this, baseDir); - rssFeedsCountWriter = rssFeedsCountColumn.forPage(page).create(baseDir); - rssFeedsWriter = rssFeedsColumn.forPage(page).create(baseDir); + rssFeedsCountWriter = rssFeedsCountColumn.forPage(page).create(this, baseDir); + rssFeedsWriter = rssFeedsColumn.forPage(page).create(this.columnGroup("rss-feeds"), baseDir); } public void write(SlopDomainRecord record) throws IOException { @@ -221,20 +193,5 @@ public record SlopDomainRecord( rssFeedsWriter.put(rssFeed); } } - - @Override - public void close() throws IOException { - domainsWriter.close(); - statesWriter.close(); - redirectWriter.close(); - ipWriter.close(); - - knownUrlsWriter.close(); - goodUrlsWriter.close(); - visitedUrlsWriter.close(); - - rssFeedsCountWriter.close(); - rssFeedsWriter.close(); - } } } diff --git a/code/processes/loading-process/build.gradle b/code/processes/loading-process/build.gradle index 5e49ed30..57bf8eaf 100644 --- a/code/processes/loading-process/build.gradle +++ b/code/processes/loading-process/build.gradle @@ -32,6 +32,7 @@ dependencies { implementation project(':code:libraries:message-queue') implementation project(':code:libraries:language-processing') implementation project(':code:libraries:coded-sequence') + implementation project(':code:libraries:slop') implementation project(':third-party:commons-codec') implementation project(':third-party:parquet-floor') testImplementation project(':code:services-application:search-service')