From 266d6e4beabdc66aab5209740ee5c4cf48551d55 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 21 Aug 2024 10:13:49 +0200 Subject: [PATCH] (slop) Replace SlopPageRef with SlopTable.Ref --- .../model/processed/SlopDocumentRecord.java | 20 +++++------- .../model/processed/SlopDomainLinkRecord.java | 12 +++---- .../model/processed/SlopDomainRecord.java | 32 +++++++++---------- .../model/processed/SlopPageRef.java | 6 ---- .../processed/SlopDocumentRecordTest.java | 3 +- .../marginalia/loading/LoaderInputData.java | 23 +++++++------ .../documents/DocumentLoaderService.java | 4 +-- .../documents/KeywordLoaderService.java | 6 ++-- .../loading/domains/DomainLoaderService.java | 14 ++++---- .../links/DomainLinksLoaderService.java | 4 +-- settings.gradle | 2 +- 11 files changed, 60 insertions(+), 66 deletions(-) delete mode 100644 code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopPageRef.java diff --git a/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDocumentRecord.java b/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDocumentRecord.java index 4c6b62dd..1515ed9a 100644 --- a/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDocumentRecord.java +++ b/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDocumentRecord.java @@ -148,12 +148,8 @@ public record SlopDocumentRecord( private final ByteArrayColumn.Reader spanCodesReader; private final GammaCodedSequenceArrayColumn.Reader spansReader; - public KeywordsProjectionReader(SlopPageRef pageRef) throws IOException { - this(pageRef.baseDir(), pageRef.page()); - } - - public KeywordsProjectionReader(Path baseDir, int page) throws IOException { - super(baseDir, page); + public KeywordsProjectionReader(SlopTable.Ref pageRef) throws IOException { + super(pageRef); domainsReader = domainsColumn.open(this); ordinalsReader = ordinalsColumn.open(this); htmlFeaturesReader = htmlFeaturesColumn.open(this); @@ -216,12 +212,8 @@ public record SlopDocumentRecord( private final FloatColumn.Reader qualitiesReader; private final IntColumn.Reader pubYearReader; - public MetadataReader(SlopPageRef pageRef) throws IOException{ - this(pageRef.baseDir(), pageRef.page()); - } - - public MetadataReader(Path baseDir, int page) throws IOException { - super(baseDir, page); + public MetadataReader(SlopTable.Ref pageRef) throws IOException{ + super(pageRef); this.domainsReader = domainsColumn.open(this); this.urlsReader = urlsColumn.open(this); @@ -236,6 +228,10 @@ public record SlopDocumentRecord( this.pubYearReader = pubYearColumn.open(this); } + public MetadataReader(Path baseDir, int page) throws IOException { + this(new Ref<>(baseDir, page)); + } + public boolean hasMore() throws IOException { return domainsReader.hasRemaining(); } diff --git a/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainLinkRecord.java b/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainLinkRecord.java index 6d1bcd03..a2184fc1 100644 --- a/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainLinkRecord.java +++ b/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainLinkRecord.java @@ -23,17 +23,17 @@ public record SlopDomainLinkRecord( private final TxtStringColumn.Reader sourcesReader; private final TxtStringColumn.Reader destsReader; - public Reader(SlopPageRef page) throws IOException { - this(page.baseDir(), page.page()); - } - - public Reader(Path baseDir, int page) throws IOException { - super(baseDir, page); + public Reader(SlopTable.Ref ref) throws IOException { + super(ref); sourcesReader = sourcesColumn.open(this); destsReader = destsColumn.open(this); } + public Reader(Path baseDir, int page) throws IOException { + this(new Ref<>(baseDir, page)); + } + public boolean hasMore() throws IOException { return sourcesReader.hasRemaining(); } diff --git a/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainRecord.java b/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainRecord.java index 6cb924f2..6b3d1395 100644 --- a/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainRecord.java +++ b/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainRecord.java @@ -43,12 +43,12 @@ public record SlopDomainRecord( public static class DomainNameReader extends SlopTable { private final TxtStringColumn.Reader domainsReader; - public DomainNameReader(SlopPageRef page) throws IOException { - this(page.baseDir(), page.page()); + public DomainNameReader(Path baseDir, int page) throws IOException { + this(new Ref<>(baseDir, page)); } - public DomainNameReader(Path baseDir, int page) throws IOException { - super(baseDir, page); + public DomainNameReader(SlopTable.Ref ref) throws IOException { + super(ref); domainsReader = domainsColumn.open(this); } @@ -66,17 +66,17 @@ public record SlopDomainRecord( private final TxtStringColumn.Reader domainsReader; private final TxtStringColumn.Reader ipReader; - public DomainWithIpReader(SlopPageRef page) throws IOException { - this(page.baseDir(), page.page()); - } - - public DomainWithIpReader(Path baseDir, int page) throws IOException { - super(baseDir, page); + public DomainWithIpReader(SlopTable.Ref ref) throws IOException { + super(ref); domainsReader = domainsColumn.open(this); ipReader = ipColumn.open(this); } + public DomainWithIpReader(Path baseDir, int page) throws IOException { + this(new Ref<>(baseDir, page)); + } + public boolean hasMore() throws IOException { return domainsReader.hasRemaining(); } @@ -102,12 +102,8 @@ public record SlopDomainRecord( private final ObjectArrayColumn.Reader rssFeedsReader; - public Reader(SlopPageRef page) throws IOException { - this(page.baseDir(), page.page()); - } - - public Reader(Path baseDir, int page) throws IOException { - super(baseDir, page); + public Reader(SlopTable.Ref ref) throws IOException { + super(ref); domainsReader = domainsColumn.open(this); statesReader = statesColumn.open(this); @@ -121,6 +117,10 @@ public record SlopDomainRecord( rssFeedsReader = rssFeedsColumn.open(this); } + public Reader(Path baseDir, int page) throws IOException { + this(new Ref<>(baseDir, page)); + } + public boolean hasMore() throws IOException { return domainsReader.hasRemaining(); } diff --git a/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopPageRef.java b/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopPageRef.java deleted file mode 100644 index fb349621..00000000 --- a/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopPageRef.java +++ /dev/null @@ -1,6 +0,0 @@ -package nu.marginalia.model.processed; - -import java.nio.file.Path; - -public record SlopPageRef(Path baseDir, int page) { -} diff --git a/code/processes/converting-process/model/test/nu/marginalia/model/processed/SlopDocumentRecordTest.java b/code/processes/converting-process/model/test/nu/marginalia/model/processed/SlopDocumentRecordTest.java index 9a3aef56..3dd7ae80 100644 --- a/code/processes/converting-process/model/test/nu/marginalia/model/processed/SlopDocumentRecordTest.java +++ b/code/processes/converting-process/model/test/nu/marginalia/model/processed/SlopDocumentRecordTest.java @@ -1,6 +1,7 @@ package nu.marginalia.model.processed; import nu.marginalia.sequence.GammaCodedSequence; +import nu.marginalia.slop.SlopTable; import nu.marginalia.test.TestUtil; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Assertions; @@ -54,7 +55,7 @@ public class SlopDocumentRecordTest { writer.write(record); } - try (var keywordReader = new SlopDocumentRecord.KeywordsProjectionReader(testDir, 0)) { + try (var keywordReader = new SlopDocumentRecord.KeywordsProjectionReader(new SlopTable.Ref<>(testDir, 0))) { assertTrue(keywordReader.hasMore()); var readRecord = keywordReader.next(); assertFalse(keywordReader.hasMore()); diff --git a/code/processes/loading-process/java/nu/marginalia/loading/LoaderInputData.java b/code/processes/loading-process/java/nu/marginalia/loading/LoaderInputData.java index 7dda3e05..b874bf05 100644 --- a/code/processes/loading-process/java/nu/marginalia/loading/LoaderInputData.java +++ b/code/processes/loading-process/java/nu/marginalia/loading/LoaderInputData.java @@ -4,7 +4,7 @@ import nu.marginalia.io.processed.ProcessedDataFileNames; import nu.marginalia.model.processed.SlopDocumentRecord; import nu.marginalia.model.processed.SlopDomainLinkRecord; import nu.marginalia.model.processed.SlopDomainRecord; -import nu.marginalia.model.processed.SlopPageRef; +import nu.marginalia.slop.SlopTable; import nu.marginalia.worklog.BatchingWorkLogInspector; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -43,31 +43,34 @@ public class LoaderInputData { lastGoodBatch.put(singleSource, lastBatch); } - public Collection> listDomainPages() { - List> pathsAll = new ArrayList<>(); + public Collection> listDomainPages() { + List> pathsAll = new ArrayList<>(); + for (var source : sourceDirectories) { for (int i = 0; i < lastGoodBatch.get(source); i++) { - pathsAll.add(new SlopPageRef<>(ProcessedDataFileNames.domainFileName(source), i)); + pathsAll.add(new SlopTable.Ref<>(ProcessedDataFileNames.domainFileName(source), i)); } } return pathsAll; } - public Collection> listDomainLinkPages() { - List> pathsAll = new ArrayList<>(); + public Collection> listDomainLinkPages() { + List> pathsAll = new ArrayList<>(); + for (var source : sourceDirectories) { for (int i = 0; i < lastGoodBatch.get(source); i++) { - pathsAll.add(new SlopPageRef<>(ProcessedDataFileNames.domainLinkFileName(source), i)); + pathsAll.add(new SlopTable.Ref<>(ProcessedDataFileNames.domainLinkFileName(source), i)); } } return pathsAll; } - public Collection> listDocumentFiles() { - List> pathsAll = new ArrayList<>(); + public Collection> listDocumentFiles() { + List> pathsAll = new ArrayList<>(); + for (var source : sourceDirectories) { for (int i = 0; i < lastGoodBatch.get(source); i++) { - pathsAll.add(new SlopPageRef<>(ProcessedDataFileNames.documentFileName(source), i)); + pathsAll.add(new SlopTable.Ref<>(ProcessedDataFileNames.documentFileName(source), i)); } } return pathsAll; diff --git a/code/processes/loading-process/java/nu/marginalia/loading/documents/DocumentLoaderService.java b/code/processes/loading-process/java/nu/marginalia/loading/documents/DocumentLoaderService.java index 7c96699a..bba79952 100644 --- a/code/processes/loading-process/java/nu/marginalia/loading/documents/DocumentLoaderService.java +++ b/code/processes/loading-process/java/nu/marginalia/loading/documents/DocumentLoaderService.java @@ -10,8 +10,8 @@ import nu.marginalia.loading.domains.DomainIdRegistry; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.model.processed.SlopDocumentRecord; -import nu.marginalia.model.processed.SlopPageRef; import nu.marginalia.process.control.ProcessHeartbeat; +import nu.marginalia.slop.SlopTable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -38,7 +38,7 @@ public class DocumentLoaderService { LoaderInputData inputData) throws IOException, SQLException { - Collection> pageRefs = inputData.listDocumentFiles(); + Collection> pageRefs = inputData.listDocumentFiles(); try (var taskHeartbeat = processHeartbeat.createAdHocTaskHeartbeat("DOCUMENTS")) { diff --git a/code/processes/loading-process/java/nu/marginalia/loading/documents/KeywordLoaderService.java b/code/processes/loading-process/java/nu/marginalia/loading/documents/KeywordLoaderService.java index 5188c06b..fadbd64c 100644 --- a/code/processes/loading-process/java/nu/marginalia/loading/documents/KeywordLoaderService.java +++ b/code/processes/loading-process/java/nu/marginalia/loading/documents/KeywordLoaderService.java @@ -7,8 +7,8 @@ import nu.marginalia.loading.LoaderInputData; import nu.marginalia.loading.domains.DomainIdRegistry; import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.model.processed.SlopDocumentRecord; -import nu.marginalia.model.processed.SlopPageRef; import nu.marginalia.process.control.ProcessHeartbeat; +import nu.marginalia.slop.SlopTable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -30,10 +30,10 @@ public class KeywordLoaderService { LoaderInputData inputData) throws IOException { try (var task = heartbeat.createAdHocTaskHeartbeat("KEYWORDS")) { - Collection> documentFiles = inputData.listDocumentFiles(); + Collection> documentFiles = inputData.listDocumentFiles(); int processed = 0; - for (SlopPageRef pageRef : documentFiles) { + for (SlopTable.Ref pageRef : documentFiles) { task.progress("LOAD", processed++, documentFiles.size()); try (var keywordsReader = new SlopDocumentRecord.KeywordsProjectionReader(pageRef)) { diff --git a/code/processes/loading-process/java/nu/marginalia/loading/domains/DomainLoaderService.java b/code/processes/loading-process/java/nu/marginalia/loading/domains/DomainLoaderService.java index 94419cf5..66389062 100644 --- a/code/processes/loading-process/java/nu/marginalia/loading/domains/DomainLoaderService.java +++ b/code/processes/loading-process/java/nu/marginalia/loading/domains/DomainLoaderService.java @@ -8,9 +8,9 @@ import nu.marginalia.loading.LoaderInputData; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.processed.SlopDomainLinkRecord; import nu.marginalia.model.processed.SlopDomainRecord; -import nu.marginalia.model.processed.SlopPageRef; import nu.marginalia.process.control.ProcessHeartbeat; import nu.marginalia.process.control.ProcessHeartbeatImpl; +import nu.marginalia.slop.SlopTable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -59,8 +59,8 @@ public class DomainLoaderService { { taskHeartbeat.progress(Steps.PREP_DATA); - Collection> domainPageRefs = inputData.listDomainPages(); - Collection> domainLinkPageRefs = inputData.listDomainLinkPages(); + Collection> domainPageRefs = inputData.listDomainPages(); + Collection> domainLinkPageRefs = inputData.listDomainLinkPages(); // Ensure that the domains we've just crawled are in the domain database to this node try (var inserter = new DomainInserter(conn, nodeId); @@ -68,7 +68,7 @@ public class DomainLoaderService { // Add domain names from this data set with the current node affinity int pageIdx = 0; - for (SlopPageRef page : inputData.listDomainPages()) { + for (SlopTable.Ref page : inputData.listDomainPages()) { processHeartbeat.progress("INSERT", pageIdx++, domainPageRefs.size()); try (var reader = new SlopDomainRecord.DomainNameReader(page)) { @@ -89,7 +89,7 @@ public class DomainLoaderService { // Add linked domains, but with -1 affinity meaning they can be grabbed by any index node int pageIdx = 0; - for (SlopPageRef page : inputData.listDomainLinkPages()) { + for (SlopTable.Ref page : inputData.listDomainLinkPages()) { processHeartbeat.progress("INSERT", pageIdx++, domainLinkPageRefs.size()); try (var reader = new SlopDomainLinkRecord.Reader(page)) { @@ -111,7 +111,7 @@ public class DomainLoaderService { // Update the node affinity and IP address for each domain int pageIdx = 0; - for (SlopPageRef page : inputData.listDomainPages()) { + for (SlopTable.Ref page : inputData.listDomainPages()) { processHeartbeat.progress("UPDATE", pageIdx++, domainPageRefs.size()); try (var updater = new DomainAffinityAndIpUpdater(conn, nodeId); @@ -154,7 +154,7 @@ public class DomainLoaderService { int processed = 0; - Collection> pages = inputData.listDomainPages(); + Collection> pages = inputData.listDomainPages(); for (var page : pages) { taskHeartbeat.progress("UPDATE-META", processed++, pages.size()); diff --git a/code/processes/loading-process/java/nu/marginalia/loading/links/DomainLinksLoaderService.java b/code/processes/loading-process/java/nu/marginalia/loading/links/DomainLinksLoaderService.java index 640afd76..bc4479d6 100644 --- a/code/processes/loading-process/java/nu/marginalia/loading/links/DomainLinksLoaderService.java +++ b/code/processes/loading-process/java/nu/marginalia/loading/links/DomainLinksLoaderService.java @@ -7,8 +7,8 @@ import nu.marginalia.linkgraph.io.DomainLinksWriter; import nu.marginalia.loading.LoaderInputData; import nu.marginalia.loading.domains.DomainIdRegistry; import nu.marginalia.model.processed.SlopDomainLinkRecord; -import nu.marginalia.model.processed.SlopPageRef; import nu.marginalia.process.control.ProcessHeartbeat; +import nu.marginalia.slop.SlopTable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -34,7 +34,7 @@ public class DomainLinksLoaderService { try (var task = heartbeat.createAdHocTaskHeartbeat("LINKS"); var linkLoader = new LinkLoader(domainIdRegistry)) { - Collection> pageRefs = inputData.listDomainLinkPages(); + Collection> pageRefs = inputData.listDomainLinkPages(); int processed = 0; diff --git a/settings.gradle b/settings.gradle index ccd51ccb..cadac6a5 100644 --- a/settings.gradle +++ b/settings.gradle @@ -226,7 +226,7 @@ dependencyResolutionManagement { library('jetty-util','org.eclipse.jetty','jetty-util').version('9.4.54.v20240208') library('jetty-servlet','org.eclipse.jetty','jetty-servlet').version('9.4.54.v20240208') - library('slop', 'nu.marginalia', 'slop').version('0.0.5-SNAPSHOT') + library('slop', 'nu.marginalia', 'slop').version('0.0.7-SNAPSHOT') bundle('jetty', ['jetty-server', 'jetty-util', 'jetty-servlet'])