diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index 6d46a85f..290702c1 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -75,7 +75,7 @@ public class DomainProcessor { return fullProcessing(domain); } - public ConverterBatchWritableIf sideloadProcessing(SerializableCrawlDataStream dataStream) { + public SideloadProcessing sideloadProcessing(SerializableCrawlDataStream dataStream) { try { return new SideloadProcessing(dataStream); } @@ -86,7 +86,7 @@ public class DomainProcessor { } - class SideloadProcessing implements ConverterBatchWritableIf, SideloadSource { + public class SideloadProcessing implements ConverterBatchWritableIf, SideloadSource { private final SerializableCrawlDataStream dataStream; private final ProcessedDomain domain; private final DocumentDecorator documentDecorator; @@ -97,10 +97,9 @@ public class DomainProcessor { SideloadProcessing(SerializableCrawlDataStream dataStream) throws IOException { this.dataStream = dataStream; - if (!dataStream.hasNext()) { - throw new IllegalStateException("No data in stream"); - } - if (!(dataStream.next() instanceof CrawledDomain crawledDomain)) { + if (!dataStream.hasNext() + || !(dataStream.next() instanceof CrawledDomain crawledDomain)) + { throw new IllegalStateException("First record must be a domain"); } @@ -126,10 +125,11 @@ public class DomainProcessor { @Override public boolean hasNext() { try { - while (next != null - && dataStream.hasNext() - && dataStream.next() instanceof CrawledDocument doc) + while (next == null + && dataStream.hasNext()) { + if (!(dataStream.next() instanceof CrawledDocument doc)) + continue; if (doc.url == null || !processedUrls.add(doc.url)) continue; diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java index c22f2c66..e253ecb6 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java @@ -63,7 +63,7 @@ public class ConvertingIntegrationTest { } @Test - public void testMemexMarginaliaNu() throws IOException { + public void testMemexMarginaliaNuFullProcessing() throws IOException { var ret = domainProcessor.fullProcessing(asSerializableCrawlData(readMarginaliaWorkingSet())); assertNotNull(ret); assertEquals(ret.state, DomainIndexingState.ACTIVE); @@ -94,6 +94,39 @@ public class ConvertingIntegrationTest { } } + @Test + public void testMemexMarginaliaNuSideloadProcessing() throws IOException { + var ret = domainProcessor.sideloadProcessing(asSerializableCrawlData(readMarginaliaWorkingSet())); + assertNotNull(ret); + assertEquals("memex.marginalia.nu", ret.id()); + + var domain = ret.getDomain(); + assertEquals(domain.domain, new EdgeDomain("memex.marginalia.nu")); + + List docsAll = new ArrayList<>(); + Map resultsByStatusCount = new HashMap<>(); + ret.getDocumentsStream().forEachRemaining(docsAll::add); + assertTrue(docsAll.size() > 25); + + docsAll.forEach(doc -> resultsByStatusCount.merge(doc.state, 1, Integer::sum)); + + assertTrue(resultsByStatusCount.get(UrlIndexingState.OK) > 25); + + for (var doc : docsAll) { + + if (!doc.isProcessedFully()) { + continue; + } + + var details = doc.details; + + assertTrue(details.title.length() > 4); + assertTrue(details.description.length() > 4); + assertEquals(HtmlStandard.HTML5, details.standard); + + } + } + private CrawledDomain readMarginaliaWorkingSet() throws IOException { String index = readClassPathFile("memex-marginalia/index"); String[] files = index.split("\n");