(converter) Basic test coverage for sideloading-style processing

This commit is contained in:
Viktor Lofgren 2023-12-27 18:33:16 +01:00
parent 24051fec03
commit b37223c053
2 changed files with 43 additions and 10 deletions

View File

@ -75,7 +75,7 @@ public class DomainProcessor {
return fullProcessing(domain); return fullProcessing(domain);
} }
public ConverterBatchWritableIf sideloadProcessing(SerializableCrawlDataStream dataStream) { public SideloadProcessing sideloadProcessing(SerializableCrawlDataStream dataStream) {
try { try {
return new SideloadProcessing(dataStream); return new SideloadProcessing(dataStream);
} }
@ -86,7 +86,7 @@ public class DomainProcessor {
} }
class SideloadProcessing implements ConverterBatchWritableIf, SideloadSource { public class SideloadProcessing implements ConverterBatchWritableIf, SideloadSource {
private final SerializableCrawlDataStream dataStream; private final SerializableCrawlDataStream dataStream;
private final ProcessedDomain domain; private final ProcessedDomain domain;
private final DocumentDecorator documentDecorator; private final DocumentDecorator documentDecorator;
@ -97,10 +97,9 @@ public class DomainProcessor {
SideloadProcessing(SerializableCrawlDataStream dataStream) throws IOException { SideloadProcessing(SerializableCrawlDataStream dataStream) throws IOException {
this.dataStream = dataStream; this.dataStream = dataStream;
if (!dataStream.hasNext()) { if (!dataStream.hasNext()
throw new IllegalStateException("No data in stream"); || !(dataStream.next() instanceof CrawledDomain crawledDomain))
} {
if (!(dataStream.next() instanceof CrawledDomain crawledDomain)) {
throw new IllegalStateException("First record must be a domain"); throw new IllegalStateException("First record must be a domain");
} }
@ -126,10 +125,11 @@ public class DomainProcessor {
@Override @Override
public boolean hasNext() { public boolean hasNext() {
try { try {
while (next != null while (next == null
&& dataStream.hasNext() && dataStream.hasNext())
&& dataStream.next() instanceof CrawledDocument doc)
{ {
if (!(dataStream.next() instanceof CrawledDocument doc))
continue;
if (doc.url == null || !processedUrls.add(doc.url)) if (doc.url == null || !processedUrls.add(doc.url))
continue; continue;

View File

@ -63,7 +63,7 @@ public class ConvertingIntegrationTest {
} }
@Test @Test
public void testMemexMarginaliaNu() throws IOException { public void testMemexMarginaliaNuFullProcessing() throws IOException {
var ret = domainProcessor.fullProcessing(asSerializableCrawlData(readMarginaliaWorkingSet())); var ret = domainProcessor.fullProcessing(asSerializableCrawlData(readMarginaliaWorkingSet()));
assertNotNull(ret); assertNotNull(ret);
assertEquals(ret.state, DomainIndexingState.ACTIVE); assertEquals(ret.state, DomainIndexingState.ACTIVE);
@ -94,6 +94,39 @@ public class ConvertingIntegrationTest {
} }
} }
/**
 * Runs the sideload-style processing path over the memex.marginalia.nu working set
 * and checks the resulting domain and document stream.
 *
 * <p>Verifies that the returned object reports the expected id and domain, that more
 * than 25 documents are produced, that more than 25 of them ended up in the
 * {@code OK} state, and that every fully processed document has a title and
 * description longer than four characters and is detected as HTML5.
 *
 * @throws IOException if the working set cannot be read or processed
 */
@Test
public void testMemexMarginaliaNuSideloadProcessing() throws IOException {
    var ret = domainProcessor.sideloadProcessing(asSerializableCrawlData(readMarginaliaWorkingSet()));
    assertNotNull(ret);
    assertEquals("memex.marginalia.nu", ret.id());

    var domain = ret.getDomain();
    // Expected value first, matching the other assertEquals calls in this test,
    // so that a failure message labels the two values correctly.
    assertEquals(new EdgeDomain("memex.marginalia.nu"), domain.domain);

    // Drain the document stream up front so it can be inspected more than once.
    List<ProcessedDocument> docsAll = new ArrayList<>();
    ret.getDocumentsStream().forEachRemaining(docsAll::add);
    assertTrue(docsAll.size() > 25);

    Map<UrlIndexingState, Integer> resultsByStatusCount = new HashMap<>();
    docsAll.forEach(doc -> resultsByStatusCount.merge(doc.state, 1, Integer::sum));

    // getOrDefault: if no document reached OK the key is absent, and unboxing the
    // null from get() would raise a NullPointerException instead of failing the assert.
    assertTrue(resultsByStatusCount.getOrDefault(UrlIndexingState.OK, 0) > 25);

    for (var doc : docsAll) {
        // Only fully processed documents are checked for details.
        if (!doc.isProcessedFully()) {
            continue;
        }

        var details = doc.details;
        assertTrue(details.title.length() > 4);
        assertTrue(details.description.length() > 4);
        assertEquals(HtmlStandard.HTML5, details.standard);
    }
}
private CrawledDomain readMarginaliaWorkingSet() throws IOException { private CrawledDomain readMarginaliaWorkingSet() throws IOException {
String index = readClassPathFile("memex-marginalia/index"); String index = readClassPathFile("memex-marginalia/index");
String[] files = index.split("\n"); String[] files = index.split("\n");