diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java
index d4ad4ed4..0dcd4625 100644
--- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java
+++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java
@@ -58,7 +58,7 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
         String statusReason = "";
 
         String redirectDomain = null;
-        if (parquetRecord.contentType.equals("x-marginalia/advisory;state=redir")) {
+        if (parquetRecord.contentType.equals("x-marginalia/advisory;state=redirect")) {
             EdgeUrl crawledUrl = new EdgeUrl(parquetRecord.url);
             redirectDomain = crawledUrl.getDomain().toString();
             status = CrawlerDomainStatus.REDIRECT;
@@ -84,6 +84,10 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
     }
 
     private void createDocumentRecord(CrawledDocumentParquetRecord nextRecord) {
+        if (nextRecord.contentType.startsWith("x-marginalia/advisory")) {
+            return;
+        }
+
         String bodyString = DocumentBodyToString.getStringData(
                 ContentType.parse(nextRecord.contentType),
                 nextRecord.body);
diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java
index 59912f8b..14d2e528 100644
--- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java
+++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java
@@ -4,35 +4,44 @@ import com.google.inject.Guice;
 import com.google.inject.Injector;
 import lombok.SneakyThrows;
 import nu.marginalia.WmsaHome;
+import nu.marginalia.converting.model.ProcessedDomain;
 import nu.marginalia.converting.processor.DomainProcessor;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
 import nu.marginalia.crawl.retreival.DomainProber;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
-import nu.marginalia.crawling.io.SerializableCrawlDataStream;
 import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream;
-import nu.marginalia.crawling.io.format.WarcSerializableCrawlDataStream;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawledDomain;
 import nu.marginalia.crawling.model.SerializableCrawlData;
-import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecord;
 import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;
+import nu.marginalia.model.EdgeDomain;
+import nu.marginalia.model.crawl.DomainIndexingState;
 import nu.marginalia.model.crawlspec.CrawlSpecRecord;
 import org.junit.jupiter.api.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.function.Predicate;
 
-/* This is mostly a debugging utility */
+import static org.junit.jupiter.api.Assertions.*;
+
+/** Tests for the crawler and converter integration. These are pretty slow and potentially
+ * a bit flaky, since they attempt to fetch real websites.
+ */
 @Tag("slow")
 public class CrawlingThenConvertingIntegrationTest {
     private DomainProcessor domainProcessor;
     private HttpFetcher httpFetcher;
 
+    private static final Logger logger = LoggerFactory.getLogger(CrawlingThenConvertingIntegrationTest.class);
+
     private Path fileName;
     private Path fileName2;
 
@@ -63,7 +72,69 @@ public class CrawlingThenConvertingIntegrationTest {
     }
 
     @Test
-    public void crawlThenProcess() throws IOException {
+    public void testInvalidDomain() throws IOException {
+        // Attempt to fetch an invalid domain
+        var specs = CrawlSpecRecord.builder()
+                .domain("invalid.invalid.invalid")
+                .crawlDepth(10)
+                .urls(List.of()) // add specific URLs to crawl here
+                .build();
+
+        CrawledDomain crawlData = crawl(specs);
+
+        assertEquals("ERROR", crawlData.crawlerStatus);
+        assertTrue(crawlData.doc.isEmpty());
+
+        var processedData = process();
+
+        assertNotNull(processedData);
+        assertTrue(processedData.documents.isEmpty());
+    }
+
+    @Test
+    public void testRedirectingDomain() throws IOException {
+        // Attempt to fetch a domain that redirects
+        var specs = CrawlSpecRecord.builder()
+                .domain("memex.marginalia.nu")
+                .crawlDepth(10)
+                .urls(List.of()) // add specific URLs to crawl here
+                .build();
+
+        CrawledDomain crawlData = crawl(specs);
+
+        assertEquals("REDIRECT", crawlData.crawlerStatus);
+        assertEquals("www.marginalia.nu", crawlData.redirectDomain);
+        assertTrue(crawlData.doc.isEmpty());
+
+        var processedData = process();
+
+        assertNotNull(processedData);
+        assertTrue(processedData.documents.isEmpty());
+    }
+
+    @Test
+    public void testBlockedDomain() throws IOException {
+        // Attempt to fetch a domain that is blocked by the blacklist
+        var specs = CrawlSpecRecord.builder()
+                .domain("search.marginalia.nu")
+                .crawlDepth(10)
+                .urls(List.of()) // add specific URLs to crawl here
+                .build();
+
+        CrawledDomain crawlData = crawl(specs, d -> false); // simulate blocking by blacklisting everything
+
+        assertEquals("ERROR", crawlData.crawlerStatus);
+        assertEquals("BLOCKED;IP not allowed", crawlData.crawlerStatusDesc);
+        assertTrue(crawlData.doc.isEmpty());
+
+        var processedData = process();
+
+        assertNotNull(processedData);
+        assertTrue(processedData.documents.isEmpty());
+    }
+
+    @Test
+    public void crawlSunnyDay() throws IOException {
         var specs = CrawlSpecRecord.builder()
                 .domain("www.marginalia.nu")
                 .crawlDepth(10)
@@ -71,12 +142,20 @@ public class CrawlingThenConvertingIntegrationTest {
                 .build();
 
         CrawledDomain domain = crawl(specs);
+        assertFalse(domain.doc.isEmpty());
+        assertEquals("OK", domain.crawlerStatus);
+        assertEquals("www.marginalia.nu", domain.domain);
 
-        List<SerializableCrawlData> data = new ArrayList<>();
-        data.add(domain);
-        data.addAll(domain.doc);
+        boolean hasRobotsTxt = domain.doc.stream().map(doc -> doc.url).anyMatch(url -> url.endsWith("/robots.txt"));
+        assertFalse(hasRobotsTxt, "Robots.txt should not leave the crawler");
+
+        var output = process();
+
+        assertNotNull(output);
+        assertFalse(output.documents.isEmpty());
+        assertEquals(new EdgeDomain("www.marginalia.nu"), output.domain);
+        assertEquals(DomainIndexingState.ACTIVE, output.state);
 
-        var output = domainProcessor.process(SerializableCrawlDataStream.fromIterator(data.iterator()));
 
         for (var doc : output.documents) {
             if (doc.isOk()) {
@@ -89,18 +168,33 @@ public class CrawlingThenConvertingIntegrationTest {
 
     }
 
+    private ProcessedDomain process() {
+        try (var stream = new ParquetSerializableCrawlDataStream(fileName2)) {
+            return domainProcessor.process(stream);
+        }
+        catch (Exception e) {
+            Assertions.fail(e);
+            return null; // unreachable
+        }
+    }
     private CrawledDomain crawl(CrawlSpecRecord specs) throws IOException {
+        return crawl(specs, domain -> true);
+    }
+
+    private CrawledDomain crawl(CrawlSpecRecord specs, Predicate<EdgeDomain> domainBlacklist) throws IOException {
         List<SerializableCrawlData> data = new ArrayList<>();
 
         try (var recorder = new WarcRecorder(fileName)) {
-            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch();
+            new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, recorder).fetch();
         }
 
         CrawledDocumentParquetRecordFileWriter.convertWarc(specs.domain, fileName, fileName2);
 
         try (var reader = new ParquetSerializableCrawlDataStream(fileName2)) {
             while (reader.hasNext()) {
-                data.add(reader.next());
+                var next = reader.next();
+                logger.info("{}", next);
+                data.add(next);
             }
         }
 
@@ -109,6 +203,7 @@ public class CrawlingThenConvertingIntegrationTest {
                 .map(CrawledDomain.class::cast)
                 .findFirst()
                 .get();
+        data.stream().filter(CrawledDocument.class::isInstance).map(CrawledDocument.class::cast).forEach(domain.doc::add);
 
         return domain;
     }