diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java
index 85b06157..d3e54a07 100644
--- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java
+++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java
@@ -88,6 +88,9 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
             if (nextRecord.contentType.startsWith("x-marginalia/advisory;state=content-type-failed-probe")) {
                 status = CrawlerDocumentStatus.BAD_CONTENT_TYPE;
             }
+            else if (nextRecord.contentType.startsWith("x-marginalia/advisory;state=robots-txt-skipped")) { // unlike the other advisories below, this one is not discarded; it is tagged ROBOTS_TXT
+                status = CrawlerDocumentStatus.ROBOTS_TXT;
+            }
             else if (nextRecord.contentType.startsWith("x-marginalia/advisory")) { // other advisory stuff we don't want
                 return;
             }
diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java
index e19aa79c..266670fd 100644
--- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java
+++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java
@@ -211,6 +211,49 @@ public class CrawlingThenConvertingIntegrationTest {
 
     }
 
+
+    /**
+     * Crawls search.marginalia.nu from a single search URL, checks that the crawl data
+     * still holds a record for the /search URL that the test expects to be forbidden
+     * (presumably via robots.txt, going by the test name), then runs the converter
+     * and prints the state of every processed document.
+     */
+    @Test
+    public void crawlRobotsTxt() throws IOException {
+        var specs = CrawlSpecRecord.builder()
+                .domain("search.marginalia.nu")
+                .crawlDepth(5)
+                .urls(List.of(
+                        "https://search.marginalia.nu/search?q=hello+world"
+                ))
+                .build();
+
+        CrawledDomain domain = crawl(specs);
+        assertFalse(domain.doc.isEmpty());
+        assertEquals("OK", domain.crawlerStatus);
+        assertEquals("search.marginalia.nu", domain.domain);
+
+        Set<String> allUrls = domain.doc.stream().map(doc -> doc.url).collect(Collectors.toSet());
+        assertTrue(allUrls.contains("https://search.marginalia.nu/search"), "We expect a record for entities that are forbidden");
+
+        var output = process();
+
+        assertNotNull(output);
+        assertFalse(output.documents.isEmpty());
+        assertEquals(new EdgeDomain("search.marginalia.nu"), output.domain);
+        assertEquals(DomainIndexingState.ACTIVE, output.state);
+
+        for (var doc : output.documents) {
+            if (doc.isOk()) {
+                System.out.println(doc.url + "\t" + doc.state + "\t" + doc.details.title);
+            }
+            else {
+                System.out.println(doc.url + "\t" + doc.state + "\t" + doc.stateReason);
+            }
+        }
+
+    }
+
     private ProcessedDomain process() {
         try (var stream = new ParquetSerializableCrawlDataStream(fileName2)) {
             return domainProcessor.process(stream);