diff --git a/code/process-models/crawling-model/build.gradle b/code/process-models/crawling-model/build.gradle
index 03db0de9..c933ea55 100644
--- a/code/process-models/crawling-model/build.gradle
+++ b/code/process-models/crawling-model/build.gradle
@@ -23,6 +23,7 @@ dependencies {
     implementation project(':code:features-crawl:content-type')
     implementation project(':code:libraries:language-processing')
     implementation project(':third-party:parquet-floor')
+    implementation project(':third-party:commons-codec')
 
     implementation libs.bundles.slf4j
diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyExtractor.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyExtractor.java
index a485e5bc..019aa761 100644
--- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyExtractor.java
+++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyExtractor.java
@@ -39,7 +39,12 @@ public class DocumentBodyExtractor {
 
     private static DocumentBodyResult<String> toStringResult(ContentType contentType, byte[] bytes) {
         if (contentTypeLogic.isAllowableContentType(contentType)) {
-            return new DocumentBodyResult.Ok<>(contentType, DocumentBodyToString.getStringData(contentType, bytes));
+            try {
+                return new DocumentBodyResult.Ok<>(contentType, DocumentBodyToString.getStringData(contentType, bytes));
+            }
+            catch (Exception ex) {
+                return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.BAD_CONTENT_TYPE, "");
+            }
         }
         else {
             return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.BAD_CONTENT_TYPE, "");
diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java
index 0dcd4625..85b06157 100644
--- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java
+++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java
@@ -4,12 +4,10 @@ import lombok.SneakyThrows;
 import nu.marginalia.contenttype.ContentType;
 import nu.marginalia.contenttype.DocumentBodyToString;
 import nu.marginalia.crawling.io.SerializableCrawlDataStream;
-import nu.marginalia.crawling.model.CrawledDocument;
-import nu.marginalia.crawling.model.CrawledDomain;
-import nu.marginalia.crawling.model.CrawlerDomainStatus;
-import nu.marginalia.crawling.model.SerializableCrawlData;
+import nu.marginalia.crawling.model.*;
 import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecord;
 import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileReader;
+import nu.marginalia.hash.MurmurHash3_128;
 import nu.marginalia.model.EdgeUrl;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -22,8 +20,9 @@ import java.util.*;
 
 public class ParquetSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream {
     private static final Logger logger = LoggerFactory.getLogger(ParquetSerializableCrawlDataStream.class);
 
+    private final MurmurHash3_128 hash = new MurmurHash3_128();
     private final Iterator<CrawledDocumentParquetRecord> backingIterator;
-    private Deque<SerializableCrawlData> nextQ = new ArrayDeque<>();
+    private final Deque<SerializableCrawlData> nextQ = new ArrayDeque<>();
     private boolean wroteDomainRecord = false;
     private final Path path;
@@ -64,14 +63,13 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream {
             status = CrawlerDomainStatus.REDIRECT;
         }
         else if (parquetRecord.contentType.equals("x-marginalia/advisory;state=blocked")) {
-            status = CrawlerDomainStatus.BLOCKED; // FIXME we don't write this yet
+            status = CrawlerDomainStatus.BLOCKED;
         }
         else if (parquetRecord.contentType.equals("x-marginalia/advisory;state=error")) {
             status = CrawlerDomainStatus.ERROR;
             statusReason = new String(parquetRecord.body);
         }
 
-        // FIXME -- cookies
         nextQ.add(new CrawledDomain(
                 parquetRecord.domain,
                 redirectDomain,
@@ -84,25 +82,36 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream {
     }
 
     private void createDocumentRecord(CrawledDocumentParquetRecord nextRecord) {
-        if (nextRecord.contentType.startsWith("x-marginalia/advisory")) {
+        String bodyString = "";
+        CrawlerDocumentStatus status = CrawlerDocumentStatus.OK;
+
+        if (nextRecord.contentType.startsWith("x-marginalia/advisory;state=content-type-failed-probe")) {
+            status = CrawlerDocumentStatus.BAD_CONTENT_TYPE;
+        }
+        else if (nextRecord.contentType.startsWith("x-marginalia/advisory")) {
             // other advisory stuff we don't want
             return;
         }
+        else {
+            try {
+                bodyString = DocumentBodyToString.getStringData(
+                        ContentType.parse(nextRecord.contentType),
+                        nextRecord.body);
+            } catch (Exception ex) {
+                logger.error("Failed to convert body to string", ex);
+                status = CrawlerDocumentStatus.BAD_CHARSET;
+            }
+        }
 
-        String bodyString = DocumentBodyToString.getStringData(
-                ContentType.parse(nextRecord.contentType),
-                nextRecord.body);
-
-        // FIXME -- a lot of these fields are not set properly!
         nextQ.add(new CrawledDocument("",
                 nextRecord.url,
                 nextRecord.contentType,
                 nextRecord.timestamp.toString(),
                 nextRecord.httpStatus,
-                "OK",
+                status.toString(),
                 "",
                 "",
                 bodyString,
-                "",
+                Long.toHexString(hash.hashNearlyASCII(bodyString)), // this field isn't actually used, maybe we can skip calculating it?
                 nextRecord.url,
                 null,
                 "",
diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java
index 40830299..edfbc6b1 100644
--- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java
+++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java
@@ -168,8 +168,8 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
                 false,
                 0,
                 date,
-                "x-marginalia/advisory;state=error",
-                errorStatus.getBytes()
+                errorStatus,
+                new byte[0]
         );
     }
 }
diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java
index 14d2e528..e19aa79c 100644
--- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java
+++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java
@@ -28,7 +28,9 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Set;
 import java.util.function.Predicate;
+import java.util.stream.Collectors;
 
 import static org.junit.jupiter.api.Assertions.*;
@@ -168,6 +170,47 @@ public class CrawlingThenConvertingIntegrationTest {
     }
 
+    @Test
+    public void crawlContentTypes() throws IOException {
+        var specs = CrawlSpecRecord.builder()
+                .domain("www.marginalia.nu")
+                .crawlDepth(5)
+                .urls(List.of(
+                        "https://www.marginalia.nu/sanic.png",
+                        "https://www.marginalia.nu/invalid"
+                ))
+                .build();
+
+        CrawledDomain domain = crawl(specs);
+        assertFalse(domain.doc.isEmpty());
+        assertEquals("OK", domain.crawlerStatus);
+        assertEquals("www.marginalia.nu", domain.domain);
+
+        Set<String> allUrls = domain.doc.stream().map(doc -> doc.url).collect(Collectors.toSet());
+        assertTrue(allUrls.contains("https://www.marginalia.nu/sanic.png"), "Should have record for image despite blocked content type");
+        assertTrue(allUrls.contains("https://www.marginalia.nu/invalid"), "Should have record for invalid URL");
+
+        var output = process();
+
+        assertNotNull(output);
+        assertFalse(output.documents.isEmpty());
+        assertEquals(new EdgeDomain("www.marginalia.nu"), output.domain);
+        assertEquals(DomainIndexingState.ACTIVE, output.state);
+
+        for (var doc : output.documents) {
+            if (doc.isOk()) {
+                System.out.println(doc.url + "\t" + doc.state + "\t" + doc.details.title);
+            }
+            else {
+                System.out.println(doc.url + "\t" + doc.state + "\t" + doc.stateReason);
+            }
+        }
+    }
+
     private ProcessedDomain process() {
         try (var stream = new ParquetSerializableCrawlDataStream(fileName2)) {
            return domainProcessor.process(stream);
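
Note on the BAD_CHARSET path above: DocumentBodyToString.getStringData can throw when a stored document advertises a charset the JVM does not support, which is what the new try/catch absorbs. A minimal, self-contained sketch of the failure mode follows; decodeBody is a hypothetical stand-in, not the actual helper from this patch:

import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

class CharsetFailureDemo {
    // Hypothetical stand-in for DocumentBodyToString.getStringData():
    // decodes a stored body using the charset named in the Content-Type header.
    static String decodeBody(String charsetName, byte[] body) {
        // Charset.forName throws UnsupportedCharsetException for names the JVM
        // doesn't know (e.g. "utf-9") and IllegalCharsetNameException for
        // syntactically invalid ones -- both are unchecked exceptions.
        return new String(body, Charset.forName(charsetName));
    }

    public static void main(String[] args) {
        byte[] body = "hello".getBytes(StandardCharsets.UTF_8);
        try {
            decodeBody("utf-9", body);
        }
        catch (RuntimeException ex) {
            // Mirrors the patch: map the failure to a document status
            // (BAD_CHARSET) instead of letting it abort the whole stream.
            System.out.println("BAD_CHARSET: " + ex);
        }
    }
}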