Cap the bytes handed to the document parser in CrawledDocument.parseBody():
bodies longer than 200_000 bytes are truncated via Arrays.copyOf before being
passed to DocumentBodyToString.getParsedData, to prevent parsing stalls on
excessively large documents.

NOTE(review): leading whitespace inside the hunk was reconstructed assuming a
4-space indent; confirm against the target file or apply with
--ignore-whitespace.

diff --git a/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDocument.java b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDocument.java
index bcadf0ad..9e6cbcd1 100644
--- a/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDocument.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDocument.java
@@ -59,9 +59,14 @@ public final class CrawledDocument implements SerializableCrawlData {
     }
 
     public Document parseBody() throws IOException {
+        // Prevent stalls from parsing excessively large documents
+
+        byte[] bytes = documentBodyBytes.length > 200_000
+                ? Arrays.copyOf(documentBodyBytes, 200_000) : documentBodyBytes;
+
         return DocumentBodyToString.getParsedData(
                 ContentType.parse(contentType),
-                documentBodyBytes,
+                bytes,
                 url);
     }
 