From db138b2a6f0904d78b36c4068021f8b0f7d194d4 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 26 Jan 2025 14:25:57 +0100 Subject: [PATCH] (converter) Add truncation at the parser step to prevent the converter from spending too much time on excessively large documents --- .../nu/marginalia/model/crawldata/CrawledDocument.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDocument.java b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDocument.java index bcadf0ad..9e6cbcd1 100644 --- a/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDocument.java +++ b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDocument.java @@ -59,9 +59,14 @@ public final class CrawledDocument implements SerializableCrawlData { } public Document parseBody() throws IOException { + // Prevent stalls from parsing excessively large documents + + byte[] bytes = documentBodyBytes.length > 200_000 + ? Arrays.copyOf(documentBodyBytes, 200_000) : documentBodyBytes; + return DocumentBodyToString.getParsedData( ContentType.parse(contentType), - documentBodyBytes, + bytes, url); }