(converter) Add truncation at the parser step to prevent the converter from spending too much time on excessively large documents

2025-02-22 12:48:58 +00:00 · 2025-01-26 14:25:57 +01:00 · 2025-01-26 14:25:57 +01:00 · db138b2a6f
commit db138b2a6f
parent 1673fc284c
1 changed files with 6 additions and 1 deletions
--- a/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDocument.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDocument.java
@@ -59,9 +59,14 @@ public final class CrawledDocument implements SerializableCrawlData {
    }

    public Document parseBody() throws IOException {
+        // Prevent stalls from parsing excessively large documents
+
+        byte[] bytes = documentBodyBytes.length > 200_000
+                ? Arrays.copyOf(documentBodyBytes, 200_000) : documentBodyBytes;
+
        return DocumentBodyToString.getParsedData(
                ContentType.parse(contentType),
-                documentBodyBytes,
+                bytes,
                url);
    }