From fdec565b34747ea3167fca9d7c37250c248106f3 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 3 Jan 2024 23:14:03 +0100 Subject: [PATCH] (converter) Add upper 128KB limit to how much HTML we'll parse --- .../processor/plugin/HtmlDocumentProcessorPlugin.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java index 7d973909..44da6008 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java @@ -107,6 +107,10 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin throw new DisqualifiedException(DisqualificationReason.LANGUAGE); } + if (documentBody.length() > 128_000) { // 128kb + documentBody = documentBody.substring(0, 128_000); + } + Document doc = Jsoup.parse(documentBody); if (!metaRobotsTag.allowIndexingByMetaTag(doc)) {