(converter) Truncate excessively long strings in SentenceExtractor; malformed data was effectively DoS-ing the converter

Viktor Lofgren 2025-01-26 12:52:54 +01:00
parent db99242db2
commit 18ca926c7f


@ -155,8 +155,15 @@ public class SentenceExtractor {
    public List<DocumentSentence> extractSentencesFromString(String text, EnumSet<HtmlTag> htmlTags) {
        String[] sentences;

        // Safety net against malformed data DOS attacks,
        // found 5+ MB <p>-tags in the wild that just break
        // the sentence extractor causing it to stall forever.
        if (text.length() > 50_000) {
            // 50k chars can hold a small novel, let alone single html tags
            text = text.substring(0, 50_000);
        }

        // Normalize spaces
        text = normalizeSpaces(text);

        // Split into sentences
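
For illustration, a minimal standalone sketch of the same length-capping guard in isolation; the class name, the MAX_TEXT_LENGTH constant, and the capLength helper below are hypothetical stand-ins, not part of SentenceExtractor's API:

public class TruncationGuardSketch {
    // Hypothetical constant mirroring the 50k-char cap added in this commit.
    private static final int MAX_TEXT_LENGTH = 50_000;

    // Cap the input before any expensive sentence splitting runs, so a single
    // malformed multi-megabyte tag cannot stall the converter indefinitely.
    static String capLength(String text) {
        if (text.length() > MAX_TEXT_LENGTH) {
            return text.substring(0, MAX_TEXT_LENGTH);
        }
        return text;
    }

    public static void main(String[] args) {
        // Simulate a pathological multi-megabyte blob like the <p>-tags found in the wild.
        String huge = "lorem ipsum ".repeat(500_000);
        String capped = capLength(huge);
        System.out.println(capped.length()); // prints 50000
    }
}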