Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-23 13:09:00 +00:00
(converter) Truncate excessively long strings in SentenceExtractor; malformed data was effectively DoS-ing the converter
This commit is contained in:
parent db99242db2
commit 18ca926c7f
@@ -155,8 +155,15 @@ public class SentenceExtractor {

     public List<DocumentSentence> extractSentencesFromString(String text, EnumSet<HtmlTag> htmlTags) {
         String[] sentences;

-        // Normalize spaces
+        // Safety net against malformed data DOS attacks,
+        // found 5+ MB <p>-tags in the wild that just break
+        // the sentence extractor causing it to stall forever.
+        if (text.length() > 50_000) {
+            // 50k chars can hold a small novel, let alone single html tags
+            text = text.substring(0, 50_000);
+        }
+
+        // Normalize spaces
         text = normalizeSpaces(text);

         // Split into sentences
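For reference, a minimal standalone sketch of the same truncation guard. The class name TruncationGuardSketch, the constant MAX_TEXT_LENGTH, and the truncate method are illustrative names for this sketch only, not identifiers from the repository:

public class TruncationGuardSketch {
    // Cap input length so pathological documents (e.g. multi-megabyte <p> tag bodies)
    // cannot stall downstream sentence extraction indefinitely.
    private static final int MAX_TEXT_LENGTH = 50_000;

    static String truncate(String text) {
        // Same idea as the commit's safety net: keep at most the first 50k characters.
        if (text.length() > MAX_TEXT_LENGTH) {
            return text.substring(0, MAX_TEXT_LENGTH);
        }
        return text;
    }

    public static void main(String[] args) {
        // Simulate a malformed multi-megabyte tag body and show the cap taking effect.
        String huge = "a".repeat(5_000_000);
        System.out.println(truncate(huge).length()); // prints 50000
    }
}

In the actual commit, the guard runs before normalizeSpaces, so the rest of the sentence extraction pipeline only ever sees a bounded amount of text.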