From 32436d099c6ebe4a26d02c963f4f9ce3f1ecfb34 Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Wed, 3 Jan 2024 13:49:39 +0100
Subject: [PATCH] (language-processing) Add maximum length limit for text input
 in SentenceExtractor

Added a new constant, MAX_TEXT_LENGTH, to the SentenceExtractor class. If the
length of the space-normalized text input exceeds this limit, the text is
truncated to fit within the limit. This modification is designed to prevent
excessive resource usage for unusually long text inputs.
---
 .../nu/marginalia/language/sentence/SentenceExtractor.java | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractor.java b/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractor.java
index 4cbdaf29..178cdee4 100644
--- a/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractor.java
+++ b/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractor.java
@@ -42,6 +42,7 @@ public class SentenceExtractor {
      * that might otherwise use an undue amount of processing power. 250 words is about 10X longer than
      * this comment. */
     private static final int MAX_SENTENCE_LENGTH = 250;
+    private static final int MAX_TEXT_LENGTH = 65536;
 
     @SneakyThrows @Inject
     public SentenceExtractor(LanguageModels models) {
@@ -136,6 +137,13 @@ public class SentenceExtractor {
         String[] sentences;
 
         String textNormalizedSpaces = SentenceExtractorStringUtils.normalizeSpaces(text);
+
+        // Measure the string that is actually truncated: if only the raw input were checked,
+        // substring() would throw whenever the normalized text is shorter than the limit.
+        if (textNormalizedSpaces.length() > MAX_TEXT_LENGTH) {
+            textNormalizedSpaces = textNormalizedSpaces.substring(0, MAX_TEXT_LENGTH);
+        }
+
         try {
             sentences = sentenceDetector.sentDetect(textNormalizedSpaces);
         }