From 0806aa6dfe32d4939ec274dec9b3f8dc0625b7e1 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 3 Jan 2024 13:59:05 +0100 Subject: [PATCH] (language-processing) Add maximum length limit for text input in SentenceExtractor Added a new constant, MAX_TEXT_LENGTH, to the SentenceExtractor class. If the length of the text input exceeds this limit, the text is truncated to fit within the limit. This modification is designed to prevent excessive resource usage for unusually long text inputs. --- .../language/sentence/SentenceExtractor.java | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractor.java b/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractor.java index 178cdee4..13ba2e76 100644 --- a/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractor.java +++ b/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractor.java @@ -96,7 +96,7 @@ public class SentenceExtractor { title = doc.getElementsByTag("h2").text(); } - if (title.trim().length() < 3 && textSentences.length > 0) { + if (title.trim().length() < 3) { for (DocumentSentence textSentence : textSentences) { if (textSentence.length() > 0) { title = textSentence.originalSentence.toLowerCase(); @@ -138,10 +138,6 @@ public class SentenceExtractor { String textNormalizedSpaces = SentenceExtractorStringUtils.normalizeSpaces(text); - if (text.length() > MAX_TEXT_LENGTH) { - textNormalizedSpaces = textNormalizedSpaces.substring(0, MAX_TEXT_LENGTH); - } - try { sentences = sentenceDetector.sentDetect(textNormalizedSpaces); } @@ -221,7 +217,12 @@ public class SentenceExtractor { public String asText(Document dc) { String text = dc.getElementsByTag("body").text(); - return text.substring(0, (int) (text.length()*0.95)); + if (text.length() > MAX_TEXT_LENGTH) { + return text.substring(0, MAX_TEXT_LENGTH); + } + else { + return text.substring(0, (int) (text.length() * 0.95)); + } }