diff --git a/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractor.java b/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractor.java index 178cdee4..13ba2e76 100644 --- a/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractor.java +++ b/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractor.java @@ -96,7 +96,7 @@ public class SentenceExtractor { title = doc.getElementsByTag("h2").text(); } - if (title.trim().length() < 3 && textSentences.length > 0) { + if (title.trim().length() < 3) { for (DocumentSentence textSentence : textSentences) { if (textSentence.length() > 0) { title = textSentence.originalSentence.toLowerCase(); @@ -138,10 +138,6 @@ public class SentenceExtractor { String textNormalizedSpaces = SentenceExtractorStringUtils.normalizeSpaces(text); - if (text.length() > MAX_TEXT_LENGTH) { - textNormalizedSpaces = textNormalizedSpaces.substring(0, MAX_TEXT_LENGTH); - } - try { sentences = sentenceDetector.sentDetect(textNormalizedSpaces); } @@ -221,7 +217,12 @@ public class SentenceExtractor { public String asText(Document dc) { String text = dc.getElementsByTag("body").text(); - return text.substring(0, (int) (text.length()*0.95)); + if (text.length() > MAX_TEXT_LENGTH) { + return text.substring(0, MAX_TEXT_LENGTH); + } + else { + return text.substring(0, (int) (text.length() * 0.95)); + } }