Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-22 20:48:59 +00:00
(converter) Truncate excessively long strings in SentenceExtractor; malformed data was effectively DoS-ing the converter
parent db99242db2
commit 18ca926c7f
@@ -155,8 +155,15 @@ public class SentenceExtractor {
     public List<DocumentSentence> extractSentencesFromString(String text, EnumSet<HtmlTag> htmlTags) {
         String[] sentences;
 
-        // Normalize spaces
+        // Safety net against malformed data DOS attacks,
+        // found 5+ MB <p>-tags in the wild that just break
+        // the sentence extractor causing it to stall forever.
+        if (text.length() > 50_000) {
+            // 50k chars can hold a small novel, let alone single html tags
+            text = text.substring(0, 50_000);
+        }
 
+        // Normalize spaces
         text = normalizeSpaces(text);
 
         // Split into sentences
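For illustration, here is a minimal standalone sketch of the same truncation safety net. The class name, the MAX_TEXT_LENGTH constant, and the truncateForSafety helper are hypothetical additions for this sketch; in the actual commit the check runs inline at the top of extractSentencesFromString, before normalizeSpaces(text), and SentenceExtractor carries additional NLP state not shown here.

/** Illustrative sketch only: the truncation guard from the commit, isolated from SentenceExtractor. */
class TruncationGuardSketch {

    // Hypothetical constant mirroring the 50_000-character cap in the diff above.
    private static final int MAX_TEXT_LENGTH = 50_000;

    // Hypothetical helper; the real change applies this check directly to the
    // incoming text before any further sentence processing.
    static String truncateForSafety(String text) {
        // Malformed pages (e.g. multi-megabyte <p> tag bodies) can stall sentence
        // extraction indefinitely, so clamp the input before any heavy processing.
        if (text.length() > MAX_TEXT_LENGTH) {
            return text.substring(0, MAX_TEXT_LENGTH);
        }
        return text;
    }

    public static void main(String[] args) {
        String pathological = "a".repeat(5_000_000); // stands in for a 5+ MB tag body
        System.out.println(truncateForSafety(pathological).length()); // prints 50000
    }
}

Truncating rather than rejecting keeps the converter moving: the first 50k characters of a pathological document still yield useful sentences, while the oversized tail that stalled the extractor is simply dropped.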