mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-22 12:48:58 +00:00
(converter) Add truncation at the parser step to prevent the converter from spending too much time on excessively large documents
This commit is contained in:
parent
1673fc284c
commit
db138b2a6f
@ -59,9 +59,14 @@ public final class CrawledDocument implements SerializableCrawlData {
|
||||
}
|
||||
|
||||
public Document parseBody() throws IOException {
|
||||
// Prevent stalls from parsing excessively large documents
|
||||
|
||||
byte[] bytes = documentBodyBytes.length > 200_000
|
||||
? Arrays.copyOf(documentBodyBytes, 200_000) : documentBodyBytes;
|
||||
|
||||
return DocumentBodyToString.getParsedData(
|
||||
ContentType.parse(contentType),
|
||||
documentBodyBytes,
|
||||
bytes,
|
||||
url);
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user