(converter) Add truncation at the parser step to prevent the converter from spending too much time on excessively large documents

This commit is contained in:
Viktor Lofgren 2025-01-26 14:25:57 +01:00
parent 1673fc284c
commit db138b2a6f

View File

@ -59,9 +59,14 @@ public final class CrawledDocument implements SerializableCrawlData {
}
/**
 * Parses this document's body via {@code DocumentBodyToString.getParsedData}.
 *
 * <p>Before parsing, a working array is prepared that holds at most the first
 * 200,000 bytes of {@code documentBodyBytes}; when the body is already within
 * that limit the original array is reused rather than copied.
 *
 * @return the value returned by {@code DocumentBodyToString.getParsedData}
 * @throws IOException declared by this method; presumably propagated from the
 *         parsing call (verify against {@code getParsedData})
 */
public Document parseBody() throws IOException {
// Prevent stalls from parsing excessively large documents
// The cap is applied to raw bytes, before any decoding into text takes place here.
byte[] bytes = documentBodyBytes.length > 200_000
? Arrays.copyOf(documentBodyBytes, 200_000) : documentBodyBytes;
return DocumentBodyToString.getParsedData(
ContentType.parse(contentType),
// NOTE(review): this excerpt looks like a diff hunk rendered without +/- markers;
// the next line is presumably the removed argument and `bytes,` its replacement.
// Confirm that only the truncated array reaches the parser.
documentBodyBytes,
bytes,
url);
}