(crawl) Reduce Charset.forName() object churn

Cache the Charset object returned from Charset.forName() for future use, since we're likely to see the same charset again. Charset.forName(...) can be surprisingly expensive, and its built-in caching strategy, which only caches the last two values seen, doesn't cope well with how we're hitting it with a wide array of random charsets.
This commit is contained in:
Viktor Lofgren 2024-07-04 20:49:07 +02:00
parent d023e399d2
commit a6b03a66dc

View File

@ -1,8 +1,11 @@
package nu.marginalia.contenttype;
import java.nio.charset.*;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
public class DocumentBodyToString {
private static final Map<String, Charset> charsetMap = new ConcurrentHashMap<>();
/** Get the string data from a document body, given the content type and charset */
public static String getStringData(ContentType type, byte[] data) {
@ -11,7 +14,7 @@ public class DocumentBodyToString {
if (type.charset() == null || type.charset().isBlank())
charset = StandardCharsets.UTF_8;
else {
charset = Charset.forName(type.charset());
charset = charsetMap.computeIfAbsent(type.charset(), Charset::forName);
}
}
catch (IllegalCharsetNameException ex) {