From a6b03a66dcfbc6b4e945c7f3a9056f0b82cdc6f0 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 4 Jul 2024 20:49:07 +0200 Subject: [PATCH] (crawl) Reduce Charset.forName() object churn Cache the Charset object returned from Charset.forName() for future use, since we're likely to see the same charset again. Charset.forName(...) can be surprisingly expensive, and its built-in caching strategy, which only caches the last 2 values seen, doesn't cope well with how we're hitting it with a wide array of random charsets. --- .../java/nu/marginalia/contenttype/DocumentBodyToString.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/code/features-crawl/content-type/java/nu/marginalia/contenttype/DocumentBodyToString.java b/code/features-crawl/content-type/java/nu/marginalia/contenttype/DocumentBodyToString.java index 7fe604f4..a867a3c2 100644 --- a/code/features-crawl/content-type/java/nu/marginalia/contenttype/DocumentBodyToString.java +++ b/code/features-crawl/content-type/java/nu/marginalia/contenttype/DocumentBodyToString.java @@ -1,8 +1,11 @@ package nu.marginalia.contenttype; import java.nio.charset.*; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; public class DocumentBodyToString { + private static final Map charsetMap = new ConcurrentHashMap<>(); /** Get the string data from a document body, given the content type and charset */ public static String getStringData(ContentType type, byte[] data) { @@ -11,7 +14,7 @@ public class DocumentBodyToString { if (type.charset() == null || type.charset().isBlank()) charset = StandardCharsets.UTF_8; else { - charset = Charset.forName(type.charset()); + charset = charsetMap.computeIfAbsent(type.charset(), Charset::forName); } } catch (IllegalCharsetNameException ex) {