mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(crawl) Reduce Charset.forName() object churn
Cache the Charset object returned from Charset.forName() for future use, since we're likely to see the same charset again. Charset.forName(...) can be surprisingly expensive, and its built-in caching strategy — which only caches the last two values seen — doesn't cope well with how we're hitting it with a wide array of random charsets.
This commit is contained in:
parent
d023e399d2
commit
a6b03a66dc
@ -1,8 +1,11 @@
|
||||
package nu.marginalia.contenttype;
|
||||
|
||||
import java.nio.charset.*;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
public class DocumentBodyToString {
|
||||
private static final Map<String, Charset> charsetMap = new ConcurrentHashMap<>();
|
||||
|
||||
/** Get the string data from a document body, given the content type and charset */
|
||||
public static String getStringData(ContentType type, byte[] data) {
|
||||
@ -11,7 +14,7 @@ public class DocumentBodyToString {
|
||||
if (type.charset() == null || type.charset().isBlank())
|
||||
charset = StandardCharsets.UTF_8;
|
||||
else {
|
||||
charset = Charset.forName(type.charset());
|
||||
charset = charsetMap.computeIfAbsent(type.charset(), Charset::forName);
|
||||
}
|
||||
}
|
||||
catch (IllegalCharsetNameException ex) {
|
||||
|
Loading…
Reference in New Issue
Block a user