(perf) Code was still spending a lot of time resolving charsets

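... in the failure case, which wasn't captured by memoization: when Charset.forName threw, computeIfAbsent stored no mapping, so the same failing lookup was repeated on every call. The exception handling now lives inside the memoized function, so the fallback charset is cached as well (keyed by ContentType).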
This commit is contained in:
Viktor Lofgren 2024-08-01 11:58:59 +02:00
parent e2107901ec
commit 38e2089c3f

View File

@ -1,34 +1,46 @@
package nu.marginalia.contenttype; package nu.marginalia.contenttype;
import java.nio.charset.*; import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.StandardCharsets;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Map; import java.util.Map;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
public class DocumentBodyToString { public class DocumentBodyToString {
private static final Map<String, Charset> charsetMap = new ConcurrentHashMap<>(); private static final Map<ContentType, Charset> charsetMap = new ConcurrentHashMap<>();
/** Get the string data from a document body, given the content type and charset */ /** Get the string data from a document body, given the content type and charset */
public static String getStringData(ContentType type, byte[] data) { public static String getStringData(ContentType type, byte[] data) {
Charset charset; final Charset charset;
if (type.charset() == null || type.charset().isBlank()) {
charset = StandardCharsets.UTF_8;
} else {
charset = charsetMap.computeIfAbsent(type, DocumentBodyToString::computeCharset);
}
return new String(data, charset);
}
private static Charset computeCharset(ContentType type) {
try { try {
if (type.charset() == null || type.charset().isBlank()) if (type.charset() == null || type.charset().isBlank())
charset = StandardCharsets.UTF_8; return StandardCharsets.UTF_8;
else { else {
charset = charsetMap.computeIfAbsent(type.charset(), Charset::forName); return Charset.forName(type.charset());
} }
} }
catch (IllegalCharsetNameException ex) { catch (IllegalCharsetNameException ex) {
// Fall back to UTF-8 if we don't understand what this is. It's *probably* fine? Maybe? // Fall back to UTF-8 if we don't understand what this is. It's *probably* fine? Maybe?
charset = StandardCharsets.UTF_8; return StandardCharsets.UTF_8;
} }
catch (UnsupportedCharsetException ex) { catch (UnsupportedCharsetException ex) {
// This is usually like Macintosh Latin // This is usually like Macintosh Latin
// (https://en.wikipedia.org/wiki/Macintosh_Latin_encoding) // (https://en.wikipedia.org/wiki/Macintosh_Latin_encoding)
// //
// It's close enough to 8859-1 to serve // It's close enough to 8859-1 to serve
charset = StandardCharsets.ISO_8859_1; return StandardCharsets.ISO_8859_1;
} }
return new String(data, charset);
} }
} }