(perf) Code was still spending a lot of time resolving charsets

... in the failure case, which wasn't captured by memoization.
Viktor Lofgren 2024-08-01 11:58:59 +02:00
parent e2107901ec
commit 38e2089c3f

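A quick way to see why the failure path stayed slow (a standalone sketch, not code from this commit): ConcurrentHashMap.computeIfAbsent records nothing when its mapping function throws, so with the old Charset::forName lookup every document carrying an unresolvable charset label paid the full exception cost again.

import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

class FailureNotMemoizedDemo {
    private static final Map<String, Charset> cache = new ConcurrentHashMap<>();

    // Mirrors the old shape: the mapping function itself may throw.
    static Charset resolveOldStyle(String label) {
        try {
            // If Charset.forName throws, computeIfAbsent stores no entry,
            // so the next call with the same label throws all over again.
            return cache.computeIfAbsent(label, Charset::forName);
        }
        catch (UnsupportedCharsetException ex) {
            return StandardCharsets.ISO_8859_1; // fallback is returned but never cached
        }
    }

    public static void main(String[] args) {
        resolveOldStyle("x-no-such-charset"); // pays the exception cost
        resolveOldStyle("x-no-such-charset"); // pays it again
        System.out.println(cache.containsKey("x-no-such-charset")); // false: the failure was never memoized
    }
}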

@@ -1,34 +1,46 @@
 package nu.marginalia.contenttype;
 
-import java.nio.charset.*;
+import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.StandardCharsets;
+import java.nio.charset.UnsupportedCharsetException;
 import java.util.Map;
 import java.util.concurrent.ConcurrentHashMap;
 
 public class DocumentBodyToString {
-    private static final Map<String, Charset> charsetMap = new ConcurrentHashMap<>();
+    private static final Map<ContentType, Charset> charsetMap = new ConcurrentHashMap<>();
 
     /** Get the string data from a document body, given the content type and charset */
     public static String getStringData(ContentType type, byte[] data) {
-        Charset charset;
+        final Charset charset;
+
+        if (type.charset() == null || type.charset().isBlank()) {
+            charset = StandardCharsets.UTF_8;
+        } else {
+            charset = charsetMap.computeIfAbsent(type, DocumentBodyToString::computeCharset);
+        }
+
+        return new String(data, charset);
+    }
+
+    private static Charset computeCharset(ContentType type) {
         try {
             if (type.charset() == null || type.charset().isBlank())
-                charset = StandardCharsets.UTF_8;
+                return StandardCharsets.UTF_8;
             else {
-                charset = charsetMap.computeIfAbsent(type.charset(), Charset::forName);
+                return Charset.forName(type.charset());
             }
         }
         catch (IllegalCharsetNameException ex) {
             // Fall back to UTF-8 if we don't understand what this is. It's *probably* fine? Maybe?
-            charset = StandardCharsets.UTF_8;
+            return StandardCharsets.UTF_8;
         }
         catch (UnsupportedCharsetException ex) {
             // This is usually like Macintosh Latin
             // (https://en.wikipedia.org/wiki/Macintosh_Latin_encoding)
             //
             // It's close enough to 8859-1 to serve
-            charset = StandardCharsets.ISO_8859_1;
+            return StandardCharsets.ISO_8859_1;
         }
-
-        return new String(data, charset);
     }
 }
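For comparison, here is a self-contained sketch of the pattern the new code follows (simplified to a String key rather than Marginalia's ContentType): the compute helper handles the charset exceptions itself and returns a fallback, so even a bad label ends up with a cached entry and the exception is paid at most once per distinct label.

import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.StandardCharsets;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

class MemoizedCharsetDemo {
    private static final Map<String, Charset> cache = new ConcurrentHashMap<>();

    static Charset resolve(String label) {
        // compute() never throws, so every label, valid or not, gets a cached entry.
        return cache.computeIfAbsent(label, MemoizedCharsetDemo::compute);
    }

    private static Charset compute(String label) {
        try {
            return Charset.forName(label);
        }
        catch (IllegalCharsetNameException ex) {
            return StandardCharsets.UTF_8;      // syntactically garbage label: assume UTF-8
        }
        catch (UnsupportedCharsetException ex) {
            return StandardCharsets.ISO_8859_1; // well-formed but unknown: close enough
        }
    }

    public static void main(String[] args) {
        System.out.println(resolve("utf-8"));             // UTF-8
        System.out.println(resolve("x-no-such-charset")); // ISO-8859-1, exception paid once
        System.out.println(cache.size());                 // 2: the fallback is memoized too
    }
}

In the commit itself the cache is keyed by the whole ContentType and a missing or blank charset bypasses the cache entirely, but the memoize-the-fallback idea is the same.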