mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(perf) Code was still spending a lot of time resolving charsets in the failure case, which wasn't captured by memoization.
This commit is contained in:
parent
e2107901ec
commit
38e2089c3f
@@ -1,34 +1,46 @@
|
||||
package nu.marginalia.contenttype;
|
||||
|
||||
import java.nio.charset.*;
|
||||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.IllegalCharsetNameException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.charset.UnsupportedCharsetException;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
public class DocumentBodyToString {
|
||||
private static final Map<String, Charset> charsetMap = new ConcurrentHashMap<>();
|
||||
private static final Map<ContentType, Charset> charsetMap = new ConcurrentHashMap<>();
|
||||
|
||||
/** Get the string data from a document body, given the content type and charset */
|
||||
public static String getStringData(ContentType type, byte[] data) {
|
||||
Charset charset;
|
||||
final Charset charset;
|
||||
|
||||
if (type.charset() == null || type.charset().isBlank()) {
|
||||
charset = StandardCharsets.UTF_8;
|
||||
} else {
|
||||
charset = charsetMap.computeIfAbsent(type, DocumentBodyToString::computeCharset);
|
||||
}
|
||||
|
||||
return new String(data, charset);
|
||||
}
|
||||
|
||||
private static Charset computeCharset(ContentType type) {
|
||||
try {
|
||||
if (type.charset() == null || type.charset().isBlank())
|
||||
charset = StandardCharsets.UTF_8;
|
||||
return StandardCharsets.UTF_8;
|
||||
else {
|
||||
charset = charsetMap.computeIfAbsent(type.charset(), Charset::forName);
|
||||
return Charset.forName(type.charset());
|
||||
}
|
||||
}
|
||||
catch (IllegalCharsetNameException ex) {
|
||||
// Fall back to UTF-8 if we don't understand what this is. It's *probably* fine? Maybe?
|
||||
charset = StandardCharsets.UTF_8;
|
||||
return StandardCharsets.UTF_8;
|
||||
}
|
||||
catch (UnsupportedCharsetException ex) {
|
||||
// This is usually like Macintosh Latin
|
||||
// (https://en.wikipedia.org/wiki/Macintosh_Latin_encoding)
|
||||
//
|
||||
// It's close enough to 8859-1 to serve
|
||||
charset = StandardCharsets.ISO_8859_1;
|
||||
return StandardCharsets.ISO_8859_1;
|
||||
}
|
||||
|
||||
return new String(data, charset);
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user