mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
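(perf) The code was still spending a lot of time resolving charsets
in the failure case, which was not captured by memoization: when Charset.forName threw for an illegal or unsupported name, nothing was stored in the map, so the failed lookup was repeated on every call. The fallback charset is now memoized as well.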
This commit is contained in:
parent
e2107901ec
commit
38e2089c3f
@ -1,34 +1,46 @@
|
|||||||
package nu.marginalia.contenttype;
|
package nu.marginalia.contenttype;
|
||||||
|
|
||||||
import java.nio.charset.*;
|
import java.nio.charset.Charset;
|
||||||
|
import java.nio.charset.IllegalCharsetNameException;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.nio.charset.UnsupportedCharsetException;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.concurrent.ConcurrentHashMap;
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
|
|
||||||
public class DocumentBodyToString {
|
public class DocumentBodyToString {
|
||||||
private static final Map<String, Charset> charsetMap = new ConcurrentHashMap<>();
|
private static final Map<ContentType, Charset> charsetMap = new ConcurrentHashMap<>();
|
||||||
|
|
||||||
/** Get the string data from a document body, given the content type and charset */
|
/** Get the string data from a document body, given the content type and charset */
|
||||||
public static String getStringData(ContentType type, byte[] data) {
|
public static String getStringData(ContentType type, byte[] data) {
|
||||||
Charset charset;
|
final Charset charset;
|
||||||
|
|
||||||
|
if (type.charset() == null || type.charset().isBlank()) {
|
||||||
|
charset = StandardCharsets.UTF_8;
|
||||||
|
} else {
|
||||||
|
charset = charsetMap.computeIfAbsent(type, DocumentBodyToString::computeCharset);
|
||||||
|
}
|
||||||
|
|
||||||
|
return new String(data, charset);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Charset computeCharset(ContentType type) {
|
||||||
try {
|
try {
|
||||||
if (type.charset() == null || type.charset().isBlank())
|
if (type.charset() == null || type.charset().isBlank())
|
||||||
charset = StandardCharsets.UTF_8;
|
return StandardCharsets.UTF_8;
|
||||||
else {
|
else {
|
||||||
charset = charsetMap.computeIfAbsent(type.charset(), Charset::forName);
|
return Charset.forName(type.charset());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
catch (IllegalCharsetNameException ex) {
|
catch (IllegalCharsetNameException ex) {
|
||||||
// Fall back to UTF-8 if we don't understand what this is. It's *probably* fine? Maybe?
|
// Fall back to UTF-8 if we don't understand what this is. It's *probably* fine? Maybe?
|
||||||
charset = StandardCharsets.UTF_8;
|
return StandardCharsets.UTF_8;
|
||||||
}
|
}
|
||||||
catch (UnsupportedCharsetException ex) {
|
catch (UnsupportedCharsetException ex) {
|
||||||
// This is usually like Macintosh Latin
|
// This is usually like Macintosh Latin
|
||||||
// (https://en.wikipedia.org/wiki/Macintosh_Latin_encoding)
|
// (https://en.wikipedia.org/wiki/Macintosh_Latin_encoding)
|
||||||
//
|
//
|
||||||
// It's close enough to 8859-1 to serve
|
// It's close enough to 8859-1 to serve
|
||||||
charset = StandardCharsets.ISO_8859_1;
|
return StandardCharsets.ISO_8859_1;
|
||||||
}
|
}
|
||||||
|
|
||||||
return new String(data, charset);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user