diff --git a/code/processes/crawling-process/ft-content-type/java/nu/marginalia/contenttype/DocumentBodyToString.java b/code/processes/crawling-process/ft-content-type/java/nu/marginalia/contenttype/DocumentBodyToString.java
index a867a3c2..8187871e 100644
--- a/code/processes/crawling-process/ft-content-type/java/nu/marginalia/contenttype/DocumentBodyToString.java
+++ b/code/processes/crawling-process/ft-content-type/java/nu/marginalia/contenttype/DocumentBodyToString.java
@@ -1,34 +1,46 @@
 package nu.marginalia.contenttype;
 
-import java.nio.charset.*;
+import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.StandardCharsets;
+import java.nio.charset.UnsupportedCharsetException;
 import java.util.Map;
 import java.util.concurrent.ConcurrentHashMap;
 
 public class DocumentBodyToString {
-    private static final Map<String, Charset> charsetMap = new ConcurrentHashMap<>();
+    private static final Map<ContentType, Charset> charsetMap = new ConcurrentHashMap<>();
 
     /** Get the string data from a document body, given the content type and charset */
     public static String getStringData(ContentType type, byte[] data) {
-        Charset charset;
+        final Charset charset;
+
+        if (type.charset() == null || type.charset().isBlank()) {
+            charset = StandardCharsets.UTF_8;
+        } else {
+            charset = charsetMap.computeIfAbsent(type, DocumentBodyToString::computeCharset);
+        }
+
+        return new String(data, charset);
+    }
+
+    private static Charset computeCharset(ContentType type) {
         try {
             if (type.charset() == null || type.charset().isBlank())
-                charset = StandardCharsets.UTF_8;
+                return StandardCharsets.UTF_8;
             else {
-                charset = charsetMap.computeIfAbsent(type.charset(), Charset::forName);
+                return Charset.forName(type.charset());
             }
         }
         catch (IllegalCharsetNameException ex) {
             // Fall back to UTF-8 if we don't understand what this is. It's *probably* fine? Maybe?
-            charset = StandardCharsets.UTF_8;
+            return StandardCharsets.UTF_8;
         }
         catch (UnsupportedCharsetException ex) {
             // This is usually like Macintosh Latin
             // (https://en.wikipedia.org/wiki/Macintosh_Latin_encoding)
             //
             // It's close enough to 8859-1 to serve
-            charset = StandardCharsets.ISO_8859_1;
+            return StandardCharsets.ISO_8859_1;
         }
-
-        return new String(data, charset);
     }
 }