(perf) Code was still spending a lot of time resolving charsets

... in the failure case, which wasn't captured by memoization.
Viktor Lofgren 2024-08-01 11:58:59 +02:00
parent e2107901ec
commit 38e2089c3f

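A quick way to see why the failure path stayed slow (a standalone sketch, not code from this commit): ConcurrentHashMap.computeIfAbsent records nothing when its mapping function throws, so with the old Charset::forName lookup every document carrying an unresolvable charset label paid the full exception cost again.

import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

class FailureNotMemoizedDemo {
    private static final Map<String, Charset> cache = new ConcurrentHashMap<>();

    // Mirrors the old shape: the mapping function itself may throw.
    static Charset resolveOldStyle(String label) {
        try {
            // If Charset.forName throws, computeIfAbsent stores no entry,
            // so the next call with the same label throws all over again.
            return cache.computeIfAbsent(label, Charset::forName);
        }
        catch (UnsupportedCharsetException ex) {
            return StandardCharsets.ISO_8859_1; // fallback is returned but never cached
        }
    }

    public static void main(String[] args) {
        resolveOldStyle("x-no-such-charset"); // pays the exception cost
        resolveOldStyle("x-no-such-charset"); // pays it again
        System.out.println(cache.containsKey("x-no-such-charset")); // false: the failure was never memoized
    }
}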

@@ -1,34 +1,46 @@
 package nu.marginalia.contenttype;
 
-import java.nio.charset.*;
+import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.StandardCharsets;
+import java.nio.charset.UnsupportedCharsetException;
 import java.util.Map;
 import java.util.concurrent.ConcurrentHashMap;
 
 public class DocumentBodyToString {
-    private static final Map<String, Charset> charsetMap = new ConcurrentHashMap<>();
+    private static final Map<ContentType, Charset> charsetMap = new ConcurrentHashMap<>();
 
     /** Get the string data from a document body, given the content type and charset */
     public static String getStringData(ContentType type, byte[] data) {
-        Charset charset;
+        final Charset charset;
+
+        if (type.charset() == null || type.charset().isBlank()) {
+            charset = StandardCharsets.UTF_8;
+        } else {
+            charset = charsetMap.computeIfAbsent(type, DocumentBodyToString::computeCharset);
+        }
+
+        return new String(data, charset);
+    }
+
+    private static Charset computeCharset(ContentType type) {
         try {
             if (type.charset() == null || type.charset().isBlank())
-                charset = StandardCharsets.UTF_8;
+                return StandardCharsets.UTF_8;
             else {
-                charset = charsetMap.computeIfAbsent(type.charset(), Charset::forName);
+                return Charset.forName(type.charset());
             }
         }
         catch (IllegalCharsetNameException ex) {
             // Fall back to UTF-8 if we don't understand what this is. It's *probably* fine? Maybe?
-            charset = StandardCharsets.UTF_8;
+            return StandardCharsets.UTF_8;
         }
         catch (UnsupportedCharsetException ex) {
             // This is usually like Macintosh Latin
             // (https://en.wikipedia.org/wiki/Macintosh_Latin_encoding)
             //
             // It's close enough to 8859-1 to serve
-            charset = StandardCharsets.ISO_8859_1;
+            return StandardCharsets.ISO_8859_1;
         }
-
-        return new String(data, charset);
     }
 }
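For comparison, here is a self-contained sketch of the pattern the new code follows (simplified to a String key rather than Marginalia's ContentType): the compute helper handles the charset exceptions itself and returns a fallback, so even a bad label ends up with a cached entry and the exception is paid at most once per distinct label.

import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.StandardCharsets;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

class MemoizedCharsetDemo {
    private static final Map<String, Charset> cache = new ConcurrentHashMap<>();

    static Charset resolve(String label) {
        // compute() never throws, so every label, valid or not, gets a cached entry.
        return cache.computeIfAbsent(label, MemoizedCharsetDemo::compute);
    }

    private static Charset compute(String label) {
        try {
            return Charset.forName(label);
        }
        catch (IllegalCharsetNameException ex) {
            return StandardCharsets.UTF_8;      // syntactically garbage label: assume UTF-8
        }
        catch (UnsupportedCharsetException ex) {
            return StandardCharsets.ISO_8859_1; // well-formed but unknown: close enough
        }
    }

    public static void main(String[] args) {
        System.out.println(resolve("utf-8"));             // UTF-8
        System.out.println(resolve("x-no-such-charset")); // ISO-8859-1, exception paid once
        System.out.println(cache.size());                 // 2: the fallback is memoized too
    }
}

In the commit itself the cache is keyed by the whole ContentType and a missing or blank charset bypasses the cache entirely, but the memoize-the-fallback idea is the same.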