From ff174731050cfc3f32d777be440f085ed968b95b Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 25 Nov 2024 14:25:47 +0100 Subject: [PATCH] Fix UTF-8 URL normalization issue in sideloader. Normalize URLs by replacing en-dash with hyphen to prevent encoding errors. This ensures correct handling of a small subset of articles with improperly normalized UTF-8 paths. Added `normalizeUtf8` method to address this issue. Fixes issue #109. --- .../EncyclopediaMarginaliaNuSideloader.java | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/code/processes/converting-process/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java b/code/processes/converting-process/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java index 900bd36e..2185eb45 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java @@ -84,7 +84,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC DomainLinks domainLinks = getDomainLinks(); var stmt = connection.prepareStatement(""" - SELECT url,title,html FROM articles + SELECT url,title,html FROM articles where url """); stmt.setFetchSize(100); @@ -111,7 +111,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC } private ProcessedDocument convertDocument(List parts, String title, String url, DomainLinks domainLinks) throws URISyntaxException, DisqualifiedException { - String fullUrl = baseUrl.toString() + URLEncoder.encode(url, Charsets.UTF_8); + String fullUrl = baseUrl.toString() + URLEncoder.encode(normalizeUtf8(url), Charsets.UTF_8); StringBuilder fullHtml = new StringBuilder(); fullHtml @@ -142,6 +142,15 @@ public class EncyclopediaMarginaliaNuSideloader implements 
SideloadSource, AutoC return doc; } + private String normalizeUtf8(String url) { + // A small number of articles have incorrectly normalized UTF-8 in their paths. + // This is a stopgap to fix them, as the URLs break if you urlencode the UTF-8. + + return url + .replace('\u2013', '-') // Replace en-dash with hyphen + ; + } + private <T> T fromCompressedJson(byte[] stream, Class<T> type) throws IOException { return gson.fromJson(new InputStreamReader(new ZstdInputStream(new ByteArrayInputStream(stream))), type); }