diff --git a/code/processes/converting-process/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java b/code/processes/converting-process/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java index 900bd36e..2185eb45 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java @@ -84,7 +84,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC DomainLinks domainLinks = getDomainLinks(); var stmt = connection.prepareStatement(""" - SELECT url,title,html FROM articles + SELECT url,title,html FROM articles where url """); stmt.setFetchSize(100); @@ -111,7 +111,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC } private ProcessedDocument convertDocument(List parts, String title, String url, DomainLinks domainLinks) throws URISyntaxException, DisqualifiedException { - String fullUrl = baseUrl.toString() + URLEncoder.encode(url, Charsets.UTF_8); + String fullUrl = baseUrl.toString() + URLEncoder.encode(normalizeUtf8(url), Charsets.UTF_8); StringBuilder fullHtml = new StringBuilder(); fullHtml @@ -142,6 +142,15 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC return doc; } + private String normalizeUtf8(String url) { + // A small number of articles have paths that were not normalized correctly and contain an en-dash (U+2013). + // This is a stopgap that swaps it for a plain ASCII hyphen before URL-encoding, as the URLs break if the en-dash is urlencoded. + + return url + .replace('\u2013', '-') // Replace en-dash with hyphen + ; + } + private T fromCompressedJson(byte[] stream, Class type) throws IOException { return gson.fromJson(new InputStreamReader(new ZstdInputStream(new ByteArrayInputStream(stream))), type); }