mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
Fix UTF-8 URL normalization issue in sideloader.
Normalize URLs by replacing en-dash with hyphen to prevent encoding errors. This ensures correct handling of a small subset of articles with improperly normalized UTF-8 paths. Added `normalizeUtf8` method to address this issue. Fixes issue #109.
This commit is contained in:
parent
dc5f97e737
commit
ff17473105
@ -84,7 +84,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
|
||||
DomainLinks domainLinks = getDomainLinks();
|
||||
|
||||
var stmt = connection.prepareStatement("""
|
||||
SELECT url,title,html FROM articles
|
||||
SELECT url,title,html FROM articles where url
|
||||
""");
|
||||
stmt.setFetchSize(100);
|
||||
|
||||
@ -111,7 +111,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
|
||||
}
|
||||
|
||||
private ProcessedDocument convertDocument(List<String> parts, String title, String url, DomainLinks domainLinks) throws URISyntaxException, DisqualifiedException {
|
||||
String fullUrl = baseUrl.toString() + URLEncoder.encode(url, Charsets.UTF_8);
|
||||
String fullUrl = baseUrl.toString() + URLEncoder.encode(normalizeUtf8(url), Charsets.UTF_8);
|
||||
|
||||
StringBuilder fullHtml = new StringBuilder();
|
||||
fullHtml
|
||||
@ -142,6 +142,15 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
|
||||
return doc;
|
||||
}
|
||||
|
||||
private String normalizeUtf8(String url) {
|
||||
// A rare number of articles have incorrectly normalized UTF-8 in their paths.
|
||||
// This is a stopgap to fix them, as the URLs break if you urlencode the UTF-8.
|
||||
|
||||
return url
|
||||
.replace('\u2013', '-') // Replace en-dash with hyphen
|
||||
;
|
||||
}
|
||||
|
||||
private <T> T fromCompressedJson(byte[] stream, Class<T> type) throws IOException {
|
||||
return gson.fromJson(new InputStreamReader(new ZstdInputStream(new ByteArrayInputStream(stream))), type);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user