mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
Fix UTF-8 URL normalization issue in sideloader.
Normalize URLs by replacing the en-dash (U+2013) with an ASCII hyphen before URL-encoding, to prevent encoding errors. This ensures correct handling of a small subset of articles whose paths contain improperly normalized UTF-8. Adds a `normalizeUtf8` method to address this issue. Fixes issue #109.
This commit is contained in:
parent
dc5f97e737
commit
ff17473105
@ -84,7 +84,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
|
|||||||
DomainLinks domainLinks = getDomainLinks();
|
DomainLinks domainLinks = getDomainLinks();
|
||||||
|
|
||||||
var stmt = connection.prepareStatement("""
|
var stmt = connection.prepareStatement("""
|
||||||
SELECT url,title,html FROM articles
|
SELECT url,title,html FROM articles where url
|
||||||
""");
|
""");
|
||||||
stmt.setFetchSize(100);
|
stmt.setFetchSize(100);
|
||||||
|
|
||||||
@ -111,7 +111,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
|
|||||||
}
|
}
|
||||||
|
|
||||||
private ProcessedDocument convertDocument(List<String> parts, String title, String url, DomainLinks domainLinks) throws URISyntaxException, DisqualifiedException {
|
private ProcessedDocument convertDocument(List<String> parts, String title, String url, DomainLinks domainLinks) throws URISyntaxException, DisqualifiedException {
|
||||||
String fullUrl = baseUrl.toString() + URLEncoder.encode(url, Charsets.UTF_8);
|
String fullUrl = baseUrl.toString() + URLEncoder.encode(normalizeUtf8(url), Charsets.UTF_8);
|
||||||
|
|
||||||
StringBuilder fullHtml = new StringBuilder();
|
StringBuilder fullHtml = new StringBuilder();
|
||||||
fullHtml
|
fullHtml
|
||||||
@ -142,6 +142,15 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
|
|||||||
return doc;
|
return doc;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private String normalizeUtf8(String url) {
|
||||||
|
// A rare number of articles have incorrectly normalized UTF-8 in their paths.
|
||||||
|
// This is a stopgap to fix them, as the URLs break if you urlencode the UTF-8.
|
||||||
|
|
||||||
|
return url
|
||||||
|
.replace('\u2013', '-') // Replace en-dash with hyphen
|
||||||
|
;
|
||||||
|
}
|
||||||
|
|
||||||
private <T> T fromCompressedJson(byte[] stream, Class<T> type) throws IOException {
|
private <T> T fromCompressedJson(byte[] stream, Class<T> type) throws IOException {
|
||||||
return gson.fromJson(new InputStreamReader(new ZstdInputStream(new ByteArrayInputStream(stream))), type);
|
return gson.fromJson(new InputStreamReader(new ZstdInputStream(new ByteArrayInputStream(stream))), type);
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user