mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 04:58:59 +00:00
(crawler) Fix urlencoding in sitemap fetcher
This commit is contained in:
parent
5b347e17ac
commit
4c74e280d3
@@ -339,14 +339,14 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
case "sitemapindex" -> {
|
case "sitemapindex" -> {
|
||||||
List<String> references = new ArrayList<>();
|
List<String> references = new ArrayList<>();
|
||||||
for (var locTag : parsedSitemap.getElementsByTag("loc")) {
|
for (var locTag : parsedSitemap.getElementsByTag("loc")) {
|
||||||
references.add(URLDecoder.decode(locTag.text().trim(), StandardCharsets.UTF_8));
|
references.add(locTag.text().trim());
|
||||||
}
|
}
|
||||||
yield new SitemapResult.SitemapReferences(Collections.unmodifiableList(references));
|
yield new SitemapResult.SitemapReferences(Collections.unmodifiableList(references));
|
||||||
}
|
}
|
||||||
case "urlset" -> {
|
case "urlset" -> {
|
||||||
List<String> urls = new ArrayList<>();
|
List<String> urls = new ArrayList<>();
|
||||||
for (var locTag : parsedSitemap.select("url > loc")) {
|
for (var locTag : parsedSitemap.select("url > loc")) {
|
||||||
urls.add(URLDecoder.decode(locTag.text().trim(), StandardCharsets.UTF_8));
|
urls.add(locTag.text().trim());
|
||||||
}
|
}
|
||||||
yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
|
yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user