(crawler) Fix urlencoding in sitemap fetcher

This commit is contained in:
Viktor Lofgren 2025-01-21 13:33:35 +01:00
parent 5b347e17ac
commit 4c74e280d3

View File

@@ -339,14 +339,14 @@ public class HttpFetcherImpl implements HttpFetcher {
case "sitemapindex" -> {
List<String> references = new ArrayList<>();
for (var locTag : parsedSitemap.getElementsByTag("loc")) {
references.add(URLDecoder.decode(locTag.text().trim(), StandardCharsets.UTF_8));
references.add(locTag.text().trim());
}
yield new SitemapResult.SitemapReferences(Collections.unmodifiableList(references));
}
case "urlset" -> {
List<String> urls = new ArrayList<>();
for (var locTag : parsedSitemap.select("url > loc")) {
urls.add(URLDecoder.decode(locTag.text().trim(), StandardCharsets.UTF_8));
urls.add(locTag.text().trim());
}
yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
}