From 4c74e280d3c37e332fc92fc55aac79f00b09ee24 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 21 Jan 2025 13:33:35 +0100 Subject: [PATCH] (crawler) Fix urlencoding in sitemap fetcher --- .../java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java index 295d432b..3c330fb4 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java @@ -339,14 +339,14 @@ public class HttpFetcherImpl implements HttpFetcher { case "sitemapindex" -> { List<String> references = new ArrayList<>(); for (var locTag : parsedSitemap.getElementsByTag("loc")) { - references.add(URLDecoder.decode(locTag.text().trim(), StandardCharsets.UTF_8)); + references.add(locTag.text().trim()); } yield new SitemapResult.SitemapReferences(Collections.unmodifiableList(references)); } case "urlset" -> { List<String> urls = new ArrayList<>(); for (var locTag : parsedSitemap.select("url > loc")) { - urls.add(URLDecoder.decode(locTag.text().trim(), StandardCharsets.UTF_8)); + urls.add(locTag.text().trim()); } yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls)); }