From 579a115243fd0c539cdd8cb4e02227118fe46f9e Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 20 Jan 2025 23:17:13 +0100 Subject: [PATCH] (crawler) Reduce log spam from error handling in new sitemap fetcher --- .../java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java index cf3ab721..295d432b 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java @@ -297,7 +297,7 @@ public class HttpFetcherImpl implements HttpFetcher { return ret; } catch (Exception ex) { - logger.error("Error while fetching sitemaps via " + root, ex); + logger.error("Error while fetching sitemaps via {}: {} ({})", root, ex.getClass().getSimpleName(), ex.getMessage()); return List.of(); } } @@ -329,6 +329,10 @@ public class HttpFetcherImpl implements HttpFetcher { } Document parsedSitemap = Jsoup.parse(parserStream, "UTF-8", sitemapUrl.toString(), Parser.xmlParser()); + if (parsedSitemap.childrenSize() == 0) { + return new SitemapResult.SitemapError(); + } + String rootTagName = parsedSitemap.child(0).tagName(); return switch (rootTagName.toLowerCase()) {