(crawler) Correct RSS-sitemap behavior

2025-02-23 21:18:58 +00:00 · 2024-08-31 11:38:34 +02:00 · 2024-08-31 11:38:34 +02:00 · 8d0f9652c7
commit 8d0f9652c7
parent 5353805cc6
1 changed files with 5 additions and 5 deletions
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
@@ -28,6 +28,7 @@ import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Objects;
+import java.util.Optional;
 import java.util.concurrent.TimeUnit;

 public class CrawlerRetreiver implements AutoCloseable {
@@ -223,7 +224,7 @@ public class CrawlerRetreiver implements AutoCloseable {
            crawlFrontier.setLinkFilter(linkFilterSelector.selectFilter(doc));

            EdgeUrl faviconUrl = url.withPathAndParam("/favicon.ico", null);
-            EdgeUrl sitemapUrl = null;
+            Optional<EdgeUrl> sitemapUrl = Optional.empty();

            for (var link : doc.getElementsByTag("link")) {
                String rel = link.attr("rel");
@@ -243,14 +244,13 @@ public class CrawlerRetreiver implements AutoCloseable {
                    String href = link.attr("href");

                    sitemapUrl = linkParser.parseLink(url, href)
-                            .filter(crawlFrontier::isSameDomain)
-                            .orElse(sitemapUrl);
+                            .filter(crawlFrontier::isSameDomain);
                }
            }

            // Download the sitemap if available exists
-            if (sitemapUrl != null) {
-                sitemapFetcher.downloadSitemaps(List.of(sitemapUrl));
+            if (sitemapUrl.isPresent()) {
+                sitemapFetcher.downloadSitemaps(List.of(sitemapUrl.get()));
                timer.waitFetchDelay(0);
            }