(crawler) Correct RSS-sitemap behavior

This commit is contained in:
Viktor Lofgren 2024-08-31 11:38:34 +02:00
parent 5353805cc6
commit 8d0f9652c7

View File

@@ -28,6 +28,7 @@ import java.nio.file.Path;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Objects; import java.util.Objects;
import java.util.Optional;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
public class CrawlerRetreiver implements AutoCloseable { public class CrawlerRetreiver implements AutoCloseable {
@@ -223,7 +224,7 @@ public class CrawlerRetreiver implements AutoCloseable {
crawlFrontier.setLinkFilter(linkFilterSelector.selectFilter(doc)); crawlFrontier.setLinkFilter(linkFilterSelector.selectFilter(doc));
EdgeUrl faviconUrl = url.withPathAndParam("/favicon.ico", null); EdgeUrl faviconUrl = url.withPathAndParam("/favicon.ico", null);
EdgeUrl sitemapUrl = null; Optional<EdgeUrl> sitemapUrl = Optional.empty();
for (var link : doc.getElementsByTag("link")) { for (var link : doc.getElementsByTag("link")) {
String rel = link.attr("rel"); String rel = link.attr("rel");
@@ -243,14 +244,13 @@ public class CrawlerRetreiver implements AutoCloseable {
String href = link.attr("href"); String href = link.attr("href");
sitemapUrl = linkParser.parseLink(url, href) sitemapUrl = linkParser.parseLink(url, href)
.filter(crawlFrontier::isSameDomain) .filter(crawlFrontier::isSameDomain);
.orElse(sitemapUrl);
} }
} }
// Download the sitemap if available exists // Download the sitemap if available exists
if (sitemapUrl != null) { if (sitemapUrl.isPresent()) {
sitemapFetcher.downloadSitemaps(List.of(sitemapUrl)); sitemapFetcher.downloadSitemaps(List.of(sitemapUrl.get()));
timer.waitFetchDelay(0); timer.waitFetchDelay(0);
} }