mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
(crawler) Correct RSS-sitemap behavior
This commit is contained in:
parent
5353805cc6
commit
8d0f9652c7
@ -28,6 +28,7 @@ import java.nio.file.Path;
|
|||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
|
import java.util.Optional;
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
public class CrawlerRetreiver implements AutoCloseable {
|
public class CrawlerRetreiver implements AutoCloseable {
|
||||||
@ -223,7 +224,7 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
crawlFrontier.setLinkFilter(linkFilterSelector.selectFilter(doc));
|
crawlFrontier.setLinkFilter(linkFilterSelector.selectFilter(doc));
|
||||||
|
|
||||||
EdgeUrl faviconUrl = url.withPathAndParam("/favicon.ico", null);
|
EdgeUrl faviconUrl = url.withPathAndParam("/favicon.ico", null);
|
||||||
EdgeUrl sitemapUrl = null;
|
Optional<EdgeUrl> sitemapUrl = Optional.empty();
|
||||||
|
|
||||||
for (var link : doc.getElementsByTag("link")) {
|
for (var link : doc.getElementsByTag("link")) {
|
||||||
String rel = link.attr("rel");
|
String rel = link.attr("rel");
|
||||||
@ -243,14 +244,13 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
String href = link.attr("href");
|
String href = link.attr("href");
|
||||||
|
|
||||||
sitemapUrl = linkParser.parseLink(url, href)
|
sitemapUrl = linkParser.parseLink(url, href)
|
||||||
.filter(crawlFrontier::isSameDomain)
|
.filter(crawlFrontier::isSameDomain);
|
||||||
.orElse(sitemapUrl);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Download the sitemap if it exists
|
// Download the sitemap if it exists
|
||||||
if (sitemapUrl != null) {
|
if (sitemapUrl.isPresent()) {
|
||||||
sitemapFetcher.downloadSitemaps(List.of(sitemapUrl));
|
sitemapFetcher.downloadSitemaps(List.of(sitemapUrl.get()));
|
||||||
timer.waitFetchDelay(0);
|
timer.waitFetchDelay(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user