diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
index 09352765..3af0110a 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
@@ -54,7 +54,7 @@ public class CrawlerRetreiver {
     private static final LinkFilterSelector linkFilterSelector = new LinkFilterSelector();
 
     private static final DomainProber domainProber = new DomainProber();
 
-    private final SitemapRetriever sitemapRetriever = new SitemapRetriever();
+    private final SitemapRetriever sitemapRetriever;
     private final DomainCrawlFrontier crawlFrontier;
 
@@ -71,6 +71,7 @@ public class CrawlerRetreiver {
         crawledDomainWriter = writer;
 
         this.crawlFrontier = new DomainCrawlFrontier(new EdgeDomain(domain), specs.urls, specs.crawlDepth);
+        sitemapRetriever = fetcher.createSitemapRetriever();
 
         var fst = crawlFrontier.peek();
         if (fst != null) {
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java
index 987278a0..1f630ac5 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java
@@ -21,4 +21,6 @@ public interface HttpFetcher {
    CrawledDocument fetchContent(EdgeUrl url) throws RateLimitException;
 
    SimpleRobotRules fetchRobotRules(EdgeDomain domain);
+
+   SitemapRetriever createSitemapRetriever();
 }
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java
index 5978444d..55a6d296 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java
@@ -307,6 +307,11 @@ public class HttpFetcherImpl implements HttpFetcher {
                 .orElseGet(() -> new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_ALL));
     }
 
+    @Override
+    public SitemapRetriever createSitemapRetriever() {
+        return new SitemapRetriever();
+    }
+
     private Optional<SimpleRobotRules> fetchRobotsForProto(String proto, EdgeDomain domain) {
         try {
             var url = new EdgeUrl(proto, domain, null, "/robots.txt", null);
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/SitemapRetriever.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/SitemapRetriever.java
index 99701244..bb2d2898 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/SitemapRetriever.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/SitemapRetriever.java
@@ -10,7 +10,6 @@
 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.util.*;
-@Singleton
 public class SitemapRetriever {
     private final Logger logger = LoggerFactory.getLogger(SitemapRetriever.class);
     private final ThreadLocal<SiteMapParser> siteMapParserThreadLocal = ThreadLocal.withInitial(() -> new SiteMapParser(false));
diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java
index d5f4581e..7462b62c 100644
--- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java
+++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java
@@ -17,6 +17,7 @@
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.test.CommonTestData;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.Test;
+import org.mockito.Mockito;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -146,5 +147,10 @@ public class CrawlerMockFetcherTest {
        public SimpleRobotRules fetchRobotRules(EdgeDomain domain) {
            return new SimpleRobotRules();
        }
+
+       @Override
+       public SitemapRetriever createSitemapRetriever() {
+           return Mockito.mock(SitemapRetriever.class);
+       }
    }
 }
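
Not part of the patch, but for context: routing SitemapRetriever construction through HttpFetcher.createSitemapRetriever() means CrawlerRetreiver no longer instantiates it directly, so tests can substitute a stub. A minimal sketch of how a test might exploit this, assuming Mockito is on the test classpath (as the diff above already does):

    import org.mockito.Mockito;

    // Hand the crawler a fetcher whose sitemap retrieval is stubbed out,
    // rather than letting CrawlerRetreiver build a real SitemapRetriever.
    HttpFetcher fetcher = Mockito.mock(HttpFetcher.class);
    Mockito.when(fetcher.createSitemapRetriever())
           .thenReturn(Mockito.mock(SitemapRetriever.class));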