From 647bbfa617d10234001c3eaef3eeaafa7175d71a Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 6 Jul 2023 18:05:03 +0200 Subject: [PATCH] Fix so that crawler tests don't sometimes fetch real sitemaps when they're run. --- .../nu/marginalia/crawl/retreival/CrawlerRetreiver.java | 3 ++- .../nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java | 2 ++ .../marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java | 5 +++++ .../crawl/retreival/fetcher/SitemapRetriever.java | 1 - .../crawling/retreival/CrawlerMockFetcherTest.java | 6 ++++++ 5 files changed, 15 insertions(+), 2 deletions(-) diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index 09352765..3af0110a 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -54,7 +54,7 @@ public class CrawlerRetreiver { private static final LinkFilterSelector linkFilterSelector = new LinkFilterSelector(); private static final DomainProber domainProber = new DomainProber(); - private final SitemapRetriever sitemapRetriever = new SitemapRetriever(); + private final SitemapRetriever sitemapRetriever; private final DomainCrawlFrontier crawlFrontier; @@ -71,6 +71,7 @@ public class CrawlerRetreiver { crawledDomainWriter = writer; this.crawlFrontier = new DomainCrawlFrontier(new EdgeDomain(domain), specs.urls, specs.crawlDepth); + sitemapRetriever = fetcher.createSitemapRetriever(); var fst = crawlFrontier.peek(); if (fst != null) { diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java index 987278a0..1f630ac5 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java @@ -21,4 +21,6 @@ public interface HttpFetcher { CrawledDocument fetchContent(EdgeUrl url) throws RateLimitException; SimpleRobotRules fetchRobotRules(EdgeDomain domain); + + SitemapRetriever createSitemapRetriever(); } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java index 5978444d..55a6d296 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java @@ -307,6 +307,11 @@ public class HttpFetcherImpl implements HttpFetcher { .orElseGet(() -> new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_ALL)); } + @Override + public SitemapRetriever createSitemapRetriever() { + return new SitemapRetriever(); + } + private Optional fetchRobotsForProto(String proto, EdgeDomain domain) { try { var url = new EdgeUrl(proto, domain, null, "/robots.txt", null); diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/SitemapRetriever.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/SitemapRetriever.java index 99701244..bb2d2898 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/SitemapRetriever.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/SitemapRetriever.java @@ -10,7 +10,6 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.util.*; -@Singleton public class SitemapRetriever { private final Logger logger = LoggerFactory.getLogger(SitemapRetriever.class); private final ThreadLocal siteMapParserThreadLocal = ThreadLocal.withInitial(() -> new SiteMapParser(false)); diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java index d5f4581e..7462b62c 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java @@ -17,6 +17,7 @@ import nu.marginalia.model.EdgeUrl; import nu.marginalia.test.CommonTestData; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Test; +import org.mockito.Mockito; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -146,5 +147,10 @@ public class CrawlerMockFetcherTest { public SimpleRobotRules fetchRobotRules(EdgeDomain domain) { return new SimpleRobotRules(); } + + @Override + public SitemapRetriever createSitemapRetriever() { + return Mockito.mock(SitemapRetriever.class); + } } }