Fix crawler tests so that they don't sometimes fetch real sitemaps when run.

This commit is contained in:
Viktor Lofgren 2023-07-06 18:05:03 +02:00
parent 019fa763cd
commit 647bbfa617
5 changed files with 15 additions and 2 deletions

View File

@ -54,7 +54,7 @@ public class CrawlerRetreiver {
private static final LinkFilterSelector linkFilterSelector = new LinkFilterSelector(); private static final LinkFilterSelector linkFilterSelector = new LinkFilterSelector();
private static final DomainProber domainProber = new DomainProber(); private static final DomainProber domainProber = new DomainProber();
private final SitemapRetriever sitemapRetriever = new SitemapRetriever(); private final SitemapRetriever sitemapRetriever;
private final DomainCrawlFrontier crawlFrontier; private final DomainCrawlFrontier crawlFrontier;
@ -71,6 +71,7 @@ public class CrawlerRetreiver {
crawledDomainWriter = writer; crawledDomainWriter = writer;
this.crawlFrontier = new DomainCrawlFrontier(new EdgeDomain(domain), specs.urls, specs.crawlDepth); this.crawlFrontier = new DomainCrawlFrontier(new EdgeDomain(domain), specs.urls, specs.crawlDepth);
sitemapRetriever = fetcher.createSitemapRetriever();
var fst = crawlFrontier.peek(); var fst = crawlFrontier.peek();
if (fst != null) { if (fst != null) {

View File

@ -21,4 +21,6 @@ public interface HttpFetcher {
CrawledDocument fetchContent(EdgeUrl url) throws RateLimitException; CrawledDocument fetchContent(EdgeUrl url) throws RateLimitException;
SimpleRobotRules fetchRobotRules(EdgeDomain domain); SimpleRobotRules fetchRobotRules(EdgeDomain domain);
SitemapRetriever createSitemapRetriever();
} }

View File

@ -307,6 +307,11 @@ public class HttpFetcherImpl implements HttpFetcher {
.orElseGet(() -> new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_ALL)); .orElseGet(() -> new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_ALL));
} }
@Override
public SitemapRetriever createSitemapRetriever() {
return new SitemapRetriever();
}
private Optional<SimpleRobotRules> fetchRobotsForProto(String proto, EdgeDomain domain) { private Optional<SimpleRobotRules> fetchRobotsForProto(String proto, EdgeDomain domain) {
try { try {
var url = new EdgeUrl(proto, domain, null, "/robots.txt", null); var url = new EdgeUrl(proto, domain, null, "/robots.txt", null);

View File

@ -10,7 +10,6 @@ import java.io.FileNotFoundException;
import java.io.IOException; import java.io.IOException;
import java.util.*; import java.util.*;
@Singleton
public class SitemapRetriever { public class SitemapRetriever {
private final Logger logger = LoggerFactory.getLogger(SitemapRetriever.class); private final Logger logger = LoggerFactory.getLogger(SitemapRetriever.class);
private final ThreadLocal<SiteMapParser> siteMapParserThreadLocal = ThreadLocal.withInitial(() -> new SiteMapParser(false)); private final ThreadLocal<SiteMapParser> siteMapParserThreadLocal = ThreadLocal.withInitial(() -> new SiteMapParser(false));

View File

@ -17,6 +17,7 @@ import nu.marginalia.model.EdgeUrl;
import nu.marginalia.test.CommonTestData; import nu.marginalia.test.CommonTestData;
import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import org.mockito.Mockito;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -146,5 +147,10 @@ public class CrawlerMockFetcherTest {
public SimpleRobotRules fetchRobotRules(EdgeDomain domain) { public SimpleRobotRules fetchRobotRules(EdgeDomain domain) {
return new SimpleRobotRules(); return new SimpleRobotRules();
} }
@Override
public SitemapRetriever createSitemapRetriever() {
return Mockito.mock(SitemapRetriever.class);
}
} }
} }