Fix crawler tests so that they don't sometimes fetch real sitemaps when they're run.

This commit is contained in:
Viktor Lofgren 2023-07-06 18:05:03 +02:00
parent 019fa763cd
commit 647bbfa617
5 changed files with 15 additions and 2 deletions

View File

@ -54,7 +54,7 @@ public class CrawlerRetreiver {
private static final LinkFilterSelector linkFilterSelector = new LinkFilterSelector();
private static final DomainProber domainProber = new DomainProber();
private final SitemapRetriever sitemapRetriever = new SitemapRetriever();
private final SitemapRetriever sitemapRetriever;
private final DomainCrawlFrontier crawlFrontier;
@ -71,6 +71,7 @@ public class CrawlerRetreiver {
crawledDomainWriter = writer;
this.crawlFrontier = new DomainCrawlFrontier(new EdgeDomain(domain), specs.urls, specs.crawlDepth);
sitemapRetriever = fetcher.createSitemapRetriever();
var fst = crawlFrontier.peek();
if (fst != null) {

View File

@ -21,4 +21,6 @@ public interface HttpFetcher {
CrawledDocument fetchContent(EdgeUrl url) throws RateLimitException;
SimpleRobotRules fetchRobotRules(EdgeDomain domain);
SitemapRetriever createSitemapRetriever();
}

View File

@ -307,6 +307,11 @@ public class HttpFetcherImpl implements HttpFetcher {
.orElseGet(() -> new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_ALL));
}
@Override
public SitemapRetriever createSitemapRetriever() {
return new SitemapRetriever();
}
private Optional<SimpleRobotRules> fetchRobotsForProto(String proto, EdgeDomain domain) {
try {
var url = new EdgeUrl(proto, domain, null, "/robots.txt", null);

View File

@ -10,7 +10,6 @@ import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.*;
@Singleton
public class SitemapRetriever {
private final Logger logger = LoggerFactory.getLogger(SitemapRetriever.class);
private final ThreadLocal<SiteMapParser> siteMapParserThreadLocal = ThreadLocal.withInitial(() -> new SiteMapParser(false));

View File

@ -17,6 +17,7 @@ import nu.marginalia.model.EdgeUrl;
import nu.marginalia.test.CommonTestData;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -146,5 +147,10 @@ public class CrawlerMockFetcherTest {
public SimpleRobotRules fetchRobotRules(EdgeDomain domain) {
return new SimpleRobotRules();
}
@Override
public SitemapRetriever createSitemapRetriever() {
return Mockito.mock(SitemapRetriever.class);
}
}
}