Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-24 05:18:58 +00:00).
Fix so that crawler tests don't sometimes fetch real sitemaps when they're run.
This commit is contained in:
parent
019fa763cd
commit
647bbfa617
@@ -54,7 +54,7 @@ public class CrawlerRetreiver {
     private static final LinkFilterSelector linkFilterSelector = new LinkFilterSelector();

     private static final DomainProber domainProber = new DomainProber();
-    private final SitemapRetriever sitemapRetriever = new SitemapRetriever();
+    private final SitemapRetriever sitemapRetriever;
     private final DomainCrawlFrontier crawlFrontier;

@@ -71,6 +71,7 @@ public class CrawlerRetreiver {
         crawledDomainWriter = writer;

         this.crawlFrontier = new DomainCrawlFrontier(new EdgeDomain(domain), specs.urls, specs.crawlDepth);
+        sitemapRetriever = fetcher.createSitemapRetriever();

         var fst = crawlFrontier.peek();
         if (fst != null) {
@@ -21,4 +21,6 @@ public interface HttpFetcher {
     CrawledDocument fetchContent(EdgeUrl url) throws RateLimitException;

     SimpleRobotRules fetchRobotRules(EdgeDomain domain);
+
+    SitemapRetriever createSitemapRetriever();
 }
@@ -307,6 +307,11 @@ public class HttpFetcherImpl implements HttpFetcher {
                 .orElseGet(() -> new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_ALL));
     }

+    @Override
+    public SitemapRetriever createSitemapRetriever() {
+        return new SitemapRetriever();
+    }
+
     private Optional<SimpleRobotRules> fetchRobotsForProto(String proto, EdgeDomain domain) {
         try {
             var url = new EdgeUrl(proto, domain, null, "/robots.txt", null);
@@ -10,7 +10,6 @@ import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.util.*;

 @Singleton
 public class SitemapRetriever {
     private final Logger logger = LoggerFactory.getLogger(SitemapRetriever.class);
     private final ThreadLocal<SiteMapParser> siteMapParserThreadLocal = ThreadLocal.withInitial(() -> new SiteMapParser(false));
[NOTE: the hunk header indicates one line was deleted in this hunk, but the deleted line itself was lost in extraction and cannot be recovered from this page.]
@@ -17,6 +17,7 @@ import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.test.CommonTestData;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.Test;
+import org.mockito.Mockito;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -146,5 +147,10 @@ public class CrawlerMockFetcherTest {
         public SimpleRobotRules fetchRobotRules(EdgeDomain domain) {
             return new SimpleRobotRules();
         }
+
+        @Override
+        public SitemapRetriever createSitemapRetriever() {
+            return Mockito.mock(SitemapRetriever.class);
+        }
     }
 }
|
Loading…
Reference in New Issue
Block a user