mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 13:19:02 +00:00
Fix crawler tests so that they no longer occasionally fetch real sitemaps when run.
This commit is contained in:
parent
019fa763cd
commit
647bbfa617
@ -54,7 +54,7 @@ public class CrawlerRetreiver {
|
|||||||
private static final LinkFilterSelector linkFilterSelector = new LinkFilterSelector();
|
private static final LinkFilterSelector linkFilterSelector = new LinkFilterSelector();
|
||||||
|
|
||||||
private static final DomainProber domainProber = new DomainProber();
|
private static final DomainProber domainProber = new DomainProber();
|
||||||
private final SitemapRetriever sitemapRetriever = new SitemapRetriever();
|
private final SitemapRetriever sitemapRetriever;
|
||||||
private final DomainCrawlFrontier crawlFrontier;
|
private final DomainCrawlFrontier crawlFrontier;
|
||||||
|
|
||||||
|
|
||||||
@ -71,6 +71,7 @@ public class CrawlerRetreiver {
|
|||||||
crawledDomainWriter = writer;
|
crawledDomainWriter = writer;
|
||||||
|
|
||||||
this.crawlFrontier = new DomainCrawlFrontier(new EdgeDomain(domain), specs.urls, specs.crawlDepth);
|
this.crawlFrontier = new DomainCrawlFrontier(new EdgeDomain(domain), specs.urls, specs.crawlDepth);
|
||||||
|
sitemapRetriever = fetcher.createSitemapRetriever();
|
||||||
|
|
||||||
var fst = crawlFrontier.peek();
|
var fst = crawlFrontier.peek();
|
||||||
if (fst != null) {
|
if (fst != null) {
|
||||||
|
@ -21,4 +21,6 @@ public interface HttpFetcher {
|
|||||||
CrawledDocument fetchContent(EdgeUrl url) throws RateLimitException;
|
CrawledDocument fetchContent(EdgeUrl url) throws RateLimitException;
|
||||||
|
|
||||||
SimpleRobotRules fetchRobotRules(EdgeDomain domain);
|
SimpleRobotRules fetchRobotRules(EdgeDomain domain);
|
||||||
|
|
||||||
|
SitemapRetriever createSitemapRetriever();
|
||||||
}
|
}
|
||||||
|
@ -307,6 +307,11 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
.orElseGet(() -> new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_ALL));
|
.orElseGet(() -> new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_ALL));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public SitemapRetriever createSitemapRetriever() {
|
||||||
|
return new SitemapRetriever();
|
||||||
|
}
|
||||||
|
|
||||||
private Optional<SimpleRobotRules> fetchRobotsForProto(String proto, EdgeDomain domain) {
|
private Optional<SimpleRobotRules> fetchRobotsForProto(String proto, EdgeDomain domain) {
|
||||||
try {
|
try {
|
||||||
var url = new EdgeUrl(proto, domain, null, "/robots.txt", null);
|
var url = new EdgeUrl(proto, domain, null, "/robots.txt", null);
|
||||||
|
@ -10,7 +10,6 @@ import java.io.FileNotFoundException;
|
|||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
@Singleton
|
|
||||||
public class SitemapRetriever {
|
public class SitemapRetriever {
|
||||||
private final Logger logger = LoggerFactory.getLogger(SitemapRetriever.class);
|
private final Logger logger = LoggerFactory.getLogger(SitemapRetriever.class);
|
||||||
private final ThreadLocal<SiteMapParser> siteMapParserThreadLocal = ThreadLocal.withInitial(() -> new SiteMapParser(false));
|
private final ThreadLocal<SiteMapParser> siteMapParserThreadLocal = ThreadLocal.withInitial(() -> new SiteMapParser(false));
|
||||||
|
@ -17,6 +17,7 @@ import nu.marginalia.model.EdgeUrl;
|
|||||||
import nu.marginalia.test.CommonTestData;
|
import nu.marginalia.test.CommonTestData;
|
||||||
import org.junit.jupiter.api.AfterEach;
|
import org.junit.jupiter.api.AfterEach;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.mockito.Mockito;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
@ -146,5 +147,10 @@ public class CrawlerMockFetcherTest {
|
|||||||
public SimpleRobotRules fetchRobotRules(EdgeDomain domain) {
|
public SimpleRobotRules fetchRobotRules(EdgeDomain domain) {
|
||||||
return new SimpleRobotRules();
|
return new SimpleRobotRules();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public SitemapRetriever createSitemapRetriever() {
|
||||||
|
return Mockito.mock(SitemapRetriever.class);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user