mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
(crawler) Remove unnecessary double-fetch of the root document
This commit is contained in:
parent
f6db16b313
commit
8b9629f2f6
@@ -117,9 +117,6 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
rootUrl = ok.probedUrl();
|
||||
}
|
||||
|
||||
|
||||
assert !crawlFrontier.isEmpty();
|
||||
|
||||
final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(rootUrl.domain, warcRecorder);
|
||||
final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay());
|
||||
|
||||
@@ -236,6 +233,9 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
catch (Exception ex) {
|
||||
logger.error("Error configuring link filter", ex);
|
||||
}
|
||||
finally {
|
||||
crawlFrontier.addVisited(rootUrl);
|
||||
}
|
||||
}
|
||||
|
||||
public HttpFetchResult fetchWriteAndSleep(EdgeUrl top,
|
||||
|
Loading…
Reference in New Issue
Block a user