(crawler) Remove unnecessary double-fetch of the root document

Viktor Lofgren 2024-04-24 14:38:59 +02:00
parent f6db16b313
commit 8b9629f2f6


@@ -117,9 +117,6 @@ public class CrawlerRetreiver implements AutoCloseable {
         rootUrl = ok.probedUrl();
     }
 
     assert !crawlFrontier.isEmpty();
     final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(rootUrl.domain, warcRecorder);
     final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay());
@@ -236,6 +233,9 @@ public class CrawlerRetreiver implements AutoCloseable {
     catch (Exception ex) {
         logger.error("Error configuring link filter", ex);
     }
+    finally {
+        crawlFrontier.addVisited(rootUrl);
+    }
 }
 
 public HttpFetchResult fetchWriteAndSleep(EdgeUrl top,
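
The added finally block records the root URL in the frontier's visited set right after the initial root-document sniff, so the main crawl loop will not fetch it a second time. Below is a minimal sketch of that pattern, assuming the frontier keeps a set of already-fetched URLs; only addVisited() echoes a method name seen in the diff, the class and the other names are illustrative, not the actual crawler API.

    import java.util.HashSet;
    import java.util.Set;

    // Illustrative frontier: a visited-set plus the pre-fetch check the
    // crawl loop performs. Simplified stand-in, not the real class.
    class FrontierSketch {
        private final Set<String> visited = new HashSet<>();

        // Record a URL as already fetched.
        void addVisited(String url) {
            visited.add(url);
        }

        // Consulted before fetching a candidate; visited URLs are skipped.
        boolean shouldFetch(String url) {
            return !visited.contains(url);
        }
    }

Placing addVisited(rootUrl) in a finally block also covers the error path: even if configuring the link filter throws, the root document, already downloaded during sniffing, is still marked visited and is not fetched again.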