(crawler) Remove unnecessary double-fetch of the root document

Viktor Lofgren 2024-04-24 14:38:59 +02:00
parent f6db16b313
commit 8b9629f2f6


@@ -117,9 +117,6 @@ public class CrawlerRetreiver implements AutoCloseable {
             rootUrl = ok.probedUrl();
         }
-        assert !crawlFrontier.isEmpty();
         final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(rootUrl.domain, warcRecorder);
         final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay());
@@ -236,6 +233,9 @@ public class CrawlerRetreiver implements AutoCloseable {
         catch (Exception ex) {
             logger.error("Error configuring link filter", ex);
         }
+        finally {
+            crawlFrontier.addVisited(rootUrl);
+        }
     }
public HttpFetchResult fetchWriteAndSleep(EdgeUrl top, public HttpFetchResult fetchWriteAndSleep(EdgeUrl top,
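
For context, the mechanism behind the fix can be illustrated with a minimal, self-contained sketch. The class and helper names below are hypothetical (only addVisited mirrors the diff above; this is not the project's actual crawlFrontier implementation): if the frontier remembers which URLs have already been fetched, marking rootUrl as visited right after the initial root-document sniff keeps the main crawl loop from requesting it a second time.

import java.util.ArrayDeque;
import java.util.HashSet;
import java.util.Queue;
import java.util.Set;

// Sketch of a visited-aware crawl frontier (hypothetical, for illustration only).
class FrontierSketch {
    private final Queue<String> queue = new ArrayDeque<>();
    private final Set<String> visited = new HashSet<>();

    void addToQueue(String url) { queue.add(url); }
    void addVisited(String url) { visited.add(url); }   // what the new finally-block does for rootUrl
    boolean isEmpty()           { return queue.isEmpty(); }

    // The crawl loop polls from here; URLs already marked visited are skipped,
    // so a root document fetched during setup is not fetched again.
    String takeNext() {
        String next;
        while ((next = queue.poll()) != null) {
            if (!visited.contains(next)) {
                return next;
            }
        }
        return null;
    }
}

Under that assumption, the main loop only ever fetches URLs that takeNext() yields, so calling addVisited(rootUrl) in the finally block (even when link-filter configuration throws) is enough to suppress the duplicate fetch of the root document.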