diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index 98409c01..9ac68ce4 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -117,9 +117,6 @@ public class CrawlerRetreiver implements AutoCloseable { rootUrl = ok.probedUrl(); } - - assert !crawlFrontier.isEmpty(); - final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(rootUrl.domain, warcRecorder); final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay()); @@ -236,6 +233,9 @@ public class CrawlerRetreiver implements AutoCloseable { catch (Exception ex) { logger.error("Error configuring link filter", ex); } + finally { + crawlFrontier.addVisited(rootUrl); + } } public HttpFetchResult fetchWriteAndSleep(EdgeUrl top,