(crawler) Remove unnecessary double-fetch of the root document

Viktor Lofgren 2024-04-24 14:38:59 +02:00
parent f6db16b313
commit 8b9629f2f6


@@ -117,9 +117,6 @@ public class CrawlerRetreiver implements AutoCloseable {
         rootUrl = ok.probedUrl();
     }
 
     assert !crawlFrontier.isEmpty();
     final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(rootUrl.domain, warcRecorder);
     final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay());
@@ -236,6 +233,9 @@ public class CrawlerRetreiver implements AutoCloseable {
     catch (Exception ex) {
         logger.error("Error configuring link filter", ex);
     }
+    finally {
+        crawlFrontier.addVisited(rootUrl);
+    }
 }
 
 public HttpFetchResult fetchWriteAndSleep(EdgeUrl top,
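
The added finally block records the root URL in the frontier's visited set right after the initial root-document sniff, so the main crawl loop will not fetch it a second time. Below is a minimal sketch of that pattern, assuming the frontier keeps a set of already-fetched URLs; only addVisited() echoes a method name seen in the diff, the class and the other names are illustrative, not the actual crawler API.

    import java.util.HashSet;
    import java.util.Set;

    // Illustrative frontier: a visited-set plus the pre-fetch check the
    // crawl loop performs. Simplified stand-in, not the real class.
    class FrontierSketch {
        private final Set<String> visited = new HashSet<>();

        // Record a URL as already fetched.
        void addVisited(String url) {
            visited.add(url);
        }

        // Consulted before fetching a candidate; visited URLs are skipped.
        boolean shouldFetch(String url) {
            return !visited.contains(url);
        }
    }

Placing addVisited(rootUrl) in a finally block also covers the error path: even if configuring the link filter throws, the root document, already downloaded during sniffing, is still marked visited and is not fetched again.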