(crawler) Remove unnecessary double-fetch of the root document

Viktor Lofgren 2024-04-24 14:38:59 +02:00
parent f6db16b313
commit 8b9629f2f6


@@ -117,9 +117,6 @@ public class CrawlerRetreiver implements AutoCloseable {
             rootUrl = ok.probedUrl();
         }
-        assert !crawlFrontier.isEmpty();
         final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(rootUrl.domain, warcRecorder);
         final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay());
@@ -236,6 +233,9 @@ public class CrawlerRetreiver implements AutoCloseable {
         catch (Exception ex) {
             logger.error("Error configuring link filter", ex);
         }
+        finally {
+            crawlFrontier.addVisited(rootUrl);
+        }
     }
public HttpFetchResult fetchWriteAndSleep(EdgeUrl top, public HttpFetchResult fetchWriteAndSleep(EdgeUrl top,
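
For context, the mechanism behind the fix can be illustrated with a minimal, self-contained sketch. The class and helper names below are hypothetical (only addVisited mirrors the diff above; this is not the project's actual crawlFrontier implementation): if the frontier remembers which URLs have already been fetched, marking rootUrl as visited right after the initial root-document sniff keeps the main crawl loop from requesting it a second time.

import java.util.ArrayDeque;
import java.util.HashSet;
import java.util.Queue;
import java.util.Set;

// Sketch of a visited-aware crawl frontier (hypothetical, for illustration only).
class FrontierSketch {
    private final Queue<String> queue = new ArrayDeque<>();
    private final Set<String> visited = new HashSet<>();

    void addToQueue(String url) { queue.add(url); }
    void addVisited(String url) { visited.add(url); }   // what the new finally-block does for rootUrl
    boolean isEmpty()           { return queue.isEmpty(); }

    // The crawl loop polls from here; URLs already marked visited are skipped,
    // so a root document fetched during setup is not fetched again.
    String takeNext() {
        String next;
        while ((next = queue.poll()) != null) {
            if (!visited.contains(next)) {
                return next;
            }
        }
        return null;
    }
}

Under that assumption, the main loop only ever fetches URLs that takeNext() yields, so calling addVisited(rootUrl) in the finally block (even when link-filter configuration throws) is enough to suppress the duplicate fetch of the root document.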