Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-24 13:19:02 +00:00)
(crawler) Remove unnecessary double-fetch of the root document
This commit is contained in:
Parent: f6db16b313
Commit: 8b9629f2f6
@@ -117,9 +117,6 @@ public class CrawlerRetreiver implements AutoCloseable {
             rootUrl = ok.probedUrl();
         }
 
-
-        assert !crawlFrontier.isEmpty();
-
         final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(rootUrl.domain, warcRecorder);
         final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay());
 
@@ -236,6 +233,9 @@ public class CrawlerRetreiver implements AutoCloseable {
         catch (Exception ex) {
             logger.error("Error configuring link filter", ex);
         }
+        finally {
+            crawlFrontier.addVisited(rootUrl);
+        }
     }
 
     public HttpFetchResult fetchWriteAndSleep(EdgeUrl top,
Loading…
Reference in New Issue
Block a user