From a557c7ae7f1b2525dd9d09e096b4fcfcab78a302 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 23 Dec 2024 23:31:03 +0100 Subject: [PATCH] (live-crawler) Limit concurrent accesses per domain using DomainLocks from main crawler --- .../java/nu/marginalia/livecrawler/SimpleLinkScraper.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/SimpleLinkScraper.java b/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/SimpleLinkScraper.java index 89f5f338..5253c042 100644 --- a/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/SimpleLinkScraper.java +++ b/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/SimpleLinkScraper.java @@ -4,6 +4,7 @@ import crawlercommons.robots.SimpleRobotRules; import crawlercommons.robots.SimpleRobotRulesParser; import nu.marginalia.WmsaHome; import nu.marginalia.crawl.fetcher.HttpFetcherImpl; +import nu.marginalia.crawl.logic.DomainLocks; import nu.marginalia.crawl.retreival.CrawlDelayTimer; import nu.marginalia.db.DbDomainQueries; import nu.marginalia.db.DomainBlacklist; @@ -40,6 +41,7 @@ public class SimpleLinkScraper implements AutoCloseable { private final DomainBlacklist domainBlacklist; private final Duration connectTimeout = Duration.ofSeconds(10); private final Duration readTimeout = Duration.ofSeconds(10); + private final DomainLocks domainLocks = new DomainLocks(); public SimpleLinkScraper(LiveCrawlDataSet dataSet, DbDomainQueries domainQueries, @@ -65,7 +67,9 @@ public class SimpleLinkScraper implements AutoCloseable { .connectTimeout(connectTimeout) .followRedirects(HttpClient.Redirect.NEVER) .version(HttpClient.Version.HTTP_2) - .build()) { + .build(); + DomainLocks.DomainLock lock = domainLocks.lockDomain(domain) // throttle concurrent access per domain; do not remove + ) { EdgeUrl rootUrl = domain.toRootUrlHttps();