(crawler) Add a per-domain mutex for crawling

To let up the pressure on domains with lots of subdomains, such as substack, medium, neocities, etc., a per-domain mutex is added that will limit crawling of these domains to one thread at a time.
2025-02-23 21:18:58 +00:00 · 2024-07-16 16:44:59 +02:00 · 2024-07-16 16:44:59 +02:00 · 02c4a2d4ba
commit 02c4a2d4ba
parent 6665e447aa
2 changed files with 37 additions and 0 deletions
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java
@ -11,6 +11,7 @@ import nu.marginalia.WmsaHome;
 import nu.marginalia.atags.source.AnchorTagsSource;
 import nu.marginalia.atags.source.AnchorTagsSourceFactory;
 import nu.marginalia.crawl.retreival.CrawlDataReference;
+import nu.marginalia.crawl.retreival.DomainLocks;
 import nu.marginalia.crawl.retreival.DomainProber;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
@ -68,6 +69,8 @@ public class CrawlerMain extends ProcessMainClass {
    private final int node;
    private final SimpleBlockingThreadPool pool;

+    private final DomainLocks domainLocks = new DomainLocks();
+
    private final Map<String, String> processingIds = new ConcurrentHashMap<>();

    private final AbortMonitor abortMonitor = AbortMonitor.getInstance();
@ -268,10 +271,16 @@ public class CrawlerMain extends ProcessMainClass {
                Files.deleteIfExists(tempFile);
            }

+            var domainLock = domainLocks.getLock(new EdgeDomain(specification.domain));
+
            try (var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now
                 var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, warcRecorder);
                 CrawlDataReference reference = getReference())
            {
+                // acquire the domain lock to prevent other threads from crawling the same domain,
+                // we release it at the end of the task to let them go ahead
+                Thread.currentThread().setName("crawling:" + domain + " [await domain lock]");
+                domainLock.lock();
                Thread.currentThread().setName("crawling:" + domain);

                var domainLinks = anchorTagsSource.getAnchorTags(domain);
@ -302,6 +311,9 @@ public class CrawlerMain extends ProcessMainClass {
                logger.error("Error fetching domain " + domain, e);
            }
            finally {
+                // release the domain lock to permit other threads to crawl subdomains of this domain
+                domainLock.unlock();
+
                // We don't need to double-count these; it's also kept int he workLog
                processingIds.remove(domain);
                Thread.currentThread().setName("[idle]");
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/DomainLocks.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/DomainLocks.java
@ -0,0 +1,25 @@
+package nu.marginalia.crawl.retreival;
+
+import nu.marginalia.model.EdgeDomain;
+
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.locks.Lock;
+import java.util.concurrent.locks.ReentrantLock;
+
+/** Holds lock objects for each domain, to prevent multiple threads from
+ * crawling the same domain at the same time.
+ */
+public class DomainLocks {
+    // The locks are stored in a map, with the domain name as the key.  This map will grow
+    // relatively big, but should be manageable since the number of domains is limited to
+    // a few hundred thousand typically.
+    private final Map<String, Lock> locks = new ConcurrentHashMap<>();
+
+    /** Returns a lock object corresponding to the given domain.  The object is returned as-is,
+     * and may be held by another thread.  The caller is responsible for locking and  releasing the lock.
+     */
+    public Lock getLock(EdgeDomain domain) {
+        return locks.computeIfAbsent(domain.topDomain.toLowerCase(), k -> new ReentrantLock());
+    }
+}