(crawler) Add a per-domain mutex for crawling

To ease the pressure on domains with lots of subdomains, such as substack, medium, neocities, etc., a per-domain mutex is added that limits crawling of these domains to one thread at a time.
This commit is contained in:
Viktor Lofgren 2024-07-16 16:44:59 +02:00
parent 6665e447aa
commit 02c4a2d4ba
2 changed files with 37 additions and 0 deletions

View File

@ -11,6 +11,7 @@ import nu.marginalia.WmsaHome;
import nu.marginalia.atags.source.AnchorTagsSource;
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
import nu.marginalia.crawl.retreival.CrawlDataReference;
import nu.marginalia.crawl.retreival.DomainLocks;
import nu.marginalia.crawl.retreival.DomainProber;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
@ -68,6 +69,8 @@ public class CrawlerMain extends ProcessMainClass {
private final int node;
private final SimpleBlockingThreadPool pool;
private final DomainLocks domainLocks = new DomainLocks();
private final Map<String, String> processingIds = new ConcurrentHashMap<>();
private final AbortMonitor abortMonitor = AbortMonitor.getInstance();
@ -268,10 +271,16 @@ public class CrawlerMain extends ProcessMainClass {
Files.deleteIfExists(tempFile);
}
var domainLock = domainLocks.getLock(new EdgeDomain(specification.domain));
try (var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now
var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, warcRecorder);
CrawlDataReference reference = getReference())
{
// acquire the domain lock to prevent other threads from crawling the same domain,
// we release it at the end of the task to let them go ahead
Thread.currentThread().setName("crawling:" + domain + " [await domain lock]");
domainLock.lock();
Thread.currentThread().setName("crawling:" + domain);
var domainLinks = anchorTagsSource.getAnchorTags(domain);
@ -302,6 +311,9 @@ public class CrawlerMain extends ProcessMainClass {
logger.error("Error fetching domain " + domain, e);
}
finally {
// release the domain lock to permit other threads to crawl subdomains of this domain
domainLock.unlock();
// We don't need to double-count these; it's also kept in the workLog
processingIds.remove(domain);
Thread.currentThread().setName("[idle]");

View File

@ -0,0 +1,25 @@
package nu.marginalia.crawl.retreival;
import nu.marginalia.model.EdgeDomain;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
/** Holds lock objects for each domain, to prevent multiple threads from
 * crawling the same domain at the same time.
 * <p>
 * Locks are keyed on the lower-cased {@code topDomain} of the requested
 * {@link EdgeDomain}, so every domain that shares a top domain shares one lock.
 */
public class DomainLocks {
    // The locks are stored in a map, with the domain name as the key. Entries are never
    // removed by this class, so the map will grow relatively big, but should be manageable
    // since the number of domains is limited to a few hundred thousand typically.
    private final Map<String, Lock> locks = new ConcurrentHashMap<>();

    /** Returns a lock object corresponding to the given domain. The object is returned as-is,
     * and may be held by another thread. The caller is responsible for locking and releasing the lock.
     *
     * @param domain the domain about to be crawled; its {@code topDomain} selects the lock
     * @return the lock for that top domain, created on first request and reused afterwards
     */
    public Lock getLock(EdgeDomain domain) {
        // Locale.ROOT keeps the key independent of the JVM's default locale; a plain
        // toLowerCase() maps 'I' to a dotless 'ı' under e.g. a Turkish locale, which
        // would hand out different locks for "WIKI.org" and "wiki.org".
        String key = domain.topDomain.toLowerCase(Locale.ROOT);

        return locks.computeIfAbsent(key, k -> new ReentrantLock());
    }
}