mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-23 21:18:58 +00:00)
(crawler) Add a per-domain mutex for crawling
To ease the pressure on domains with lots of subdomains, such as substack, medium, neocities, etc., a per-domain mutex is added that limits crawling of these domains to one thread at a time.
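As a minimal illustration of the keying behavior (a hypothetical demo, not part of the commit; it uses the DomainLocks and EdgeDomain classes shown in the diff below, and assumes foo.substack.com and bar.substack.com both resolve to the top domain substack.com):

    import java.util.concurrent.locks.Lock;
    import nu.marginalia.crawl.retreival.DomainLocks;
    import nu.marginalia.model.EdgeDomain;

    // Hypothetical demo class, not part of the commit.
    class DomainLocksDemo {
        public static void main(String[] args) {
            DomainLocks domainLocks = new DomainLocks();

            // Two subdomains of the same site share substack.com as their top
            // domain, which DomainLocks uses (lowercased) as its map key...
            Lock a = domainLocks.getLock(new EdgeDomain("foo.substack.com"));
            Lock b = domainLocks.getLock(new EdgeDomain("bar.substack.com"));

            // ...so both lookups return the same ReentrantLock instance, and
            // only one thread can crawl either subdomain at a time.
            System.out.println(a == b);  // expected: true
        }
    }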
This commit is contained in:
parent 6665e447aa
commit 02c4a2d4ba
CrawlerMain.java:

@@ -11,6 +11,7 @@ import nu.marginalia.WmsaHome;
import nu.marginalia.atags.source.AnchorTagsSource;
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
import nu.marginalia.crawl.retreival.CrawlDataReference;
import nu.marginalia.crawl.retreival.DomainLocks;
import nu.marginalia.crawl.retreival.DomainProber;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
@@ -68,6 +69,8 @@ public class CrawlerMain extends ProcessMainClass {
    private final int node;
    private final SimpleBlockingThreadPool pool;

    private final DomainLocks domainLocks = new DomainLocks();

    private final Map<String, String> processingIds = new ConcurrentHashMap<>();

    private final AbortMonitor abortMonitor = AbortMonitor.getInstance();
@@ -268,10 +271,16 @@ public class CrawlerMain extends ProcessMainClass {
            Files.deleteIfExists(tempFile);
        }

        var domainLock = domainLocks.getLock(new EdgeDomain(specification.domain));

        try (var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now
             var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, warcRecorder);
             CrawlDataReference reference = getReference())
        {
            // acquire the domain lock to prevent other threads from crawling the same domain,
            // we release it at the end of the task to let them go ahead
            Thread.currentThread().setName("crawling:" + domain + " [await domain lock]");
            domainLock.lock();
            Thread.currentThread().setName("crawling:" + domain);

            var domainLinks = anchorTagsSource.getAnchorTags(domain);
@@ -302,6 +311,9 @@ public class CrawlerMain extends ProcessMainClass {
            logger.error("Error fetching domain " + domain, e);
        }
        finally {
            // release the domain lock to permit other threads to crawl subdomains of this domain
            domainLock.unlock();

            // We don't need to double-count these; it's also kept in the workLog
            processingIds.remove(domain);
            Thread.currentThread().setName("[idle]");
DomainLocks.java (new file):

@@ -0,0 +1,25 @@
package nu.marginalia.crawl.retreival;

import nu.marginalia.model.EdgeDomain;

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;

/** Holds lock objects for each domain, to prevent multiple threads from
 *  crawling the same domain at the same time.
 */
public class DomainLocks {
    // The locks are stored in a map, with the domain name as the key. This map will grow
    // relatively big, but should be manageable since the number of domains is limited to
    // a few hundred thousand typically.
    private final Map<String, Lock> locks = new ConcurrentHashMap<>();

    /** Returns a lock object corresponding to the given domain. The object is returned as-is,
     *  and may be held by another thread. The caller is responsible for locking and releasing the lock.
     */
    public Lock getLock(EdgeDomain domain) {
        return locks.computeIfAbsent(domain.topDomain.toLowerCase(), k -> new ReentrantLock());
    }
}
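Putting the two pieces together, the usage pattern is: look up the lock for the task's domain, take it before fetching, and release it in a finally block. A condensed sketch of the pattern from the CrawlerMain hunks above (the crawling work itself and the surrounding task code are elided):

    // Condensed from the CrawlerMain changes above.
    var domainLock = domainLocks.getLock(new EdgeDomain(specification.domain));
    try {
        // Blocks while another thread is crawling a subdomain of the same top domain.
        domainLock.lock();
        // ... fetch and record the domain ...
    }
    finally {
        // Always release, so queued tasks for sibling subdomains can proceed
        // even if this task failed.
        domainLock.unlock();
    }

Since getLock uses ConcurrentHashMap.computeIfAbsent, two tasks racing to look up the same top domain are guaranteed to receive the same ReentrantLock instance.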