mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git
(crawler) Adjust domain locking
Turns out throttling to only one concurrent crawl task per domain means the crawler chokes hard on large hosting sites such as wordpress.com. Give these a slightly larger allowance.
commit ec600b967d
parent accc598967
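In essence, the change swaps the single per-domain ReentrantLock for a per-domain Semaphore whose permit count depends on the top domain, so large hosting providers can be crawled by several tasks at once while ordinary domains keep a small limit. Below is a minimal, self-contained sketch of that idea; the class and the crawl method are illustrative only, and just the getSemaphore name, the permit numbers, and the acquire/release pattern mirror the diff that follows.

    // Illustrative sketch only: a per-domain Semaphore map with a permit table.
    // The surrounding class is hypothetical; names and numbers follow the diff below.
    import java.util.Map;
    import java.util.concurrent.ConcurrentHashMap;
    import java.util.concurrent.Semaphore;

    class DomainThrottleSketch {
        private final Map<String, Semaphore> locks = new ConcurrentHashMap<>();

        Semaphore getSemaphore(String topDomain) {
            // Big hosting providers get more concurrent permits than ordinary domains.
            return locks.computeIfAbsent(topDomain.toLowerCase(), k -> switch (k) {
                case "wordpress.com" -> new Semaphore(16);
                case "blogspot.com"  -> new Semaphore(8);
                default              -> new Semaphore(2);
            });
        }

        void crawlDomain(String topDomain) throws InterruptedException {
            Semaphore lock = getSemaphore(topDomain);
            lock.acquire();        // blocks until a permit for this top domain is free
            try {
                // ... fetch and process documents for the domain ...
            }
            finally {
                lock.release();    // let the next task for this top domain proceed
            }
        }
    }

A Semaphore with a single permit behaves much like the old per-domain lock; larger permit counts simply let that many tasks into the same top domain at once.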
CrawlerMain.java

@@ -11,6 +11,7 @@ import nu.marginalia.WmsaHome;
 import nu.marginalia.atags.source.AnchorTagsSource;
 import nu.marginalia.atags.source.AnchorTagsSourceFactory;
 import nu.marginalia.crawl.retreival.CrawlDataReference;
+import nu.marginalia.crawl.retreival.CrawlerRetreiver;
 import nu.marginalia.crawl.retreival.DomainLocks;
 import nu.marginalia.crawl.retreival.DomainProber;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
@@ -25,8 +26,6 @@ import nu.marginalia.crawling.io.CrawlerOutputFile;
 import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;
 import nu.marginalia.crawlspec.CrawlSpecFileNames;
 import nu.marginalia.model.EdgeDomain;
-import nu.marginalia.service.ProcessMainClass;
-import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.model.crawlspec.CrawlSpecRecord;
 import nu.marginalia.mq.MessageQueueFactory;
 import nu.marginalia.mq.MqMessage;
@@ -34,8 +33,9 @@ import nu.marginalia.mq.inbox.MqInboxResponse;
 import nu.marginalia.mq.inbox.MqSingleShotInbox;
 import nu.marginalia.process.control.ProcessHeartbeatImpl;
 import nu.marginalia.process.log.WorkLog;
+import nu.marginalia.service.ProcessMainClass;
 import nu.marginalia.service.module.DatabaseModule;
-import nu.marginalia.crawl.retreival.CrawlerRetreiver;
+import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.util.SimpleBlockingThreadPool;
 import okhttp3.ConnectionPool;
 import okhttp3.Dispatcher;
@@ -48,8 +48,12 @@ import java.nio.file.Path;
 import java.nio.file.StandardCopyOption;
 import java.security.Security;
 import java.sql.SQLException;
-import java.util.*;
-import java.util.concurrent.*;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.UUID;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
 
 import static nu.marginalia.mqapi.ProcessInboxNames.CRAWLER_INBOX;
@@ -271,7 +275,7 @@ public class CrawlerMain extends ProcessMainClass {
                 Files.deleteIfExists(tempFile);
             }
 
-            var domainLock = domainLocks.getLock(new EdgeDomain(specification.domain));
+            var domainLock = domainLocks.getSemaphore(new EdgeDomain(specification.domain));
 
             try (var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now
                  var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, warcRecorder);
@@ -280,7 +284,7 @@ public class CrawlerMain extends ProcessMainClass {
                 // acquire the domain lock to prevent other threads from crawling the same domain,
                 // we release it at the end of the task to let them go ahead
                 Thread.currentThread().setName("crawling:" + domain + " [await domain lock]");
-                domainLock.lock();
+                domainLock.acquire();
                 Thread.currentThread().setName("crawling:" + domain);
 
                 var domainLinks = anchorTagsSource.getAnchorTags(domain);
@@ -312,7 +316,7 @@ public class CrawlerMain extends ProcessMainClass {
             }
             finally {
                 // release the domain lock to permit other threads to crawl subdomains of this domain
-                domainLock.unlock();
+                domainLock.release();
 
                 // We don't need to double-count these; it's also kept in the workLog
                 processingIds.remove(domain);
CrawlerRetreiver.java

@@ -5,14 +5,14 @@ import nu.marginalia.atags.model.DomainLinks;
 import nu.marginalia.contenttype.ContentType;
 import nu.marginalia.crawl.retreival.fetcher.ContentTags;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
-import nu.marginalia.crawling.body.HttpFetchResult;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.retreival.revisit.CrawlerRevisitor;
 import nu.marginalia.crawl.retreival.revisit.DocumentWithReference;
 import nu.marginalia.crawl.retreival.sitemap.SitemapFetcher;
-import nu.marginalia.link_parser.LinkParser;
+import nu.marginalia.crawling.body.HttpFetchResult;
 import nu.marginalia.crawling.model.*;
 import nu.marginalia.ip_blocklist.UrlBlocklist;
+import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawlspec.CrawlSpecRecord;
@@ -24,8 +24,9 @@ import java.io.IOException;
 import java.net.InetAddress;
 import java.net.UnknownHostException;
 import java.nio.file.Path;
 import java.time.Duration;
-import java.util.*;
-import java.util.ArrayList;
+import java.util.List;
+import java.util.Objects;
+import java.util.concurrent.TimeUnit;
 
 public class CrawlerRetreiver implements AutoCloseable {
DomainLocks.java

@@ -4,8 +4,7 @@ import nu.marginalia.model.EdgeDomain;
 
 import java.util.Map;
 import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.locks.Lock;
-import java.util.concurrent.locks.ReentrantLock;
+import java.util.concurrent.Semaphore;
 
 /** Holds lock objects for each domain, to prevent multiple threads from
  * crawling the same domain at the same time.
@@ -14,12 +13,33 @@ public class DomainLocks {
     // The locks are stored in a map, with the domain name as the key. This map will grow
     // relatively big, but should be manageable since the number of domains is limited to
     // a few hundred thousand typically.
-    private final Map<String, Lock> locks = new ConcurrentHashMap<>();
+    private final Map<String, Semaphore> locks = new ConcurrentHashMap<>();
 
     /** Returns a lock object corresponding to the given domain. The object is returned as-is,
      * and may be held by another thread. The caller is responsible for locking and releasing the lock.
      */
-    public Lock getLock(EdgeDomain domain) {
-        return locks.computeIfAbsent(domain.topDomain.toLowerCase(), k -> new ReentrantLock());
+    public Semaphore getSemaphore(EdgeDomain domain) {
+        return locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits);
     }
+
+    private Semaphore defaultPermits(String topDomain) {
+        if (topDomain.equals("wordpress.com"))
+            return new Semaphore(16);
+        if (topDomain.equals("blogspot.com"))
+            return new Semaphore(8);
+
+        if (topDomain.equals("neocities.org"))
+            return new Semaphore(4);
+        if (topDomain.equals("github.io"))
+            return new Semaphore(4);
+
+        if (topDomain.equals("substack.com")) {
+            return new Semaphore(1);
+        }
+        if (topDomain.endsWith(".edu")) {
+            return new Semaphore(1);
+        }
+
+        return new Semaphore(2);
+    }
 }
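With the new permit table, up to 16 tasks may crawl wordpress.com subdomains at the same time, substack.com and .edu domains stay fully serialized, and everything else gets two permits. The snippet below is a small usage sketch of the new API as CrawlerMain uses it above; only DomainLocks, EdgeDomain, getSemaphore, and the acquire/release pattern come from the diff, while the main method, thread scaffolding, example hostname, and the assumption of DomainLocks' implicit no-arg constructor are illustrative.

    // Illustrative only: two tasks hit the same top domain; with 16 permits for
    // wordpress.com both can hold the semaphore concurrently, whereas a single-permit
    // domain would force them to take turns.
    import nu.marginalia.crawl.retreival.DomainLocks;
    import nu.marginalia.model.EdgeDomain;

    public class DomainLocksUsageSketch {
        public static void main(String[] args) throws InterruptedException {
            var domainLocks = new DomainLocks();

            Runnable task = () -> {
                var lock = domainLocks.getSemaphore(new EdgeDomain("someblog.wordpress.com"));
                try {
                    lock.acquire();
                    try {
                        // ... crawl the subdomain ...
                    }
                    finally {
                        lock.release();
                    }
                }
                catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            };

            Thread a = new Thread(task);
            Thread b = new Thread(task);
            a.start();
            b.start();
            a.join();
            b.join();
        }
    }

Semaphore also offers tryAcquire with a timeout, but the commit keeps the simple blocking acquire, matching the behaviour of the previous Lock.lock() call.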