diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java
index 22d12781..7ee043d5 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java
@@ -11,6 +11,7 @@ import nu.marginalia.WmsaHome;
 import nu.marginalia.atags.source.AnchorTagsSource;
 import nu.marginalia.atags.source.AnchorTagsSourceFactory;
 import nu.marginalia.crawl.retreival.CrawlDataReference;
+import nu.marginalia.crawl.retreival.CrawlerRetreiver;
 import nu.marginalia.crawl.retreival.DomainLocks;
 import nu.marginalia.crawl.retreival.DomainProber;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
@@ -25,8 +26,6 @@ import nu.marginalia.crawling.io.CrawlerOutputFile;
 import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;
 import nu.marginalia.crawlspec.CrawlSpecFileNames;
 import nu.marginalia.model.EdgeDomain;
-import nu.marginalia.service.ProcessMainClass;
-import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.model.crawlspec.CrawlSpecRecord;
 import nu.marginalia.mq.MessageQueueFactory;
 import nu.marginalia.mq.MqMessage;
@@ -34,8 +33,9 @@ import nu.marginalia.mq.inbox.MqInboxResponse;
 import nu.marginalia.mq.inbox.MqSingleShotInbox;
 import nu.marginalia.process.control.ProcessHeartbeatImpl;
 import nu.marginalia.process.log.WorkLog;
+import nu.marginalia.service.ProcessMainClass;
 import nu.marginalia.service.module.DatabaseModule;
-import nu.marginalia.crawl.retreival.CrawlerRetreiver;
+import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.util.SimpleBlockingThreadPool;
 import okhttp3.ConnectionPool;
 import okhttp3.Dispatcher;
@@ -48,8 +48,12 @@ import java.nio.file.Path;
 import java.nio.file.StandardCopyOption;
 import java.security.Security;
 import java.sql.SQLException;
-import java.util.*;
-import java.util.concurrent.*;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.UUID;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
 
 import static nu.marginalia.mqapi.ProcessInboxNames.CRAWLER_INBOX;
@@ -271,7 +275,7 @@ public class CrawlerMain extends ProcessMainClass {
                 Files.deleteIfExists(tempFile);
             }
 
-            var domainLock = domainLocks.getLock(new EdgeDomain(specification.domain));
+            var domainLock = domainLocks.getSemaphore(new EdgeDomain(specification.domain));
 
             try (var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now
                  var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, warcRecorder);
@@ -280,7 +284,7 @@
                 // acquire the domain lock to prevent other threads from crawling the same domain,
                 // we release it at the end of the task to let them go ahead
                 Thread.currentThread().setName("crawling:" + domain + " [await domain lock]");
-                domainLock.lock();
+                domainLock.acquire();
                 Thread.currentThread().setName("crawling:" + domain);
 
                 var domainLinks = anchorTagsSource.getAnchorTags(domain);
@@ -312,7 +316,7 @@
             }
             finally {
                 // release the domain lock to permit other threads to crawl subdomains of this domain
-                domainLock.unlock();
+                domainLock.release();
 
                 // We don't need to double-count these; it's also kept int he workLog
                 processingIds.remove(domain);
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
index e32c18fd..a64360f7 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
@@ -5,14 +5,14 @@ import nu.marginalia.atags.model.DomainLinks;
 import nu.marginalia.contenttype.ContentType;
 import nu.marginalia.crawl.retreival.fetcher.ContentTags;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
-import nu.marginalia.crawling.body.HttpFetchResult;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.retreival.revisit.CrawlerRevisitor;
 import nu.marginalia.crawl.retreival.revisit.DocumentWithReference;
 import nu.marginalia.crawl.retreival.sitemap.SitemapFetcher;
-import nu.marginalia.link_parser.LinkParser;
+import nu.marginalia.crawling.body.HttpFetchResult;
 import nu.marginalia.crawling.model.*;
 import nu.marginalia.ip_blocklist.UrlBlocklist;
+import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawlspec.CrawlSpecRecord;
@@ -24,8 +24,9 @@ import java.io.IOException;
 import java.net.InetAddress;
 import java.net.UnknownHostException;
 import java.nio.file.Path;
-import java.time.Duration;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Objects;
 import java.util.concurrent.TimeUnit;
 
 public class CrawlerRetreiver implements AutoCloseable {
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/DomainLocks.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/DomainLocks.java
index 7c352f3e..3b061d93 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/DomainLocks.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/DomainLocks.java
@@ -4,8 +4,7 @@ import nu.marginalia.model.EdgeDomain;
 
 import java.util.Map;
 import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.locks.Lock;
-import java.util.concurrent.locks.ReentrantLock;
+import java.util.concurrent.Semaphore;
 
 /** Holds lock objects for each domain, to prevent multiple threads from
  * crawling the same domain at the same time.
@@ -14,12 +13,33 @@ public class DomainLocks {
     // The locks are stored in a map, with the domain name as the key. This map will grow
     // relatively big, but should be manageable since the number of domains is limited to
     // a few hundred thousand typically.
-    private final Map<String, Lock> locks = new ConcurrentHashMap<>();
+    private final Map<String, Semaphore> locks = new ConcurrentHashMap<>();
 
     /** Returns a lock object corresponding to the given domain. The object is returned as-is,
      * and may be held by another thread. The caller is responsible for locking and releasing the lock.
      */
-    public Lock getLock(EdgeDomain domain) {
-        return locks.computeIfAbsent(domain.topDomain.toLowerCase(), k -> new ReentrantLock());
+    public Semaphore getSemaphore(EdgeDomain domain) {
+        return locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits);
+    }
+
+    private Semaphore defaultPermits(String topDomain) {
+        if (topDomain.equals("wordpress.com"))
+            return new Semaphore(16);
+        if (topDomain.equals("blogspot.com"))
+            return new Semaphore(8);
+
+        if (topDomain.equals("neocities.org"))
+            return new Semaphore(4);
+        if (topDomain.equals("github.io"))
+            return new Semaphore(4);
+
+        if (topDomain.equals("substack.com")) {
+            return new Semaphore(1);
+        }
+        if (topDomain.endsWith(".edu")) {
+            return new Semaphore(1);
+        }
+
+        return new Semaphore(2);
     }
 }
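
For reference, a minimal usage sketch of the new DomainLocks API (not part of the patch above). The class name DomainLockUsageSketch and the domain string are made up for illustration, and it assumes DomainLocks is constructed via its implicit no-arg constructor. A Semaphore(1) behaves like the old ReentrantLock, while the larger permit counts from defaultPermits let several subdomains of big shared hosts such as wordpress.com be crawled concurrently; acquire/release mirrors the lock/unlock pattern CrawlerMain used before.

// Hypothetical illustration only -- not part of the patch above.
import nu.marginalia.crawl.retreival.DomainLocks;
import nu.marginalia.model.EdgeDomain;

import java.util.concurrent.Semaphore;

public class DomainLockUsageSketch {
    // Assumption: DomainLocks has only the implicit no-arg constructor implied by the diff.
    private static final DomainLocks domainLocks = new DomainLocks();

    public static void main(String[] args) throws InterruptedException {
        // "example.blogspot.com" is a made-up domain; its top domain (blogspot.com)
        // maps to a Semaphore with 8 permits, so up to 8 crawl tasks may hold it at once.
        Semaphore domainLock = domainLocks.getSemaphore(new EdgeDomain("example.blogspot.com"));

        domainLock.acquire();       // may block until a permit is free
        try {
            // ... fetch and process documents for this domain ...
        }
        finally {
            domainLock.release();   // always return the permit, as CrawlerMain does in its finally block
        }
    }
}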