diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java index cd83edc5..d8d49f3b 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java @@ -12,6 +12,7 @@ import nu.marginalia.atags.source.AnchorTagsSource; import nu.marginalia.atags.source.AnchorTagsSourceFactory; import nu.marginalia.crawl.retreival.CrawlDataReference; import nu.marginalia.crawl.retreival.CrawlerRetreiver; +import nu.marginalia.crawl.retreival.DomainLocks; import nu.marginalia.crawl.retreival.DomainProber; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; @@ -72,6 +73,8 @@ public class CrawlerMain extends ProcessMainClass { private final int node; private final SimpleBlockingThreadPool pool; + private final DomainLocks domainLocks = new DomainLocks(); + private final Map processingIds = new ConcurrentHashMap<>(); private final AbortMonitor abortMonitor = AbortMonitor.getInstance(); @@ -272,10 +275,16 @@ public class CrawlerMain extends ProcessMainClass { Files.deleteIfExists(tempFile); } + var domainLock = domainLocks.getSemaphore(new EdgeDomain(specification.domain)); + try (var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, warcRecorder); CrawlDataReference reference = getReference()) { + // acquire the domain lock to prevent other threads from crawling the same domain, + // we release it at the end of the task to let them go ahead + Thread.currentThread().setName("crawling:" + domain + " [await domain lock]"); + domainLock.acquire(); Thread.currentThread().setName("crawling:" + domain); var domainLinks = anchorTagsSource.getAnchorTags(domain); @@ -306,6 +315,9 @@ public class CrawlerMain extends 
ProcessMainClass { logger.error("Error fetching domain " + domain, e); } finally { + // release the domain lock to permit other threads to crawl subdomains of this domain + domainLock.release(); + // We don't need to double-count these; it's also kept in the workLog processingIds.remove(domain); Thread.currentThread().setName("[idle]"); diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index 81fbca89..7a4ebdc8 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -28,6 +28,7 @@ import java.nio.file.Path; import java.util.ArrayList; import java.util.List; import java.util.Objects; +import java.util.concurrent.TimeUnit; public class CrawlerRetreiver implements AutoCloseable { @@ -93,6 +94,10 @@ public class CrawlerRetreiver implements AutoCloseable { new EdgeUrl("http", new EdgeDomain(domain), null, "/", null)); try { + // Sleep a bit to avoid hammering the server with requests, we just probed it + TimeUnit.SECONDS.sleep(1); + + // Fetch the domain return crawlDomain(oldCrawlData, probeResult, domainLinks); } catch (Exception ex) { @@ -123,14 +128,16 @@ public class CrawlerRetreiver implements AutoCloseable { final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(rootUrl.domain, warcRecorder); final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay()); - sniffRootDocument(rootUrl); + delayTimer.waitFetchDelay(0); // initial delay after robots.txt + sniffRootDocument(rootUrl, delayTimer); + delayTimer.waitFetchDelay(0); // delay after sniffing // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified int recrawled = crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer); if (recrawled > 0) { // If we
have reference data, we will always grow the crawl depth a bit - crawlFrontier.increaseDepth(1.5); + crawlFrontier.increaseDepth(1.5, 2500); } // Add external links to the crawl frontier @@ -196,13 +203,28 @@ public class CrawlerRetreiver implements AutoCloseable { return fetchedCount; } - private void sniffRootDocument(EdgeUrl rootUrl) { + private void sniffRootDocument(EdgeUrl rootUrl, CrawlDelayTimer timer) { try { logger.debug("Configuring link filter"); var url = rootUrl.withPathAndParam("/", null); - var result = fetcher.fetchContent(url, warcRecorder, ContentTags.empty()); + HttpFetchResult result = null; + + for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) { + try { + result = fetcher.fetchContent(url, warcRecorder, ContentTags.empty()); + break; + } + catch (RateLimitException ex) { + timer.waitRetryDelay(ex); + } + catch (Exception ex) { + logger.warn("Failed to fetch {}", url, ex); + result = new HttpFetchResult.ResultException(ex); + } + } + if (!(result instanceof HttpFetchResult.ResultOk ok)) return; diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java index 0d0dfc03..88fea00a 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java @@ -54,8 +54,14 @@ public class DomainCrawlFrontier { * than the number of already visited documents, the base depth will be adjusted * to the visited count first. 
*/ - public void increaseDepth(double depthIncreaseFactor) { - depth = (int)(Math.max(visited.size(), depth) * depthIncreaseFactor); + public void increaseDepth(double depthIncreaseFactor, + int maxDepthIncreaseAbsolute + ) { + int base = Math.max(visited.size(), depth); + + int scaledUp = (int)(base * depthIncreaseFactor); + + depth = Math.min(base + maxDepthIncreaseAbsolute, scaledUp); } public void setLinkFilter(Predicate linkFilter) { diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/DomainLocks.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/DomainLocks.java new file mode 100644 index 00000000..3b061d93 --- /dev/null +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/DomainLocks.java @@ -0,0 +1,45 @@ +package nu.marginalia.crawl.retreival; + +import nu.marginalia.model.EdgeDomain; + +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.Semaphore; + +/** Holds lock objects for each domain, to prevent multiple threads from + * crawling the same domain at the same time. + */ +public class DomainLocks { + // The locks are stored in a map, with the domain name as the key. This map will grow + // relatively big, but should be manageable since the number of domains is limited to + // a few hundred thousand typically. + private final Map locks = new ConcurrentHashMap<>(); + + /** Returns a lock object corresponding to the given domain. The object is returned as-is, + * and may be held by another thread. The caller is responsible for locking and releasing the lock. 
+ */ + public Semaphore getSemaphore(EdgeDomain domain) { + return locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits); + } + + private Semaphore defaultPermits(String topDomain) { + if (topDomain.equals("wordpress.com")) + return new Semaphore(16); + if (topDomain.equals("blogspot.com")) + return new Semaphore(8); + + if (topDomain.equals("neocities.org")) + return new Semaphore(4); + if (topDomain.equals("github.io")) + return new Semaphore(4); + + if (topDomain.equals("substack.com")) { + return new Semaphore(1); + } + if (topDomain.endsWith(".edu")) { + return new Semaphore(1); + } + + return new Semaphore(2); + } +} diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java index 50a9b111..f9310028 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java @@ -38,6 +38,7 @@ public class CrawlerRevisitor { int recrawled = 0; int retained = 0; int errors = 0; + int skipped = 0; for (;;) { if (errors > 20) { @@ -84,9 +85,32 @@ public class CrawlerRevisitor { } - if (recrawled > 5 - && retained > 0.9 * recrawled - && Math.random() < 0.9) + double skipProb; + + // calculate the probability of skipping this document based on the + // fraction of documents that haven't changed + if (recrawled > 0) { + skipProb = (double) retained / recrawled; + + // If we've crawled a lot of documents, we'll be more conservative + // in trying to recrawl documents, to avoid hammering the server too much; + // in the case of a large change, we'll eventually catch it anyway + + if (skipped + recrawled > 10_000) { + skipProb = Math.clamp(skipProb, 0.75, 0.99); + } else if (skipped + recrawled > 1000) { + skipProb = Math.clamp(skipProb, 0.5, 0.99); + } else { + 
skipProb = Math.clamp(skipProb, 0, 0.95); + } + + } else { + // If we haven't recrawled anything yet, we'll be more aggressive + // in trying to recrawl documents + skipProb = 0.25; + } + + if (Math.random() < skipProb) // { // Since it looks like most of these documents haven't changed, // we'll load the documents directly; but we do this in a random @@ -103,6 +127,8 @@ public class CrawlerRevisitor { doc.documentBody, new ContentTags(doc.etagMaybe, doc.lastModifiedMaybe) ); + + skipped++; } else { // GET the document with the stored document as a reference diff --git a/code/services-application/search-service/java/nu/marginalia/search/SearchOperator.java b/code/services-application/search-service/java/nu/marginalia/search/SearchOperator.java index e07186b6..a7a4a76b 100644 --- a/code/services-application/search-service/java/nu/marginalia/search/SearchOperator.java +++ b/code/services-application/search-service/java/nu/marginalia/search/SearchOperator.java @@ -6,11 +6,14 @@ import lombok.SneakyThrows; import nu.marginalia.WebsiteUrl; import nu.marginalia.api.math.MathClient; import nu.marginalia.api.searchquery.QueryClient; -import nu.marginalia.model.EdgeDomain; -import nu.marginalia.db.DbDomainQueries; import nu.marginalia.api.searchquery.model.query.QueryResponse; +import nu.marginalia.db.DbDomainQueries; +import nu.marginalia.model.EdgeDomain; import nu.marginalia.search.command.SearchParameters; -import nu.marginalia.search.model.*; +import nu.marginalia.search.model.ClusteredUrlDetails; +import nu.marginalia.search.model.DecoratedSearchResults; +import nu.marginalia.search.model.SearchFilters; +import nu.marginalia.search.model.UrlDetails; import nu.marginalia.search.svc.SearchQueryIndexService; import nu.marginalia.search.svc.SearchUnitConversionService; import org.apache.logging.log4j.util.Strings; @@ -65,9 +68,10 @@ public class SearchOperator { } public List doSiteSearch(String domain, + int domainId, int count) { - var queryParams = 
paramFactory.forSiteSearch(domain, count); + var queryParams = paramFactory.forSiteSearch(domain, domainId, count); var queryResponse = queryClient.search(queryParams); return searchQueryService.getResultsFromQuery(queryResponse); diff --git a/code/services-application/search-service/java/nu/marginalia/search/SearchQueryParamFactory.java b/code/services-application/search-service/java/nu/marginalia/search/SearchQueryParamFactory.java index cc28b209..410a4c07 100644 --- a/code/services-application/search-service/java/nu/marginalia/search/SearchQueryParamFactory.java +++ b/code/services-application/search-service/java/nu/marginalia/search/SearchQueryParamFactory.java @@ -1,12 +1,12 @@ package nu.marginalia.search; -import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier; +import nu.marginalia.api.searchquery.model.query.QueryParams; import nu.marginalia.api.searchquery.model.query.SearchQuery; +import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.index.query.limit.QueryLimits; import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.index.query.limit.SpecificationLimit; -import nu.marginalia.api.searchquery.model.query.QueryParams; import nu.marginalia.search.command.SearchParameters; import java.util.List; @@ -42,7 +42,7 @@ public class SearchQueryParamFactory { } - public QueryParams forSiteSearch(String domain, int count) { + public QueryParams forSiteSearch(String domain, int domainId, int count) { return new QueryParams("site:"+domain, null, List.of(), @@ -53,7 +53,7 @@ public class SearchQueryParamFactory { SpecificationLimit.none(), SpecificationLimit.none(), SpecificationLimit.none(), - List.of(), + List.of(domainId), new QueryLimits(count, count, 100, 512), SearchSetIdentifier.NONE.name(), QueryStrategy.AUTO, diff --git 
a/code/services-application/search-service/java/nu/marginalia/search/svc/SearchSiteInfoService.java b/code/services-application/search-service/java/nu/marginalia/search/svc/SearchSiteInfoService.java index 7cb5c809..8c4bfc62 100644 --- a/code/services-application/search-service/java/nu/marginalia/search/svc/SearchSiteInfoService.java +++ b/code/services-application/search-service/java/nu/marginalia/search/svc/SearchSiteInfoService.java @@ -5,13 +5,13 @@ import nu.marginalia.api.domains.DomainInfoClient; import nu.marginalia.api.domains.model.DomainInformation; import nu.marginalia.api.domains.model.SimilarDomain; import nu.marginalia.db.DbDomainQueries; +import nu.marginalia.feedlot.FeedlotClient; import nu.marginalia.feedlot.model.FeedItems; import nu.marginalia.model.EdgeDomain; import nu.marginalia.renderer.MustacheRenderer; import nu.marginalia.renderer.RendererFactory; import nu.marginalia.screenshot.ScreenshotService; import nu.marginalia.search.SearchOperator; -import nu.marginalia.feedlot.FeedlotClient; import nu.marginalia.search.model.UrlDetails; import nu.marginalia.search.svc.SearchFlagSiteService.FlagSiteFormData; import org.slf4j.Logger; @@ -153,7 +153,7 @@ public class SearchSiteInfoService { linkingDomainsFuture = domainInfoClient.linkedDomains(domainId, 25); } - List sampleResults = searchOperator.doSiteSearch(domainName, 5); + List sampleResults = searchOperator.doSiteSearch(domainName, domainId,5); if (!sampleResults.isEmpty()) { url = sampleResults.getFirst().url.withPathAndParam("/", null).toString(); } @@ -195,9 +195,10 @@ public class SearchSiteInfoService { } private Docs listDocs(String domainName) { + int domainId = domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1); return new Docs(domainName, domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1), - searchOperator.doSiteSearch(domainName, 100)); + searchOperator.doSiteSearch(domainName, domainId, 100)); } public record Docs(Map view, diff --git 
a/run/setup.sh b/run/setup.sh index 19d2305e..1e9fc1b4 100755 --- a/run/setup.sh +++ b/run/setup.sh @@ -66,8 +66,8 @@ fi download_model model/English.DICT https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/master/Models/POS/English.DICT download_model model/English.RDR https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/master/Models/POS/English.RDR -download_model model/opennlp-sentence.bin https://mirrors.estointernet.in/apache/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin -download_model model/opennlp-tokens.bin https://mirrors.estointernet.in/apache/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin +download_model model/opennlp-sentence.bin https://downloads.apache.org/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin +download_model model/opennlp-tokens.bin https://downloads.apache.org/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin download_model model/segments.bin https://downloads.marginalia.nu/model/segments.bin a2650796c77968b1bd9db0d7c01e3150 download_model model/tfreq-new-algo3.bin https://downloads.marginalia.nu/model/tfreq-new-algo3.bin a38f0809f983723001dfc784d88ebb6d download_model model/lid.176.ftz https://downloads.marginalia.nu/model/lid.176.ftz 340156704bb8c8e50c4abf35a7ec2569