Merge branch 'master' into term-positions

# Conflicts:
#	code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java
#	code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
Viktor Lofgren 2024-07-31 10:44:01 +02:00
commit 285e657f68
9 changed files with 138 additions and 22 deletions

code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java

@ -12,6 +12,7 @@ import nu.marginalia.atags.source.AnchorTagsSource;
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
import nu.marginalia.crawl.retreival.CrawlDataReference;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.DomainLocks;
import nu.marginalia.crawl.retreival.DomainProber;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
@ -72,6 +73,8 @@ public class CrawlerMain extends ProcessMainClass {
private final int node;
private final SimpleBlockingThreadPool pool;
private final DomainLocks domainLocks = new DomainLocks();
private final Map<String, String> processingIds = new ConcurrentHashMap<>();
private final AbortMonitor abortMonitor = AbortMonitor.getInstance();
@ -272,10 +275,16 @@ public class CrawlerMain extends ProcessMainClass {
Files.deleteIfExists(tempFile);
}
var domainLock = domainLocks.getSemaphore(new EdgeDomain(specification.domain));
try (var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now
var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, warcRecorder);
CrawlDataReference reference = getReference())
{
// acquire the domain lock to prevent other threads from crawling the same domain,
// we release it at the end of the task to let them go ahead
Thread.currentThread().setName("crawling:" + domain + " [await domain lock]");
domainLock.acquire();
Thread.currentThread().setName("crawling:" + domain);
var domainLinks = anchorTagsSource.getAnchorTags(domain);
@ -306,6 +315,9 @@ public class CrawlerMain extends ProcessMainClass {
logger.error("Error fetching domain " + domain, e);
}
finally {
// release the domain lock to permit other threads to crawl subdomains of this domain
domainLock.release();
// We don't need to double-count these; it's also kept in the workLog
processingIds.remove(domain);
Thread.currentThread().setName("[idle]");
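
The locking discipline above is the standard semaphore pattern: look the semaphore up, block on acquire before doing any work, and release in a finally block so an exception mid-crawl can never strand the permit. A minimal sketch of the same shape, with illustrative names rather than the actual CrawlerMain members:

    import java.util.Map;
    import java.util.concurrent.ConcurrentHashMap;
    import java.util.concurrent.Semaphore;

    class DomainCrawlTask {
        private final Map<String, Semaphore> locks = new ConcurrentHashMap<>();

        void crawl(String domain) throws InterruptedException {
            // One semaphore per domain; the permit count bounds concurrent crawlers
            Semaphore lock = locks.computeIfAbsent(domain, d -> new Semaphore(2));
            Thread.currentThread().setName("crawling:" + domain + " [await domain lock]");
            lock.acquire();
            try {
                Thread.currentThread().setName("crawling:" + domain);
                // ... fetch and process the domain ...
            }
            finally {
                lock.release(); // always runs, even if the crawl throws
                Thread.currentThread().setName("[idle]");
            }
        }
    }

In the sketch, acquiring before entering the try block ensures the finally clause only releases a permit that was actually taken.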

code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java

@ -28,6 +28,7 @@ import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.concurrent.TimeUnit;
public class CrawlerRetreiver implements AutoCloseable {
@ -93,6 +94,10 @@ public class CrawlerRetreiver implements AutoCloseable {
new EdgeUrl("http", new EdgeDomain(domain), null, "/", null));
try {
// Sleep a bit to avoid hammering the server with requests; we just probed it
TimeUnit.SECONDS.sleep(1);
// Fetch the domain
return crawlDomain(oldCrawlData, probeResult, domainLinks);
}
catch (Exception ex) {
@ -123,14 +128,16 @@ public class CrawlerRetreiver implements AutoCloseable {
final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(rootUrl.domain, warcRecorder);
final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay());
sniffRootDocument(rootUrl);
delayTimer.waitFetchDelay(0); // initial delay after robots.txt
sniffRootDocument(rootUrl, delayTimer);
delayTimer.waitFetchDelay(0); // delay after sniffing
// Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
int recrawled = crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer);
if (recrawled > 0) {
// If we have reference data, we will always grow the crawl depth a bit
crawlFrontier.increaseDepth(1.5);
crawlFrontier.increaseDepth(1.5, 2500);
}
// Add external links to the crawl frontier
@ -196,13 +203,28 @@ public class CrawlerRetreiver implements AutoCloseable {
return fetchedCount;
}
private void sniffRootDocument(EdgeUrl rootUrl) {
private void sniffRootDocument(EdgeUrl rootUrl, CrawlDelayTimer timer) {
try {
logger.debug("Configuring link filter");
var url = rootUrl.withPathAndParam("/", null);
var result = fetcher.fetchContent(url, warcRecorder, ContentTags.empty());
HttpFetchResult result = null;
for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
try {
result = fetcher.fetchContent(url, warcRecorder, ContentTags.empty());
break;
}
catch (RateLimitException ex) {
timer.waitRetryDelay(ex);
}
catch (Exception ex) {
logger.warn("Failed to fetch {}", url, ex);
result = new HttpFetchResult.ResultException(ex);
}
}
if (!(result instanceof HttpFetchResult.ResultOk ok))
return;
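
The new fetch loop in sniffRootDocument retries up to HTTP_429_RETRY_LIMIT times, but only on the rate-limit signal; any other failure is recorded as a result object and the loop ends. The same shape can be factored into a generic helper. This is a sketch under assumed names (RateLimited stands in for the project's RateLimitException), not the project's API:

    import java.util.concurrent.Callable;
    import java.util.function.Consumer;

    class RetrySketch {
        /** Illustrative stand-in for the crawler's RateLimitException. */
        static class RateLimited extends Exception {}

        static <T> T fetchWithRetry(int retryLimit,
                                    Callable<T> fetch,
                                    Consumer<RateLimited> backoff) throws Exception {
            RateLimited last = null;
            for (int i = 0; i <= retryLimit; i++) {
                try {
                    return fetch.call();     // success: stop retrying
                }
                catch (RateLimited ex) {     // HTTP 429: back off, then try again
                    backoff.accept(ex);
                    last = ex;
                }
                // any other exception propagates immediately; only 429s are retried
            }
            throw last;                      // retry budget exhausted
        }
    }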

DomainCrawlFrontier.java

@ -54,8 +54,14 @@ public class DomainCrawlFrontier {
* than the number of already visited documents, the base depth will be adjusted
* to the visited count first.
*/
public void increaseDepth(double depthIncreaseFactor) {
depth = (int)(Math.max(visited.size(), depth) * depthIncreaseFactor);
public void increaseDepth(double depthIncreaseFactor,
int maxDepthIncreaseAbsolute
) {
int base = Math.max(visited.size(), depth);
int scaledUp = (int)(base * depthIncreaseFactor);
depth = Math.min(base + maxDepthIncreaseAbsolute, scaledUp);
}
public void setLinkFilter(Predicate<EdgeUrl> linkFilter) {
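
Worked through with concrete numbers: with 1,000 visited documents, a current depth of 800, and the factor 1.5 the crawler now passes, the base is max(1000, 800) = 1000 and the scaled value 1500; the cap of base + 2500 = 3500 doesn't bind, so the depth becomes 1500. A factor large enough to scale past the cap would be clamped to 3500 instead. The same arithmetic as a runnable snippet with illustrative counts:

    public class DepthExample {
        public static void main(String[] args) {
            int visited = 1000, depth = 800;            // illustrative counts
            int base = Math.max(visited, depth);        // 1000
            int scaledUp = (int) (base * 1.5);          // 1500
            depth = Math.min(base + 2500, scaledUp);    // min(3500, 1500) -> 1500
            System.out.println(depth);                  // prints 1500
        }
    }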

code/processes/crawling-process/java/nu/marginalia/crawl/retreival/DomainLocks.java (new file)

@ -0,0 +1,45 @@
package nu.marginalia.crawl.retreival;
import nu.marginalia.model.EdgeDomain;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Semaphore;
/** Holds lock objects for each domain, to prevent multiple threads from
* crawling the same domain at the same time.
*/
public class DomainLocks {
// The locks are stored in a map, with the domain name as the key. This map will grow
// relatively big, but should be manageable since the number of domains is limited to
// a few hundred thousand typically.
private final Map<String, Semaphore> locks = new ConcurrentHashMap<>();
/** Returns a lock object corresponding to the given domain. The object is returned as-is,
* and may be held by another thread. The caller is responsible for acquiring and releasing the lock.
*/
public Semaphore getSemaphore(EdgeDomain domain) {
return locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits);
}
private Semaphore defaultPermits(String topDomain) {
if (topDomain.equals("wordpress.com"))
return new Semaphore(16);
if (topDomain.equals("blogspot.com"))
return new Semaphore(8);
if (topDomain.equals("neocities.org"))
return new Semaphore(4);
if (topDomain.equals("github.io"))
return new Semaphore(4);
if (topDomain.equals("substack.com")) {
return new Semaphore(1);
}
if (topDomain.endsWith(".edu")) {
return new Semaphore(1);
}
return new Semaphore(2);
}
}
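
Note that the map is keyed on topDomain, so every subdomain of a shared host resolves to the same semaphore: the permit counts above bound concurrency per hosting provider, not per site. A quick illustration, assuming EdgeDomain parses "alice.blogspot.com" with top domain blogspot.com:

    DomainLocks locks = new DomainLocks();

    Semaphore a = locks.getSemaphore(new EdgeDomain("alice.blogspot.com"));
    Semaphore b = locks.getSemaphore(new EdgeDomain("bob.blogspot.com"));

    // Same top domain, same semaphore: at most 8 blogspot.com
    // subdomains are crawled concurrently across the whole crawler.
    assert a == b;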

CrawlerRevisitor.java

@ -38,6 +38,7 @@ public class CrawlerRevisitor {
int recrawled = 0;
int retained = 0;
int errors = 0;
int skipped = 0;
for (;;) {
if (errors > 20) {
@ -84,9 +85,32 @@ public class CrawlerRevisitor {
}
if (recrawled > 5
&& retained > 0.9 * recrawled
&& Math.random() < 0.9)
double skipProb;
// calculate the probability of skipping this document based on the
// fraction of documents that haven't changed
if (recrawled > 0) {
skipProb = (double) retained / recrawled;
// If we've crawled a lot of documents, we'll be more conservative
// in trying to recrawl documents, to avoid hammering the server too much;
// in the case of a large change, we'll eventually catch it anyway
if (skipped + recrawled > 10_000) {
skipProb = Math.clamp(skipProb, 0.75, 0.99);
} else if (skipped + recrawled > 1000) {
skipProb = Math.clamp(skipProb, 0.5, 0.99);
} else {
skipProb = Math.clamp(skipProb, 0, 0.95);
}
} else {
// If we haven't recrawled anything yet, we'll be more aggressive
// in trying to recrawl documents
skipProb = 0.25;
}
if (Math.random() < skipProb)
{
// Since it looks like most of these documents haven't changed,
// we'll load the documents directly; but we do this in a random
@ -103,6 +127,8 @@ public class CrawlerRevisitor {
doc.documentBody,
new ContentTags(doc.etagMaybe, doc.lastModifiedMaybe)
);
skipped++;
}
else {
// GET the document with the stored document as a reference
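
The skip-probability logic above, restated as a pure function with worked values (a readability sketch; the diff computes this inline):

    // Probability of replaying a document from the old crawl data
    // instead of refetching it.
    static double skipProb(int retained, int recrawled, int skipped) {
        if (recrawled == 0)
            return 0.25;                    // no signal yet: mostly refetch
        double p = (double) retained / recrawled;
        int processed = skipped + recrawled;
        if (processed > 10_000) return Math.clamp(p, 0.75, 0.99);
        if (processed > 1_000)  return Math.clamp(p, 0.5, 0.99);
        return Math.clamp(p, 0, 0.95);
    }

    // skipProb(90, 100, 50)     -> 0.90  small site, mostly unchanged
    // skipProb(60, 100, 11_900) -> 0.75  large site: the floor kicks in, so on
    //                                    average at most ~25% of documents are refetched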

SearchOperator.java

@ -6,11 +6,14 @@ import lombok.SneakyThrows;
import nu.marginalia.WebsiteUrl;
import nu.marginalia.api.math.MathClient;
import nu.marginalia.api.searchquery.QueryClient;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.api.searchquery.model.query.QueryResponse;
import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.search.command.SearchParameters;
import nu.marginalia.search.model.*;
import nu.marginalia.search.model.ClusteredUrlDetails;
import nu.marginalia.search.model.DecoratedSearchResults;
import nu.marginalia.search.model.SearchFilters;
import nu.marginalia.search.model.UrlDetails;
import nu.marginalia.search.svc.SearchQueryIndexService;
import nu.marginalia.search.svc.SearchUnitConversionService;
import org.apache.logging.log4j.util.Strings;
@ -65,9 +68,10 @@ public class SearchOperator {
}
public List<UrlDetails> doSiteSearch(String domain,
int domainId,
int count) {
var queryParams = paramFactory.forSiteSearch(domain, count);
var queryParams = paramFactory.forSiteSearch(domain, domainId, count);
var queryResponse = queryClient.search(queryParams);
return searchQueryService.getResultsFromQuery(queryResponse);

SearchQueryParamFactory.java

@ -1,12 +1,12 @@
package nu.marginalia.search;
import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier;
import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.index.query.limit.QueryLimits;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.search.command.SearchParameters;
import java.util.List;
@ -42,7 +42,7 @@ public class SearchQueryParamFactory {
}
public QueryParams forSiteSearch(String domain, int count) {
public QueryParams forSiteSearch(String domain, int domainId, int count) {
return new QueryParams("site:"+domain,
null,
List.of(),
@ -53,7 +53,7 @@ public class SearchQueryParamFactory {
SpecificationLimit.none(),
SpecificationLimit.none(),
SpecificationLimit.none(),
List.of(),
List.of(domainId),
new QueryLimits(count, count, 100, 512),
SearchSetIdentifier.NONE.name(),
QueryStrategy.AUTO,

SearchSiteInfoService.java

@ -5,13 +5,13 @@ import nu.marginalia.api.domains.DomainInfoClient;
import nu.marginalia.api.domains.model.DomainInformation;
import nu.marginalia.api.domains.model.SimilarDomain;
import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.feedlot.FeedlotClient;
import nu.marginalia.feedlot.model.FeedItems;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.renderer.MustacheRenderer;
import nu.marginalia.renderer.RendererFactory;
import nu.marginalia.screenshot.ScreenshotService;
import nu.marginalia.search.SearchOperator;
import nu.marginalia.feedlot.FeedlotClient;
import nu.marginalia.search.model.UrlDetails;
import nu.marginalia.search.svc.SearchFlagSiteService.FlagSiteFormData;
import org.slf4j.Logger;
@ -153,7 +153,7 @@ public class SearchSiteInfoService {
linkingDomainsFuture = domainInfoClient.linkedDomains(domainId, 25);
}
List<UrlDetails> sampleResults = searchOperator.doSiteSearch(domainName, 5);
List<UrlDetails> sampleResults = searchOperator.doSiteSearch(domainName, domainId, 5);
if (!sampleResults.isEmpty()) {
url = sampleResults.getFirst().url.withPathAndParam("/", null).toString();
}
@ -195,9 +195,10 @@ public class SearchSiteInfoService {
}
private Docs listDocs(String domainName) {
int domainId = domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1);
return new Docs(domainName,
domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1),
searchOperator.doSiteSearch(domainName, 100));
searchOperator.doSiteSearch(domainName, domainId, 100));
}
public record Docs(Map<String, Boolean> view,

run/setup.sh (model download script)

@ -66,8 +66,8 @@ fi
download_model model/English.DICT https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/master/Models/POS/English.DICT
download_model model/English.RDR https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/master/Models/POS/English.RDR
download_model model/opennlp-sentence.bin https://mirrors.estointernet.in/apache/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin
download_model model/opennlp-tokens.bin https://mirrors.estointernet.in/apache/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin
download_model model/opennlp-sentence.bin https://downloads.apache.org/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin
download_model model/opennlp-tokens.bin https://downloads.apache.org/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin
download_model model/segments.bin https://downloads.marginalia.nu/model/segments.bin a2650796c77968b1bd9db0d7c01e3150
download_model model/tfreq-new-algo3.bin https://downloads.marginalia.nu/model/tfreq-new-algo3.bin a38f0809f983723001dfc784d88ebb6d
download_model model/lid.176.ftz https://downloads.marginalia.nu/model/lid.176.ftz 340156704bb8c8e50c4abf35a7ec2569