Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-23 21:18:58 +00:00
Merge branch 'master' into term-positions
# Conflicts:
#	code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java
#	code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
commit 285e657f68
@@ -12,6 +12,7 @@ import nu.marginalia.atags.source.AnchorTagsSource;
 import nu.marginalia.atags.source.AnchorTagsSourceFactory;
 import nu.marginalia.crawl.retreival.CrawlDataReference;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
+import nu.marginalia.crawl.retreival.DomainLocks;
 import nu.marginalia.crawl.retreival.DomainProber;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
@@ -72,6 +73,8 @@ public class CrawlerMain extends ProcessMainClass {
     private final int node;
     private final SimpleBlockingThreadPool pool;
 
+    private final DomainLocks domainLocks = new DomainLocks();
+
     private final Map<String, String> processingIds = new ConcurrentHashMap<>();
 
     private final AbortMonitor abortMonitor = AbortMonitor.getInstance();
@@ -272,10 +275,16 @@ public class CrawlerMain extends ProcessMainClass {
                 Files.deleteIfExists(tempFile);
             }
 
+            var domainLock = domainLocks.getSemaphore(new EdgeDomain(specification.domain));
+
             try (var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now
                  var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, warcRecorder);
                  CrawlDataReference reference = getReference())
             {
+                // acquire the domain lock to prevent other threads from crawling the same domain,
+                // we release it at the end of the task to let them go ahead
+                Thread.currentThread().setName("crawling:" + domain + " [await domain lock]");
+                domainLock.acquire();
                 Thread.currentThread().setName("crawling:" + domain);
 
                 var domainLinks = anchorTagsSource.getAnchorTags(domain);
@@ -306,6 +315,9 @@ public class CrawlerMain extends ProcessMainClass {
                 logger.error("Error fetching domain " + domain, e);
             }
             finally {
+                // release the domain lock to permit other threads to crawl subdomains of this domain
+                domainLock.release();
+
                 // We don't need to double-count these; it's also kept int he workLog
                 processingIds.remove(domain);
                 Thread.currentThread().setName("[idle]");
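
Taken together, the CrawlerMain hunks add per-domain mutual exclusion around each crawl task: a Semaphore is looked up for the domain before the try-with-resources block, acquired once the WARC recorder and retriever are set up, and released in the finally block so the next task for the same domain can proceed. A minimal standalone sketch of that pattern, with illustrative names and an assumed default of 2 permits (only the semaphore-per-domain idea itself is taken from the diff):

    import java.util.Map;
    import java.util.concurrent.ConcurrentHashMap;
    import java.util.concurrent.Semaphore;

    class PerDomainLockSketch {
        // one semaphore per domain, created on demand
        private final Map<String, Semaphore> locks = new ConcurrentHashMap<>();

        void crawlTask(String domain) throws InterruptedException {
            Semaphore lock = locks.computeIfAbsent(domain, d -> new Semaphore(2));
            lock.acquire();        // wait until no other task is crawling this domain
            try {
                // ... fetch and record documents for the domain ...
            }
            finally {
                lock.release();    // let the next task for this domain go ahead
            }
        }
    }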
@@ -28,6 +28,7 @@ import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Objects;
+import java.util.concurrent.TimeUnit;
 
 public class CrawlerRetreiver implements AutoCloseable {
 
@@ -93,6 +94,10 @@ public class CrawlerRetreiver implements AutoCloseable {
                 new EdgeUrl("http", new EdgeDomain(domain), null, "/", null));
 
         try {
+            // Sleep a bit to avoid hammering the server with requests, we just probed it
+            TimeUnit.SECONDS.sleep(1);
+
+            // Fetch the domain
             return crawlDomain(oldCrawlData, probeResult, domainLinks);
         }
         catch (Exception ex) {
@@ -123,14 +128,16 @@ public class CrawlerRetreiver implements AutoCloseable {
         final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(rootUrl.domain, warcRecorder);
         final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay());
 
-        sniffRootDocument(rootUrl);
+        delayTimer.waitFetchDelay(0); // initial delay after robots.txt
+        sniffRootDocument(rootUrl, delayTimer);
+        delayTimer.waitFetchDelay(0); // delay after sniffing
 
         // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
         int recrawled = crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer);
 
         if (recrawled > 0) {
             // If we have reference data, we will always grow the crawl depth a bit
-            crawlFrontier.increaseDepth(1.5);
+            crawlFrontier.increaseDepth(1.5, 2500);
         }
 
         // Add external links to the crawl frontier
@@ -196,13 +203,28 @@ public class CrawlerRetreiver implements AutoCloseable {
         return fetchedCount;
     }
 
-    private void sniffRootDocument(EdgeUrl rootUrl) {
+    private void sniffRootDocument(EdgeUrl rootUrl, CrawlDelayTimer timer) {
         try {
             logger.debug("Configuring link filter");
 
             var url = rootUrl.withPathAndParam("/", null);
 
-            var result = fetcher.fetchContent(url, warcRecorder, ContentTags.empty());
+            HttpFetchResult result = null;
+
+            for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
+                try {
+                    result = fetcher.fetchContent(url, warcRecorder, ContentTags.empty());
+                    break;
+                }
+                catch (RateLimitException ex) {
+                    timer.waitRetryDelay(ex);
+                }
+                catch (Exception ex) {
+                    logger.warn("Failed to fetch {}", url, ex);
+                    result = new HttpFetchResult.ResultException(ex);
+                }
+            }
+
             if (!(result instanceof HttpFetchResult.ResultOk ok))
                 return;
 
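
The retry loop above makes at most HTTP_429_RETRY_LIMIT + 1 attempts: a RateLimitException waits out the retry delay and tries again, any other exception records a ResultException, and if no attempt ever assigns result it remains null, which the instanceof ResultOk check treats the same as a failed fetch. The general shape of the pattern, reduced to a self-contained helper with hypothetical names (Thread.sleep stands in for CrawlDelayTimer.waitRetryDelay):

    import java.util.Optional;
    import java.util.concurrent.Callable;

    final class BoundedRetrySketch {
        // Try a fetch up to maxRetries + 1 times, sleeping between failed attempts.
        static <T> Optional<T> fetchWithRetries(Callable<T> fetch, int maxRetries, long backoffMillis) {
            for (int i = 0; i <= maxRetries; i++) {
                try {
                    return Optional.of(fetch.call());
                }
                catch (Exception ex) {
                    try {
                        Thread.sleep(backoffMillis);
                    }
                    catch (InterruptedException ie) {
                        Thread.currentThread().interrupt();
                        break;
                    }
                }
            }
            return Optional.empty();   // the caller decides how to treat a missing result
        }
    }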
@@ -54,8 +54,14 @@ public class DomainCrawlFrontier {
      * than the number of already visited documents, the base depth will be adjusted
      * to the visited count first.
      */
-    public void increaseDepth(double depthIncreaseFactor) {
-        depth = (int)(Math.max(visited.size(), depth) * depthIncreaseFactor);
+    public void increaseDepth(double depthIncreaseFactor,
+                              int maxDepthIncreaseAbsolute
+    ) {
+        int base = Math.max(visited.size(), depth);
+
+        int scaledUp = (int)(base * depthIncreaseFactor);
+
+        depth = Math.min(base + maxDepthIncreaseAbsolute, scaledUp);
     }
 
     public void setLinkFilter(Predicate<EdgeUrl> linkFilter) {
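
The two-argument increaseDepth caps growth in absolute terms: the new depth is the smaller of base * depthIncreaseFactor and base + maxDepthIncreaseAbsolute, where base is the larger of the visited count and the current depth. A worked example with assumed numbers, using the factor 1.5 and cap 2500 from the call site above:

    int base  = 10_000;                                      // assumed max(visited.size(), depth)
    int depth = Math.min(base + 2_500, (int) (base * 1.5));  // = 12_500; the old formula gave 15_000
    int small = Math.min(200 + 2_500, (int) (200 * 1.5));    // = 300; the cap is inactive for small sites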
@@ -0,0 +1,45 @@
+package nu.marginalia.crawl.retreival;
+
+import nu.marginalia.model.EdgeDomain;
+
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.Semaphore;
+
+/** Holds lock objects for each domain, to prevent multiple threads from
+ * crawling the same domain at the same time.
+ */
+public class DomainLocks {
+    // The locks are stored in a map, with the domain name as the key. This map will grow
+    // relatively big, but should be manageable since the number of domains is limited to
+    // a few hundred thousand typically.
+    private final Map<String, Semaphore> locks = new ConcurrentHashMap<>();
+
+    /** Returns a lock object corresponding to the given domain. The object is returned as-is,
+     * and may be held by another thread. The caller is responsible for locking and releasing the lock.
+     */
+    public Semaphore getSemaphore(EdgeDomain domain) {
+        return locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits);
+    }
+
+    private Semaphore defaultPermits(String topDomain) {
+        if (topDomain.equals("wordpress.com"))
+            return new Semaphore(16);
+        if (topDomain.equals("blogspot.com"))
+            return new Semaphore(8);
+
+        if (topDomain.equals("neocities.org"))
+            return new Semaphore(4);
+        if (topDomain.equals("github.io"))
+            return new Semaphore(4);
+
+        if (topDomain.equals("substack.com")) {
+            return new Semaphore(1);
+        }
+        if (topDomain.endsWith(".edu")) {
+            return new Semaphore(1);
+        }
+
+        return new Semaphore(2);
+    }
+}
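
Because getSemaphore keys on domain.topDomain (lower-cased), every subdomain of a shared host contends for the same semaphore, which is what the per-host permit counts above are tuned for. A hedged usage sketch, assuming EdgeDomain resolves "alice.blogspot.com" to the top domain blogspot.com (the domain strings are examples only):

    DomainLocks domainLocks = new DomainLocks();

    // shares the single 8-permit semaphore for blogspot.com with every other *.blogspot.com task
    Semaphore lock = domainLocks.getSemaphore(new EdgeDomain("alice.blogspot.com"));

    lock.acquire();
    try {
        // ... crawl alice.blogspot.com ...
    }
    finally {
        lock.release();
    }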
@@ -38,6 +38,7 @@ public class CrawlerRevisitor {
         int recrawled = 0;
         int retained = 0;
         int errors = 0;
+        int skipped = 0;
 
         for (;;) {
             if (errors > 20) {
@@ -84,9 +85,32 @@ public class CrawlerRevisitor {
             }
 
 
-            if (recrawled > 5
-                    && retained > 0.9 * recrawled
-                    && Math.random() < 0.9)
+            double skipProb;
+
+            // calculate the probability of skipping this document based on the
+            // fraction of documents that haven't changed
+            if (recrawled > 0) {
+                skipProb = (double) retained / recrawled;
+
+                // If we've crawled a lot of documents, we'll be more conservative
+                // in trying to recrawl documents, to avoid hammering the server too much;
+                // in the case of a large change, we'll eventually catch it anyway
+
+                if (skipped + recrawled > 10_000) {
+                    skipProb = Math.clamp(skipProb, 0.75, 0.99);
+                } else if (skipped + recrawled > 1000) {
+                    skipProb = Math.clamp(skipProb, 0.5, 0.99);
+                } else {
+                    skipProb = Math.clamp(skipProb, 0, 0.95);
+                }
+
+            } else {
+                // If we haven't recrawled anything yet, we'll be more aggressive
+                // in trying to recrawl documents
+                skipProb = 0.25;
+            }
+
+            if (Math.random() < skipProb) //
             {
                 // Since it looks like most of these documents haven't changed,
                 // we'll load the documents directly; but we do this in a random
@@ -103,6 +127,8 @@ public class CrawlerRevisitor {
                         doc.documentBody,
                         new ContentTags(doc.etagMaybe, doc.lastModifiedMaybe)
                 );
+
+                skipped++;
             }
             else {
                 // GET the document with the stored document as a reference
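
To make the skip-probability bands concrete, a worked example with assumed counts (not from the diff): if 95 of 100 recrawled documents came back unchanged on a small crawl, skipProb is 0.95 and the [0, 0.95] clamp leaves it there, so roughly 95% of the remaining documents are replayed from the reference data rather than refetched; once skipped + recrawled passes 10_000, the floor rises to 0.75, so even a site where only 20% of documents are unchanged is still mostly replayed:

    double smallCrawl = Math.clamp(95 / 100.0, 0, 0.95);   // 0.95: mostly unchanged, mostly skipped
    double largeCrawl = Math.clamp(0.20, 0.75, 0.99);      // 0.75: heavy churn, but refetching is still throttled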
@@ -6,11 +6,14 @@ import lombok.SneakyThrows;
 import nu.marginalia.WebsiteUrl;
 import nu.marginalia.api.math.MathClient;
 import nu.marginalia.api.searchquery.QueryClient;
-import nu.marginalia.model.EdgeDomain;
-import nu.marginalia.db.DbDomainQueries;
 import nu.marginalia.api.searchquery.model.query.QueryResponse;
+import nu.marginalia.db.DbDomainQueries;
+import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.search.command.SearchParameters;
-import nu.marginalia.search.model.*;
+import nu.marginalia.search.model.ClusteredUrlDetails;
+import nu.marginalia.search.model.DecoratedSearchResults;
+import nu.marginalia.search.model.SearchFilters;
+import nu.marginalia.search.model.UrlDetails;
 import nu.marginalia.search.svc.SearchQueryIndexService;
 import nu.marginalia.search.svc.SearchUnitConversionService;
 import org.apache.logging.log4j.util.Strings;
@@ -65,9 +68,10 @@ public class SearchOperator {
     }
 
     public List<UrlDetails> doSiteSearch(String domain,
+                                         int domainId,
                                          int count) {
 
-        var queryParams = paramFactory.forSiteSearch(domain, count);
+        var queryParams = paramFactory.forSiteSearch(domain, domainId, count);
         var queryResponse = queryClient.search(queryParams);
 
         return searchQueryService.getResultsFromQuery(queryResponse);
@@ -1,12 +1,12 @@
 package nu.marginalia.search;
 
-import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier;
+import nu.marginalia.api.searchquery.model.query.QueryParams;
 import nu.marginalia.api.searchquery.model.query.SearchQuery;
+import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier;
 import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
 import nu.marginalia.index.query.limit.QueryLimits;
 import nu.marginalia.index.query.limit.QueryStrategy;
 import nu.marginalia.index.query.limit.SpecificationLimit;
-import nu.marginalia.api.searchquery.model.query.QueryParams;
 import nu.marginalia.search.command.SearchParameters;
 
 import java.util.List;
@@ -42,7 +42,7 @@ public class SearchQueryParamFactory {
 
     }
 
-    public QueryParams forSiteSearch(String domain, int count) {
+    public QueryParams forSiteSearch(String domain, int domainId, int count) {
         return new QueryParams("site:"+domain,
                 null,
                 List.of(),
@@ -53,7 +53,7 @@ public class SearchQueryParamFactory {
                 SpecificationLimit.none(),
                 SpecificationLimit.none(),
                 SpecificationLimit.none(),
-                List.of(),
+                List.of(domainId),
                 new QueryLimits(count, count, 100, 512),
                 SearchSetIdentifier.NONE.name(),
                 QueryStrategy.AUTO,
@@ -5,13 +5,13 @@ import nu.marginalia.api.domains.DomainInfoClient;
 import nu.marginalia.api.domains.model.DomainInformation;
 import nu.marginalia.api.domains.model.SimilarDomain;
 import nu.marginalia.db.DbDomainQueries;
+import nu.marginalia.feedlot.FeedlotClient;
 import nu.marginalia.feedlot.model.FeedItems;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.renderer.MustacheRenderer;
 import nu.marginalia.renderer.RendererFactory;
 import nu.marginalia.screenshot.ScreenshotService;
 import nu.marginalia.search.SearchOperator;
-import nu.marginalia.feedlot.FeedlotClient;
 import nu.marginalia.search.model.UrlDetails;
 import nu.marginalia.search.svc.SearchFlagSiteService.FlagSiteFormData;
 import org.slf4j.Logger;
@@ -153,7 +153,7 @@ public class SearchSiteInfoService {
             linkingDomainsFuture = domainInfoClient.linkedDomains(domainId, 25);
         }
 
-        List<UrlDetails> sampleResults = searchOperator.doSiteSearch(domainName, 5);
+        List<UrlDetails> sampleResults = searchOperator.doSiteSearch(domainName, domainId, 5);
         if (!sampleResults.isEmpty()) {
             url = sampleResults.getFirst().url.withPathAndParam("/", null).toString();
         }
@@ -195,9 +195,10 @@ public class SearchSiteInfoService {
     }
 
     private Docs listDocs(String domainName) {
+        int domainId = domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1);
         return new Docs(domainName,
                 domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1),
-                searchOperator.doSiteSearch(domainName, 100));
+                searchOperator.doSiteSearch(domainName, domainId, 100));
     }
 
     public record Docs(Map<String, Boolean> view,
@@ -66,8 +66,8 @@ fi
 
 download_model model/English.DICT https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/master/Models/POS/English.DICT
 download_model model/English.RDR https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/master/Models/POS/English.RDR
-download_model model/opennlp-sentence.bin https://mirrors.estointernet.in/apache/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin
-download_model model/opennlp-tokens.bin https://mirrors.estointernet.in/apache/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin
+download_model model/opennlp-sentence.bin https://downloads.apache.org/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin
+download_model model/opennlp-tokens.bin https://downloads.apache.org/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin
 download_model model/segments.bin https://downloads.marginalia.nu/model/segments.bin a2650796c77968b1bd9db0d7c01e3150
 download_model model/tfreq-new-algo3.bin https://downloads.marginalia.nu/model/tfreq-new-algo3.bin a38f0809f983723001dfc784d88ebb6d
 download_model model/lid.176.ftz https://downloads.marginalia.nu/model/lid.176.ftz 340156704bb8c8e50c4abf35a7ec2569