mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
Faster crawling
This commit is contained in:
parent
ce09fce639
commit
ffde8c8305
@ -29,11 +29,17 @@ public class CrawlerMain implements AutoCloseable {
|
||||
new SynchronousQueue<>(), Util.threadFactory("OkHttp Dispatcher", true)));
|
||||
|
||||
private final UserAgent userAgent;
|
||||
private final ThreadPoolExecutor pool;
|
||||
final int poolSize = 256;
|
||||
final int poolQueueSize = 32;
|
||||
|
||||
/**
 * Creates a crawler for the given plan and sets up the worker pool used to fetch domains.
 *
 * <p>The constructor stores the plan, reads the user agent from {@code WmsaHome}, builds the
 * bounded task queue and thread pool, and finally obtains the crawl work log and the crawl
 * data directory from the plan.
 *
 * @param plan the crawl plan that supplies the work log and the crawl data directory
 * @throws Exception declared by the signature; NOTE(review): presumably propagated from
 *         {@code plan.createCrawlWorkLog()} -- confirm against that method
 */
public CrawlerMain(EdgeCrawlPlan plan) throws Exception {
|
||||
this.plan = plan;
|
||||
this.userAgent = WmsaHome.getUserAgent();
|
||||
|
||||
// Bounded hand-off queue: holds at most poolQueueSize (32) pending tasks.
BlockingQueue<Runnable> queue = new LinkedBlockingQueue<>(poolQueueSize);
|
||||
// Core size is poolSize/128 (2 threads for poolSize = 256), growing up to poolSize threads;
// idle threads above the core size are retired after 5 minutes. With a bounded queue,
// ThreadPoolExecutor only starts threads beyond the core size once the queue is full.
pool = new ThreadPoolExecutor(poolSize/128, poolSize, 5, TimeUnit.MINUTES, queue); // maybe need to set -Xss for JVM to deal with this?
|
||||
|
||||
workLog = plan.createCrawlWorkLog();
|
||||
crawlDataDir = plan.crawl.getDir();
|
||||
}
|
||||
@ -84,31 +90,44 @@ public class CrawlerMain implements AutoCloseable {
|
||||
|
||||
logger.info("Let's go");
|
||||
|
||||
final int poolSize = 1024;
|
||||
|
||||
BlockingQueue<Runnable> queue = new LinkedBlockingQueue<>(10);
|
||||
ThreadPoolExecutor pool = new ThreadPoolExecutor(poolSize/128, poolSize, 5, TimeUnit.MINUTES, queue); // maybe need to set -Xss for JVM to deal with this?
|
||||
|
||||
AbortMonitor abortMonitor = AbortMonitor.getInstance();
|
||||
|
||||
|
||||
Semaphore taskSem = new Semaphore(poolSize);
|
||||
|
||||
plan.forEachCrawlingSpecification(spec -> {
|
||||
if (abortMonitor.isAlive()) {
|
||||
pool.execute(() -> fetchDomain(spec));
|
||||
try {
|
||||
taskSem.acquire();
|
||||
} catch (InterruptedException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
pool.execute(() -> {
|
||||
try {
|
||||
fetchDomain(spec);
|
||||
}
|
||||
finally {
|
||||
taskSem.release();
|
||||
}
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
logger.info("Awaiting termination");
|
||||
|
||||
pool.shutdown();
|
||||
|
||||
while (!pool.awaitTermination(1, TimeUnit.SECONDS));
|
||||
|
||||
logger.info("All finished");
|
||||
}
|
||||
|
||||
/**
 * Shuts the crawler down: stops accepting new tasks, waits for the worker pool to drain,
 * then closes the work log and stops the HTTP dispatcher's executor.
 *
 * <p>NOTE(review): the cleanup steps are not wrapped in try/finally, so an exception from
 * {@code awaitTermination} (e.g. interruption) or from {@code workLog.close()} skips the
 * steps that follow it.
 *
 * @throws Exception if waiting for the pool is interrupted or closing the work log fails
 */
public void close() throws Exception {
|
||||
logger.info("Awaiting termination");
|
||||
// Orderly shutdown: already-submitted tasks still run, new submissions are rejected.
pool.shutdown();
|
||||
|
||||
// Empty-bodied loop on purpose: re-check once per second, with no upper time limit,
// until every task has finished.
while (!pool.awaitTermination(1, TimeUnit.SECONDS));
|
||||
logger.info("All finished");
|
||||
|
||||
workLog.close();
|
||||
// Unlike the worker pool above, the dispatcher's executor is stopped immediately
// rather than drained.
dispatcher.executorService().shutdownNow();
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -83,8 +83,6 @@ public class CrawlerRetreiver {
|
||||
}
|
||||
|
||||
public int fetch() throws IOException {
|
||||
logger.info("Fetching {}", domain);
|
||||
|
||||
Optional<CrawledDomain> probeResult = probeDomainForProblems(domain);
|
||||
|
||||
if (probeResult.isPresent()) {
|
||||
@ -272,10 +270,10 @@ public class CrawlerRetreiver {
|
||||
@SneakyThrows
|
||||
private void delay(long crawlDelay, long timeParsed) {
|
||||
if (crawlDelay >= 1) {
|
||||
if (timeParsed/1000 > crawlDelay)
|
||||
if (timeParsed > crawlDelay)
|
||||
return;
|
||||
|
||||
Thread.sleep(Math.min(1000*crawlDelay-timeParsed, 5000));
|
||||
Thread.sleep(Math.min(crawlDelay-timeParsed, 5000));
|
||||
}
|
||||
else {
|
||||
if (timeParsed > DEFAULT_CRAWL_DELAY_MS)
|
||||
|
Loading…
Reference in New Issue
Block a user