Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git
Synced 2025-02-23 21:18:58 +00:00
(crawler) Fix bug: poor handling of duplicate ids
* Also clean up the code a bit
parent dbb758d1a8
commit f03146de4b
@@ -18,6 +18,8 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.nio.file.Path;
+import java.util.HashSet;
+import java.util.Set;
 import java.util.concurrent.*;
 
 public class CrawlerMain implements AutoCloseable {
@@ -38,6 +40,8 @@ public class CrawlerMain implements AutoCloseable {
     final int poolSize = Integer.getInteger("crawler.pool-size", 512);
     final int poolQueueSize = 32;
 
+    private final Set<String> processedIds = new HashSet<>();
+
     AbortMonitor abortMonitor = AbortMonitor.getInstance();
     Semaphore taskSem = new Semaphore(poolSize);
 
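An aside on the pool-size line above, for readers unfamiliar with the idiom: Integer.getInteger reads a numeric JVM system property (not an environment variable), so crawler.pool-size is tuned with a -D flag at launch. A minimal sketch of the same lookup (the class name is illustrative, not from the repository):

import java.util.Objects;

public class PoolSizeDemo {
    public static void main(String[] args) {
        // Integer.getInteger reads the system property "crawler.pool-size"
        // and returns the fallback (512) when it is unset or not a number.
        int poolSize = Objects.requireNonNull(
                Integer.getInteger("crawler.pool-size", 512));
        System.out.println("poolSize = " + poolSize);
    }
}

Running java -Dcrawler.pool-size=128 PoolSizeDemo prints poolSize = 128; running it without the flag prints poolSize = 512.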
@@ -87,11 +91,27 @@ public class CrawlerMain implements AutoCloseable {
 
         logger.info("Let's go");
 
+        // TODO: Make this into an iterable instead so we can abort it
         plan.forEachCrawlingSpecification(this::startCrawlTask);
     }
 
+
     private void startCrawlTask(CrawlingSpecification crawlingSpecification) {
-        if (abortMonitor.isAlive()) {
+
+        if (!processedIds.add(crawlingSpecification.id)) {
+
+            // This is a duplicate id, so we ignore it. Otherwise we'd end crawling the same site twice,
+            // and if we're really unlucky, we might end up writing to the same output file from multiple
+            // threads with complete bit salad as a result.
+
+            logger.error("Ignoring duplicate id: {}", crawlingSpecification.id);
+            return;
+        }
+
+        if (!abortMonitor.isAlive()) {
+            return;
+        }
+
         try {
             taskSem.acquire();
         } catch (InterruptedException e) {
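The fix hinges on Set.add returning false when the element is already present, so the membership test and the insertion happen in one step with no check-then-act gap. A minimal sketch of the same guard pattern (class and method names are illustrative, not from the commit):

import java.util.HashSet;
import java.util.Set;

public class DedupGuardDemo {
    private final Set<String> processedIds = new HashSet<>();

    // Returns true if the id was accepted, false if it was a duplicate.
    boolean accept(String id) {
        // Set.add returns false when the set already contains the element,
        // so this single call both checks for and records the id.
        if (!processedIds.add(id)) {
            System.err.println("Ignoring duplicate id: " + id);
            return false;
        }
        return true;
    }

    public static void main(String[] args) {
        DedupGuardDemo demo = new DedupGuardDemo();
        System.out.println(demo.accept("site-1")); // true
        System.out.println(demo.accept("site-1")); // false: duplicate
    }
}

Note that HashSet is not thread-safe; the pattern is only safe here as long as startCrawlTask is invoked from a single thread, which plan.forEachCrawlingSpecification appears to do in this diff.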
@@ -107,7 +127,6 @@ public class CrawlerMain implements AutoCloseable {
                 }
             });
-        }
     }
 
     private void fetchDomain(CrawlingSpecification specification) {
         if (workLog.isJobFinished(specification.id))
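The taskSem.acquire() before handing work to the pool is a back-pressure idiom: the semaphore caps in-flight tasks at poolSize, so the submitting thread blocks instead of building an unbounded queue. The matching release() is presumably in the task's finally block, which this diff truncates. A self-contained sketch of the pattern under that assumption (names and task bodies are illustrative):

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Semaphore;

public class ThrottledSubmitDemo {
    public static void main(String[] args) throws InterruptedException {
        int poolSize = 4;
        ExecutorService pool = Executors.newFixedThreadPool(poolSize);
        // At most poolSize tasks may be in flight; submitters block on acquire().
        Semaphore taskSem = new Semaphore(poolSize);

        for (int i = 0; i < 20; i++) {
            final int id = i;
            taskSem.acquire(); // blocks when the pool is saturated
            pool.execute(() -> {
                try {
                    System.out.println("crawling task " + id);
                } finally {
                    taskSem.release(); // always free the permit, even on failure
                }
            });
        }
        pool.shutdown();
    }
}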