(crawler) Fix bug: poor handling of duplicate ids

* Also clean up the code a bit
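The fix boils down to recording each crawling specification id in a Set and skipping any id that has already been seen, relying on Set.add returning false for an element that is already present. A minimal, standalone sketch of that guard follows; the class name and sample ids are illustrative only, not part of the commit:

import java.util.HashSet;
import java.util.Set;

public class DuplicateIdSketch {
    public static void main(String[] args) {
        // processedIds mirrors the Set<String> the commit adds to CrawlerMain;
        // everything else here (class name, sample ids) is hypothetical.
        Set<String> processedIds = new HashSet<>();

        for (String id : new String[] { "spec-1", "spec-2", "spec-1" }) {
            // Set.add returns false when the id is already present, so a single
            // call both records the id and detects the duplicate.
            if (!processedIds.add(id)) {
                System.out.println("Ignoring duplicate id: " + id);
                continue;
            }
            System.out.println("Starting crawl task for: " + id);
        }
    }
}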
Viktor Lofgren 2023-07-07 19:56:14 +02:00
parent dbb758d1a8
commit f03146de4b


@@ -18,6 +18,8 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.nio.file.Path;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.*;
public class CrawlerMain implements AutoCloseable {
@@ -38,6 +40,8 @@ public class CrawlerMain implements AutoCloseable {
final int poolSize = Integer.getInteger("crawler.pool-size", 512);
final int poolQueueSize = 32;
private final Set<String> processedIds = new HashSet<>();
AbortMonitor abortMonitor = AbortMonitor.getInstance();
Semaphore taskSem = new Semaphore(poolSize);
@@ -87,11 +91,27 @@ public class CrawlerMain implements AutoCloseable {
logger.info("Let's go");
// TODO: Make this into an iterable instead so we can abort it
plan.forEachCrawlingSpecification(this::startCrawlTask);
}
private void startCrawlTask(CrawlingSpecification crawlingSpecification) {
if (abortMonitor.isAlive()) {
if (!processedIds.add(crawlingSpecification.id)) {
// This is a duplicate id, so we ignore it. Otherwise we'd end up crawling the same site twice,
// and if we're really unlucky, we might end up writing to the same output file from multiple
// threads with complete bit salad as a result.
logger.error("Ignoring duplicate id: {}", crawlingSpecification.id);
return;
}
if (!abortMonitor.isAlive()) {
return;
}
try {
taskSem.acquire();
} catch (InterruptedException e) {
@@ -107,7 +127,6 @@ public class CrawlerMain implements AutoCloseable {
}
});
}
}
private void fetchDomain(CrawlingSpecification specification) {
if (workLog.isJobFinished(specification.id))