From f03146de4b392028cb3a6773ff1615d915839487 Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Fri, 7 Jul 2023 19:56:14 +0200
Subject: [PATCH] (crawler) Fix bug: poor handling of duplicate ids

* Also clean up the code a bit
---
 .../java/nu/marginalia/crawl/CrawlerMain.java | 49 +++++++++++++------
 1 file changed, 34 insertions(+), 15 deletions(-)

diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java
index 224087f6..cbd9513a 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java
@@ -18,6 +18,8 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.nio.file.Path;
+import java.util.HashSet;
+import java.util.Set;
 import java.util.concurrent.*;
 
 public class CrawlerMain implements AutoCloseable {
@@ -38,6 +40,8 @@ public class CrawlerMain implements AutoCloseable {
     final int poolSize = Integer.getInteger("crawler.pool-size", 512);
     final int poolQueueSize = 32;
 
+    private final Set<String> processedIds = new HashSet<>();
+
     AbortMonitor abortMonitor = AbortMonitor.getInstance();
 
     Semaphore taskSem = new Semaphore(poolSize);
@@ -87,26 +91,41 @@ public class CrawlerMain implements AutoCloseable {
 
         logger.info("Let's go");
 
+        // TODO: Make this into an iterable instead so we can abort it
         plan.forEachCrawlingSpecification(this::startCrawlTask);
     }
 
 
-    private void startCrawlTask(CrawlingSpecification crawlingSpecification) {
-        if (abortMonitor.isAlive()) {
-            try {
-                taskSem.acquire();
-            } catch (InterruptedException e) {
-                throw new RuntimeException(e);
-            }
-            pool.execute(() -> {
-                try {
-                    fetchDomain(crawlingSpecification);
-                }
-                finally {
-                    taskSem.release();
-                }
-            });
+    private void startCrawlTask(CrawlingSpecification crawlingSpecification) {
+
+        if (!processedIds.add(crawlingSpecification.id)) {
+
+            // This is a duplicate id, so we ignore it. Otherwise we'd end up crawling the same site twice,
+            // and if we're really unlucky, we might end up writing to the same output file from multiple
+            // threads with complete bit salad as a result.
+
+            logger.error("Ignoring duplicate id: {}", crawlingSpecification.id);
+            return;
         }
+
+        if (!abortMonitor.isAlive()) {
+            return;
+        }
+
+        try {
+            taskSem.acquire();
+        } catch (InterruptedException e) {
+            throw new RuntimeException(e);
+        }
+
+        pool.execute(() -> {
+            try {
+                fetchDomain(crawlingSpecification);
+            }
+            finally {
+                taskSem.release();
+            }
+        });
     }
 
     private void fetchDomain(CrawlingSpecification specification) {
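
The change above rests on two small, reusable mechanisms: Set.add() returns false when the element is already present, so a single call both records an id and detects a repeat, and a Semaphore acquired before handing work to the pool bounds the number of in-flight crawl tasks even though the dispatch loop itself is single-threaded. The sketch below shows the same shape in isolation; it is only an illustration of the pattern, and the names DedupingDispatcher, submit and doWork are made up for this example rather than taken from the crawler.

import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;

public class DedupingDispatcher implements AutoCloseable {
    // Ids we have already dispatched; only touched from the dispatch thread.
    private final Set<String> processedIds = new HashSet<>();

    // Cap on the number of tasks running at once (4 here, 512 in the crawler).
    private final Semaphore taskSem = new Semaphore(4);

    private final ExecutorService pool = Executors.newCachedThreadPool();

    public void submit(String id) {
        // Set.add() returns false if the id was already present,
        // so this records and detects duplicates in one call.
        if (!processedIds.add(id)) {
            System.err.println("Ignoring duplicate id: " + id);
            return;
        }

        try {
            taskSem.acquire();   // block the dispatcher until a slot frees up
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }

        pool.execute(() -> {
            try {
                doWork(id);
            }
            finally {
                taskSem.release();   // free the slot whether the task succeeded or not
            }
        });
    }

    private void doWork(String id) {
        System.out.println("Processing " + id);
    }

    @Override
    public void close() throws InterruptedException {
        pool.shutdown();
        pool.awaitTermination(1, TimeUnit.MINUTES);
    }

    public static void main(String[] args) throws Exception {
        try (DedupingDispatcher dispatcher = new DedupingDispatcher()) {
            dispatcher.submit("site-a");
            dispatcher.submit("site-b");
            dispatcher.submit("site-a");   // duplicate: logged and skipped
        }
    }
}

As in the patch, the duplicate check assumes submit() is only ever called from one thread; if several threads dispatched tasks, the HashSet would need to be replaced with ConcurrentHashMap.newKeySet() or guarded by a lock.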