(crawler) Fix bug: poor handling of duplicate ids

* Also clean up the code a bit
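The fix boils down to recording each crawling specification id in a Set and skipping any id that has already been seen, relying on Set.add returning false for an element that is already present. A minimal, standalone sketch of that guard follows; the class name and sample ids are illustrative only, not part of the commit:

import java.util.HashSet;
import java.util.Set;

public class DuplicateIdSketch {
    public static void main(String[] args) {
        // processedIds mirrors the Set<String> the commit adds to CrawlerMain;
        // everything else here (class name, sample ids) is hypothetical.
        Set<String> processedIds = new HashSet<>();

        for (String id : new String[] { "spec-1", "spec-2", "spec-1" }) {
            // Set.add returns false when the id is already present, so a single
            // call both records the id and detects the duplicate.
            if (!processedIds.add(id)) {
                System.out.println("Ignoring duplicate id: " + id);
                continue;
            }
            System.out.println("Starting crawl task for: " + id);
        }
    }
}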
Viktor Lofgren 2023-07-07 19:56:14 +02:00
parent dbb758d1a8
commit f03146de4b


@@ -18,6 +18,8 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.nio.file.Path;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.*;
public class CrawlerMain implements AutoCloseable {
@@ -38,6 +40,8 @@ public class CrawlerMain implements AutoCloseable {
final int poolSize = Integer.getInteger("crawler.pool-size", 512);
final int poolQueueSize = 32;
private final Set<String> processedIds = new HashSet<>();
AbortMonitor abortMonitor = AbortMonitor.getInstance();
Semaphore taskSem = new Semaphore(poolSize);
@@ -87,11 +91,27 @@ public class CrawlerMain implements AutoCloseable {
logger.info("Let's go");
// TODO: Make this into an iterable instead so we can abort it
plan.forEachCrawlingSpecification(this::startCrawlTask);
}
private void startCrawlTask(CrawlingSpecification crawlingSpecification) {
if (abortMonitor.isAlive()) {
if (!processedIds.add(crawlingSpecification.id)) {
// This is a duplicate id, so we ignore it. Otherwise we'd end up crawling the same site twice,
// and if we're really unlucky, we might end up writing to the same output file from multiple
// threads with complete bit salad as a result.
logger.error("Ignoring duplicate id: {}", crawlingSpecification.id);
return;
}
if (!abortMonitor.isAlive()) {
return;
}
try {
taskSem.acquire();
} catch (InterruptedException e) {
@@ -107,7 +127,6 @@ public class CrawlerMain implements AutoCloseable {
}
});
}
}
private void fetchDomain(CrawlingSpecification specification) {
if (workLog.isJobFinished(specification.id))