Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-23 21:18:58 +00:00
(crawler) Fix bug: poor handling of duplicate ids
* Also clean up the code a bit
This commit is contained in:
parent dbb758d1a8
commit f03146de4b
@@ -18,6 +18,8 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.nio.file.Path;
+import java.util.HashSet;
+import java.util.Set;
 import java.util.concurrent.*;
 
 public class CrawlerMain implements AutoCloseable {
@@ -38,6 +40,8 @@ public class CrawlerMain implements AutoCloseable {
     final int poolSize = Integer.getInteger("crawler.pool-size", 512);
     final int poolQueueSize = 32;
 
+    private final Set<String> processedIds = new HashSet<>();
+
     AbortMonitor abortMonitor = AbortMonitor.getInstance();
     Semaphore taskSem = new Semaphore(poolSize);
 
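The new processedIds field works because Set.add returns false when the element is already present, so a single HashSet serves as both the record of seen ids and the duplicate test. A minimal standalone sketch of that idiom follows; the DuplicateIdDemo class, the claim helper, and the sample ids are illustrative stand-ins, not part of the commit.

import java.util.HashSet;
import java.util.Set;

class DuplicateIdDemo {
    private static final Set<String> processedIds = new HashSet<>();

    // Returns true only the first time a given id is seen.
    static boolean claim(String id) {
        return processedIds.add(id);
    }

    public static void main(String[] args) {
        // Hypothetical crawl specification ids; the second "site-a" is rejected.
        for (String id : new String[] {"site-a", "site-b", "site-a"}) {
            if (claim(id)) {
                System.out.println("crawling " + id);
            } else {
                System.out.println("ignoring duplicate id: " + id);
            }
        }
    }
}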
@@ -87,26 +91,41 @@ public class CrawlerMain implements AutoCloseable {
 
         logger.info("Let's go");
 
+        // TODO: Make this into an iterable instead so we can abort it
         plan.forEachCrawlingSpecification(this::startCrawlTask);
     }
 
-    private void startCrawlTask(CrawlingSpecification crawlingSpecification) {
-        if (abortMonitor.isAlive()) {
-            try {
-                taskSem.acquire();
-            } catch (InterruptedException e) {
-                throw new RuntimeException(e);
-            }
-
-            pool.execute(() -> {
-                try {
-                    fetchDomain(crawlingSpecification);
-                }
-                finally {
-                    taskSem.release();
-                }
-            });
-        }
+    private void startCrawlTask(CrawlingSpecification crawlingSpecification) {
+
+        if (!processedIds.add(crawlingSpecification.id)) {
+            // This is a duplicate id, so we ignore it. Otherwise we'd end crawling the same site twice,
+            // and if we're really unlucky, we might end up writing to the same output file from multiple
+            // threads with complete bit salad as a result.
+
+            logger.error("Ignoring duplicate id: {}", crawlingSpecification.id);
+            return;
+        }
+
+        if (!abortMonitor.isAlive()) {
+            return;
+        }
+
+        try {
+            taskSem.acquire();
+        } catch (InterruptedException e) {
+            throw new RuntimeException(e);
+        }
+
+        pool.execute(() -> {
+            try {
+                fetchDomain(crawlingSpecification);
+            }
+            finally {
+                taskSem.release();
+            }
+        });
     }
 
     private void fetchDomain(CrawlingSpecification specification) {
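Besides the duplicate-id guard, the rewritten startCrawlTask flattens the old nested if into early returns while keeping the existing back-pressure scheme: a Semaphore permit is taken before the task is handed to the pool and released in a finally block inside the submitted lambda. A self-contained sketch of that bounded-submission pattern follows; BoundedSubmitDemo, submitBounded, and the pool size of 4 are assumed names and values for illustration, not taken from the repository.

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Semaphore;

class BoundedSubmitDemo {
    // Hypothetical stand-ins for the crawler's pool and limit.
    private static final int poolSize = 4;
    private static final Semaphore taskSem = new Semaphore(poolSize);
    private static final ExecutorService pool = Executors.newFixedThreadPool(poolSize);

    // Blocks until a permit is free, then runs the task on the pool,
    // always returning the permit in a finally block, as in the diff above.
    static void submitBounded(Runnable task) {
        try {
            taskSem.acquire();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }

        pool.execute(() -> {
            try {
                task.run();
            } finally {
                taskSem.release();
            }
        });
    }

    public static void main(String[] args) {
        for (int i = 0; i < 10; i++) {
            int n = i;
            submitBounded(() -> System.out.println("task " + n + " on " + Thread.currentThread().getName()));
        }
        pool.shutdown();
    }
}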