Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-23 21:18:58 +00:00)
(crawler) Clean up the crawler code a bit, removing vestigial abstractions and historical debris
Commit 7305afa0f8 (parent 481f999b70)
@@ -7,12 +7,12 @@ import nu.marginalia.UserAgent;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.converting.model.ProcessedDomain;
 import nu.marginalia.converting.processor.DomainProcessor;
+import nu.marginalia.crawl.CrawlerMain;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
 import nu.marginalia.crawl.retreival.DomainProber;
-import nu.marginalia.crawl.spec.CrawlSpecProvider;
 import nu.marginalia.io.crawldata.format.ParquetSerializableCrawlDataStream;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.crawl.DomainIndexingState;
@@ -77,7 +77,7 @@ public class CrawlingThenConvertingIntegrationTest {
     @Test
     public void testInvalidDomain() throws IOException {
         // Attempt to fetch an invalid domain
-        var specs = new CrawlSpecProvider.CrawlSpecRecord("invalid.invalid.invalid", 10);
+        var specs = new CrawlerMain.CrawlSpecRecord("invalid.invalid.invalid", 10);

         CrawledDomain crawlData = crawl(specs);

@@ -93,7 +93,7 @@ public class CrawlingThenConvertingIntegrationTest {
     @Test
     public void testRedirectingDomain() throws IOException {
         // Attempt to fetch an invalid domain
-        var specs = new CrawlSpecProvider.CrawlSpecRecord("memex.marginalia.nu", 10);
+        var specs = new CrawlerMain.CrawlSpecRecord("memex.marginalia.nu", 10);

         CrawledDomain crawlData = crawl(specs);

@@ -112,7 +112,7 @@ public class CrawlingThenConvertingIntegrationTest {
     @Test
     public void testBlockedDomain() throws IOException {
         // Attempt to fetch an invalid domain
-        var specs = new CrawlSpecProvider.CrawlSpecRecord("search.marginalia.nu", 10);
+        var specs = new CrawlerMain.CrawlSpecRecord("search.marginalia.nu", 10);

         CrawledDomain crawlData = crawl(specs, d->false); // simulate blocking by blacklisting everything

@@ -128,7 +128,7 @@ public class CrawlingThenConvertingIntegrationTest {

     @Test
     public void crawlSunnyDay() throws IOException {
-        var specs = new CrawlSpecProvider.CrawlSpecRecord("www.marginalia.nu", 10);
+        var specs = new CrawlerMain.CrawlSpecRecord("www.marginalia.nu", 10);

         CrawledDomain domain = crawl(specs);
         assertFalse(domain.doc.isEmpty());
@@ -161,7 +161,7 @@ public class CrawlingThenConvertingIntegrationTest {

     @Test
     public void crawlContentTypes() throws IOException {
-        var specs = new CrawlSpecProvider.CrawlSpecRecord("www.marginalia.nu", 10,
+        var specs = new CrawlerMain.CrawlSpecRecord("www.marginalia.nu", 10,
                 List.of(
                         "https://www.marginalia.nu/sanic.png",
                         "https://www.marginalia.nu/invalid"
@@ -199,7 +199,7 @@ public class CrawlingThenConvertingIntegrationTest {

     @Test
     public void crawlRobotsTxt() throws IOException {
-        var specs = new CrawlSpecProvider.CrawlSpecRecord("search.marginalia.nu", 5,
+        var specs = new CrawlerMain.CrawlSpecRecord("search.marginalia.nu", 5,
                 List.of("https://search.marginalia.nu/search?q=hello+world")
         );

@@ -238,11 +238,11 @@ public class CrawlingThenConvertingIntegrationTest {
             return null; // unreachable
         }
     }
-    private CrawledDomain crawl(CrawlSpecProvider.CrawlSpecRecord specs) throws IOException {
+    private CrawledDomain crawl(CrawlerMain.CrawlSpecRecord specs) throws IOException {
         return crawl(specs, domain -> true);
     }

-    private CrawledDomain crawl(CrawlSpecProvider.CrawlSpecRecord specs, Predicate<EdgeDomain> domainBlacklist) throws IOException {
+    private CrawledDomain crawl(CrawlerMain.CrawlSpecRecord specs, Predicate<EdgeDomain> domainBlacklist) throws IOException {
         List<SerializableCrawlData> data = new ArrayList<>();

         try (var recorder = new WarcRecorder(fileName)) {
@@ -1,46 +0,0 @@
-package nu.marginalia.crawl;
-
-
-import lombok.SneakyThrows;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.nio.file.Files;
-import java.nio.file.Path;
-
-public class AbortMonitor {
-    private volatile boolean abort = false;
-    private static volatile AbortMonitor instance = null;
-    private static final Logger logger = LoggerFactory.getLogger(AbortMonitor.class);
-
-    public static AbortMonitor getInstance() {
-        if (instance == null) {
-            synchronized (AbortMonitor.class) {
-                if (instance == null) {
-                    instance = new AbortMonitor();
-                    new Thread(instance::run, "AbortMon").start();
-                }
-            }
-        }
-        return instance;
-    }
-
-    private AbortMonitor() {
-    }
-
-    @SneakyThrows
-    public void run() {
-        for (;;) {
-            Thread.sleep(1000);
-            if (Files.exists(Path.of("/tmp/stop"))) {
-                logger.warn("Abort file found");
-                abort = true;
-                Files.delete(Path.of("/tmp/stop"));
-            }
-        }
-    }
-
-    public boolean isAlive() {
-        return !abort;
-    }
-}
@@ -4,10 +4,13 @@ import com.google.gson.Gson;
 import com.google.inject.Guice;
 import com.google.inject.Inject;
 import com.google.inject.Injector;
+import com.zaxxer.hikari.HikariDataSource;
+import lombok.Builder;
 import nu.marginalia.ProcessConfiguration;
 import nu.marginalia.ProcessConfigurationModule;
 import nu.marginalia.UserAgent;
 import nu.marginalia.WmsaHome;
+import nu.marginalia.atags.model.DomainLinks;
 import nu.marginalia.atags.source.AnchorTagsSource;
 import nu.marginalia.atags.source.AnchorTagsSourceFactory;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
@@ -16,9 +19,9 @@ import nu.marginalia.crawl.logic.DomainLocks;
 import nu.marginalia.crawl.retreival.CrawlDataReference;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
 import nu.marginalia.crawl.retreival.DomainProber;
-import nu.marginalia.crawl.spec.CrawlSpecProvider;
 import nu.marginalia.crawl.warc.WarcArchiverFactory;
 import nu.marginalia.crawl.warc.WarcArchiverIf;
+import nu.marginalia.db.DomainBlacklist;
 import nu.marginalia.io.CrawledDomainReader;
 import nu.marginalia.io.CrawlerOutputFile;
 import nu.marginalia.model.EdgeDomain;
@@ -35,6 +38,7 @@ import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.util.SimpleBlockingThreadPool;
 import okhttp3.ConnectionPool;
 import okhttp3.Dispatcher;
+import org.jetbrains.annotations.NotNull;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -44,10 +48,7 @@ import java.nio.file.Path;
 import java.nio.file.StandardCopyOption;
 import java.security.Security;
 import java.sql.SQLException;
-import java.util.List;
-import java.util.Map;
-import java.util.Optional;
-import java.util.UUID;
+import java.util.*;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
@@ -62,22 +63,28 @@ public class CrawlerMain extends ProcessMainClass {
     private final MessageQueueFactory messageQueueFactory;
     private final DomainProber domainProber;
     private final FileStorageService fileStorageService;
-    private final CrawlSpecProvider crawlSpecProvider;
     private final AnchorTagsSourceFactory anchorTagsSourceFactory;
     private final WarcArchiverFactory warcArchiverFactory;
+    private final HikariDataSource dataSource;
+    private final DomainBlacklist blacklist;
     private final Gson gson;
     private final int node;
     private final SimpleBlockingThreadPool pool;

     private final DomainLocks domainLocks = new DomainLocks();

-    private final Map<String, String> processingIds = new ConcurrentHashMap<>();
+    private final Map<String, CrawlTask> pendingCrawlTasks = new ConcurrentHashMap<>();

-    private final AbortMonitor abortMonitor = AbortMonitor.getInstance();
     private final AtomicInteger tasksDone = new AtomicInteger(0);
     private final HttpFetcherImpl fetcher;

-    private volatile int totalTasks;
+    private int totalTasks = 1;
+
+    private static final double URL_GROWTH_FACTOR = Double.parseDouble(System.getProperty("crawler.crawlSetGrowthFactor", "1.25"));
+    private static final int MIN_URLS_PER_DOMAIN = Integer.getInteger("crawler.minUrlsPerDomain", 100);
+    private static final int MID_URLS_PER_DOMAIN = Integer.getInteger("crawler.minUrlsPerDomain", 2_000);
+    private static final int MAX_URLS_PER_DOMAIN = Integer.getInteger("crawler.maxUrlsPerDomain", 10_000);

     @Inject
     public CrawlerMain(UserAgent userAgent,
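The crawl-budget tuning knobs added above are ordinary JVM system properties with hard-coded fallbacks, so they can be overridden per run with -D flags; note that, as shown in the hunk, MID_URLS_PER_DOMAIN reads the same crawler.minUrlsPerDomain key as MIN_URLS_PER_DOMAIN (carried over verbatim from the deleted CrawlSpecProvider). A minimal, hypothetical stand-alone sketch of the same pattern — the class and field names below are illustrative, not part of the commit:

    public class CrawlTuningSketch {
        // Run with e.g.: java -Dcrawler.crawlSetGrowthFactor=1.5 -Dcrawler.maxUrlsPerDomain=20000 CrawlTuningSketch
        static final double GROWTH_FACTOR =
                Double.parseDouble(System.getProperty("crawler.crawlSetGrowthFactor", "1.25"));
        static final int MAX_URLS_PER_DOMAIN =
                Integer.getInteger("crawler.maxUrlsPerDomain", 10_000);

        public static void main(String[] args) {
            // Prints the effective values, falling back to the defaults when no -D flag is given
            System.out.printf("growth=%.2f, maxUrlsPerDomain=%d%n", GROWTH_FACTOR, MAX_URLS_PER_DOMAIN);
        }
    }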
@@ -85,18 +92,20 @@ public class CrawlerMain extends ProcessMainClass {
                        MessageQueueFactory messageQueueFactory, DomainProber domainProber,
                        FileStorageService fileStorageService,
                        ProcessConfiguration processConfiguration,
-                       CrawlSpecProvider crawlSpecProvider,
                        AnchorTagsSourceFactory anchorTagsSourceFactory,
                        WarcArchiverFactory warcArchiverFactory,
-                       Gson gson) {
+                       HikariDataSource dataSource,
+                       DomainBlacklist blacklist,
+                       Gson gson) throws InterruptedException {
         this.userAgent = userAgent;
         this.heartbeat = heartbeat;
         this.messageQueueFactory = messageQueueFactory;
         this.domainProber = domainProber;
         this.fileStorageService = fileStorageService;
-        this.crawlSpecProvider = crawlSpecProvider;
         this.anchorTagsSourceFactory = anchorTagsSourceFactory;
         this.warcArchiverFactory = warcArchiverFactory;
+        this.dataSource = dataSource;
+        this.blacklist = blacklist;
         this.gson = gson;
         this.node = processConfiguration.node();

@@ -108,15 +117,13 @@ public class CrawlerMain extends ProcessMainClass {
                 new Dispatcher(),
                 new ConnectionPool(5, 10, TimeUnit.SECONDS)
         );
+
+        // Wait for the blacklist to be loaded before starting the crawl
+        blacklist.waitUntilLoaded();
     }

     public static void main(String... args) throws Exception {

-        if (!AbortMonitor.getInstance().isAlive()) {
-            System.err.println("Remove abort file first");
-            return;
-        }
-
         // Prevent Java from caching DNS lookups forever (filling up the system RAM as a result)
         Security.setProperty("networkaddress.cache.ttl" , "3600");

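The constructor now blocks on blacklist.waitUntilLoaded() before any crawling starts, which is why it gains the throws InterruptedException clause. The DomainBlacklist implementation is not part of this diff; the sketch below is only a hypothetical illustration of the contract such a method typically provides — an asynchronously loaded set guarded by a latch:

    import java.util.Set;
    import java.util.concurrent.CountDownLatch;

    // Hypothetical stand-in for DomainBlacklist; not the project's actual implementation.
    class AsyncBlacklistSketch {
        private final CountDownLatch loaded = new CountDownLatch(1);
        private volatile Set<Integer> blacklistedIds = Set.of();

        // Called by a background loader thread once the data is in memory
        void finishLoading(Set<Integer> ids) {
            blacklistedIds = ids;
            loaded.countDown();
        }

        // What the constructor relies on: block until the data is usable
        void waitUntilLoaded() throws InterruptedException {
            loaded.await();
        }

        boolean isBlacklisted(int domainId) {
            return blacklistedIds.contains(domainId);
        }
    }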
@@ -144,7 +151,7 @@ public class CrawlerMain extends ProcessMainClass {
                 crawler.runForSingleDomain(instructions.targetDomainName, instructions.outputDir);
             }
             else {
-                crawler.run(instructions.outputDir);
+                crawler.runForDatabaseDomains(instructions.outputDir);
             }
             instructions.ok();
         } catch (Exception ex) {
@@ -160,34 +167,99 @@ public class CrawlerMain extends ProcessMainClass {
         System.exit(0);
     }

-    public void run(Path outputDir) throws Exception {
+    public void runForDatabaseDomains(Path outputDir) throws Exception {

         heartbeat.start();

+        logger.info("Loading domains to be crawled");
+
+        final List<CrawlSpecRecord> crawlSpecRecords = new ArrayList<>();
+        final List<EdgeDomain> domainsToCrawl = new ArrayList<>();
+
+        // Assign any domains with node_affinity=0 to this node, and then fetch all domains assigned to this node
+        // to be crawled.
+
+        try (var conn = dataSource.getConnection()) {
+            try (var assignFreeDomains = conn.prepareStatement(
+                    """
+                    UPDATE EC_DOMAIN
+                    SET NODE_AFFINITY=?
+                    WHERE NODE_AFFINITY=0
+                    """))
+            {
+                // Assign any domains with node_affinity=0 to this node. We must do this now, before we start crawling
+                // to avoid race conditions with other crawl runs. We don't want multiple crawlers to crawl the same domain.
+                assignFreeDomains.setInt(1, node);
+                assignFreeDomains.executeUpdate();
+            }
+
+            try (var query = conn.prepareStatement("""
+                    SELECT DOMAIN_NAME, COALESCE(VISITED_URLS, 0), EC_DOMAIN.ID
+                    FROM EC_DOMAIN
+                    LEFT JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
+                    WHERE NODE_AFFINITY=?
+                    """)) {
+                // Fetch the domains to be crawled
+                query.setInt(1, node);
+                query.setFetchSize(10_000);
+                var rs = query.executeQuery();
+
+                while (rs.next()) {
+                    // Skip blacklisted domains
+                    int domainId = rs.getInt(3);
+                    if (blacklist.isBlacklisted(domainId))
+                        continue;
+
+                    int existingUrls = rs.getInt(2);
+                    String domainName = rs.getString(1);
+
+                    domainsToCrawl.add(new EdgeDomain(domainName));
+                    crawlSpecRecords.add(CrawlSpecRecord.growExistingDomain(domainName, existingUrls));
+                    totalTasks++;
+                }
+            }
+        }
+
+        logger.info("Loaded {} domains", crawlSpecRecords.size());
+
+        // Shuffle the domains to ensure we get a good mix of domains in each crawl,
+        // so that e.g. the big domains don't get all crawled at once, or we end up
+        // crawling the same server in parallel from different subdomains...
+        Collections.shuffle(crawlSpecRecords);
+
         // First a validation run to ensure the file is all good to parse
-        totalTasks = crawlSpecProvider.totalCount();
-        if (totalTasks == 0) {
+        if (crawlSpecRecords.isEmpty()) {
             // This is an error state, and we should make noise about it
             throw new IllegalStateException("No crawl tasks found, refusing to continue");
         }
-        logger.info("Queued {} crawl tasks, let's go", totalTasks);
+        else {
+            logger.info("Queued {} crawl tasks, let's go", crawlSpecRecords.size());
+        }
+
+        // Set up the work log and the warc archiver so we can keep track of what we've done
         try (WorkLog workLog = new WorkLog(outputDir.resolve("crawler.log"));
              WarcArchiverIf warcArchiver = warcArchiverFactory.get(outputDir);
-             AnchorTagsSource anchorTagsSource = anchorTagsSourceFactory.create(crawlSpecProvider.getDomains())
+             AnchorTagsSource anchorTagsSource = anchorTagsSourceFactory.create(domainsToCrawl)
        ) {
            // Set the number of tasks done to the number of tasks that are already finished,
            // (this happens when the process is restarted after a crash or a shutdown)
            tasksDone.set(workLog.countFinishedJobs());

-           // Process the crawl tasks
-           try (var specStream = crawlSpecProvider.stream()) {
-               specStream
-                   .takeWhile((e) -> abortMonitor.isAlive())
-                   .filter(e -> !workLog.isJobFinished(e.domain()))
-                   .filter(e -> processingIds.put(e.domain(), "") == null)
-                   .map(e -> new CrawlTask(e, anchorTagsSource, outputDir, warcArchiver, workLog))
-                   .forEach(pool::submitQuietly);
+           // Create crawl tasks and submit them to the pool for execution
+           for (CrawlSpecRecord crawlSpec : crawlSpecRecords) {
+               if (workLog.isJobFinished(crawlSpec.domain()))
+                   continue;
+
+               var task = new CrawlTask(
+                       crawlSpec,
+                       anchorTagsSource,
+                       outputDir,
+                       warcArchiver,
+                       workLog);
+
+               if (pendingCrawlTasks.putIfAbsent(crawlSpec.domain(), task) == null) {
+                   pool.submitQuietly(task);
+               }
           }

           logger.info("Shutting down the pool, waiting for tasks to complete...");
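The stream pipeline that used processingIds.put(domain, "") == null as a claim check is replaced above by an explicit loop that registers tasks in pendingCrawlTasks with putIfAbsent. A small self-contained sketch of that submit-once idiom, with hypothetical names and a plain Runnable standing in for CrawlTask:

    import java.util.List;
    import java.util.Map;
    import java.util.concurrent.ConcurrentHashMap;

    public class SubmitOnceSketch {
        public static void main(String[] args) {
            Map<String, Runnable> pending = new ConcurrentHashMap<>();

            for (String domain : List.of("a.example", "b.example", "a.example")) {
                Runnable task = () -> System.out.println("crawl " + domain);

                // putIfAbsent returns null only for the first registration of a key,
                // so the duplicate "a.example" entry is silently skipped.
                if (pending.putIfAbsent(domain, task) == null) {
                    task.run(); // the real code hands the task to pool.submitQuietly(task)
                }
            }
        }
    }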
@@ -222,7 +294,7 @@ public class CrawlerMain extends ProcessMainClass {
             WarcArchiverIf warcArchiver = warcArchiverFactory.get(outputDir);
             AnchorTagsSource anchorTagsSource = anchorTagsSourceFactory.create(List.of(new EdgeDomain(targetDomainName)))
        ) {
-            var spec = new CrawlSpecProvider.CrawlSpecRecord(targetDomainName, 1000, List.of());
+            var spec = new CrawlSpecRecord(targetDomainName, 1000, List.of());
             var task = new CrawlTask(spec, anchorTagsSource, outputDir, warcArchiver, workLog);
             task.run();
         }
@@ -234,9 +306,9 @@ public class CrawlerMain extends ProcessMainClass {
             }
         }

-    class CrawlTask implements SimpleBlockingThreadPool.Task {
+    private class CrawlTask implements SimpleBlockingThreadPool.Task {

-        private final CrawlSpecProvider.CrawlSpecRecord specification;
+        private final CrawlSpecRecord specification;

         private final String domain;
         private final String id;
@@ -246,7 +318,7 @@ public class CrawlerMain extends ProcessMainClass {
         private final WarcArchiverIf warcArchiver;
         private final WorkLog workLog;

-        CrawlTask(CrawlSpecProvider.CrawlSpecRecord specification,
+        CrawlTask(CrawlSpecRecord specification,
                   AnchorTagsSource anchorTagsSource,
                   Path outputDir,
                   WarcArchiverIf warcArchiver,
@@ -269,6 +341,8 @@ public class CrawlerMain extends ProcessMainClass {
             Path tempFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP);
             Path parquetFile = CrawlerOutputFile.createParquetPath(outputDir, id, domain);

+            // Move the WARC file to a temp file if it exists, so we can resume the crawl using the old data
+            // while writing to the same file name as before
             if (Files.exists(newWarcFile)) {
                 Files.move(newWarcFile, tempFile, StandardCopyOption.REPLACE_EXISTING);
             }
@@ -276,31 +350,29 @@ public class CrawlerMain extends ProcessMainClass {
                 Files.deleteIfExists(tempFile);
             }

-            var domainLock = domainLocks.getSemaphore(new EdgeDomain(specification.domain()));
-
             try (var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now
                  var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, warcRecorder);
-                 CrawlDataReference reference = getReference())
+                 CrawlDataReference reference = getReference();
+            )
             {
-                // acquire the domain lock to prevent other threads from crawling the same domain,
-                // we release it at the end of the task to let them go ahead
-                Thread.currentThread().setName("crawling:" + domain + " [await domain lock]");
-                domainLock.acquire();
-                Thread.currentThread().setName("crawling:" + domain);
-
-                var domainLinks = anchorTagsSource.getAnchorTags(domain);
-
+                // Resume the crawl if it was aborted
                 if (Files.exists(tempFile)) {
                     retriever.syncAbortedRun(tempFile);
                     Files.delete(tempFile);
                 }

-                int size = retriever.crawlDomain(domainLinks, reference);
+                DomainLinks domainLinks = anchorTagsSource.getAnchorTags(domain);
+
+                int size;
+                try (var lock = domainLocks.lockDomain(new EdgeDomain(domain))) {
+                    size = retriever.crawlDomain(domainLinks, reference);
+                }

                 // Delete the reference crawl data if it's not the same as the new one
                 // (mostly a case when migrating from legacy->warc)
                 reference.delete();

+                // Convert the WARC file to Parquet
                 CrawledDocumentParquetRecordFileWriter
                         .convertWarc(domain, userAgent, newWarcFile, parquetFile);

@@ -308,7 +380,10 @@ public class CrawlerMain extends ProcessMainClass {
                 // otherwise delete it:
                 warcArchiver.consumeWarc(newWarcFile, domain);

+                // Mark the domain as finished in the work log
                 workLog.setJobToFinished(domain, parquetFile.toString(), size);
+
+                // Update the progress bar
                 heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks);

                 logger.info("Fetched {}", domain);
@@ -316,11 +391,8 @@ public class CrawlerMain extends ProcessMainClass {
                 logger.error("Error fetching domain " + domain, e);
             }
             finally {
-                // release the domain lock to permit other threads to crawl subdomains of this domain
-                domainLock.release();
-
                 // We don't need to double-count these; it's also kept int he workLog
-                processingIds.remove(domain);
+                pendingCrawlTasks.remove(domain);
                 Thread.currentThread().setName("[idle]");

                 Files.deleteIfExists(newWarcFile);
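One small detail worth calling out: totalTasks is now a plain int initialized to 1 (see the field hunk earlier) and incremented as domains are loaded, which — presumably — keeps the progress division above well-defined even before loading has finished. A tiny sketch of that calculation, with the hypothetical value shown only for illustration:

    import java.util.concurrent.atomic.AtomicInteger;

    public class ProgressSketch {
        public static void main(String[] args) {
            AtomicInteger tasksDone = new AtomicInteger(0);
            int totalTasks = 1; // matches the new field initializer; grows while domains are loaded

            // heartbeat.setProgress(...) receives a finite ratio instead of dividing by zero
            double progress = tasksDone.incrementAndGet() / (double) totalTasks;
            System.out.println(progress);
        }
    }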
@@ -379,12 +451,11 @@ public class CrawlerMain extends ProcessMainClass {
             var msg = msgOpt.orElseThrow(() -> new RuntimeException("No message received"));

             var request = gson.fromJson(msg.payload(), nu.marginalia.mqapi.crawling.CrawlRequest.class);
-            var crawlData = fileStorageService.getStorage(request.crawlStorage);
+            var crawlStorage = fileStorageService.getStorage(request.crawlStorage);

             return new CrawlRequest(
                     request.targetDomainName,
-                    crawlData.asPath(),
+                    crawlStorage.asPath(),
                     msg,
                     inbox);
         }
@@ -404,4 +475,25 @@ public class CrawlerMain extends ProcessMainClass {
             }
         }

+    @Builder
+    public record CrawlSpecRecord(@NotNull String domain, int crawlDepth, @NotNull List<String> urls) {
+
+        public CrawlSpecRecord(String domain, int crawlDepth) {
+            this(domain, crawlDepth, List.of());
+        }
+
+        public static CrawlSpecRecord growExistingDomain(String domain, int visitedUrls) {
+            // Calculate the number of URLs to fetch for this domain, based on the number of URLs
+            // already fetched, and a growth factor that gets a bonus for small domains
+            return new CrawlSpecRecord(domain,
+                    (int) Math.clamp(
+                            (visitedUrls * (visitedUrls < MID_URLS_PER_DOMAIN
+                                    ? Math.max(2.5, URL_GROWTH_FACTOR)
+                                    : URL_GROWTH_FACTOR)
+                            ),
+                            MIN_URLS_PER_DOMAIN,
+                            MAX_URLS_PER_DOMAIN));
+        }
+
+    }
 }
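With the default property values (growth factor 1.25, floor 100, knee 2 000, ceiling 10 000), growExistingDomain gives small domains a 2.5x budget and large ones 1.25x, clamped at both ends. A stand-alone re-statement of that formula with a few worked values; the constants are inlined here for clarity, whereas the real code reads them from system properties:

    public class GrowthFormulaSketch {
        static final double GROWTH = 1.25;
        static final int MIN = 100, MID = 2_000, MAX = 10_000;

        // Mirrors CrawlSpecRecord.growExistingDomain with the default settings
        static int urlsToFetch(int visitedUrls) {
            double factor = visitedUrls < MID ? Math.max(2.5, GROWTH) : GROWTH;
            return (int) Math.clamp(visitedUrls * factor, MIN, MAX);
        }

        public static void main(String[] args) {
            System.out.println(urlsToFetch(30));     // 75     -> clamped up to 100
            System.out.println(urlsToFetch(800));    // 800 * 2.5  = 2000
            System.out.println(urlsToFetch(5_000));  // 5000 * 1.25 = 6250
            System.out.println(urlsToFetch(20_000)); // 25000  -> clamped down to 10000
        }
    }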
@@ -18,8 +18,9 @@ public class DomainLocks {
     /** Returns a lock object corresponding to the given domain. The object is returned as-is,
      * and may be held by another thread. The caller is responsible for locking and releasing the lock.
      */
-    public Semaphore getSemaphore(EdgeDomain domain) {
-        return locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits);
+    public DomainLock lockDomain(EdgeDomain domain) throws InterruptedException {
+        return new DomainLock(domain.toString(),
+                locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits));
     }

     private Semaphore defaultPermits(String topDomain) {
@@ -42,4 +43,24 @@ public class DomainLocks {

         return new Semaphore(2);
     }
+
+    public static class DomainLock implements AutoCloseable {
+        private final String domainName;
+        private final Semaphore semaphore;
+
+        DomainLock(String domainName, Semaphore semaphore) throws InterruptedException {
+            this.domainName = domainName;
+            this.semaphore = semaphore;
+
+            Thread.currentThread().setName("crawling:" + domainName + " [await domain lock]");
+            semaphore.acquire();
+            Thread.currentThread().setName("crawling:" + domainName);
+        }
+
+        @Override
+        public void close() throws Exception {
+            semaphore.release();
+            Thread.currentThread().setName("crawling:" + domainName + " [wrapping up]");
+        }
+    }
 }
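The net effect of the new DomainLock class is that the per-top-domain semaphore is acquired and released through try-with-resources rather than through a manual acquire()/release() pair spread across a try block and a finally clause (compare the CrawlTask hunks above). A short usage sketch, assuming the DomainLocks/DomainLock classes from this commit and nu.marginalia.model.EdgeDomain; crawlBody stands in for the real fetch logic:

    // Hedged usage sketch, not code from the commit itself.
    void crawlWithLock(DomainLocks domainLocks, EdgeDomain domain, Runnable crawlBody) throws Exception {
        try (DomainLocks.DomainLock lock = domainLocks.lockDomain(domain)) {
            // The permit for the domain's top domain is held here; other threads trying to
            // crawl sibling subdomains of the same top domain block inside lockDomain().
            crawlBody.run();
        }
        // DomainLock.close() releases the permit even if crawlBody throws.
    }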
@@ -36,6 +36,10 @@ public class CrawlDataReference implements AutoCloseable {
         }
     }

+    /** Get the next document from the crawl data,
+     * returning null when there are no more documents
+     * available
+     */
     @Nullable
     public CrawledDocument nextDocument() {
         try {
@@ -52,7 +56,7 @@ public class CrawlDataReference implements AutoCloseable {
             return null;
         }

-    public boolean isContentBodySame(String one, String other) {
+    public static boolean isContentBodySame(String one, String other) {

         final long contentHashOne = contentHash(one);
         final long contentHashOther = contentHash(other);
@@ -60,7 +64,7 @@ public class CrawlDataReference implements AutoCloseable {
         return EasyLSH.hammingDistance(contentHashOne, contentHashOther) < 4;
     }

-    private long contentHash(String content) {
+    private static long contentHash(String content) {
         EasyLSH hash = new EasyLSH();
         int next = 0;

@@ -83,8 +87,8 @@ public class CrawlDataReference implements AutoCloseable {
         return hash.get();
     }

-    private final HashFunction hashFunction = Hashing.murmur3_128();
-    private int hashInt(int v) {
+    private static final HashFunction hashFunction = Hashing.murmur3_128();
+    private static int hashInt(int v) {
         return hashFunction.hashInt(v).asInt();
     }

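Making isContentBodySame (and the hashing helpers behind it) static means callers can compare two document bodies without holding a CrawlDataReference instance, which is exactly what the DocumentWithReference hunk further down relies on. A one-line usage sketch, with hypothetical body variables:

    // true when the two bodies hash to locality-sensitive fingerprints within
    // Hamming distance 4 of each other, i.e. the document is effectively unchanged
    boolean unchanged = CrawlDataReference.isContentBodySame(previousBody, freshlyFetchedBody);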
@@ -3,6 +3,7 @@ package nu.marginalia.crawl.retreival;
 import crawlercommons.robots.SimpleRobotRules;
 import nu.marginalia.atags.model.DomainLinks;
 import nu.marginalia.contenttype.ContentType;
+import nu.marginalia.crawl.CrawlerMain;
 import nu.marginalia.crawl.fetcher.ContentTags;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
@@ -11,7 +12,6 @@ import nu.marginalia.crawl.logic.LinkFilterSelector;
 import nu.marginalia.crawl.retreival.revisit.CrawlerRevisitor;
 import nu.marginalia.crawl.retreival.revisit.DocumentWithReference;
 import nu.marginalia.crawl.retreival.sitemap.SitemapFetcher;
-import nu.marginalia.crawl.spec.CrawlSpecProvider;
 import nu.marginalia.ip_blocklist.UrlBlocklist;
 import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.model.EdgeDomain;
@@ -54,7 +54,7 @@ public class CrawlerRetreiver implements AutoCloseable {

     public CrawlerRetreiver(HttpFetcher fetcher,
                             DomainProber domainProber,
-                            CrawlSpecProvider.CrawlSpecRecord specs,
+                            CrawlerMain.CrawlSpecRecord specs,
                             WarcRecorder warcRecorder)
     {
         this.warcRecorder = warcRecorder;
@@ -117,9 +117,7 @@ public class CrawlerRetreiver implements AutoCloseable {
         sniffRootDocument(rootUrl, delayTimer);

         // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
-        int fetchedCount = crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer);
-
-        if (fetchedCount > 0) {
+        if (crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer) > 0) {
             // If we have reference data, we will always grow the crawl depth a bit
             crawlFrontier.increaseDepth(1.5, 2500);
         }
@@ -162,9 +160,7 @@ public class CrawlerRetreiver implements AutoCloseable {
                 continue;

             try {
-                if (fetchContentWithReference(top, delayTimer, DocumentWithReference.empty()).isOk()) {
-                    fetchedCount++;
-                }
+                fetchContentWithReference(top, delayTimer, DocumentWithReference.empty());
             }
             catch (InterruptedException ex) {
                 Thread.currentThread().interrupt();
@@ -172,7 +168,7 @@ public class CrawlerRetreiver implements AutoCloseable {
             }
         }

-        return fetchedCount;
+        return crawlFrontier.visitedSize();
     }

     public void syncAbortedRun(Path warcFile) {
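crawlDomain no longer maintains its own fetchedCount; it reports the size of the frontier's visited set instead (see the DomainCrawlFrontier hunk that follows). A minimal sketch of that accounting idea, using a plain HashSet in place of the project's frontier:

    import java.util.HashSet;
    import java.util.Set;

    public class VisitedCountSketch {
        private final Set<String> visited = new HashSet<>();

        boolean addVisited(String url) {
            return visited.add(url); // false if we had already seen this URL
        }

        // counterpart of DomainCrawlFrontier.visitedSize() in the hunk below
        int visitedSize() {
            return visited.size();
        }

        public static void main(String[] args) {
            var sketch = new VisitedCountSketch();
            sketch.addVisited("https://www.marginalia.nu/");
            sketch.addVisited("https://www.marginalia.nu/log/");
            sketch.addVisited("https://www.marginalia.nu/"); // duplicate, not counted twice
            System.out.println(sketch.visitedSize()); // 2
        }
    }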
@@ -165,7 +165,7 @@ public class DomainCrawlFrontier {
     public int queueSize() {
         return queue.size();
     }
+    public int visitedSize() { return visited.size(); }

     public void enqueueLinksFromDocument(EdgeUrl baseUrl, Document parsed) {
         baseUrl = linkParser.getBaseLink(parsed, baseUrl);
@@ -42,7 +42,7 @@ public record DocumentWithReference(
             return false;
         }

-        return reference.isContentBodySame(doc.documentBody, bodyOk.body());
+        return CrawlDataReference.isContentBodySame(doc.documentBody, bodyOk.body());
     }

     public ContentTags getContentTags() {
@@ -1,137 +0,0 @@
-package nu.marginalia.crawl.spec;
-
-import com.google.inject.Inject;
-import com.zaxxer.hikari.HikariDataSource;
-import lombok.Builder;
-import lombok.SneakyThrows;
-import nu.marginalia.ProcessConfiguration;
-import nu.marginalia.db.DomainBlacklist;
-import nu.marginalia.model.EdgeDomain;
-import org.jetbrains.annotations.NotNull;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.stream.Stream;
-
-// FIXME: This design is a vestige from when there were multiple sources of crawl data. It should be simplified and probably merged with CrawlerMain.
-public class CrawlSpecProvider {
-    private final HikariDataSource dataSource;
-    private final ProcessConfiguration processConfiguration;
-    private final DomainBlacklist blacklist;
-
-    private List<CrawlSpecRecord> domains;
-
-    private static final Logger logger = LoggerFactory.getLogger(CrawlSpecProvider.class);
-
-    private static final double URL_GROWTH_FACTOR = Double.parseDouble(System.getProperty("crawler.crawlSetGrowthFactor", "1.25"));
-    private static final int MIN_URLS_PER_DOMAIN = Integer.getInteger("crawler.minUrlsPerDomain", 100);
-    private static final int MID_URLS_PER_DOMAIN = Integer.getInteger("crawler.minUrlsPerDomain", 2_000);
-    private static final int MAX_URLS_PER_DOMAIN = Integer.getInteger("crawler.maxUrlsPerDomain", 10_000);
-
-    @Inject
-    public CrawlSpecProvider(HikariDataSource dataSource,
-                             ProcessConfiguration processConfiguration,
-                             DomainBlacklist blacklist
-                             ) {
-        this.dataSource = dataSource;
-        this.processConfiguration = processConfiguration;
-        this.blacklist = blacklist;
-    }
-
-    // Load the domains into memory to ensure the crawler is resilient to database blips
-    private List<CrawlSpecRecord> loadData() throws Exception {
-        var domains = new ArrayList<CrawlSpecRecord>();
-
-        logger.info("Loading domains to be crawled");
-
-        blacklist.waitUntilLoaded();
-
-        try (var conn = dataSource.getConnection();
-             var assignFreeDomains = conn.prepareStatement("UPDATE EC_DOMAIN SET NODE_AFFINITY=? WHERE NODE_AFFINITY=0");
-             var query = conn.prepareStatement("""
                     SELECT DOMAIN_NAME, COALESCE(VISITED_URLS, 0), EC_DOMAIN.ID
                     FROM EC_DOMAIN
                     LEFT JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
                     WHERE NODE_AFFINITY=?
                     """)
-        )
-        {
-
-            // Assign any domains with node_affinity=0 to this node. We must do this now, before we start crawling
-            // to avoid race conditions with other crawl runs. We don't want multiple crawlers to crawl the same domain.
-            assignFreeDomains.setInt(1, processConfiguration.node());
-            assignFreeDomains.executeUpdate();
-
-            // Fetch the domains to be crawled
-            query.setInt(1, processConfiguration.node());
-            query.setFetchSize(10_000);
-            var rs = query.executeQuery();
-
-            while (rs.next()) {
-                // Skip blacklisted domains
-                int id = rs.getInt(3);
-                if (blacklist.isBlacklisted(id))
-                    continue;
-
-                int urls = rs.getInt(2);
-
-                double growthFactor = urls < MID_URLS_PER_DOMAIN
-                        ? Math.max(2.5, URL_GROWTH_FACTOR)
-                        : URL_GROWTH_FACTOR;
-
-                int urlsToFetch = Math.clamp((int) (growthFactor * rs.getInt(2)), MIN_URLS_PER_DOMAIN, MAX_URLS_PER_DOMAIN);
-
-                var record = new CrawlSpecRecord(
-                        rs.getString(1),
-                        urlsToFetch,
-                        List.of()
-                );
-
-                domains.add(record);
-            }
-
-        }
-
-        logger.info("Loaded {} domains", domains.size());
-
-        // Shuffle the domains to ensure we get a good mix of domains in each crawl,
-        // so that e.g. the big domains don't get all crawled at once, or we end up
-        // crawling the same server in parallel from different subdomains...
-        Collections.shuffle(domains);
-
-        return domains;
-    }
-
-    public List<EdgeDomain> getDomains() {
-        return stream().map(CrawlSpecRecord::domain).map(EdgeDomain::new).toList();
-    }
-
-    public int totalCount() throws Exception {
-        if (domains == null) {
-            domains = loadData();
-        }
-        return domains.size();
-    }
-
-    @SneakyThrows
-    public Stream<CrawlSpecRecord> stream() {
-        if (domains == null) {
-            domains = loadData();
-        }
-
-        return domains.stream();
-    }
-
-
-    @Builder
-    public record CrawlSpecRecord(@NotNull String domain,
-                                  int crawlDepth,
-                                  @NotNull List<String> urls) {
-        public CrawlSpecRecord(String domain, int crawlDepth) {
-            this(domain, crawlDepth, List.of());
-        }
-    }
-}
@@ -2,6 +2,7 @@ package nu.marginalia.crawling.retreival;

 import crawlercommons.robots.SimpleRobotRules;
 import lombok.SneakyThrows;
+import nu.marginalia.crawl.CrawlerMain;
 import nu.marginalia.crawl.fetcher.ContentTags;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
@@ -9,7 +10,6 @@ import nu.marginalia.crawl.fetcher.SitemapRetriever;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
 import nu.marginalia.crawl.retreival.DomainProber;
-import nu.marginalia.crawl.spec.CrawlSpecProvider;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.HttpFetchResult;
@@ -68,7 +68,7 @@ public class CrawlerMockFetcherTest {

     }

-    void crawl(CrawlSpecProvider.CrawlSpecRecord spec) throws IOException {
+    void crawl(CrawlerMain.CrawlSpecRecord spec) throws IOException {
         try (var recorder = new WarcRecorder()) {
             new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), spec, recorder)
                     .crawlDomain();
@@ -83,7 +83,7 @@ public class CrawlerMockFetcherTest {
         registerUrlClasspathData(new EdgeUrl("https://startrek.website/c/startrek"), "mock-crawl-data/lemmy/c_startrek.html");
         registerUrlClasspathData(new EdgeUrl("https://startrek.website/post/108995"), "mock-crawl-data/lemmy/108995.html");

-        crawl(new CrawlSpecProvider.CrawlSpecRecord("startrek.website", 10, new ArrayList<>()));
+        crawl(new CrawlerMain.CrawlSpecRecord("startrek.website", 10, new ArrayList<>()));
     }

     @Test
@@ -92,7 +92,7 @@ public class CrawlerMockFetcherTest {

         registerUrlClasspathData(new EdgeUrl("https://en.wikipedia.org/"), "mock-crawl-data/mediawiki/index.html");

-        crawl(new CrawlSpecProvider.CrawlSpecRecord("en.wikipedia.org", 10, new ArrayList<>()));
+        crawl(new CrawlerMain.CrawlSpecRecord("en.wikipedia.org", 10, new ArrayList<>()));
     }

     @Test
@@ -103,7 +103,7 @@ public class CrawlerMockFetcherTest {
         registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/telegram-channel-to-idle-on/3501"), "mock-crawl-data/discourse/telegram.html");
         registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/combined-mode-but-grid/4489"), "mock-crawl-data/discourse/grid.html");

-        crawl(new CrawlSpecProvider.CrawlSpecRecord("community.tt-rss.org", 10, new ArrayList<>()));
+        crawl(new CrawlerMain.CrawlSpecRecord("community.tt-rss.org", 10, new ArrayList<>()));
     }

     class MockFetcher implements HttpFetcher {
@@ -4,11 +4,11 @@ import lombok.SneakyThrows;
 import nu.marginalia.UserAgent;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.atags.model.DomainLinks;
+import nu.marginalia.crawl.CrawlerMain;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.retreival.*;
-import nu.marginalia.crawl.spec.CrawlSpecProvider;
 import nu.marginalia.io.CrawledDomainReader;
 import nu.marginalia.io.SerializableCrawlDataStream;
 import nu.marginalia.model.EdgeDomain;
@@ -76,7 +76,7 @@ class CrawlerRetreiverTest {

     @Test
     public void testWarcOutput() throws IOException {
-        var specs = CrawlSpecProvider.CrawlSpecRecord
+        var specs = CrawlerMain.CrawlSpecRecord
                 .builder()
                 .crawlDepth(5)
                 .domain("www.marginalia.nu")
@@ -118,7 +118,7 @@ class CrawlerRetreiverTest {

     @Test
     public void testWarcOutputNoKnownUrls() throws IOException {
-        var specs = CrawlSpecProvider.CrawlSpecRecord
+        var specs = CrawlerMain.CrawlSpecRecord
                 .builder()
                 .crawlDepth(5)
                 .domain("www.marginalia.nu")
@@ -161,7 +161,7 @@ class CrawlerRetreiverTest {
     @SneakyThrows
     @Test
     public void testResync() throws IOException {
-        var specs = CrawlSpecProvider.CrawlSpecRecord
+        var specs = CrawlerMain.CrawlSpecRecord
                 .builder()
                 .crawlDepth(5)
                 .domain("www.marginalia.nu")
@@ -210,7 +210,7 @@ class CrawlerRetreiverTest {

     @Test
     public void testWithKnownDomains() throws IOException {
-        var specs = CrawlSpecProvider.CrawlSpecRecord
+        var specs = CrawlerMain.CrawlSpecRecord
                 .builder()
                 .crawlDepth(5)
                 .domain("www.marginalia.nu")
@@ -254,7 +254,7 @@ class CrawlerRetreiverTest {

     @Test
     public void testRedirect() throws IOException, URISyntaxException {
-        var specs = CrawlSpecProvider.CrawlSpecRecord
+        var specs = CrawlerMain.CrawlSpecRecord
                 .builder()
                 .crawlDepth(3)
                 .domain("www.marginalia.nu")
@@ -312,7 +312,7 @@ class CrawlerRetreiverTest {
     @Test
     public void testEmptySet() throws IOException {

-        var specs = CrawlSpecProvider.CrawlSpecRecord
+        var specs = CrawlerMain.CrawlSpecRecord
                 .builder()
                 .crawlDepth(5)
                 .domain("www.marginalia.nu")
@@ -360,7 +360,7 @@ class CrawlerRetreiverTest {
     @Test
     public void testRecrawl() throws IOException {

-        var specs = CrawlSpecProvider.CrawlSpecRecord
+        var specs = CrawlerMain.CrawlSpecRecord
                 .builder()
                 .crawlDepth(12)
                 .domain("www.marginalia.nu")
@@ -420,7 +420,7 @@ class CrawlerRetreiverTest {
     @Test
     public void testRecrawlWithResync() throws IOException {

-        var specs = CrawlSpecProvider.CrawlSpecRecord
+        var specs = CrawlerMain.CrawlSpecRecord
                 .builder()
                 .crawlDepth(12)
                 .domain("www.marginalia.nu")
@@ -508,7 +508,7 @@ class CrawlerRetreiverTest {
         }
     }

-    private void doCrawlWithReferenceStream(CrawlSpecProvider.CrawlSpecRecord specs, SerializableCrawlDataStream stream) {
+    private void doCrawlWithReferenceStream(CrawlerMain.CrawlSpecRecord specs, SerializableCrawlDataStream stream) {
         try (var recorder = new WarcRecorder(tempFileWarc2)) {
             new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).crawlDomain(new DomainLinks(),
                     new CrawlDataReference(stream));
@@ -519,7 +519,7 @@ class CrawlerRetreiverTest {
     }

     @NotNull
-    private DomainCrawlFrontier doCrawl(Path tempFileWarc1, CrawlSpecProvider.CrawlSpecRecord specs) {
+    private DomainCrawlFrontier doCrawl(Path tempFileWarc1, CrawlerMain.CrawlSpecRecord specs) {
         try (var recorder = new WarcRecorder(tempFileWarc1)) {
             var crawler = new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder);
             crawler.crawlDomain();