From 69f333c0bf8bf76da9f7bc1a91be285c6284886a Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Sun, 23 Jul 2023 18:59:14 +0200
Subject: [PATCH] (crawler) Clean up and refactor the code a bit

---
 .../crawl/retreival/CrawlDataReference.java   |  20 +-
 .../crawl/retreival/CrawlDelayTimer.java      |  57 ++++
 .../crawl/retreival/CrawlerRetreiver.java     | 265 ++++++++----------
 .../crawl/retreival/DomainCrawlFrontier.java  |   2 +-
 .../retreival/CrawlerMockFetcherTest.java     |   3 -
 .../retreival/CrawlerRetreiverTest.java       |   5 +-
 6 files changed, 190 insertions(+), 162 deletions(-)
 create mode 100644 code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java

diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java
index 4c4a33d8..8f331a65 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java
@@ -1,24 +1,19 @@
 package nu.marginalia.crawl.retreival;

-import com.google.common.hash.HashCode;
 import com.google.common.hash.HashFunction;
 import com.google.common.hash.Hashing;
 import nu.marginalia.bigstring.BigString;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.SerializableCrawlData;
 import nu.marginalia.lsh.EasyLSH;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;

 import javax.annotation.Nullable;
 import java.util.*;

 /** A reference to a domain that has been crawled before. */
 public class CrawlDataReference {
-    private final Logger logger = LoggerFactory.getLogger(CrawlDataReference.class);

     private final Iterator<SerializableCrawlData> data;
-    private final HashFunction hashFunction = Hashing.murmur3_128();

     public CrawlDataReference(Iterator<SerializableCrawlData> data) {
         this.data = data;
@@ -38,7 +33,7 @@ public class CrawlDataReference {
         return null;
     }

-    public boolean isContentSame(CrawledDocument one, CrawledDocument other) {
+    public boolean isContentBodySame(CrawledDocument one, CrawledDocument other) {
         assert one.documentBody != null;
         assert other.documentBody != null;

@@ -48,13 +43,15 @@ public class CrawlDataReference {
         return EasyLSH.hammingDistance(contentHashOne, contentHashOther) < 4;
     }

-
     private long contentHash(BigString documentBody) {
         String content = documentBody.decode();
         EasyLSH hash = new EasyLSH();
         int next = 0;

         boolean isInTag = false;
+
+        // In a naive best-effort fashion, extract the text
+        // content of the document and feed it into the LSH
         for (int i = 0; i < content.length(); i++) {
             char c = content.charAt(i);
             if (c == '<') {
@@ -62,12 +59,17 @@ public class CrawlDataReference {
             } else if (c == '>') {
                 isInTag = false;
             } else if (!isInTag) {
-                next = (next << 8) | (byte) c;
-                hash.addHashUnordered(hashFunction.hashInt(next).asInt());
+                next = (next << 8) | (c & 0xff);
+                hash.addHashUnordered(hashInt(next));
             }
         }

         return hash.get();
     }

+    private final HashFunction hashFunction = Hashing.murmur3_128();
+    private int hashInt(int v) {
+        return hashFunction.hashInt(v).asInt();
+    }
+
 }
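Note on the contentHash() change above: casting the char to byte sign-extends it when it is promoted back to int, so any character at or above 0x80 would OR a value like 0xffffffe5 into the rolling accumulator and wipe out the previously shifted-in characters; masking with 0xff keeps only the low byte. A standalone illustration (editor's sketch, not part of the patch):

    class SignExtensionSketch {
        public static void main(String[] args) {
            char c = 'å';                 // 0xe5, i.e. any character >= 0x80
            int signExtended = (byte) c;  // promoted to int as 0xffffffe5
            int masked = c & 0xff;        // 0x000000e5, only the low byte survives

            // ORing the sign-extended value into (next << 8) sets every high bit,
            // erasing the window of previously accumulated characters.
            System.out.printf("%08x vs %08x%n", signExtended, masked);
        }
    }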
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java
new file mode 100644
index 00000000..ca2494dc
--- /dev/null
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java
@@ -0,0 +1,57 @@
+package nu.marginalia.crawl.retreival;
+
+import lombok.SneakyThrows;
+
+import static java.lang.Math.max;
+import static java.lang.Math.min;
+
+public class CrawlDelayTimer {
+
+    // When no crawl delay is specified, lean toward twice the fetch+process time, within these limits:
+    private static final long DEFAULT_CRAWL_DELAY_MIN_MS = Long.getLong("defaultCrawlDelay", 1000);
+    private static final long DEFAULT_CRAWL_DELAY_MAX_MS = Long.getLong("defaultCrawlDelaySlow", 2500);
+
+    /** Flag to indicate that the crawler should slow down, e.g. from 429s */
+    private boolean slowDown = false;
+
+    private final long delayTime;
+
+    public CrawlDelayTimer(long delayTime) {
+        this.delayTime = delayTime;
+    }
+
+    @SneakyThrows
+    public void delay(long spentTime) {
+        long sleepTime = delayTime;
+
+        if (sleepTime >= 1) {
+            if (spentTime > sleepTime)
+                return;
+
+            Thread.sleep(min(sleepTime - spentTime, 5000));
+        }
+        else if (slowDown) {
+            // Additional delay when the server is signalling it wants slower requests
+            Thread.sleep( DEFAULT_CRAWL_DELAY_MIN_MS);
+        }
+        else {
+            // When no crawl delay is specified, lean toward twice the fetch+process time,
+            // within sane limits. This means slower servers get slower crawling, and faster
+            // servers get faster crawling.
+
+            sleepTime = spentTime * 2;
+            sleepTime = min(sleepTime, DEFAULT_CRAWL_DELAY_MAX_MS);
+            sleepTime = max(sleepTime, DEFAULT_CRAWL_DELAY_MIN_MS);
+
+            if (spentTime > sleepTime)
+                return;
+
+            Thread.sleep(sleepTime - spentTime);
+        }
+    }
+
+    /** Increase the delay between requests if the server is signalling it wants slower requests with HTTP 429 */
+    public void slowDown() {
+        slowDown = true;
+    }
+}
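Note: CrawlDelayTimer is consumed by CrawlerRetreiver in the next file of this patch. A minimal usage sketch (illustration only, not part of the patch; fetchOneDocument() is a hypothetical stand-in for the real fetch-and-process step):

    class CrawlDelayTimerUsageSketch {
        static void crawlOne(CrawlDelayTimer timer, String url) {
            long startTime = System.currentTimeMillis();

            fetchOneDocument(url);  // hypothetical: fetch and process a single document

            // Sleeps according to the robots.txt crawl delay when one is given,
            // otherwise roughly twice the time just spent, clamped to the limits above.
            timer.delay(System.currentTimeMillis() - startTime);
        }

        static void fetchOneDocument(String url) { /* stand-in */ }
    }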
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
index 48587bdb..ebdbd4f0 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
@@ -11,7 +11,6 @@ import nu.marginalia.crawling.model.spec.CrawlingSpecification;
 import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.crawling.model.*;
 import nu.marginalia.ip_blocklist.UrlBlocklist;
-import nu.marginalia.lsh.EasyLSH;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import org.jsoup.Jsoup;
@@ -26,25 +25,12 @@ import java.time.LocalDateTime;
 import java.util.*;
 import java.util.function.Consumer;

-import static java.lang.Math.max;
-import static java.lang.Math.min;
-
 public class CrawlerRetreiver {
-    private static final long DEFAULT_CRAWL_DELAY_MIN_MS = Long.getLong("defaultCrawlDelay", 1000);
-    private static final long DEFAULT_CRAWL_DELAY_MAX_MS = Long.getLong("defaultCrawlDelaySlow", 2500);

     private static final int MAX_ERRORS = 20;

     private final HttpFetcher fetcher;
-
-    /** Flag to indicate that the crawler should slow down, e.g. from 429s */
-    private boolean slowDown = false;
-
-
-    /** Testing flag to disable crawl delay (otherwise crawler tests take several minutes) */
-    private boolean testFlagIgnoreDelay = false;
-
     private final String id;
     private final String domain;
     private final Consumer<SerializableCrawlData> crawledDomainWriter;

@@ -61,7 +47,12 @@ public class CrawlerRetreiver {
     private final DomainCrawlFrontier crawlFrontier;

     int errorCount = 0;
-    private String retainedTag = "RETAINED/304";
+
+    /** recrawlState tag for documents that had a HTTP status 304 */
+    private static final String documentWasRetainedTag = "RETAINED/304";
+
+    /** recrawlState tag for documents that had a 200 status but were identical to a previous version */
+    private static final String documentWasSameTag = "SAME-BY-COMPARISON";

     public CrawlerRetreiver(HttpFetcher fetcher,
                             CrawlingSpecification specs,
@@ -91,11 +82,6 @@ public class CrawlerRetreiver {
         }
     }

-    public CrawlerRetreiver withNoDelay() {
-        testFlagIgnoreDelay = true;
-        return this;
-    }
-
     public int fetch() {
         return fetch(new CrawlDataReference());
     }
@@ -146,13 +132,13 @@ public class CrawlerRetreiver {

         assert !crawlFrontier.isEmpty();

-        var robotsRules = fetcher.fetchRobotRules(crawlFrontier.peek().domain);
-        long crawlDelay = robotsRules.getCrawlDelay();
+        final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(crawlFrontier.peek().domain);
+        final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay());

-        sniffRootDocument();
+        sniffRootDocument(delayTimer);

         // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
-        int recrawled = recrawl(oldCrawlData, robotsRules, crawlDelay);
+        int recrawled = recrawl(oldCrawlData, robotsRules, delayTimer);

         if (recrawled > 0) {
             // If we have reference data, we will always grow the crawl depth a bit
@@ -195,7 +181,7 @@ public class CrawlerRetreiver {

                 continue;

-            if (fetchDocument(top, null, crawlDelay).isPresent()) {
+            if (fetchWriteAndSleep(top, delayTimer, DocumentWithReference.empty()).isPresent()) {
                 fetchedCount++;
             }
         }
@@ -207,9 +193,10 @@ public class CrawlerRetreiver {
         return fetchedCount;
     }

+    /** Performs a re-crawl of old documents, comparing etags and last-modified */
     private int recrawl(CrawlDataReference oldCrawlData,
                         SimpleRobotRules robotsRules,
-                        long crawlDelay) {
+                        CrawlDelayTimer delayTimer) {
         int recrawled = 0;
         int retained = 0;

@@ -247,8 +234,6 @@ public class CrawlerRetreiver {
                     && retained > 0.9 * recrawled
                     && Math.random() < 0.75)
             {
-                logger.info("Direct-loading {}", url);
-
                 // Since it looks like most of these documents haven't changed,
                 // we'll load the documents directly; but we do this in a random
                 // fashion to make sure we eventually catch changes over time
@@ -263,15 +248,13 @@ public class CrawlerRetreiver {
             // providing etag and last-modified headers, so we can recycle the
             // document if it hasn't changed without actually downloading it

-            var fetchedDocOpt = fetchDocument(url, doc, crawlDelay);
+            var fetchedDocOpt = fetchWriteAndSleep(url,
+                    delayTimer,
+                    new DocumentWithReference(doc, oldCrawlData));
             if (fetchedDocOpt.isEmpty()) continue;

-            if (Objects.equals(fetchedDocOpt.get().recrawlState, retainedTag)) {
-                retained ++;
-            }
-            else if (oldCrawlData.isContentSame(doc, fetchedDocOpt.get())) {
-                retained ++;
-            }
+            if (documentWasRetainedTag.equals(fetchedDocOpt.get().recrawlState)) retained ++;
+            else if (documentWasSameTag.equals(fetchedDocOpt.get().recrawlState)) retained ++;

             recrawled ++;
         }
@@ -279,18 +262,6 @@ public class CrawlerRetreiver {
         return recrawled;
     }

-    private static final HashFunction hasher = Hashing.murmur3_128(0);
-    private long hashDoc(CrawledDocument doc) {
-        var hash = new EasyLSH();
-        long val = 0;
-        for (var b : doc.documentBody.decode().getBytes()) {
-            val = val << 8 | (b & 0xFF);
-            hash.addUnordered(hasher.hashLong(val).asLong());
-        }
-        return hash.get();
-    }
-
-
     private void downloadSitemaps(SimpleRobotRules robotsRules) {
         List<String> sitemaps = robotsRules.getSitemaps();
         if (sitemaps.isEmpty()) {
@@ -337,13 +308,13 @@
         logger.debug("Queue is now {}", crawlFrontier.queueSize());
     }

-    private void sniffRootDocument() {
+    private void sniffRootDocument(CrawlDelayTimer delayTimer) {
         try {
             logger.debug("Configuring link filter");

             var url = crawlFrontier.peek().withPathAndParam("/", null);

-            var maybeSample = fetchUrl(url, null).filter(sample -> sample.httpStatus == 200);
+            var maybeSample = fetchUrl(url, delayTimer, DocumentWithReference.empty()).filter(sample -> sample.httpStatus == 200);
             if (maybeSample.isEmpty())
                 return;
             var sample = maybeSample.get();
@@ -379,33 +350,41 @@
         }
     }

-    private Optional<CrawledDocument> fetchDocument(EdgeUrl top,
-                                                    @Nullable CrawledDocument reference,
-                                                    long crawlDelay) {
+    private Optional<CrawledDocument> fetchWriteAndSleep(EdgeUrl top,
+                                                         CrawlDelayTimer timer,
+                                                         DocumentWithReference reference) {
         logger.debug("Fetching {}", top);

         long startTime = System.currentTimeMillis();

-        var doc = fetchUrl(top, reference);
-        if (doc.isPresent()) {
-            var d = doc.get();
-            crawledDomainWriter.accept(d);
+        var docOpt = fetchUrl(top, timer, reference);

-            if (d.url != null) {
-                // We may have redirected to a different path
-                EdgeUrl.parse(d.url).ifPresent(crawlFrontier::addVisited);
+        if (docOpt.isPresent()) {
+            var doc = docOpt.get();
+
+            if (!Objects.equals(doc.recrawlState, documentWasRetainedTag)
+                && reference.isContentBodySame(doc))
+            {
+                // The document didn't change since the last time
+                doc.recrawlState = documentWasSameTag;
             }

-            if ("ERROR".equals(d.crawlerStatus) && d.httpStatus != 404) {
+            crawledDomainWriter.accept(doc);
+
+            if (doc.url != null) {
+                // We may have redirected to a different path
+                EdgeUrl.parse(doc.url).ifPresent(crawlFrontier::addVisited);
+            }
+
+            if ("ERROR".equals(doc.crawlerStatus) && doc.httpStatus != 404) {
                 errorCount++;
             }

         }

-        long crawledTime = System.currentTimeMillis() - startTime;
-        delay(crawlDelay, crawledTime);
+        timer.delay(System.currentTimeMillis() - startTime);

-        return doc;
+        return docOpt;
     }

     private boolean isAllowedProtocol(String proto) {
@@ -413,35 +392,23 @@
                 || proto.equalsIgnoreCase("https");
     }

-    private Optional<CrawledDocument> fetchUrl(EdgeUrl top, @Nullable CrawledDocument reference) {
+    private Optional<CrawledDocument> fetchUrl(EdgeUrl top, CrawlDelayTimer timer, DocumentWithReference reference) {
         try {
-            var contentTags = getContentTags(reference);
-            var fetchedDoc = fetchContent(top, contentTags);
-            CrawledDocument doc;
+            var contentTags = reference.getContentTags();
+            var fetchedDoc = tryDownload(top, timer, contentTags);

-            // HTTP status 304 is NOT MODIFIED, which means the document is the same as it was when
-            // we fetched it last time. We can recycle the reference document.
-            if (reference != null
-                    && fetchedDoc.httpStatus == 304)
-            {
-                doc = reference;
-                doc.recrawlState = retainedTag;
-                doc.timestamp = LocalDateTime.now().toString();
-            }
-            else {
-                doc = fetchedDoc;
-            }
+            CrawledDocument doc = reference.replaceOn304(fetchedDoc);

             if (doc.documentBody != null) {
                 var decoded = doc.documentBody.decode();

                 doc.documentBodyHash = createHash(decoded);

-                Optional<Document> parsedDoc = parseDoc(decoded);
+                var parsedDoc = Jsoup.parse(decoded);
                 EdgeUrl url = new EdgeUrl(doc.url);

-                parsedDoc.ifPresent(parsed -> findLinks(url, parsed));
-                parsedDoc.flatMap(parsed -> findCanonicalUrl(url, parsed))
+                findLinks(url, parsedDoc);
+                findCanonicalUrl(url, parsedDoc)
                         .ifPresent(canonicalLink -> doc.canonicalUrl = canonicalLink.toString());
             }

@@ -455,33 +422,9 @@
         }

-    private ContentTags getContentTags(@Nullable CrawledDocument reference) {
-        if (null == reference)
-            return ContentTags.empty();
-
-        String headers = reference.headers;
-        if (headers == null)
-            return ContentTags.empty();
-
-        String[] headersLines = headers.split("\n");
-
-        String lastmod = null;
-        String etag = null;
-
-        for (String line : headersLines) {
-            if (line.toLowerCase().startsWith("etag:")) {
-                etag = line.substring(5).trim();
-            }
-            if (line.toLowerCase().startsWith("last-modified:")) {
-                lastmod = line.substring(14).trim();
-            }
-        }
-
-        return new ContentTags(etag, lastmod);
-    }

     @SneakyThrows
-    private CrawledDocument fetchContent(EdgeUrl top, ContentTags tags) {
+    private CrawledDocument tryDownload(EdgeUrl top, CrawlDelayTimer timer, ContentTags tags) {
         for (int i = 0; i < 2; i++) {
             try {
                 var doc = fetcher.fetchContent(top, tags);
@@ -489,7 +432,8 @@
                 return doc;
             }
             catch (RateLimitException ex) {
-                slowDown = true;
+                timer.slowDown();
+
                 int delay = ex.retryAfter();
                 if (delay > 0 && delay < 5000) {
                     Thread.sleep(delay);
@@ -504,10 +448,6 @@
         return hashMethod.hashUnencodedChars(documentBodyHash).toString();
     }

-    private Optional<Document> parseDoc(String decoded) {
-        return Optional.of(Jsoup.parse(decoded));
-    }
-
     private void findLinks(EdgeUrl baseUrl, Document parsed) {
         baseUrl = linkParser.getBaseLink(parsed, baseUrl);

@@ -547,36 +487,6 @@
         }
     }

-    @SneakyThrows
-    private void delay(long sleepTime, long spentTime) {
-        if (testFlagIgnoreDelay)
-            return;
-
-        if (sleepTime >= 1) {
-            if (spentTime > sleepTime)
-                return;
-
-            Thread.sleep(min(sleepTime - spentTime, 5000));
-        }
-        else if (slowDown) {
-            Thread.sleep( 1000);
-        }
-        else {
-            // When no crawl delay is specified, lean toward twice the fetch+process time,
-            // within sane limits. This means slower servers get slower crawling, and faster
-            // servers get faster crawling.
-
-            sleepTime = spentTime * 2;
-            sleepTime = min(sleepTime, DEFAULT_CRAWL_DELAY_MAX_MS);
-            sleepTime = max(sleepTime, DEFAULT_CRAWL_DELAY_MIN_MS);
-
-            if (spentTime > sleepTime)
-                return;
-
-            Thread.sleep(sleepTime - spentTime);
-        }
-    }
-
     private CrawledDocument createRobotsError(EdgeUrl url) {
         return CrawledDocument.builder()
                 .url(url.toString())
@@ -594,12 +504,71 @@
             .build();
     }

+    private record DocumentWithReference(
+            @Nullable CrawledDocument doc,
+            @Nullable CrawlDataReference reference) {

-    enum CrawlDataComparison {
-        NO_OLD_DATA,
-        SMALL_SAMPLE,
-        CHANGES_FOUND,
-        NO_CHANGES
-    };
+        private static final DocumentWithReference emptyInstance = new DocumentWithReference(null, null);
+        public static DocumentWithReference empty() {
+            return emptyInstance;
+        }
+
+        public boolean isContentBodySame(CrawledDocument newDoc) {
+            if (reference == null)
+                return false;
+            if (doc == null)
+                return false;
+
+            return reference.isContentBodySame(doc, newDoc);
+        }
+
+        private ContentTags getContentTags() {
+            if (null == doc)
+                return ContentTags.empty();
+
+            String headers = doc.headers;
+            if (headers == null)
+                return ContentTags.empty();
+
+            String[] headersLines = headers.split("\n");
+
+            String lastmod = null;
+            String etag = null;
+
+            for (String line : headersLines) {
+                if (line.toLowerCase().startsWith("etag:")) {
+                    etag = line.substring(5).trim();
+                }
+                if (line.toLowerCase().startsWith("last-modified:")) {
+                    lastmod = line.substring(14).trim();
+                }
+            }
+
+            return new ContentTags(etag, lastmod);
+        }
+
+        public boolean isEmpty() {
+            return doc == null || reference == null;
+        }
+
+        /** If the provided document has HTTP status 304, and the reference document is provided,
+         * return the reference document; otherwise return the provided document.
+         */
+        public CrawledDocument replaceOn304(CrawledDocument fetchedDoc) {
+
+            if (doc == null)
+                return fetchedDoc;
+
+            // HTTP status 304 is NOT MODIFIED, which means the document is the same as it was when
+            // we fetched it last time. We can recycle the reference document.
+            if (fetchedDoc.httpStatus != 304)
+                return fetchedDoc;
+
+            var ret = doc;
+            ret.recrawlState = documentWasRetainedTag;
+            ret.timestamp = LocalDateTime.now().toString();
+            return ret;
+        }
+    }

 }
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java
index 7d5fc214..4b9cc265 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java
@@ -80,7 +80,7 @@ public class DomainCrawlFrontier {
             return;

         // reduce memory usage by not growing queue huge when crawling large sites
-        if (queue.size() + visited.size() >= depth + 100)
+        if (queue.size() + visited.size() >= depth + 1000)
             return;

         if (visited.contains(url.toString()))
diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java
index ea1dd08e..ae8e4679 100644
--- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java
+++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java
@@ -71,7 +71,6 @@ public class CrawlerMockFetcherTest {
         registerUrlClasspathData(new EdgeUrl("https://startrek.website/post/108995"), "mock-crawl-data/lemmy/108995.html");

         new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 10, "startrek.website", new ArrayList<>()), out::add)
-                .withNoDelay()
                 .fetch();

         out.forEach(System.out::println);
@@ -84,7 +83,6 @@ public class CrawlerMockFetcherTest {
         registerUrlClasspathData(new EdgeUrl("https://en.wikipedia.org/"), "mock-crawl-data/mediawiki/index.html");

         new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 10, "en.wikipedia.org", new ArrayList<>()), out::add)
-                .withNoDelay()
                 .fetch();

         out.forEach(System.out::println);
@@ -99,7 +97,6 @@ public class CrawlerMockFetcherTest {
         registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/combined-mode-but-grid/4489"), "mock-crawl-data/discourse/grid.html");

         new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 100, "community.tt-rss.org", new ArrayList<>()), out::add)
-                .withNoDelay()
                 .fetch();

         out.forEach(System.out::println);
diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
index 2a37707f..bb4dd6f4 100644
--- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
+++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
@@ -118,13 +118,16 @@ class CrawlerRetreiverTest {
         Path out = Files.createTempDirectory("crawling-process");

-        var writer = new CrawledDomainWriter(out, "www.marginalia.nu", "123456");
+        var writer = new CrawledDomainWriter(out, specs.domain, specs.id);

         Map<Class<? extends SerializableCrawlData>, List<SerializableCrawlData>> data = new HashMap<>();

         new CrawlerRetreiver(httpFetcher, specs, d -> {
             data.computeIfAbsent(d.getClass(), k->new ArrayList<>()).add(d);
             if (d instanceof CrawledDocument doc) {
                 System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus);
+                if (Math.random() > 0.5) {
""; + } } writer.accept(d); }).fetch();