From 58f2f86ea8607db9a01b1b581cd0da017e9fcf74 Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Fri, 21 Jul 2023 19:47:52 +0200
Subject: [PATCH] (crawler) Don't read all the data into RAM when doing a refresh-crawl

---
 .../crawling/io/CrawledDomainReader.java      |  41 ++++
 .../nu/marginalia/crawl/CrawlLimiter.java     |  45 +---
 .../java/nu/marginalia/crawl/CrawlerMain.java |  24 +-
 .../crawl/retreival/CrawlerRetreiver.java     | 227 ++++++++++--------
 .../crawl/retreival/fetcher/ContentTags.java  |  24 ++
 .../crawl/retreival/fetcher/HttpFetcher.java  |   2 +-
 .../retreival/fetcher/HttpFetcherImpl.java    |  31 ++-
 .../marginalia/crawling/HttpFetcherTest.java  |   5 +-
 .../retreival/CrawlerMockFetcherTest.java     |   7 +-
 .../retreival/CrawlerRetreiverTest.java       |  18 +-
 10 files changed, 249 insertions(+), 175 deletions(-)
 create mode 100644 code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/ContentTags.java

diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java
index 67b95484..abc524ac 100644
--- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java
+++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java
@@ -2,8 +2,10 @@ package nu.marginalia.crawling.io;
 
 import com.github.luben.zstd.ZstdInputStream;
 import com.google.gson.Gson;
+import lombok.SneakyThrows;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawledDomain;
+import nu.marginalia.crawling.model.SerializableCrawlData;
 import nu.marginalia.model.gson.GsonFactory;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -14,6 +16,7 @@ import java.io.IOException;
 import java.io.InputStreamReader;
 import java.nio.file.Path;
 import java.util.ArrayList;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Optional;
 import java.util.concurrent.ForkJoinPool;
@@ -27,6 +30,44 @@ public class CrawledDomainReader {
     public CrawledDomainReader() {
     }
 
+    public Iterator<SerializableCrawlData> createIterator(Path path) throws IOException {
+        BufferedReader br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(path.toFile()))));
+
+        return new Iterator<>() {
+            SerializableCrawlData next;
+
+            @Override
+            @SneakyThrows
+            public boolean hasNext() {
+                String identifier = br.readLine();
+                if (identifier == null) {
+                    br.close();
+                    return false;
+                }
+                String data = br.readLine();
+                if (data == null) {
+                    br.close();
+                    return false;
+                }
+
+                if (identifier.equals(CrawledDomain.SERIAL_IDENTIFIER)) {
+                    next = gson.fromJson(data, CrawledDomain.class);
+                } else if (identifier.equals(CrawledDocument.SERIAL_IDENTIFIER)) {
+                    next = gson.fromJson(data, CrawledDocument.class);
+                }
+                else {
+                    throw new IllegalStateException("Unknown identifier: " + identifier);
+                }
+                return true;
+            }
+
+            @Override
+            public SerializableCrawlData next() {
+                return next;
+            }
+        };
+    }
+
     public CrawledDomain read(Path path) throws IOException {
         DomainDataAssembler domainData = new DomainDataAssembler();
 
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java
index 29f02e4f..7285b0c5 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java
@@ -8,65 +8,22 @@ import java.util.concurrent.Semaphore;
 
 public class CrawlLimiter {
     public static final int maxPoolSize = Integer.getInteger("crawler.pool-size", 512);
 
-    // We'll round up to this size when we're crawling a new domain to prevent
-    // too many concurrent connections
-    public static final int minCrawlDataSizeKb = 128; // 100 Kb
-
-    // The largest size on disk where we'll permit a refresh crawl
-    // (these files easily grow into the gigabytes, we don't want that in RAM)
-    public static final int maxRefreshableCrawlDataSizeKBytes = 1024*128; // 128 Mb
-
-    // This limits how many concurrent crawl tasks we can have running at once
-    // based on their size on disk. The on-disk size is compressed, and the
-    // in-ram size is partially compressed (i.e. only the document body); so
-    // maybe a fair estimate is something like 2-4x this figure for RAM usage
-    //
-    public static final int maxConcurrentCrawlTaskSizeKb = 512*1024; // 512 Mb
-
-    static {
-        // Sanity check; if this is false we'll get a deadlock on taskSemRAM
-        assert maxConcurrentCrawlTaskSizeKb >= maxRefreshableCrawlDataSizeKBytes
-                : "maxConcurrentCrawlTaskSizeKb must be larger than maxRefreshableCrawlDataSizeKBytes";
-    }
-
     public record CrawlTaskLimits(Path refreshPath, boolean isRefreshable, int taskSize) {}
 
-    // We use two semaphores to keep track of the number of concurrent crawls;
-    // first a RAM sempahore to limit the amount of RAM used by refresh crawls.
-    // then a count semaphore to limit the number of concurrent threads (this keeps the connection count manageable)
-    private final Semaphore taskSemRAM = new Semaphore(maxConcurrentCrawlTaskSizeKb);
     private final Semaphore taskSemCount = new Semaphore(maxPoolSize);
 
     public CrawlTaskLimits getTaskLimits(Path fileName) {
-        long size;
-
-        try {
-            size = Math.max(minCrawlDataSizeKb, Files.size(fileName) / 1024);
-        } catch (IOException ex) {
-            // If we can't read the file, we'll assume it's small since we won't be able to read it later for the refresh either
-            return new CrawlTaskLimits(null,false, minCrawlDataSizeKb);
-        }
-
-        // We'll only permit refresh crawls if the file is small enough
-        boolean isRefreshable = size < maxRefreshableCrawlDataSizeKBytes;
-
-        // We'll truncate this down to maxRefreshableCrawlDataSizeKBytes to ensure
-        // it's possible to acquire the RAM semaphore
-        int effectiveSize = (int) Math.min(maxRefreshableCrawlDataSizeKBytes, size);
-
-        return new CrawlTaskLimits(fileName, isRefreshable, effectiveSize);
+        return new CrawlTaskLimits(fileName, true, 1);
     }
 
     public void acquire(CrawlTaskLimits properties) throws InterruptedException {
         // It's very important that we acquire the RAM semaphore first to avoid a deadlock
-        taskSemRAM.acquire(properties.taskSize);
         taskSemCount.acquire(1);
     }
 
     public void release(CrawlTaskLimits properties) {
         taskSemCount.release(1);
-        taskSemRAM.release(properties.taskSize);
     }
 }
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java
index 3dd096cb..6fafb128 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java
@@ -10,6 +10,7 @@ import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawling.io.CrawledDomainReader;
 import nu.marginalia.crawling.io.CrawlerOutputFile;
 import nu.marginalia.crawling.model.CrawledDomain;
+import nu.marginalia.crawling.model.SerializableCrawlData;
 import nu.marginalia.db.storage.FileStorageService;
 import nu.marginalia.mq.MessageQueueFactory;
 import nu.marginalia.mq.MqMessage;
@@ -32,10 +33,7 @@ import org.slf4j.LoggerFactory;
 import java.io.IOException;
 import java.nio.file.Path;
 import java.sql.SQLException;
-import java.util.HashSet;
-import java.util.Optional;
-import java.util.Set;
-import java.util.UUID;
+import java.util.*;
 import java.util.concurrent.*;
 import java.util.concurrent.atomic.AtomicInteger;
 
@@ -201,19 +199,23 @@ public class CrawlerMain implements AutoCloseable {
 
             HttpFetcher fetcher = new HttpFetcherImpl(userAgent.uaString(), dispatcher, connectionPool);
 
-            // Read the previous crawl's data for this domain, if it exists and has a reasonable size
-            Optional<CrawledDomain> domain;
-            if (limits.isRefreshable()) {
-                domain = reader.readOptionally(limits.refreshPath());
-                if (domain.isPresent()) {
-                    specification = specification.withOldData(domain.get());
+            Iterator<SerializableCrawlData> iterator;
+            try {
+                if (limits.isRefreshable()) {
+                    iterator = reader.createIterator(limits.refreshPath());
                 }
+                else {
+                    iterator = Collections.emptyIterator();
+                }
+            } catch (IOException e) {
+                logger.warn("Failed to read previous crawl data for {}", specification.domain);
+                iterator = Collections.emptyIterator();
             }
 
             try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id)) {
                 var retreiver = new CrawlerRetreiver(fetcher, specification, writer::accept);
 
-                int size = retreiver.fetch();
+                int size = retreiver.fetch(iterator);
 
                 workLog.setJobToFinished(specification.id, writer.getOutputFile().toString(), size);
 
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
index 52927f38..8091dac8 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
@@ -4,6 +4,7 @@ import com.google.common.hash.HashFunction;
 import com.google.common.hash.Hashing;
 import crawlercommons.robots.SimpleRobotRules;
 import lombok.SneakyThrows;
+import nu.marginalia.crawl.retreival.fetcher.ContentTags;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
 import nu.marginalia.crawl.retreival.fetcher.SitemapRetriever;
 import nu.marginalia.crawling.model.spec.CrawlingSpecification;
@@ -18,6 +19,7 @@ import org.jsoup.nodes.Document;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import javax.annotation.Nullable;
 import java.net.InetAddress;
 import java.net.UnknownHostException;
 import java.time.LocalDateTime;
@@ -58,15 +60,13 @@ public class CrawlerRetreiver {
     private final SitemapRetriever sitemapRetriever;
     private final DomainCrawlFrontier crawlFrontier;
 
-    private final CrawlDataReference oldCrawlData;
-
     int errorCount = 0;
 
+    private String retainedTag = "RETAINED/304";
 
     public CrawlerRetreiver(HttpFetcher fetcher,
                             CrawlingSpecification specs,
                             Consumer<SerializableCrawlData> writer) {
         this.fetcher = fetcher;
-        this.oldCrawlData = new CrawlDataReference(specs.oldData);
 
         id = specs.id;
         domain = specs.domain;
@@ -97,10 +97,14 @@
     }
 
     public int fetch() {
+        return fetch(Collections.emptyIterator());
+    }
+
+    public int fetch(Iterator<SerializableCrawlData> oldCrawlData) {
         final DomainProber.ProbeResult probeResult = domainProber.probeDomain(fetcher, domain, crawlFrontier.peek());
 
         if (probeResult instanceof DomainProber.ProbeResultOk) {
-            return crawlDomain();
+            return crawlDomain(oldCrawlData);
         }
 
         // handle error cases for probe
@@ -137,44 +141,29 @@
         throw new IllegalStateException("Unknown probe result: " + probeResult);
     };
 
-    private int crawlDomain() {
+    private int crawlDomain(Iterator<SerializableCrawlData> oldCrawlData) {
         String ip = findIp(domain);
 
         assert !crawlFrontier.isEmpty();
 
         var robotsRules = fetcher.fetchRobotRules(crawlFrontier.peek().domain);
+        long crawlDelay = robotsRules.getCrawlDelay();
 
-        CrawlDataComparison comparison = compareWithOldData(robotsRules);
-        logger.info("Comparison result for {} : {}", domain, comparison);
+        sniffRootDocument();
 
-        // If we have reference data, we will always grow the crawl depth a bit
-        if (oldCrawlData.size() > 0) {
+        // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
+        int recrawled = recrawl(oldCrawlData, robotsRules, crawlDelay);
+
+        if (recrawled > 0) {
+            // If we have reference data, we will always grow the crawl depth a bit
             crawlFrontier.increaseDepth(1.5);
         }
 
-        // When the reference data doesn't appear to have changed, we'll forego
-        // re-fetching it and just use the old data
-        if (comparison == CrawlDataComparison.NO_CHANGES) {
-            oldCrawlData.allDocuments().forEach((url, doc) -> {
-                if (crawlFrontier.addVisited(url)) {
-                    doc.recrawlState = "RETAINED";
-                    crawledDomainWriter.accept(doc);
-                }
-            });
-
-            // We don't need to hold onto this in RAM anymore
-            oldCrawlData.evict();
-        }
-
-        downloadSitemaps(robotsRules);
-        sniffRootDocument();
-
-        long crawlDelay = robotsRules.getCrawlDelay();
         CrawledDomain ret = new CrawledDomain(id, domain, null, CrawlerDomainStatus.OK.name(), null, ip, new ArrayList<>(), null);
 
-        int fetchedCount = 0;
+        int fetchedCount = recrawled;
 
         while (!crawlFrontier.isEmpty()
             && !crawlFrontier.isCrawlDepthReached()
@@ -187,11 +176,6 @@
                 continue;
             }
 
-            // Don't re-fetch links that were previously found dead as it's very unlikely that a
-            // 404:ing link will suddenly start working at a later point
-            if (oldCrawlData.isPreviouslyDead(top))
-                continue;
-
             // Check the link filter if the endpoint should be fetched based on site-type
             if (!crawlFrontier.filterLink(top))
                 continue;
@@ -211,7 +195,7 @@
                 continue;
 
-            if (fetchDocument(top, crawlDelay).isPresent()) {
+            if (fetchDocument(top, null, crawlDelay).isPresent()) {
                 fetchedCount++;
             }
         }
@@ -223,63 +207,69 @@
         return fetchedCount;
     }
 
-    private CrawlDataComparison compareWithOldData(SimpleRobotRules robotsRules) {
+    private int recrawl(Iterator<SerializableCrawlData> oldCrawlData,
+                        SimpleRobotRules robotsRules,
+                        long crawlDelay) {
+        int recrawled = 0;
+        int retained = 0;
 
-        int numGoodDocuments = oldCrawlData.size();
+        while (oldCrawlData.hasNext()) {
+            if (!(oldCrawlData.next() instanceof CrawledDocument doc)) continue;
 
-        if (numGoodDocuments == 0)
-            return CrawlDataComparison.NO_OLD_DATA;
+            // This Shouldn't Happen (TM)
+            var urlMaybe = EdgeUrl.parse(doc.url);
+            if (urlMaybe.isEmpty()) continue;
+            var url = urlMaybe.get();
 
-        if (numGoodDocuments < 10)
-            return CrawlDataComparison.SMALL_SAMPLE;
-
-        // We fetch a sample of the data to assess how much it has changed
-        int sampleSize = (int) Math.min(20, 0.25 * numGoodDocuments);
-        Map<EdgeUrl, CrawledDocument> referenceUrls = oldCrawlData.sample(sampleSize);
-
-        int differences = 0;
-
-        long crawlDelay = robotsRules.getCrawlDelay();
-        for (var url : referenceUrls.keySet()) {
-
-            var docMaybe = fetchDocument(url, crawlDelay);
-            if (docMaybe.isEmpty()) {
-                differences++;
+            // If we've previously 404:d on this URL, we'll refrain from trying to fetch it again
+            if (doc.httpStatus == 404) {
+                crawlFrontier.addVisited(url);
                 continue;
             }
 
-            var newDoc = docMaybe.get();
-            var referenceDoc = referenceUrls.get(url);
+            if (doc.httpStatus != 200) continue;
 
-            // This looks like a bug but it is not, we want to compare references
-            // to detect if the page has bounced off etag or last-modified headers
-            // to avoid having to do a full content comparison
-            if (newDoc == referenceDoc)
+            if (!robotsRules.isAllowed(url.toString())) {
+                crawledDomainWriter.accept(createRobotsError(url));
+                continue;
+            }
+            if (!crawlFrontier.filterLink(url))
+                continue;
+            if (!crawlFrontier.addVisited(url))
                 continue;
 
-            if (newDoc.httpStatus != referenceDoc.httpStatus) {
-                differences++;
+
+            if (recrawled > 10
+             && retained > 0.9 * recrawled
+             && Math.random() < 0.75)
+            {
+                logger.info("Direct-loading {}", url);
+
+                // Since it looks like most of these documents haven't changed,
+                // we'll load the documents directly; but we do this in a random
+                // fashion to make sure we eventually catch changes over time
+
+                crawledDomainWriter.accept(doc);
+                crawlFrontier.addVisited(url);
                 continue;
             }
-            if (newDoc.documentBody == null) {
-                differences++;
-                continue;
+
+            // GET the document with the stored document as a reference
+            // providing etag and last-modified headers, so we can recycle the
+            // document if it hasn't changed without actually downloading it
+
+            var fetchedDocOpt = fetchDocument(url, doc, crawlDelay);
+            if (fetchedDocOpt.isEmpty()) continue;
+
+            if (Objects.equals(fetchedDocOpt.get().recrawlState, retainedTag)) {
+                retained ++;
             }
 
-            long referenceLsh = hashDoc(referenceDoc);
-            long newLsh = hashDoc(newDoc);
-
-            if (EasyLSH.hammingDistance(referenceLsh, newLsh) > 5) {
-                differences++;
-            }
-        }
-
-        if (differences > sampleSize/4) {
-            return CrawlDataComparison.CHANGES_FOUND;
-        }
-        else {
-            return CrawlDataComparison.NO_CHANGES;
+            recrawled ++;
         }
+
+        return recrawled;
     }
 
     private static final HashFunction hasher = Hashing.murmur3_128(0);
@@ -346,7 +336,7 @@
 
         var url = crawlFrontier.peek().withPathAndParam("/", null);
 
-        var maybeSample = fetchUrl(url).filter(sample -> sample.httpStatus == 200);
+        var maybeSample = fetchUrl(url, null).filter(sample -> sample.httpStatus == 200);
         if (maybeSample.isEmpty())
             return;
 
         var sample = maybeSample.get();
@@ -382,23 +372,21 @@
         }
     }
 
-    private Optional<CrawledDocument> fetchDocument(EdgeUrl top, long crawlDelay) {
+    private Optional<CrawledDocument> fetchDocument(EdgeUrl top,
+                                                    @Nullable CrawledDocument reference,
+                                                    long crawlDelay) {
         logger.debug("Fetching {}", top);
 
         long startTime = System.currentTimeMillis();
 
-        var doc = fetchUrl(top);
+        var doc = fetchUrl(top, reference);
         if (doc.isPresent()) {
             var d = doc.get();
             crawledDomainWriter.accept(d);
-            oldCrawlData.dispose(top);
 
             if (d.url != null) {
                 // We may have redirected to a different path
-                EdgeUrl.parse(d.url).ifPresent(url -> {
-                    crawlFrontier.addVisited(url);
-                    oldCrawlData.dispose(url);
-                });
+                EdgeUrl.parse(d.url).ifPresent(crawlFrontier::addVisited);
             }
 
             if ("ERROR".equals(d.crawlerStatus) && d.httpStatus != 404) {
@@ -418,14 +406,31 @@
                 || proto.equalsIgnoreCase("https");
     }
 
-    private Optional<CrawledDocument> fetchUrl(EdgeUrl top) {
+    private Optional<CrawledDocument> fetchUrl(EdgeUrl top, @Nullable CrawledDocument reference) {
         try {
-            var doc = fetchContent(top);
+            var contentTags = getContentTags(reference);
+            var fetchedDoc = fetchContent(top, contentTags);
+            CrawledDocument doc;
+
+            // HTTP status 304 is NOT MODIFIED, which means the document is the same as it was when
+            // we fetched it last time. We can recycle the reference document.
+            if (reference != null
+             && fetchedDoc.httpStatus == 304)
+            {
+                doc = reference;
+                doc.recrawlState = retainedTag;
+                doc.timestamp = LocalDateTime.now().toString();
+            }
+            else {
+                doc = fetchedDoc;
+            }
 
             if (doc.documentBody != null) {
-                doc.documentBodyHash = createHash(doc.documentBody.decode());
+                var decoded = doc.documentBody.decode();
 
-                Optional<Document> parsedDoc = parseDoc(doc);
+                doc.documentBodyHash = createHash(decoded);
+
+                Optional<Document> parsedDoc = parseDoc(decoded);
                 EdgeUrl url = new EdgeUrl(doc.url);
 
                 parsedDoc.ifPresent(parsed -> findLinks(url, parsed));
@@ -443,23 +448,37 @@
 
     }
 
+    private ContentTags getContentTags(@Nullable CrawledDocument reference) {
+        if (null == reference)
+            return ContentTags.empty();
+
+        String headers = reference.headers;
+        if (headers == null)
+            return ContentTags.empty();
+
+        String[] headersLines = headers.split("\n");
+
+        String lastmod = null;
+        String etag = null;
+
+        for (String line : headersLines) {
+            if (line.toLowerCase().startsWith("etag:")) {
+                etag = line.substring(5).trim();
+            }
+            if (line.toLowerCase().startsWith("last-modified:")) {
+                lastmod = line.substring(14).trim();
+            }
+        }
+
+        return new ContentTags(etag, lastmod);
+    }
+
     @SneakyThrows
-    private CrawledDocument fetchContent(EdgeUrl top) {
+    private CrawledDocument fetchContent(EdgeUrl top, ContentTags tags) {
         for (int i = 0; i < 2; i++) {
             try {
-                var doc = fetcher.fetchContent(top, oldCrawlData.getEtag(top), oldCrawlData.getLastModified(top));
-
+                var doc = fetcher.fetchContent(top, tags);
                 doc.recrawlState = "NEW";
-
-                if (doc.httpStatus == 304) {
-                    var referenceData = oldCrawlData.getDoc(top);
-                    if (referenceData != null) {
-                        referenceData.recrawlState = "304/UNCHANGED";
-                        return referenceData;
-                    }
-                }
-
                 return doc;
             }
             catch (RateLimitException ex) {
@@ -478,10 +497,8 @@
         return hashMethod.hashUnencodedChars(documentBodyHash).toString();
     }
 
-    private Optional<Document> parseDoc(CrawledDocument doc) {
-        if (doc.documentBody == null)
-            return Optional.empty();
-        return Optional.of(Jsoup.parse(doc.documentBody.decode()));
+    private Optional<Document> parseDoc(String decoded) {
+        return Optional.of(Jsoup.parse(decoded));
     }
 
     private void findLinks(EdgeUrl baseUrl, Document parsed) {
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/ContentTags.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/ContentTags.java
new file mode 100644
index 00000000..e1df86c8
--- /dev/null
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/ContentTags.java
@@ -0,0 +1,24 @@
+package nu.marginalia.crawl.retreival.fetcher;
+
+import okhttp3.Request;
+
+/** Encapsulates request modifiers; the ETag and Last-Modified tags for a resource */
+public record ContentTags(String etag, String lastMod) {
+    public static ContentTags empty() {
+        return new ContentTags(null, null);
+    }
+
+    public boolean isPresent() {
+        return etag != null || lastMod != null;
+    }
+
+    public boolean isEmpty() {
+        return etag == null && lastMod == null;
+    }
+
+    /** Paints the tags onto the request builder. */
+    public void paint(Request.Builder getBuilder) {
+        if (etag != null) getBuilder.addHeader("If-None-Match", etag);
+        if (lastMod != null) getBuilder.addHeader("If-Modified-Since", lastMod);
+    }
+}
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java
index 7f588783..11ad272e 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java
@@ -18,7 +18,7 @@ public interface HttpFetcher {
 
     FetchResult probeDomain(EdgeUrl url);
 
-    CrawledDocument fetchContent(EdgeUrl url, String etag, String lastMod) throws RateLimitException;
+    CrawledDocument fetchContent(EdgeUrl url, ContentTags tags) throws RateLimitException;
 
     SimpleRobotRules fetchRobotRules(EdgeDomain domain);
 
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java
index 36c8bd34..be6a6a06 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java
@@ -128,9 +128,15 @@ public class HttpFetcherImpl implements HttpFetcher {
 
     @Override
     @SneakyThrows
-    public CrawledDocument fetchContent(EdgeUrl url, String etag, String lastMod) throws RateLimitException {
+    public CrawledDocument fetchContent(EdgeUrl url,
+                                        ContentTags contentTags)
+            throws RateLimitException
+    {
 
-        if (contentTypeLogic.isUrlLikeBinary(url)) {
+        // We don't want to waste time and resources on URLs that are not HTML, so if the file ending
+        // looks like it might be something else, we perform a HEAD first to check the content type
+        if (contentTags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url))
+        {
             logger.debug("Probing suspected binary {}", url);
 
             var headBuilder = new Request.Builder().head()
@@ -146,6 +152,21 @@
             if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) {
                 return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, "Early probe failed");
             }
+
+            // Update the URL to the final URL of the HEAD request, otherwise we might end up doing
+
+            //  HEAD 301 url1 -> url2
+            //  HEAD 200 url2
+            //  GET 301 url1 -> url2
+            //  GET 200 url2
+
+            // which is not what we want. Overall we want to do as few requests as possible to not raise
+            // too many eyebrows when looking at the logs on the target server. Overall it's probably desirable
+            // that it looks like the traffic makes sense, as opposed to looking like a broken bot.
+
+            var redirectUrl = new EdgeUrl(rsp.request().url().toString());
+            if (Objects.equals(redirectUrl.domain, url.domain))
+                url = redirectUrl;
         }
         catch (SocketTimeoutException ex) {
             return createTimeoutErrorRsp(url, ex);
@@ -157,12 +178,12 @@
         }
 
         var getBuilder = new Request.Builder().get();
+
         getBuilder.addHeader("User-agent", userAgent)
                 .url(url.toString())
                 .addHeader("Accept-Encoding", "gzip");
 
-        if (etag != null) getBuilder.addHeader("If-None-Match", etag);
-        if (lastMod != null) getBuilder.addHeader("If-Modified-Since", lastMod);
+        contentTags.paint(getBuilder);
 
         var get = getBuilder.build();
         var call = client.newCall(get);
@@ -314,7 +335,7 @@
    private Optional<SimpleRobotRules> fetchRobotsForProto(String proto, EdgeDomain domain) {
         try {
             var url = new EdgeUrl(proto, domain, null, "/robots.txt", null);
-            return Optional.of(parseRobotsTxt(fetchContent(url, null, null)));
+            return Optional.of(parseRobotsTxt(fetchContent(url, ContentTags.empty())));
         }
         catch (Exception ex) {
             return Optional.empty();
diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java
index 2ea9c763..5893910f 100644
--- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java
+++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java
@@ -2,6 +2,7 @@ package nu.marginalia.crawling;
 
 import lombok.SneakyThrows;
 import nu.marginalia.crawl.retreival.RateLimitException;
+import nu.marginalia.crawl.retreival.fetcher.ContentTags;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.retreival.logic.ContentTypeLogic;
 import nu.marginalia.model.EdgeUrl;
@@ -29,14 +30,14 @@ class HttpFetcherTest {
     @Test
     void fetchUTF8() throws URISyntaxException, RateLimitException {
         var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
-        var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), null, null);
+        var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), ContentTags.empty());
         System.out.println(str.contentType);
     }
 
     @Test
     void fetchText() throws URISyntaxException, RateLimitException {
         var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
-        var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), null, null);
+        var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), ContentTags.empty());
         System.out.println(str);
     }
 }
\ No newline at end of file
diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java
index f580a123..59e3c45e 100644
--- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java
+++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java
@@ -4,10 +4,7 @@ import crawlercommons.robots.SimpleRobotRules;
 import lombok.SneakyThrows;
 import nu.marginalia.bigstring.BigString;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
-import nu.marginalia.crawl.retreival.fetcher.FetchResult;
-import nu.marginalia.crawl.retreival.fetcher.FetchResultState;
-import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
-import nu.marginalia.crawl.retreival.fetcher.SitemapRetriever;
+import nu.marginalia.crawl.retreival.fetcher.*;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawlerDocumentStatus;
 import nu.marginalia.crawling.model.SerializableCrawlData;
@@ -126,7 +123,7 @@ public class CrawlerMockFetcherTest {
         }
 
         @Override
-        public CrawledDocument fetchContent(EdgeUrl url, String etag, String lastModified) {
+        public CrawledDocument fetchContent(EdgeUrl url, ContentTags tags) {
             logger.info("Fetching {}", url);
             if (mockData.containsKey(url)) {
                 return mockData.get(url);
diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
index 741c8704..009e9084 100644
--- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
+++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
@@ -5,12 +5,18 @@ import nu.marginalia.WmsaHome;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
+import nu.marginalia.crawling.io.CrawledDomainReader;
+import nu.marginalia.crawling.io.CrawledDomainWriter;
+import nu.marginalia.crawling.io.CrawlerOutputFile;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawledDomain;
 import nu.marginalia.crawling.model.spec.CrawlingSpecification;
 import nu.marginalia.crawling.model.SerializableCrawlData;
 import org.junit.jupiter.api.*;
 
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
@@ -99,7 +105,7 @@ class CrawlerRetreiverTest {
     }
 
     @Test
-    public void testRecrawl() {
+    public void testRecrawl() throws IOException {
 
         var specs = CrawlingSpecification
                 .builder()
@@ -110,6 +116,8 @@
                 .build();
 
+        Path out = Files.createTempDirectory("crawling-process");
+        var writer = new CrawledDomainWriter(out, "test", "123456");
         Map<Class<? extends SerializableCrawlData>, List<SerializableCrawlData>> data = new HashMap<>();
 
         new CrawlerRetreiver(httpFetcher, specs, d -> {
@@ -117,7 +125,12 @@
             if (d instanceof CrawledDocument doc) {
                 System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus);
             }
+            writer.accept(d);
         }).fetch();
+        writer.close();
+
+        var reader = new CrawledDomainReader();
+        var iter = reader.createIterator(CrawlerOutputFile.getOutputFile(out, "123456", "test"));
 
         CrawledDomain domain = (CrawledDomain) data.get(CrawledDomain.class).get(0);
         domain.doc = data.get(CrawledDocument.class).stream().map(CrawledDocument.class::cast).collect(Collectors.toList());
@@ -128,6 +141,7 @@
             if (d instanceof CrawledDocument doc) {
                 System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus);
             }
-        }).fetch();
+        }).fetch(iter);
+
     }
 }
\ No newline at end of file
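
Supplementary sketch: streaming the previous crawl instead of loading it into RAM. The patch replaces the read-everything path with CrawledDomainReader.createIterator(), which decompresses the zstd stream and yields one SerializableCrawlData record at a time. Below is a minimal sketch of the intended consumption pattern; the reader and model classes come from the patch, while the ReplaySketch class, its main method and the argument handling are hypothetical.

package nu.marginalia.sketch; // hypothetical package, illustration only

import nu.marginalia.crawling.io.CrawledDomainReader;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.crawling.model.SerializableCrawlData;

import java.io.IOException;
import java.nio.file.Path;
import java.util.Iterator;

public class ReplaySketch {
    public static void main(String[] args) throws IOException {
        // Path to a previous crawl's output file, passed on the command line
        Path oldCrawlFile = Path.of(args[0]);

        var reader = new CrawledDomainReader();
        Iterator<SerializableCrawlData> it = reader.createIterator(oldCrawlFile);

        // hasNext() reads the identifier and data lines from the stream and closes it at EOF,
        // so each hasNext() call should be followed by exactly one next() call, as here
        // and as in CrawlerRetreiver.recrawl().
        while (it.hasNext()) {
            SerializableCrawlData record = it.next();

            if (record instanceof CrawledDocument doc) {
                System.out.println(doc.url + " -> " + doc.httpStatus);
            }
            else if (record instanceof CrawledDomain domain) {
                System.out.println("domain: " + domain.domain);
            }
        }
    }
}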
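
Supplementary sketch: what ContentTags does on the wire. paint() adds If-None-Match and If-Modified-Since headers built from the stored document's ETag and Last-Modified values, and a 304 Not Modified response then lets CrawlerRetreiver.fetchUrl() recycle the stored document under the RETAINED/304 tag instead of downloading the body again. The sketch below shows the bare HTTP exchange with OkHttp, independent of the crawler classes; the URL and header values are placeholders.

import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;

import java.io.IOException;

class ConditionalGetSketch {
    static void demo() throws IOException {
        OkHttpClient client = new OkHttpClient();

        // Values that would normally be parsed out of the stored CrawledDocument's headers
        String etag = "\"abc123\"";                            // placeholder
        String lastModified = "Fri, 21 Jul 2023 00:00:00 GMT"; // placeholder

        var getBuilder = new Request.Builder().get()
                .url("https://www.example.com/")               // placeholder URL
                .addHeader("Accept-Encoding", "gzip");

        // Equivalent to ContentTags.paint(getBuilder) in the patch
        if (etag != null) getBuilder.addHeader("If-None-Match", etag);
        if (lastModified != null) getBuilder.addHeader("If-Modified-Since", lastModified);

        try (Response rsp = client.newCall(getBuilder.build()).execute()) {
            if (rsp.code() == 304) {
                // Not modified: recycle the stored document and tag it RETAINED/304
            }
            else {
                // Modified (or no conditional support): treat rsp.body() as a fresh fetch
            }
        }
    }
}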
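
Supplementary sketch: the direct-load heuristic in recrawl(). Once more than ten old documents have been replayed and over 90% of the conditional fetches came back unchanged, the patch writes the remaining stored documents straight to the output with 75% probability, so roughly a quarter of them are still re-validated against the server and content drift is eventually caught. Restated as a standalone predicate, with the thresholds copied from the patch; the class and method names are illustrative.

class DirectLoadHeuristicSketch {
    /** True when a stored document may be emitted without re-fetching it;
     *  the thresholds (10 documents, 90% retained, 75% probability) mirror recrawl(). */
    static boolean mayDirectLoad(int recrawled, int retained) {
        return recrawled > 10
                && retained > 0.9 * recrawled
                && Math.random() < 0.75;
    }
}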