Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-23 21:18:58 +00:00)
(crawler) Don't read all the data into RAM when doing a refresh-crawl
Commit 58f2f86ea8 (parent 7bc1cff286)
CrawledDomainReader.java
@@ -2,8 +2,10 @@ package nu.marginalia.crawling.io;
import com.github.luben.zstd.ZstdInputStream;
import com.google.gson.Gson;
import lombok.SneakyThrows;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.crawling.model.SerializableCrawlData;
import nu.marginalia.model.gson.GsonFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -14,6 +16,7 @@ import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.ForkJoinPool;
@@ -27,6 +30,44 @@ public class CrawledDomainReader {
    public CrawledDomainReader() {
    }

    public Iterator<SerializableCrawlData> createIterator(Path path) throws IOException {
        BufferedReader br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(path.toFile()))));

        return new Iterator<>() {
            SerializableCrawlData next;

            @Override
            @SneakyThrows
            public boolean hasNext() {
                String identifier = br.readLine();
                if (identifier == null) {
                    br.close();
                    return false;
                }
                String data = br.readLine();
                if (data == null) {
                    br.close();
                    return false;
                }

                if (identifier.equals(CrawledDomain.SERIAL_IDENTIFIER)) {
                    next = gson.fromJson(data, CrawledDomain.class);
                } else if (identifier.equals(CrawledDocument.SERIAL_IDENTIFIER)) {
                    next = gson.fromJson(data, CrawledDocument.class);
                }
                else {
                    throw new IllegalStateException("Unknown identifier: " + identifier);
                }
                return true;
            }

            @Override
            public SerializableCrawlData next() {
                return next;
            }
        };
    }

    public CrawledDomain read(Path path) throws IOException {
        DomainDataAssembler domainData = new DomainDataAssembler();
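A minimal usage sketch (not part of the commit; the file path and the loop are invented for illustration) of how a caller can consume the new streaming reader without holding a whole domain's crawl data in RAM:

    // Hypothetical caller: records are deserialized one at a time and can be
    // garbage-collected as soon as they have been processed.
    var reader = new CrawledDomainReader();
    Iterator<SerializableCrawlData> it = reader.createIterator(Path.of("crawl-data/example.zstd")); // invented path
    while (it.hasNext()) {
        SerializableCrawlData record = it.next();
        if (record instanceof CrawledDocument doc) {
            System.out.println(doc.url);
        }
    }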
CrawlLimiter.java
@@ -8,65 +8,22 @@ import java.util.concurrent.Semaphore;
public class CrawlLimiter {
    public static final int maxPoolSize = Integer.getInteger("crawler.pool-size", 512);

    // We'll round up to this size when we're crawling a new domain to prevent
    // too many concurrent connections
    public static final int minCrawlDataSizeKb = 128; // 128 Kb

    // The largest size on disk where we'll permit a refresh crawl
    // (these files easily grow into the gigabytes, we don't want that in RAM)
    public static final int maxRefreshableCrawlDataSizeKBytes = 1024*128; // 128 Mb

    // This limits how many concurrent crawl tasks we can have running at once
    // based on their size on disk. The on-disk size is compressed, and the
    // in-ram size is partially compressed (i.e. only the document body); so
    // maybe a fair estimate is something like 2-4x this figure for RAM usage
    //
    public static final int maxConcurrentCrawlTaskSizeKb = 512*1024; // 512 Mb

    static {
        // Sanity check; if this is false we'll get a deadlock on taskSemRAM
        assert maxConcurrentCrawlTaskSizeKb >= maxRefreshableCrawlDataSizeKBytes
                : "maxConcurrentCrawlTaskSizeKb must not be smaller than maxRefreshableCrawlDataSizeKBytes";
    }

    public record CrawlTaskLimits(Path refreshPath, boolean isRefreshable, int taskSize) {}

    // We use two semaphores to keep track of the number of concurrent crawls;
    // first a RAM semaphore to limit the amount of RAM used by refresh crawls,
    // then a count semaphore to limit the number of concurrent threads (this keeps the connection count manageable)
    private final Semaphore taskSemRAM = new Semaphore(maxConcurrentCrawlTaskSizeKb);
    private final Semaphore taskSemCount = new Semaphore(maxPoolSize);

    public CrawlTaskLimits getTaskLimits(Path fileName) {
        long size;

        try {
            size = Math.max(minCrawlDataSizeKb, Files.size(fileName) / 1024);
        } catch (IOException ex) {
            // If we can't read the file, we'll assume it's small since we won't be able to read it later for the refresh either
            return new CrawlTaskLimits(null, false, minCrawlDataSizeKb);
        }

        // We'll only permit refresh crawls if the file is small enough
        boolean isRefreshable = size < maxRefreshableCrawlDataSizeKBytes;

        // We'll truncate this down to maxRefreshableCrawlDataSizeKBytes to ensure
        // it's possible to acquire the RAM semaphore
        int effectiveSize = (int) Math.min(maxRefreshableCrawlDataSizeKBytes, size);

        return new CrawlTaskLimits(fileName, isRefreshable, effectiveSize);
        return new CrawlTaskLimits(fileName, true, 1);
    }

    public void acquire(CrawlTaskLimits properties) throws InterruptedException {
        // It's very important that we acquire the RAM semaphore first to avoid a deadlock
        taskSemRAM.acquire(properties.taskSize);
        taskSemCount.acquire(1);
    }

    public void release(CrawlTaskLimits properties) {
        taskSemCount.release(1);
        taskSemRAM.release(properties.taskSize);
    }
}
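A sketch of the intended calling pattern (the surrounding method and the crawl-data path are invented for illustration): permits are acquired before a crawl task starts and released in a finally block so they cannot leak.

    // Hypothetical caller of CrawlLimiter.
    void runCrawlTask(CrawlLimiter limiter, Path crawlDataFile) throws InterruptedException {
        var limits = limiter.getTaskLimits(crawlDataFile);

        limiter.acquire(limits);   // blocks until RAM and thread-count permits are available
        try {
            // ... crawl the domain, streaming results to disk as they arrive ...
        } finally {
            limiter.release(limits);  // released in the reverse order of acquisition
        }
    }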
CrawlerMain.java
@@ -10,6 +10,7 @@ import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawling.io.CrawledDomainReader;
import nu.marginalia.crawling.io.CrawlerOutputFile;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.crawling.model.SerializableCrawlData;
import nu.marginalia.db.storage.FileStorageService;
import nu.marginalia.mq.MessageQueueFactory;
import nu.marginalia.mq.MqMessage;
@@ -32,10 +33,7 @@ import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.HashSet;
import java.util.Optional;
import java.util.Set;
import java.util.UUID;
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;

@@ -201,19 +199,23 @@ public class CrawlerMain implements AutoCloseable {

            HttpFetcher fetcher = new HttpFetcherImpl(userAgent.uaString(), dispatcher, connectionPool);

            // Read the previous crawl's data for this domain, if it exists and has a reasonable size
            Optional<CrawledDomain> domain;
            if (limits.isRefreshable()) {
                domain = reader.readOptionally(limits.refreshPath());
                if (domain.isPresent()) {
                    specification = specification.withOldData(domain.get());
            Iterator<SerializableCrawlData> iterator;
            try {
                if (limits.isRefreshable()) {
                    iterator = reader.createIterator(limits.refreshPath());
                }
                else {
                    iterator = Collections.emptyIterator();
                }
            } catch (IOException e) {
                logger.warn("Failed to read previous crawl data for {}", specification.domain);
                iterator = Collections.emptyIterator();
            }

            try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id)) {
                var retreiver = new CrawlerRetreiver(fetcher, specification, writer::accept);

                int size = retreiver.fetch();
                int size = retreiver.fetch(iterator);

                workLog.setJobToFinished(specification.id, writer.getOutputFile().toString(), size);
CrawlerRetreiver.java
@@ -4,6 +4,7 @@ import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import crawlercommons.robots.SimpleRobotRules;
import lombok.SneakyThrows;
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawl.retreival.fetcher.SitemapRetriever;
import nu.marginalia.crawling.model.spec.CrawlingSpecification;
@@ -18,6 +19,7 @@ import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.time.LocalDateTime;
@@ -58,15 +60,13 @@ public class CrawlerRetreiver {
    private final SitemapRetriever sitemapRetriever;
    private final DomainCrawlFrontier crawlFrontier;

    private final CrawlDataReference oldCrawlData;

    int errorCount = 0;
    private String retainedTag = "RETAINED/304";

    public CrawlerRetreiver(HttpFetcher fetcher,
                            CrawlingSpecification specs,
                            Consumer<SerializableCrawlData> writer) {
        this.fetcher = fetcher;
        this.oldCrawlData = new CrawlDataReference(specs.oldData);

        id = specs.id;
        domain = specs.domain;
@@ -97,10 +97,14 @@ public class CrawlerRetreiver {
    }

    public int fetch() {
        return fetch(Collections.emptyIterator());
    }

    public int fetch(Iterator<SerializableCrawlData> oldCrawlData) {
        final DomainProber.ProbeResult probeResult = domainProber.probeDomain(fetcher, domain, crawlFrontier.peek());

        if (probeResult instanceof DomainProber.ProbeResultOk) {
            return crawlDomain();
            return crawlDomain(oldCrawlData);
        }

        // handle error cases for probe
@@ -137,44 +141,29 @@ public class CrawlerRetreiver {
        throw new IllegalStateException("Unknown probe result: " + probeResult);
    };

    private int crawlDomain() {
    private int crawlDomain(Iterator<SerializableCrawlData> oldCrawlData) {
        String ip = findIp(domain);

        assert !crawlFrontier.isEmpty();

        var robotsRules = fetcher.fetchRobotRules(crawlFrontier.peek().domain);
        long crawlDelay = robotsRules.getCrawlDelay();

        CrawlDataComparison comparison = compareWithOldData(robotsRules);
        logger.info("Comparison result for {} : {}", domain, comparison);
        sniffRootDocument();

        // If we have reference data, we will always grow the crawl depth a bit
        if (oldCrawlData.size() > 0) {
        // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
        int recrawled = recrawl(oldCrawlData, robotsRules, crawlDelay);

        if (recrawled > 0) {
            // If we have reference data, we will always grow the crawl depth a bit
            crawlFrontier.increaseDepth(1.5);
        }

        // When the reference data doesn't appear to have changed, we'll forego
        // re-fetching it and just use the old data
        if (comparison == CrawlDataComparison.NO_CHANGES) {
            oldCrawlData.allDocuments().forEach((url, doc) -> {
                if (crawlFrontier.addVisited(url)) {
                    doc.recrawlState = "RETAINED";
                    crawledDomainWriter.accept(doc);
                }
            });

            // We don't need to hold onto this in RAM anymore
            oldCrawlData.evict();
        }

        downloadSitemaps(robotsRules);
        sniffRootDocument();

        long crawlDelay = robotsRules.getCrawlDelay();

        CrawledDomain ret = new CrawledDomain(id, domain, null, CrawlerDomainStatus.OK.name(), null, ip, new ArrayList<>(), null);

        int fetchedCount = 0;
        int fetchedCount = recrawled;

        while (!crawlFrontier.isEmpty()
            && !crawlFrontier.isCrawlDepthReached()
@@ -187,11 +176,6 @@ public class CrawlerRetreiver {
                continue;
            }

            // Don't re-fetch links that were previously found dead as it's very unlikely that a
            // 404:ing link will suddenly start working at a later point
            if (oldCrawlData.isPreviouslyDead(top))
                continue;

            // Check the link filter if the endpoint should be fetched based on site-type
            if (!crawlFrontier.filterLink(top))
                continue;
@@ -211,7 +195,7 @@ public class CrawlerRetreiver {
                continue;

            if (fetchDocument(top, crawlDelay).isPresent()) {
            if (fetchDocument(top, null, crawlDelay).isPresent()) {
                fetchedCount++;
            }
        }
@@ -223,63 +207,69 @@ public class CrawlerRetreiver {
        return fetchedCount;
    }

    private CrawlDataComparison compareWithOldData(SimpleRobotRules robotsRules) {
    private int recrawl(Iterator<SerializableCrawlData> oldCrawlData,
                        SimpleRobotRules robotsRules,
                        long crawlDelay) {
        int recrawled = 0;
        int retained = 0;

        int numGoodDocuments = oldCrawlData.size();
        while (oldCrawlData.hasNext()) {
            if (!(oldCrawlData.next() instanceof CrawledDocument doc)) continue;

        if (numGoodDocuments == 0)
            return CrawlDataComparison.NO_OLD_DATA;
            // This Shouldn't Happen (TM)
            var urlMaybe = EdgeUrl.parse(doc.url);
            if (urlMaybe.isEmpty()) continue;
            var url = urlMaybe.get();

        if (numGoodDocuments < 10)
            return CrawlDataComparison.SMALL_SAMPLE;

        // We fetch a sample of the data to assess how much it has changed
        int sampleSize = (int) Math.min(20, 0.25 * numGoodDocuments);
        Map<EdgeUrl, CrawledDocument> referenceUrls = oldCrawlData.sample(sampleSize);

        int differences = 0;

        long crawlDelay = robotsRules.getCrawlDelay();
        for (var url : referenceUrls.keySet()) {

            var docMaybe = fetchDocument(url, crawlDelay);
            if (docMaybe.isEmpty()) {
                differences++;
            // If we've previously 404:d on this URL, we'll refrain from trying to fetch it again
            if (doc.httpStatus == 404) {
                crawlFrontier.addVisited(url);
                continue;
            }

            var newDoc = docMaybe.get();
            var referenceDoc = referenceUrls.get(url);
            if (doc.httpStatus != 200) continue;

            // This looks like a bug but it is not, we want to compare references
            // to detect if the page has bounced off etag or last-modified headers
            // to avoid having to do a full content comparison
            if (newDoc == referenceDoc)
            if (!robotsRules.isAllowed(url.toString())) {
                crawledDomainWriter.accept(createRobotsError(url));
                continue;
            }
            if (!crawlFrontier.filterLink(url))
                continue;
            if (!crawlFrontier.addVisited(url))
                continue;

            if (newDoc.httpStatus != referenceDoc.httpStatus) {
                differences++;

            if (recrawled > 10
                && retained > 0.9 * recrawled
                && Math.random() < 0.75)
            {
                logger.info("Direct-loading {}", url);

                // Since it looks like most of these documents haven't changed,
                // we'll load the documents directly; but we do this in a random
                // fashion to make sure we eventually catch changes over time

                crawledDomainWriter.accept(doc);
                crawlFrontier.addVisited(url);
                continue;
            }

            if (newDoc.documentBody == null) {
                differences++;
                continue;

            // GET the document with the stored document as a reference
            // providing etag and last-modified headers, so we can recycle the
            // document if it hasn't changed without actually downloading it

            var fetchedDocOpt = fetchDocument(url, doc, crawlDelay);
            if (fetchedDocOpt.isEmpty()) continue;

            if (Objects.equals(fetchedDocOpt.get().recrawlState, retainedTag)) {
                retained ++;
            }

            long referenceLsh = hashDoc(referenceDoc);
            long newLsh = hashDoc(newDoc);

            if (EasyLSH.hammingDistance(referenceLsh, newLsh) > 5) {
                differences++;
            }
        }
        if (differences > sampleSize/4) {
            return CrawlDataComparison.CHANGES_FOUND;
        }
        else {
            return CrawlDataComparison.NO_CHANGES;
            recrawled ++;
        }

        return recrawled;
    }

    private static final HashFunction hasher = Hashing.murmur3_128(0);
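The direct-load branch in recrawl() above is probabilistic. Restated as a standalone predicate (a hypothetical helper, not part of the commit), the decision reads:

    // Once more than ten documents have been re-checked and ~90% of them came
    // back unchanged (304), copy roughly three out of four remaining old
    // documents forward directly, and still re-fetch the rest so that changes
    // are eventually noticed.
    static boolean shouldDirectLoad(int recrawled, int retained) {
        return recrawled > 10
            && retained > 0.9 * recrawled
            && Math.random() < 0.75;
    }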
@@ -346,7 +336,7 @@ public class CrawlerRetreiver {

        var url = crawlFrontier.peek().withPathAndParam("/", null);

        var maybeSample = fetchUrl(url).filter(sample -> sample.httpStatus == 200);
        var maybeSample = fetchUrl(url, null).filter(sample -> sample.httpStatus == 200);
        if (maybeSample.isEmpty())
            return;
        var sample = maybeSample.get();
@@ -382,23 +372,21 @@ public class CrawlerRetreiver {
        }
    }

    private Optional<CrawledDocument> fetchDocument(EdgeUrl top, long crawlDelay) {
    private Optional<CrawledDocument> fetchDocument(EdgeUrl top,
                                                    @Nullable CrawledDocument reference,
                                                    long crawlDelay) {
        logger.debug("Fetching {}", top);

        long startTime = System.currentTimeMillis();

        var doc = fetchUrl(top);
        var doc = fetchUrl(top, reference);
        if (doc.isPresent()) {
            var d = doc.get();
            crawledDomainWriter.accept(d);
            oldCrawlData.dispose(top);

            if (d.url != null) {
                // We may have redirected to a different path
                EdgeUrl.parse(d.url).ifPresent(url -> {
                    crawlFrontier.addVisited(url);
                    oldCrawlData.dispose(url);
                });
                EdgeUrl.parse(d.url).ifPresent(crawlFrontier::addVisited);
            }

            if ("ERROR".equals(d.crawlerStatus) && d.httpStatus != 404) {
@@ -418,14 +406,31 @@ public class CrawlerRetreiver {
            || proto.equalsIgnoreCase("https");
    }

    private Optional<CrawledDocument> fetchUrl(EdgeUrl top) {
    private Optional<CrawledDocument> fetchUrl(EdgeUrl top, @Nullable CrawledDocument reference) {
        try {
            var doc = fetchContent(top);
            var contentTags = getContentTags(reference);
            var fetchedDoc = fetchContent(top, contentTags);
            CrawledDocument doc;

            // HTTP status 304 is NOT MODIFIED, which means the document is the same as it was when
            // we fetched it last time. We can recycle the reference document.
            if (reference != null
                && fetchedDoc.httpStatus == 304)
            {
                doc = reference;
                doc.recrawlState = retainedTag;
                doc.timestamp = LocalDateTime.now().toString();
            }
            else {
                doc = fetchedDoc;
            }

            if (doc.documentBody != null) {
                doc.documentBodyHash = createHash(doc.documentBody.decode());
                var decoded = doc.documentBody.decode();

                Optional<Document> parsedDoc = parseDoc(doc);
                doc.documentBodyHash = createHash(decoded);

                Optional<Document> parsedDoc = parseDoc(decoded);
                EdgeUrl url = new EdgeUrl(doc.url);

                parsedDoc.ifPresent(parsed -> findLinks(url, parsed));
@@ -443,23 +448,37 @@ public class CrawlerRetreiver {

    }

    private ContentTags getContentTags(@Nullable CrawledDocument reference) {
        if (null == reference)
            return ContentTags.empty();

        String headers = reference.headers;
        if (headers == null)
            return ContentTags.empty();

        String[] headersLines = headers.split("\n");

        String lastmod = null;
        String etag = null;

        for (String line : headersLines) {
            if (line.toLowerCase().startsWith("etag:")) {
                etag = line.substring(5).trim();
            }
            if (line.toLowerCase().startsWith("last-modified:")) {
                lastmod = line.substring(14).trim();
            }
        }

        return new ContentTags(etag, lastmod);
    }

    @SneakyThrows
    private CrawledDocument fetchContent(EdgeUrl top) {
    private CrawledDocument fetchContent(EdgeUrl top, ContentTags tags) {
        for (int i = 0; i < 2; i++) {
            try {
                var doc = fetcher.fetchContent(top, oldCrawlData.getEtag(top), oldCrawlData.getLastModified(top));

                var doc = fetcher.fetchContent(top, tags);
                doc.recrawlState = "NEW";

                if (doc.httpStatus == 304) {
                    var referenceData = oldCrawlData.getDoc(top);
                    if (referenceData != null) {
                        referenceData.recrawlState = "304/UNCHANGED";
                        return referenceData;
                    }
                }

                return doc;
            }
            catch (RateLimitException ex) {
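To make the behaviour of getContentTags() above concrete, a hypothetical input/output example (the header block below is invented sample data): header names are matched case-insensitively and values are trimmed as-is, so an ETag keeps its surrounding quotes.

    // Invented stored header blob, as it might appear on a CrawledDocument:
    String headers = """
            Content-Type: text/html
            ETag: "abc123"
            Last-Modified: Mon, 01 May 2023 00:00:00 GMT
            """;
    // Per the parsing loop above, getContentTags would yield:
    //   etag    -> "abc123"   (quotes retained, ready to echo back in If-None-Match)
    //   lastMod -> Mon, 01 May 2023 00:00:00 GMT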
@@ -478,10 +497,8 @@ public class CrawlerRetreiver {
        return hashMethod.hashUnencodedChars(documentBodyHash).toString();
    }

    private Optional<Document> parseDoc(CrawledDocument doc) {
        if (doc.documentBody == null)
            return Optional.empty();
        return Optional.of(Jsoup.parse(doc.documentBody.decode()));
    private Optional<Document> parseDoc(String decoded) {
        return Optional.of(Jsoup.parse(decoded));
    }

    private void findLinks(EdgeUrl baseUrl, Document parsed) {
ContentTags.java (new file)
@@ -0,0 +1,24 @@
package nu.marginalia.crawl.retreival.fetcher;

import okhttp3.Request;

/** Encapsulates request modifiers; the ETag and Last-Modified tags for a resource */
public record ContentTags(String etag, String lastMod) {
    public static ContentTags empty() {
        return new ContentTags(null, null);
    }

    public boolean isPresent() {
        return etag != null || lastMod != null;
    }

    public boolean isEmpty() {
        return etag == null && lastMod == null;
    }

    /** Paints the tags onto the request builder. */
    public void paint(Request.Builder getBuilder) {
        if (etag != null) getBuilder.addHeader("If-None-Match", etag);
        if (lastMod != null) getBuilder.addHeader("If-Modified-Since", lastMod);
    }
}
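A short usage sketch (not part of the commit; the tag values and URL are invented) showing how ContentTags turns a plain GET into a conditional request that can come back as 304 Not Modified:

    // Hypothetical conditional GET built with OkHttp's Request.Builder.
    var tags = new ContentTags("\"abc123\"", "Mon, 01 May 2023 00:00:00 GMT");

    var getBuilder = new okhttp3.Request.Builder()
            .get()
            .url("https://www.example.com/");

    tags.paint(getBuilder);   // adds If-None-Match and If-Modified-Since when present

    okhttp3.Request request = getBuilder.build();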
HttpFetcher.java
@@ -18,7 +18,7 @@ public interface HttpFetcher {

    FetchResult probeDomain(EdgeUrl url);

    CrawledDocument fetchContent(EdgeUrl url, String etag, String lastMod) throws RateLimitException;
    CrawledDocument fetchContent(EdgeUrl url, ContentTags tags) throws RateLimitException;

    SimpleRobotRules fetchRobotRules(EdgeDomain domain);
HttpFetcherImpl.java
@@ -128,9 +128,15 @@ public class HttpFetcherImpl implements HttpFetcher {

    @Override
    @SneakyThrows
    public CrawledDocument fetchContent(EdgeUrl url, String etag, String lastMod) throws RateLimitException {
    public CrawledDocument fetchContent(EdgeUrl url,
                                        ContentTags contentTags)
            throws RateLimitException
    {

        if (contentTypeLogic.isUrlLikeBinary(url)) {
        // We don't want to waste time and resources on URLs that are not HTML, so if the file ending
        // looks like it might be something else, we perform a HEAD first to check the content type
        if (contentTags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url))
        {
            logger.debug("Probing suspected binary {}", url);

            var headBuilder = new Request.Builder().head()
@@ -146,6 +152,21 @@ public class HttpFetcherImpl implements HttpFetcher {
                if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) {
                    return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, "Early probe failed");
                }

                // Update the URL to the final URL of the HEAD request, otherwise we might end up doing

                // HEAD 301 url1 -> url2
                // HEAD 200 url2
                // GET 301 url1 -> url2
                // GET 200 url2

                // which is not what we want. We want to do as few requests as possible to not raise
                // too many eyebrows when looking at the logs on the target server. It's probably desirable
                // that the traffic looks like it makes sense, as opposed to looking like a broken bot.

                var redirectUrl = new EdgeUrl(rsp.request().url().toString());
                if (Objects.equals(redirectUrl.domain, url.domain))
                    url = redirectUrl;
            }
            catch (SocketTimeoutException ex) {
                return createTimeoutErrorRsp(url, ex);
@@ -157,12 +178,12 @@ public class HttpFetcherImpl implements HttpFetcher {
        }

        var getBuilder = new Request.Builder().get();

        getBuilder.addHeader("User-agent", userAgent)
                .url(url.toString())
                .addHeader("Accept-Encoding", "gzip");

        if (etag != null) getBuilder.addHeader("If-None-Match", etag);
        if (lastMod != null) getBuilder.addHeader("If-Modified-Since", lastMod);
        contentTags.paint(getBuilder);

        var get = getBuilder.build();
        var call = client.newCall(get);
@@ -314,7 +335,7 @@ public class HttpFetcherImpl implements HttpFetcher {
    private Optional<SimpleRobotRules> fetchRobotsForProto(String proto, EdgeDomain domain) {
        try {
            var url = new EdgeUrl(proto, domain, null, "/robots.txt", null);
            return Optional.of(parseRobotsTxt(fetchContent(url, null, null)));
            return Optional.of(parseRobotsTxt(fetchContent(url, ContentTags.empty())));
        }
        catch (Exception ex) {
            return Optional.empty();
HttpFetcherTest.java
@@ -2,6 +2,7 @@ package nu.marginalia.crawling;

import lombok.SneakyThrows;
import nu.marginalia.crawl.retreival.RateLimitException;
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.retreival.logic.ContentTypeLogic;
import nu.marginalia.model.EdgeUrl;
@@ -29,14 +30,14 @@ class HttpFetcherTest {
    @Test
    void fetchUTF8() throws URISyntaxException, RateLimitException {
        var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
        var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), null, null);
        var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), ContentTags.empty());
        System.out.println(str.contentType);
    }

    @Test
    void fetchText() throws URISyntaxException, RateLimitException {
        var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
        var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), null, null);
        var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), ContentTags.empty());
        System.out.println(str);
    }
}
CrawlerMockFetcherTest.java
@@ -4,10 +4,7 @@ import crawlercommons.robots.SimpleRobotRules;
import lombok.SneakyThrows;
import nu.marginalia.bigstring.BigString;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.fetcher.FetchResult;
import nu.marginalia.crawl.retreival.fetcher.FetchResultState;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawl.retreival.fetcher.SitemapRetriever;
import nu.marginalia.crawl.retreival.fetcher.*;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawlerDocumentStatus;
import nu.marginalia.crawling.model.SerializableCrawlData;
@@ -126,7 +123,7 @@ public class CrawlerMockFetcherTest {
        }

        @Override
        public CrawledDocument fetchContent(EdgeUrl url, String etag, String lastModified) {
        public CrawledDocument fetchContent(EdgeUrl url, ContentTags tags) {
            logger.info("Fetching {}", url);
            if (mockData.containsKey(url)) {
                return mockData.get(url);
CrawlerRetreiverTest.java
@@ -5,12 +5,18 @@ import nu.marginalia.WmsaHome;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawling.io.CrawledDomainReader;
import nu.marginalia.crawling.io.CrawledDomainWriter;
import nu.marginalia.crawling.io.CrawlerOutputFile;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.crawling.model.spec.CrawlingSpecification;
import nu.marginalia.crawling.model.SerializableCrawlData;
import org.junit.jupiter.api.*;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
@@ -99,7 +105,7 @@ class CrawlerRetreiverTest {
    }

    @Test
    public void testRecrawl() {
    public void testRecrawl() throws IOException {

        var specs = CrawlingSpecification
                .builder()
@@ -110,6 +116,8 @@ class CrawlerRetreiverTest {
                .build();

        Path out = Files.createTempDirectory("crawling-process");
        var writer = new CrawledDomainWriter(out, "test", "123456");
        Map<Class<? extends SerializableCrawlData>, List<SerializableCrawlData>> data = new HashMap<>();

        new CrawlerRetreiver(httpFetcher, specs, d -> {
@@ -117,7 +125,12 @@ class CrawlerRetreiverTest {
            if (d instanceof CrawledDocument doc) {
                System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus);
            }
            writer.accept(d);
        }).fetch();
        writer.close();

        var reader = new CrawledDomainReader();
        var iter = reader.createIterator(CrawlerOutputFile.getOutputFile(out, "123456", "test"));

        CrawledDomain domain = (CrawledDomain) data.get(CrawledDomain.class).get(0);
        domain.doc = data.get(CrawledDocument.class).stream().map(CrawledDocument.class::cast).collect(Collectors.toList());
@@ -128,6 +141,7 @@ class CrawlerRetreiverTest {
            if (d instanceof CrawledDocument doc) {
                System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus);
            }
        }).fetch();
        }).fetch(iter);

    }
}