diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcSerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcSerializableCrawlDataStream.java
index a766a58d..02aefb6d 100644
--- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcSerializableCrawlDataStream.java
+++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcSerializableCrawlDataStream.java
@@ -28,6 +28,7 @@ public class WarcSerializableCrawlDataStream implements AutoCloseable, Serializa
         path = file;
         reader = new WarcReader(file);
         WarcXResponseReference.register(reader);
+        WarcXEntityRefused.register(reader);
 
         backingIterator = reader.iterator();
     }
diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java
index 60e0178e..5a993fda 100644
--- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java
+++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java
@@ -9,29 +9,34 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
+import java.net.URI;
 import java.nio.file.Path;
 
 public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
     private final ParquetWriter writer;
     private static final Logger logger = LoggerFactory.getLogger(CrawledDocumentParquetRecordFileWriter.class);
 
-    public static void convertWarc(String domain, Path warcInputFile, Path parquetOutputFile) throws IOException {
+    public static void convertWarc(String domain, Path warcInputFile, Path parquetOutputFile) {
         try (var warcReader = new WarcReader(warcInputFile);
              var parquetWriter = new CrawledDocumentParquetRecordFileWriter(parquetOutputFile)
         ) {
             WarcXResponseReference.register(warcReader);
+            WarcXEntityRefused.register(warcReader);
 
             for (var record : warcReader) {
                 if (record instanceof WarcResponse response) {
+                    // this also captures WarcXResponseReference, which inherits from WarcResponse
+                    // and is used to store old responses from previous crawls; in this part of the logic
+                    // we treat them the same as a normal response
+
                     parquetWriter.write(domain, response);
                 }
+                else if (record instanceof WarcXEntityRefused refused) {
+                    parquetWriter.write(domain, refused);
+                }
                 else if (record instanceof Warcinfo warcinfo) {
-                    parquetWriter.write(domain, warcinfo);
+                    parquetWriter.write(warcinfo);
                 }
-                else {
-                    logger.warn("Skipping record of type {}", record.type());
-                }
-
             }
         }
         catch (Exception ex) {
@@ -39,31 +44,40 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
         }
     }
 
-    private void write(String domain, Warcinfo warcinfo) throws IOException {
+    private void write(String domain, WarcXEntityRefused refused) throws IOException {
+        URI profile = refused.profile();
+
+        String meta;
+        if (profile.equals(WarcXEntityRefused.documentRobotsTxtSkippedURN)) {
+            meta = "x-marginalia/advisory;state=robots-txt-skipped";
+        }
+        else if (profile.equals(WarcXEntityRefused.documentBadContentTypeURN)) {
+            meta = "x-marginalia/advisory;state=content-type-failed-probe";
+        }
+        else if (profile.equals(WarcXEntityRefused.documentProbeTimeout)) {
+            meta = "x-marginalia/advisory;state=timeout-probe";
+        }
+        else if (profile.equals(WarcXEntityRefused.documentUnspecifiedError)) {
+            meta = "x-marginalia/advisory;state=doc-error";
+        }
+        else {
+            meta = "x-marginalia/advisory;state=unknown";
+        }
+
+        write(forDocError(domain, refused.target(), meta));
+    }
+
+    private void write(Warcinfo warcinfo) throws IOException {
         String selfDomain = warcinfo.fields().first("domain").orElse("");
         String ip = warcinfo.fields().first("ip").orElse("");
         String probeStatus = warcinfo.fields().first("X-WARC-Probe-Status").orElse("");
 
         if (probeStatus.startsWith("REDIRECT")) {
             String redirectDomain = probeStatus.substring("REDIRECT;".length());
-            write(new CrawledDocumentParquetRecord(selfDomain,
-                    STR."https://\{redirectDomain}/",
-                    ip,
-                    false,
-                    0,
-                    "x-marginalia/advisory;state=redirect",
-                    new byte[0]
-            ));
+            write(forDomainRedirect(selfDomain, redirectDomain));
         }
         else if (!"OK".equals(probeStatus)) {
-            write(new CrawledDocumentParquetRecord(selfDomain,
-                    STR."https://\{domain}/",
-                    ip,
-                    false,
-                    0,
-                    "x-marginalia/advisory;state=error",
-                    probeStatus.getBytes()
-            ));
+            write(forDomainError(selfDomain, ip, probeStatus));
         }
     }
 
@@ -83,6 +97,15 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
             return;
         }
 
+        // We don't want to store robots.txt files, as they are not
+        // interesting for the analysis we want to do. This is important
+        // since txt-files in general are interesting, and we don't want to
+        // exclude them as a class.
+
+        if (fetchOk.uri().getPath().equals("/robots.txt")) {
+            return;
+        }
+
         byte[] bodyBytes;
         String contentType;
 
@@ -112,4 +135,36 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
     public void close() throws IOException {
         writer.close();
     }
+
+    private CrawledDocumentParquetRecord forDomainRedirect(String domain, String redirectDomain) {
+        return new CrawledDocumentParquetRecord(domain,
+                STR."https://\{redirectDomain}/",
+                "",
+                false,
+                0,
+                "x-marginalia/advisory;state=redirect",
+                new byte[0]
+        );
+    }
+    private CrawledDocumentParquetRecord forDomainError(String domain, String ip, String errorStatus) {
+        return new CrawledDocumentParquetRecord(domain,
+                STR."https://\{domain}/",
+                ip,
+                false,
+                0,
+                "x-marginalia/advisory;state=error",
+                errorStatus.getBytes()
+        );
+    }
+
+    private CrawledDocumentParquetRecord forDocError(String domain, String url, String errorStatus) {
+        return new CrawledDocumentParquetRecord(domain,
+                url,
+                "",
+                false,
+                0,
+                "x-marginalia/advisory;state=error",
+                errorStatus.getBytes()
+        );
+    }
 }
diff --git a/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXEntityRefused.java b/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXEntityRefused.java
new file mode 100644
index 00000000..4480115e
--- /dev/null
+++ b/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXEntityRefused.java
@@ -0,0 +1,45 @@
+package org.netpreserve.jwarc;
+
+import java.io.IOException;
+import java.net.URI;
+
+/** This defines a non-standard extension to WARC for recording documents that the crawler
+ * refused to fetch or process, e.g. due to robots.txt, a failed content type probe, or a timeout
+ */
+public class WarcXEntityRefused extends WarcRevisit {
+    private static final String TYPE_NAME = "x-entity-refused";
+
+    public static final URI documentRobotsTxtSkippedURN = URI.create("urn:marginalia/meta/doc/robots-txt-skipped");
+    public static final URI documentBadContentTypeURN = URI.create("urn:marginalia/meta/doc/content-type-failed-probe");
+    public static final URI documentProbeTimeout = URI.create("urn:marginalia/meta/doc/timeout-probe");
+    public static final URI documentUnspecifiedError = URI.create("urn:marginalia/meta/doc/error");
+
+    WarcXEntityRefused(MessageVersion version, MessageHeaders headers, MessageBody body) {
+        super(version, headers, body);
+    }
+
+    public static void register(WarcReader reader) {
+        reader.registerType(TYPE_NAME, WarcXEntityRefused::new);
+    }
+
+    public static class Builder extends AbstractBuilder {
+        public Builder(URI targetURI, URI profile) {
+            this(targetURI.toString(), profile.toString());
+        }
+
+        public Builder(String targetURI, String profileURI) {
+            super(TYPE_NAME);
+            setHeader("WARC-Target-URI", targetURI);
+            setHeader("WARC-Profile", profileURI);
+        }
+
+        public Builder body(HttpResponse httpResponse) throws IOException {
+            return body(MediaType.HTTP_RESPONSE, httpResponse);
+        }
+
+        @Override
+        public WarcXEntityRefused build() {
+            return build(WarcXEntityRefused::new);
+        }
+    }
+}
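(Aside: a minimal, illustrative sketch of how a consumer reads these records back — it mirrors the register() calls and instanceof dispatch used in convertWarc() above; the class and method names in the sketch are hypothetical, not part of the patch.)

// Illustrative sketch only: reading back the custom record types with jwarc's WarcReader.
import org.netpreserve.jwarc.WarcReader;
import org.netpreserve.jwarc.WarcXEntityRefused;
import org.netpreserve.jwarc.WarcXResponseReference;

import java.io.IOException;
import java.nio.file.Path;

class RefusalReportExample { // hypothetical name
    static void printRefusals(Path warcFile) throws IOException {
        try (var reader = new WarcReader(warcFile)) {
            // register() teaches the reader how to materialize the custom
            // x-response-reference and x-entity-refused record types
            WarcXResponseReference.register(reader);
            WarcXEntityRefused.register(reader);

            for (var record : reader) {
                if (record instanceof WarcXEntityRefused refused) {
                    // WARC-Profile carries the refusal reason as one of the URNs
                    // defined on WarcXEntityRefused (robots-txt-skipped, etc.)
                    System.out.println(refused.target() + " -> " + refused.profile());
                }
            }
        }
    }
}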
diff --git a/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXResponseReference.java b/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXResponseReference.java
index 7e02d936..19a5a00f 100644
--- a/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXResponseReference.java
+++ b/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXResponseReference.java
@@ -4,9 +4,7 @@ import java.io.IOException;
 import java.net.URI;
 
 /** This defines a non-standard extension to WARC for storing old HTTP responses,
- * essentially a 'revisit' with a full body, which is not something that is
- * expected by the jwarc parser, and goes against the semantics of the revisit
- * records a fair bit.
+ * essentially a 'response' with different semantics.
  *
  * An x-response-reference record is a response record with a full body, where
 * the data is a reconstructed HTTP response from a previous crawl.
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java
index 9088ebb4..65e1529b 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java
@@ -5,6 +5,8 @@ import com.google.common.hash.Hashing;
 import nu.marginalia.crawling.io.SerializableCrawlDataStream;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.lsh.EasyLSH;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import javax.annotation.Nullable;
 import java.io.IOException;
@@ -15,6 +17,7 @@ import java.nio.file.Path;
 
 public class CrawlDataReference implements AutoCloseable {
     private final SerializableCrawlDataStream data;
+    private static final Logger logger = LoggerFactory.getLogger(CrawlDataReference.class);
 
     public CrawlDataReference(SerializableCrawlDataStream data) {
         this.data = data;
@@ -43,8 +46,9 @@ public class CrawlDataReference implements AutoCloseable {
             }
         }
         catch (IOException ex) {
-            ex.printStackTrace();
+            logger.error("Failed to read next document", ex);
         }
+
         return null;
     }
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java
index ca2494dc..e52b73b6 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java
@@ -20,8 +20,18 @@ public class CrawlDelayTimer {
         this.delayTime = delayTime;
     }
 
+    /** Call when we've gotten an HTTP 429 response. This will wait a moment, and then
+     * set a flag that slows down the main crawl delay as well. */
+    public void waitRetryDelay(RateLimitException ex) throws InterruptedException {
+        slowDown = true;
+
+        int delay = ex.retryAfter();
+
+        Thread.sleep(Math.clamp(delay, 100, 5000));
+    }
+
     @SneakyThrows
-    public void delay(long spentTime) {
+    public void waitFetchDelay(long spentTime) {
         long sleepTime = delayTime;
 
         if (sleepTime >= 1) {
@@ -30,10 +40,6 @@ public class CrawlDelayTimer {
 
             Thread.sleep(min(sleepTime - spentTime, 5000));
         }
-        else if (slowDown) {
-            // Additional delay when the server is signalling it wants slower requests
-            Thread.sleep( DEFAULT_CRAWL_DELAY_MIN_MS);
-        }
         else {
             // When no crawl delay is specified, lean toward twice the fetch+process time,
             // within sane limits. This means slower servers get slower crawling, and faster
@@ -48,10 +54,10 @@ public class CrawlDelayTimer {
 
             Thread.sleep(sleepTime - spentTime);
         }
-    }
 
-    /** Increase the delay between requests if the server is signalling it wants slower requests with HTTP 429 */
-    public void slowDown() {
-        slowDown = true;
+        if (slowDown) {
+            // Additional delay when the server is signalling it wants slower requests
+            Thread.sleep( DEFAULT_CRAWL_DELAY_MIN_MS);
+        }
     }
 }
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
index 668f597a..35f5bcd0 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
@@ -3,7 +3,6 @@ package nu.marginalia.crawl.retreival;
 import com.google.common.hash.HashFunction;
 import com.google.common.hash.Hashing;
 import crawlercommons.robots.SimpleRobotRules;
-import lombok.SneakyThrows;
 import nu.marginalia.atags.model.DomainLinks;
 import nu.marginalia.contenttype.ContentType;
 import nu.marginalia.crawl.retreival.fetcher.ContentTags;
@@ -19,6 +18,7 @@ import nu.marginalia.ip_blocklist.UrlBlocklist;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawlspec.CrawlSpecRecord;
+import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -32,6 +32,7 @@ import java.util.*;
 public class CrawlerRetreiver implements AutoCloseable {
 
     private static final int MAX_ERRORS = 20;
+    private static final int HTTP_429_RETRY_LIMIT = 1; // Retry 429s once
 
     private final HttpFetcher fetcher;
 
@@ -40,7 +41,6 @@ public class CrawlerRetreiver implements AutoCloseable {
     private static final LinkParser linkParser = new LinkParser();
     private static final Logger logger = LoggerFactory.getLogger(CrawlerRetreiver.class);
 
-    private static final HashFunction hashMethod = Hashing.murmur3_128(0);
     private static final UrlBlocklist urlBlocklist = new UrlBlocklist();
     private static final LinkFilterSelector linkFilterSelector = new LinkFilterSelector();
 
@@ -104,7 +104,7 @@ public class CrawlerRetreiver implements AutoCloseable {
             resync.run(warcFile);
     }
 
-    private int crawlDomain(CrawlDataReference oldCrawlData, DomainProber.ProbeResult probeResult, DomainLinks domainLinks) throws IOException {
+    private int crawlDomain(CrawlDataReference oldCrawlData, DomainProber.ProbeResult probeResult, DomainLinks domainLinks) throws IOException, InterruptedException {
         String ip = findIp(domain);
 
         EdgeUrl rootUrl;
 
@@ -124,7 +124,7 @@
         final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(crawlFrontier.peek().domain, warcRecorder);
         final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay());
 
-        sniffRootDocument(delayTimer, rootUrl);
+        sniffRootDocument(rootUrl);
 
         // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
         int recrawled = recrawl(oldCrawlData, robotsRules, delayTimer);
@@ -181,8 +181,14 @@
                 continue;
 
-            if (fetchWriteAndSleep(top, delayTimer, DocumentWithReference.empty()).isOk()) {
-                fetchedCount++;
+            try {
+                if (fetchWriteAndSleep(top, delayTimer, DocumentWithReference.empty()).isOk()) {
+                    fetchedCount++;
+                }
+            }
+            catch (InterruptedException ex) {
+                Thread.currentThread().interrupt();
+                break;
             }
         }
 
@@ -192,17 +198,17 @@
     }
 
     /** Using the old crawl data, fetch the documents comparing etags and last-modified */
-    private int recrawl(CrawlDataReference oldCrawlData, SimpleRobotRules robotsRules, CrawlDelayTimer delayTimer) {
+    private int recrawl(CrawlDataReference oldCrawlData, SimpleRobotRules robotsRules, CrawlDelayTimer delayTimer) throws InterruptedException {
         return crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer);
     }
 
-    private void sniffRootDocument(CrawlDelayTimer delayTimer, EdgeUrl rootUrl) {
+    private void sniffRootDocument(EdgeUrl rootUrl) {
         try {
             logger.debug("Configuring link filter");
 
             var url = rootUrl.withPathAndParam("/", null);
 
-            var result = tryDownload(url, delayTimer, ContentTags.empty());
+            var result = fetcher.fetchContent(url, warcRecorder, ContentTags.empty());
             if (!(result instanceof HttpFetchResult.ResultOk ok))
                 return;
 
@@ -239,22 +245,28 @@
     }
 
     public HttpFetchResult fetchWriteAndSleep(EdgeUrl top,
-                                               CrawlDelayTimer timer,
-                                               DocumentWithReference reference) {
+                                              CrawlDelayTimer timer,
+                                              DocumentWithReference reference) throws InterruptedException
+    {
         logger.debug("Fetching {}", top);
 
+        HttpFetchResult fetchedDoc = new HttpFetchResult.ResultNone();
+
         long startTime = System.currentTimeMillis();
-
         var contentTags = reference.getContentTags();
-        var fetchedDoc = tryDownload(top, timer, contentTags);
-
-        if (fetchedDoc instanceof HttpFetchResult.Result304Raw) {
-            var doc = reference.doc();
-            if (doc != null) {
-                warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBody);
-                fetchedDoc = new HttpFetchResult.Result304ReplacedWithReference(doc.url,
-                        new ContentType(doc.contentType, "UTF-8"),
-                        doc.documentBody);
+        // Fetch the document, retrying if we get a rate limit exception
+        for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
+            try {
+                fetchedDoc = fetcher.fetchContent(top, warcRecorder, contentTags);
+                break;
+            }
+            catch (RateLimitException ex) {
+                timer.waitRetryDelay(ex);
+            }
+            catch (Exception ex) {
+                logger.warn("Failed to fetch {}", top, ex);
+                fetchedDoc = new HttpFetchResult.ResultException(ex);
             }
         }
 
@@ -268,14 +280,19 @@
                 crawlFrontier.addVisited(new EdgeUrl(ok.uri()));
             }
         }
-        else if (fetchedDoc instanceof HttpFetchResult.Result304ReplacedWithReference retained) {
-            var docOpt = retained.parseDocument();
-            if (docOpt.isPresent()) {
-                var doc = docOpt.get();
+        else if (fetchedDoc instanceof HttpFetchResult.Result304Raw && reference.doc() != null) {
+            var doc = reference.doc();
 
-                crawlFrontier.enqueueLinksFromDocument(top, doc);
-                EdgeUrl.parse(retained.url()).ifPresent(crawlFrontier::addVisited);
-            }
+            warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBody);
+
+            fetchedDoc = new HttpFetchResult.Result304ReplacedWithReference(doc.url,
+                    new ContentType(doc.contentType, "UTF-8"),
+                    doc.documentBody);
+
+            var parsed = Jsoup.parse(doc.documentBody);
+
+            crawlFrontier.enqueueLinksFromDocument(top, parsed);
+            crawlFrontier.addVisited(top);
         }
         else if (fetchedDoc instanceof HttpFetchResult.ResultException ex) {
             errorCount ++;
@@ -285,7 +302,7 @@
             logger.error("Error parsing document {}", top, ex);
         }
 
-        timer.delay(System.currentTimeMillis() - startTime);
+        timer.waitFetchDelay(System.currentTimeMillis() - startTime);
 
         return fetchedDoc;
     }
@@ -295,33 +312,6 @@
                 || proto.equalsIgnoreCase("https");
     }
 
-    @SneakyThrows
-    private HttpFetchResult tryDownload(EdgeUrl top, CrawlDelayTimer timer, ContentTags tags) {
-        for (int i = 0; i < 2; i++) {
-            try {
-                return fetcher.fetchContent(top, warcRecorder, tags);
-            }
-            catch (RateLimitException ex) {
-                timer.slowDown();
-
-                int delay = ex.retryAfter();
-                if (delay > 0 && delay < 5000) {
-                    Thread.sleep(delay);
-                }
-            }
-            catch (Exception ex) {
-                logger.warn("Failed to fetch {}", top, ex);
-                return new HttpFetchResult.ResultException(ex);
-            }
-        }
-
-        return new HttpFetchResult.ResultNone();
-    }
-
-    private String createHash(String documentBodyHash) {
-        return hashMethod.hashUnencodedChars(documentBodyHash).toString();
-    }
-
     // FIXME this does not belong in the crawler
     private Optional findCanonicalUrl(EdgeUrl baseUrl, Document parsed) {
         baseUrl = baseUrl.domain.toRootUrl();
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java
index 47b5b2d8..52ebe2f3 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java
@@ -34,6 +34,7 @@ public class CrawlerWarcResynchronizer {
         // First pass, enqueue links
         try (var reader = new WarcReader(tempFile)) {
             WarcXResponseReference.register(reader);
+            WarcXEntityRefused.register(reader);
 
             for (var item : reader) {
                 accept(item);
@@ -58,13 +59,26 @@ public class CrawlerWarcResynchronizer {
                 response(rsp);
             } else if (item instanceof WarcRequest req) {
                 request(req);
+            } else if (item instanceof WarcXEntityRefused refused) {
+                refused(refused);
             }
+
         }
         catch (Exception ex) {
             logger.info(STR."Failed to process warc record \{item}", ex);
         }
     }
 
+    private void refused(WarcXEntityRefused refused) {
+        // In general, we don't want to re-crawl urls that were refused,
+        // but to permit circumstances to change over time, we'll
+        // allow for a small chance of re-probing these entries
+
+        if (Math.random() > 0.1) {
+            crawlFrontier.addVisited(new EdgeUrl(refused.targetURI()));
+        }
+    }
+
     private void request(WarcRequest request) {
         EdgeUrl.parse(request.target()).ifPresent(crawlFrontier::addVisited);
     }
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java
index 6d868fdf..46446fee 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java
@@ -50,9 +50,14 @@ public class DomainCrawlFrontier {
         }
     }
 
+    /** Increase the depth of the crawl by a factor. If the current depth is smaller
+     * than the number of already visited documents, the base depth will be adjusted
+     * to the visited count first.
+     */
     public void increaseDepth(double depthIncreaseFactor) {
-        depth = (int)(depth * depthIncreaseFactor);
+        depth = (int)(Math.max(visited.size(), depth) * depthIncreaseFactor);
     }
+
     public void setLinkFilter(Predicate linkFilter) {
         this.linkFilter = linkFilter;
     }
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java
index 2ceb076d..ad29056f 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java
@@ -20,7 +20,10 @@ public class WarcProtocolReconstructor {
 
     static String getHttpRequestString(Request request, URI uri) {
         StringBuilder requestStringBuilder = new StringBuilder();
-        requestStringBuilder.append(request.method()).append(" ").append(URLEncoder.encode(uri.getPath(), StandardCharsets.UTF_8));
+
+        final String encodedURL = encodeURLKeepSlashes(uri.getPath());
+
+        requestStringBuilder.append(request.method()).append(" ").append(encodedURL);
 
         if (uri.getQuery() != null) {
             requestStringBuilder.append("?").append(uri.getQuery());
@@ -37,6 +40,19 @@ public class WarcProtocolReconstructor {
         return requestStringBuilder.toString();
     }
 
+    /** Java's URLEncoder will URLEncode slashes, which is not desirable
+     * when sanitizing a URL for HTTP protocol purposes
+     */
+
+    private static String encodeURLKeepSlashes(String URL) {
+        String[] parts = StringUtils.split(URL,"/");
+        StringJoiner joiner = new StringJoiner("/");
+        for (String part : parts) {
+            joiner.add(URLEncoder.encode(part, StandardCharsets.UTF_8));
+        }
+        return joiner.toString();
+    }
+
     static String getResponseHeader(String headersAsString, int code) {
         String version = "1.1";
 
@@ -131,6 +147,11 @@ public class WarcProtocolReconstructor {
         if (headerCapitalized.startsWith("X-Marginalia"))
             return;
 
+        // Omit Transfer-Encoding header, as we'll be using Content-Length
+        // instead in the warc file, despite what the server says
+        if (headerCapitalized.startsWith("Transfer-Encoding"))
+            return;
+
         for (var value : values) {
             joiner.add(headerCapitalized + ": " + value);
         }
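(Aside: a small, self-contained illustration of the segment-wise encoding idea above — not the project's code. It assumes commons-lang3, which the patch already uses via StringUtils. Note that URLEncoder applies form-encoding, so a space becomes '+', and that StringUtils.split drops empty segments, so the leading '/' of an absolute path is not reproduced by the split/join itself.)

// Illustrative re-implementation of segment-wise URL encoding that preserves slashes.
import org.apache.commons.lang3.StringUtils;

import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.StringJoiner;

class EncodeKeepSlashesExample { // hypothetical name
    static String encodeKeepSlashes(String path) {
        StringJoiner joiner = new StringJoiner("/");
        for (String part : StringUtils.split(path, "/")) {
            // URLEncoder form-encodes each path segment, leaving the '/' separators alone
            joiner.add(URLEncoder.encode(part, StandardCharsets.UTF_8));
        }
        return joiner.toString();
    }

    public static void main(String[] args) {
        // prints "some+path/%C3%A4" -- the leading '/' is not carried through,
        // so a caller that needs the absolute-path form must prepend it again
        System.out.println(encodeKeepSlashes("/some path/ä"));
    }
}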
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java
index a1335eb8..5ccfacb5 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java
@@ -29,11 +29,6 @@ import java.util.*;
  * be reconstructed.
  */
 public class WarcRecorder implements AutoCloseable {
-    public static final URI documentRobotsTxtSkippedURN = URI.create("urn:marginalia/meta/doc/robots-txt-skipped");
-    public static final URI documentBadContentTypeURN = URI.create("urn:marginalia/meta/doc/content-type-failed-probe");
-    public static final URI documentProbeTimeout = URI.create("urn:marginalia/meta/doc/timeout-probe");
-    public static final URI documentUnspecifiedError = URI.create("urn:marginalia/meta/doc/error");
-
     private static final int MAX_TIME = 30_000;
     private static final int MAX_SIZE = 1024 * 1024 * 10;
     private final WarcWriter writer;
@@ -91,6 +86,8 @@
 
         ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer();
 
+        boolean hasCookies = !client.cookieJar().loadForRequest(request.url()).isEmpty();
+
         try (var response = call.execute()) {
             var body = response.body();
             InputStream inputStream;
@@ -143,6 +140,7 @@
 
             WarcResponse.Builder responseBuilder = new WarcResponse.Builder(responseUri)
                     .blockDigest(responseDigestBuilder.build())
+                    .addHeader("X-Has-Cookies", hasCookies ? "1" : "0")
                     .date(date)
                     .body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());
 
@@ -280,11 +278,11 @@ public class WarcRecorder implements AutoCloseable {
 
     public void flagAsRobotsTxtError(EdgeUrl top) {
         try {
-            WarcRevisit revisit = new WarcRevisit.Builder(top.asURI(), documentRobotsTxtSkippedURN)
+            WarcXEntityRefused refusal = new WarcXEntityRefused.Builder(top.asURI(), WarcXEntityRefused.documentRobotsTxtSkippedURN)
                     .date(Instant.now())
                     .build();
 
-            writer.write(revisit);
+            writer.write(refusal);
         } catch (URISyntaxException | IOException e) {
             throw new RuntimeException(e);
         }
@@ -292,13 +290,13 @@ public class WarcRecorder implements AutoCloseable {
 
     public void flagAsFailedContentTypeProbe(EdgeUrl url, String contentType, int status) {
         try {
-            WarcRevisit revisit = new WarcRevisit.Builder(url.asURI(), documentBadContentTypeURN)
+            WarcXEntityRefused refusal = new WarcXEntityRefused.Builder(url.asURI(), WarcXEntityRefused.documentBadContentTypeURN)
                     .date(Instant.now())
                     .addHeader("Rejected-Content-Type", contentType)
                     .addHeader("Http-Status", Integer.toString(status))
                     .build();
 
-            writer.write(revisit);
+            writer.write(refusal);
         } catch (URISyntaxException | IOException e) {
             throw new RuntimeException(e);
         }
@@ -306,13 +304,13 @@ public class WarcRecorder implements AutoCloseable {
 
     public void flagAsError(EdgeUrl url, Exception ex) {
         try {
-            WarcRevisit revisit = new WarcRevisit.Builder(url.asURI(), documentUnspecifiedError)
+            WarcXEntityRefused refusal = new WarcXEntityRefused.Builder(url.asURI(), WarcXEntityRefused.documentUnspecifiedError)
                     .date(Instant.now())
                     .addHeader("Exception", ex.getClass().getSimpleName())
                     .addHeader("ErrorMessage", Objects.requireNonNullElse(ex.getMessage(), ""))
                     .build();
 
-            writer.write(revisit);
+            writer.write(refusal);
         } catch (URISyntaxException | IOException e) {
             throw new RuntimeException(e);
         }
@@ -320,11 +318,11 @@ public class WarcRecorder implements AutoCloseable {
 
     public void flagAsTimeout(EdgeUrl url) {
         try {
-            WarcRevisit revisit = new WarcRevisit.Builder(url.asURI(), documentProbeTimeout)
+            WarcXEntityRefused refusal = new WarcXEntityRefused.Builder(url.asURI(), WarcXEntityRefused.documentProbeTimeout)
                     .date(Instant.now())
                     .build();
 
-            writer.write(revisit);
+            writer.write(refusal);
         } catch (URISyntaxException | IOException e) {
             throw new RuntimeException(e);
        }
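(Aside: a minimal sketch of the Builder usage that the flagAs* methods above rely on — constructing a refusal record with a profile URN and writing it to a jwarc WarcWriter; class and method names here are illustrative.)

// Illustrative sketch, mirroring flagAsRobotsTxtError() above.
import org.netpreserve.jwarc.WarcWriter;
import org.netpreserve.jwarc.WarcXEntityRefused;

import java.io.IOException;
import java.net.URI;
import java.time.Instant;

class RefusalWriteExample { // hypothetical name
    static void writeRobotsRefusal(WarcWriter writer, URI url) throws IOException {
        // The WARC-Profile URN encodes the refusal reason; the parquet converter
        // earlier in this patch maps it to an "x-marginalia/advisory" content type.
        var refusal = new WarcXEntityRefused.Builder(url, WarcXEntityRefused.documentRobotsTxtSkippedURN)
                .date(Instant.now())
                .build();

        writer.write(refusal);
    }
}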
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java
index 70a98310..91c21d65 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java
@@ -15,13 +15,6 @@ import org.jsoup.Jsoup;
  * E-Tag and Last-Modified headers.
  */
 public class CrawlerRevisitor {
-    /** recrawlState tag for documents that had a HTTP status 304 */
-    public static final String documentWasRetainedTag = "RETAINED/304";
-
-    /** recrawlState tag for documents that had a 200 status but were identical to a previous version */
-    public static final String documentWasSameTag = "SAME-BY-COMPARISON";
-
-
     private final DomainCrawlFrontier crawlFrontier;
     private final CrawlerRetreiver crawlerRetreiver;
     private final WarcRecorder warcRecorder;
@@ -37,7 +30,8 @@ public class CrawlerRevisitor {
     /** Performs a re-crawl of old documents, comparing etags and last-modified */
     public int recrawl(CrawlDataReference oldCrawlData,
                        SimpleRobotRules robotsRules,
-                       CrawlDelayTimer delayTimer) {
+                       CrawlDelayTimer delayTimer)
+            throws InterruptedException {
         int recrawled = 0;
         int retained = 0;