diff --git a/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/converting/ConvertAction.java b/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/converting/ConvertAction.java index 833ad3f0..17102c06 100644 --- a/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/converting/ConvertAction.java +++ b/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/converting/ConvertAction.java @@ -4,5 +4,6 @@ public enum ConvertAction { ConvertCrawlData, SideloadEncyclopedia, SideloadDirtree, + SideloadWarc, SideloadStackexchange } diff --git a/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/converting/ConvertRequest.java b/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/converting/ConvertRequest.java index fffed79b..cf445e5a 100644 --- a/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/converting/ConvertRequest.java +++ b/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/converting/ConvertRequest.java @@ -38,6 +38,13 @@ public class ConvertRequest { destId, null); } + public static ConvertRequest forWarc(Path sourcePath, FileStorageId destId) { + return new ConvertRequest(ConvertAction.SideloadWarc, + sourcePath.toString(), + null, + destId, + null); + } public static ConvertRequest forStackexchange(Path sourcePath, FileStorageId destId) { return new ConvertRequest(ConvertAction.SideloadStackexchange, diff --git a/code/common/model/src/main/java/nu/marginalia/model/EdgeUrl.java b/code/common/model/src/main/java/nu/marginalia/model/EdgeUrl.java index 9def0480..c09ed550 100644 --- a/code/common/model/src/main/java/nu/marginalia/model/EdgeUrl.java +++ b/code/common/model/src/main/java/nu/marginalia/model/EdgeUrl.java @@ -224,12 +224,19 @@ public class EdgeUrl implements Serializable { } public URL asURL() throws MalformedURLException { - int port = this.port != null ? 
this.port : switch(proto) { - case "http" -> 80; - case "https" -> 443; - default -> 0; - }; + try { + return asURI().toURL(); + } + catch (URISyntaxException e) { + throw new MalformedURLException(e.getMessage()); + } + } - return new URL(this.proto, this.domain.toString(), port, this.path); + public URI asURI() throws URISyntaxException { + if (port != null) { + return new URI(this.proto, null, this.domain.toString(), this.port, this.path, this.param, null); + } + + return new URI(this.proto, this.domain.toString(), this.path, this.param, null); } } diff --git a/code/features-crawl/content-type/build.gradle b/code/features-crawl/content-type/build.gradle new file mode 100644 index 00000000..73a155cb --- /dev/null +++ b/code/features-crawl/content-type/build.gradle @@ -0,0 +1,29 @@ +plugins { + id 'java' + + + id 'jvm-test-suite' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(21)) + } +} + +dependencies { + implementation project(':code:common:model') + implementation libs.crawlercommons + implementation libs.notnull + + implementation libs.bundles.gson + implementation libs.bundles.slf4j + testImplementation libs.bundles.slf4j.test + + implementation libs.jsoup + implementation libs.commons.lang3 + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} diff --git a/code/features-crawl/content-type/src/main/java/nu/marginalia/contenttype/ContentType.java b/code/features-crawl/content-type/src/main/java/nu/marginalia/contenttype/ContentType.java new file mode 100644 index 00000000..095497c8 --- /dev/null +++ b/code/features-crawl/content-type/src/main/java/nu/marginalia/contenttype/ContentType.java @@ -0,0 +1,28 @@ +package nu.marginalia.contenttype; + +import org.apache.commons.lang3.StringUtils; + +/** Content type and charset of a document + * @param contentType The content type, e.g. "text/html" + * @param charset The charset, e.g. "UTF-8" + */ +public record ContentType(String contentType, String charset) { + public static ContentType parse(String contentTypeHeader) { + String[] parts = StringUtils.split(contentTypeHeader, ";", 2); + String contentType = parts[0].trim(); + String charset = parts.length > 1 ? 
parts[1].trim() : "UTF-8"; + + return new ContentType(contentType, charset); + } + + public boolean is(String contentType) { + return this.contentType.equalsIgnoreCase(contentType); + } + + public String toString() { + if (charset == null || charset.isBlank()) + return contentType; + + return STR."\{contentType}; charset=\{charset}"; + } +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/logic/ContentTypeParser.java b/code/features-crawl/content-type/src/main/java/nu/marginalia/contenttype/ContentTypeParser.java similarity index 60% rename from code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/logic/ContentTypeParser.java rename to code/features-crawl/content-type/src/main/java/nu/marginalia/contenttype/ContentTypeParser.java index 604264e3..5b794246 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/logic/ContentTypeParser.java +++ b/code/features-crawl/content-type/src/main/java/nu/marginalia/contenttype/ContentTypeParser.java @@ -1,7 +1,8 @@ -package nu.marginalia.crawl.retreival.logic; +package nu.marginalia.contenttype; import crawlercommons.mimetypes.MimeTypeDetector; -import nu.marginalia.crawling.model.ContentType; +import org.jetbrains.annotations.NotNull; +import org.jetbrains.annotations.Nullable; import org.jsoup.Jsoup; import java.util.Arrays; @@ -11,28 +12,40 @@ public class ContentTypeParser { static final MimeTypeDetector mimeTypeDetector = new MimeTypeDetector(); - public static ContentType parse(String contentType, byte[] data) { - return getContentTypeFromContentTypeString(contentType) - .or(() -> getContentTypeStringFromTag(data)) + /** Parse the content type and charset from a content type header and/or the body of a document, + * best effort + */ + public static ContentType parseContentType( + @Nullable String contentTypeHeader, + @NotNull byte[] body) + { + return getContentTypeFromContentTypeString(contentTypeHeader) + .or(() -> getContentTypeStringFromTag(body)) .orElseGet(() -> { - Optional charset = getCharsetFromTag(data); + Optional charset = getCharsetFromTag(body); return new ContentType( - Optional.ofNullable(contentType) - .or(() -> Optional.ofNullable(mimeTypeDetector.detect(data))) - .orElseGet(() -> ContentTypeParser.shittyMimeSniffer(data)), charset.orElse("ISO_8859_1")); + Optional.ofNullable(contentTypeHeader) + .or(() -> Optional.ofNullable(mimeTypeDetector.detect(body))) + .orElseGet(() -> ContentTypeParser.shittyMimeSniffer(body)), charset.orElse("ISO_8859_1")); }); } - private static Optional getContentTypeFromContentTypeString(String contentType) { - if (contentType != null && contentType.contains(";")) { - var parts = contentType.split(";"); - var content = parts[0].trim(); - var extra = parts[1].trim(); - if (extra.startsWith("charset=")) { - return Optional.of(new ContentType(content, extra.substring("charset=".length()))); - } - } - return Optional.empty(); + /** Parse the charset from a content type string. 
+     */
+    private static Optional<ContentType> getContentTypeFromContentTypeString(@Nullable String contentType) {
+        if (contentType == null)
+            return Optional.empty();
+
+        if (!contentType.contains(";"))
+            return Optional.empty();
+
+        var parts = contentType.split(";");
+        var content = parts[0].trim();
+        var extra = parts[1].trim();
+
+        if (!extra.startsWith("charset="))
+            return Optional.empty();
+
+        return Optional.of(new ContentType(content, extra.substring("charset=".length())));
     }
 
     private static String shittyMimeSniffer(byte[] data) {
@@ -45,6 +58,7 @@ public class ContentTypeParser {
         String startStr = new String(Arrays.copyOf(data, Math.min(128, data.length))).trim().toLowerCase();
         if (startStr.contains("<html")) {
diff --git a/code/features-crawl/content-type/src/test/java/nu/marginalia/contenttype/ContentTypeParserTest.java b/code/features-crawl/content-type/src/test/java/nu/marginalia/contenttype/ContentTypeParserTest.java
new file mode 100644
--- /dev/null
+++ b/code/features-crawl/content-type/src/test/java/nu/marginalia/contenttype/ContentTypeParserTest.java
+package nu.marginalia.contenttype;
+
+import org.junit.jupiter.api.Test;
+import static org.junit.jupiter.api.Assertions.*;
+
+import java.nio.charset.StandardCharsets;
+
+public class ContentTypeParserTest {
+
+    @Test
+    public void testParseContentTypeWithHeader() {
+        byte[] body = "<html><head><title>Title</title></head><body></body></html>".getBytes(StandardCharsets.UTF_8);
+        String contentTypeHeader = "text/html; charset=UTF-8";
+        ContentType result = ContentTypeParser.parseContentType(contentTypeHeader, body);
+        assertNotNull(result);
+        assertEquals("text/html", result.contentType());
+        assertEquals("UTF-8", result.charset());
+    }
+
+    @Test
+    public void testParseContentTypeWithMetaCharset() {
+        byte[] body = "<html><head><meta charset=\"UTF-8\"></head><body>Title</body></html>".getBytes(StandardCharsets.UTF_8);
+        ContentType result = ContentTypeParser.parseContentType(null, body);
+        assertNotNull(result);
+        assertEquals("text/html", result.contentType());
+        assertEquals("UTF-8", result.charset());
+    }
+
+    @Test
+    public void testParseContentTypeWithHeaderValueAbsent() {
+        byte[] body = "Some random text.".getBytes(StandardCharsets.UTF_8);
+        String contentTypeHeader = "text/plain";
+        ContentType result = ContentTypeParser.parseContentType(contentTypeHeader, body);
+        assertNotNull(result);
+        assertEquals("text/plain", result.contentType());
+        assertEquals("ISO_8859_1", result.charset());
+    }
+
+    @Test
+    public void testParseContentTypeWithBinaryData() {
+        byte[] body = new byte[128];
+        body[0] = 31; // ascii value less than 32
+        ContentType result = ContentTypeParser.parseContentType(null, body);
+        assertNotNull(result);
+        assertEquals("application/binary", result.contentType());
+        assertEquals("ISO_8859_1", result.charset());
+    }
+}
\ No newline at end of file
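How the new content-type pieces are meant to compose, as an illustrative sketch only (not part of the patch; the header value and HTML literal below are invented, and imports and the enclosing class are omitted):

    byte[] body = "<html><head><meta charset=\"ISO-8859-1\"></head><body>Hello</body></html>"
            .getBytes(StandardCharsets.ISO_8859_1);

    // Detect type and charset: an explicit Content-Type header wins; otherwise the parser
    // falls back to an embedded <meta> tag, and finally to MIME sniffing of the body bytes.
    ContentType contentType = ContentTypeParser.parseContentType("text/html; charset=ISO-8859-1", body);

    // Decode the raw bytes with the detected charset; DocumentBodyToString falls back to UTF-8
    // when the charset name is illegal or unsupported (see DocumentBodyToStringTest below).
    String text = DocumentBodyToString.getStringData(contentType, body);

    assert contentType.is("text/html");
    assert "ISO-8859-1".equalsIgnoreCase(contentType.charset());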
diff --git a/code/features-crawl/content-type/src/test/java/nu/marginalia/contenttype/DocumentBodyToStringTest.java b/code/features-crawl/content-type/src/test/java/nu/marginalia/contenttype/DocumentBodyToStringTest.java
new file mode 100644
index 00000000..f7cf120d
--- /dev/null
+++ b/code/features-crawl/content-type/src/test/java/nu/marginalia/contenttype/DocumentBodyToStringTest.java
@@ -0,0 +1,48 @@
+package nu.marginalia.contenttype;
+
+import org.junit.jupiter.api.Test;
+import static org.junit.jupiter.api.Assertions.*;
+
+import java.nio.charset.StandardCharsets;
+
+public class DocumentBodyToStringTest {
+    @Test
+    public void testGetStringData_onUTF8(){
+
+        ContentType type = new ContentType("text/html", "UTF-8");
+
+        String expected = "Hello, World!";
+        byte[] data = expected.getBytes(StandardCharsets.UTF_8);
+
+        String result = DocumentBodyToString.getStringData(type, data);
+
+        assertEquals(expected, result, "Result should match the expected string");
+    }
+
+    @Test
+    public void testGetStringData_onIllegalCharsetName(){
+
+        ContentType type = new ContentType("text/html", "unsupportedname");
+
+        String expected = "Hello, World!";
+        byte[] data = expected.getBytes(StandardCharsets.UTF_8);
+
+        String result = DocumentBodyToString.getStringData(type, data);
+
+        assertEquals(expected, result, "Result should match the expected string if charset is illegal name");
+    }
+
+    @Test
+    public void testGetStringData_onUnsupportedCharset(){
+
+        ContentType type = new ContentType("text/html", "Macintosh");
+
+        String expected = "Hello, World!";
+        byte[] data = expected.getBytes(StandardCharsets.UTF_8);
+
+        String result = DocumentBodyToString.getStringData(type, data);
+
+        assertEquals(expected, result, "Result should fall back to UTF-8 parsing if charset is unsupported");
+    }
+
+}
\ No newline at end of file
diff --git a/code/libraries/geo-ip/src/main/java/nu/marginalia/geoip/GeoIpDictionary.java b/code/libraries/geo-ip/src/main/java/nu/marginalia/geoip/GeoIpDictionary.java
index 13b982f5..67dd6366 100644
--- a/code/libraries/geo-ip/src/main/java/nu/marginalia/geoip/GeoIpDictionary.java
+++ b/code/libraries/geo-ip/src/main/java/nu/marginalia/geoip/GeoIpDictionary.java
@@ -37,7 +37,9 @@ public class GeoIpDictionary {
                 throw new RuntimeException(e);
             }
             finally {
-                this.notifyAll();
+                synchronized (this) {
+                    this.notifyAll();
+                }
             }
         });
     }
diff --git a/code/process-models/crawling-model/build.gradle b/code/process-models/crawling-model/build.gradle
index ebbea855..ab4e8a8a 100644
--- a/code/process-models/crawling-model/build.gradle
+++ b/code/process-models/crawling-model/build.gradle
@@ -15,18 +15,28 @@ java {
 dependencies {
     implementation project(':code:common:model')
     implementation project(':code:common:db')
+    implementation project(':code:common:config')
     implementation project(':code:common:process')
     implementation project(':code:libraries:big-string')
     implementation project(':code:api:index-api')
     implementation project(':code:common:service-discovery')
     implementation project(':code:common:service-client')
+    implementation project(':code:features-crawl:content-type')
     implementation project(':code:libraries:language-processing')
+    implementation project(':third-party:parquet-floor')
+    implementation project(':third-party:commons-codec')
 
     implementation libs.bundles.slf4j
     implementation libs.notnull
+    implementation libs.bundles.parquet
+    implementation libs.jwarc
 
     implementation libs.gson
+    implementation libs.commons.io
+    implementation libs.commons.lang3
+    implementation libs.okhttp3
+    implementation libs.jsoup
     implementation libs.snakeyaml
     implementation libs.zstd
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/logic/ContentTypeLogic.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/ContentTypeLogic.java
similarity index 88%
rename from code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/logic/ContentTypeLogic.java
rename to code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/ContentTypeLogic.java
index c5860913..d884dbe5 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/logic/ContentTypeLogic.java
+++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/ContentTypeLogic.java
@@ -1,5 +1,6 @@
-package nu.marginalia.crawl.retreival.logic;
+package nu.marginalia.crawling.body;
 
+import nu.marginalia.contenttype.ContentType;
 import nu.marginalia.model.EdgeUrl;
 
 import java.util.List;
@@ -37,6 +38,9 @@ public class ContentTypeLogic {
         return probableBinaryPattern.test(pathLowerCase);
     }
 
+    public boolean isAllowableContentType(ContentType contentType) {
+        return isAllowableContentType(contentType.contentType());
+    }
     public boolean isAllowableContentType(String contentType) {
         if (allowAllContentTypes)
             return true;
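The ContentType overload added above lets callers hand the parsed record straight to the allow-list check instead of re-extracting the bare type string. A rough sketch, illustrative only (bodyBytes is assumed to hold a fetched document body):

    ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
    ContentType contentType = ContentTypeParser.parseContentType("text/html; charset=UTF-8", bodyBytes);

    if (!contentTypeLogic.isAllowableContentType(contentType)) {
        // reject the document before spending time decoding it
    }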
diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyExtractor.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyExtractor.java
new file mode 100644
index 00000000..019aa761
--- /dev/null
+++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyExtractor.java
@@ -0,0 +1,76 @@
+package nu.marginalia.crawling.body;
+
+import nu.marginalia.contenttype.ContentType;
+import nu.marginalia.contenttype.ContentTypeParser;
+import nu.marginalia.contenttype.DocumentBodyToString;
+import nu.marginalia.crawling.model.CrawlerDocumentStatus;
+import org.apache.commons.io.input.BOMInputStream;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.zip.GZIPInputStream;
+
+public class DocumentBodyExtractor {
+    private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
+
+    private static final Logger logger = LoggerFactory.getLogger(DocumentBodyExtractor.class);
+
+    /** Extract the body from a fetch result as a byte array. */
+    public static DocumentBodyResult<byte[]> asBytes(HttpFetchResult result) {
+        if (result instanceof HttpFetchResult.ResultOk fetchOk) {
+            return asBytes(fetchOk);
+        }
+        else if (result instanceof HttpFetchResult.Result304ReplacedWithReference retained) {
+            return new DocumentBodyResult.Ok<>(retained.contentType(), retained.body().getBytes());
+        }
+
+        return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.ERROR, "Fetch Result Not Ok");
+    }
+
+    /** Extract the body from a fetch result as a string.  This function performs
+     *  content-type checks to ensure that the content-type is such that this operation
+     *  makes sense.
+     *
+     * @see ContentTypeLogic#isAllowableContentType(String)
+     * */
+    public static DocumentBodyResult<String> asString(HttpFetchResult result) {
+        return asBytes(result).flatMap(DocumentBodyExtractor::toStringResult);
+    }
+
+    private static DocumentBodyResult<String> toStringResult(ContentType contentType, byte[] bytes) {
+        if (contentTypeLogic.isAllowableContentType(contentType)) {
+            try {
+                return new DocumentBodyResult.Ok<>(contentType, DocumentBodyToString.getStringData(contentType, bytes));
+            }
+            catch (Exception ex) {
+                return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.BAD_CONTENT_TYPE, "");
+            }
+        }
+        else {
+            return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.BAD_CONTENT_TYPE, "");
+        }
+    }
+
+    /** Extract the body from a fetch result as a byte array.
+     */
+    public static DocumentBodyResult<byte[]> asBytes(HttpFetchResult.ResultOk rsp) {
+        try {
+            var byteStream = rsp.getInputStream();
+
+            if ("gzip".equals(rsp.header("Content-Encoding"))) {
+                byteStream = new GZIPInputStream(byteStream);
+            }
+            byteStream = new BOMInputStream(byteStream);
+
+            var contentTypeHeader = rsp.header("Content-Type");
+
+            byte[] data = byteStream.readAllBytes(); // size is limited by WarcRecorder
+            var contentType = ContentTypeParser.parseContentType(contentTypeHeader, data);
+
+            return new DocumentBodyResult.Ok<>(contentType, data);
+        } catch (Exception ex) {
+            logger.error("Failed to extract body", ex);
+            return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.ERROR, "");
+        }
+    }
+
+}
diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyResult.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyResult.java
new file mode 100644
index 00000000..04e3fedb
--- /dev/null
+++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyResult.java
@@ -0,0 +1,58 @@
+package nu.marginalia.crawling.body;
+
+import nu.marginalia.contenttype.ContentType;
+import nu.marginalia.crawling.model.CrawlerDocumentStatus;
+
+import java.util.Optional;
+import java.util.function.BiFunction;
+
+public sealed interface DocumentBodyResult<T> {
+    record Ok<T>(ContentType contentType, T body) implements DocumentBodyResult<T> {
+
+        @Override
+        public <T2> Optional<T2> mapOpt(BiFunction<ContentType, T, T2> mapper) {
+            return Optional.of(mapper.apply(contentType, body));
+        }
+        @Override
+        public <T2> Optional<T2> flatMapOpt(BiFunction<ContentType, T, Optional<T2>> mapper) {
+            return mapper.apply(contentType, body);
+        }
+
+        @Override
+        public <T2> DocumentBodyResult<T2> flatMap(BiFunction<ContentType, T, DocumentBodyResult<T2>> mapper) {
+            return mapper.apply(contentType, body);
+        }
+
+        @Override
+        public void ifPresent(ExConsumer<T, Exception> consumer) throws Exception {
+            consumer.accept(contentType, body);
+        }
+    }
+    record Error<T>(CrawlerDocumentStatus status, String why) implements DocumentBodyResult<T> {
+        @Override
+        public <T2> Optional<T2> mapOpt(BiFunction<ContentType, T, T2> mapper) {
+            return Optional.empty();
+        }
+        public <T2> Optional<T2> flatMapOpt(BiFunction<ContentType, T, Optional<T2>> mapper) { return Optional.empty(); }
+
+        @Override
+        @SuppressWarnings("unchecked")
+        public <T2> DocumentBodyResult<T2> flatMap(BiFunction<ContentType, T, DocumentBodyResult<T2>> mapper) {
+            return (DocumentBodyResult<T2>) this;
+        }
+
+        @Override
+        public void ifPresent(ExConsumer<T, Exception> consumer) throws Exception {
+        }
+    }
+
+    <T2> Optional<T2> mapOpt(BiFunction<ContentType, T, T2> mapper);
+    <T2> Optional<T2> flatMapOpt(BiFunction<ContentType, T, Optional<T2>> mapper);
+    <T2> DocumentBodyResult<T2> flatMap(BiFunction<ContentType, T, DocumentBodyResult<T2>> mapper);
+
+    void ifPresent(ExConsumer<T, Exception> consumer) throws Exception;
+
+    interface ExConsumer<T, E extends Exception> {
+        void accept(ContentType contentType, T t) throws E;
+    }
+}
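A DocumentBodyResult is meant to be consumed either by pattern matching over the sealed hierarchy or through the Optional-returning helpers. A hedged sketch, assuming a jwarc WarcResponse named response, jsoup's Jsoup/Document imported, and the HttpFetchResult type introduced just below:

    DocumentBodyResult<String> result = DocumentBodyExtractor.asString(HttpFetchResult.importWarc(response));

    switch (result) {
        case DocumentBodyResult.Ok<String> ok -> System.out.println(ok.contentType() + ": " + ok.body().length() + " chars");
        case DocumentBodyResult.Error<String> error -> System.err.println(error.status() + ": " + error.why());
    }

    // Equivalent, using the functional style that ResultOk.parseDocument() also uses:
    Optional<Document> doc = result.flatMapOpt((contentType, body) ->
            contentType.is("text/html") ? Optional.of(Jsoup.parse(body)) : Optional.empty());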
diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/HttpFetchResult.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/HttpFetchResult.java
new file mode 100644
index 00000000..f0db28e8
--- /dev/null
+++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/HttpFetchResult.java
@@ -0,0 +1,160 @@
+package nu.marginalia.crawling.body;
+
+import nu.marginalia.contenttype.ContentType;
+import okhttp3.Headers;
+import org.jsoup.Jsoup;
+import org.netpreserve.jwarc.MessageHeaders;
+import org.netpreserve.jwarc.WarcResponse;
+import org.jsoup.nodes.Document;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.InetAddress;
+import java.net.URI;
+import java.util.Optional;
+
+/* FIXME: This interface has a very unfortunate name that is not very descriptive.
+ */
+public sealed interface HttpFetchResult {
+
+    boolean isOk();
+
+    /** Convert a WarcResponse to a HttpFetchResult */
+    static HttpFetchResult importWarc(WarcResponse response) {
+        try {
+            var http = response.http();
+
+            try (var body = http.body()) {
+                byte[] bytes = body.stream().readAllBytes();
+
+                String ipAddress = response
+                        .ipAddress()
+                        .map(InetAddress::getHostAddress)
+                        .orElse("");
+
+                return new ResultOk(
+                        response.targetURI(),
+                        http.status(),
+                        http.headers(),
+                        ipAddress,
+                        bytes,
+                        0,
+                        bytes.length
+                );
+            }
+        }
+        catch (Exception ex) {
+            return new ResultException(ex);
+        }
+    }
+
+
+    /** Corresponds to a successful retrieval of a document
+     * from the remote server.  Note that byte[] is only borrowed
+     * and subsequent calls may overwrite the contents of this buffer.
+     */
+    record ResultOk(URI uri,
+                    int statusCode,
+                    Headers headers,
+                    String ipAddress,
+                    byte[] bytesRaw,
+                    int bytesStart,
+                    int bytesLength
+    ) implements HttpFetchResult {
+
+        public boolean isOk() {
+            return statusCode >= 200 && statusCode < 300;
+        }
+
+        public ResultOk(URI uri,
+                        int statusCode,
+                        MessageHeaders headers,
+                        String ipAddress,
+                        byte[] bytesRaw,
+                        int bytesStart,
+                        int bytesLength) {
+            this(uri, statusCode, convertHeaders(headers), ipAddress, bytesRaw, bytesStart, bytesLength);
+        }
+
+        private static Headers convertHeaders(MessageHeaders headers) {
+            var ret = new Headers.Builder();
+            for (var header : headers.map().entrySet()) {
+                for (var value : header.getValue()) {
+                    ret.add(header.getKey(), value);
+                }
+            }
+            return ret.build();
+        }
+
+        public InputStream getInputStream() {
+            return new ByteArrayInputStream(bytesRaw, bytesStart, bytesLength);
+        }
+
+        public Optional<Document> parseDocument() throws IOException {
+            return DocumentBodyExtractor.asString(this).flatMapOpt((contentType, body) -> {
+                if (contentType.is("text/html")) {
+                    return Optional.of(Jsoup.parse(body));
+                }
+                else {
+                    return Optional.empty();
+                }
+            });
+        }
+
+        public String header(String name) {
+            return headers.get(name);
+        }
+
+    };
+
+    /** This is a special case where the document was not fetched
+     * because it was already in the database.  In this case, we
+     * replace the original data.
+     *
+     * @see Result304Raw for the case where the document has not yet been replaced with the reference data.
+     */
+    record Result304ReplacedWithReference(String url, ContentType contentType, String body) implements HttpFetchResult {
+
+        public boolean isOk() {
+            return true;
+        }
+
+        public Optional<Document> parseDocument() {
+            try {
+                return Optional.of(Jsoup.parse(body));
+            }
+            catch (Exception ex) {
+                return Optional.empty();
+            }
+        }
+    };
+
+    /** Fetching resulted in an exception */
+    record ResultException(Exception ex) implements HttpFetchResult {
+        public boolean isOk() {
+            return false;
+        }
+    };
+
+    /** Fetching resulted in a HTTP 304, the remote content is identical to
+     * our reference copy.  This will be replaced with a Result304ReplacedWithReference
+     * at a later stage.
+     *
+     * @see Result304ReplacedWithReference
+     */
+    record Result304Raw() implements HttpFetchResult {
+        public boolean isOk() {
+            return false;
+        }
+    };
+
+    /** No result.  This is typically injected at a later stage
+     * of processing, e.g. after filtering out irrelevant responses.
+ */ + record ResultNone() implements HttpFetchResult { + public boolean isOk() { + return false; + } + }; +} diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java index b7021ace..eb7ffd75 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java @@ -1,156 +1,52 @@ package nu.marginalia.crawling.io; -import com.github.luben.zstd.RecyclingBufferPool; -import com.github.luben.zstd.ZstdInputStream; import com.google.gson.Gson; -import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.crawling.model.CrawledDomain; -import nu.marginalia.crawling.model.SerializableCrawlData; +import nu.marginalia.crawling.io.format.LegacySerializableCrawlDataStream; +import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream; +import nu.marginalia.crawling.io.format.WarcSerializableCrawlDataStream; import nu.marginalia.model.gson.GsonFactory; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import java.io.*; +import java.nio.file.Files; import java.nio.file.Path; -import java.util.ArrayList; -import java.util.List; -import java.util.Optional; -import java.util.concurrent.ForkJoinPool; -import java.util.concurrent.TimeUnit; public class CrawledDomainReader { - private final Gson gson = GsonFactory.get(); - private final Logger logger = LoggerFactory.getLogger(getClass()); - private final ForkJoinPool pool = new ForkJoinPool(6); + private static final Gson gson = GsonFactory.get(); public CrawledDomainReader() { } /** An iterator-like access to domain data This must be closed otherwise it will leak off-heap memory! */ - public SerializableCrawlDataStream createDataStream(Path fullPath) throws IOException { - return new FileReadingSerializableCrawlDataStream(gson, fullPath.toFile()); + public static SerializableCrawlDataStream createDataStream(Path fullPath) throws IOException { + String fileName = fullPath.getFileName().toString(); + if (fileName.endsWith(".zstd")) { + return new LegacySerializableCrawlDataStream(gson, fullPath.toFile()); + } + else if (fileName.endsWith(".warc") || fileName.endsWith(".warc.gz")) { + return new WarcSerializableCrawlDataStream(fullPath); + } + else if (fileName.endsWith(".parquet")) { + return new ParquetSerializableCrawlDataStream(fullPath); + } + else { + throw new IllegalArgumentException("Unknown file type: " + fullPath); + } } /** An iterator-like access to domain data. This must be closed otherwise it will leak off-heap memory! */ - public SerializableCrawlDataStream createDataStream(Path basePath, String domain, String id) throws IOException { - return createDataStream(CrawlerOutputFile.getOutputFile(basePath, id, domain)); - } + public static SerializableCrawlDataStream createDataStream(Path basePath, String domain, String id) throws IOException { + Path parquetPath = CrawlerOutputFile.getParquetPath(basePath, id, domain); + Path warcPath = CrawlerOutputFile.getWarcPath(basePath, id, domain, CrawlerOutputFile.WarcFileVersion.FINAL); - /** Read the entirety of the domain data into memory. 
This uses a lot of RAM */ - public CrawledDomain read(Path path) throws IOException { - DomainDataAssembler domainData = new DomainDataAssembler(); - - try (var br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(path.toFile()), RecyclingBufferPool.INSTANCE)))) { - String line; - while ((line = br.readLine()) != null) { - if (line.startsWith("//")) { - String identifier = line; - String data = br.readLine(); - - pool.execute(() -> deserializeLine(identifier, data, domainData)); - } - } + if (Files.exists(parquetPath)) { + return createDataStream(parquetPath); } - - while (!pool.awaitQuiescence(1, TimeUnit.SECONDS)); - - return domainData.assemble(); - } - - - private void deserializeLine(String identifier, String data, DomainDataAssembler assembler) { - if (null == data) { - return; + if (Files.exists(warcPath)) { + return createDataStream(warcPath); } - if (identifier.equals(CrawledDomain.SERIAL_IDENTIFIER)) { - assembler.acceptDomain(gson.fromJson(data, CrawledDomain.class)); - } else if (identifier.equals(CrawledDocument.SERIAL_IDENTIFIER)) { - assembler.acceptDoc(gson.fromJson(data, CrawledDocument.class)); + else { + return createDataStream(CrawlerOutputFile.getLegacyOutputFile(basePath, id, domain)); } } - public Optional readOptionally(Path path) { - try { - return Optional.of(read(path)); - } - catch (Exception ex) { - return Optional.empty(); - } - } - - private static class DomainDataAssembler { - private CrawledDomain domainPrototype; - private final List docs = new ArrayList<>(); - - public synchronized void acceptDomain(CrawledDomain domain) { - this.domainPrototype = domain; - } - - public synchronized void acceptDoc(CrawledDocument doc) { - docs.add(doc); - } - - public synchronized CrawledDomain assemble() { - if (!docs.isEmpty()) { - if (domainPrototype.doc == null) - domainPrototype.doc = new ArrayList<>(); - - domainPrototype.doc.addAll(docs); - } - return domainPrototype; - } - } - - private static class FileReadingSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream { - private final Gson gson; - private final BufferedReader bufferedReader; - private SerializableCrawlData next = null; - - public FileReadingSerializableCrawlDataStream(Gson gson, File file) throws IOException { - this.gson = gson; - bufferedReader = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(file), RecyclingBufferPool.INSTANCE))); - } - - @Override - public SerializableCrawlData next() throws IOException { - if (hasNext()) { - var ret = next; - next = null; - return ret; - } - throw new IllegalStateException("No more data"); - } - - @Override - public boolean hasNext() throws IOException { - if (next != null) - return true; - - String identifier = bufferedReader.readLine(); - if (identifier == null) { - bufferedReader.close(); - return false; - } - String data = bufferedReader.readLine(); - if (data == null) { - bufferedReader.close(); - return false; - } - - if (identifier.equals(CrawledDomain.SERIAL_IDENTIFIER)) { - next = gson.fromJson(data, CrawledDomain.class); - } else if (identifier.equals(CrawledDocument.SERIAL_IDENTIFIER)) { - next = gson.fromJson(data, CrawledDocument.class); - } - else { - throw new IllegalStateException("Unknown identifier: " + identifier); - } - return true; - } - - @Override - public void close() throws Exception { - bufferedReader.close(); - } - } } diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainWriter.java 
b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainWriter.java index 0e278f09..f21715ee 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainWriter.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainWriter.java @@ -55,7 +55,7 @@ public class CrawledDomainWriter implements AutoCloseable { } private Path getOutputFile(String id, String name) throws IOException { - return CrawlerOutputFile.createOutputPath(outputDir, id, name); + return CrawlerOutputFile.createLegacyOutputPath(outputDir, id, name); } @Override diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java index a7661085..ad6b4358 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java @@ -9,20 +9,20 @@ import java.nio.file.Path; public class CrawlerOutputFile { /** Return the Path to a file for the given id and name */ - public static Path getOutputFile(Path base, String id, String name) { + public static Path getLegacyOutputFile(Path base, String id, String name) { + id = padId(id); + String first = id.substring(0, 2); String second = id.substring(2, 4); Path destDir = base.resolve(first).resolve(second); - return destDir.resolve(id + "-" + filesystemSafeName(name) + ".zstd"); + return destDir.resolve(STR."\{id}-\{filesystemSafeName(name)}.zstd"); } /** Return the Path to a file for the given id and name, creating the prerequisite * directory structure as necessary. 
*/ - public static Path createOutputPath(Path base, String id, String name) throws IOException { - if (id.length() < 4) { - id = Strings.repeat("0", 4 - id.length()) + id; - } + public static Path createLegacyOutputPath(Path base, String id, String name) throws IOException { + id = padId(id); String first = id.substring(0, 2); String second = id.substring(2, 4); @@ -31,7 +31,7 @@ public class CrawlerOutputFile { if (!Files.exists(destDir)) { Files.createDirectories(destDir); } - return destDir.resolve(id + "-" + filesystemSafeName(name) + ".zstd"); + return destDir.resolve(STR."\{id}-\{filesystemSafeName(name)}.zstd"); } @@ -49,4 +49,71 @@ public class CrawlerOutputFile { } + public static Path createWarcPath(Path basePath, String id, String domain, WarcFileVersion version) throws IOException { + id = padId(id); + + String first = id.substring(0, 2); + String second = id.substring(2, 4); + + Path destDir = basePath.resolve(first).resolve(second); + if (!Files.exists(destDir)) { + Files.createDirectories(destDir); + } + return destDir.resolve(STR."\{id}-\{filesystemSafeName(domain)}-\{version.suffix}.warc.gz"); + } + + public static Path createParquetPath(Path basePath, String id, String domain) throws IOException { + id = padId(id); + + String first = id.substring(0, 2); + String second = id.substring(2, 4); + + Path destDir = basePath.resolve(first).resolve(second); + if (!Files.exists(destDir)) { + Files.createDirectories(destDir); + } + return destDir.resolve(STR."\{id}-\{filesystemSafeName(domain)}.parquet"); + } + public static Path getParquetPath(Path basePath, String id, String domain) { + id = padId(id); + + String first = id.substring(0, 2); + String second = id.substring(2, 4); + + Path destDir = basePath.resolve(first).resolve(second); + return destDir.resolve(STR."\{id}-\{filesystemSafeName(domain)}.parquet"); + } + public static Path getWarcPath(Path basePath, String id, String domain, WarcFileVersion version) { + id = padId(id); + + String first = id.substring(0, 2); + String second = id.substring(2, 4); + + Path destDir = basePath.resolve(first).resolve(second); + return destDir.resolve(STR."\{id}-\{filesystemSafeName(domain)}.warc\{version.suffix}"); + } + + /** + * Pads the given ID with leading zeros to ensure it has a length of 4 characters. + */ + private static String padId(String id) { + if (id.length() < 4) { + id = Strings.repeat("0", 4 - id.length()) + id; + } + + return id; + } + + + public enum WarcFileVersion { + LIVE("open"), + TEMP("tmp"), + FINAL("final"); + + public final String suffix; + + WarcFileVersion(String suffix) { + this.suffix = suffix; + } + } } diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/SerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/SerializableCrawlDataStream.java index 3aecc0fc..9598d002 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/SerializableCrawlDataStream.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/SerializableCrawlDataStream.java @@ -1,11 +1,13 @@ package nu.marginalia.crawling.io; import nu.marginalia.crawling.model.SerializableCrawlData; +import org.jetbrains.annotations.Nullable; import java.io.IOException; +import java.nio.file.Path; import java.util.Iterator; -/** Closable iterator over serialized crawl data +/** Closable iterator exceptional over serialized crawl data * The data may appear in any order, and the iterator must be closed. 
* * @see CrawledDomainReader @@ -17,6 +19,8 @@ public interface SerializableCrawlDataStream extends AutoCloseable { boolean hasNext() throws IOException; + @Nullable + default Path path() { return null; } // Dummy iterator over nothing static SerializableCrawlDataStream empty() { diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/LegacySerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/LegacySerializableCrawlDataStream.java new file mode 100644 index 00000000..bfd52b78 --- /dev/null +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/LegacySerializableCrawlDataStream.java @@ -0,0 +1,73 @@ +package nu.marginalia.crawling.io.format; + +import com.github.luben.zstd.RecyclingBufferPool; +import com.github.luben.zstd.ZstdInputStream; +import com.google.gson.Gson; +import nu.marginalia.crawling.io.SerializableCrawlDataStream; +import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.crawling.model.CrawledDomain; +import nu.marginalia.crawling.model.SerializableCrawlData; + +import java.io.*; +import java.nio.file.Path; + +/** This class is used to read the old format of crawl data, which was zstd-compressed JSON + * with type delimiters between records. + */ +public class LegacySerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream { + private final Gson gson; + private final BufferedReader bufferedReader; + private SerializableCrawlData next = null; + + private final Path path; + public LegacySerializableCrawlDataStream(Gson gson, File file) throws IOException { + this.gson = gson; + bufferedReader = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(file), RecyclingBufferPool.INSTANCE))); + path = file.toPath(); + } + + @Override + public Path path() { + return path; + } + @Override + public SerializableCrawlData next() throws IOException { + if (hasNext()) { + var ret = next; + next = null; + return ret; + } + throw new IllegalStateException("No more data"); + } + + @Override + public boolean hasNext() throws IOException { + if (next != null) + return true; + + String identifier = bufferedReader.readLine(); + if (identifier == null) { + bufferedReader.close(); + return false; + } + String data = bufferedReader.readLine(); + if (data == null) { + bufferedReader.close(); + return false; + } + + if (identifier.equals(CrawledDomain.SERIAL_IDENTIFIER)) { + next = gson.fromJson(data, CrawledDomain.class); + } else if (identifier.equals(CrawledDocument.SERIAL_IDENTIFIER)) { + next = gson.fromJson(data, CrawledDocument.class); + } else { + throw new IllegalStateException("Unknown identifier: " + identifier); + } + return true; + } + + @Override + public void close() throws Exception { + bufferedReader.close(); + } +} diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java new file mode 100644 index 00000000..d3e54a07 --- /dev/null +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java @@ -0,0 +1,135 @@ +package nu.marginalia.crawling.io.format; + +import lombok.SneakyThrows; +import nu.marginalia.contenttype.ContentType; +import nu.marginalia.contenttype.DocumentBodyToString; +import 
nu.marginalia.crawling.io.SerializableCrawlDataStream; +import nu.marginalia.crawling.model.*; +import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecord; +import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileReader; +import nu.marginalia.hash.MurmurHash3_128; +import nu.marginalia.model.EdgeUrl; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.net.URISyntaxException; +import java.nio.file.Path; +import java.util.*; + +public class ParquetSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream { + private static final Logger logger = LoggerFactory.getLogger(ParquetSerializableCrawlDataStream.class); + + private final MurmurHash3_128 hash = new MurmurHash3_128(); + private final Iterator backingIterator; + private final Deque nextQ = new ArrayDeque<>(); + private boolean wroteDomainRecord = false; + private final Path path; + + public ParquetSerializableCrawlDataStream(Path file) throws IOException { + path = file; + + backingIterator = CrawledDocumentParquetRecordFileReader.stream(file).iterator(); + } + + @Override + public Path path() { + return path; + } + + @Override + @SneakyThrows + public boolean hasNext() { + while (backingIterator.hasNext() && nextQ.isEmpty()) { + var nextRecord = backingIterator.next(); + if (!wroteDomainRecord) { + createDomainRecord(nextRecord); + wroteDomainRecord = true; + } + createDocumentRecord(nextRecord); + } + return !nextQ.isEmpty(); + } + + private void createDomainRecord(CrawledDocumentParquetRecord parquetRecord) throws URISyntaxException { + + CrawlerDomainStatus status = CrawlerDomainStatus.OK; + String statusReason = ""; + + String redirectDomain = null; + if (parquetRecord.contentType.equals("x-marginalia/advisory;state=redirect")) { + EdgeUrl crawledUrl = new EdgeUrl(parquetRecord.url); + redirectDomain = crawledUrl.getDomain().toString(); + status = CrawlerDomainStatus.REDIRECT; + } + else if (parquetRecord.contentType.equals("x-marginalia/advisory;state=blocked")) { + status = CrawlerDomainStatus.BLOCKED; + } + else if (parquetRecord.contentType.equals("x-marginalia/advisory;state=error")) { + status = CrawlerDomainStatus.ERROR; + statusReason = new String(parquetRecord.body); + } + + nextQ.add(new CrawledDomain( + parquetRecord.domain, + redirectDomain, + status.toString(), + statusReason, + parquetRecord.ip, + new ArrayList<>(), + new ArrayList<>() + )); + } + + private void createDocumentRecord(CrawledDocumentParquetRecord nextRecord) { + String bodyString = ""; + CrawlerDocumentStatus status = CrawlerDocumentStatus.OK; + + if (nextRecord.contentType.startsWith("x-marginalia/advisory;state=content-type-failed-probe")) { + status = CrawlerDocumentStatus.BAD_CONTENT_TYPE; + } + else if (nextRecord.contentType.startsWith("x-marginalia/advisory;state=robots-txt-skipped")) { + status = CrawlerDocumentStatus.ROBOTS_TXT; + } + else if (nextRecord.contentType.startsWith("x-marginalia/advisory")) { // other advisory stuff we don't want + return; + } + else { + try { + bodyString = DocumentBodyToString.getStringData( + ContentType.parse(nextRecord.contentType), + nextRecord.body); + } catch (Exception ex) { + logger.error("Failed to convert body to string", ex); + status = CrawlerDocumentStatus.BAD_CHARSET; + } + } + + nextQ.add(new CrawledDocument("", + nextRecord.url, + nextRecord.contentType, + nextRecord.timestamp.toString(), + nextRecord.httpStatus, + status.toString(), + "", + "", + bodyString, + 
Long.toHexString(hash.hashNearlyASCII(bodyString)), // this field isn't actually used, maybe we can skip calculating it? + nextRecord.url, + null, + "", + nextRecord.cookies)); + } + + public void close() throws IOException { + } + + @Override + public SerializableCrawlData next() throws IOException { + if (!hasNext()) + throw new NoSuchElementException(); + + return nextQ.poll(); + } + +} diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcSerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcSerializableCrawlDataStream.java new file mode 100644 index 00000000..2cdb7af1 --- /dev/null +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcSerializableCrawlDataStream.java @@ -0,0 +1,151 @@ +package nu.marginalia.crawling.io.format; + +import lombok.SneakyThrows; +import nu.marginalia.crawling.body.DocumentBodyExtractor; +import nu.marginalia.crawling.body.DocumentBodyResult; +import nu.marginalia.crawling.body.HttpFetchResult; +import nu.marginalia.crawling.io.SerializableCrawlDataStream; +import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.crawling.model.CrawledDomain; +import nu.marginalia.crawling.model.SerializableCrawlData; +import org.netpreserve.jwarc.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.*; + +public class WarcSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream { + private static final Logger logger = LoggerFactory.getLogger(WarcSerializableCrawlDataStream.class); + + private final WarcReader reader; + private final Iterator backingIterator; + private SerializableCrawlData next = null; + private final Path path; + + public WarcSerializableCrawlDataStream(Path file) throws IOException { + path = file; + reader = new WarcReader(file); + WarcXResponseReference.register(reader); + WarcXEntityRefused.register(reader); + + backingIterator = reader.iterator(); + } + + @Override + public Path path() { + return path; + } + + @Override + @SneakyThrows + public boolean hasNext() { + while (backingIterator.hasNext() && next == null) { + var nextRecord = backingIterator.next(); + if (nextRecord instanceof WarcResponse response) { // this also includes WarcXResponseReference + convertResponse(response); + } + else if (nextRecord instanceof Warcinfo warcinfo) { + convertWarcinfo(warcinfo); + } + } + return next != null; + } + + private void convertWarcinfo(Warcinfo warcinfo) throws IOException { + var headers = warcinfo.fields(); + String probeStatus = headers.first("X-WARC-Probe-Status").orElse(""); + String[] parts = probeStatus.split(" ", 2); + + + String domain = headers.first("domain").orElseThrow(() -> new IllegalStateException("Missing domain header")); + String status = parts[0]; + String statusReason = parts.length > 1 ? 
parts[1] : ""; + String ip = headers.first("ip").orElse(""); + + String redirectDomain = null; + if ("REDIRECT".equalsIgnoreCase(status)) { + redirectDomain = statusReason; + } + + next = new CrawledDomain(domain, redirectDomain, status, statusReason, ip, + new ArrayList<>(), + new ArrayList<>() + ); + } + + private void convertResponse(WarcResponse response) throws IOException { + var http = response.http(); + + if (http.status() != 200) { + return; + } + + var parsedBody = DocumentBodyExtractor.asString(HttpFetchResult.importWarc(response)); + if (parsedBody instanceof DocumentBodyResult.Error error) { + next = new CrawledDocument( + "", + response.targetURI().toString(), + http.contentType().raw(), + response.date().toString(), + http.status(), + error.status().toString(), + error.why(), + headers(http.headers()), + null, + response.payloadDigest().map(WarcDigest::base64).orElse(""), + "", + "", + "", + WarcXCookieInformationHeader.hasCookies(response) + ); + } else if (parsedBody instanceof DocumentBodyResult.Ok ok) { + next = new CrawledDocument( + "", + response.targetURI().toString(), + ok.contentType().toString(), + response.date().toString(), + http.status(), + "OK", + "", + headers(http.headers()), + ok.body(), + response.payloadDigest().map(WarcDigest::base64).orElse(""), + "", + "", + "", + WarcXCookieInformationHeader.hasCookies(response)); + } else { + // unreachable + throw new IllegalStateException("Unknown body type: " + parsedBody); + } + } + + public String headers(MessageHeaders headers) { + StringJoiner ret = new StringJoiner("\n"); + for (var header : headers.map().entrySet()) { + for (var value : header.getValue()) { + ret.add(STR."\{header.getKey()}: \{value}"); + } + } + return ret.toString(); + } + + public void close() throws IOException { + reader.close(); + } + + @Override + public SerializableCrawlData next() throws IOException { + if (!hasNext()) + throw new NoSuchElementException(); + try { + return next; + } + finally { + next = null; + } + } + +} diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/ContentType.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/ContentType.java deleted file mode 100644 index e8a9fca1..00000000 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/ContentType.java +++ /dev/null @@ -1,5 +0,0 @@ -package nu.marginalia.crawling.model; - - -public record ContentType(String contentType, String charset) { -} diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java index 143c775b..6b9ba1be 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java @@ -23,13 +23,21 @@ public class CrawledDocument implements SerializableCrawlData { public String headers; public String documentBody; + + @Deprecated public String documentBodyHash; + @Deprecated public String canonicalUrl; public String redirectUrl; + @Deprecated public String recrawlState; + /** This is not guaranteed to be set in all versions of the format, + * information may come in CrawledDomain instead */ + public Boolean hasCookies = false; + public static final String SERIAL_IDENTIFIER = "// DOCUMENT"; @Override public String getSerialIdentifier() { diff --git 
a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDomain.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDomain.java index 482311c1..3add3b8d 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDomain.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDomain.java @@ -17,6 +17,9 @@ public class CrawledDomain implements SerializableCrawlData { public String ip; public List doc; + + /** This is not guaranteed to be set in all versions of the format, + * information may come in CrawledDocument instead */ public List cookies; public int size() { @@ -24,6 +27,10 @@ public class CrawledDomain implements SerializableCrawlData { return doc.size(); } + public boolean hasCookies() { + return cookies != null && !cookies.isEmpty(); + } + public static final String SERIAL_IDENTIFIER = "// DOMAIN"; @Override public String getSerialIdentifier() { diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecord.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecord.java new file mode 100644 index 00000000..c96aeb25 --- /dev/null +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecord.java @@ -0,0 +1,97 @@ +package nu.marginalia.crawling.parquet; + +import blue.strategic.parquet.Dehydrator; +import blue.strategic.parquet.Hydrator; +import blue.strategic.parquet.ValueWriter; +import lombok.AllArgsConstructor; +import lombok.EqualsAndHashCode; +import lombok.NoArgsConstructor; +import lombok.ToString; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Types; + +import java.time.Instant; + +import static org.apache.parquet.schema.LogicalTypeAnnotation.*; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.*; + +@AllArgsConstructor +@NoArgsConstructor +@EqualsAndHashCode +@ToString +public class CrawledDocumentParquetRecord { + public String domain; + public String url; + public String ip; + public boolean cookies; + public int httpStatus; + public Instant timestamp; + public String contentType; + public byte[] body; + + public static Hydrator newHydrator() { + return new CrawledDocumentParquetRecordHydrator(); + } + + public static Dehydrator newDehydrator() { + return CrawledDocumentParquetRecord::dehydrate; + } + + public static MessageType schema = new MessageType( + CrawledDocumentParquetRecord.class.getSimpleName(), + Types.required(BINARY).as(stringType()).named("domain"), + Types.required(BINARY).as(stringType()).named("url"), + Types.required(BINARY).as(stringType()).named("ip"), + Types.required(BOOLEAN).named("cookies"), + Types.required(INT32).named("httpStatus"), + Types.required(INT64).named("epochSeconds"), + Types.required(BINARY).as(stringType()).named("contentType"), + Types.required(BINARY).named("body") + ); + + + public CrawledDocumentParquetRecord add(String heading, Object value) { + switch (heading) { + case "domain" -> domain = (String) value; + case "url" -> url = (String) value; + case "ip" -> ip = (String) value; + case "httpStatus" -> httpStatus = (Integer) value; + case "cookies" -> cookies = (Boolean) value; + case "contentType" -> contentType = (String) value; + case "body" -> body = (byte[]) value; + case "epochSeconds" -> timestamp = Instant.ofEpochSecond((Long) value); + default -> 
throw new UnsupportedOperationException("Unknown heading '" + heading + '"'); + } + return this; + } + + public void dehydrate(ValueWriter valueWriter) { + valueWriter.write("domain", domain); + valueWriter.write("url", url); + valueWriter.write("ip", ip); + valueWriter.write("epochSeconds", timestamp.getEpochSecond()); + valueWriter.write("httpStatus", httpStatus); + valueWriter.write("cookies", cookies); + valueWriter.write("contentType", contentType); + valueWriter.write("body", body); + } +} + +class CrawledDocumentParquetRecordHydrator implements Hydrator { + + @Override + public CrawledDocumentParquetRecord start() { + return new CrawledDocumentParquetRecord(); + } + + @Override + public CrawledDocumentParquetRecord add(CrawledDocumentParquetRecord target, String heading, Object value) { + return target.add(heading, value); + } + + @Override + public CrawledDocumentParquetRecord finish(CrawledDocumentParquetRecord target) { + return target; + } + +} diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileReader.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileReader.java new file mode 100644 index 00000000..7e8c7501 --- /dev/null +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileReader.java @@ -0,0 +1,19 @@ +package nu.marginalia.crawling.parquet; + +import blue.strategic.parquet.HydratorSupplier; +import blue.strategic.parquet.ParquetReader; +import org.jetbrains.annotations.NotNull; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.stream.Stream; + +public class CrawledDocumentParquetRecordFileReader { + + @NotNull + public static Stream stream(Path path) throws IOException { + return ParquetReader.streamContent(path.toFile(), + HydratorSupplier.constantly(CrawledDocumentParquetRecord.newHydrator())); + } + +} diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java new file mode 100644 index 00000000..9245156f --- /dev/null +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java @@ -0,0 +1,247 @@ +package nu.marginalia.crawling.parquet; + +import blue.strategic.parquet.ParquetWriter; +import nu.marginalia.UserAgent; +import nu.marginalia.crawling.body.DocumentBodyExtractor; +import nu.marginalia.crawling.body.DocumentBodyResult; +import nu.marginalia.crawling.body.HttpFetchResult; +import org.apache.commons.lang3.StringUtils; +import org.netpreserve.jwarc.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.net.URI; +import java.nio.file.Path; +import java.time.Instant; +import java.util.List; +import java.util.Objects; + +public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { + private final ParquetWriter writer; + private static final Logger logger = LoggerFactory.getLogger(CrawledDocumentParquetRecordFileWriter.class); + + public static void convertWarc(String domain, + UserAgent userAgent, + Path warcInputFile, + Path parquetOutputFile) { + try (var warcReader = new WarcReader(warcInputFile); + var parquetWriter = new CrawledDocumentParquetRecordFileWriter(parquetOutputFile) + ) { + 
+            WarcXResponseReference.register(warcReader);
+            WarcXEntityRefused.register(warcReader);
+
+            String uaString = userAgent.uaString();
+
+            for (var record : warcReader) {
+                if (record instanceof WarcResponse response) {
+                    // this also captures WarcXResponseReference, which inherits from WarcResponse
+                    // and is used to store old responses from previous crawls; in this part of the logic
+                    // we treat them the same as a normal response
+
+                    if (!filterResponse(uaString, response)) {
+                        continue;
+                    }
+
+                    parquetWriter.write(domain, response);
+                }
+                else if (record instanceof WarcXEntityRefused refused) {
+                    parquetWriter.write(domain, refused);
+                }
+                else if (record instanceof Warcinfo warcinfo) {
+                    parquetWriter.write(warcinfo);
+                }
+            }
+        }
+        catch (Exception ex) {
+            logger.error("Failed to convert WARC file to Parquet", ex);
+        }
+    }
+
+    /** Return true if the WarcResponse should be kept and converted to a parquet record,
+     *  false if it should be skipped. */
+    private static boolean filterResponse(String uaString, WarcResponse response) throws IOException {
+
+        // We don't want to store robots.txt files, as they are not
+        // interesting for the analysis we want to do.  This is important
+        // since txt-files in general are interesting, and we don't want to
+        // exclude them as a class.
+
+        if (response.targetURI().getPath().equals("/robots.txt")) {
+            return false;
+        }
+
+        var robotsTags = response.http().headers().all("X-Robots-Tag");
+        if (!isXRobotsTagsPermitted(robotsTags, uaString)) {
+            return false;
+        }
+
+        return true;
+    }
+
+    private void write(String domain, WarcXEntityRefused refused) throws IOException {
+        URI profile = refused.profile();
+
+        String meta;
+        if (profile.equals(WarcXEntityRefused.documentRobotsTxtSkippedURN)) {
+            meta = "x-marginalia/advisory;state=robots-txt-skipped";
+        }
+        else if (profile.equals(WarcXEntityRefused.documentBadContentTypeURN)) {
+            meta = "x-marginalia/advisory;state=content-type-failed-probe";
+        }
+        else if (profile.equals(WarcXEntityRefused.documentProbeTimeout)) {
+            meta = "x-marginalia/advisory;state=timeout-probe";
+        }
+        else if (profile.equals(WarcXEntityRefused.documentUnspecifiedError)) {
+            meta = "x-marginalia/advisory;state=doc-error";
+        }
+        else {
+            meta = "x-marginalia/advisory;state=unknown";
+        }
+
+        write(forDocError(domain, refused.date(), refused.target(), meta));
+    }
+
+    private void write(Warcinfo warcinfo) throws IOException {
+        String selfDomain = warcinfo.fields().first("domain").orElse("");
+        String ip = warcinfo.fields().first("ip").orElse("");
+        String probeStatus = warcinfo.fields().first("X-WARC-Probe-Status").orElse("");
+
+        if (probeStatus.startsWith("REDIRECT")) {
+            String redirectDomain = probeStatus.substring("REDIRECT;".length());
+            write(forDomainRedirect(selfDomain, warcinfo.date(), redirectDomain));
+        }
+        else if (!"OK".equals(probeStatus)) {
+            write(forDomainError(selfDomain, warcinfo.date(), ip, probeStatus));
+        }
+    }
+
+    public CrawledDocumentParquetRecordFileWriter(Path file) throws IOException {
+        writer = ParquetWriter.writeFile(CrawledDocumentParquetRecord.schema,
+                file.toFile(), CrawledDocumentParquetRecord.newDehydrator());
+    }
+
+    public void write(CrawledDocumentParquetRecord domainData) throws IOException {
+        writer.write(domainData);
+    }
+
+    public void write(String domain, WarcResponse response) throws IOException {
+
+        HttpFetchResult result = HttpFetchResult.importWarc(response);
+        if (!(result instanceof HttpFetchResult.ResultOk fetchOk)) {
+            return;
+        }
+
+        byte[] bodyBytes;
+        String contentType;
+
+        var body = DocumentBodyExtractor.asBytes(result);
+
if (body instanceof DocumentBodyResult.Ok bodyOk) { + bodyBytes = bodyOk.body(); + contentType = bodyOk.contentType().toString(); + } + else { + bodyBytes = new byte[0]; + contentType = ""; + } + + write(new CrawledDocumentParquetRecord( + domain, + response.target(), + fetchOk.ipAddress(), + WarcXCookieInformationHeader.hasCookies(response), + fetchOk.statusCode(), + response.date(), + contentType, + bodyBytes) + ); + } + + + public void close() throws IOException { + writer.close(); + } + + private CrawledDocumentParquetRecord forDomainRedirect(String domain, Instant date, String redirectDomain) { + return new CrawledDocumentParquetRecord(domain, + STR."https://\{redirectDomain}/", + "", + false, + 0, + date, + "x-marginalia/advisory;state=redirect", + new byte[0] + ); + } + private CrawledDocumentParquetRecord forDomainError(String domain, Instant date, String ip, String errorStatus) { + return new CrawledDocumentParquetRecord(domain, + STR."https://\{domain}/", + ip, + false, + 0, + date, + "x-marginalia/advisory;state=error", + errorStatus.getBytes() + ); + } + + private CrawledDocumentParquetRecord forDocError(String domain, Instant date, String url, String errorStatus) { + return new CrawledDocumentParquetRecord(domain, + url, + "", + false, + 0, + date, + errorStatus, + new byte[0] + ); + } + + + /** Check X-Robots-Tag header tag to see if we are allowed to index this page. + *
<p>
+ * Reference: https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag + * + * @param xRobotsHeaderTags List of X-Robots-Tag values + * @param userAgent User agent string + * @return true if we are allowed to index this page + */ + // Visible for tests + public static boolean isXRobotsTagsPermitted(List xRobotsHeaderTags, String userAgent) { + boolean isPermittedGeneral = true; + boolean isPermittedMarginalia = false; + boolean isForbiddenMarginalia = false; + + for (String header : xRobotsHeaderTags) { + if (header.indexOf(':') >= 0) { + String[] parts = StringUtils.split(header, ":", 2); + + if (parts.length < 2) + continue; + + // Is this relevant to us? + if (!Objects.equals(parts[0].trim(), userAgent)) + continue; + + if (parts[1].contains("noindex")) + isForbiddenMarginalia = true; + else if (parts[1].contains("none")) + isForbiddenMarginalia = true; + else if (parts[1].contains("all")) + isPermittedMarginalia = true; + } + else { + if (header.contains("noindex")) + isPermittedGeneral = false; + if (header.contains("none")) + isPermittedGeneral = false; + } + } + + if (isPermittedMarginalia) + return true; + if (isForbiddenMarginalia) + return false; + return isPermittedGeneral; + } +} diff --git a/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXCookieInformationHeader.java b/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXCookieInformationHeader.java new file mode 100644 index 00000000..7d983580 --- /dev/null +++ b/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXCookieInformationHeader.java @@ -0,0 +1,35 @@ +package org.netpreserve.jwarc; + +import okhttp3.HttpUrl; +import okhttp3.OkHttpClient; + +/** Encapsulates out-of-band information about whether a website uses cookies, + * using a non-standard WARC header "X-Has-Cookies". + */ +public class WarcXCookieInformationHeader { + private boolean hasCookies = false; + private static final String headerName = "X-Has-Cookies"; + + public void update(OkHttpClient client, HttpUrl url) { + if (!hasCookies) { + hasCookies = !client.cookieJar().loadForRequest(url).isEmpty(); + } + } + + public boolean hasCookies() { + return hasCookies; + } + + public void paint(WarcResponse.Builder builder) { + builder.addHeader(headerName, hasCookies ? "1" : "0"); + } + public void paint(WarcXResponseReference.Builder builder) { + builder.addHeader(headerName, hasCookies ? 
"1" : "0"); + } + + public static boolean hasCookies(WarcRecord record) { + return record.headers().contains(headerName, "1"); + } + + +} diff --git a/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXEntityRefused.java b/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXEntityRefused.java new file mode 100644 index 00000000..4480115e --- /dev/null +++ b/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXEntityRefused.java @@ -0,0 +1,45 @@ +package org.netpreserve.jwarc; + +import java.io.IOException; +import java.net.URI; + +/** This defines a non-standard extension to WARC for storing old HTTP responses, + * essentially a 'response' with different semantics + */ +public class WarcXEntityRefused extends WarcRevisit { + private static final String TYPE_NAME = "x-entity-refused"; + + public static final URI documentRobotsTxtSkippedURN = URI.create("urn:marginalia/meta/doc/robots-txt-skipped"); + public static final URI documentBadContentTypeURN = URI.create("urn:marginalia/meta/doc/content-type-failed-probe"); + public static final URI documentProbeTimeout = URI.create("urn:marginalia/meta/doc/timeout-probe"); + public static final URI documentUnspecifiedError = URI.create("urn:marginalia/meta/doc/error"); + + WarcXEntityRefused(MessageVersion version, MessageHeaders headers, MessageBody body) { + super(version, headers, body); + } + + public static void register(WarcReader reader) { + reader.registerType(TYPE_NAME, WarcXEntityRefused::new); + } + + public static class Builder extends AbstractBuilder { + public Builder(URI targetURI, URI profile) { + this(targetURI.toString(), profile.toString()); + } + + public Builder(String targetURI, String profileURI) { + super(TYPE_NAME); + setHeader("WARC-Target-URI", targetURI); + setHeader("WARC-Profile", profileURI); + } + + public Builder body(HttpResponse httpResponse) throws IOException { + return body(MediaType.HTTP_RESPONSE, httpResponse); + } + + @Override + public WarcXEntityRefused build() { + return build(WarcXEntityRefused::new); + } + } +} diff --git a/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXResponseReference.java b/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXResponseReference.java new file mode 100644 index 00000000..19a5a00f --- /dev/null +++ b/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXResponseReference.java @@ -0,0 +1,42 @@ +package org.netpreserve.jwarc; + +import java.io.IOException; +import java.net.URI; + +/** This defines a non-standard extension to WARC for storing old HTTP responses, + * essentially a 'response' with different semantics.. + *
<p>
+ * An x-response-reference record is a response record with a full body, where + * the data is a reconstructed HTTP response from a previous crawl. + */ +public class WarcXResponseReference extends WarcResponse { + private static final String TYPE_NAME = "x-response-reference"; + + WarcXResponseReference(MessageVersion version, MessageHeaders headers, MessageBody body) { + super(version, headers, body); + } + + public static void register(WarcReader reader) { + reader.registerType(TYPE_NAME, WarcXResponseReference::new); + } + + public static class Builder extends AbstractBuilder { + public Builder(URI targetURI) { + this(targetURI.toString()); + } + + public Builder(String targetURI) { + super(TYPE_NAME); + setHeader("WARC-Target-URI", targetURI); + } + + public Builder body(HttpResponse httpResponse) throws IOException { + return body(MediaType.HTTP_RESPONSE, httpResponse); + } + + @Override + public WarcXResponseReference build() { + return build(WarcXResponseReference::new); + } + } +} diff --git a/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java b/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java index 718dea06..cbb88772 100644 --- a/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java +++ b/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java @@ -74,23 +74,13 @@ public class CrawlPlan { return count; } + @Deprecated public Iterable domainsIterable() { - final CrawledDomainReader reader = new CrawledDomainReader(); - - return WorkLog.iterableMap(crawl.getLogFile(), - entry -> { - var path = getCrawledFilePath(entry.path()); - if (!Files.exists(path)) { - logger.warn("File not found: {}", path); - return Optional.empty(); - } - return reader.readOptionally(path); - }); + // This is no longer supported + throw new UnsupportedOperationException(); } public Iterable crawlDataIterable(Predicate idPredicate) { - final CrawledDomainReader reader = new CrawledDomainReader(); - return WorkLog.iterableMap(crawl.getLogFile(), entry -> { if (!idPredicate.test(entry.id())) { @@ -105,7 +95,7 @@ public class CrawlPlan { } try { - return Optional.of(reader.createDataStream(path)); + return Optional.of(CrawledDomainReader.createDataStream(path)); } catch (IOException ex) { return Optional.empty(); diff --git a/code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java b/code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java new file mode 100644 index 00000000..c79154a4 --- /dev/null +++ b/code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java @@ -0,0 +1,78 @@ +package nu.marginalia.crawling.parquet; + +import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream; +import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.crawling.model.CrawledDomain; +import nu.marginalia.crawling.model.SerializableCrawlData; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.time.Instant; +import java.util.ArrayList; + +import static org.junit.jupiter.api.Assertions.*; + +class CrawledDocumentParquetRecordFileWriterTest { + Path tempFile; + + @BeforeEach + public void setUp() throws IOException { + tempFile = Files.createTempFile("test", 
".parquet"); + } + + @AfterEach + public void tearDown() throws IOException { + Files.delete(tempFile); + } + + @Test + void testWriteRead() throws IOException { + var original = new CrawledDocumentParquetRecord("www.marginalia.nu", + "https://www.marginalia.nu/", + "127.0.0.1", + false, + 200, + Instant.now(), + "text/html", + "hello world".getBytes()); + + try (var writer = new CrawledDocumentParquetRecordFileWriter(tempFile)) { + writer.write(original); + } + + var items = new ArrayList(); + + try (var stream = new ParquetSerializableCrawlDataStream(tempFile)) { + while (stream.hasNext()) { + items.add(stream.next()); + } + } + + assertEquals(2, items.size()); + + var firstItem = items.get(0); + assertInstanceOf(CrawledDomain.class, firstItem); + var domain = (CrawledDomain) firstItem; + assertEquals("www.marginalia.nu", domain.domain); + assertNull(domain.redirectDomain); + assertEquals("OK", domain.crawlerStatus); + assertEquals("", domain.crawlerStatusDesc); + assertEquals(new ArrayList<>(), domain.doc); + assertEquals(new ArrayList<>(), domain.cookies); + + var secondItem = items.get(1); + assertInstanceOf(CrawledDocument.class, secondItem); + + var document = (CrawledDocument) secondItem; + assertEquals("https://www.marginalia.nu/", document.url); + assertEquals("text/html", document.contentType); + assertEquals("hello world", document.documentBody); + assertEquals(200, document.httpStatus); + } + + +} \ No newline at end of file diff --git a/code/processes/converting-process/build.gradle b/code/processes/converting-process/build.gradle index 979260df..556f8015 100644 --- a/code/processes/converting-process/build.gradle +++ b/code/processes/converting-process/build.gradle @@ -59,6 +59,7 @@ dependencies { implementation project(':code:features-crawl:crawl-blocklist') implementation project(':code:features-crawl:link-parser') + implementation project(':code:features-crawl:content-type') testImplementation project(':code:libraries:term-frequency-dict') testImplementation project(':code:process-models:crawl-spec') @@ -66,6 +67,7 @@ dependencies { implementation libs.bundles.slf4j implementation libs.notnull + implementation libs.jwarc implementation libs.jsoup diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java index ebfb1bc2..3bada914 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java @@ -268,6 +268,14 @@ public class ConverterMain { processData.asPath(), msg, inbox); } + case SideloadWarc -> { + var processData = fileStorageService.getStorage(request.processedDataStorage); + + yield new SideloadAction( + sideloadSourceFactory.sideloadWarc(Path.of(request.inputSource)), + processData.asPath(), + msg, inbox); + } case SideloadStackexchange -> { var processData = fileStorageService.getStorage(request.processedDataStorage); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java index 8e8841a0..4b5d9173 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java +++ 
b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java @@ -105,13 +105,6 @@ public class DocumentProcessor { private EdgeUrl getDocumentUrl(CrawledDocument crawledDocument) throws URISyntaxException { - if (crawledDocument.canonicalUrl != null) { - try { - return new EdgeUrl(crawledDocument.canonicalUrl); - } - catch (URISyntaxException ex) { /* fallthrough */ } - } - return new EdgeUrl(crawledDocument.url); } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index fc824906..e9794aad 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -18,6 +18,7 @@ import nu.marginalia.model.EdgeUrl; import nu.marginalia.converting.processor.logic.links.TopKeywords; import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator; import nu.marginalia.model.crawl.HtmlFeature; +import org.jetbrains.annotations.Nullable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -53,9 +54,15 @@ public class DomainProcessor { } @SneakyThrows + @Nullable public ProcessedDomain process(SerializableCrawlDataStream dataStream) { + if (!dataStream.hasNext()) { + return null; + } + var ret = new ProcessedDomain(); List docs = new ArrayList<>(); + Set processedUrls = new HashSet<>(); boolean cookies = false; String ip = ""; @@ -79,7 +86,7 @@ public class DomainProcessor { ret.domain = new EdgeDomain(crawledDomain.domain); ret.ip = crawledDomain.ip; - cookies = Objects.requireNonNullElse(crawledDomain.cookies, Collections.emptyList()).size() > 0; + cookies = crawledDomain.hasCookies(); ip = crawledDomain.ip; if (crawledDomain.redirectDomain != null) { @@ -90,10 +97,12 @@ public class DomainProcessor { } else if (data instanceof CrawledDocument doc) { try { - if (doc.url == null) + if (doc.url == null || !processedUrls.add(doc.url)) continue; - fixBadCanonicalTag(doc); + if (Boolean.TRUE.equals(doc.hasCookies)) { + cookies = true; + } // This case should never be reachable, as we should have initiated // the externalDomainLinks variable above if we made it past the @@ -161,25 +170,6 @@ public class DomainProcessor { return false; } - private void fixBadCanonicalTag(CrawledDocument doc) { - // Some sites have a canonical tag that points to a different domain, - // but our loader can not support this, so we point these back to the - // original url. 
- - var canonicalOpt = EdgeUrl.parse(doc.canonicalUrl); - if (canonicalOpt.isEmpty()) return; - - var urlOpt = EdgeUrl.parse(doc.url); - if (urlOpt.isEmpty()) return; - - var urlActual = urlOpt.get(); - var canonicalActual = canonicalOpt.get(); - - if (!Objects.equals(urlActual.domain, canonicalActual.domain)) { - doc.canonicalUrl = doc.url; - } - } - private void calculateStatistics(ProcessedDomain ret, DomainLinks externalDomainLinks) { LinkGraph linkGraph = new LinkGraph(); TopKeywords topKeywords = new TopKeywords(); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java index 60f81d19..808d4224 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java @@ -7,6 +7,7 @@ import nu.marginalia.atags.source.AnchorTagsSourceFactory; import nu.marginalia.converting.sideload.dirtree.DirtreeSideloaderFactory; import nu.marginalia.converting.sideload.encyclopedia.EncyclopediaMarginaliaNuSideloader; import nu.marginalia.converting.sideload.stackexchange.StackexchangeSideloader; +import nu.marginalia.converting.sideload.warc.WarcSideloadFactory; import nu.marginalia.keyword.DocumentKeywordExtractor; import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider; @@ -24,6 +25,7 @@ public class SideloadSourceFactory { private final AnchorTextKeywords anchorTextKeywords; private final AnchorTagsSourceFactory anchorTagsSourceFactory; private final DirtreeSideloaderFactory dirtreeSideloaderFactory; + private final WarcSideloadFactory warcSideloadFactory; @Inject public SideloadSourceFactory(Gson gson, @@ -31,7 +33,8 @@ public class SideloadSourceFactory { ThreadLocalSentenceExtractorProvider sentenceExtractorProvider, DocumentKeywordExtractor documentKeywordExtractor, AnchorTextKeywords anchorTextKeywords, AnchorTagsSourceFactory anchorTagsSourceFactory, - DirtreeSideloaderFactory dirtreeSideloaderFactory) { + DirtreeSideloaderFactory dirtreeSideloaderFactory, + WarcSideloadFactory warcSideloadFactory) { this.gson = gson; this.sideloaderProcessing = sideloaderProcessing; this.sentenceExtractorProvider = sentenceExtractorProvider; @@ -39,6 +42,7 @@ public class SideloadSourceFactory { this.anchorTextKeywords = anchorTextKeywords; this.anchorTagsSourceFactory = anchorTagsSourceFactory; this.dirtreeSideloaderFactory = dirtreeSideloaderFactory; + this.warcSideloadFactory = warcSideloadFactory; } public SideloadSource sideloadEncyclopediaMarginaliaNu(Path pathToDbFile, String baseUrl) throws SQLException { @@ -49,6 +53,10 @@ public class SideloadSourceFactory { return dirtreeSideloaderFactory.createSideloaders(pathToYamlFile); } + public Collection sideloadWarc(Path pathToWarcFiles) throws IOException { + return warcSideloadFactory.createSideloaders(pathToWarcFiles); + } + /** Do not use, this code isn't finished */ public Collection sideloadStackexchange(Path pathToDbFileRoot) throws IOException { try (var dirs = Files.walk(pathToDbFileRoot)) { diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java index 65f0bd41..16a1ae7c 100644 --- 
a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java @@ -50,7 +50,8 @@ public class SideloaderProcessing { Integer.toHexString(url.hashCode()), url, "", - "SIDELOAD" + "SIDELOAD", + false ); var ret = new ProcessedDocument(); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/warc/WarcSideloadFactory.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/warc/WarcSideloadFactory.java new file mode 100644 index 00000000..35fb6d3a --- /dev/null +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/warc/WarcSideloadFactory.java @@ -0,0 +1,32 @@ +package nu.marginalia.converting.sideload.warc; + +import nu.marginalia.converting.sideload.SideloadSource; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +public class WarcSideloadFactory { + + public Collection createSideloaders(Path pathToWarcFiles) throws IOException { + final List files = new ArrayList<>(); + + try (var stream = Files.list(pathToWarcFiles)) { + stream + .filter(Files::isRegularFile) + .filter(this::isWarcFile) + .forEach(files::add); + + } + // stub + return null; + } + + private boolean isWarcFile(Path path) { + return path.toString().endsWith(".warc") + || path.toString().endsWith(".warc.gz"); + } +} \ No newline at end of file diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/warc/WarcSideloader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/warc/WarcSideloader.java new file mode 100644 index 00000000..2d8c1bda --- /dev/null +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/warc/WarcSideloader.java @@ -0,0 +1,160 @@ +package nu.marginalia.converting.sideload.warc; + +import lombok.SneakyThrows; +import nu.marginalia.atags.model.DomainLinks; +import nu.marginalia.contenttype.ContentTypeParser; +import nu.marginalia.contenttype.DocumentBodyToString; +import nu.marginalia.converting.model.GeneratorType; +import nu.marginalia.converting.model.ProcessedDocument; +import nu.marginalia.converting.model.ProcessedDomain; +import nu.marginalia.converting.sideload.SideloadSource; +import nu.marginalia.converting.sideload.SideloaderProcessing; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.crawl.DomainIndexingState; +import org.netpreserve.jwarc.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.net.URISyntaxException; +import java.nio.file.Path; +import java.util.Iterator; +import java.util.List; +import java.util.Objects; +import java.util.Optional; + +public class WarcSideloader implements SideloadSource, AutoCloseable { + + private static final Logger logger = LoggerFactory.getLogger(WarcSideloader.class); + + private final SideloaderProcessing sideloaderProcessing; + + private final WarcReader reader; + + private final EdgeDomain domain; + + + public WarcSideloader(Path warcFile, + SideloaderProcessing sideloaderProcessing) + throws IOException + { + this.sideloaderProcessing = sideloaderProcessing; + this.reader = new WarcReader(warcFile); + this.domain = sniffDomainFromWarc() + .orElseThrow(() -> 
new IOException("Could not identify domain from warc file")); + } + + @SneakyThrows + @Override + public ProcessedDomain getDomain() { + var ret = new ProcessedDomain(); + + ret.domain = domain; + ret.ip = "0.0.0.0"; + ret.state = DomainIndexingState.ACTIVE; + + return ret; + } + + private Optional sniffDomainFromWarc() throws IOException { + try { + for (var record : reader) { + if (!(record instanceof WarcRequest request)) { + continue; + } + + String target = request.target(); + if (target.startsWith("http://") || target.startsWith("https://")) { + return Optional.of(new EdgeUrl(target).getDomain()); + } + } + } catch (URISyntaxException e) { + return Optional.empty(); + } finally { + reader.position(0); + } + return Optional.empty(); + } + + @SneakyThrows + @Override + public Iterator getDocumentsStream() { + return reader.records() + .filter(record -> record instanceof WarcResponse) + .map(WarcResponse.class::cast) + .filter(this::isRelevantResponse) + .map(this::process) + .filter(Optional::isPresent) + .map(Optional::get) + .iterator(); + } + + private boolean isRelevantResponse(WarcResponse warcResponse) { + try { + HttpResponse httpResponse = warcResponse.http(); + if (httpResponse == null) + return false; + if (httpResponse.status() != 200) + return false; + if (!Objects.equals(httpResponse.contentType(), MediaType.HTML)) + return false; + + var url = new EdgeUrl(warcResponse.target()); + if (!Objects.equals(url.getDomain(), domain)) { + return false; + } + + return true; + } catch (Exception e) { + e.printStackTrace(); + } + + return false; + } + + @SneakyThrows + private Optional process(WarcResponse response) { + Optional body = getBody(response); + String url = response.target(); + + // We trim "/index.html"-suffixes from the index if they are present, + // since this is typically an artifact from document retrieval + if (url.endsWith("/index.html")) { + url = url.substring(0, url.length() - "index.html".length()); + } + + if (body.isEmpty()) { + return Optional.empty(); + } + + return Optional.of(sideloaderProcessing + .processDocument(url, body.get(), List.of(), new DomainLinks(), + GeneratorType.DOCS, + 10_000)); + } + + @SneakyThrows + private Optional getBody(WarcResponse response) { + var http = response.http(); + + // TODO: We should support additional encodings here + try (var body = http.body()) { + String contentType = http.headers().first("Content-Type").orElse(null); + byte[] bytes = body.stream().readAllBytes(); + + var ct = ContentTypeParser.parseContentType(contentType, bytes); + return Optional.of(DocumentBodyToString.getStringData(ct, bytes)); + } + catch (Exception ex) { + logger.info("Failed to parse body", ex); + } + return Optional.empty(); + } + + @Override + public void close() throws Exception { + reader.close(); + } + +} diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterWriter.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterWriter.java index 1ca66ed6..3069c5ed 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterWriter.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterWriter.java @@ -3,6 +3,7 @@ package nu.marginalia.converting.writer; import lombok.SneakyThrows; import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.worklog.BatchingWorkLog; +import org.jetbrains.annotations.Nullable; import org.slf4j.Logger; import 
org.slf4j.LoggerFactory; @@ -41,7 +42,10 @@ public class ConverterWriter implements AutoCloseable { } @SneakyThrows - public void accept(ProcessedDomain domain) { + public void accept(@Nullable ProcessedDomain domain) { + if (null == domain) + return; + domainData.put(domain); } diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java index ce0d8f4a..eaa9d813 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java @@ -65,6 +65,7 @@ public class ConvertingIntegrationTest { @Test public void testMemexMarginaliaNu() throws IOException { var ret = domainProcessor.process(asSerializableCrawlData(readMarginaliaWorkingSet())); + assertNotNull(ret); assertEquals(ret.state, DomainIndexingState.ACTIVE); assertEquals(ret.domain, new EdgeDomain("memex.marginalia.nu")); @@ -114,7 +115,8 @@ public class ConvertingIntegrationTest { Double.toString(Math.random()), "https://memex.marginalia.nu/" + file, null, - "" + "", + false ); docs.add(doc); } diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java index 7ef056d2..535eac31 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java @@ -3,31 +3,51 @@ package nu.marginalia.converting; import com.google.inject.Guice; import com.google.inject.Injector; import lombok.SneakyThrows; +import nu.marginalia.UserAgent; import nu.marginalia.WmsaHome; +import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.converting.processor.DomainProcessor; import nu.marginalia.crawl.retreival.CrawlerRetreiver; import nu.marginalia.crawl.retreival.DomainProber; import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; -import nu.marginalia.crawling.io.SerializableCrawlDataStream; +import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; +import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.crawling.model.SerializableCrawlData; +import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.model.crawlspec.CrawlSpecRecord; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Tag; -import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; import java.util.ArrayList; import java.util.List; +import java.util.Set; +import java.util.function.Predicate; +import java.util.stream.Collectors; -/* This is mostly a debugging utility */ +import static org.junit.jupiter.api.Assertions.*; + +/** Tests for the 
crawler and converter integration. These are pretty slow and potentially + * a bit flaky, since they attempt to fetch real websites. + */ @Tag("slow") public class CrawlingThenConvertingIntegrationTest { private DomainProcessor domainProcessor; private HttpFetcher httpFetcher; + private static final Logger logger = LoggerFactory.getLogger(CrawlingThenConvertingIntegrationTest.class); + + private Path fileName; + private Path fileName2; + @SneakyThrows @BeforeAll public static void setUpAll() { @@ -44,10 +64,80 @@ public class CrawlingThenConvertingIntegrationTest { domainProcessor = injector.getInstance(DomainProcessor.class); httpFetcher = new HttpFetcherImpl(WmsaHome.getUserAgent().uaString()); + this.fileName = Files.createTempFile("crawling-then-converting", ".warc.gz"); + this.fileName2 = Files.createTempFile("crawling-then-converting", ".warc.gz"); + } + + @AfterEach + public void tearDown() throws IOException { + Files.deleteIfExists(fileName); + Files.deleteIfExists(fileName2); } @Test - public void crawlThenProcess() { + public void testInvalidDomain() throws IOException { + // Attempt to fetch an invalid domain + var specs = CrawlSpecRecord.builder() + .domain("invalid.invalid.invalid") + .crawlDepth(10) + .urls(List.of()) // add specific URLs to crawl here + .build(); + + CrawledDomain crawlData = crawl(specs); + + assertEquals("ERROR", crawlData.crawlerStatus); + assertTrue(crawlData.doc.isEmpty()); + + var processedData = process(); + + assertNotNull(processedData); + assertTrue(processedData.documents.isEmpty()); + } + + @Test + public void testRedirectingDomain() throws IOException { + // Attempt to fetch an invalid domain + var specs = CrawlSpecRecord.builder() + .domain("memex.marginalia.nu") + .crawlDepth(10) + .urls(List.of()) // add specific URLs to crawl here + .build(); + + CrawledDomain crawlData = crawl(specs); + + assertEquals("REDIRECT", crawlData.crawlerStatus); + assertEquals("www.marginalia.nu", crawlData.redirectDomain); + assertTrue(crawlData.doc.isEmpty()); + + var processedData = process(); + + assertNotNull(processedData); + assertTrue(processedData.documents.isEmpty()); + } + + @Test + public void testBlockedDomain() throws IOException { + // Attempt to fetch an invalid domain + var specs = CrawlSpecRecord.builder() + .domain("search.marginalia.nu") + .crawlDepth(10) + .urls(List.of()) // add specific URLs to crawl here + .build(); + + CrawledDomain crawlData = crawl(specs, d->false); // simulate blocking by blacklisting everything + + assertEquals("ERROR", crawlData.crawlerStatus); + assertEquals("BLOCKED;IP not allowed", crawlData.crawlerStatusDesc); + assertTrue(crawlData.doc.isEmpty()); + + var processedData = process(); + + assertNotNull(processedData); + assertTrue(processedData.documents.isEmpty()); + } + + @Test + public void crawlSunnyDay() throws IOException { var specs = CrawlSpecRecord.builder() .domain("www.marginalia.nu") .crawlDepth(10) @@ -55,12 +145,20 @@ public class CrawlingThenConvertingIntegrationTest { .build(); CrawledDomain domain = crawl(specs); + assertFalse(domain.doc.isEmpty()); + assertEquals("OK", domain.crawlerStatus); + assertEquals("www.marginalia.nu", domain.domain); - List data = new ArrayList<>(); - data.add(domain); - data.addAll(domain.doc); + boolean hasRobotsTxt = domain.doc.stream().map(doc -> doc.url).anyMatch(url -> url.endsWith("/robots.txt")); + assertFalse(hasRobotsTxt, "Robots.txt should not leave the crawler"); + + var output = process(); + + assertNotNull(output); + assertFalse(output.documents.isEmpty()); 
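The crawl() and process() helpers defined further down wire the new WARC-first flow together; stripped of the test scaffolding, the shape of that flow is roughly the following sketch, where warcFile and parquetFile are temporary paths standing in for the real storage layout:

    // 1. crawl into a WARC file
    try (var recorder = new WarcRecorder(warcFile)) {
        new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch();
    }

    // 2. convert the WARC capture into the parquet format the converter consumes
    CrawledDocumentParquetRecordFileWriter.convertWarc(specs.domain, new UserAgent("test"), warcFile, parquetFile);

    // 3. read the parquet file back as a stream of crawl data and process it
    try (var stream = new ParquetSerializableCrawlDataStream(parquetFile)) {
        var processedDomain = domainProcessor.process(stream);
    }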
+ assertEquals(new EdgeDomain("www.marginalia.nu"), output.domain); + assertEquals(DomainIndexingState.ACTIVE, output.state); - var output = domainProcessor.process(SerializableCrawlDataStream.fromIterator(data.iterator())); for (var doc : output.documents) { if (doc.isOk()) { @@ -73,12 +171,122 @@ public class CrawlingThenConvertingIntegrationTest { } - private CrawledDomain crawl(CrawlSpecRecord specs) { + + + @Test + public void crawlContentTypes() throws IOException { + var specs = CrawlSpecRecord.builder() + .domain("www.marginalia.nu") + .crawlDepth(5) + .urls(List.of( + "https://www.marginalia.nu/sanic.png", + "https://www.marginalia.nu/invalid" + )) + .build(); + + CrawledDomain domain = crawl(specs); + assertFalse(domain.doc.isEmpty()); + assertEquals("OK", domain.crawlerStatus); + assertEquals("www.marginalia.nu", domain.domain); + + Set allUrls = domain.doc.stream().map(doc -> doc.url).collect(Collectors.toSet()); + assertTrue(allUrls.contains("https://www.marginalia.nu/sanic.png"), "Should have record for image despite blocked content type"); + assertTrue(allUrls.contains("https://www.marginalia.nu/invalid"), "Should have have record for invalid URL"); + + var output = process(); + + assertNotNull(output); + assertFalse(output.documents.isEmpty()); + assertEquals(new EdgeDomain("www.marginalia.nu"), output.domain); + assertEquals(DomainIndexingState.ACTIVE, output.state); + + + for (var doc : output.documents) { + if (doc.isOk()) { + System.out.println(doc.url + "\t" + doc.state + "\t" + doc.details.title); + } + else { + System.out.println(doc.url + "\t" + doc.state + "\t" + doc.stateReason); + } + } + + } + + + @Test + public void crawlRobotsTxt() throws IOException { + var specs = CrawlSpecRecord.builder() + .domain("search.marginalia.nu") + .crawlDepth(5) + .urls(List.of( + "https://search.marginalia.nu/search?q=hello+world" + )) + .build(); + + CrawledDomain domain = crawl(specs); + assertFalse(domain.doc.isEmpty()); + assertEquals("OK", domain.crawlerStatus); + assertEquals("search.marginalia.nu", domain.domain); + + Set allUrls = domain.doc.stream().map(doc -> doc.url).collect(Collectors.toSet()); + assertTrue(allUrls.contains("https://search.marginalia.nu/search"), "We expect a record for entities that are forbidden"); + + var output = process(); + + assertNotNull(output); + assertFalse(output.documents.isEmpty()); + assertEquals(new EdgeDomain("search.marginalia.nu"), output.domain); + assertEquals(DomainIndexingState.ACTIVE, output.state); + + for (var doc : output.documents) { + if (doc.isOk()) { + System.out.println(doc.url + "\t" + doc.state + "\t" + doc.details.title); + } + else { + System.out.println(doc.url + "\t" + doc.state + "\t" + doc.stateReason); + } + } + + } + + private ProcessedDomain process() { + try (var stream = new ParquetSerializableCrawlDataStream(fileName2)) { + return domainProcessor.process(stream); + } + catch (Exception e) { + Assertions.fail(e); + return null; // unreachable + } + } + private CrawledDomain crawl(CrawlSpecRecord specs) throws IOException { + return crawl(specs, domain -> true); + } + + private CrawledDomain crawl(CrawlSpecRecord specs, Predicate domainBlacklist) throws IOException { List data = new ArrayList<>(); - new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, data::add).fetch(); + try (var recorder = new WarcRecorder(fileName)) { + new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, recorder).fetch(); + } + + CrawledDocumentParquetRecordFileWriter.convertWarc(specs.domain, + 
new UserAgent("test"), + fileName, fileName2); + + try (var reader = new ParquetSerializableCrawlDataStream(fileName2)) { + while (reader.hasNext()) { + var next = reader.next(); + logger.info("{}", next); + data.add(next); + } + } + + CrawledDomain domain = data.stream() + .filter(CrawledDomain.class::isInstance) + .map(CrawledDomain.class::cast) + .findFirst() + .get(); - CrawledDomain domain = data.stream().filter(CrawledDomain.class::isInstance).map(CrawledDomain.class::cast).findFirst().get(); data.stream().filter(CrawledDocument.class::isInstance).map(CrawledDocument.class::cast).forEach(domain.doc::add); return domain; } diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/warc/WarcSideloaderTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/warc/WarcSideloaderTest.java new file mode 100644 index 00000000..da94e3a8 --- /dev/null +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/warc/WarcSideloaderTest.java @@ -0,0 +1,81 @@ +package nu.marginalia.converting.sideload.warc; + +import com.google.inject.AbstractModule; +import com.google.inject.Guice; +import nu.marginalia.converting.ConverterModule; +import nu.marginalia.converting.model.ProcessedDocument; +import nu.marginalia.converting.model.ProcessedDomain; +import nu.marginalia.converting.processor.ConverterDomainTypes; +import nu.marginalia.converting.sideload.SideloaderProcessing; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.Mockito; +import org.netpreserve.jwarc.WarcWriter; + +import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.mockito.Mockito.when; + +class WarcSideloaderTest extends AbstractModule { + SideloaderProcessing processing; + + Path warcFile; + @BeforeEach + public void setUp() throws IOException { + processing = Guice.createInjector(new ConverterModule(), this) + .getInstance(SideloaderProcessing.class); + warcFile = Files.createTempFile(getClass().getSimpleName(), ".warc.gz"); + } + + @AfterEach + public void tearDown() throws IOException { + Files.deleteIfExists(warcFile); + } + + public void configure() { + var domainTypesMock = Mockito.mock(ConverterDomainTypes.class); + when(domainTypesMock.isBlog(Mockito.any())).thenReturn(false); + + bind(ConverterDomainTypes.class).toInstance(domainTypesMock); + } + + + @Test + public void test() throws IOException { + try (var writer = new WarcWriter(Files.newOutputStream(warcFile))) { + writer.fetch(new URI("https://www.marginalia.nu/")); + writer.fetch(new URI("https://www.marginalia.nu/log/93_atags/")); + writer.fetch(new URI("https://www.marginalia.nu/links/")); + } catch (URISyntaxException e) { + throw new RuntimeException(e); + } + + ProcessedDomain domain; + List docs = new ArrayList<>(); + + try (var sideloader = new WarcSideloader(warcFile, processing)) { + domain = sideloader.getDomain(); + sideloader.getDocumentsStream().forEachRemaining(docs::add); + } catch (Exception e) { + throw new RuntimeException(e); + } + + assertNotNull(domain); + assertEquals(3, docs.size()); + List fetchedUrls = docs.stream().map(doc -> doc.url).map(Object::toString).toList(); + 
assertEquals(List.of( + "https://www.marginalia.nu/", + "https://www.marginalia.nu/log/93_atags/", + "https://www.marginalia.nu/links/"), + fetchedUrls); + } +} \ No newline at end of file diff --git a/code/processes/crawling-process/build.gradle b/code/processes/crawling-process/build.gradle index 00f0f01b..baa02906 100644 --- a/code/processes/crawling-process/build.gradle +++ b/code/processes/crawling-process/build.gradle @@ -41,6 +41,7 @@ dependencies { implementation project(':code:features-convert:anchor-keywords') implementation project(':code:features-crawl:crawl-blocklist') implementation project(':code:features-crawl:link-parser') + implementation project(':code:features-crawl:content-type') implementation libs.bundles.slf4j @@ -48,6 +49,7 @@ dependencies { implementation libs.guice implementation libs.gson implementation libs.zstd + implementation libs.jwarc implementation libs.crawlercommons implementation libs.okhttp3 implementation libs.jsoup diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java deleted file mode 100644 index 1b61cb0d..00000000 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java +++ /dev/null @@ -1,83 +0,0 @@ -package nu.marginalia.crawl; - -import lombok.SneakyThrows; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.concurrent.Semaphore; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicBoolean; - -public class CrawlLimiter { - public static final int maxPoolSize = Integer.getInteger("crawler.pool-size", 256); - - // Thresholds for throttling task-spawning. Note there's a bit of hysteresis to this - private final long THROTTLE_TRIGGER_FREE_RAM = Runtime.getRuntime().maxMemory() / 4; - private final long THROTTLE_RELEASE_FREE_RAM = Runtime.getRuntime().maxMemory() / 2; - - private final Semaphore taskSemCount = new Semaphore(maxPoolSize); - - // When set to true, the crawler will wait before starting additional tasks - private final AtomicBoolean throttle = new AtomicBoolean(false); - private static final Logger logger = LoggerFactory.getLogger(CrawlLimiter.class); - - public CrawlLimiter() { - Thread monitorThread = new Thread(this::monitor, "Memory Monitor"); - monitorThread.setDaemon(true); - monitorThread.start(); - } - - - @SneakyThrows - public void monitor() { - for (;;) { - synchronized (throttle) { - boolean oldThrottle = throttle.get(); - boolean newThrottle = oldThrottle; - - if (Runtime.getRuntime().maxMemory() == Long.MAX_VALUE) { - // According to the spec this may happen, although it seems to rarely - // be the case in practice - logger.warn("Memory based throttling disabled (set Xmx)"); - return; - } - - final long freeMemory = Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory(); - - if (oldThrottle && freeMemory > THROTTLE_RELEASE_FREE_RAM) { - newThrottle = false; - logger.warn("Memory based throttling released"); - } - else if (!oldThrottle && freeMemory < THROTTLE_TRIGGER_FREE_RAM) { - newThrottle = true; - logger.warn("Memory based throttling triggered"); - - // Try to GC - System.gc(); - } - - - throttle.set(newThrottle); - - if (!newThrottle) { - throttle.notifyAll(); - } - if (newThrottle != oldThrottle) { - logger.warn("Memory based throttling set to {}", newThrottle); - } - } - - TimeUnit.SECONDS.sleep(1); - } - } - - @SneakyThrows - public void waitForEnoughRAM() { - while (throttle.get()) { 
- synchronized (throttle) { - throttle.wait(30000); - } - } - } - -} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java index f824d815..c3864868 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java @@ -13,10 +13,13 @@ import nu.marginalia.atags.source.AnchorTagsSourceFactory; import nu.marginalia.crawl.retreival.CrawlDataReference; import nu.marginalia.crawl.retreival.DomainProber; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; +import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.crawl.spec.CrawlSpecProvider; import nu.marginalia.crawl.spec.DbCrawlSpecProvider; import nu.marginalia.crawl.spec.ParquetCrawlSpecProvider; import nu.marginalia.crawling.io.CrawledDomainReader; +import nu.marginalia.crawling.io.CrawlerOutputFile; +import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter; import nu.marginalia.crawlspec.CrawlSpecFileNames; import nu.marginalia.storage.FileStorageService; import nu.marginalia.model.crawlspec.CrawlSpecRecord; @@ -27,18 +30,17 @@ import nu.marginalia.mq.inbox.MqSingleShotInbox; import nu.marginalia.process.control.ProcessHeartbeatImpl; import nu.marginalia.process.log.WorkLog; import nu.marginalia.service.module.DatabaseModule; -import nu.marginalia.crawling.io.CrawledDomainWriter; import nu.marginalia.crawl.retreival.CrawlerRetreiver; -import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.util.SimpleBlockingThreadPool; import okhttp3.ConnectionPool; import okhttp3.Dispatcher; -import okhttp3.internal.Util; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; +import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.StandardCopyOption; import java.sql.SQLException; import java.util.*; import java.util.concurrent.*; @@ -49,13 +51,8 @@ import static nu.marginalia.mqapi.ProcessInboxNames.CRAWLER_INBOX; public class CrawlerMain { private final static Logger logger = LoggerFactory.getLogger(CrawlerMain.class); - private final ProcessHeartbeatImpl heartbeat; - private final ConnectionPool connectionPool = new ConnectionPool(5, 10, TimeUnit.SECONDS); - - private final Dispatcher dispatcher = new Dispatcher(new ThreadPoolExecutor(0, Integer.MAX_VALUE, 5, TimeUnit.SECONDS, - new SynchronousQueue<>(), Util.threadFactory("OkHttp Dispatcher", true))); - private final UserAgent userAgent; + private final ProcessHeartbeatImpl heartbeat; private final MessageQueueFactory messageQueueFactory; private final DomainProber domainProber; private final FileStorageService fileStorageService; @@ -66,13 +63,12 @@ public class CrawlerMain { private final SimpleBlockingThreadPool pool; private final Map processingIds = new ConcurrentHashMap<>(); - private final CrawledDomainReader reader = new CrawledDomainReader(); final AbortMonitor abortMonitor = AbortMonitor.getInstance(); volatile int totalTasks; final AtomicInteger tasksDone = new AtomicInteger(0); - private final CrawlLimiter limiter = new CrawlLimiter(); + private HttpFetcherImpl fetcher; @Inject public CrawlerMain(UserAgent userAgent, @@ -83,8 +79,8 @@ public class CrawlerMain { DbCrawlSpecProvider dbCrawlSpecProvider, AnchorTagsSourceFactory anchorTagsSourceFactory, Gson gson) { - this.heartbeat = heartbeat; this.userAgent = userAgent; 
+ this.heartbeat = heartbeat; this.messageQueueFactory = messageQueueFactory; this.domainProber = domainProber; this.fileStorageService = fileStorageService; @@ -93,8 +89,14 @@ public class CrawlerMain { this.gson = gson; this.node = processConfiguration.node(); - // maybe need to set -Xss for JVM to deal with this? - pool = new SimpleBlockingThreadPool("CrawlerPool", CrawlLimiter.maxPoolSize, 1); + pool = new SimpleBlockingThreadPool("CrawlerPool", + Integer.getInteger("crawler.pool-size", 256), + 1); + + fetcher = new HttpFetcherImpl(userAgent.uaString(), + new Dispatcher(Executors.newVirtualThreadPerTaskExecutor()), + new ConnectionPool(5, 10, TimeUnit.SECONDS) + ); } public static void main(String... args) throws Exception { @@ -141,6 +143,7 @@ public class CrawlerMain { public void run(CrawlSpecProvider specProvider, Path outputDir) throws InterruptedException, IOException { heartbeat.start(); + try (WorkLog workLog = new WorkLog(outputDir.resolve("crawler.log")); AnchorTagsSource anchorTagsSource = anchorTagsSourceFactory.create(specProvider.getDomains()) ) { @@ -175,6 +178,7 @@ public class CrawlerMain { activePoolCount = newActivePoolCount; } } + } catch (Exception ex) { logger.warn("Exception in crawler", ex); @@ -211,27 +215,48 @@ public class CrawlerMain { @Override public void run() throws Exception { - limiter.waitForEnoughRAM(); + Path newWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.LIVE); + Path tempFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP); + Path finalWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.FINAL); + Path parquetFile = CrawlerOutputFile.createParquetPath(outputDir, id, domain); - HttpFetcher fetcher = new HttpFetcherImpl(userAgent.uaString(), dispatcher, connectionPool); + if (Files.exists(newWarcFile)) { + Files.move(newWarcFile, tempFile, StandardCopyOption.REPLACE_EXISTING); + } + else { + Files.deleteIfExists(tempFile); + } - try (CrawledDomainWriter writer = new CrawledDomainWriter(outputDir, domain, id); + try (var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now + var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, warcRecorder); CrawlDataReference reference = getReference()) { Thread.currentThread().setName("crawling:" + domain); var domainLinks = anchorTagsSource.getAnchorTags(domain); - var retreiver = new CrawlerRetreiver(fetcher, domainProber, specification, writer::accept); - int size = retreiver.fetch(domainLinks, reference); + if (Files.exists(tempFile)) { + retriever.syncAbortedRun(tempFile); + Files.delete(tempFile); + } - workLog.setJobToFinished(domain, writer.getOutputFile().toString(), size); + int size = retriever.fetch(domainLinks, reference); + + // Delete the reference crawl data if it's not the same as the new one + // (mostly a case when migrating from legacy->warc) + reference.delete(); + + CrawledDocumentParquetRecordFileWriter + .convertWarc(domain, userAgent, newWarcFile, parquetFile); + + workLog.setJobToFinished(domain, parquetFile.toString(), size); heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks); logger.info("Fetched {}", domain); - } catch (Exception e) { logger.error("Error fetching domain " + domain, e); + Files.deleteIfExists(newWarcFile); + Files.deleteIfExists(tempFile); } finally { // We don't need to double-count these; it's also kept int he workLog @@ -242,8 +267,7 @@ public 
class CrawlerMain { private CrawlDataReference getReference() { try { - var dataStream = reader.createDataStream(outputDir, domain, id); - return new CrawlDataReference(dataStream); + return new CrawlDataReference(CrawledDomainReader.createDataStream(outputDir, domain, id)); } catch (IOException e) { logger.debug("Failed to read previous crawl data for {}", specification.domain); return new CrawlDataReference(); diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java index 985bfc39..65e1529b 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java @@ -5,14 +5,19 @@ import com.google.common.hash.Hashing; import nu.marginalia.crawling.io.SerializableCrawlDataStream; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.lsh.EasyLSH; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import javax.annotation.Nullable; import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; /** A reference to a domain that has been crawled before. */ public class CrawlDataReference implements AutoCloseable { private final SerializableCrawlDataStream data; + private static final Logger logger = LoggerFactory.getLogger(CrawlDataReference.class); public CrawlDataReference(SerializableCrawlDataStream data) { this.data = data; @@ -22,6 +27,15 @@ public class CrawlDataReference implements AutoCloseable { this(SerializableCrawlDataStream.empty()); } + /** Delete the associated data from disk, if it exists */ + public void delete() throws IOException { + Path filePath = data.path(); + + if (filePath != null) { + Files.deleteIfExists(filePath); + } + } + @Nullable public CrawledDocument nextDocument() { try { @@ -32,17 +46,16 @@ public class CrawlDataReference implements AutoCloseable { } } catch (IOException ex) { - ex.printStackTrace(); + logger.error("Failed to read next document", ex); } + return null; } - public boolean isContentBodySame(CrawledDocument one, CrawledDocument other) { - assert one.documentBody != null; - assert other.documentBody != null; + public boolean isContentBodySame(String one, String other) { - final long contentHashOne = contentHash(one.documentBody); - final long contentHashOther = contentHash(other.documentBody); + final long contentHashOne = contentHash(one); + final long contentHashOther = contentHash(other); return EasyLSH.hammingDistance(contentHashOne, contentHashOther) < 4; } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java index ca2494dc..e52b73b6 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java @@ -20,8 +20,18 @@ public class CrawlDelayTimer { this.delayTime = delayTime; } + /** Call when we've gotten an HTTP 429 response. This will wait a moment, and then + * set a flag that slows down the main crawl delay as well. 
*/ + public void waitRetryDelay(RateLimitException ex) throws InterruptedException { + slowDown = true; + + int delay = ex.retryAfter(); + + Thread.sleep(Math.clamp(delay, 100, 5000)); + } + @SneakyThrows - public void delay(long spentTime) { + public void waitFetchDelay(long spentTime) { long sleepTime = delayTime; if (sleepTime >= 1) { @@ -30,10 +40,6 @@ public class CrawlDelayTimer { Thread.sleep(min(sleepTime - spentTime, 5000)); } - else if (slowDown) { - // Additional delay when the server is signalling it wants slower requests - Thread.sleep( DEFAULT_CRAWL_DELAY_MIN_MS); - } else { // When no crawl delay is specified, lean toward twice the fetch+process time, // within sane limits. This means slower servers get slower crawling, and faster @@ -48,10 +54,10 @@ public class CrawlDelayTimer { Thread.sleep(sleepTime - spentTime); } - } - /** Increase the delay between requests if the server is signalling it wants slower requests with HTTP 429 */ - public void slowDown() { - slowDown = true; + if (slowDown) { + // Additional delay when the server is signalling it wants slower requests + Thread.sleep( DEFAULT_CRAWL_DELAY_MIN_MS); + } } } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawledDocumentFactory.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawledDocumentFactory.java new file mode 100644 index 00000000..37f84d58 --- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawledDocumentFactory.java @@ -0,0 +1,91 @@ +package nu.marginalia.crawl.retreival; + +import nu.marginalia.crawling.body.HttpFetchResult; +import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.crawling.model.CrawlerDocumentStatus; +import nu.marginalia.model.EdgeUrl; + +import java.time.LocalDateTime; +import java.util.Objects; + +public class CrawledDocumentFactory { + + public static CrawledDocument createHardErrorRsp(EdgeUrl url, Exception why) { + return CrawledDocument.builder() + .crawlerStatus(CrawlerDocumentStatus.ERROR.toString()) + .crawlerStatusDesc(why.getClass().getSimpleName() + ": " + why.getMessage()) + .timestamp(LocalDateTime.now().toString()) + .url(url.toString()) + .build(); + } + + public static CrawledDocument createUnknownHostError(EdgeUrl url) { + return CrawledDocument.builder() + .crawlerStatus(CrawlerDocumentStatus.ERROR.toString()) + .crawlerStatusDesc("Unknown Host") + .timestamp(LocalDateTime.now().toString()) + .url(url.toString()) + .build(); + } + + public static CrawledDocument createTimeoutErrorRsp(EdgeUrl url) { + return CrawledDocument.builder() + .crawlerStatus("Timeout") + .timestamp(LocalDateTime.now().toString()) + .url(url.toString()) + .build(); + } + + public static CrawledDocument createErrorResponse(EdgeUrl url, HttpFetchResult.ResultOk rsp, CrawlerDocumentStatus status, String why) { + return CrawledDocument.builder() + .crawlerStatus(status.toString()) + .crawlerStatusDesc(why) + .headers(rsp.headers().toString()) + .contentType(Objects.requireNonNullElse(rsp.headers().get("Content-Type"), "")) + .timestamp(LocalDateTime.now().toString()) + .httpStatus(rsp.statusCode()) + .url(url.toString()) + .build(); + } + public static CrawledDocument createErrorResponse(EdgeUrl url, String contentType, int statusCode, CrawlerDocumentStatus status, String why) { + return CrawledDocument.builder() + .crawlerStatus(status.toString()) + .crawlerStatusDesc(why) + .headers("") + .contentType(contentType) + 
.timestamp(LocalDateTime.now().toString()) + .httpStatus(statusCode) + .url(url.toString()) + .build(); + } + + public static CrawledDocument createRedirectResponse(EdgeUrl url, HttpFetchResult.ResultOk rsp, EdgeUrl responseUrl) { + + return CrawledDocument.builder() + .crawlerStatus(CrawlerDocumentStatus.REDIRECT.name()) + .redirectUrl(responseUrl.toString()) + .headers(rsp.headers().toString()) + .contentType(Objects.requireNonNullElse(rsp.headers().get("Content-Type"), "")) + .timestamp(LocalDateTime.now().toString()) + .httpStatus(rsp.statusCode()) + .url(url.toString()) + .build(); + } + + public static CrawledDocument createRobotsError(EdgeUrl url) { + return CrawledDocument.builder() + .url(url.toString()) + .timestamp(LocalDateTime.now().toString()) + .httpStatus(-1) + .crawlerStatus(CrawlerDocumentStatus.ROBOTS_TXT.name()) + .build(); + } + public static CrawledDocument createRetryError(EdgeUrl url) { + return CrawledDocument.builder() + .url(url.toString()) + .timestamp(LocalDateTime.now().toString()) + .httpStatus(429) + .crawlerStatus(CrawlerDocumentStatus.ERROR.name()) + .build(); + } +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index b32e0b6c..18035d52 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -3,11 +3,15 @@ package nu.marginalia.crawl.retreival; import com.google.common.hash.HashFunction; import com.google.common.hash.Hashing; import crawlercommons.robots.SimpleRobotRules; -import lombok.SneakyThrows; import nu.marginalia.atags.model.DomainLinks; +import nu.marginalia.contenttype.ContentType; import nu.marginalia.crawl.retreival.fetcher.ContentTags; import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; -import nu.marginalia.crawl.retreival.fetcher.SitemapRetriever; +import nu.marginalia.crawling.body.HttpFetchResult; +import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; +import nu.marginalia.crawl.retreival.revisit.CrawlerRevisitor; +import nu.marginalia.crawl.retreival.revisit.DocumentWithReference; +import nu.marginalia.crawl.retreival.sitemap.SitemapFetcher; import nu.marginalia.link_parser.LinkParser; import nu.marginalia.crawling.model.*; import nu.marginalia.ip_blocklist.UrlBlocklist; @@ -19,54 +23,49 @@ import org.jsoup.nodes.Document; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import javax.annotation.Nullable; +import java.io.IOException; import java.net.InetAddress; import java.net.UnknownHostException; -import java.time.LocalDateTime; +import java.nio.file.Path; import java.util.*; -import java.util.function.Consumer; -public class CrawlerRetreiver { +public class CrawlerRetreiver implements AutoCloseable { private static final int MAX_ERRORS = 20; + private static final int HTTP_429_RETRY_LIMIT = 1; // Retry 429s once private final HttpFetcher fetcher; private final String domain; - private final Consumer crawledDomainWriter; private static final LinkParser linkParser = new LinkParser(); private static final Logger logger = LoggerFactory.getLogger(CrawlerRetreiver.class); - private static final HashFunction hashMethod = Hashing.murmur3_128(0); private static final UrlBlocklist urlBlocklist = new UrlBlocklist(); private static final LinkFilterSelector linkFilterSelector = new LinkFilterSelector(); 
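// Annotation, not part of the patch: a sketch of how a caller might drive the reworked
// CrawlerRetreiver, which now records all HTTP traffic through a WarcRecorder and is
// AutoCloseable. The fetcher, domainProber, specs, warcFile, domainLinks and oldCrawlData
// values are assumed to be supplied by the surrounding CrawlerMain code.
//
//     WarcRecorder warcRecorder = new WarcRecorder(warcFile);
//     try (var retriever = new CrawlerRetreiver(fetcher, domainProber, specs, warcRecorder)) {
//         retriever.syncAbortedRun(warcFile);   // optional: replay a partially written WARC file
//         int fetchedCount = retriever.fetch(domainLinks, oldCrawlData);
//     }
//     // closing the retriever also closes the underlying WarcRecorder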
private final DomainProber domainProber; - private final SitemapRetriever sitemapRetriever; private final DomainCrawlFrontier crawlFrontier; + private final WarcRecorder warcRecorder; + private final CrawlerRevisitor crawlerRevisitor; + private final SitemapFetcher sitemapFetcher; int errorCount = 0; - /** recrawlState tag for documents that had a HTTP status 304 */ - private static final String documentWasRetainedTag = "RETAINED/304"; - - /** recrawlState tag for documents that had a 200 status but were identical to a previous version */ - private static final String documentWasSameTag = "SAME-BY-COMPARISON"; - public CrawlerRetreiver(HttpFetcher fetcher, DomainProber domainProber, CrawlSpecRecord specs, - Consumer writer) { + WarcRecorder warcRecorder) + { + this.warcRecorder = warcRecorder; this.fetcher = fetcher; this.domainProber = domainProber; domain = specs.domain; - crawledDomainWriter = writer; - - this.crawlFrontier = new DomainCrawlFrontier(new EdgeDomain(domain), Objects.requireNonNullElse(specs.urls, List.of()), specs.crawlDepth); - sitemapRetriever = fetcher.createSitemapRetriever(); + crawlFrontier = new DomainCrawlFrontier(new EdgeDomain(domain), Objects.requireNonNullElse(specs.urls, List.of()), specs.crawlDepth); + crawlerRevisitor = new CrawlerRevisitor(crawlFrontier, this, warcRecorder); + sitemapFetcher = new SitemapFetcher(crawlFrontier, fetcher.createSitemapRetriever()); // We must always crawl the index page first, this is assumed when fingerprinting the server var fst = crawlFrontier.peek(); @@ -90,43 +89,42 @@ public class CrawlerRetreiver { public int fetch(DomainLinks domainLinks, CrawlDataReference oldCrawlData) { final DomainProber.ProbeResult probeResult = domainProber.probeDomain(fetcher, domain, crawlFrontier.peek()); - return switch (probeResult) { - case DomainProber.ProbeResultOk(EdgeUrl probedUrl) -> crawlDomain(oldCrawlData, probedUrl, domainLinks); - case DomainProber.ProbeResultError(CrawlerDomainStatus status, String desc) -> { - crawledDomainWriter.accept( - CrawledDomain.builder() - .crawlerStatus(status.name()) - .crawlerStatusDesc(desc) - .domain(domain) - .ip(findIp(domain)) - .build() - ); - yield 1; - } - case DomainProber.ProbeResultRedirect(EdgeDomain redirectDomain) -> { - crawledDomainWriter.accept( - CrawledDomain.builder() - .crawlerStatus(CrawlerDomainStatus.REDIRECT.name()) - .crawlerStatusDesc("Redirected to different domain") - .redirectDomain(redirectDomain.toString()) - .domain(domain) - .ip(findIp(domain)) - .build() - ); - yield 1; - } - }; + try { + return crawlDomain(oldCrawlData, probeResult, domainLinks); + } + catch (Exception ex) { + logger.error("Error crawling domain {}", domain, ex); + return 0; + } } - private int crawlDomain(CrawlDataReference oldCrawlData, EdgeUrl rootUrl, DomainLinks domainLinks) { + public void syncAbortedRun(Path warcFile) { + var resync = new CrawlerWarcResynchronizer(crawlFrontier, warcRecorder); + + resync.run(warcFile); + } + + private int crawlDomain(CrawlDataReference oldCrawlData, DomainProber.ProbeResult probeResult, DomainLinks domainLinks) throws IOException, InterruptedException { String ip = findIp(domain); + EdgeUrl rootUrl; + + warcRecorder.writeWarcinfoHeader(ip, new EdgeDomain(domain), probeResult); + + if (!(probeResult instanceof DomainProber.ProbeResultOk ok)) { + return 1; + } + else { + rootUrl = ok.probedUrl(); + } + + assert !crawlFrontier.isEmpty(); - final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(crawlFrontier.peek().domain); + final SimpleRobotRules 
robotsRules = fetcher.fetchRobotRules(crawlFrontier.peek().domain, warcRecorder); final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay()); - sniffRootDocument(delayTimer, rootUrl); + sniffRootDocument(rootUrl); // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified int recrawled = recrawl(oldCrawlData, robotsRules, delayTimer); @@ -140,9 +138,15 @@ public class CrawlerRetreiver { crawlFrontier.addAllToQueue(domainLinks.getUrls(rootUrl.proto)); // Add links from the sitemap to the crawl frontier - downloadSitemaps(robotsRules, rootUrl); + sitemapFetcher.downloadSitemaps(robotsRules, rootUrl); - CrawledDomain ret = new CrawledDomain(domain, null, CrawlerDomainStatus.OK.name(), null, ip, new ArrayList<>(), null); + CrawledDomain ret = new CrawledDomain(domain, + null, + CrawlerDomainStatus.OK.name(), + null, + ip, + new ArrayList<>(), + null); int fetchedCount = recrawled; @@ -154,7 +158,7 @@ public class CrawlerRetreiver { var top = crawlFrontier.takeNextUrl(); if (!robotsRules.isAllowed(top.toString())) { - crawledDomainWriter.accept(createRobotsError(top)); + warcRecorder.flagAsRobotsTxtError(top); continue; } @@ -177,149 +181,43 @@ public class CrawlerRetreiver { continue; - if (fetchWriteAndSleep(top, delayTimer, DocumentWithReference.empty()).isPresent()) { - fetchedCount++; + try { + if (fetchWriteAndSleep(top, delayTimer, DocumentWithReference.empty()).isOk()) { + fetchedCount++; + } + } + catch (InterruptedException ex) { + Thread.currentThread().interrupt(); + break; } } ret.cookies = fetcher.getCookies(); - crawledDomainWriter.accept(ret); - return fetchedCount; } - /** Performs a re-crawl of old documents, comparing etags and last-modified */ - private int recrawl(CrawlDataReference oldCrawlData, - SimpleRobotRules robotsRules, - CrawlDelayTimer delayTimer) { - int recrawled = 0; - int retained = 0; - - for (;;) { - CrawledDocument doc = oldCrawlData.nextDocument(); - - if (doc == null) { - break; - } - - // This Shouldn't Happen (TM) - var urlMaybe = EdgeUrl.parse(doc.url); - if (urlMaybe.isEmpty()) continue; - var url = urlMaybe.get(); - - // If we've previously 404:d on this URL, we'll refrain from trying to fetch it again - if (doc.httpStatus == 404) { - crawlFrontier.addVisited(url); - continue; - } - - if (doc.httpStatus != 200) continue; - - if (!robotsRules.isAllowed(url.toString())) { - crawledDomainWriter.accept(createRobotsError(url)); - continue; - } - if (!crawlFrontier.filterLink(url)) - continue; - if (!crawlFrontier.addVisited(url)) - continue; - - - if (recrawled > 5 - && retained > 0.9 * recrawled - && Math.random() < 0.9) - { - // Since it looks like most of these documents haven't changed, - // we'll load the documents directly; but we do this in a random - // fashion to make sure we eventually catch changes over time - - crawledDomainWriter.accept(doc); - crawlFrontier.addVisited(url); - continue; - } - - - // GET the document with the stored document as a reference - // providing etag and last-modified headers, so we can recycle the - // document if it hasn't changed without actually downloading it - - var fetchedDocOpt = fetchWriteAndSleep(url, - delayTimer, - new DocumentWithReference(doc, oldCrawlData)); - if (fetchedDocOpt.isEmpty()) continue; - - if (documentWasRetainedTag.equals(fetchedDocOpt.get().recrawlState)) retained ++; - else if (documentWasSameTag.equals(fetchedDocOpt.get().recrawlState)) retained ++; - - recrawled ++; - } - - return recrawled; + /** Using the old crawl 
data, fetch the documents comparing etags and last-modified */ + private int recrawl(CrawlDataReference oldCrawlData, SimpleRobotRules robotsRules, CrawlDelayTimer delayTimer) throws InterruptedException { + return crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer); } - private void downloadSitemaps(SimpleRobotRules robotsRules, EdgeUrl rootUrl) { - List sitemaps = robotsRules.getSitemaps(); - - List urls = new ArrayList<>(sitemaps.size()); - if (!sitemaps.isEmpty()) { - for (var url : sitemaps) { - EdgeUrl.parse(url).ifPresent(urls::add); - } - } - else { - urls.add(rootUrl.withPathAndParam("/sitemap.xml", null)); - } - - downloadSitemaps(urls); - } - - private void downloadSitemaps(List urls) { - - Set checkedSitemaps = new HashSet<>(); - - for (var url : urls) { - // Let's not download sitemaps from other domains for now - if (!crawlFrontier.isSameDomain(url)) { - continue; - } - - if (checkedSitemaps.contains(url.path)) - continue; - - var sitemap = sitemapRetriever.fetchSitemap(url); - if (sitemap.isEmpty()) { - continue; - } - - // ensure we don't try to download this sitemap again - // (don't move this up, as we may want to check the same - // path with different protocols until we find one that works) - - checkedSitemaps.add(url.path); - - crawlFrontier.addAllToQueue(sitemap); - } - - logger.debug("Queue is now {}", crawlFrontier.queueSize()); - } - - private void sniffRootDocument(CrawlDelayTimer delayTimer, EdgeUrl rootUrl) { + private void sniffRootDocument(EdgeUrl rootUrl) { try { logger.debug("Configuring link filter"); var url = rootUrl.withPathAndParam("/", null); - var maybeSample = fetchUrl(url, delayTimer, DocumentWithReference.empty()).filter(sample -> sample.httpStatus == 200); - if (maybeSample.isEmpty()) + var result = fetcher.fetchContent(url, warcRecorder, ContentTags.empty()); + if (!(result instanceof HttpFetchResult.ResultOk ok)) return; - var sample = maybeSample.get(); - if (sample.documentBody == null) + var optDoc = ok.parseDocument(); + if (optDoc.isEmpty()) return; // Sniff the software based on the sample document - var doc = Jsoup.parse(sample.documentBody); + var doc = optDoc.get(); crawlFrontier.setLinkFilter(linkFilterSelector.selectFilter(doc)); for (var link : doc.getElementsByTag("link")) { @@ -338,7 +236,7 @@ public class CrawlerRetreiver { linkParser.parseLink(url, href) .filter(crawlFrontier::isSameDomain) .map(List::of) - .ifPresent(this::downloadSitemaps); + .ifPresent(sitemapFetcher::downloadSitemaps); } } catch (Exception ex) { @@ -346,41 +244,67 @@ public class CrawlerRetreiver { } } - private Optional fetchWriteAndSleep(EdgeUrl top, - CrawlDelayTimer timer, - DocumentWithReference reference) { + public HttpFetchResult fetchWriteAndSleep(EdgeUrl top, + CrawlDelayTimer timer, + DocumentWithReference reference) throws InterruptedException + { logger.debug("Fetching {}", top); + HttpFetchResult fetchedDoc = new HttpFetchResult.ResultNone(); + long startTime = System.currentTimeMillis(); + var contentTags = reference.getContentTags(); - var docOpt = fetchUrl(top, timer, reference); - - if (docOpt.isPresent()) { - var doc = docOpt.get(); - - if (!Objects.equals(doc.recrawlState, documentWasRetainedTag) - && reference.isContentBodySame(doc)) - { - // The document didn't change since the last time - doc.recrawlState = documentWasSameTag; + // Fetch the document, retrying if we get a rate limit exception + for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) { + try { + fetchedDoc = fetcher.fetchContent(top, warcRecorder, contentTags); + 
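// no RateLimitException was thrown for this attempt, so accept the result and stop retrying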
break; } - - crawledDomainWriter.accept(doc); - - if (doc.url != null) { - // We may have redirected to a different path - EdgeUrl.parse(doc.url).ifPresent(crawlFrontier::addVisited); + catch (RateLimitException ex) { + timer.waitRetryDelay(ex); } - - if ("ERROR".equals(doc.crawlerStatus) && doc.httpStatus != 404) { - errorCount++; + catch (Exception ex) { + logger.warn("Failed to fetch {}", top, ex); + fetchedDoc = new HttpFetchResult.ResultException(ex); } - } - timer.delay(System.currentTimeMillis() - startTime); + try { + if (fetchedDoc instanceof HttpFetchResult.ResultOk ok) { + var docOpt = ok.parseDocument(); + if (docOpt.isPresent()) { + var doc = docOpt.get(); - return docOpt; + crawlFrontier.enqueueLinksFromDocument(top, doc); + crawlFrontier.addVisited(new EdgeUrl(ok.uri())); + } + } + else if (fetchedDoc instanceof HttpFetchResult.Result304Raw && reference.doc() != null) { + var doc = reference.doc(); + + warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBody); + + fetchedDoc = new HttpFetchResult.Result304ReplacedWithReference(doc.url, + new ContentType(doc.contentType, "UTF-8"), + doc.documentBody); + + var parsed = Jsoup.parse(doc.documentBody); + + crawlFrontier.enqueueLinksFromDocument(top, parsed); + crawlFrontier.addVisited(top); + } + else if (fetchedDoc instanceof HttpFetchResult.ResultException ex) { + errorCount ++; + } + } + catch (Exception ex) { + logger.error("Error parsing document {}", top, ex); + } + + timer.waitFetchDelay(System.currentTimeMillis() - startTime); + + return fetchedDoc; } private boolean isAllowedProtocol(String proto) { @@ -388,91 +312,6 @@ public class CrawlerRetreiver { || proto.equalsIgnoreCase("https"); } - private Optional fetchUrl(EdgeUrl top, CrawlDelayTimer timer, DocumentWithReference reference) { - try { - var contentTags = reference.getContentTags(); - var fetchedDoc = tryDownload(top, timer, contentTags); - - CrawledDocument doc = reference.replaceOn304(fetchedDoc); - - if (doc.documentBody != null) { - doc.documentBodyHash = createHash(doc.documentBody); - - var parsedDoc = Jsoup.parse(doc.documentBody); - EdgeUrl url = new EdgeUrl(doc.url); - - findLinks(url, parsedDoc); - findCanonicalUrl(url, parsedDoc) - .ifPresent(canonicalLink -> doc.canonicalUrl = canonicalLink.toString()); - } - - return Optional.of(doc); - } - catch (Exception ex) { - logger.warn("Failed to process document {}", top); - } - - return Optional.empty(); - - } - - - @SneakyThrows - private CrawledDocument tryDownload(EdgeUrl top, CrawlDelayTimer timer, ContentTags tags) { - for (int i = 0; i < 2; i++) { - try { - var doc = fetcher.fetchContent(top, tags); - doc.recrawlState = "NEW"; - return doc; - } - catch (RateLimitException ex) { - timer.slowDown(); - - int delay = ex.retryAfter(); - if (delay > 0 && delay < 5000) { - Thread.sleep(delay); - } - } - } - - return createRetryError(top); - } - - private String createHash(String documentBodyHash) { - return hashMethod.hashUnencodedChars(documentBodyHash).toString(); - } - - private void findLinks(EdgeUrl baseUrl, Document parsed) { - baseUrl = linkParser.getBaseLink(parsed, baseUrl); - - for (var link : parsed.getElementsByTag("a")) { - linkParser.parseLink(baseUrl, link).ifPresent(crawlFrontier::addToQueue); - } - for (var link : parsed.getElementsByTag("frame")) { - linkParser.parseFrame(baseUrl, link).ifPresent(crawlFrontier::addToQueue); - } - for (var link : parsed.getElementsByTag("iframe")) { - linkParser.parseFrame(baseUrl, link).ifPresent(crawlFrontier::addToQueue); - } 
- for (var link : parsed.getElementsByTag("link")) { - String rel = link.attr("rel"); - - if (rel.equalsIgnoreCase("next") || rel.equalsIgnoreCase("prev")) { - linkParser.parseLink(baseUrl, link).ifPresent(crawlFrontier::addToQueue); - } - } - } - - private Optional findCanonicalUrl(EdgeUrl baseUrl, Document parsed) { - baseUrl = baseUrl.domain.toRootUrl(); - - for (var link : parsed.select("link[rel=canonical]")) { - return linkParser.parseLink(baseUrl, link); - } - - return Optional.empty(); - } - private String findIp(String domain) { try { return InetAddress.getByName(domain).getHostAddress(); @@ -481,92 +320,9 @@ public class CrawlerRetreiver { } } - private CrawledDocument createRobotsError(EdgeUrl url) { - return CrawledDocument.builder() - .url(url.toString()) - .timestamp(LocalDateTime.now().toString()) - .httpStatus(-1) - .crawlerStatus(CrawlerDocumentStatus.ROBOTS_TXT.name()) - .build(); - } - private CrawledDocument createRetryError(EdgeUrl url) { - return CrawledDocument.builder() - .url(url.toString()) - .timestamp(LocalDateTime.now().toString()) - .httpStatus(429) - .crawlerStatus(CrawlerDocumentStatus.ERROR.name()) - .build(); - } - - private record DocumentWithReference( - @Nullable CrawledDocument doc, - @Nullable CrawlDataReference reference) { - - private static final DocumentWithReference emptyInstance = new DocumentWithReference(null, null); - public static DocumentWithReference empty() { - return emptyInstance; - } - - public boolean isContentBodySame(CrawledDocument newDoc) { - if (reference == null) - return false; - if (doc == null) - return false; - if (doc.documentBody == null) - return false; - if (newDoc.documentBody == null) - return false; - - return reference.isContentBodySame(doc, newDoc); - } - - private ContentTags getContentTags() { - if (null == doc) - return ContentTags.empty(); - - String headers = doc.headers; - if (headers == null) - return ContentTags.empty(); - - String[] headersLines = headers.split("\n"); - - String lastmod = null; - String etag = null; - - for (String line : headersLines) { - if (line.toLowerCase().startsWith("etag:")) { - etag = line.substring(5).trim(); - } - if (line.toLowerCase().startsWith("last-modified:")) { - lastmod = line.substring(14).trim(); - } - } - - return new ContentTags(etag, lastmod); - } - - public boolean isEmpty() { - return doc == null || reference == null; - } - - /** If the provided document has HTTP status 304, and the reference document is provided, - * return the reference document; otherwise return the provided document. - */ - public CrawledDocument replaceOn304(CrawledDocument fetchedDoc) { - - if (doc == null) - return fetchedDoc; - - // HTTP status 304 is NOT MODIFIED, which means the document is the same as it was when - // we fetched it last time. We can recycle the reference document. 
- if (fetchedDoc.httpStatus != 304) - return fetchedDoc; - - var ret = doc; - ret.recrawlState = documentWasRetainedTag; - ret.timestamp = LocalDateTime.now().toString(); - return ret; - } + @Override + public void close() throws Exception { + warcRecorder.close(); } } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java new file mode 100644 index 00000000..52ebe2f3 --- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java @@ -0,0 +1,107 @@ +package nu.marginalia.crawl.retreival; + +import nu.marginalia.crawling.body.DocumentBodyExtractor; +import nu.marginalia.crawling.body.DocumentBodyResult; +import nu.marginalia.crawling.body.HttpFetchResult; +import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; +import nu.marginalia.model.EdgeUrl; +import org.jsoup.Jsoup; +import org.netpreserve.jwarc.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.file.Path; + +/** + * This class is responsible for resynchronizing the crawl frontier with a partially written + * warc file. This may happen if the crawl is interrupted or crashes. + *

+ * This is best-effort and not guaranteed to recover all data, but it should limit + * the amount of data that is lost and needs to be re-crawled in the event of an unexpected + * shutdown. + */ +public class CrawlerWarcResynchronizer { + private final DomainCrawlFrontier crawlFrontier; + private final WarcRecorder recorder; + private static final Logger logger = LoggerFactory.getLogger(CrawlerWarcResynchronizer.class); + public CrawlerWarcResynchronizer(DomainCrawlFrontier crawlFrontier, WarcRecorder recorder) { + this.crawlFrontier = crawlFrontier; + this.recorder = recorder; + } + + public void run(Path tempFile) { + // First pass, enqueue links + try (var reader = new WarcReader(tempFile)) { + WarcXResponseReference.register(reader); + WarcXEntityRefused.register(reader); + + for (var item : reader) { + accept(item); + } + } catch (IOException e) { + logger.info(STR."Failed read full warc file \{tempFile}", e); + } + + // Second pass, copy records to the new warc file + try (var reader = new WarcReader(tempFile)) { + for (var item : reader) { + recorder.resync(item); + } + } catch (IOException e) { + logger.info(STR."Failed read full warc file \{tempFile}", e); + } + } + + public void accept(WarcRecord item) { + try { + if (item instanceof WarcResponse rsp) { + response(rsp); + } else if (item instanceof WarcRequest req) { + request(req); + } else if (item instanceof WarcXEntityRefused refused) { + refused(refused); + } + + } + catch (Exception ex) { + logger.info(STR."Failed to process warc record \{item}", ex); + } + } + + private void refused(WarcXEntityRefused refused) { + // In general, we don't want to re-crawl urls that were refused, + // but to permit circumstances to change over time, we'll + // allow for a small chance of re-probing these entries + + if (Math.random() > 0.1) { + crawlFrontier.addVisited(new EdgeUrl(refused.targetURI())); + } + } + + private void request(WarcRequest request) { + EdgeUrl.parse(request.target()).ifPresent(crawlFrontier::addVisited); + } + + private void response(WarcResponse rsp) { + var url = new EdgeUrl(rsp.targetURI()); + + crawlFrontier.addVisited(url); + + try { + var response = HttpFetchResult.importWarc(rsp); + DocumentBodyExtractor + .asString(response) + .ifPresent((ct, body) -> + { + var doc = Jsoup.parse(body); + crawlFrontier.enqueueLinksFromDocument(url, doc); + }); + } + catch (Exception e) { + logger.info(STR."Failed to parse response body for \{url}", e); + } + } + + +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java index 30902a8e..46446fee 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java @@ -3,14 +3,19 @@ package nu.marginalia.crawl.retreival; import com.google.common.hash.HashFunction; import it.unimi.dsi.fastutil.longs.LongOpenHashSet; import nu.marginalia.ip_blocklist.UrlBlocklist; +import nu.marginalia.link_parser.LinkParser; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; +import org.jsoup.nodes.Document; import java.net.URISyntaxException; import java.util.*; import java.util.function.Predicate; public class DomainCrawlFrontier { + + private static final LinkParser linkParser = new LinkParser(); + private final ArrayDeque queue; // To save the number of 
strings kept in memory, @@ -45,9 +50,14 @@ public class DomainCrawlFrontier { } } + /** Increase the depth of the crawl by a factor. If the current depth is smaller + * than the number of already visited documents, the base depth will be adjusted + * to the visited count first. + */ public void increaseDepth(double depthIncreaseFactor) { - depth = (int)(depth * depthIncreaseFactor); + depth = (int)(Math.max(visited.size(), depth) * depthIncreaseFactor); } + public void setLinkFilter(Predicate linkFilter) { this.linkFilter = linkFilter; } @@ -141,4 +151,27 @@ public class DomainCrawlFrontier { public int queueSize() { return queue.size(); } + + + public void enqueueLinksFromDocument(EdgeUrl baseUrl, Document parsed) { + baseUrl = linkParser.getBaseLink(parsed, baseUrl); + + for (var link : parsed.getElementsByTag("a")) { + linkParser.parseLink(baseUrl, link).ifPresent(this::addToQueue); + } + for (var link : parsed.getElementsByTag("frame")) { + linkParser.parseFrame(baseUrl, link).ifPresent(this::addToQueue); + } + for (var link : parsed.getElementsByTag("iframe")) { + linkParser.parseFrame(baseUrl, link).ifPresent(this::addToQueue); + } + for (var link : parsed.getElementsByTag("link")) { + String rel = link.attr("rel"); + + if (rel.equalsIgnoreCase("next") || rel.equalsIgnoreCase("prev")) { + linkParser.parseLink(baseUrl, link).ifPresent(this::addToQueue); + } + } + } + } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/ContentTypeProber.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/ContentTypeProber.java new file mode 100644 index 00000000..df070cc5 --- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/ContentTypeProber.java @@ -0,0 +1,86 @@ +package nu.marginalia.crawl.retreival.fetcher; + +import nu.marginalia.crawling.body.ContentTypeLogic; +import nu.marginalia.model.EdgeUrl; +import okhttp3.OkHttpClient; +import okhttp3.Request; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.net.SocketTimeoutException; +import java.util.Objects; + +public class ContentTypeProber { + + private static final Logger logger = LoggerFactory.getLogger(ContentTypeProber.class); + private final String userAgent; + private final OkHttpClient client; + private final ContentTypeLogic contentTypeLogic = new ContentTypeLogic(); + + public ContentTypeProber(String userAgent, OkHttpClient httpClient) { + this.userAgent = userAgent; + this.client = httpClient; + } + + /** Probe the content type of the given URL with a HEAD request. + * This is used to detect binary files, which we don't want to crawl. + *

+ * If the URL redirects, the final URL is returned, to avoid redundant + * requests. + * + * @param url The URL to probe + * @return A ContentTypeProbeResult + */ + public ContentTypeProbeResult probeContentType(EdgeUrl url) { + logger.debug("Probing suspected binary {}", url); + + var headBuilder = new Request.Builder().head() + .addHeader("User-agent", userAgent) + .addHeader("Accept-Encoding", "gzip") + .url(url.toString()); + + var head = headBuilder.build(); + var call = client.newCall(head); + + try (var rsp = call.execute()) { + var contentTypeHeader = rsp.header("Content-type"); + + if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) { + return new ContentTypeProbeResult.BadContentType(contentTypeHeader, rsp.code()); + } + + // Update the URL to the final URL of the HEAD request, otherwise we might end up doing + + // HEAD 301 url1 -> url2 + // HEAD 200 url2 + // GET 301 url1 -> url2 + // GET 200 url2 + + // which is not what we want. Overall we want to do as few requests as possible to not raise + // too many eyebrows when looking at the logs on the target server. Overall it's probably desirable + // that it looks like the traffic makes sense, as opposed to looking like a broken bot. + + var redirectUrl = new EdgeUrl(rsp.request().url().toString()); + EdgeUrl ret; + + if (Objects.equals(redirectUrl.domain, url.domain)) ret = redirectUrl; + else ret = url; + + return new ContentTypeProbeResult.Ok(ret); + + } catch (SocketTimeoutException ex) { + return new ContentTypeProbeResult.Timeout(); + } catch (Exception ex) { + logger.error("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage()); + + return new ContentTypeProbeResult.Exception(ex); + } + } + + public sealed interface ContentTypeProbeResult { + record Ok(EdgeUrl resolvedUrl) implements ContentTypeProbeResult { } + record BadContentType(String contentType, int statusCode) implements ContentTypeProbeResult { } + record Timeout() implements ContentTypeProbeResult { } + record Exception(java.lang.Exception ex) implements ContentTypeProbeResult { } + } +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java index 11ad272e..70576510 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java @@ -3,7 +3,8 @@ package nu.marginalia.crawl.retreival.fetcher; import com.google.inject.ImplementedBy; import crawlercommons.robots.SimpleRobotRules; import nu.marginalia.crawl.retreival.RateLimitException; -import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.crawling.body.HttpFetchResult; +import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; @@ -18,9 +19,9 @@ public interface HttpFetcher { FetchResult probeDomain(EdgeUrl url); - CrawledDocument fetchContent(EdgeUrl url, ContentTags tags) throws RateLimitException; + HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, ContentTags tags) throws RateLimitException; - SimpleRobotRules fetchRobotRules(EdgeDomain domain); + SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder); SitemapRetriever createSitemapRetriever(); } diff --git 
a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java index 5720ef34..ef6b48cb 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java @@ -7,43 +7,41 @@ import crawlercommons.robots.SimpleRobotRulesParser; import lombok.SneakyThrows; import nu.marginalia.crawl.retreival.Cookies; import nu.marginalia.crawl.retreival.RateLimitException; -import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.crawling.model.CrawlerDocumentStatus; -import nu.marginalia.crawling.model.ContentType; +import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeResult; +import nu.marginalia.crawl.retreival.fetcher.socket.FastTerminatingSocketFactory; +import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor; +import nu.marginalia.crawl.retreival.fetcher.socket.NoSecuritySSL; +import nu.marginalia.crawling.body.DocumentBodyExtractor; +import nu.marginalia.crawling.body.HttpFetchResult; +import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; +import nu.marginalia.crawling.body.ContentTypeLogic; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.crawl.retreival.logic.ContentTypeLogic; -import nu.marginalia.crawl.retreival.logic.ContentTypeParser; -import okhttp3.*; -import org.apache.commons.io.input.BOMInputStream; +import okhttp3.ConnectionPool; +import okhttp3.Dispatcher; +import okhttp3.OkHttpClient; +import okhttp3.Request; import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import javax.net.ssl.SSLException; import javax.net.ssl.X509TrustManager; -import java.io.EOFException; -import java.io.IOException; -import java.net.*; -import java.nio.charset.Charset; -import java.nio.charset.IllegalCharsetNameException; -import java.nio.charset.StandardCharsets; -import java.nio.charset.UnsupportedCharsetException; -import java.time.LocalDateTime; -import java.util.*; +import java.util.List; +import java.util.Objects; +import java.util.Optional; import java.util.concurrent.TimeUnit; -import java.util.zip.GZIPInputStream; + public class HttpFetcherImpl implements HttpFetcher { private final Logger logger = LoggerFactory.getLogger(getClass()); private final String userAgent; - private final int maxFetchSize = 1024*512; private final Cookies cookies = new Cookies(); private static final SimpleRobotRulesParser robotsParser = new SimpleRobotRulesParser(); - private final ContentTypeLogic contentTypeLogic = new ContentTypeLogic(); + private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic(); + private final ContentTypeProber contentTypeProber; @Override public void setAllowAllContentTypes(boolean allowAllContentTypes) { @@ -64,6 +62,7 @@ public class HttpFetcherImpl implements HttpFetcher { return builder.sslSocketFactory(NoSecuritySSL.buildSocketFactory(), (X509TrustManager) NoSecuritySSL.trustAllCerts[0]) .socketFactory(ftSocketFactory) .hostnameVerifier(NoSecuritySSL.buildHostnameVerifyer()) + .addNetworkInterceptor(new IpInterceptingNetworkInterceptor()) .connectionPool(pool) .cookieJar(cookies.getJar()) .followRedirects(true) @@ -92,13 +91,22 @@ public class HttpFetcherImpl implements HttpFetcher { { 
this.client = createClient(dispatcher, connectionPool); this.userAgent = userAgent; + this.contentTypeProber = new ContentTypeProber(userAgent, client); } public HttpFetcherImpl(@Named("user-agent") String userAgent) { this.client = createClient(null, new ConnectionPool()); this.userAgent = userAgent; + this.contentTypeProber = new ContentTypeProber(userAgent, client); } + /** + * Probe the domain to see if it is reachable, attempting to identify which schema to use, + * and if there are any redirects. This is done by one or more HEAD requests. + * + * @param url The URL to probe. + * @return The result of the probe, indicating the state and the URL. + */ @Override @SneakyThrows public FetchResult probeDomain(EdgeUrl url) { @@ -130,8 +138,9 @@ public class HttpFetcherImpl implements HttpFetcher { @Override @SneakyThrows - public CrawledDocument fetchContent(EdgeUrl url, - ContentTags contentTags) + public HttpFetchResult fetchContent(EdgeUrl url, + WarcRecorder warcRecorder, + ContentTags contentTags) throws RateLimitException { @@ -139,268 +148,54 @@ public class HttpFetcherImpl implements HttpFetcher { // looks like it might be something else, we perform a HEAD first to check the content type if (contentTags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url)) { - logger.debug("Probing suspected binary {}", url); - - var headBuilder = new Request.Builder().head() - .addHeader("User-agent", userAgent) - .url(url.toString()) - .addHeader("Accept-Encoding", "gzip"); - - var head = headBuilder.build(); - var call = client.newCall(head); - - try (var rsp = call.execute()) { - var contentTypeHeader = rsp.header("Content-type"); - if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) { - return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, "Early probe failed"); - } - - // Update the URL to the final URL of the HEAD request, otherwise we might end up doing - - // HEAD 301 url1 -> url2 - // HEAD 200 url2 - // GET 301 url1 -> url2 - // GET 200 url2 - - // which is not what we want. Overall we want to do as few requests as possible to not raise - // too many eyebrows when looking at the logs on the target server. Overall it's probably desirable - // that it looks like the traffic makes sense, as opposed to looking like a broken bot. 
- - var redirectUrl = new EdgeUrl(rsp.request().url().toString()); - if (Objects.equals(redirectUrl.domain, url.domain)) - url = redirectUrl; + ContentTypeProbeResult probeResult = contentTypeProber.probeContentType(url); + if (probeResult instanceof ContentTypeProbeResult.Ok ok) { + url = ok.resolvedUrl(); } - catch (SocketTimeoutException ex) { - return createTimeoutErrorRsp(url, ex); + else if (probeResult instanceof ContentTypeProbeResult.BadContentType badContentType) { + warcRecorder.flagAsFailedContentTypeProbe(url, badContentType.contentType(), badContentType.statusCode()); + return new HttpFetchResult.ResultNone(); } - catch (Exception ex) { - logger.error("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage()); - return createHardErrorRsp(url, ex); + else if (probeResult instanceof ContentTypeProbeResult.Timeout timeout) { + warcRecorder.flagAsTimeout(url); + return new HttpFetchResult.ResultNone(); + } + else if (probeResult instanceof ContentTypeProbeResult.Exception exception) { + warcRecorder.flagAsError(url, exception.ex()); + return new HttpFetchResult.ResultNone(); + } } var getBuilder = new Request.Builder().get(); - getBuilder.addHeader("User-agent", userAgent) - .url(url.toString()) - .addHeader("Accept-Encoding", "gzip"); + getBuilder.url(url.toString()) + .addHeader("Accept-Encoding", "gzip") + .addHeader("User-agent", userAgent); contentTags.paint(getBuilder); - var get = getBuilder.build(); - var call = client.newCall(get); + HttpFetchResult result = warcRecorder.fetch(client, getBuilder.build()); - try (var rsp = call.execute()) { - return extractBody(url, rsp); - } - catch (RateLimitException rle) { - throw rle; - } - catch (SocketTimeoutException ex) { - return createTimeoutErrorRsp(url, ex); - } - catch (UnknownHostException ex) { - return createUnknownHostError(url, ex); - } - catch (SocketException | ProtocolException | IllegalCharsetNameException | SSLException | EOFException ex) { - // This is a bit of a grab-bag of errors that crop up - // IllegalCharsetName is egg on our face, - // but SSLException and EOFException are probably the server's fault - - return createHardErrorRsp(url, ex); - } - catch (Exception ex) { - logger.error("Error during fetching", ex); - return createHardErrorRsp(url, ex); - } - } - - private CrawledDocument createHardErrorRsp(EdgeUrl url, Exception why) { - return CrawledDocument.builder() - .crawlerStatus(CrawlerDocumentStatus.ERROR.toString()) - .crawlerStatusDesc(why.getClass().getSimpleName() + ": " + why.getMessage()) - .timestamp(LocalDateTime.now().toString()) - .url(url.toString()) - .build(); - } - - private CrawledDocument createUnknownHostError(EdgeUrl url, Exception why) { - return CrawledDocument.builder() - .crawlerStatus(CrawlerDocumentStatus.ERROR.toString()) - .crawlerStatusDesc("Unknown Host") - .timestamp(LocalDateTime.now().toString()) - .url(url.toString()) - .build(); - } - - private CrawledDocument createTimeoutErrorRsp(EdgeUrl url, Exception why) { - return CrawledDocument.builder() - .crawlerStatus("Timeout") - .crawlerStatusDesc(why.getMessage()) - .timestamp(LocalDateTime.now().toString()) - .url(url.toString()) - .build(); - } - private CrawledDocument createErrorResponse(EdgeUrl url, Response rsp, CrawlerDocumentStatus status, String why) { - return CrawledDocument.builder() - .crawlerStatus(status.toString()) - .crawlerStatusDesc(why) - .headers(rsp.headers().toString()) - .contentType(rsp.header("Content-type")) - .timestamp(LocalDateTime.now().toString()) - 
.httpStatus(rsp.code()) - .url(url.toString()) - .build(); - } - - private CrawledDocument extractBody(EdgeUrl url, Response rsp) throws IOException, URISyntaxException, RateLimitException { - - var responseUrl = new EdgeUrl(rsp.request().url().toString()); - if (!Objects.equals(responseUrl.domain, url.domain)) { - return createRedirectResponse(url, rsp, responseUrl); - } - - if (rsp.code() == 429) { - throw new RateLimitException(rsp.header("Retry-After", "1000")); - } - - var body = rsp.body(); - if (null == body) { - return createErrorResponse(url, rsp, CrawlerDocumentStatus.ERROR, "No body"); - } - - var byteStream = body.byteStream(); - - if ("gzip".equals(rsp.header("Content-encoding"))) { - byteStream = new GZIPInputStream(byteStream); - } - byteStream = new BOMInputStream(byteStream); - - var contentTypeHeader = rsp.header("Content-type"); - if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) { - return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, ""); - } - - byte[] data = byteStream.readNBytes(maxFetchSize); - - var contentType = ContentTypeParser.parse(contentTypeHeader, data); - if (!contentTypeLogic.isAllowableContentType(contentType.contentType())) { - return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, ""); - } - - if ("Shift_JIS".equalsIgnoreCase(contentType.charset())) { - return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CHARSET, ""); - } - - if (!isXRobotsTagsPermitted(rsp.headers("X-Robots-Tag"), userAgent)) { - return CrawledDocument.builder() - .crawlerStatus(CrawlerDocumentStatus.ROBOTS_TXT.name()) - .crawlerStatusDesc("X-Robots-Tag") - .url(responseUrl.toString()) - .httpStatus(-1) - .timestamp(LocalDateTime.now().toString()) - .headers(rsp.headers().toString()) - .build(); - } - - var strData = getStringData(data, contentType); - var canonical = rsp.header("rel=canonical", ""); - - return CrawledDocument.builder() - .crawlerStatus(CrawlerDocumentStatus.OK.name()) - .headers(rsp.headers().toString()) - .contentType(rsp.header("Content-type")) - .timestamp(LocalDateTime.now().toString()) - .canonicalUrl(canonical) - .httpStatus(rsp.code()) - .url(responseUrl.toString()) - .documentBody(strData) - .build(); - } - - /** Check X-Robots-Tag header tag to see if we are allowed to index this page. - *

- * Reference: https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag - * - * @param xRobotsHeaderTags List of X-Robots-Tag values - * @param userAgent User agent string - * @return true if we are allowed to index this page - */ - // Visible for tests - public static boolean isXRobotsTagsPermitted(List xRobotsHeaderTags, String userAgent) { - boolean isPermittedGeneral = true; - boolean isPermittedMarginalia = false; - boolean isForbiddenMarginalia = false; - - for (String header : xRobotsHeaderTags) { - if (header.indexOf(':') >= 0) { - String[] parts = StringUtils.split(header, ":", 2); - - if (parts.length < 2) - continue; - - // Is this relevant to us? - if (!Objects.equals(parts[0].trim(), userAgent)) - continue; - - if (parts[1].contains("noindex")) - isForbiddenMarginalia = true; - else if (parts[1].contains("none")) - isForbiddenMarginalia = true; - else if (parts[1].contains("all")) - isPermittedMarginalia = true; + if (result instanceof HttpFetchResult.ResultOk ok) { + if (ok.statusCode() == 429) { + String retryAfter = Objects.requireNonNullElse(ok.header("Retry-After"), "1000"); + throw new RateLimitException(retryAfter); } - else { - if (header.contains("noindex")) - isPermittedGeneral = false; - if (header.contains("none")) - isPermittedGeneral = false; + if (ok.statusCode() == 304) { + return new HttpFetchResult.Result304Raw(); + } + if (ok.statusCode() == 200) { + return ok; } } - if (isPermittedMarginalia) - return true; - if (isForbiddenMarginalia) - return false; - return isPermittedGeneral; - } - - private String getStringData(byte[] data, ContentType contentType) { - Charset charset; - try { - charset = Charset.forName(contentType.charset()); - } - catch (IllegalCharsetNameException ex) { - charset = StandardCharsets.UTF_8; - } - catch (UnsupportedCharsetException ex) { - // This is usually like Macintosh Latin - // (https://en.wikipedia.org/wiki/Macintosh_Latin_encoding) - // - // It's close enough to 8859-1 to serve - charset = StandardCharsets.ISO_8859_1; - } - return new String(data, charset); - } - - private CrawledDocument createRedirectResponse(EdgeUrl url, Response rsp, EdgeUrl responseUrl) { - - return CrawledDocument.builder() - .crawlerStatus(CrawlerDocumentStatus.REDIRECT.name()) - .redirectUrl(responseUrl.toString()) - .headers(rsp.headers().toString()) - .contentType(rsp.header("Content-type")) - .timestamp(LocalDateTime.now().toString()) - .httpStatus(rsp.code()) - .url(url.toString()) - .build(); - + return new HttpFetchResult.ResultNone(); } @Override - public SimpleRobotRules fetchRobotRules(EdgeDomain domain) { - return fetchRobotsForProto("https", domain) - .or(() -> fetchRobotsForProto("http", domain)) + public SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder) { + return fetchRobotsForProto("https", recorder, domain) + .or(() -> fetchRobotsForProto("http", recorder, domain)) .orElseGet(() -> new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_ALL)); } @@ -409,21 +204,31 @@ public class HttpFetcherImpl implements HttpFetcher { return new SitemapRetriever(); } - private Optional fetchRobotsForProto(String proto, EdgeDomain domain) { + private Optional fetchRobotsForProto(String proto, WarcRecorder recorder, EdgeDomain domain) { try { var url = new EdgeUrl(proto, domain, null, "/robots.txt", null); - return Optional.of(parseRobotsTxt(fetchContent(url, ContentTags.empty()))); + + var getBuilder = new Request.Builder().get(); + + getBuilder.url(url.toString()) + .addHeader("Accept-Encoding", "gzip") + 
.addHeader("User-agent", userAgent); + + HttpFetchResult result = recorder.fetch(client, getBuilder.build()); + + return DocumentBodyExtractor.asBytes(result).mapOpt((contentType, body) -> + robotsParser.parseContent(url.toString(), + body, + contentType.toString(), + userAgent) + ); + } catch (Exception ex) { return Optional.empty(); } } - private SimpleRobotRules parseRobotsTxt(CrawledDocument doc) { - return robotsParser.parseContent(doc.url, - doc.documentBody.getBytes(), - doc.contentType, - userAgent); - } } + diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/FastTerminatingSocketFactory.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/socket/FastTerminatingSocketFactory.java similarity index 96% rename from code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/FastTerminatingSocketFactory.java rename to code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/socket/FastTerminatingSocketFactory.java index add64e29..ffb29b33 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/FastTerminatingSocketFactory.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/socket/FastTerminatingSocketFactory.java @@ -1,4 +1,4 @@ -package nu.marginalia.crawl.retreival.fetcher; +package nu.marginalia.crawl.retreival.fetcher.socket; import javax.net.SocketFactory; import java.io.IOException; diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/socket/IpInterceptingNetworkInterceptor.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/socket/IpInterceptingNetworkInterceptor.java new file mode 100644 index 00000000..90f43e5c --- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/socket/IpInterceptingNetworkInterceptor.java @@ -0,0 +1,31 @@ +package nu.marginalia.crawl.retreival.fetcher.socket; + +import okhttp3.Interceptor; +import okhttp3.Response; +import org.jetbrains.annotations.NotNull; + +import java.io.IOException; + + +/** An interceptor that intercepts network requests and adds the remote IP address as + * a header in the response. This is used to pass the remote IP address to the Warc + * writer, as this information is not available in the response. 
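+ * (Annotation, not part of the patch: WarcProtocolReconstructor later filters out any header
+ * starting with X-Marginalia when the response headers are rewritten, so this pseudo-header is
+ * not copied into the archived WARC records.)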
+ */ +public class IpInterceptingNetworkInterceptor implements Interceptor { + private static final String pseudoHeaderName = "X-Marginalia-Remote-IP"; + + @NotNull + @Override + public Response intercept(@NotNull Interceptor.Chain chain) throws IOException { + String IP = chain.connection().socket().getInetAddress().getHostAddress(); + + return chain.proceed(chain.request()) + .newBuilder() + .addHeader(pseudoHeaderName, IP) + .build(); + } + + public static String getIpFromResponse(Response response) { + return response.header(pseudoHeaderName); + } +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/NoSecuritySSL.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/socket/NoSecuritySSL.java similarity index 89% rename from code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/NoSecuritySSL.java rename to code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/socket/NoSecuritySSL.java index f86d2c48..b6b8a589 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/NoSecuritySSL.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/socket/NoSecuritySSL.java @@ -1,4 +1,4 @@ -package nu.marginalia.crawl.retreival.fetcher; +package nu.marginalia.crawl.retreival.fetcher.socket; import lombok.SneakyThrows; @@ -8,6 +8,8 @@ import java.security.cert.X509Certificate; public class NoSecuritySSL { // Create a trust manager that does not validate certificate chains + // We want to accept e.g. self-signed certificates and certificates + // that are not signed by a CA is generally trusted by the system. public static final TrustManager[] trustAllCerts = new TrustManager[]{ new X509TrustManager() { @Override @@ -27,7 +29,6 @@ public class NoSecuritySSL { } }; - @SneakyThrows public static SSLSocketFactory buildSocketFactory() { // Install the all-trusting trust manager diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcDigestBuilder.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcDigestBuilder.java new file mode 100644 index 00000000..6fd020b4 --- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcDigestBuilder.java @@ -0,0 +1,33 @@ +package nu.marginalia.crawl.retreival.fetcher.warc; + +import org.netpreserve.jwarc.WarcDigest; + +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; + +class WarcDigestBuilder { + private final MessageDigest digest; + + private static final String digestAlgorithm = "SHA-1"; + + public WarcDigestBuilder() throws NoSuchAlgorithmException { + this.digest = MessageDigest.getInstance(digestAlgorithm); + } + + public void update(String s) { + byte[] bytes = s.getBytes(); + update(bytes, bytes.length); + } + + public void update(byte[] buffer, int n) { + update(buffer, 0, n); + } + + public void update(byte[] buffer, int s, int n) { + digest.update(buffer, s, n); + } + + public WarcDigest build() { + return new WarcDigest(digest); + } +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java new file mode 100644 index 00000000..ad29056f --- /dev/null +++ 
b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java @@ -0,0 +1,170 @@ +package nu.marginalia.crawl.retreival.fetcher.warc; + +import okhttp3.Protocol; +import okhttp3.Request; +import okhttp3.Response; +import org.apache.commons.lang3.StringUtils; + +import java.net.URI; +import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.Map; +import java.util.StringJoiner; +import java.util.stream.Collectors; + +/** We don't have access to the raw HTTP request and response, so we need to reconstruct them + * as best is possible from the data we have available. + */ +public class WarcProtocolReconstructor { + + static String getHttpRequestString(Request request, URI uri) { + StringBuilder requestStringBuilder = new StringBuilder(); + + final String encodedURL = encodeURLKeepSlashes(uri.getPath()); + + requestStringBuilder.append(request.method()).append(" ").append(encodedURL); + + if (uri.getQuery() != null) { + requestStringBuilder.append("?").append(uri.getQuery()); + } + requestStringBuilder.append(" HTTP/1.1\r\n"); + requestStringBuilder.append("Host: ").append(uri.getHost()).append("\r\n"); + + request.headers().toMultimap().forEach((k, values) -> { + for (var value : values) { + requestStringBuilder.append(capitalizeHeader(k)).append(": ").append(value).append("\r\n"); + } + }); + + return requestStringBuilder.toString(); + } + + /** Java's URLEncoder will URLEncode slashes, which is not desirable + * when sanitizing a URL for HTTP protocol purposes + */ + + private static String encodeURLKeepSlashes(String URL) { + String[] parts = StringUtils.split(URL,"/"); + StringJoiner joiner = new StringJoiner("/"); + for (String part : parts) { + joiner.add(URLEncoder.encode(part, StandardCharsets.UTF_8)); + } + return joiner.toString(); + } + + static String getResponseHeader(String headersAsString, int code) { + String version = "1.1"; + + String statusCode = String.valueOf(code); + String statusMessage = STATUS_CODE_MAP.getOrDefault(code, "Unknown"); + + String headerString = getHeadersAsString(headersAsString); + + return STR."HTTP/\{version} \{statusCode} \{statusMessage}\r\n\{headerString}\r\n\r\n"; + } + + static String getResponseHeader(Response response) { + String version = response.protocol() == Protocol.HTTP_1_1 ? 
"1.1" : "2.0"; + + String statusCode = String.valueOf(response.code()); + String statusMessage = STATUS_CODE_MAP.getOrDefault(response.code(), "Unknown"); + + String headerString = getHeadersAsString(response); + + return STR."HTTP/\{version} \{statusCode} \{statusMessage}\r\n\{headerString}\r\n\r\n"; + } + + private static final Map STATUS_CODE_MAP = Map.ofEntries( + Map.entry(200, "OK"), + Map.entry(201, "Created"), + Map.entry(202, "Accepted"), + Map.entry(203, "Non-Authoritative Information"), + Map.entry(204, "No Content"), + Map.entry(205, "Reset Content"), + Map.entry(206, "Partial Content"), + Map.entry(207, "Multi-Status"), + Map.entry(208, "Already Reported"), + Map.entry(226, "IM Used"), + Map.entry(300, "Multiple Choices"), + Map.entry(301, "Moved Permanently"), + Map.entry(302, "Found"), + Map.entry(303, "See Other"), + Map.entry(304, "Not Modified"), + Map.entry(307, "Temporary Redirect"), + Map.entry(308, "Permanent Redirect"), + Map.entry(400, "Bad Request"), + Map.entry(401, "Unauthorized"), + Map.entry(403, "Forbidden"), + Map.entry(404, "Not Found"), + Map.entry(405, "Method Not Allowed"), + Map.entry(406, "Not Acceptable"), + Map.entry(408, "Request Timeout"), + Map.entry(409, "Conflict"), + Map.entry(410, "Gone"), + Map.entry(411, "Length Required"), + Map.entry(412, "Precondition Failed"), + Map.entry(413, "Payload Too Large"), + Map.entry(414, "URI Too Long"), + Map.entry(415, "Unsupported Media Type"), + Map.entry(416, "Range Not Satisfiable"), + Map.entry(417, "Expectation Failed"), + Map.entry(418, "I'm a teapot"), + Map.entry(421, "Misdirected Request"), + Map.entry(426, "Upgrade Required"), + Map.entry(428, "Precondition Required"), + Map.entry(429, "Too Many Requests"), + Map.entry(431, "Request Header Fields Too Large"), + Map.entry(451, "Unavailable For Legal Reasons"), + Map.entry(500, "Internal Server Error"), + Map.entry(501, "Not Implemented"), + Map.entry(502, "Bad Gateway"), + Map.entry(503, "Service Unavailable"), + Map.entry(504, "Gateway Timeout"), + Map.entry(505, "HTTP Version Not Supported"), + Map.entry(506, "Variant Also Negotiates"), + Map.entry(507, "Insufficient Storage"), + Map.entry(508, "Loop Detected"), + Map.entry(510, "Not Extended"), + Map.entry(511, "Network Authentication Required") + ); + + static private String getHeadersAsString(String headersBlob) { + StringJoiner joiner = new StringJoiner("\r\n"); + + Arrays.stream(headersBlob.split("\n")).forEach(joiner::add); + + return joiner.toString(); + } + + static private String getHeadersAsString(Response response) { + StringJoiner joiner = new StringJoiner("\r\n"); + + response.headers().toMultimap().forEach((k, values) -> { + String headerCapitalized = capitalizeHeader(k); + + // Omit pseudoheaders injected by the crawler itself + if (headerCapitalized.startsWith("X-Marginalia")) + return; + + // Omit Transfer-Encoding header, as we'll be using Content-Length + // instead in the warc file, despite what the server says + if (headerCapitalized.startsWith("Transfer-Encoding")) + return; + + for (var value : values) { + joiner.add(headerCapitalized + ": " + value); + } + }); + return joiner.toString(); + } + + // okhttp gives us flattened headers, so we need to reconstruct Camel-Kebab-Case style + // for the WARC parser's sake... 
+ static private String capitalizeHeader(String k) { + return Arrays.stream(StringUtils.split(k, '-')) + .map(StringUtils::capitalize) + .collect(Collectors.joining("-")); + } + +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java new file mode 100644 index 00000000..e31585ef --- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java @@ -0,0 +1,402 @@ +package nu.marginalia.crawl.retreival.fetcher.warc; + +import nu.marginalia.crawl.retreival.DomainProber; +import nu.marginalia.crawling.body.HttpFetchResult; +import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; +import okhttp3.OkHttpClient; +import okhttp3.Request; +import org.netpreserve.jwarc.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.io.InputStream; +import java.net.InetAddress; +import java.net.URI; +import java.net.URISyntaxException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.security.NoSuchAlgorithmException; +import java.time.Instant; +import java.util.*; + +/** Based on JWarc's fetch method, APL 2.0 license + *

+ * This class wraps OkHttp's OkHttpClient and records the HTTP request and response in a WARC file, + * as best is possible given not all the data is available at the same time and needs to + * be reconstructed. + */ +public class WarcRecorder implements AutoCloseable { + private static final int MAX_TIME = 30_000; + private static final int MAX_SIZE = 1024 * 1024 * 10; + private final WarcWriter writer; + private final Path warcFile; + private static final Logger logger = LoggerFactory.getLogger(WarcRecorder.class); + + private final ThreadLocal bufferThreadLocal = ThreadLocal.withInitial(() -> new byte[MAX_SIZE]); + + private boolean temporaryFile = false; + + // Affix a version string in case we need to change the format in the future + // in some way + private final String warcRecorderVersion = "1.0"; + + // We need to know if the site uses cookies so this can be reported among the search results + // -- flip this to true if we see any cookies. This information will also be painted on any + // revisited pages. It's not 100% perfect and a bit order dependent, but it's good enough. + private final WarcXCookieInformationHeader cookieInformation = new WarcXCookieInformationHeader(); + + /** + * Create a new WarcRecorder that will write to the given file + * + * @param warcFile The file to write to + */ + public WarcRecorder(Path warcFile) throws IOException { + this.warcFile = warcFile; + this.writer = new WarcWriter(warcFile); + } + + /** + * Create a new WarcRecorder that will write to a temporary file + * and delete it when close() is called. + */ + public WarcRecorder() throws IOException { + this.warcFile = Files.createTempFile("warc", ".warc.gz"); + this.writer = new WarcWriter(this.warcFile); + + temporaryFile = true; + } + + public HttpFetchResult fetch(OkHttpClient client, Request request) throws NoSuchAlgorithmException, + IOException, + URISyntaxException, + InterruptedException + { + URI requestUri = request.url().uri(); + + WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder(); + WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder(); + + String ip; + Instant date = Instant.now(); + long startMillis = date.toEpochMilli(); + + var call = client.newCall(request); + + int totalLength = 0; + + WarcTruncationReason truncationReason = null; + + ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(); + + cookieInformation.update(client, request.url()); + + try (var response = call.execute()) { + var body = response.body(); + InputStream inputStream; + + if (body == null) { + inputStream = null; + truncationReason = WarcTruncationReason.DISCONNECT; + } + else { + inputStream = body.byteStream(); + } + + ip = IpInterceptingNetworkInterceptor.getIpFromResponse(response); + + String responseHeaders = WarcProtocolReconstructor.getResponseHeader(response); + + responseDataBuffer.put(responseHeaders); + responseDataBuffer.updateDigest(responseDigestBuilder, 0, responseDataBuffer.length()); + + int dataStart = responseDataBuffer.pos(); + + while (inputStream != null) { + int remainingLength = responseDataBuffer.remaining(); + if (remainingLength == 0) + break; + + int startPos = responseDataBuffer.pos(); + + int n = responseDataBuffer.readFrom(inputStream, remainingLength); + if (n < 0) + break; + + responseDataBuffer.updateDigest(responseDigestBuilder, startPos, n); + responseDataBuffer.updateDigest(payloadDigestBuilder, startPos, n); + totalLength += n; + + if (MAX_TIME > 0 && System.currentTimeMillis() - startMillis > MAX_TIME) { + truncationReason = 
WarcTruncationReason.TIME; + break; + } + if (MAX_SIZE > 0 && totalLength >= MAX_SIZE) { + truncationReason = WarcTruncationReason.LENGTH; + break; + } + } + + // It looks like this might be the same as requestUri, but it's not; + // it's the URI after resolving redirects. + final URI responseUri = response.request().url().uri(); + + WarcResponse.Builder responseBuilder = new WarcResponse.Builder(responseUri) + .blockDigest(responseDigestBuilder.build()) + .date(date) + .body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes()); + + cookieInformation.paint(responseBuilder); + + if (ip != null) responseBuilder.ipAddress(InetAddress.getByName(ip)); + + responseBuilder.payloadDigest(payloadDigestBuilder.build()); + + if (truncationReason != null) + responseBuilder.truncated(truncationReason); + + // Build and write the response + + var warcResponse = responseBuilder.build(); + warcResponse.http(); // force HTTP header to be parsed before body is consumed so that caller can use it + writer.write(warcResponse); + + // Build and write the request + + WarcDigestBuilder requestDigestBuilder = new WarcDigestBuilder(); + + String httpRequestString = WarcProtocolReconstructor.getHttpRequestString(response.request(), requestUri); + + requestDigestBuilder.update(httpRequestString); + + WarcRequest warcRequest = new WarcRequest.Builder(requestUri) + .blockDigest(requestDigestBuilder.build()) + .date(date) + .body(MediaType.HTTP_REQUEST, httpRequestString.getBytes()) + .concurrentTo(warcResponse.id()) + .build(); + warcRequest.http(); // force HTTP header to be parsed before body is consumed so that caller can use it + writer.write(warcRequest); + + return new HttpFetchResult.ResultOk(responseUri, + response.code(), + response.headers(), + ip, + responseDataBuffer.data, + dataStart, + responseDataBuffer.length() - dataStart); + } + catch (Exception ex) { + logger.warn("Failed to fetch URL {}", requestUri, ex); + return new HttpFetchResult.ResultException(ex); + } + } + + public void resync(WarcRecord item) throws IOException { + writer.write(item); + } + + private void saveOldResponse(EdgeUrl url, String contentType, int statusCode, String documentBody) { + try { + WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder(); + WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder(); + + byte[] bytes = documentBody.getBytes(); + + String fakeHeaders = STR.""" + Content-Type: \{contentType} + Content-Length: \{bytes.length} + Content-Encoding: UTF-8 + """; + + String header = WarcProtocolReconstructor.getResponseHeader(fakeHeaders, statusCode); + ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(); + responseDataBuffer.put(header); + + responseDigestBuilder.update(header); + + responseDigestBuilder.update(bytes, bytes.length); + payloadDigestBuilder.update(bytes, bytes.length); + responseDataBuffer.put(bytes, 0, bytes.length); + + WarcXResponseReference.Builder builder = new WarcXResponseReference.Builder(url.asURI()) + .blockDigest(responseDigestBuilder.build()) + .payloadDigest(payloadDigestBuilder.build()) + .date(Instant.now()) + .body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes()); + + cookieInformation.paint(builder); + + var reference = builder.build(); + + reference.http(); // force HTTP header to be parsed before body is consumed so that caller can use it + + writer.write(reference); + + } catch (URISyntaxException | IOException | NoSuchAlgorithmException e) { + throw new RuntimeException(e); + } + } + + /** + * Flag the given URL as skipped by the 
crawler, so that it will not be retried. + * Which URLs were skipped is still important when resynchronizing on the WARC file, + * so that the crawler can avoid re-fetching them. + */ + public void flagAsSkipped(EdgeUrl url, String contentType, int statusCode, String documentBody) { + saveOldResponse(url, contentType, statusCode, documentBody); + } + + /** + * Write a reference copy of the given document data. This is used when the crawler provides + * an E-Tag or Last-Modified header, and the server responds with a 304 Not Modified. In this + * scenario we want to record the data as it was in the previous crawl, but not re-fetch it. + */ + public void writeReferenceCopy(EdgeUrl url, String contentType, int statusCode, String documentBody) { + saveOldResponse(url, contentType, statusCode, documentBody); + } + + public void writeWarcinfoHeader(String ip, EdgeDomain domain, DomainProber.ProbeResult result) throws IOException { + + Map> fields = new HashMap<>(); + fields.put("ip", List.of(ip)); + fields.put("software", List.of(STR."search.marginalia.nu/\{warcRecorderVersion}")); + fields.put("domain", List.of(domain.toString())); + + switch (result) { + case DomainProber.ProbeResultRedirect redirectDomain: + fields.put("X-WARC-Probe-Status", List.of(STR."REDIRECT;\{redirectDomain.domain()}")); + break; + case DomainProber.ProbeResultError error: + fields.put("X-WARC-Probe-Status", List.of(STR."\{error.status().toString()};\{error.desc()}")); + break; + case DomainProber.ProbeResultOk ok: + fields.put("X-WARC-Probe-Status", List.of("OK")); + break; + } + + var warcinfo = new Warcinfo.Builder() + .date(Instant.now()) + .fields(fields) + .recordId(UUID.randomUUID()) + .build(); + + writer.write(warcinfo); + } + + public void flagAsRobotsTxtError(EdgeUrl top) { + try { + WarcXEntityRefused refusal = new WarcXEntityRefused.Builder(top.asURI(), WarcXEntityRefused.documentRobotsTxtSkippedURN) + .date(Instant.now()) + .build(); + + writer.write(refusal); + } catch (URISyntaxException | IOException e) { + throw new RuntimeException(e); + } + } + + public void flagAsFailedContentTypeProbe(EdgeUrl url, String contentType, int status) { + try { + WarcXEntityRefused refusal = new WarcXEntityRefused.Builder(url.asURI(), WarcXEntityRefused.documentBadContentTypeURN) + .date(Instant.now()) + .addHeader("Rejected-Content-Type", contentType) + .addHeader("Http-Status", Integer.toString(status)) + .build(); + + writer.write(refusal); + } catch (URISyntaxException | IOException e) { + throw new RuntimeException(e); + } + } + + public void flagAsError(EdgeUrl url, Exception ex) { + try { + WarcXEntityRefused refusal = new WarcXEntityRefused.Builder(url.asURI(), WarcXEntityRefused.documentUnspecifiedError) + .date(Instant.now()) + .addHeader("Exception", ex.getClass().getSimpleName()) + .addHeader("ErrorMessage", Objects.requireNonNullElse(ex.getMessage(), "")) + .build(); + + writer.write(refusal); + } catch (URISyntaxException | IOException e) { + throw new RuntimeException(e); + } + } + + public void flagAsTimeout(EdgeUrl url) { + try { + WarcXEntityRefused refusal = new WarcXEntityRefused.Builder(url.asURI(), WarcXEntityRefused.documentProbeTimeout) + .date(Instant.now()) + .build(); + + writer.write(refusal); + } catch (URISyntaxException | IOException e) { + throw new RuntimeException(e); + } + } + + private class ResponseDataBuffer { + private final byte[] data; + private int length = 0; + private int pos = 0; + + public ResponseDataBuffer() { + data = bufferThreadLocal.get(); + } + + public int pos() { + 
return pos; + } + public int length() { + return length; + } + + public void put(String s) { + byte[] bytes = s.getBytes(); + put(bytes, 0, bytes.length); + } + + private void put(byte[] bytes, int i, int n) { + System.arraycopy(bytes, i, data, pos, n); + pos += n; + length += n; + } + + public int readFrom(InputStream inputStream, int remainingLength) throws IOException { + int n = inputStream.read(data, pos, remainingLength); + if (n > 0) { + pos += n; + length += n; + } + return n; + } + + public int remaining() { + return MAX_SIZE - pos; + } + + public void updateDigest(WarcDigestBuilder digestBuilder, int startPos, int n) { + digestBuilder.update(data, startPos, n); + } + + public byte[] copyBytes() { + byte[] copy = new byte[length]; + System.arraycopy(data, 0, copy, 0, length); + return copy; + } + + } + + public void close() { + try { + writer.close(); + if (temporaryFile) + Files.deleteIfExists(warcFile); + } catch (IOException e) { + throw new RuntimeException(e); + } + } +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java new file mode 100644 index 00000000..91c21d65 --- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java @@ -0,0 +1,108 @@ +package nu.marginalia.crawl.retreival.revisit; + +import crawlercommons.robots.SimpleRobotRules; +import nu.marginalia.crawl.retreival.CrawlDataReference; +import nu.marginalia.crawl.retreival.CrawlDelayTimer; +import nu.marginalia.crawl.retreival.CrawlerRetreiver; +import nu.marginalia.crawl.retreival.DomainCrawlFrontier; +import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; +import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.model.EdgeUrl; +import org.jsoup.Jsoup; + +/** This class encapsulates the logic for re-visiting a domain that has already been crawled. + * We may use information from the previous crawl to inform the next crawl, specifically the + * E-Tag and Last-Modified headers. 
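+ * If the reference document appears unchanged (per a 304 response or an identical body),
+ * the stored copy is written back to the WARC file instead of being re-downloaded.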
+ */ +public class CrawlerRevisitor { + private final DomainCrawlFrontier crawlFrontier; + private final CrawlerRetreiver crawlerRetreiver; + private final WarcRecorder warcRecorder; + + public CrawlerRevisitor(DomainCrawlFrontier crawlFrontier, + CrawlerRetreiver crawlerRetreiver, + WarcRecorder warcRecorder) { + this.crawlFrontier = crawlFrontier; + this.crawlerRetreiver = crawlerRetreiver; + this.warcRecorder = warcRecorder; + } + + /** Performs a re-crawl of old documents, comparing etags and last-modified */ + public int recrawl(CrawlDataReference oldCrawlData, + SimpleRobotRules robotsRules, + CrawlDelayTimer delayTimer) + throws InterruptedException { + int recrawled = 0; + int retained = 0; + + for (;;) { + CrawledDocument doc = oldCrawlData.nextDocument(); + + if (doc == null) { + break; + } + + // This Shouldn't Happen (TM) + var urlMaybe = EdgeUrl.parse(doc.url); + if (urlMaybe.isEmpty()) continue; + var url = urlMaybe.get(); + + // If we've previously 404:d on this URL, we'll refrain from trying to fetch it again + if (doc.httpStatus == 404) { + crawlFrontier.addVisited(url); + continue; + } + + if (doc.httpStatus != 200) continue; + + if (!robotsRules.isAllowed(url.toString())) { + warcRecorder.flagAsRobotsTxtError(url); + continue; + } + if (!crawlFrontier.filterLink(url)) + continue; + if (!crawlFrontier.addVisited(url)) + continue; + + + if (recrawled > 5 + && retained > 0.9 * recrawled + && Math.random() < 0.9) + { + // Since it looks like most of these documents haven't changed, + // we'll load the documents directly; but we do this in a random + // fashion to make sure we eventually catch changes over time + // and ensure we discover new links + + crawlFrontier.addVisited(url); + + // Hoover up any links from the document + if (doc.httpStatus == 200 && doc.documentBody != null) { + var parsedDoc = Jsoup.parse(doc.documentBody); + crawlFrontier.enqueueLinksFromDocument(url, parsedDoc); + } + + // Add a WARC record so we don't repeat this + warcRecorder.flagAsSkipped(url, doc.contentType, doc.httpStatus, doc.documentBody); + + continue; + } + + + // GET the document with the stored document as a reference + // providing etag and last-modified headers, so we can recycle the + // document if it hasn't changed without actually downloading it + + var reference = new DocumentWithReference(doc, oldCrawlData); + var result = crawlerRetreiver.fetchWriteAndSleep(url, delayTimer, reference); + + if (reference.isSame(result)) { + retained++; + } + + recrawled++; + } + + return recrawled; + } +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java new file mode 100644 index 00000000..a0559aec --- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java @@ -0,0 +1,77 @@ +package nu.marginalia.crawl.retreival.revisit; + +import nu.marginalia.crawl.retreival.CrawlDataReference; +import nu.marginalia.crawl.retreival.fetcher.ContentTags; +import nu.marginalia.crawling.body.DocumentBodyExtractor; +import nu.marginalia.crawling.body.DocumentBodyResult; +import nu.marginalia.crawling.body.HttpFetchResult; +import nu.marginalia.crawling.model.CrawledDocument; + +import javax.annotation.Nullable; + +public record DocumentWithReference( + @Nullable CrawledDocument doc, + @Nullable CrawlDataReference reference) { + + private static final 
DocumentWithReference emptyInstance = new DocumentWithReference(null, null); + + public static DocumentWithReference empty() { + return emptyInstance; + } + + /** Returns true if the provided document is the same as the reference document, + * or if the result was retained via HTTP 304. + */ + public boolean isSame(HttpFetchResult result) { + if (result instanceof HttpFetchResult.Result304Raw) + return true; + if (result instanceof HttpFetchResult.Result304ReplacedWithReference) + return true; + + if (!(result instanceof HttpFetchResult.ResultOk resultOk)) + return false; + + if (reference == null) + return false; + if (doc == null) + return false; + if (doc.documentBody == null) + return false; + + if (!(DocumentBodyExtractor.asString(resultOk) instanceof DocumentBodyResult.Ok bodyOk)) { + return false; + } + + return reference.isContentBodySame(doc.documentBody, bodyOk.body()); + } + + public ContentTags getContentTags() { + if (null == doc) + return ContentTags.empty(); + + String headers = doc.headers; + if (headers == null) + return ContentTags.empty(); + + String[] headersLines = headers.split("\n"); + + String lastmod = null; + String etag = null; + + for (String line : headersLines) { + if (line.toLowerCase().startsWith("etag:")) { + etag = line.substring(5).trim(); + } + if (line.toLowerCase().startsWith("last-modified:")) { + lastmod = line.substring(14).trim(); + } + } + + return new ContentTags(etag, lastmod); + } + + public boolean isEmpty() { + return doc == null || reference == null; + } + +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/sitemap/SitemapFetcher.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/sitemap/SitemapFetcher.java new file mode 100644 index 00000000..3ce33d64 --- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/sitemap/SitemapFetcher.java @@ -0,0 +1,71 @@ +package nu.marginalia.crawl.retreival.sitemap; + +import crawlercommons.robots.SimpleRobotRules; +import nu.marginalia.crawl.retreival.DomainCrawlFrontier; +import nu.marginalia.crawl.retreival.fetcher.SitemapRetriever; +import nu.marginalia.model.EdgeUrl; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +public class SitemapFetcher { + + private final DomainCrawlFrontier crawlFrontier; + private final SitemapRetriever sitemapRetriever; + private static final Logger logger = LoggerFactory.getLogger(SitemapFetcher.class); + + public SitemapFetcher(DomainCrawlFrontier crawlFrontier, SitemapRetriever sitemapRetriever) { + this.crawlFrontier = crawlFrontier; + this.sitemapRetriever = sitemapRetriever; + } + + public void downloadSitemaps(SimpleRobotRules robotsRules, EdgeUrl rootUrl) { + List sitemaps = robotsRules.getSitemaps(); + + List urls = new ArrayList<>(sitemaps.size()); + if (!sitemaps.isEmpty()) { + for (var url : sitemaps) { + EdgeUrl.parse(url).ifPresent(urls::add); + } + } + else { + urls.add(rootUrl.withPathAndParam("/sitemap.xml", null)); + } + + downloadSitemaps(urls); + } + + public void downloadSitemaps(List urls) { + + Set checkedSitemaps = new HashSet<>(); + + for (var url : urls) { + // Let's not download sitemaps from other domains for now + if (!crawlFrontier.isSameDomain(url)) { + continue; + } + + if (checkedSitemaps.contains(url.path)) + continue; + + var sitemap = sitemapRetriever.fetchSitemap(url); + if (sitemap.isEmpty()) { + continue; + } + + 
// ensure we don't try to download this sitemap again + // (don't move this up, as we may want to check the same + // path with different protocols until we find one that works) + + checkedSitemaps.add(url.path); + + crawlFrontier.addAllToQueue(sitemap); + } + + logger.debug("Queue is now {}", crawlFrontier.queueSize()); + } +} diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizerTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizerTest.java new file mode 100644 index 00000000..ae3d9be4 --- /dev/null +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizerTest.java @@ -0,0 +1,88 @@ +package nu.marginalia.crawl.retreival; + +import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor; +import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; +import okhttp3.OkHttpClient; +import okhttp3.Request; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.netpreserve.jwarc.WarcReader; +import org.netpreserve.jwarc.WarcRequest; +import org.netpreserve.jwarc.WarcResponse; + +import java.io.IOException; +import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.security.NoSuchAlgorithmException; +import java.util.List; +import java.util.zip.GZIPInputStream; + +import static org.junit.jupiter.api.Assertions.*; + +class CrawlerWarcResynchronizerTest { + Path fileName; + Path outputFile; + OkHttpClient httpClient; + @BeforeEach + public void setUp() throws Exception { + httpClient = new OkHttpClient.Builder() + .addNetworkInterceptor(new IpInterceptingNetworkInterceptor()) + .build(); + + fileName = Files.createTempFile("test", ".warc.gz"); + outputFile = Files.createTempFile("test", ".warc.gz"); + } + + @AfterEach + public void tearDown() throws Exception { + Files.deleteIfExists(fileName); + Files.deleteIfExists(outputFile); + } + + @Test + void run() throws IOException, URISyntaxException { + try (var oldRecorder = new WarcRecorder(fileName)) { + fetchUrl(oldRecorder, "https://www.marginalia.nu/"); + fetchUrl(oldRecorder, "https://www.marginalia.nu/log/"); + fetchUrl(oldRecorder, "https://www.marginalia.nu/feed/"); + } catch (Exception e) { + fail(e); + } + + var crawlFrontier = new DomainCrawlFrontier(new EdgeDomain("www.marginalia.nu"), List.of(), 100); + + try (var newRecorder = new WarcRecorder(outputFile)) { + new CrawlerWarcResynchronizer(crawlFrontier, newRecorder).run(fileName); + } + + assertTrue(crawlFrontier.isVisited(new EdgeUrl("https://www.marginalia.nu/"))); + assertTrue(crawlFrontier.isVisited(new EdgeUrl("https://www.marginalia.nu/log/"))); + assertTrue(crawlFrontier.isVisited(new EdgeUrl("https://www.marginalia.nu/feed/"))); + + try (var warcReader = new WarcReader(outputFile)) { + for (var item : warcReader) { + if (item instanceof WarcRequest req) { + System.out.println("req:" + req.target()); + } + if (item instanceof WarcResponse rsp) { + System.out.println("req:" + rsp.target()); + } + } + } + + new GZIPInputStream(Files.newInputStream(outputFile)).transferTo(System.out); + } + + void fetchUrl(WarcRecorder recorder, String url) throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException { + var req = new 
Request.Builder().url(url) + .addHeader("User-agent", "test.marginalia.nu") + .addHeader("Accept-Encoding", "gzip") + .get().build(); + recorder.fetch(httpClient, req); + } +} \ No newline at end of file diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/ContentTypeProberTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/ContentTypeProberTest.java new file mode 100644 index 00000000..4a015fb9 --- /dev/null +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/ContentTypeProberTest.java @@ -0,0 +1,59 @@ +package nu.marginalia.crawl.retreival.fetcher; + +import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeResult.BadContentType; +import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeResult.Ok; +import nu.marginalia.model.EdgeUrl; +import okhttp3.ConnectionPool; +import okhttp3.Dispatcher; +import okhttp3.OkHttpClient; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.net.URISyntaxException; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; + +import static org.junit.jupiter.api.Assertions.*; + +class ContentTypeProberTest { + + ContentTypeProber prober; + + @BeforeEach + void setUp() { + OkHttpClient client = new OkHttpClient.Builder() + .dispatcher(new Dispatcher(Executors.newVirtualThreadPerTaskExecutor())) + .connectionPool(new ConnectionPool(0, 1, TimeUnit.NANOSECONDS)) + .build(); + + prober = new ContentTypeProber("test.marginalia.nu", client); + } + + @Test + void probeContentType() throws URISyntaxException { + assertEquals( + new Ok(new EdgeUrl("https://www.marginalia.nu/robots.txt")), + prober.probeContentType(new EdgeUrl("https://www.marginalia.nu/robots.txt")), + "robots.txt is expected to pass the probing test since it's text/plain" + ); + + assertEquals( + new BadContentType("image/png", 200), + prober.probeContentType(new EdgeUrl("https://www.marginalia.nu/sanic.png")), + "sanic.png is expected to pass the probing test since it's image/png" + ); + + assertEquals( + new Ok(new EdgeUrl("https://www.marginalia.nu/dev/null")), + prober.probeContentType(new EdgeUrl("https://www.marginalia.nu/dev/null")), + "Despite being a 404, we expect this to be passed as OK as it's NotMyJob(TM) to verify response codes" + ); + + assertEquals( + new Ok(new EdgeUrl("https://www.marginalia.nu/projects/edge/about.gmi/")), + prober.probeContentType(new EdgeUrl("https://www.marginalia.nu/projects/edge/about.gmi")), + "about.gmi is expected to give a redirect to about.gmi/ which is served as text/html" + ); + + } +} \ No newline at end of file diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImplTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImplTest.java index 27b55760..e5673a6a 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImplTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImplTest.java @@ -1,5 +1,6 @@ package nu.marginalia.crawl.retreival.fetcher; +import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter; import org.junit.jupiter.api.Test; import java.util.List; @@ -7,30 +8,30 @@ import java.util.List; import static org.junit.jupiter.api.Assertions.assertFalse; import static 
org.junit.jupiter.api.Assertions.assertTrue; -class HttpFetcherImplTest { +class CrawledDocumentParquetRecordFileWriterTest { @Test public void testXRobotsTag() { - assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("foo:"), "search.marginalia.nu")); - assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of(":bar"), "search.marginalia.nu")); - assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of(":"), "search.marginalia.nu")); - assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of(""), "search.marginalia.nu")); + assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("foo:"), "search.marginalia.nu")); + assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of(":bar"), "search.marginalia.nu")); + assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of(":"), "search.marginalia.nu")); + assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of(""), "search.marginalia.nu")); - assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("doindex"), "search.marginalia.nu")); - assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("noindex"), "search.marginalia.nu")); - assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("none"), "search.marginalia.nu")); - assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: noindex"), "search.marginalia.nu")); - assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none"), "search.marginalia.nu")); - assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("googlebot: noindex"), "search.marginalia.nu")); + assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("doindex"), "search.marginalia.nu")); + assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("noindex"), "search.marginalia.nu")); + assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("none"), "search.marginalia.nu")); + assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: noindex"), "search.marginalia.nu")); + assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none"), "search.marginalia.nu")); + assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("googlebot: noindex"), "search.marginalia.nu")); - assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("noindex", "search.marginalia.nu: all"), "search.marginalia.nu")); - assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("none", "search.marginalia.nu: all"), "search.marginalia.nu")); - assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("none", "search.marginalia.nu: none"), "search.marginalia.nu")); - assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("all", "search.marginalia.nu: none"), "search.marginalia.nu")); - assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: all", "noindex"), "search.marginalia.nu")); - assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: all", "none"), "search.marginalia.nu")); - assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none", "none"), "search.marginalia.nu")); - assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none", "all"), "search.marginalia.nu")); + assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("noindex", "search.marginalia.nu: all"), 
"search.marginalia.nu")); + assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("none", "search.marginalia.nu: all"), "search.marginalia.nu")); + assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("none", "search.marginalia.nu: none"), "search.marginalia.nu")); + assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("all", "search.marginalia.nu: none"), "search.marginalia.nu")); + assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: all", "noindex"), "search.marginalia.nu")); + assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: all", "none"), "search.marginalia.nu")); + assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none", "none"), "search.marginalia.nu")); + assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none", "all"), "search.marginalia.nu")); } } \ No newline at end of file diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java new file mode 100644 index 00000000..cdc10bd2 --- /dev/null +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java @@ -0,0 +1,147 @@ +package nu.marginalia.crawl.retreival.fetcher; + +import nu.marginalia.UserAgent; +import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor; +import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; +import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileReader; +import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter; +import nu.marginalia.model.EdgeUrl; +import okhttp3.OkHttpClient; +import okhttp3.Request; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.netpreserve.jwarc.*; + +import java.io.IOException; +import java.net.URISyntaxException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.security.NoSuchAlgorithmException; +import java.util.HashMap; +import java.util.Map; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +class WarcRecorderTest { + Path fileNameWarc; + Path fileNameParquet; + WarcRecorder client; + OkHttpClient httpClient; + @BeforeEach + public void setUp() throws Exception { + httpClient = new OkHttpClient.Builder() + .addNetworkInterceptor(new IpInterceptingNetworkInterceptor()) + .build(); + + fileNameWarc = Files.createTempFile("test", ".warc"); + fileNameParquet = Files.createTempFile("test", ".parquet"); + + client = new WarcRecorder(fileNameWarc); + } + + @AfterEach + public void tearDown() throws Exception { + client.close(); + Files.delete(fileNameWarc); + } + + @Test + void fetch() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException { + client.fetch(httpClient, new Request.Builder().url("https://www.marginalia.nu/") + .addHeader("User-agent", "test.marginalia.nu") + .addHeader("Accept-Encoding", "gzip") + .get().build()); + + Map sampleData = new HashMap<>(); + try (var warcReader = new WarcReader(fileNameWarc)) { + warcReader.forEach(record -> { + if (record instanceof WarcRequest req) { + sampleData.put(record.type(), req.target()); + } + if (record 
instanceof WarcResponse rsp) { + sampleData.put(record.type(), rsp.target()); + } + }); + } + + assertEquals("https://www.marginalia.nu/", sampleData.get("request")); + assertEquals("https://www.marginalia.nu/", sampleData.get("response")); + } + + @Test + public void flagAsSkipped() throws IOException, URISyntaxException { + + try (var recorder = new WarcRecorder(fileNameWarc)) { + recorder.flagAsSkipped(new EdgeUrl("https://www.marginalia.nu/"), + "text/html", + 200, + "test"); + } + + try (var reader = new WarcReader(fileNameWarc)) { + for (var record : reader) { + if (record instanceof WarcResponse rsp) { + assertEquals("https://www.marginalia.nu/", rsp.target()); + assertEquals("text/html", rsp.contentType().type()); + assertEquals(200, rsp.http().status()); + assertEquals("1", rsp.http().headers().first("X-Cookies").orElse(null)); + } + } + } + } + + @Test + public void testSaveImport() throws URISyntaxException, IOException { + try (var recorder = new WarcRecorder(fileNameWarc)) { + recorder.flagAsSkipped(new EdgeUrl("https://www.marginalia.nu/"), + "text/html", + 200, + "test"); + } + + try (var reader = new WarcReader(fileNameWarc)) { + WarcXResponseReference.register(reader); + + for (var record : reader) { + System.out.println(record.type()); + System.out.println(record.getClass().getSimpleName()); + if (record instanceof WarcXResponseReference rsp) { + assertEquals("https://www.marginalia.nu/", rsp.target()); + } + } + } + + } + + @Test + public void testConvertToParquet() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException { + client.fetch(httpClient, new Request.Builder().url("https://www.marginalia.nu/") + .addHeader("User-agent", "test.marginalia.nu") + .addHeader("Accept-Encoding", "gzip") + .get().build()); + client.fetch(httpClient, new Request.Builder().url("https://www.marginalia.nu/log/") + .addHeader("User-agent", "test.marginalia.nu") + .addHeader("Accept-Encoding", "gzip") + .get().build()); + client.fetch(httpClient, new Request.Builder().url("https://www.marginalia.nu/sanic.png") + .addHeader("User-agent", "test.marginalia.nu") + .addHeader("Accept-Encoding", "gzip") + .get().build()); + client.close(); + + CrawledDocumentParquetRecordFileWriter.convertWarc( + "www.marginalia.nu", + new UserAgent("test"), + fileNameWarc, + fileNameParquet); + + var urls = CrawledDocumentParquetRecordFileReader.stream(fileNameParquet).map(doc -> doc.url).toList(); + assertEquals(3, urls.size()); + assertEquals("https://www.marginalia.nu/", urls.get(0)); + assertEquals("https://www.marginalia.nu/log/", urls.get(1)); + assertEquals("https://www.marginalia.nu/sanic.png", urls.get(2)); + + } + +} \ No newline at end of file diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java index 5893910f..0873924f 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java @@ -4,11 +4,15 @@ import lombok.SneakyThrows; import nu.marginalia.crawl.retreival.RateLimitException; import nu.marginalia.crawl.retreival.fetcher.ContentTags; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; -import nu.marginalia.crawl.retreival.logic.ContentTypeLogic; +import nu.marginalia.crawling.body.DocumentBodyExtractor; +import nu.marginalia.crawling.body.DocumentBodyResult; +import 
nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; +import nu.marginalia.crawling.body.ContentTypeLogic; import nu.marginalia.model.EdgeUrl; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; +import java.io.IOException; import java.net.URISyntaxException; class HttpFetcherTest { @@ -28,16 +32,25 @@ class HttpFetcherTest { } @Test - void fetchUTF8() throws URISyntaxException, RateLimitException { + void fetchUTF8() throws URISyntaxException, RateLimitException, IOException { var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler"); - var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), ContentTags.empty()); - System.out.println(str.contentType); + try (var recorder = new WarcRecorder()) { + var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, ContentTags.empty()); + if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) { + System.out.println(bodyOk.contentType()); + } + } } @Test - void fetchText() throws URISyntaxException, RateLimitException { + void fetchText() throws URISyntaxException, RateLimitException, IOException { var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler"); - var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), ContentTags.empty()); - System.out.println(str); + + try (var recorder = new WarcRecorder()) { + var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), recorder, ContentTags.empty()); + if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) { + System.out.println(bodyOk.contentType()); + } + } } } \ No newline at end of file diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java index b65e5ae6..749b821c 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java @@ -5,6 +5,8 @@ import lombok.SneakyThrows; import nu.marginalia.crawl.retreival.CrawlerRetreiver; import nu.marginalia.crawl.retreival.DomainProber; import nu.marginalia.crawl.retreival.fetcher.*; +import nu.marginalia.crawling.body.HttpFetchResult; +import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawlerDocumentStatus; import nu.marginalia.crawling.model.SerializableCrawlData; @@ -12,17 +14,16 @@ import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.crawlspec.CrawlSpecRecord; import nu.marginalia.test.CommonTestData; +import okhttp3.Headers; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Test; import org.mockito.Mockito; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; import java.net.URISyntaxException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; public class CrawlerMockFetcherTest { @@ -61,44 +62,42 @@ public class CrawlerMockFetcherTest { } + void crawl(CrawlSpecRecord spec) throws IOException { + try (var recorder = new WarcRecorder()) { + new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), spec, recorder) + .fetch(); + } + } + @Test - public 
void testLemmy() throws URISyntaxException { + public void testLemmy() throws URISyntaxException, IOException { List out = new ArrayList<>(); registerUrlClasspathData(new EdgeUrl("https://startrek.website/"), "mock-crawl-data/lemmy/index.html"); registerUrlClasspathData(new EdgeUrl("https://startrek.website/c/startrek"), "mock-crawl-data/lemmy/c_startrek.html"); registerUrlClasspathData(new EdgeUrl("https://startrek.website/post/108995"), "mock-crawl-data/lemmy/108995.html"); - new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), new CrawlSpecRecord("startrek.website", 10, new ArrayList<>()), out::add) - .fetch(); - - out.forEach(System.out::println); + crawl(new CrawlSpecRecord("startrek.website", 10, new ArrayList<>())); } @Test - public void testMediawiki() throws URISyntaxException { + public void testMediawiki() throws URISyntaxException, IOException { List out = new ArrayList<>(); registerUrlClasspathData(new EdgeUrl("https://en.wikipedia.org/"), "mock-crawl-data/mediawiki/index.html"); - new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), new CrawlSpecRecord("en.wikipedia.org", 10, new ArrayList<>()), out::add) - .fetch(); - - out.forEach(System.out::println); + crawl(new CrawlSpecRecord("en.wikipedia.org", 10, new ArrayList<>())); } @Test - public void testDiscourse() throws URISyntaxException { + public void testDiscourse() throws URISyntaxException, IOException { List out = new ArrayList<>(); registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/"), "mock-crawl-data/discourse/index.html"); registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/telegram-channel-to-idle-on/3501"), "mock-crawl-data/discourse/telegram.html"); registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/combined-mode-but-grid/4489"), "mock-crawl-data/discourse/grid.html"); - new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), new CrawlSpecRecord("community.tt-rss.org", 100, new ArrayList<>()), out::add) - .fetch(); - - out.forEach(System.out::println); + crawl(new CrawlSpecRecord("community.tt-rss.org", 10, new ArrayList<>())); } class MockFetcher implements HttpFetcher { @@ -118,25 +117,28 @@ public class CrawlerMockFetcherTest { return new FetchResult(FetchResultState.OK, url); } + @SneakyThrows @Override - public CrawledDocument fetchContent(EdgeUrl url, ContentTags tags) { + public HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, ContentTags tags) { logger.info("Fetching {}", url); if (mockData.containsKey(url)) { - return mockData.get(url); - } - else { - return CrawledDocument.builder() - .crawlId("1") - .url(url.toString()) - .contentType("text/html") - .httpStatus(404) - .crawlerStatus(CrawlerDocumentStatus.ERROR.name()) - .build(); + byte[] bodyBytes = mockData.get(url).documentBody.getBytes(); + return new HttpFetchResult.ResultOk( + url.asURI(), + 200, + new Headers.Builder().build(), + "127.0.0.1", + bodyBytes, + 0, + bodyBytes.length + ); } + + return new HttpFetchResult.ResultNone(); } @Override - public SimpleRobotRules fetchRobotRules(EdgeDomain domain) { + public SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder) { return new SimpleRobotRules(); } @@ -144,5 +146,6 @@ public class CrawlerMockFetcherTest { public SitemapRetriever createSitemapRetriever() { return Mockito.mock(SitemapRetriever.class); } + } } diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java 
b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java index e7742445..286f15f5 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java @@ -8,6 +8,7 @@ import nu.marginalia.crawl.retreival.CrawlerRetreiver; import nu.marginalia.crawl.retreival.DomainProber; import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; +import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.crawling.io.CrawledDomainReader; import nu.marginalia.crawling.io.CrawledDomainWriter; import nu.marginalia.crawling.model.CrawledDocument; @@ -15,22 +16,24 @@ import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.crawling.model.SerializableCrawlData; import nu.marginalia.model.crawlspec.CrawlSpecRecord; import org.junit.jupiter.api.*; +import org.netpreserve.jwarc.*; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; import java.util.stream.Collectors; +import java.util.zip.GZIPInputStream; +import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; @Tag("slow") class CrawlerRetreiverTest { private HttpFetcher httpFetcher; + Path tempFile; + Path tempFile2; @BeforeEach public void setUp() { httpFetcher = new HttpFetcherImpl("search.marginalia.nu; testing a bit :D"); @@ -43,8 +46,62 @@ class CrawlerRetreiverTest { System.setProperty("http.agent", WmsaHome.getUserAgent().uaString()); } + @AfterEach + public void tearDown() throws IOException { + if (tempFile != null) { + Files.deleteIfExists(tempFile); + } + if (tempFile2 != null) { + Files.deleteIfExists(tempFile2); + } + } @Test - public void testWithKnownDomains() { + public void testWarcOutput() throws IOException { + var specs = CrawlSpecRecord + .builder() + .crawlDepth(5) + .domain("www.marginalia.nu") + .urls(List.of("https://www.marginalia.nu/misc/debian-laptop-install-log/")) + .build(); + Path tempFile = null; + try { + tempFile = Files.createTempFile("crawling-process", "warc"); + + try (var recorder = new WarcRecorder(tempFile)) { + new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch(); + } catch (IOException ex) { + Assertions.fail(ex); + } + + Set requests = new HashSet<>(); + Set responses = new HashSet<>(); + + try (var reader = new WarcReader(tempFile)) { + reader.forEach(record -> { + if (record instanceof WarcRequest req) { + requests.add(req.target()); + System.out.println(req.type() + ":" + req.target()); + } + else if (record instanceof WarcResponse rsp) { + responses.add(rsp.target()); + System.out.println(rsp.type() + ":" + rsp.target()); + } + else { + System.out.println(record.type()); + } + }); + } + + assertTrue(requests.contains("https://www.marginalia.nu/misc/debian-laptop-install-log/")); + assertEquals(requests, responses); + } + finally { + if (tempFile != null) + Files.deleteIfExists(tempFile); + } + } + @Test + public void testWithKnownDomains() throws IOException { var specs = CrawlSpecRecord .builder() .crawlDepth(5) @@ -54,10 +111,30 @@ class CrawlerRetreiverTest { List data = new ArrayList<>(); - new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), 
specs, data::add).fetch(); + tempFile = Files.createTempFile("crawling-process", ".warc"); + + try (var recorder = new WarcRecorder(tempFile)) { + new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch(); + } + catch (IOException ex) { + Assertions.fail(ex); + } + + + try (var stream = CrawledDomainReader.createDataStream(tempFile)) { + while (stream.hasNext()) { + if (stream.next() instanceof CrawledDocument doc) { + data.add(doc); + } + } + } catch (Exception e) { + throw new RuntimeException(e); + } var fetchedUrls = - data.stream().filter(CrawledDocument.class::isInstance) + data.stream() + .peek(System.out::println) + .filter(CrawledDocument.class::isInstance) .map(CrawledDocument.class::cast) .map(doc -> doc.url) .collect(Collectors.toSet()); @@ -72,7 +149,7 @@ class CrawlerRetreiverTest { } @Test - public void testEmptySet() { + public void testEmptySet() throws IOException { var specs = CrawlSpecRecord .builder() @@ -81,9 +158,29 @@ class CrawlerRetreiverTest { .urls(List.of()) .build(); + List data = new ArrayList<>(); - new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, data::add).fetch(); + tempFile = Files.createTempFile("crawling-process", ".warc"); + + try (var recorder = new WarcRecorder(tempFile)) { + new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch(); + } + catch (IOException ex) { + Assertions.fail(ex); + } + + + try (var stream = CrawledDomainReader.createDataStream(tempFile)) { + while (stream.hasNext()) { + if (stream.next() instanceof CrawledDocument doc) { + data.add(doc); + } + } + } catch (Exception e) { + throw new RuntimeException(e); + } + data.stream().filter(CrawledDocument.class::isInstance) .map(CrawledDocument.class::cast) @@ -115,33 +212,70 @@ class CrawlerRetreiverTest { .build(); - Path out = Files.createTempDirectory("crawling-process"); - var writer = new CrawledDomainWriter(out, specs.domain, "idid"); + tempFile = Files.createTempFile("crawling-process", ".warc.gz"); + tempFile2 = Files.createTempFile("crawling-process", ".warc.gz"); + Map, List> data = new HashMap<>(); - new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, d -> { - data.computeIfAbsent(d.getClass(), k->new ArrayList<>()).add(d); - if (d instanceof CrawledDocument doc) { - System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus); - if (Math.random() > 0.5) { - doc.headers = ""; - } - } - writer.accept(d); - }).fetch(); - writer.close(); + try (var recorder = new WarcRecorder(tempFile)) { + new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch(); + } + catch (IOException ex) { + Assertions.fail(ex); + } - var reader = new CrawledDomainReader(); - var stream = reader.createDataStream(out, specs.domain, "idid"); + try (var stream = CrawledDomainReader.createDataStream(tempFile)) { + while (stream.hasNext()) { + var doc = stream.next(); + data.computeIfAbsent(doc.getClass(), c -> new ArrayList<>()).add(doc); + } + } catch (Exception e) { + throw new RuntimeException(e); + } + var stream = CrawledDomainReader.createDataStream(tempFile); CrawledDomain domain = (CrawledDomain) data.get(CrawledDomain.class).get(0); domain.doc = data.get(CrawledDocument.class).stream().map(CrawledDocument.class::cast).collect(Collectors.toList()); + try (var recorder = new WarcRecorder(tempFile2)) { + new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch(new DomainLinks(), + new CrawlDataReference(stream)); + } + catch 
(IOException ex) { + Assertions.fail(ex); + } - new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, d -> { - if (d instanceof CrawledDocument doc) { - System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus); + new GZIPInputStream(Files.newInputStream(tempFile2)).transferTo(System.out); + + try (var reader = new WarcReader(tempFile2)) { + WarcXResponseReference.register(reader); + + reader.forEach(record -> { + if (record instanceof WarcResponse rsp) { + try { + System.out.println(rsp.type() + ":" + rsp.target() + "/" + rsp.http().status()); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + if (record instanceof WarcMetadata rsp) { + System.out.println("meta:" + rsp.target()); + } + }); + } + + try (var ds = CrawledDomainReader.createDataStream(tempFile2)) { + while (ds.hasNext()) { + var doc = ds.next(); + if (doc instanceof CrawledDomain dr) { + System.out.println(dr.domain + "/" + dr.crawlerStatus); + } + else if (doc instanceof CrawledDocument dc) { + System.out.println(dc.url + "/" + dc.crawlerStatus + "/" + dc.httpStatus); + } } - }).fetch(new DomainLinks(), new CrawlDataReference(stream)); + } catch (Exception e) { + throw new RuntimeException(e); + } } } \ No newline at end of file diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ConvertActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ConvertActor.java index 275f4092..4af4852e 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ConvertActor.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ConvertActor.java @@ -32,6 +32,7 @@ public class ConvertActor extends RecordActorPrototype { public record Convert(FileStorageId fid) implements ActorStep {}; public record ConvertEncyclopedia(String source, String baseUrl) implements ActorStep {}; public record ConvertDirtree(String source) implements ActorStep {}; + public record ConvertWarc(String source) implements ActorStep {}; public record ConvertStackexchange(String source) implements ActorStep {}; @Resume(behavior = ActorResumeBehavior.RETRY) public record ConvertWait(FileStorageId destFid, @@ -74,6 +75,25 @@ public class ConvertActor extends RecordActorPrototype { mqConverterOutbox.sendAsync(ConvertRequest.forDirtree(sourcePath, processedArea.id())) ); } + case ConvertWarc(String source) -> { + Path sourcePath = Path.of(source); + if (!Files.exists(sourcePath)) + yield new Error("Source path does not exist: " + sourcePath); + + String fileName = sourcePath.toFile().getName(); + + var base = storageService.getStorageBase(FileStorageBaseType.STORAGE); + var processedArea = storageService.allocateTemporaryStorage(base, + FileStorageType.PROCESSED_DATA, "processed-data", + "Processed Warc Data; " + fileName); + + storageService.setFileStorageState(processedArea.id(), FileStorageState.NEW); + + yield new ConvertWait( + processedArea.id(), + mqConverterOutbox.sendAsync(ConvertRequest.forWarc(sourcePath, processedArea.id())) + ); + } case ConvertEncyclopedia(String source, String baseUrl) -> { Path sourcePath = Path.of(source); diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportAtagsActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportAtagsActor.java index 0af77acb..353ef965 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportAtagsActor.java +++ 
b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportAtagsActor.java @@ -63,8 +63,6 @@ public class ExportAtagsActor extends RecordActorPrototype { Path inputDir = storageService.getStorage(crawlId).asPath(); - var reader = new CrawledDomainReader(); - try (var bw = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(Files.newOutputStream(tmpFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)))); ) { @@ -78,7 +76,7 @@ public class ExportAtagsActor extends RecordActorPrototype { } Path crawlDataPath = inputDir.resolve(item.relPath()); - try (var stream = reader.createDataStream(crawlDataPath)) { + try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) { exportLinks(tagWriter, stream); } catch (Exception ex) { diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java index b8bf0a5a..f00bace2 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java @@ -170,6 +170,7 @@ public class IndexQueryService extends IndexApiImplBase { } } + // GRPC endpoint @SneakyThrows public void query(nu.marginalia.index.api.RpcIndexQuery request, diff --git a/code/tools/crawl-data-unfcker/src/main/java/nu/marginalia/tools/CrawlDataUnfcker.java b/code/tools/crawl-data-unfcker/src/main/java/nu/marginalia/tools/CrawlDataUnfcker.java index 1a73a952..4322d3fc 100644 --- a/code/tools/crawl-data-unfcker/src/main/java/nu/marginalia/tools/CrawlDataUnfcker.java +++ b/code/tools/crawl-data-unfcker/src/main/java/nu/marginalia/tools/CrawlDataUnfcker.java @@ -29,13 +29,11 @@ public class CrawlDataUnfcker { return; } - var reader = new CrawledDomainReader(); - try (var wl = new WorkLog(output.resolve("crawler.log"))) { for (var inputItem : WorkLog.iterable(input.resolve("crawler.log"))) { Path inputPath = input.resolve(inputItem.relPath()); - var domainMaybe = readDomain(reader, inputPath).map(CrawledDomain::getDomain); + var domainMaybe = readDomain(inputPath).map(CrawledDomain::getDomain); if (domainMaybe.isEmpty()) continue; var domain = domainMaybe.get(); @@ -43,7 +41,7 @@ public class CrawlDataUnfcker { // Generate conformant ID String newId = Integer.toHexString(domain.hashCode()); - var outputPath = CrawlerOutputFile.createOutputPath(output, newId, domain); + var outputPath = CrawlerOutputFile.createLegacyOutputPath(output, newId, domain); var outputFileName = outputPath.toFile().getName(); System.out.println(inputPath + " -> " + outputPath); @@ -56,13 +54,13 @@ public class CrawlDataUnfcker { } } - static Optional readDomain(CrawledDomainReader reader, Path file) { + static Optional readDomain(Path file) { if (!Files.exists(file)) { System.out.println("Missing file " + file); return Optional.empty(); } - try (var stream = reader.createDataStream(file)) { + try (var stream = CrawledDomainReader.createDataStream(file)) { while (stream.hasNext()) { if (stream.next() instanceof CrawledDomain domain) { return Optional.of(domain); diff --git a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java index 97df4a39..c5751a7a 100644 --- a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java +++ 
diff --git a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java
index 97df4a39..c5751a7a 100644
--- a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java
+++ b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java
@@ -50,10 +50,9 @@ public class ExperimentRunnerMain {
         experiment.args(Arrays.copyOfRange(args, 2, args.length));
         Path basePath = Path.of(args[0]);
-        var reader = new CrawledDomainReader();
         for (var item : WorkLog.iterable(basePath.resolve("crawler.log"))) {
             Path crawlDataPath = basePath.resolve(item.relPath());
-            try (var stream = reader.createDataStream(crawlDataPath)) {
+            try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) {
                 experiment.process(stream);
             }
             catch (Exception ex) {
diff --git a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/LegacyExperiment.java b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/LegacyExperiment.java
index 4e61ffc4..5d7d8d11 100644
--- a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/LegacyExperiment.java
+++ b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/LegacyExperiment.java
@@ -5,12 +5,12 @@ import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawledDomain;
 import java.io.IOException;
-import java.net.URISyntaxException;
 import java.util.ArrayList;
 import java.util.List;
 public abstract class LegacyExperiment extends Experiment {
     public abstract boolean process(CrawledDomain domain);
+    @Override
     public boolean process(SerializableCrawlDataStream dataStream) throws IOException {
         List<CrawledDocument> documentList = new ArrayList<>();
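
For orientation (not part of the patch): under this contract a LegacyExperiment subclass only handles fully assembled CrawledDomain objects, while the adaptation from SerializableCrawlDataStream lives in the base class above. A hypothetical minimal subclass follows; it assumes Experiment's remaining hooks have default implementations, and the class name is made up.

    import nu.marginalia.crawling.model.CrawledDomain;
    import nu.marginalia.tools.LegacyExperiment;

    /** Prints each domain as it is processed; returning true keeps the run going. */
    public class PrintDomainsExperiment extends LegacyExperiment {
        @Override
        public boolean process(CrawledDomain domain) {
            System.out.println("Processing " + domain.getDomain());
            return true;
        }
    }
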
diff --git a/settings.gradle b/settings.gradle
index 342107de..42ae0f47 100644
--- a/settings.gradle
+++ b/settings.gradle
@@ -41,6 +41,7 @@ include 'code:features-convert:topic-detection'
 include 'code:features-crawl:crawl-blocklist'
 include 'code:features-crawl:link-parser'
+include 'code:features-crawl:content-type'
 include 'code:features-index:index-journal'
 include 'code:features-index:index-query'
@@ -154,6 +155,8 @@ dependencyResolutionManagement {
         library('duckdb', 'org.duckdb', 'duckdb_jdbc').version('0.9.1')
         library('okhttp3','com.squareup.okhttp3','okhttp').version('4.11.0')
+        library('jwarc', 'org.netpreserve', 'jwarc').version('0.28.5')
+
         library('httpcomponents.core','org.apache.httpcomponents','httpcore').version('4.4.15')
         library('httpcomponents.client','org.apache.httpcomponents','httpclient').version('4.5.13')
         library('commons.net', 'commons-net','commons-net').version('3.9.0')
diff --git a/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetReader.java b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetReader.java
index 1ec3e7fb..45718fe8 100644
--- a/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetReader.java
+++ b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetReader.java
@@ -13,6 +13,7 @@ import org.apache.parquet.io.DelegatingSeekableInputStream;
 import org.apache.parquet.io.InputFile;
 import org.apache.parquet.io.SeekableInputStream;
 import org.apache.parquet.io.api.GroupConverter;
+import org.apache.parquet.schema.LogicalTypeAnnotation;
 import org.apache.parquet.schema.MessageType;
 import org.apache.parquet.schema.PrimitiveType;
@@ -144,7 +145,11 @@ public final class ParquetReader implements Spliterator, Closeable {
             case BINARY:
             case FIXED_LEN_BYTE_ARRAY:
             case INT96:
-                return primitiveType.stringifier().stringify(columnReader.getBinary());
+                if (primitiveType.getLogicalTypeAnnotation() == null) {
+                    return columnReader.getBinary().getBytes();
+                } else {
+                    return primitiveType.stringifier().stringify(columnReader.getBinary());
+                }
             case BOOLEAN:
                 return columnReader.getBoolean();
             case DOUBLE:
diff --git a/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetWriter.java b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetWriter.java
index 6e53c189..6d9b5734 100644
--- a/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetWriter.java
+++ b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetWriter.java
@@ -242,7 +242,7 @@ public final class ParquetWriter implements Closeable {
                 if (type.getLogicalTypeAnnotation() == LogicalTypeAnnotation.stringType()) {
                     recordConsumer.addBinary(Binary.fromString((String)value));
                 } else {
-                    throw new UnsupportedOperationException("We don't support writing logical annotation type " + type.getLogicalTypeAnnotation());
+                    recordConsumer.addBinary(Binary.fromConstantByteArray((byte[])value));
                 }
                 break;
             default:
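
Taken together, the two parquet-floor patches make BINARY handling symmetric: columns carrying a string logical type annotation keep round-tripping as String, while un-annotated BINARY columns are now read back as raw byte[] and can be written from byte[] instead of throwing. The sketch below illustrates that dispatch rule with the standard Parquet schema API; it is not code from the patch, and the column names are made up.

    import org.apache.parquet.schema.LogicalTypeAnnotation;
    import org.apache.parquet.schema.PrimitiveType;
    import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
    import org.apache.parquet.schema.Types;

    public class BinaryColumnRule {
        /** Mirrors the writer's branch; the reader likewise falls back to raw bytes
         *  only when no logical type annotation is present on the column. */
        static String describe(PrimitiveType type) {
            if (type.getPrimitiveTypeName() != PrimitiveTypeName.BINARY)
                return type.getName() + ": not a BINARY column";

            if (LogicalTypeAnnotation.stringType().equals(type.getLogicalTypeAnnotation()))
                return type.getName() + ": read and written as String";

            return type.getName() + ": read and written as raw byte[]";
        }

        public static void main(String[] args) {
            var urlColumn  = Types.required(PrimitiveTypeName.BINARY)
                                  .as(LogicalTypeAnnotation.stringType())
                                  .named("url");
            var blobColumn = Types.required(PrimitiveTypeName.BINARY)
                                  .named("headerBytes");

            System.out.println(describe(urlColumn));   // url: read and written as String
            System.out.println(describe(blobColumn));  // headerBytes: read and written as raw byte[]
        }
    }
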