(crawler) Refactor

* Restructure the code to make more sense: the fetcher and crawl-logic classes move out of the retreival package into crawl.fetcher and crawl.logic
* Store the full response headers in the crawl data
* Fix a bug in Retry-After handling that assumed the timeout was in milliseconds and then clamped it to a lower bound of 500ms, so the header was almost always handled wrong (a sketch of the corrected behaviour follows below)
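
A minimal sketch of the corrected Retry-After behaviour, mirroring the RateLimitException and CrawlDelayTimer changes in the diff below: the header value is parsed as seconds into a Duration, and the resulting delay is clamped to a one-to-five-second window before sleeping. The class and method names here are illustrative only, not part of the commit.

import java.time.Duration;

// Illustrative sketch (hypothetical names) of the corrected Retry-After handling.
class RetryAfterSketch {
    /** Retry-After carries seconds, not milliseconds. */
    static Duration parseRetryAfter(String headerValue) {
        try {
            return Duration.ofSeconds(Integer.parseInt(headerValue));
        } catch (NumberFormatException ex) {
            return Duration.ofSeconds(1); // fall back to a short delay
        }
    }

    /** Clamp the server-requested delay to a sane window before sleeping. */
    static void waitRetryDelay(String retryAfterHeader) throws InterruptedException {
        Duration delay = parseRetryAfter(retryAfterHeader);
        if (delay.compareTo(Duration.ofSeconds(1)) < 0) delay = Duration.ofSeconds(1);
        if (delay.compareTo(Duration.ofSeconds(5)) > 0) delay = Duration.ofSeconds(5);
        Thread.sleep(delay.toMillis());
    }

    public static void main(String[] args) throws InterruptedException {
        // A 429 response with "Retry-After: 2" now waits two seconds,
        // where the old code interpreted "2" as milliseconds and clamped it upward.
        waitRetryDelay("2");
    }
}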
Viktor Lofgren 2024-09-23 17:51:07 +02:00
parent 9c292a4f62
commit e9854f194c
44 changed files with 398 additions and 337 deletions

View File

@ -7,11 +7,11 @@ import nu.marginalia.UserAgent;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.converting.processor.DomainProcessor;
import nu.marginalia.crawl.fetcher.HttpFetcher;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.DomainProber;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.io.crawldata.format.ParquetSerializableCrawlDataStream;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.crawl.DomainIndexingState;
@ -266,7 +266,7 @@ public class CrawlingThenConvertingIntegrationTest {
List<SerializableCrawlData> data = new ArrayList<>();
try (var recorder = new WarcRecorder(fileName)) {
new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, recorder).fetch();
new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, recorder).crawlDomain();
}
CrawledDocumentParquetRecordFileWriter.convertWarc(specs.domain,

View File

@ -10,12 +10,12 @@ import nu.marginalia.UserAgent;
import nu.marginalia.WmsaHome;
import nu.marginalia.atags.source.AnchorTagsSource;
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.logic.DomainLocks;
import nu.marginalia.crawl.retreival.CrawlDataReference;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.DomainLocks;
import nu.marginalia.crawl.retreival.DomainProber;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.spec.CrawlSpecProvider;
import nu.marginalia.crawl.spec.DbCrawlSpecProvider;
import nu.marginalia.crawl.spec.ParquetCrawlSpecProvider;
@ -294,7 +294,7 @@ public class CrawlerMain extends ProcessMainClass {
Files.delete(tempFile);
}
int size = retriever.fetch(domainLinks, reference);
int size = retriever.crawlDomain(domainLinks, reference);
// Delete the reference crawl data if it's not the same as the new one
// (mostly a case when migrating from legacy->warc)

View File

@ -1,4 +1,4 @@
package nu.marginalia.crawl.retreival.fetcher;
package nu.marginalia.crawl.fetcher;
import okhttp3.Request;

View File

@ -1,4 +1,4 @@
package nu.marginalia.crawl.retreival;
package nu.marginalia.crawl.fetcher;
import okhttp3.Cookie;
import okhttp3.CookieJar;

View File

@ -1,9 +1,8 @@
package nu.marginalia.crawl.retreival.fetcher;
package nu.marginalia.crawl.fetcher;
import com.google.inject.ImplementedBy;
import crawlercommons.robots.SimpleRobotRules;
import nu.marginalia.crawl.retreival.RateLimitException;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.HttpFetchResult;
@ -17,12 +16,12 @@ public interface HttpFetcher {
List<String> getCookies();
void clearCookies();
FetchResult probeDomain(EdgeUrl url);
HttpFetcherImpl.ProbeResult probeDomain(EdgeUrl url);
HttpFetchResult fetchContent(EdgeUrl url,
WarcRecorder recorder,
ContentTags tags,
ProbeType probeType) throws RateLimitException;
ProbeType probeType) throws HttpFetcherImpl.RateLimitException;
SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder);

View File

@ -1,22 +1,23 @@
package nu.marginalia.crawl.retreival.fetcher;
package nu.marginalia.crawl.fetcher;
import com.google.inject.Inject;
import crawlercommons.robots.SimpleRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;
import lombok.SneakyThrows;
import nu.marginalia.UserAgent;
import nu.marginalia.crawl.retreival.Cookies;
import nu.marginalia.crawl.retreival.RateLimitException;
import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeResult;
import nu.marginalia.crawl.retreival.fetcher.socket.FastTerminatingSocketFactory;
import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.crawl.retreival.fetcher.socket.NoSecuritySSL;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.fetcher.socket.FastTerminatingSocketFactory;
import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.crawl.fetcher.socket.NoSecuritySSL;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.logic.ContentTypeProber;
import nu.marginalia.crawl.logic.ContentTypeProber.ContentTypeProbeResult;
import nu.marginalia.crawl.logic.SoftIfModifiedSinceProber;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.ContentTypeLogic;
import nu.marginalia.model.body.DocumentBodyExtractor;
import nu.marginalia.model.body.HttpFetchResult;
import nu.marginalia.model.crawldata.CrawlerDomainStatus;
import okhttp3.ConnectionPool;
import okhttp3.Dispatcher;
import okhttp3.OkHttpClient;
@ -25,6 +26,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.net.ssl.X509TrustManager;
import java.time.Duration;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
@ -114,7 +116,7 @@ public class HttpFetcherImpl implements HttpFetcher {
*/
@Override
@SneakyThrows
public FetchResult probeDomain(EdgeUrl url) {
public ProbeResult probeDomain(EdgeUrl url) {
var head = new Request.Builder().head().addHeader("User-agent", userAgentString)
.url(url.toString())
.build();
@ -125,9 +127,9 @@ public class HttpFetcherImpl implements HttpFetcher {
EdgeUrl requestUrl = new EdgeUrl(rsp.request().url().toString());
if (!Objects.equals(requestUrl.domain, url.domain)) {
return new FetchResult(FetchResultState.REDIRECT, requestUrl);
return new ProbeResultRedirect(url.domain);
}
return new FetchResult(FetchResultState.OK, requestUrl);
return new ProbeResultOk(requestUrl);
}
catch (Exception ex) {
@ -136,7 +138,7 @@ public class HttpFetcherImpl implements HttpFetcher {
}
logger.info("Error during fetching {}", ex.getMessage());
return new FetchResult(FetchResultState.ERROR, url);
return new ProbeResultError(CrawlerDomainStatus.ERROR, ex.getMessage());
}
}
@ -196,8 +198,7 @@ public class HttpFetcherImpl implements HttpFetcher {
if (result instanceof HttpFetchResult.ResultOk ok) {
if (ok.statusCode() == 429) {
String retryAfter = Objects.requireNonNullElse(ok.header("Retry-After"), "1000");
throw new RateLimitException(retryAfter);
throw new RateLimitException(Objects.requireNonNullElse(ok.header("Retry-After"), "1"));
}
if (ok.statusCode() == 304) {
return new HttpFetchResult.Result304Raw();
@ -249,5 +250,44 @@ public class HttpFetcherImpl implements HttpFetcher {
}
public sealed interface ProbeResult permits ProbeResultError, ProbeResultRedirect, ProbeResultOk {}
/** The probing failed for one reason or another
* @param status Machine readable status
* @param desc Human-readable description of the error
*/
public record ProbeResultError(CrawlerDomainStatus status, String desc) implements ProbeResult {}
/** This domain redirects to another domain */
public record ProbeResultRedirect(EdgeDomain domain) implements ProbeResult {}
/** If the retrieval of the probed url was successful, return the url as it was fetched
* (which may be different from the url we probed, if we attempted another URL schema).
*
* @param probedUrl The url we successfully probed
*/
public record ProbeResultOk(EdgeUrl probedUrl) implements ProbeResult {}
/** Exception thrown when the server signals the rate limit is exceeded */
public static class RateLimitException extends Exception {
private final String retryAfter;
public RateLimitException(String retryAfterHeader) {
this.retryAfter = retryAfterHeader;
}
@Override
public StackTraceElement[] getStackTrace() { return new StackTraceElement[0]; }
public Duration retryAfter() {
try {
return Duration.ofSeconds(Integer.parseInt(retryAfter));
}
catch (NumberFormatException ex) {
return Duration.ofSeconds(1);
}
}
}
}

View File

@ -1,4 +1,4 @@
package nu.marginalia.crawl.retreival.fetcher;
package nu.marginalia.crawl.fetcher;
import crawlercommons.sitemaps.*;
import nu.marginalia.model.EdgeUrl;

View File

@ -1,4 +1,4 @@
package nu.marginalia.crawl.retreival.fetcher.socket;
package nu.marginalia.crawl.fetcher.socket;
import javax.net.SocketFactory;
import java.io.IOException;

View File

@ -1,4 +1,4 @@
package nu.marginalia.crawl.retreival.fetcher.socket;
package nu.marginalia.crawl.fetcher.socket;
import okhttp3.Interceptor;
import okhttp3.Response;

View File

@ -1,4 +1,4 @@
package nu.marginalia.crawl.retreival.fetcher.socket;
package nu.marginalia.crawl.fetcher.socket;
import lombok.SneakyThrows;

View File

@ -1,4 +1,4 @@
package nu.marginalia.crawl.retreival.fetcher.warc;
package nu.marginalia.crawl.fetcher.warc;
import org.netpreserve.jwarc.WarcDigest;

View File

@ -1,4 +1,4 @@
package nu.marginalia.crawl.retreival.fetcher.warc;
package nu.marginalia.crawl.fetcher.warc;
import okhttp3.Headers;
import okhttp3.Response;

View File

@ -1,7 +1,6 @@
package nu.marginalia.crawl.retreival.fetcher.warc;
package nu.marginalia.crawl.fetcher.warc;
import okhttp3.Protocol;
import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils;
@ -73,7 +72,7 @@ public class WarcProtocolReconstructor {
String headerString = getHeadersAsString(headersAsString);
return STR."HTTP/\{version} \{statusCode} \{statusMessage}\r\n\{headerString}\r\n\r\n";
return "HTTP/" + version + " " + statusCode + " " + statusMessage + "\r\n" + headerString + "\r\n\r\n";
}
static String getResponseHeader(Response response, long size) {
@ -84,7 +83,7 @@ public class WarcProtocolReconstructor {
String headerString = getHeadersAsString(response, size);
return STR."HTTP/\{version} \{statusCode} \{statusMessage}\r\n\{headerString}\r\n\r\n";
return "HTTP/" + version + " " + statusCode + " " + statusMessage + "\r\n" + headerString + "\r\n\r\n";
}
private static final Map<Integer, String> STATUS_CODE_MAP = Map.ofEntries(

View File

@ -1,13 +1,14 @@
package nu.marginalia.crawl.retreival.fetcher.warc;
package nu.marginalia.crawl.fetcher.warc;
import nu.marginalia.crawl.retreival.DomainProber;
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.HttpFetchResult;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import org.jetbrains.annotations.Nullable;
import org.netpreserve.jwarc.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -183,7 +184,7 @@ public class WarcRecorder implements AutoCloseable {
writer.write(item);
}
private void saveOldResponse(EdgeUrl url, String contentType, int statusCode, String documentBody, ContentTags contentTags) {
private void saveOldResponse(EdgeUrl url, String contentType, int statusCode, String documentBody, @Nullable String headers, ContentTags contentTags) {
try {
WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder();
WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder();
@ -192,24 +193,42 @@ public class WarcRecorder implements AutoCloseable {
if (documentBody == null) {
bytes = new byte[0];
}
else {
} else {
bytes = documentBody.getBytes();
}
StringJoiner fakeHeadersBuilder = new StringJoiner("\n");
// Create a synthesis of custom headers and the original headers
// to create a new set of headers that will be written to the WARC file.
fakeHeadersBuilder.add(STR."Content-Type: \{contentType}");
fakeHeadersBuilder.add(STR."Content-Length: \{bytes.length}");
StringJoiner syntheticHeadersBuilder = new StringJoiner("\n");
syntheticHeadersBuilder.add("Content-Type: " + contentType);
syntheticHeadersBuilder.add("Content-Length: " + bytes.length);
if (contentTags.etag() != null) {
fakeHeadersBuilder.add(STR."ETag: \{contentTags.etag()}");
syntheticHeadersBuilder.add("ETag: " + contentTags.etag());
}
if (contentTags.lastMod() != null) {
fakeHeadersBuilder.add(STR."Last-Modified: \{contentTags.lastMod()}");
syntheticHeadersBuilder.add("Last-Modified: " + contentTags.lastMod());
}
// Grab the headers from the original response and add them to the fake headers if they are not
// Content-Type, Content-Length, ETag, or Last-Modified
for (String headerLine : Objects.requireNonNullElse(headers, "").split("\n")) {
if (headerLine.isBlank()) continue;
var lowerCase = headerLine.toLowerCase();
if (lowerCase.startsWith("content-type:")) continue;
if (lowerCase.startsWith("content-length:")) continue;
if (contentTags.etag() != null && lowerCase.startsWith("etag:")) continue;
if (contentTags.lastMod() != null && lowerCase.startsWith("last-modified:")) continue;
syntheticHeadersBuilder.add(headerLine);
}
byte[] header = WarcProtocolReconstructor
.getResponseHeader(fakeHeadersBuilder.toString(), statusCode)
.getResponseHeader(syntheticHeadersBuilder.toString(), statusCode)
.getBytes(StandardCharsets.UTF_8);
ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(bytes.length + header.length);
responseDataBuffer.put(header);
@ -244,25 +263,25 @@ public class WarcRecorder implements AutoCloseable {
* an E-Tag or Last-Modified header, and the server responds with a 304 Not Modified. In this
* scenario we want to record the data as it was in the previous crawl, but not re-fetch it.
*/
public void writeReferenceCopy(EdgeUrl url, String contentType, int statusCode, String documentBody, ContentTags ctags) {
saveOldResponse(url, contentType, statusCode, documentBody, ctags);
public void writeReferenceCopy(EdgeUrl url, String contentType, int statusCode, String documentBody, @Nullable String headers, ContentTags ctags) {
saveOldResponse(url, contentType, statusCode, documentBody, headers, ctags);
}
public void writeWarcinfoHeader(String ip, EdgeDomain domain, DomainProber.ProbeResult result) throws IOException {
public void writeWarcinfoHeader(String ip, EdgeDomain domain, HttpFetcherImpl.ProbeResult result) throws IOException {
Map<String, List<String>> fields = new HashMap<>();
fields.put("ip", List.of(ip));
fields.put("software", List.of(STR."search.marginalia.nu/\{warcRecorderVersion}"));
fields.put("software", List.of("search.marginalia.nu/" + warcRecorderVersion));
fields.put("domain", List.of(domain.toString()));
switch (result) {
case DomainProber.ProbeResultRedirect redirectDomain:
fields.put("X-WARC-Probe-Status", List.of(STR."REDIRECT;\{redirectDomain.domain()}"));
case HttpFetcherImpl.ProbeResultRedirect redirectDomain:
fields.put("X-WARC-Probe-Status", List.of("REDIRECT;" + redirectDomain.domain()));
break;
case DomainProber.ProbeResultError error:
fields.put("X-WARC-Probe-Status", List.of(STR."\{error.status().toString()};\{error.desc()}"));
case HttpFetcherImpl.ProbeResultError error:
fields.put("X-WARC-Probe-Status", List.of(error.status().toString() + ";" + error.desc()));
break;
case DomainProber.ProbeResultOk ok:
case HttpFetcherImpl.ProbeResultOk ok:
fields.put("X-WARC-Probe-Status", List.of("OK"));
break;
}

View File

@ -1,4 +1,4 @@
package nu.marginalia.crawl.retreival.fetcher;
package nu.marginalia.crawl.logic;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.ContentTypeLogic;
@ -7,7 +7,7 @@ import okhttp3.Request;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.net.SocketTimeoutException;
import java.io.InterruptedIOException;
import java.util.Objects;
public class ContentTypeProber {
@ -68,7 +68,7 @@ public class ContentTypeProber {
return new ContentTypeProbeResult.Ok(ret);
} catch (SocketTimeoutException ex) {
} catch (InterruptedIOException ex) {
return new ContentTypeProbeResult.Timeout(ex);
} catch (Exception ex) {
logger.error("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage());

View File

@ -1,4 +1,4 @@
package nu.marginalia.crawl.retreival;
package nu.marginalia.crawl.logic;
import nu.marginalia.model.EdgeDomain;

View File

@ -1,4 +1,4 @@
package nu.marginalia.crawl.retreival;
package nu.marginalia.crawl.logic;
import nu.marginalia.model.EdgeUrl;
import org.jsoup.nodes.Document;
@ -26,6 +26,20 @@ public class LinkFilterSelector {
if (isDiscourse(head)) {
return url -> url.path.startsWith("/t/") || url.path.contains("/latest");
}
if (isMediawiki(head)) {
return url -> {
if (url.path.endsWith(".php")) {
return false;
}
if (url.path.contains("Special:")) {
return false;
}
if (url.path.contains("Talk:")) {
return false;
}
return true;
};
}
return LinkFilterSelector::defaultFilter;
}

View File

@ -1,6 +1,7 @@
package nu.marginalia.crawl.retreival.fetcher;
package nu.marginalia.crawl.logic;
import com.google.common.base.Strings;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.model.EdgeUrl;
import okhttp3.OkHttpClient;
import okhttp3.Request;

View File

@ -1,6 +1,9 @@
package nu.marginalia.crawl.retreival;
import lombok.SneakyThrows;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import java.time.Duration;
import static java.lang.Math.max;
import static java.lang.Math.min;
@ -22,12 +25,21 @@ public class CrawlDelayTimer {
/** Call when we've gotten an HTTP 429 response. This will wait a moment, and then
* set a flag that slows down the main crawl delay as well. */
public void waitRetryDelay(RateLimitException ex) throws InterruptedException {
public void waitRetryDelay(HttpFetcherImpl.RateLimitException ex) throws InterruptedException {
slowDown = true;
int delay = ex.retryAfter();
Duration delay = ex.retryAfter();
Thread.sleep(Math.clamp(delay, 100, 5000));
if (delay.compareTo(Duration.ofSeconds(1)) < 0) {
// If the server wants us to retry in less than a second, we'll just wait a bit
delay = Duration.ofSeconds(1);
}
else if (delay.compareTo(Duration.ofSeconds(5)) > 0) {
// If the server wants us to retry in more than five seconds, cap the delay at five seconds
delay = Duration.ofSeconds(5);
}
Thread.sleep(delay.toMillis());
}
@SneakyThrows

View File

@ -1,91 +0,0 @@
package nu.marginalia.crawl.retreival;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.HttpFetchResult;
import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.model.crawldata.CrawlerDocumentStatus;
import java.time.LocalDateTime;
import java.util.Objects;
public class CrawledDocumentFactory {
public static CrawledDocument createHardErrorRsp(EdgeUrl url, Exception why) {
return CrawledDocument.builder()
.crawlerStatus(CrawlerDocumentStatus.ERROR.toString())
.crawlerStatusDesc(why.getClass().getSimpleName() + ": " + why.getMessage())
.timestamp(LocalDateTime.now().toString())
.url(url.toString())
.build();
}
public static CrawledDocument createUnknownHostError(EdgeUrl url) {
return CrawledDocument.builder()
.crawlerStatus(CrawlerDocumentStatus.ERROR.toString())
.crawlerStatusDesc("Unknown Host")
.timestamp(LocalDateTime.now().toString())
.url(url.toString())
.build();
}
public static CrawledDocument createTimeoutErrorRsp(EdgeUrl url) {
return CrawledDocument.builder()
.crawlerStatus("Timeout")
.timestamp(LocalDateTime.now().toString())
.url(url.toString())
.build();
}
public static CrawledDocument createErrorResponse(EdgeUrl url, HttpFetchResult.ResultOk rsp, CrawlerDocumentStatus status, String why) {
return CrawledDocument.builder()
.crawlerStatus(status.toString())
.crawlerStatusDesc(why)
.headers(rsp.headers().toString())
.contentType(Objects.requireNonNullElse(rsp.headers().get("Content-Type"), ""))
.timestamp(LocalDateTime.now().toString())
.httpStatus(rsp.statusCode())
.url(url.toString())
.build();
}
public static CrawledDocument createErrorResponse(EdgeUrl url, String contentType, int statusCode, CrawlerDocumentStatus status, String why) {
return CrawledDocument.builder()
.crawlerStatus(status.toString())
.crawlerStatusDesc(why)
.headers("")
.contentType(contentType)
.timestamp(LocalDateTime.now().toString())
.httpStatus(statusCode)
.url(url.toString())
.build();
}
public static CrawledDocument createRedirectResponse(EdgeUrl url, HttpFetchResult.ResultOk rsp, EdgeUrl responseUrl) {
return CrawledDocument.builder()
.crawlerStatus(CrawlerDocumentStatus.REDIRECT.name())
.redirectUrl(responseUrl.toString())
.headers(rsp.headers().toString())
.contentType(Objects.requireNonNullElse(rsp.headers().get("Content-Type"), ""))
.timestamp(LocalDateTime.now().toString())
.httpStatus(rsp.statusCode())
.url(url.toString())
.build();
}
public static CrawledDocument createRobotsError(EdgeUrl url) {
return CrawledDocument.builder()
.url(url.toString())
.timestamp(LocalDateTime.now().toString())
.httpStatus(-1)
.crawlerStatus(CrawlerDocumentStatus.ROBOTS_TXT.name())
.build();
}
public static CrawledDocument createRetryError(EdgeUrl url) {
return CrawledDocument.builder()
.url(url.toString())
.timestamp(LocalDateTime.now().toString())
.httpStatus(429)
.crawlerStatus(CrawlerDocumentStatus.ERROR.name())
.build();
}
}

View File

@ -3,9 +3,11 @@ package nu.marginalia.crawl.retreival;
import crawlercommons.robots.SimpleRobotRules;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.contenttype.ContentType;
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.HttpFetcher;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.logic.LinkFilterSelector;
import nu.marginalia.crawl.retreival.revisit.CrawlerRevisitor;
import nu.marginalia.crawl.retreival.revisit.DocumentWithReference;
import nu.marginalia.crawl.retreival.sitemap.SitemapFetcher;
@ -14,8 +16,6 @@ import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.HttpFetchResult;
import nu.marginalia.model.crawldata.CrawledDomain;
import nu.marginalia.model.crawldata.CrawlerDomainStatus;
import nu.marginalia.model.crawlspec.CrawlSpecRecord;
import org.jsoup.Jsoup;
import org.slf4j.Logger;
@ -25,7 +25,6 @@ import java.io.IOException;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
@ -84,11 +83,11 @@ public class CrawlerRetreiver implements AutoCloseable {
return crawlFrontier;
}
public int fetch() {
return fetch(new DomainLinks(), new CrawlDataReference());
public int crawlDomain() {
return crawlDomain(new DomainLinks(), new CrawlDataReference());
}
public int fetch(DomainLinks domainLinks, CrawlDataReference oldCrawlData) {
public int crawlDomain(DomainLinks domainLinks, CrawlDataReference oldCrawlData) {
try {
return crawlDomain(oldCrawlData, domainLinks);
}
@ -98,28 +97,11 @@ public class CrawlerRetreiver implements AutoCloseable {
}
}
public void syncAbortedRun(Path warcFile) {
var resync = new CrawlerWarcResynchronizer(crawlFrontier, warcRecorder);
resync.run(warcFile);
}
private DomainProber.ProbeResult probeRootUrl(String ip) throws IOException {
// Construct an URL to the root of the domain, we don't know the schema yet so we'll
// start with http and then try https if that fails
var httpUrl = new EdgeUrl("http", new EdgeDomain(domain), null, "/", null);
final DomainProber.ProbeResult probeResult = domainProber.probeDomain(fetcher, domain, httpUrl);
warcRecorder.writeWarcinfoHeader(ip, new EdgeDomain(domain), probeResult);
return probeResult;
}
private int crawlDomain(CrawlDataReference oldCrawlData, DomainLinks domainLinks) throws IOException, InterruptedException {
String ip = findIp(domain);
EdgeUrl rootUrl;
if (probeRootUrl(ip) instanceof DomainProber.ProbeResultOk ok) rootUrl = ok.probedUrl();
if (probeRootUrl(ip) instanceof HttpFetcherImpl.ProbeResultOk ok) rootUrl = ok.probedUrl();
else return 1;
// Sleep after the initial probe, we don't have access to the robots.txt yet
@ -130,12 +112,13 @@ public class CrawlerRetreiver implements AutoCloseable {
final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay());
delayTimer.waitFetchDelay(0); // initial delay after robots.txt
sniffRootDocument(rootUrl, delayTimer);
// Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
int recrawled = crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer);
int fetchedCount = crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer);
if (recrawled > 0) {
if (fetchedCount > 0) {
// If we have reference data, we will always grow the crawl depth a bit
crawlFrontier.increaseDepth(1.5, 2500);
}
@ -146,15 +129,6 @@ public class CrawlerRetreiver implements AutoCloseable {
// Add links from the sitemap to the crawl frontier
sitemapFetcher.downloadSitemaps(robotsRules, rootUrl);
CrawledDomain ret = new CrawledDomain(domain,
null,
CrawlerDomainStatus.OK.name(),
null,
ip,
new ArrayList<>(),
null);
int fetchedCount = recrawled;
while (!crawlFrontier.isEmpty()
&& !crawlFrontier.isCrawlDepthReached()
@ -186,7 +160,6 @@ public class CrawlerRetreiver implements AutoCloseable {
if (!crawlFrontier.addVisited(top))
continue;
try {
if (fetchContentWithReference(top, delayTimer, DocumentWithReference.empty()).isOk()) {
fetchedCount++;
@ -198,15 +171,28 @@ public class CrawlerRetreiver implements AutoCloseable {
}
}
ret.cookies = fetcher.getCookies();
return fetchedCount;
}
public void syncAbortedRun(Path warcFile) {
var resync = new CrawlerWarcResynchronizer(crawlFrontier, warcRecorder);
resync.run(warcFile);
}
private HttpFetcherImpl.ProbeResult probeRootUrl(String ip) throws IOException {
// Construct an URL to the root of the domain, we don't know the schema yet so we'll
// start with http and then try https if that fails
var httpUrl = new EdgeUrl("http", new EdgeDomain(domain), null, "/", null);
final HttpFetcherImpl.ProbeResult probeResult = domainProber.probeDomain(fetcher, domain, httpUrl);
warcRecorder.writeWarcinfoHeader(ip, new EdgeDomain(domain), probeResult);
return probeResult;
}
private void sniffRootDocument(EdgeUrl rootUrl, CrawlDelayTimer timer) {
try {
logger.debug("Configuring link filter");
var url = rootUrl.withPathAndParam("/", null);
HttpFetchResult result = fetchWithRetry(url, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
@ -291,7 +277,7 @@ public class CrawlerRetreiver implements AutoCloseable {
else if (fetchedDoc instanceof HttpFetchResult.Result304Raw && reference.doc() != null) {
var doc = reference.doc();
warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBody, contentTags);
warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBody, doc.headers, contentTags);
fetchedDoc = new HttpFetchResult.Result304ReplacedWithReference(doc.url,
new ContentType(doc.contentType, "UTF-8"),
@ -326,7 +312,7 @@ public class CrawlerRetreiver implements AutoCloseable {
try {
return fetcher.fetchContent(url, warcRecorder, contentTags, probeType);
}
catch (RateLimitException ex) {
catch (HttpFetcherImpl.RateLimitException ex) {
timer.waitRetryDelay(ex);
}
catch (Exception ex) {

View File

@ -1,6 +1,6 @@
package nu.marginalia.crawl.retreival;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.DocumentBodyExtractor;
import nu.marginalia.model.body.HttpFetchResult;
@ -38,7 +38,7 @@ public class CrawlerWarcResynchronizer {
accept(item);
}
} catch (Exception e) {
logger.info(STR."(Expected) Failed read full warc file \{tempFile}: \{e.getClass().getSimpleName()} \{e.getMessage()}");
logger.info("(Expected) Failed read full warc file " + tempFile + ": " + e.getClass().getSimpleName() + " " + e.getMessage());
}
// Second pass, copy records to the new warc file
@ -47,7 +47,7 @@ public class CrawlerWarcResynchronizer {
recorder.resync(item);
}
} catch (Exception e) {
logger.info(STR."(Expected) Failed read full warc file \{tempFile}: \{e.getClass().getSimpleName()} \{e.getMessage()}");
logger.info("(Expected) Failed read full warc file " + tempFile + ": " + e.getClass().getSimpleName() + " " + e.getMessage());
}
}
@ -63,7 +63,7 @@ public class CrawlerWarcResynchronizer {
}
catch (Exception ex) {
logger.info(STR."Failed to process warc record \{item}", ex);
logger.info("Failed to process warc record " + item, ex);
}
}
@ -78,7 +78,8 @@ public class CrawlerWarcResynchronizer {
}
private void request(WarcRequest request) {
EdgeUrl.parse(request.target()).ifPresent(crawlFrontier::addVisited);
var url = new EdgeUrl(request.targetURI());
crawlFrontier.addVisited(url);
}
private void response(WarcResponse rsp) {
@ -97,7 +98,7 @@ public class CrawlerWarcResynchronizer {
});
}
catch (Exception e) {
logger.info(STR."Failed to parse response body for \{url}", e);
logger.info("Failed to parse response body for " + url, e);
}
}

View File

@ -9,9 +9,14 @@ import nu.marginalia.model.EdgeUrl;
import org.jsoup.nodes.Document;
import java.net.URISyntaxException;
import java.util.*;
import java.util.ArrayDeque;
import java.util.Collection;
import java.util.Objects;
import java.util.function.Predicate;
/** Encapsulates the crawl frontier for a single domain,
* that is information about known and visited URLs
*/
public class DomainCrawlFrontier {
private static final LinkParser linkParser = new LinkParser();

View File

@ -2,8 +2,8 @@ package nu.marginalia.crawl.retreival;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.crawl.retreival.fetcher.FetchResultState;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawl.fetcher.HttpFetcher;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.ip_blocklist.IpBlockList;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
@ -34,43 +34,18 @@ public class DomainProber {
* doesn't immediately redirect to another domain (which should be crawled separately, not under the name
* of this domain).
*/
public ProbeResult probeDomain(HttpFetcher fetcher, String domain, @Nullable EdgeUrl firstUrlInQueue) {
public HttpFetcherImpl.ProbeResult probeDomain(HttpFetcher fetcher, String domain, @Nullable EdgeUrl firstUrlInQueue) {
if (firstUrlInQueue == null) {
logger.warn("No valid URLs for domain {}", domain);
return new ProbeResultError(CrawlerDomainStatus.ERROR, "No known URLs");
return new HttpFetcherImpl.ProbeResultError(CrawlerDomainStatus.ERROR, "No known URLs");
}
if (!domainBlacklist.test(firstUrlInQueue.domain))
return new ProbeResultError(CrawlerDomainStatus.BLOCKED, "IP not allowed");
return new HttpFetcherImpl.ProbeResultError(CrawlerDomainStatus.BLOCKED, "IP not allowed");
var fetchResult = fetcher.probeDomain(firstUrlInQueue.withPathAndParam("/", null));
if (fetchResult.ok())
return new ProbeResultOk(fetchResult.url);
if (fetchResult.state == FetchResultState.REDIRECT)
return new ProbeResultRedirect(fetchResult.domain);
return new ProbeResultError(CrawlerDomainStatus.ERROR, "Bad status");
return fetcher.probeDomain(firstUrlInQueue.withPathAndParam("/", null));
}
public sealed interface ProbeResult permits ProbeResultError, ProbeResultRedirect, ProbeResultOk {}
/** The probing failed for one reason or another
* @param status Machine readable status
* @param desc Human-readable description of the error
*/
public record ProbeResultError(CrawlerDomainStatus status, String desc) implements ProbeResult {}
/** This domain redirects to another domain */
public record ProbeResultRedirect(EdgeDomain domain) implements ProbeResult {}
/** If the retrieval of the probed url was successful, return the url as it was fetched
* (which may be different from the url we probed, if we attempted another URL schema).
*
* @param probedUrl The url we successfully probed
*/
public record ProbeResultOk(EdgeUrl probedUrl) implements ProbeResult {}
}

View File

@ -1,21 +0,0 @@
package nu.marginalia.crawl.retreival;
public class RateLimitException extends Exception {
private final String retryAfter;
public RateLimitException(String retryAfter) {
this.retryAfter = retryAfter;
}
@Override
public StackTraceElement[] getStackTrace() { return new StackTraceElement[0]; }
public int retryAfter() {
try {
return Integer.parseInt(retryAfter);
}
catch (NumberFormatException ex) {
return 1000;
}
}
}

View File

@ -1,24 +0,0 @@
package nu.marginalia.crawl.retreival.fetcher;
import lombok.AllArgsConstructor;
import lombok.ToString;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
@AllArgsConstructor
@ToString
public class FetchResult {
public final FetchResultState state;
public final EdgeUrl url;
public final EdgeDomain domain;
public FetchResult(FetchResultState state, EdgeUrl url) {
this.state = state;
this.url = url;
this.domain = url.domain;
}
public boolean ok() {
return state == FetchResultState.OK;
}
}

View File

@ -1,7 +0,0 @@
package nu.marginalia.crawl.retreival.fetcher;
public enum FetchResultState {
OK,
REDIRECT,
ERROR
}

View File

@ -2,12 +2,12 @@ package nu.marginalia.crawl.retreival.revisit;
import com.google.common.base.Strings;
import crawlercommons.robots.SimpleRobotRules;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.retreival.CrawlDataReference;
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.DomainCrawlFrontier;
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.HttpFetchResult;
import nu.marginalia.model.crawldata.CrawledDocument;
@ -125,6 +125,7 @@ public class CrawlerRevisitor {
doc.contentType,
doc.httpStatus,
doc.documentBody,
doc.headers,
new ContentTags(doc.etagMaybe, doc.lastModifiedMaybe)
);

View File

@ -1,7 +1,7 @@
package nu.marginalia.crawl.retreival.revisit;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.retreival.CrawlDataReference;
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
import nu.marginalia.model.body.DocumentBodyExtractor;
import nu.marginalia.model.body.DocumentBodyResult;
import nu.marginalia.model.body.HttpFetchResult;

View File

@ -1,8 +1,8 @@
package nu.marginalia.crawl.retreival.sitemap;
import crawlercommons.robots.SimpleRobotRules;
import nu.marginalia.crawl.fetcher.SitemapRetriever;
import nu.marginalia.crawl.retreival.DomainCrawlFrontier;
import nu.marginalia.crawl.retreival.fetcher.SitemapRetriever;
import nu.marginalia.model.EdgeUrl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

View File

@ -144,7 +144,7 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
nextRecord.httpStatus,
status.toString(),
"",
"",
nextRecord.headers,
bodyString,
Long.toHexString(hash.hashNearlyASCII(bodyString)), // this field isn't actually used, maybe we can skip calculating it?
nextRecord.url,

View File

@ -23,7 +23,6 @@ public class CrawledDocument implements SerializableCrawlData {
public String crawlerStatusDesc;
@Nullable
@Deprecated // use getETag() or getLastModified() instead
public String headers;
public String documentBody;

View File

@ -29,7 +29,11 @@ public class CrawledDocumentParquetRecord {
public String contentType;
public byte[] body;
public String headers;
@Deprecated // will be replaced with the full headers field in the future
public String etagHeader;
@Deprecated // will be replaced with the full headers field in the future
public String lastModifiedHeader;
public static Hydrator<CrawledDocumentParquetRecord, CrawledDocumentParquetRecord> newHydrator() {
@ -51,7 +55,8 @@ public class CrawledDocumentParquetRecord {
Types.required(BINARY).as(stringType()).named("contentType"),
Types.required(BINARY).named("body"),
Types.optional(BINARY).as(stringType()).named("etagHeader"),
Types.optional(BINARY).as(stringType()).named("lastModifiedHeader")
Types.optional(BINARY).as(stringType()).named("lastModifiedHeader"),
Types.optional(BINARY).as(stringType()).named("headers")
);
@ -67,6 +72,7 @@ public class CrawledDocumentParquetRecord {
case "epochSeconds" -> timestamp = Instant.ofEpochSecond((Long) value);
case "etagHeader" -> etagHeader = (String) value;
case "lastModifiedHeader" -> lastModifiedHeader = (String) value;
case "headers" -> headers = (String) value;
default -> throw new UnsupportedOperationException("Unknown heading '" + heading + '"');
}
@ -82,6 +88,9 @@ public class CrawledDocumentParquetRecord {
valueWriter.write("cookies", cookies);
valueWriter.write("contentType", contentType);
valueWriter.write("body", body);
if (headers != null) {
valueWriter.write("headers", headers);
}
if (etagHeader != null) {
valueWriter.write("etagHeader", etagHeader);
}

View File

@ -16,6 +16,7 @@ import java.nio.file.Path;
import java.time.Instant;
import java.util.List;
import java.util.Objects;
import java.util.StringJoiner;
public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
private final ParquetWriter<CrawledDocumentParquetRecord> writer;
@ -150,6 +151,14 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
contentType = "";
}
String headersStr = null;
StringJoiner headersStrBuilder = new StringJoiner("\n");
for (var header : headers) {
headersStrBuilder.add(header.getFirst() + ": " + header.getSecond());
}
headersStr = headersStrBuilder.toString();
write(new CrawledDocumentParquetRecord(
domain,
response.target(),
@ -159,6 +168,7 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
response.date(),
contentType,
bodyBytes,
headersStr,
headers.get("ETag"),
headers.get("Last-Modified"))
);
@ -179,6 +189,7 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
"x-marginalia/advisory;state=redirect",
new byte[0],
null,
null,
null
);
}
@ -192,6 +203,7 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
"x-marginalia/advisory;state=error",
errorStatus.getBytes(),
null,
null,
null
);
}
@ -206,6 +218,7 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
errorStatus,
new byte[0],
null,
null,
null
);
}

View File

@ -0,0 +1,124 @@
package nu.marginalia.crawl.logic;
import com.sun.net.httpserver.HttpServer;
import nu.marginalia.model.EdgeUrl;
import okhttp3.OkHttpClient;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.util.Random;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertInstanceOf;
class ContentTypeProberTest {
private static int port;
private static HttpServer server;
private static OkHttpClient client;
static EdgeUrl htmlEndpoint;
static EdgeUrl htmlRedirEndpoint;
static EdgeUrl binaryEndpoint;
static EdgeUrl timeoutEndpoint;
@BeforeEach
void setUp() throws IOException {
Random r = new Random();
port = r.nextInt(10000) + 8000;
server = HttpServer.create(new InetSocketAddress("127.0.0.1", port), 10);
server.createContext("/html", exchange -> {
exchange.getResponseHeaders().add("Content-Type", "text/html");
exchange.sendResponseHeaders(200, -1);
exchange.close();
});
server.createContext("/redir", exchange -> {
exchange.getResponseHeaders().add("Location", "/html");
exchange.sendResponseHeaders(301, -1);
exchange.close();
});
server.createContext("/bin", exchange -> {
exchange.getResponseHeaders().add("Content-Type", "application/binary");
exchange.sendResponseHeaders(200, -1);
exchange.close();
});
server.createContext("/timeout", exchange -> {
try {
Thread.sleep(2000);
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
exchange.getResponseHeaders().add("Content-Type", "application/binary");
exchange.sendResponseHeaders(200, -1);
exchange.close();
});
server.start();
htmlEndpoint = EdgeUrl.parse("http://localhost:" + port + "/html").get();
binaryEndpoint = EdgeUrl.parse("http://localhost:" + port + "/bin").get();
timeoutEndpoint = EdgeUrl.parse("http://localhost:" + port + "/timeout").get();
htmlRedirEndpoint = EdgeUrl.parse("http://localhost:" + port + "/redir").get();
client = new OkHttpClient.Builder()
.readTimeout(1, java.util.concurrent.TimeUnit.SECONDS)
.connectTimeout(1, java.util.concurrent.TimeUnit.SECONDS)
.callTimeout(1, java.util.concurrent.TimeUnit.SECONDS)
.writeTimeout(1, java.util.concurrent.TimeUnit.SECONDS)
.build();
}
@AfterEach
void tearDown() {
server.stop(0);
client.dispatcher().executorService().shutdown();
client.connectionPool().evictAll();
}
@Test
void probeContentTypeOk() {
ContentTypeProber.ContentTypeProbeResult result = new ContentTypeProber("test", client)
.probeContentType(htmlEndpoint);
System.out.println(result);
assertEquals(result, new ContentTypeProber.ContentTypeProbeResult.Ok(htmlEndpoint));
}
@Test
void probeContentTypeRedir() {
ContentTypeProber.ContentTypeProbeResult result = new ContentTypeProber("test", client)
.probeContentType(htmlRedirEndpoint);
System.out.println(result);
assertEquals(result, new ContentTypeProber.ContentTypeProbeResult.Ok(htmlEndpoint));
}
@Test
void probeContentTypeBad() {
ContentTypeProber.ContentTypeProbeResult result = new ContentTypeProber("test", client)
.probeContentType(binaryEndpoint);
System.out.println(result);
assertInstanceOf(ContentTypeProber.ContentTypeProbeResult.BadContentType.class, result);
}
@Test
void probeContentTypeTimeout() {
ContentTypeProber.ContentTypeProbeResult result = new ContentTypeProber("test", client)
.probeContentType(timeoutEndpoint);
System.out.println(result);
assertInstanceOf(ContentTypeProber.ContentTypeProbeResult.Timeout.class, result);
}
}

View File

@ -1,7 +1,7 @@
package nu.marginalia.crawl.retreival;
import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import okhttp3.OkHttpClient;
@ -21,7 +21,8 @@ import java.security.NoSuchAlgorithmException;
import java.util.List;
import java.util.zip.GZIPInputStream;
import static org.junit.jupiter.api.Assertions.*;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;
class CrawlerWarcResynchronizerTest {
Path fileName;

View File

@ -1,7 +1,8 @@
package nu.marginalia.crawl.retreival.fetcher;
import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeResult.BadContentType;
import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeResult.Ok;
import nu.marginalia.crawl.logic.ContentTypeProber;
import nu.marginalia.crawl.logic.ContentTypeProber.ContentTypeProbeResult.BadContentType;
import nu.marginalia.crawl.logic.ContentTypeProber.ContentTypeProbeResult.Ok;
import nu.marginalia.model.EdgeUrl;
import okhttp3.ConnectionPool;
import okhttp3.Dispatcher;
@ -13,7 +14,7 @@ import java.net.URISyntaxException;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import static org.junit.jupiter.api.Assertions.*;
import static org.junit.jupiter.api.Assertions.assertEquals;
class ContentTypeProberTest {

View File

@ -1,8 +1,9 @@
package nu.marginalia.crawl.retreival.fetcher;
import nu.marginalia.UserAgent;
import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
@ -80,6 +81,7 @@ class WarcRecorderTest {
"text/html",
200,
"<?doctype html><html><body>test</body></html>",
null,
ContentTags.empty());
}
@ -103,7 +105,7 @@ class WarcRecorderTest {
"text/html",
200,
null,
ContentTags.empty());
null, ContentTags.empty());
}
}
@ -115,7 +117,7 @@ class WarcRecorderTest {
"text/html",
200,
"<?doctype html><html><body>test</body></html>",
ContentTags.empty());
null, ContentTags.empty());
}
try (var reader = new WarcReader(fileNameWarc)) {

View File

@ -1,7 +1,7 @@
package nu.marginalia.crawl.retreival.revisit;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.retreival.CrawlDataReference;
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
import nu.marginalia.model.crawldata.CrawledDocument;
import org.junit.jupiter.api.Test;

View File

@ -1,11 +1,10 @@
package nu.marginalia.crawling;
import lombok.SneakyThrows;
import nu.marginalia.crawl.retreival.RateLimitException;
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.HttpFetcher;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.ContentTypeLogic;
import nu.marginalia.model.body.DocumentBodyExtractor;
@ -33,7 +32,7 @@ class HttpFetcherTest {
}
@Test
void fetchUTF8() throws URISyntaxException, RateLimitException, IOException {
void fetchUTF8() throws URISyntaxException, HttpFetcherImpl.RateLimitException, IOException {
var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
try (var recorder = new WarcRecorder()) {
var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, ContentTags.empty(), HttpFetcher.ProbeType.FULL);
@ -44,7 +43,7 @@ class HttpFetcherTest {
}
@Test
void fetchText() throws URISyntaxException, RateLimitException, IOException {
void fetchText() throws URISyntaxException, HttpFetcherImpl.RateLimitException, IOException {
var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
try (var recorder = new WarcRecorder()) {

View File

@ -2,10 +2,13 @@ package nu.marginalia.crawling.retreival;
import crawlercommons.robots.SimpleRobotRules;
import lombok.SneakyThrows;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.HttpFetcher;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.SitemapRetriever;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.DomainProber;
import nu.marginalia.crawl.retreival.fetcher.*;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.HttpFetchResult;
@ -68,7 +71,7 @@ public class CrawlerMockFetcherTest {
void crawl(CrawlSpecRecord spec) throws IOException {
try (var recorder = new WarcRecorder()) {
new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), spec, recorder)
.fetch();
.crawlDomain();
}
}
@ -115,9 +118,9 @@ public class CrawlerMockFetcherTest {
public void clearCookies() {}
@Override
public FetchResult probeDomain(EdgeUrl url) {
public HttpFetcherImpl.ProbeResult probeDomain(EdgeUrl url) {
logger.info("Probing {}", url);
return new FetchResult(FetchResultState.OK, url);
return new HttpFetcherImpl.ProbeResultOk(url);
}
@SneakyThrows

View File

@ -4,10 +4,10 @@ import lombok.SneakyThrows;
import nu.marginalia.UserAgent;
import nu.marginalia.WmsaHome;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.crawl.fetcher.HttpFetcher;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.retreival.*;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.io.crawldata.CrawledDomainReader;
import nu.marginalia.io.crawldata.SerializableCrawlDataStream;
import nu.marginalia.model.EdgeDomain;
@ -468,7 +468,7 @@ class CrawlerRetreiverTest {
private void doCrawlWithReferenceStream(CrawlSpecRecord specs, SerializableCrawlDataStream stream) {
try (var recorder = new WarcRecorder(tempFileWarc2)) {
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch(new DomainLinks(),
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).crawlDomain(new DomainLinks(),
new CrawlDataReference(stream));
}
catch (IOException ex) {
@ -480,7 +480,7 @@ class CrawlerRetreiverTest {
private DomainCrawlFrontier doCrawl(Path tempFileWarc1, CrawlSpecRecord specs) {
try (var recorder = new WarcRecorder(tempFileWarc1)) {
var crawler = new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder);
crawler.fetch();
crawler.crawlDomain();
return crawler.getCrawlFrontier();
} catch (IOException ex) {
Assertions.fail(ex);

View File

@ -8,9 +8,9 @@ import nu.marginalia.api.searchquery.RpcQueryLimits;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.converting.processor.DomainProcessor;
import nu.marginalia.converting.writer.ConverterBatchWriter;
import nu.marginalia.crawl.retreival.DomainProber;
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.functions.searchquery.QueryFactory;
import nu.marginalia.index.IndexGrpcService;
import nu.marginalia.index.ReverseIndexFullFileNames;
@ -120,7 +120,7 @@ public class IntegrationTest {
/** CREATE WARC */
try (WarcRecorder warcRecorder = new WarcRecorder(warcData)) {
warcRecorder.writeWarcinfoHeader("127.0.0.1", new EdgeDomain("www.example.com"),
new DomainProber.ProbeResultOk(new EdgeUrl("https://www.example.com/")));
new HttpFetcherImpl.ProbeResultOk(new EdgeUrl("https://www.example.com/")));
warcRecorder.writeReferenceCopy(new EdgeUrl("https://www.example.com/"),
"text/html", 200,
@ -134,6 +134,7 @@ public class IntegrationTest {
</body>
</html>
""",
"",
ContentTags.empty()
);
}
@ -204,7 +205,7 @@ public class IntegrationTest {
.setFetchSize(1000)
.build())
.setQueryStrategy("AUTO")
.setHumanQuery("\"This is how thinking works\"")
.setHumanQuery("\"is that there is\"")
.build();
var params = QueryProtobufCodec.convertRequest(request);

View File

@ -96,7 +96,7 @@ public class ScreenshotCaptureToolMain {
private static byte[] fetchDomain(HttpClient client, EdgeDomain domain) {
try {
Map<String, Object> requestData = Map.of(
"url", domain.toRootUrl().toString(),
"url", domain.toRootUrlHttp().toString(),
"options",
Map.of("fullPage", false,
"type", "png"),