Merge pull request #150 from MarginaliaSearch/httpclient-in-crawler

Reduce the use of third-party code in the crawler: replace OkHttp with the JDK's built-in java.net.http.HttpClient.
Viktor 2025-01-20 19:35:30 +01:00 committed by GitHub
commit 2c67f50a43
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
27 changed files with 433 additions and 468 deletions
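
The change set below swaps OkHttp for the JDK's built-in client throughout the crawler. For orientation, a minimal, self-contained sketch of the java.net.http request pattern the new code is built around; the URL and user agent are placeholder values, and the builder options mirror the ones configured in HttpFetcherImpl further down.

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.time.Duration;

class HttpClientSketch {
    public static void main(String[] args) throws Exception {
        // Build once, reuse for many requests; redirects follow the same
        // NORMAL policy the fetcher below configures.
        HttpClient client = HttpClient.newBuilder()
                .followRedirects(HttpClient.Redirect.NORMAL)
                .connectTimeout(Duration.ofSeconds(8))
                .build();

        HttpRequest request = HttpRequest.newBuilder()
                .GET()
                .uri(URI.create("https://www.marginalia.nu/"))
                .header("User-agent", "test.marginalia.nu")
                .timeout(Duration.ofSeconds(10))
                .build();

        HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
        System.out.println(response.statusCode());
    }
}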

View File

@ -106,11 +106,7 @@ public class WarcSideloader implements SideloadSource, AutoCloseable {
return false;
var url = new EdgeUrl(warcResponse.target());
if (!Objects.equals(url.getDomain(), domain)) {
return false;
}
return true;
return Objects.equals(url.getDomain(), domain);
} catch (Exception e) {
logger.warn("Failed to process response", e);
}

View File

@ -8,6 +8,7 @@ import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.converting.processor.DomainProcessor;
import nu.marginalia.crawl.CrawlerMain;
import nu.marginalia.crawl.DomainStateDb;
import nu.marginalia.crawl.fetcher.Cookies;
import nu.marginalia.crawl.fetcher.HttpFetcher;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
@ -200,23 +201,23 @@ public class CrawlingThenConvertingIntegrationTest {
@Test
public void crawlRobotsTxt() throws Exception {
var specs = new CrawlerMain.CrawlSpecRecord("search.marginalia.nu", 5,
List.of("https://search.marginalia.nu/search?q=hello+world")
var specs = new CrawlerMain.CrawlSpecRecord("marginalia-search.com", 5,
List.of("https://marginalia-search.com/search?q=hello+world")
);
CrawledDomain domain = crawl(specs);
assertFalse(domain.doc.isEmpty());
assertEquals("OK", domain.crawlerStatus);
assertEquals("search.marginalia.nu", domain.domain);
assertEquals("marginalia-search.com", domain.domain);
Set<String> allUrls = domain.doc.stream().map(doc -> doc.url).collect(Collectors.toSet());
assertTrue(allUrls.contains("https://search.marginalia.nu/search"), "We expect a record for entities that are forbidden");
assertTrue(allUrls.contains("https://marginalia-search.com/search"), "We expect a record for entities that are forbidden");
var output = process();
assertNotNull(output);
assertFalse(output.documents.isEmpty());
assertEquals(new EdgeDomain("search.marginalia.nu"), output.domain);
assertEquals(new EdgeDomain("marginalia-search.com"), output.domain);
assertEquals(DomainIndexingState.ACTIVE, output.state);
for (var doc : output.documents) {
@ -246,7 +247,7 @@ public class CrawlingThenConvertingIntegrationTest {
private CrawledDomain crawl(CrawlerMain.CrawlSpecRecord specs, Predicate<EdgeDomain> domainBlacklist) throws Exception {
List<SerializableCrawlData> data = new ArrayList<>();
try (var recorder = new WarcRecorder(fileName);
try (var recorder = new WarcRecorder(fileName, new Cookies());
var db = new DomainStateDb(dbTempFile))
{
new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, db, recorder).crawlDomain();

View File

@ -55,7 +55,6 @@ dependencies {
implementation libs.zstd
implementation libs.jwarc
implementation libs.crawlercommons
implementation libs.okhttp3
implementation libs.jsoup
implementation libs.opencsv
implementation libs.fastutil

View File

@ -33,8 +33,6 @@ import nu.marginalia.service.module.DatabaseModule;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageId;
import nu.marginalia.util.SimpleBlockingThreadPool;
import okhttp3.ConnectionPool;
import okhttp3.Dispatcher;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -85,6 +83,7 @@ public class CrawlerMain extends ProcessMainClass {
@Inject
public CrawlerMain(UserAgent userAgent,
HttpFetcherImpl httpFetcher,
ProcessHeartbeatImpl heartbeat,
MessageQueueFactory messageQueueFactory, DomainProber domainProber,
FileStorageService fileStorageService,
@ -98,6 +97,7 @@ public class CrawlerMain extends ProcessMainClass {
super(messageQueueFactory, processConfiguration, gson, CRAWLER_INBOX);
this.userAgent = userAgent;
this.fetcher = httpFetcher;
this.heartbeat = heartbeat;
this.domainProber = domainProber;
this.fileStorageService = fileStorageService;
@ -111,10 +111,6 @@ public class CrawlerMain extends ProcessMainClass {
Integer.getInteger("crawler.poolSize", 256),
1);
fetcher = new HttpFetcherImpl(userAgent,
new Dispatcher(),
new ConnectionPool(5, 10, TimeUnit.SECONDS)
);
// Wait for the blacklist to be loaded before starting the crawl
blacklist.waitUntilLoaded();
@ -132,6 +128,10 @@ public class CrawlerMain extends ProcessMainClass {
System.setProperty("sun.net.client.defaultConnectTimeout", "30000");
System.setProperty("sun.net.client.defaultReadTimeout", "30000");
// Set the maximum number of connections to keep alive in the connection pool
System.setProperty("jdk.httpclient.idleTimeout", "15"); // 15 seconds
System.setProperty("jdk.httpclient.connectionPoolSize", "256");
// We don't want to use too much memory caching sessions for https
System.setProperty("javax.net.ssl.sessionCacheSize", "2048");
@ -364,9 +364,9 @@ public class CrawlerMain extends ProcessMainClass {
Files.deleteIfExists(tempFile);
}
try (var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now
try (var warcRecorder = new WarcRecorder(newWarcFile, fetcher); // write to a temp file for now
var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, domainStateDb, warcRecorder);
CrawlDataReference reference = getReference();
CrawlDataReference reference = getReference()
)
{
// Resume the crawl if it was aborted
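
A small sketch of the startup ordering this implies: the properties are set before the first HttpClient is constructed, since the JDK typically reads them only once. The property names are the ones set in this commit; everything else is illustrative.

import java.net.http.HttpClient;

class CrawlerHttpBootstrapSketch {
    public static void main(String[] args) {
        // Set before any HttpClient exists in the process.
        System.setProperty("jdk.httpclient.idleTimeout", "15");        // seconds
        System.setProperty("jdk.httpclient.connectionPoolSize", "256");

        HttpClient client = HttpClient.newBuilder().build();
        System.out.println(client.version()); // client built after the pool settings are in place
    }
}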

View File

@ -1,6 +1,6 @@
package nu.marginalia.crawl.fetcher;
import okhttp3.Request;
import java.net.http.HttpRequest;
/** Encapsulates request modifiers; the ETag and Last-Modified tags for a resource */
public record ContentTags(String etag, String lastMod) {
@ -17,14 +17,14 @@ public record ContentTags(String etag, String lastMod) {
}
/** Paints the tags onto the request builder. */
public void paint(Request.Builder getBuilder) {
public void paint(HttpRequest.Builder getBuilder) {
if (etag != null) {
getBuilder.addHeader("If-None-Match", etag);
getBuilder.header("If-None-Match", etag);
}
if (lastMod != null) {
getBuilder.addHeader("If-Modified-Since", lastMod);
getBuilder.header("If-Modified-Since", lastMod);
}
}
}
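
A short usage sketch of the reworked ContentTags against HttpRequest.Builder; the ETag and date values are invented, only the header names come from the record above.

import java.net.URI;
import java.net.http.HttpRequest;
import java.time.Duration;

class ConditionalGetSketch {
    // Hypothetical caller: builds a conditional GET for a previously seen resource.
    static HttpRequest conditionalGet(ContentTags tags) {
        var builder = HttpRequest.newBuilder()
                .GET()
                .uri(URI.create("https://www.marginalia.nu/"))
                .timeout(Duration.ofSeconds(10));

        tags.paint(builder); // adds If-None-Match / If-Modified-Since only for non-null values

        return builder.build();
    }

    // e.g. conditionalGet(new ContentTags("\"33a64df5\"", "Sat, 04 Jan 2025 10:00:00 GMT"))
}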

View File

@ -1,33 +1,14 @@
package nu.marginalia.crawl.fetcher;
import okhttp3.Cookie;
import okhttp3.CookieJar;
import okhttp3.HttpUrl;
import java.util.Collections;
import java.io.IOException;
import java.net.CookieHandler;
import java.net.URI;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
public class Cookies {
final ThreadLocal<ConcurrentHashMap<String, List<Cookie>>> cookieJar = ThreadLocal.withInitial(ConcurrentHashMap::new);
public CookieJar getJar() {
return new CookieJar() {
@Override
public void saveFromResponse(HttpUrl url, List<Cookie> cookies) {
if (!cookies.isEmpty()) {
cookieJar.get().put(url.host(), cookies);
}
}
@Override
public List<Cookie> loadForRequest(HttpUrl url) {
return cookieJar.get().getOrDefault(url.host(), Collections.emptyList());
}
};
}
public class Cookies extends CookieHandler {
final ThreadLocal<ConcurrentHashMap<String, List<String>>> cookieJar = ThreadLocal.withInitial(ConcurrentHashMap::new);
public void clear() {
cookieJar.get().clear();
@ -38,6 +19,16 @@ public class Cookies {
}
public List<String> getCookies() {
return cookieJar.get().values().stream().flatMap(List::stream).map(Cookie::toString).toList();
return cookieJar.get().values().stream().flatMap(List::stream).toList();
}
@Override
public Map<String, List<String>> get(URI uri, Map<String, List<String>> requestHeaders) throws IOException {
return cookieJar.get();
}
@Override
public void put(URI uri, Map<String, List<String>> responseHeaders) throws IOException {
cookieJar.get().putAll(responseHeaders);
}
}
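
The rewritten cookie jar plugs into the client rather than into each request; createClient() further down registers it the same way. A minimal sketch of the wiring:

// The JDK client drives the handler itself: get() is consulted before each
// request and put() is called with each response's headers. The crawler can
// then ask the jar whether the site set cookies, as WarcRecorder does below.
var cookies = new Cookies();

var client = java.net.http.HttpClient.newBuilder()
        .cookieHandler(cookies)
        .build();

// later: cookies.hasCookies(), cookies.getCookies(), cookies.clear()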

View File

@ -3,6 +3,7 @@ package nu.marginalia.crawl.fetcher;
import com.google.inject.ImplementedBy;
import crawlercommons.robots.SimpleRobotRules;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.HttpFetchResult;
@ -11,10 +12,10 @@ import nu.marginalia.model.crawldata.CrawlerDomainStatus;
import java.util.List;
@ImplementedBy(HttpFetcherImpl.class)
public interface HttpFetcher {
public interface HttpFetcher extends AutoCloseable {
void setAllowAllContentTypes(boolean allowAllContentTypes);
List<String> getCookies();
Cookies getCookies();
void clearCookies();
DomainProbeResult probeDomain(EdgeUrl url);
@ -27,7 +28,9 @@ public interface HttpFetcher {
HttpFetchResult fetchContent(EdgeUrl url,
WarcRecorder recorder,
ContentTags tags,
ProbeType probeType) throws HttpFetcherImpl.RateLimitException, Exception;
ProbeType probeType) throws Exception;
List<EdgeUrl> fetchSitemapUrls(String rootSitemapUrl, CrawlDelayTimer delayTimer);
SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder);
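
Since the interface now extends AutoCloseable, callers can scope the fetcher, and with it the underlying HttpClient, using try-with-resources. A sketch with a placeholder user agent:

// close() is inherited from AutoCloseable and may throw, hence throws Exception.
static void probeSketch() throws Exception {
    try (HttpFetcher fetcher = new HttpFetcherImpl("test.marginalia.nu")) {
        var result = fetcher.probeDomain(EdgeUrl.parse("https://www.marginalia.nu/").orElseThrow());
        System.out.println(result);
    }
}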

View File

@ -1,35 +1,41 @@
package nu.marginalia.crawl.fetcher;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import crawlercommons.robots.SimpleRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;
import nu.marginalia.UserAgent;
import nu.marginalia.crawl.fetcher.socket.FastTerminatingSocketFactory;
import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.crawl.fetcher.socket.NoSecuritySSL;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.ContentTypeLogic;
import nu.marginalia.model.body.DocumentBodyExtractor;
import nu.marginalia.model.body.HttpFetchResult;
import nu.marginalia.model.crawldata.CrawlerDomainStatus;
import okhttp3.ConnectionPool;
import okhttp3.Dispatcher;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.parser.Parser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.net.ssl.X509TrustManager;
import java.io.InterruptedIOException;
import java.io.IOException;
import java.io.InputStream;
import java.net.URISyntaxException;
import java.net.URLDecoder;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.net.http.HttpTimeoutException;
import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.concurrent.TimeUnit;
import java.util.*;
import java.util.concurrent.Executors;
import java.util.zip.GZIPInputStream;
@Singleton
public class HttpFetcherImpl implements HttpFetcher {
private final Logger logger = LoggerFactory.getLogger(getClass());
@ -40,39 +46,28 @@ public class HttpFetcherImpl implements HttpFetcher {
private static final SimpleRobotRulesParser robotsParser = new SimpleRobotRulesParser();
private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
private final Duration requestTimeout = Duration.ofSeconds(10);
@Override
public void setAllowAllContentTypes(boolean allowAllContentTypes) {
contentTypeLogic.setAllowAllContentTypes(allowAllContentTypes);
}
private final OkHttpClient client;
private final HttpClient client;
private static final FastTerminatingSocketFactory ftSocketFactory = new FastTerminatingSocketFactory();
private OkHttpClient createClient(Dispatcher dispatcher, ConnectionPool pool) {
var builder = new OkHttpClient.Builder();
if (dispatcher != null) {
builder.dispatcher(dispatcher);
}
return builder.sslSocketFactory(NoSecuritySSL.buildSocketFactory(), (X509TrustManager) NoSecuritySSL.trustAllCerts[0])
.socketFactory(ftSocketFactory)
.hostnameVerifier(NoSecuritySSL.buildHostnameVerifyer())
.addNetworkInterceptor(new IpInterceptingNetworkInterceptor())
.connectionPool(pool)
.cookieJar(cookies.getJar())
.followRedirects(true)
.followSslRedirects(true)
.connectTimeout(8, TimeUnit.SECONDS)
.readTimeout(10, TimeUnit.SECONDS)
.writeTimeout(10, TimeUnit.SECONDS)
private HttpClient createClient() {
return HttpClient.newBuilder()
.sslContext(NoSecuritySSL.buildSslContext())
.cookieHandler(cookies)
.followRedirects(HttpClient.Redirect.NORMAL)
.connectTimeout(Duration.ofSeconds(8))
.executor(Executors.newCachedThreadPool())
.build();
}
@Override
public List<String> getCookies() {
return cookies.getCookies();
public Cookies getCookies() {
return cookies;
}
@Override
@ -81,26 +76,24 @@ public class HttpFetcherImpl implements HttpFetcher {
}
@Inject
public HttpFetcherImpl(UserAgent userAgent,
Dispatcher dispatcher,
ConnectionPool connectionPool)
public HttpFetcherImpl(UserAgent userAgent)
{
this.client = createClient(dispatcher, connectionPool);
this.client = createClient();
this.userAgentString = userAgent.uaString();
this.userAgentIdentifier = userAgent.uaIdentifier();
}
public HttpFetcherImpl(String userAgent) {
this.client = createClient(null, new ConnectionPool());
this.client = createClient();
this.userAgentString = userAgent;
this.userAgentIdentifier = userAgent;
}
// Not necessary in prod, but useful in test
public void close() {
client.dispatcher().executorService().shutdown();
client.connectionPool().evictAll();
client.close();
}
/**
* Probe the domain to see if it is reachable, attempting to identify which schema to use,
* and if there are any redirects. This is done by one or more HEAD requests.
@ -110,19 +103,26 @@ public class HttpFetcherImpl implements HttpFetcher {
*/
@Override
public DomainProbeResult probeDomain(EdgeUrl url) {
var head = new Request.Builder().head().addHeader("User-agent", userAgentString)
.url(url.toString())
HttpRequest head;
try {
head = HttpRequest.newBuilder()
.HEAD()
.uri(url.asURI())
.header("User-agent", userAgentString)
.timeout(requestTimeout)
.build();
var call = client.newCall(head);
try (var rsp = call.execute()) {
EdgeUrl requestUrl = new EdgeUrl(rsp.request().url().toString());
if (!Objects.equals(requestUrl.domain, url.domain)) {
return new DomainProbeResult.Redirect(requestUrl.domain);
} catch (URISyntaxException e) {
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Invalid URL");
}
return new DomainProbeResult.Ok(requestUrl);
try {
var rsp = client.send(head, HttpResponse.BodyHandlers.discarding());
EdgeUrl rspUri = new EdgeUrl(rsp.uri());
if (!Objects.equals(rspUri.domain, url.domain)) {
return new DomainProbeResult.Redirect(rspUri.domain);
}
return new DomainProbeResult.Ok(rspUri);
}
catch (Exception ex) {
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, ex.getMessage());
@ -140,21 +140,25 @@ public class HttpFetcherImpl implements HttpFetcher {
WarcRecorder warcRecorder,
ContentTags tags) throws RateLimitException {
if (tags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url)) {
var headBuilder = new Request.Builder().head()
.addHeader("User-agent", userAgentString)
.addHeader("Accept-Encoding", "gzip")
.url(url.toString());
var head = headBuilder.build();
var call = client.newCall(head);
try {
var headBuilder = HttpRequest.newBuilder()
.HEAD()
.uri(url.asURI())
.header("User-agent", userAgentString)
.header("Accept-Encoding", "gzip")
.timeout(requestTimeout)
;
try (var rsp = call.execute()) {
var contentTypeHeader = rsp.header("Content-type");
var rsp = client.send(headBuilder.build(), HttpResponse.BodyHandlers.discarding());
var headers = rsp.headers();
var contentTypeHeader = headers.firstValue("Content-Type").orElse(null);
if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) {
warcRecorder.flagAsFailedContentTypeProbe(url, contentTypeHeader, rsp.code());
warcRecorder.flagAsFailedContentTypeProbe(url, contentTypeHeader, rsp.statusCode());
return new ContentTypeProbeResult.BadContentType(contentTypeHeader, rsp.code());
return new ContentTypeProbeResult.BadContentType(contentTypeHeader, rsp.statusCode());
}
// Update the URL to the final URL of the HEAD request, otherwise we might end up doing
@ -168,27 +172,27 @@ public class HttpFetcherImpl implements HttpFetcher {
// too many eyebrows when looking at the logs on the target server. Overall it's probably desirable
// that it looks like the traffic makes sense, as opposed to looking like a broken bot.
var redirectUrl = new EdgeUrl(rsp.request().url().toString());
var redirectUrl = new EdgeUrl(rsp.uri());
EdgeUrl ret;
if (Objects.equals(redirectUrl.domain, url.domain)) ret = redirectUrl;
else ret = url;
// Intercept rate limiting
if (rsp.code() == 429) {
throw new HttpFetcherImpl.RateLimitException(Objects.requireNonNullElse(rsp.header("Retry-After"), "1"));
if (rsp.statusCode() == 429) {
throw new HttpFetcherImpl.RateLimitException(headers.firstValue("Retry-After").orElse("1"));
}
return new ContentTypeProbeResult.Ok(ret);
}
catch (HttpTimeoutException ex) {
warcRecorder.flagAsTimeout(url);
return new ContentTypeProbeResult.Timeout(ex);
}
catch (RateLimitException ex) {
throw ex;
}
catch (InterruptedIOException ex) {
warcRecorder.flagAsTimeout(url);
return new ContentTypeProbeResult.Timeout(ex);
} catch (Exception ex) {
catch (Exception ex) {
logger.error("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage());
warcRecorder.flagAsError(url, ex);
@ -210,13 +214,15 @@ public class HttpFetcherImpl implements HttpFetcher {
ProbeType probeType)
throws Exception
{
var getBuilder = new Request.Builder().get();
getBuilder.url(url.toString())
.addHeader("Accept-Encoding", "gzip")
.addHeader("Accept-Language", "en,*;q=0.5")
.addHeader("Accept", "text/html, application/xhtml+xml, text/*;q=0.8")
.addHeader("User-agent", userAgentString);
var getBuilder = HttpRequest.newBuilder()
.GET()
.uri(url.asURI())
.header("User-agent", userAgentString)
.header("Accept-Encoding", "gzip")
.header("Accept-Language", "en,*;q=0.5")
.header("Accept", "text/html, application/xhtml+xml, text/*;q=0.8")
.timeout(requestTimeout)
;
contentTags.paint(getBuilder);
@ -242,6 +248,122 @@ public class HttpFetcherImpl implements HttpFetcher {
return new SitemapRetriever();
}
@Override
public List<EdgeUrl> fetchSitemapUrls(String root, CrawlDelayTimer delayTimer) {
try {
List<EdgeUrl> ret = new ArrayList<>();
Set<String> seenUrls = new HashSet<>();
Set<String> seenSitemaps = new HashSet<>();
Deque<EdgeUrl> sitemapQueue = new LinkedList<>();
EdgeUrl rootSitemapUrl = new EdgeUrl(root);
sitemapQueue.add(rootSitemapUrl);
int fetchedSitemaps = 0;
while (!sitemapQueue.isEmpty() && ret.size() < 20_000 && ++fetchedSitemaps < 10) {
var head = sitemapQueue.removeFirst();
switch (fetchSitemap(head)) {
case SitemapResult.SitemapUrls(List<String> urls) -> {
for (var url : urls) {
if (seenUrls.add(url)) {
EdgeUrl.parse(url)
.filter(u -> u.domain.equals(rootSitemapUrl.domain))
.ifPresent(ret::add);
}
}
}
case SitemapResult.SitemapReferences(List<String> refs) -> {
for (var ref : refs) {
if (seenSitemaps.add(ref)) {
EdgeUrl.parse(ref)
.filter(url -> url.domain.equals(rootSitemapUrl.domain))
.ifPresent(sitemapQueue::addFirst);
}
}
}
case SitemapResult.SitemapError() -> {}
}
delayTimer.waitFetchDelay();
}
return ret;
}
catch (Exception ex) {
logger.error("Error while fetching sitemaps via " + root, ex);
return List.of();
}
}
private SitemapResult fetchSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
HttpRequest getRequest = HttpRequest.newBuilder()
.GET()
.uri(sitemapUrl.asURI())
.header("Accept-Encoding", "gzip")
.header("Accept", "text/*, */*;q=0.9")
.header("User-agent", userAgentString)
.timeout(requestTimeout)
.build();
var response = client.send(getRequest, HttpResponse.BodyHandlers.ofInputStream());
if (response.statusCode() != 200) {
return new SitemapResult.SitemapError();
}
try (InputStream inputStream = response.body()) {
InputStream parserStream;
if (sitemapUrl.path.endsWith(".gz")) {
parserStream = new GZIPInputStream(inputStream);
}
else {
parserStream = inputStream;
}
Document parsedSitemap = Jsoup.parse(parserStream, "UTF-8", sitemapUrl.toString(), Parser.xmlParser());
String rootTagName = parsedSitemap.child(0).tagName();
return switch (rootTagName.toLowerCase()) {
case "sitemapindex" -> {
List<String> references = new ArrayList<>();
for (var locTag : parsedSitemap.getElementsByTag("loc")) {
references.add(URLDecoder.decode(locTag.text().trim(), StandardCharsets.UTF_8));
}
yield new SitemapResult.SitemapReferences(Collections.unmodifiableList(references));
}
case "urlset" -> {
List<String> urls = new ArrayList<>();
for (var locTag : parsedSitemap.select("url > loc")) {
urls.add(URLDecoder.decode(locTag.text().trim(), StandardCharsets.UTF_8));
}
yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
}
case "rss", "atom" -> {
List<String> urls = new ArrayList<>();
for (var locTag : parsedSitemap.select("link, url")) {
urls.add(locTag.text().trim());
}
yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
}
default -> new SitemapResult.SitemapError();
};
}
}
private sealed interface SitemapResult {
record SitemapUrls(List<String> urls) implements SitemapResult {}
record SitemapReferences(List<String> sitemapRefs) implements SitemapResult {}
record SitemapError() implements SitemapResult {}
}
@Override
public SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder) {
var ret = fetchAndParseRobotsTxt(new EdgeUrl("https", domain, null, "/robots.txt", null), recorder);
@ -257,14 +379,15 @@ public class HttpFetcherImpl implements HttpFetcher {
private Optional<SimpleRobotRules> fetchAndParseRobotsTxt(EdgeUrl url, WarcRecorder recorder) {
try {
var getBuilder = new Request.Builder().get();
var getRequest = HttpRequest.newBuilder()
.GET()
.uri(url.asURI())
.header("Accept-Encoding", "gzip")
.header("Accept", "text/*, */*;q=0.9")
.header("User-agent", userAgentString)
.timeout(requestTimeout);
getBuilder.url(url.toString())
.addHeader("Accept-Encoding", "gzip")
.addHeader("Accept", "text/*, */*;q=0.9")
.addHeader("User-agent", userAgentString);
HttpFetchResult result = recorder.fetch(client, getBuilder.build());
HttpFetchResult result = recorder.fetch(client, getRequest.build());
return DocumentBodyExtractor.asBytes(result).mapOpt((contentType, body) ->
robotsParser.parseContent(url.toString(),
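
The fetchContent hunk above is cut off after contentTags.paint(getBuilder); judging by the robots.txt path at the end of this file, the painted request is presumably handed to the WarcRecorder the same way. A hedged reconstruction, not part of the diff:

contentTags.paint(getBuilder);

// assumption: fetchContent records the exchange via the same WarcRecorder.fetch
// entry point that fetchAndParseRobotsTxt uses above
HttpFetchResult result = warcRecorder.fetch(client, getBuilder.build());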

View File

@ -1,31 +0,0 @@
package nu.marginalia.crawl.fetcher.socket;
import okhttp3.Interceptor;
import okhttp3.Response;
import org.jetbrains.annotations.NotNull;
import java.io.IOException;
/** An interceptor that intercepts network requests and adds the remote IP address as
* a header in the response. This is used to pass the remote IP address to the Warc
* writer, as this information is not available in the response.
*/
public class IpInterceptingNetworkInterceptor implements Interceptor {
private static final String pseudoHeaderName = "X-Marginalia-Remote-IP";
@NotNull
@Override
public Response intercept(@NotNull Interceptor.Chain chain) throws IOException {
String IP = chain.connection().socket().getInetAddress().getHostAddress();
return chain.proceed(chain.request())
.newBuilder()
.addHeader(pseudoHeaderName, IP)
.build();
}
public static String getIpFromResponse(Response response) {
return response.header(pseudoHeaderName);
}
}
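
With the OkHttp network interceptor deleted, the remote IP is no longer read off the socket. WarcRecorder further down instead resolves the host of the final response URI when writing the record; in isolation:

import java.net.InetAddress;
import java.net.URI;

class RemoteIpSketch {
    public static void main(String[] args) throws Exception {
        // Mirrors the replacement in WarcRecorder: resolve the response host again,
        // rather than capturing the address of the socket the bytes arrived on.
        URI responseUri = URI.create("https://www.marginalia.nu/");
        InetAddress inetAddress = InetAddress.getByName(responseUri.getHost());
        System.out.println(inetAddress.getHostAddress()); // value stored in the WARC response record
    }
}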

View File

@ -27,7 +27,7 @@ public class NoSecuritySSL {
}
};
public static SSLSocketFactory buildSocketFactory() {
public static SSLContext buildSslContext() {
try {
// Install the all-trusting trust manager
final SSLContext sslContext = SSLContext.getInstance("TLS");
@ -40,14 +40,11 @@ public class NoSecuritySSL {
clientSessionContext.setSessionCacheSize(2048);
// Create a ssl socket factory with our all-trusting manager
return sslContext.getSocketFactory();
return sslContext;
}
catch (Exception e) {
throw new RuntimeException(e);
}
}
public static HostnameVerifier buildHostnameVerifyer() {
return (hn, session) -> true;
}
}

View File

@ -1,14 +1,14 @@
package nu.marginalia.crawl.fetcher.warc;
import okhttp3.Headers;
import okhttp3.Response;
import org.apache.commons.io.input.BOMInputStream;
import org.netpreserve.jwarc.WarcTruncationReason;
import java.io.*;
import java.net.http.HttpHeaders;
import java.net.http.HttpResponse;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Objects;
import java.util.Map;
import java.util.zip.GZIPInputStream;
/** Input buffer for temporary storage of a HTTP response
@ -17,8 +17,9 @@ import java.util.zip.GZIPInputStream;
* */
public abstract class WarcInputBuffer implements AutoCloseable {
protected WarcTruncationReason truncationReason = WarcTruncationReason.NOT_TRUNCATED;
protected Headers headers;
WarcInputBuffer(Headers headers) {
protected HttpHeaders headers;
WarcInputBuffer(HttpHeaders headers) {
this.headers = headers;
}
@ -30,7 +31,7 @@ public abstract class WarcInputBuffer implements AutoCloseable {
public final WarcTruncationReason truncationReason() { return truncationReason; }
public final Headers headers() { return headers; }
public final HttpHeaders headers() { return headers; }
/** Create a buffer for a response.
* If the response is small and not compressed, it will be stored in memory.
@ -38,26 +39,27 @@ public abstract class WarcInputBuffer implements AutoCloseable {
* and suppressed from the headers.
* If an error occurs, a buffer will be created with no content and an error status.
*/
static WarcInputBuffer forResponse(Response rsp) {
static WarcInputBuffer forResponse(HttpResponse<InputStream> rsp) {
if (rsp == null)
return new ErrorBuffer();
try {
String contentLengthHeader = Objects.requireNonNullElse(rsp.header("Content-Length"), "-1");
int contentLength = Integer.parseInt(contentLengthHeader);
String contentEncoding = rsp.header("Content-Encoding");
var headers = rsp.headers();
try (var is = rsp.body()) {
int contentLength = (int) headers.firstValueAsLong("Content-Length").orElse(-1L);
String contentEncoding = headers.firstValue("Content-Encoding").orElse(null);
if (contentEncoding == null && contentLength > 0 && contentLength < 8192) {
// If the content is small and not compressed, we can just read it into memory
return new MemoryBuffer(rsp, contentLength);
return new MemoryBuffer(headers, is, contentLength);
}
else {
// Otherwise, we unpack it into a file and read it from there
return new FileBuffer(rsp);
return new FileBuffer(headers, is);
}
}
catch (Exception ex) {
return new ErrorBuffer(rsp);
return new ErrorBuffer();
}
}
@ -99,12 +101,8 @@ public abstract class WarcInputBuffer implements AutoCloseable {
/** Pseudo-buffer for when we have an error */
class ErrorBuffer extends WarcInputBuffer {
public ErrorBuffer() {
super(Headers.of());
truncationReason = WarcTruncationReason.UNSPECIFIED;
}
super(HttpHeaders.of(Map.of(), (k,v)->false));
public ErrorBuffer(Response rsp) {
super(rsp.headers());
truncationReason = WarcTruncationReason.UNSPECIFIED;
}
@ -125,12 +123,12 @@ class ErrorBuffer extends WarcInputBuffer {
/** Buffer for when we have the response in memory */
class MemoryBuffer extends WarcInputBuffer {
byte[] data;
public MemoryBuffer(Response response, int size) {
super(response.headers());
public MemoryBuffer(HttpHeaders headers, InputStream responseStream, int size) {
super(headers);
var outputStream = new ByteArrayOutputStream(size);
copy(response.body().byteStream(), outputStream);
copy(responseStream, outputStream);
data = outputStream.toByteArray();
}
@ -154,19 +152,15 @@ class MemoryBuffer extends WarcInputBuffer {
class FileBuffer extends WarcInputBuffer {
private final Path tempFile;
public FileBuffer(Response response) throws IOException {
super(suppressContentEncoding(response.headers()));
public FileBuffer(HttpHeaders headers, InputStream responseStream) throws IOException {
super(suppressContentEncoding(headers));
this.tempFile = Files.createTempFile("rsp", ".html");
if (response.body() == null) {
truncationReason = WarcTruncationReason.DISCONNECT;
return;
}
if ("gzip".equals(response.header("Content-Encoding"))) {
if ("gzip".equalsIgnoreCase(headers.firstValue("Content-Encoding").orElse(""))) {
try (var out = Files.newOutputStream(tempFile)) {
copy(new GZIPInputStream(response.body().byteStream()), out);
copy(new GZIPInputStream(responseStream), out);
}
catch (Exception ex) {
truncationReason = WarcTruncationReason.UNSPECIFIED;
@ -174,7 +168,7 @@ class FileBuffer extends WarcInputBuffer {
}
else {
try (var out = Files.newOutputStream(tempFile)) {
copy(response.body().byteStream(), out);
copy(responseStream, out);
}
catch (Exception ex) {
truncationReason = WarcTruncationReason.UNSPECIFIED;
@ -182,22 +176,13 @@ class FileBuffer extends WarcInputBuffer {
}
}
private static Headers suppressContentEncoding(Headers headers) {
var builder = new Headers.Builder();
headers.toMultimap().forEach((k, values) -> {
private static HttpHeaders suppressContentEncoding(HttpHeaders headers) {
return HttpHeaders.of(headers.map(), (k, v) -> {
if ("Content-Encoding".equalsIgnoreCase(k)) {
return;
}
if ("Transfer-Encoding".equalsIgnoreCase(k)) {
return;
}
for (var value : values) {
builder.add(k, value);
return false;
}
return !"Transfer-Encoding".equalsIgnoreCase(k);
});
return builder.build();
}
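
The header suppression above leans on HttpHeaders.of taking a BiPredicate over (name, value) pairs, so no rebuild loop is needed. A self-contained illustration of that JDK API:

import java.net.http.HttpHeaders;
import java.util.List;
import java.util.Map;

class HeaderFilterDemo {
    public static void main(String[] args) {
        HttpHeaders original = HttpHeaders.of(Map.of(
                "Content-Type", List.of("text/html"),
                "Content-Encoding", List.of("gzip")), (k, v) -> true);

        // Keep everything except the encoding headers, as suppressContentEncoding does.
        HttpHeaders filtered = HttpHeaders.of(original.map(), (k, v) ->
                !"Content-Encoding".equalsIgnoreCase(k)
             && !"Transfer-Encoding".equalsIgnoreCase(k));

        System.out.println(filtered.map()); // {Content-Type=[text/html]}
    }
}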

View File

@ -1,11 +1,12 @@
package nu.marginalia.crawl.fetcher.warc;
import okhttp3.Protocol;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils;
import java.net.URI;
import java.net.URLEncoder;
import java.net.http.HttpClient;
import java.net.http.HttpHeaders;
import java.net.http.HttpResponse;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.stream.Collectors;
@ -75,13 +76,13 @@ public class WarcProtocolReconstructor {
return "HTTP/" + version + " " + statusCode + " " + statusMessage + "\r\n" + headerString + "\r\n\r\n";
}
static String getResponseHeader(Response response, long size) {
String version = response.protocol() == Protocol.HTTP_1_1 ? "1.1" : "2.0";
static String getResponseHeader(HttpResponse<?> response, long size) {
String version = response.version() == HttpClient.Version.HTTP_1_1 ? "1.1" : "2.0";
String statusCode = String.valueOf(response.code());
String statusMessage = STATUS_CODE_MAP.getOrDefault(response.code(), "Unknown");
String statusCode = String.valueOf(response.statusCode());
String statusMessage = STATUS_CODE_MAP.getOrDefault(response.statusCode(), "Unknown");
String headerString = getHeadersAsString(response, size);
String headerString = getHeadersAsString(response.headers(), size);
return "HTTP/" + version + " " + statusCode + " " + statusMessage + "\r\n" + headerString + "\r\n\r\n";
}
@ -148,10 +149,10 @@ public class WarcProtocolReconstructor {
return joiner.toString();
}
static private String getHeadersAsString(Response response, long responseSize) {
static private String getHeadersAsString(HttpHeaders headers, long responseSize) {
StringJoiner joiner = new StringJoiner("\r\n");
response.headers().toMultimap().forEach((k, values) -> {
headers.map().forEach((k, values) -> {
String headerCapitalized = capitalizeHeader(k);
// Omit pseudoheaders injected by the crawler itself
@ -179,8 +180,8 @@ public class WarcProtocolReconstructor {
return joiner.toString();
}
// okhttp gives us flattened headers, so we need to reconstruct Camel-Kebab-Case style
// for the WARC parser's sake...
// okhttp gave us flattened headers, so we need to reconstruct Camel-Kebab-Case style
// for the WARC parser's sake... (do we still need this, mr chesterton?)
static private String capitalizeHeader(String k) {
return Arrays.stream(StringUtils.split(k, '-'))
.map(StringUtils::capitalize)

View File

@ -1,13 +1,11 @@
package nu.marginalia.crawl.fetcher.warc;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.Cookies;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.HttpFetchResult;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import org.jetbrains.annotations.Nullable;
import org.netpreserve.jwarc.*;
import org.slf4j.Logger;
@ -18,6 +16,7 @@ import java.io.InputStream;
import java.net.InetAddress;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.http.HttpClient;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
@ -28,7 +27,7 @@ import java.util.*;
/** Based on JWarc's fetch method, APL 2.0 license
* <p></p>
* This class wraps OkHttp's OkHttpClient and records the HTTP request and response in a WARC file,
* This class wraps HttpClient and records the HTTP request and response in a WARC file,
* as best is possible given not all the data is available at the same time and needs to
* be reconstructed.
*/
@ -48,20 +47,22 @@ public class WarcRecorder implements AutoCloseable {
// Affix a version string in case we need to change the format in the future
// in some way
private final String warcRecorderVersion = "1.0";
// We need to know if the site uses cookies so this can be reported among the search results
// -- flip this to true if we see any cookies. This information will also be painted on any
// revisited pages. It's not 100% perfect and a bit order dependent, but it's good enough.
private final WarcXCookieInformationHeader cookieInformation = new WarcXCookieInformationHeader();
private final Cookies cookies;
/**
* Create a new WarcRecorder that will write to the given file
*
* @param warcFile The file to write to
*/
public WarcRecorder(Path warcFile) throws IOException {
public WarcRecorder(Path warcFile, HttpFetcherImpl fetcher) throws IOException {
this.warcFile = warcFile;
this.writer = new WarcWriter(warcFile);
this.cookies = fetcher.getCookies();
}
public WarcRecorder(Path warcFile, Cookies cookies) throws IOException {
this.warcFile = warcFile;
this.writer = new WarcWriter(warcFile);
this.cookies = cookies;
}
/**
@ -71,38 +72,41 @@ public class WarcRecorder implements AutoCloseable {
public WarcRecorder() throws IOException {
this.warcFile = Files.createTempFile("warc", ".warc.gz");
this.writer = new WarcWriter(this.warcFile);
this.cookies = new Cookies();
temporaryFile = true;
}
public HttpFetchResult fetch(OkHttpClient client, Request request) throws NoSuchAlgorithmException,
IOException,
URISyntaxException,
InterruptedException
public HttpFetchResult fetch(HttpClient client,
java.net.http.HttpRequest request)
throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException
{
URI requestUri = request.url().uri();
URI requestUri = request.uri();
WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder();
WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder();
String ip;
Instant date = Instant.now();
var call = client.newCall(request);
var response = client.send(request, java.net.http.HttpResponse.BodyHandlers.ofInputStream());
cookieInformation.update(client, request.url());
Map<String, List<String>> extraHeaders = new HashMap<>();
try (var response = call.execute();
WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response))
// Not entirely sure why we need to do this, but keeping it due to Chesterton's Fence
extraHeaders.putAll(request.headers().map());
try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response))
{
if (cookies.hasCookies()) {
extraHeaders.put("X-Has-Cookies", List.of("1"));
}
byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response, inputBuffer.size()).getBytes(StandardCharsets.UTF_8);
ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(inputBuffer.size() + responseHeaders.length);
InputStream inputStream = inputBuffer.read();
ip = IpInterceptingNetworkInterceptor.getIpFromResponse(response);
responseDataBuffer.put(responseHeaders);
responseDataBuffer.updateDigest(responseDigestBuilder, 0, responseHeaders.length);
@ -125,17 +129,15 @@ public class WarcRecorder implements AutoCloseable {
// It looks like this might be the same as requestUri, but it's not;
// it's the URI after resolving redirects.
final URI responseUri = response.request().url().uri();
final URI responseUri = response.uri();
WarcResponse.Builder responseBuilder = new WarcResponse.Builder(responseUri)
.blockDigest(responseDigestBuilder.build())
.date(date)
.body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());
cookieInformation.paint(responseBuilder);
if (ip != null) responseBuilder.ipAddress(InetAddress.getByName(ip));
InetAddress inetAddress = InetAddress.getByName(responseUri.getHost());
responseBuilder.ipAddress(inetAddress);
responseBuilder.payloadDigest(payloadDigestBuilder.build());
responseBuilder.truncated(inputBuffer.truncationReason());
@ -152,8 +154,8 @@ public class WarcRecorder implements AutoCloseable {
byte[] httpRequestString = WarcProtocolReconstructor
.getHttpRequestString(
response.request().method(),
response.request().headers().toMultimap(),
request.headers().toMultimap(),
response.request().headers().map(),
extraHeaders,
requestUri)
.getBytes();
@ -171,7 +173,7 @@ public class WarcRecorder implements AutoCloseable {
if (Duration.between(date, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
&& inputBuffer.size() < 2048
&& !request.url().encodedPath().endsWith("robots.txt")) // don't bail on robots.txt
&& !request.uri().getPath().endsWith("robots.txt")) // don't bail on robots.txt
{
// Fast detection and mitigation of crawler traps that respond with slow
// small responses, with a high branching factor
@ -189,9 +191,9 @@ public class WarcRecorder implements AutoCloseable {
}
return new HttpFetchResult.ResultOk(responseUri,
response.code(),
response.statusCode(),
inputBuffer.headers(),
ip,
inetAddress.getHostAddress(),
responseDataBuffer.data,
dataStart,
responseDataBuffer.length() - dataStart);
@ -267,7 +269,9 @@ public class WarcRecorder implements AutoCloseable {
.date(Instant.now())
.body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());
cookieInformation.paint(builder);
if (cookies.hasCookies()) {
builder.addHeader("X-Has-Cookies", "1");
}
var reference = builder.build();
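
Putting the new constructor and fetch signature together, roughly as the tests further down do; the WARC path is a placeholder, and the surrounding method would declare throws Exception for the recorder's checked exceptions.

var cookies = new Cookies();
var httpClient = HttpClient.newBuilder().cookieHandler(cookies).build();

try (var recorder = new WarcRecorder(Path.of("crawl-data.warc.gz"), cookies)) {
    recorder.fetch(httpClient, HttpRequest.newBuilder()
            .uri(URI.create("https://www.marginalia.nu/"))
            .header("User-agent", "test.marginalia.nu")
            .header("Accept-Encoding", "gzip")
            .GET().build());
}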

View File

@ -12,7 +12,6 @@ import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.logic.LinkFilterSelector;
import nu.marginalia.crawl.retreival.revisit.CrawlerRevisitor;
import nu.marginalia.crawl.retreival.revisit.DocumentWithReference;
import nu.marginalia.crawl.retreival.sitemap.SitemapFetcher;
import nu.marginalia.ip_blocklist.UrlBlocklist;
import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.model.EdgeDomain;
@ -53,7 +52,6 @@ public class CrawlerRetreiver implements AutoCloseable {
private final WarcRecorder warcRecorder;
private final CrawlerRevisitor crawlerRevisitor;
private final SitemapFetcher sitemapFetcher;
int errorCount = 0;
public CrawlerRetreiver(HttpFetcher fetcher,
@ -71,7 +69,6 @@ public class CrawlerRetreiver implements AutoCloseable {
crawlFrontier = new DomainCrawlFrontier(new EdgeDomain(domain), specs.urls(), specs.crawlDepth());
crawlerRevisitor = new CrawlerRevisitor(crawlFrontier, this, warcRecorder);
sitemapFetcher = new SitemapFetcher(crawlFrontier, fetcher.createSitemapRetriever());
// We must always crawl the index page first, this is assumed when fingerprinting the server
var fst = crawlFrontier.peek();
@ -145,9 +142,11 @@ public class CrawlerRetreiver implements AutoCloseable {
// Add external links to the crawl frontier
crawlFrontier.addAllToQueue(domainLinks.getUrls(rootUrl.proto));
// Add links from the sitemap to the crawl frontier
sitemapFetcher.downloadSitemaps(robotsRules, rootUrl);
// Fetch sitemaps
for (var sitemap : robotsRules.getSitemaps()) {
crawlFrontier.addAllToQueue(fetcher.fetchSitemapUrls(sitemap, delayTimer));
}
while (!crawlFrontier.isEmpty()
&& !crawlFrontier.isCrawlDepthReached()
@ -271,10 +270,7 @@ public class CrawlerRetreiver implements AutoCloseable {
}
// Download the sitemap if available
if (feedLink.isPresent()) {
sitemapFetcher.downloadSitemaps(List.of(feedLink.get()));
timer.waitFetchDelay(0);
}
feedLink.ifPresent(s -> fetcher.fetchSitemapUrls(s, timer));
// Grab the favicon if it exists
fetchWithRetry(faviconUrl, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());

View File

@ -1,72 +0,0 @@
package nu.marginalia.crawl.retreival.sitemap;
import crawlercommons.robots.SimpleRobotRules;
import nu.marginalia.crawl.fetcher.SitemapRetriever;
import nu.marginalia.crawl.retreival.DomainCrawlFrontier;
import nu.marginalia.model.EdgeUrl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.HashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;
public class SitemapFetcher {
private final DomainCrawlFrontier crawlFrontier;
private final SitemapRetriever sitemapRetriever;
private static final Logger logger = LoggerFactory.getLogger(SitemapFetcher.class);
public SitemapFetcher(DomainCrawlFrontier crawlFrontier, SitemapRetriever sitemapRetriever) {
this.crawlFrontier = crawlFrontier;
this.sitemapRetriever = sitemapRetriever;
}
public void downloadSitemaps(SimpleRobotRules robotsRules, EdgeUrl rootUrl) {
List<String> urls = robotsRules.getSitemaps();
if (urls.isEmpty()) {
urls = List.of(rootUrl.withPathAndParam("/sitemap.xml", null).toString());
}
downloadSitemaps(urls);
}
public void downloadSitemaps(List<String> urls) {
Set<String> checkedSitemaps = new HashSet<>();
for (var rawUrl : urls) {
Optional<EdgeUrl> parsedUrl = EdgeUrl.parse(rawUrl);
if (parsedUrl.isEmpty()) {
continue;
}
EdgeUrl url = parsedUrl.get();
// Let's not download sitemaps from other domains for now
if (!crawlFrontier.isSameDomain(url)) {
continue;
}
if (checkedSitemaps.contains(url.path))
continue;
var sitemap = sitemapRetriever.fetchSitemap(url);
if (sitemap.isEmpty()) {
continue;
}
// ensure we don't try to download this sitemap again
// (don't move this up, as we may want to check the same
// path with different protocols until we find one that works)
checkedSitemaps.add(url.path);
crawlFrontier.addAllToQueue(sitemap);
}
logger.debug("Queue is now {}", crawlFrontier.queueSize());
}
}

View File

@ -36,7 +36,6 @@ dependencies {
implementation libs.gson
implementation libs.commons.io
implementation libs.commons.lang3
implementation libs.okhttp3
implementation libs.jsoup
implementation libs.snakeyaml
implementation libs.zstd

View File

@ -1,17 +1,17 @@
package nu.marginalia.model.body;
import nu.marginalia.contenttype.ContentType;
import okhttp3.Headers;
import org.jetbrains.annotations.Nullable;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.netpreserve.jwarc.MessageHeaders;
import org.netpreserve.jwarc.WarcResponse;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.InetAddress;
import java.net.URI;
import java.net.http.HttpHeaders;
import java.util.Optional;
/* FIXME: This interface has a very unfortunate name that is not very descriptive.
@ -56,42 +56,26 @@ public sealed interface HttpFetchResult {
*/
record ResultOk(URI uri,
int statusCode,
Headers headers,
HttpHeaders headers,
String ipAddress,
byte[] bytesRaw,
int bytesStart,
int bytesLength
) implements HttpFetchResult {
public ResultOk(URI uri, int status, MessageHeaders headers, String ipAddress, byte[] bytes, int bytesStart, int length) {
this(uri, status, HttpHeaders.of(headers.map(), (k,v) -> true), ipAddress, bytes, bytesStart, length);
}
public boolean isOk() {
return statusCode >= 200 && statusCode < 300;
}
public ResultOk(URI uri,
int statusCode,
MessageHeaders headers,
String ipAddress,
byte[] bytesRaw,
int bytesStart,
int bytesLength) {
this(uri, statusCode, convertHeaders(headers), ipAddress, bytesRaw, bytesStart, bytesLength);
}
private static Headers convertHeaders(MessageHeaders headers) {
var ret = new Headers.Builder();
for (var header : headers.map().entrySet()) {
for (var value : header.getValue()) {
ret.add(header.getKey(), value);
}
}
return ret.build();
}
public InputStream getInputStream() {
return new ByteArrayInputStream(bytesRaw, bytesStart, bytesLength);
}
public Optional<Document> parseDocument() throws IOException {
public Optional<Document> parseDocument() {
return DocumentBodyExtractor.asString(this).flatMapOpt((contentType, body) -> {
if (contentType.is("text/html")) {
return Optional.of(Jsoup.parse(body));
@ -102,8 +86,9 @@ public sealed interface HttpFetchResult {
});
}
@Nullable
public String header(String name) {
return headers.get(name);
return headers.firstValue(name).orElse(null);
}
}

View File

@ -165,27 +165,28 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
contentType = "";
}
String headersStr = null;
StringJoiner headersStrBuilder = new StringJoiner("\n");
for (var header : headers) {
headersStrBuilder.add(header.getFirst() + ": " + header.getSecond());
for (var header : headers.map().entrySet()) {
for (var value : header.getValue()) {
headersStrBuilder.add(header.getKey() + ": " + value);
}
headersStr = headersStrBuilder.toString();
}
String headersStr = headersStrBuilder.toString();
write(new CrawledDocumentParquetRecord(
domain,
response.target(),
fetchOk.ipAddress(),
WarcXCookieInformationHeader.hasCookies(response),
headers.firstValue("X-Has-Cookies").orElse("0").equals("1"),
fetchOk.statusCode(),
response.date(),
contentType,
bodyBytes,
headersStr,
headers.get("ETag"),
headers.get("Last-Modified"))
);
headers.firstValue("ETag").orElse(null),
headers.firstValue("Last-Modified").orElse(null)
));
}

View File

@ -1,35 +0,0 @@
package org.netpreserve.jwarc;
import okhttp3.HttpUrl;
import okhttp3.OkHttpClient;
/** Encapsulates out-of-band information about whether a website uses cookies,
* using a non-standard WARC header "X-Has-Cookies".
*/
public class WarcXCookieInformationHeader {
private boolean hasCookies = false;
private static final String headerName = "X-Has-Cookies";
public void update(OkHttpClient client, HttpUrl url) {
if (!hasCookies) {
hasCookies = !client.cookieJar().loadForRequest(url).isEmpty();
}
}
public boolean hasCookies() {
return hasCookies;
}
public void paint(WarcResponse.Builder builder) {
builder.addHeader(headerName, hasCookies ? "1" : "0");
}
public void paint(WarcXResponseReference.Builder builder) {
builder.addHeader(headerName, hasCookies ? "1" : "0");
}
public static boolean hasCookies(WarcRecord record) {
return record.headers().contains(headerName, "1");
}
}

View File

@ -1,11 +1,9 @@
package nu.marginalia.crawl.retreival;
import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.crawl.fetcher.Cookies;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@ -15,6 +13,8 @@ import org.netpreserve.jwarc.WarcResponse;
import java.io.IOException;
import java.net.URISyntaxException;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.nio.file.Files;
import java.nio.file.Path;
import java.security.NoSuchAlgorithmException;
@ -27,11 +27,10 @@ import static org.junit.jupiter.api.Assertions.fail;
class CrawlerWarcResynchronizerTest {
Path fileName;
Path outputFile;
OkHttpClient httpClient;
HttpClient httpClient;
@BeforeEach
public void setUp() throws Exception {
httpClient = new OkHttpClient.Builder()
.addNetworkInterceptor(new IpInterceptingNetworkInterceptor())
httpClient = HttpClient.newBuilder()
.build();
fileName = Files.createTempFile("test", ".warc.gz");
@ -46,7 +45,7 @@ class CrawlerWarcResynchronizerTest {
@Test
void run() throws IOException, URISyntaxException {
try (var oldRecorder = new WarcRecorder(fileName)) {
try (var oldRecorder = new WarcRecorder(fileName, new Cookies())) {
fetchUrl(oldRecorder, "https://www.marginalia.nu/");
fetchUrl(oldRecorder, "https://www.marginalia.nu/log/");
fetchUrl(oldRecorder, "https://www.marginalia.nu/feed/");
@ -56,7 +55,7 @@ class CrawlerWarcResynchronizerTest {
var crawlFrontier = new DomainCrawlFrontier(new EdgeDomain("www.marginalia.nu"), List.of(), 100);
try (var newRecorder = new WarcRecorder(outputFile)) {
try (var newRecorder = new WarcRecorder(outputFile, new Cookies())) {
new CrawlerWarcResynchronizer(crawlFrontier, newRecorder).run(fileName);
}
@ -79,10 +78,11 @@ class CrawlerWarcResynchronizerTest {
}
void fetchUrl(WarcRecorder recorder, String url) throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
var req = new Request.Builder().url(url)
.addHeader("User-agent", "test.marginalia.nu")
.addHeader("Accept-Encoding", "gzip")
.get().build();
var req = HttpRequest.newBuilder()
.uri(new java.net.URI(url))
.header("User-agent", "test.marginalia.nu")
.header("Accept-Encoding", "gzip")
.GET().build();
recorder.fetch(httpClient, req);
}
}

View File

@ -2,6 +2,7 @@ package nu.marginalia.crawl.retreival.fetcher;
import com.sun.net.httpserver.HttpServer;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.Cookies;
import nu.marginalia.crawl.fetcher.HttpFetcher;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
@ -79,7 +80,7 @@ class ContentTypeProberTest {
htmlRedirEndpoint = EdgeUrl.parse("http://localhost:" + port + "/redir.gz").get();
fetcher = new HttpFetcherImpl("test");
recorder = new WarcRecorder(warcFile);
recorder = new WarcRecorder(warcFile, new Cookies());
}
@AfterEach

View File

@ -2,13 +2,11 @@ package nu.marginalia.crawl.retreival.fetcher;
import nu.marginalia.UserAgent;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.crawl.fetcher.Cookies;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@ -19,6 +17,8 @@ import org.netpreserve.jwarc.WarcXResponseReference;
import java.io.IOException;
import java.net.URISyntaxException;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.nio.file.Files;
import java.nio.file.Path;
import java.security.NoSuchAlgorithmException;
@ -31,17 +31,16 @@ class WarcRecorderTest {
Path fileNameWarc;
Path fileNameParquet;
WarcRecorder client;
OkHttpClient httpClient;
HttpClient httpClient;
@BeforeEach
public void setUp() throws Exception {
httpClient = new OkHttpClient.Builder()
.addNetworkInterceptor(new IpInterceptingNetworkInterceptor())
.build();
httpClient = HttpClient.newBuilder().build();
fileNameWarc = Files.createTempFile("test", ".warc");
fileNameParquet = Files.createTempFile("test", ".parquet");
client = new WarcRecorder(fileNameWarc);
client = new WarcRecorder(fileNameWarc, new Cookies());
}
@AfterEach
@ -52,10 +51,13 @@ class WarcRecorderTest {
@Test
void fetch() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
client.fetch(httpClient, new Request.Builder().url("https://www.marginalia.nu/")
.addHeader("User-agent", "test.marginalia.nu")
.addHeader("Accept-Encoding", "gzip")
.get().build());
client.fetch(httpClient,
HttpRequest.newBuilder()
.uri(new java.net.URI("https://www.marginalia.nu/"))
.header("User-agent", "test.marginalia.nu")
.header("Accept-Encoding", "gzip")
.GET().build()
);
Map<String, String> sampleData = new HashMap<>();
try (var warcReader = new WarcReader(fileNameWarc)) {
@ -76,7 +78,7 @@ class WarcRecorderTest {
@Test
public void flagAsSkipped() throws IOException, URISyntaxException {
try (var recorder = new WarcRecorder(fileNameWarc)) {
try (var recorder = new WarcRecorder(fileNameWarc, new Cookies())) {
recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
"text/html",
200,
@ -100,7 +102,7 @@ class WarcRecorderTest {
@Test
public void flagAsSkippedNullBody() throws IOException, URISyntaxException {
try (var recorder = new WarcRecorder(fileNameWarc)) {
try (var recorder = new WarcRecorder(fileNameWarc, new Cookies())) {
recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
"text/html",
200,
@ -112,7 +114,7 @@ class WarcRecorderTest {
@Test
public void testSaveImport() throws URISyntaxException, IOException {
try (var recorder = new WarcRecorder(fileNameWarc)) {
try (var recorder = new WarcRecorder(fileNameWarc, new Cookies())) {
recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
"text/html",
200,
@ -136,19 +138,23 @@ class WarcRecorderTest {
@Test
public void testConvertToParquet() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
client.fetch(httpClient, new Request.Builder().url("https://www.marginalia.nu/")
.addHeader("User-agent", "test.marginalia.nu")
.addHeader("Accept-Encoding", "gzip")
.get().build());
client.fetch(httpClient, new Request.Builder().url("https://www.marginalia.nu/log/")
.addHeader("User-agent", "test.marginalia.nu")
.addHeader("Accept-Encoding", "gzip")
.get().build());
client.fetch(httpClient, new Request.Builder().url("https://www.marginalia.nu/sanic.png")
.addHeader("User-agent", "test.marginalia.nu")
.addHeader("Accept-Encoding", "gzip")
.get().build());
client.close();
client.fetch(httpClient, HttpRequest.newBuilder()
.uri(new java.net.URI("https://www.marginalia.nu/"))
.header("User-agent", "test.marginalia.nu")
.header("Accept-Encoding", "gzip")
.GET().build());
client.fetch(httpClient, HttpRequest.newBuilder()
.uri(new java.net.URI("https://www.marginalia.nu/log/"))
.header("User-agent", "test.marginalia.nu")
.header("Accept-Encoding", "gzip")
.GET().build());
client.fetch(httpClient, HttpRequest.newBuilder()
.uri(new java.net.URI("https://www.marginalia.nu/sanic.png"))
.header("User-agent", "test.marginalia.nu")
.header("Accept-Encoding", "gzip")
.GET().build());
CrawledDocumentParquetRecordFileWriter.convertWarc(
"www.marginalia.nu",

View File

@ -4,6 +4,7 @@ import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.HttpFetcher;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.ContentTypeLogic;
import nu.marginalia.model.body.DocumentBodyExtractor;
@ -37,6 +38,12 @@ class HttpFetcherTest {
}
}
@Test
void testSitemapMarginalia() {
var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
fetcher.fetchSitemapUrls("https://www.marginalia.nu/sitemap.xml", new CrawlDelayTimer(1)).forEach(System.out::println);
}
@Test
void fetchText() throws Exception {
var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");

View File

@ -3,11 +3,9 @@ package nu.marginalia.crawling.retreival;
import crawlercommons.robots.SimpleRobotRules;
import nu.marginalia.crawl.CrawlerMain;
import nu.marginalia.crawl.DomainStateDb;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.HttpFetcher;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.SitemapRetriever;
import nu.marginalia.crawl.fetcher.*;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.DomainProber;
import nu.marginalia.model.EdgeDomain;
@ -17,7 +15,6 @@ import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.model.crawldata.CrawlerDocumentStatus;
import nu.marginalia.model.crawldata.SerializableCrawlData;
import nu.marginalia.test.CommonTestData;
import okhttp3.Headers;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@ -27,6 +24,7 @@ import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.URISyntaxException;
import java.net.http.HttpHeaders;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
@ -122,7 +120,7 @@ public class CrawlerMockFetcherTest {
public void setAllowAllContentTypes(boolean allowAllContentTypes) {}
@Override
public List<String> getCookies() { return List.of();}
public Cookies getCookies() { return new Cookies();}
@Override
public void clearCookies() {}
@ -149,7 +147,7 @@ public class CrawlerMockFetcherTest {
return new HttpFetchResult.ResultOk(
url.asURI(),
200,
new Headers.Builder().build(),
HttpHeaders.of(Map.of(), (k,v)->true),
"127.0.0.1",
bodyBytes,
0,
@ -164,6 +162,11 @@ public class CrawlerMockFetcherTest {
return new HttpFetchResult.ResultNone();
}
@Override
public List<EdgeUrl> fetchSitemapUrls(String rootSitemapUrl, CrawlDelayTimer delayTimer) {
return List.of();
}
@Override
public SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder) {
return new SimpleRobotRules();
@ -174,5 +177,9 @@ public class CrawlerMockFetcherTest {
return Mockito.mock(SitemapRetriever.class);
}
@Override
public void close() {
}
}
}

View File

@ -5,6 +5,7 @@ import nu.marginalia.WmsaHome;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.crawl.CrawlerMain;
import nu.marginalia.crawl.DomainStateDb;
import nu.marginalia.crawl.fetcher.Cookies;
import nu.marginalia.crawl.fetcher.HttpFetcher;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
@ -180,7 +181,7 @@ class CrawlerRetreiverTest {
new EdgeDomain("www.marginalia.nu"),
List.of(), 100);
var resync = new CrawlerWarcResynchronizer(revisitCrawlFrontier,
new WarcRecorder(tempFileWarc2)
new WarcRecorder(tempFileWarc2, new Cookies())
);
// truncate the size of the file to simulate a crash
@ -458,7 +459,7 @@ class CrawlerRetreiverTest {
List.of(), 100);
var resync = new CrawlerWarcResynchronizer(revisitCrawlFrontier,
new WarcRecorder(tempFileWarc3)
new WarcRecorder(tempFileWarc3, new Cookies())
);
// truncate the size of the file to simulate a crash
@ -509,7 +510,7 @@ class CrawlerRetreiverTest {
}
private void doCrawlWithReferenceStream(CrawlerMain.CrawlSpecRecord specs, SerializableCrawlDataStream stream) {
try (var recorder = new WarcRecorder(tempFileWarc2);
try (var recorder = new WarcRecorder(tempFileWarc2, new Cookies());
var db = new DomainStateDb(tempFileDb)
) {
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder).crawlDomain(new DomainLinks(),
@ -522,7 +523,7 @@ class CrawlerRetreiverTest {
@NotNull
private DomainCrawlFrontier doCrawl(Path tempFileWarc1, CrawlerMain.CrawlSpecRecord specs) {
try (var recorder = new WarcRecorder(tempFileWarc1);
try (var recorder = new WarcRecorder(tempFileWarc1, new Cookies());
var db = new DomainStateDb(tempFileDb)
) {
var crawler = new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder);

View File

@ -56,7 +56,6 @@ dependencies {
implementation libs.zstd
implementation libs.jwarc
implementation libs.crawlercommons
implementation libs.okhttp3
implementation libs.jsoup
implementation libs.opencsv
implementation libs.fastutil

View File

@ -10,6 +10,7 @@ import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
import nu.marginalia.converting.processor.DomainProcessor;
import nu.marginalia.converting.writer.ConverterBatchWriter;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.Cookies;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.functions.searchquery.QueryFactory;
@ -120,7 +121,7 @@ public class IntegrationTest {
public void run() throws Exception {
/** CREATE WARC */
try (WarcRecorder warcRecorder = new WarcRecorder(warcData)) {
try (WarcRecorder warcRecorder = new WarcRecorder(warcData, new Cookies())) {
warcRecorder.writeWarcinfoHeader("127.0.0.1", new EdgeDomain("www.example.com"),
new HttpFetcherImpl.DomainProbeResult.Ok(new EdgeUrl("https://www.example.com/")));