From 8a891c215930df3ae4cb3d5813505dc938f7ec83 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 22 Apr 2024 12:34:28 +0200 Subject: [PATCH] (crawler/converter) Remove legacy junk from parquet migration --- .../nu/marginalia/extractor/AtagExporter.java | 2 +- .../nu/marginalia/extractor/FeedExporter.java | 2 +- .../extractor/TermFrequencyExporter.java | 2 +- .../crawling/body/ContentTypeLogic.java | 4 +- .../crawling/io/CrawledDomainReader.java | 33 +---- .../crawling/io/CrawledDomainWriter.java | 66 ---------- .../crawling/io/CrawlerOutputFile.java | 27 ----- ...ibleLegacySerializableCrawlDataStream.java | 113 ------------------ ...FastLegacySerializableCrawlDataStream.java | 74 ------------ .../ParquetSerializableCrawlDataStream.java | 31 ++--- .../crawling/model/CrawledDocument.java | 6 - .../crawling/model/CrawledDomain.java | 9 -- .../crawling/model/SerializableCrawlData.java | 1 - ...rawledDocumentParquetRecordFileReader.java | 1 - ...rawledDocumentParquetRecordFileWriter.java | 2 - .../crawling-model/java/plan/CrawlPlan.java | 105 ---------------- .../java/plan/CrawlPlanLoader.java | 25 ---- ...edDocumentParquetRecordFileWriterTest.java | 34 +++++- .../marginalia/converting/ConverterMain.java | 76 ++++++++++-- .../converting/model/CrawlPlan.java | 15 +++ .../marginalia/converting/model/WorkDir.java | 13 ++ .../java/nu/marginalia/crawl/CrawlerMain.java | 2 +- .../crawl/retreival/CrawlerRetreiver.java | 1 + .../retreival/fetcher/HttpFetcherImpl.java | 1 - .../retreival/revisit/CrawlerRevisitor.java | 28 +++-- .../crawling/CrawlPlanLoaderTest.java | 51 -------- .../retreival/CrawlerRetreiverTest.java | 16 +-- code/tools/crawl-data-unfcker/build.gradle | 57 --------- .../nu/marginalia/tools/CrawlDataUnfcker.java | 75 ------------ code/tools/crawl-data-unfcker/readme.md | 3 - .../tools/ExperimentRunnerMain.java | 2 +- settings.gradle | 1 - 32 files changed, 175 insertions(+), 703 deletions(-) delete mode 100644 code/process-models/crawling-model/java/nu/marginalia/crawling/io/CrawledDomainWriter.java delete mode 100644 code/process-models/crawling-model/java/nu/marginalia/crawling/io/format/CompatibleLegacySerializableCrawlDataStream.java delete mode 100644 code/process-models/crawling-model/java/nu/marginalia/crawling/io/format/FastLegacySerializableCrawlDataStream.java delete mode 100644 code/process-models/crawling-model/java/plan/CrawlPlan.java delete mode 100644 code/process-models/crawling-model/java/plan/CrawlPlanLoader.java create mode 100644 code/processes/converting-process/java/nu/marginalia/converting/model/CrawlPlan.java create mode 100644 code/processes/converting-process/java/nu/marginalia/converting/model/WorkDir.java delete mode 100644 code/processes/crawling-process/test/nu/marginalia/crawling/CrawlPlanLoaderTest.java delete mode 100644 code/tools/crawl-data-unfcker/build.gradle delete mode 100644 code/tools/crawl-data-unfcker/java/nu/marginalia/tools/CrawlDataUnfcker.java delete mode 100644 code/tools/crawl-data-unfcker/readme.md diff --git a/code/features-convert/data-extractors/java/nu/marginalia/extractor/AtagExporter.java b/code/features-convert/data-extractors/java/nu/marginalia/extractor/AtagExporter.java index 3db0a284..acc3a417 100644 --- a/code/features-convert/data-extractors/java/nu/marginalia/extractor/AtagExporter.java +++ b/code/features-convert/data-extractors/java/nu/marginalia/extractor/AtagExporter.java @@ -61,7 +61,7 @@ public class AtagExporter implements ExporterIf { } Path crawlDataPath = inputDir.resolve(item.relPath()); - try (var stream = 
CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.FAST, crawlDataPath)) {
+        try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) {
             exportLinks(tagWriter, stream);
         }
         catch (Exception ex) {
diff --git a/code/features-convert/data-extractors/java/nu/marginalia/extractor/FeedExporter.java b/code/features-convert/data-extractors/java/nu/marginalia/extractor/FeedExporter.java
index 28a29906..fa925b39 100644
--- a/code/features-convert/data-extractors/java/nu/marginalia/extractor/FeedExporter.java
+++ b/code/features-convert/data-extractors/java/nu/marginalia/extractor/FeedExporter.java
@@ -58,7 +58,7 @@ public class FeedExporter implements ExporterIf {
             }
 
             Path crawlDataPath = inputDir.resolve(item.relPath());
-            try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.COMPATIBLE, crawlDataPath)) {
+            try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) {
                 exportFeeds(tagWriter, stream);
             }
             catch (Exception ex) {
diff --git a/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java b/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java
index 1e1a2cd5..18fb3261 100644
--- a/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java
+++ b/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java
@@ -99,7 +99,7 @@ public class TermFrequencyExporter implements ExporterIf {
     private void processFile(Path crawlDataPath, TLongIntHashMap counts, AtomicInteger docCount, SentenceExtractor se) {
         TLongHashSet words = new TLongHashSet(10_000);
 
-        try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.FAST, crawlDataPath)) {
+        try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) {
             while (stream.hasNext()) {
                 if (Thread.interrupted())
                     return;
diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/body/ContentTypeLogic.java b/code/process-models/crawling-model/java/nu/marginalia/crawling/body/ContentTypeLogic.java
index d884dbe5..25d4c8ec 100644
--- a/code/process-models/crawling-model/java/nu/marginalia/crawling/body/ContentTypeLogic.java
+++ b/code/process-models/crawling-model/java/nu/marginalia/crawling/body/ContentTypeLogic.java
@@ -10,7 +10,7 @@ import java.util.regex.Pattern;
 
 public class ContentTypeLogic {
 
-    private static final Predicate<String> probableHtmlPattern = Pattern.compile("^.*\\.(htm|html|php|txt)$").asMatchPredicate();
+    private static final Predicate<String> probableHtmlPattern = Pattern.compile("^.*\\.(htm|html|php|txt|md)$").asMatchPredicate();
     private static final Predicate<String> probableBinaryPattern = Pattern.compile("^.*\\.[a-z]+$").asMatchPredicate();
     private static final Set<String> blockedContentTypes = Set.of("text/css", "text/javascript");
     private static final List<String> acceptedContentTypePrefixes = List.of(
@@ -29,6 +29,7 @@ public class ContentTypeLogic {
         this.allowAllContentTypes = allowAllContentTypes;
     }
 
+    /** Returns true if the URL is likely to be a binary file, based on the URL path. 
*/
     public boolean isUrlLikeBinary(EdgeUrl url) {
         String pathLowerCase = url.path.toLowerCase();
 
@@ -41,6 +42,7 @@ public class ContentTypeLogic {
     public boolean isAllowableContentType(ContentType contentType) {
         return isAllowableContentType(contentType.contentType());
     }
+
     public boolean isAllowableContentType(String contentType) {
         if (allowAllContentTypes)
             return true;
diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/io/CrawledDomainReader.java b/code/process-models/crawling-model/java/nu/marginalia/crawling/io/CrawledDomainReader.java
index dfd6415c..3f8123b2 100644
--- a/code/process-models/crawling-model/java/nu/marginalia/crawling/io/CrawledDomainReader.java
+++ b/code/process-models/crawling-model/java/nu/marginalia/crawling/io/CrawledDomainReader.java
@@ -1,41 +1,18 @@
 package nu.marginalia.crawling.io;
 
-import com.google.gson.Gson;
-import nu.marginalia.crawling.io.format.CompatibleLegacySerializableCrawlDataStream;
-import nu.marginalia.crawling.io.format.FastLegacySerializableCrawlDataStream;
 import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream;
-import nu.marginalia.model.gson.GsonFactory;
 
 import java.io.*;
 import java.nio.file.Files;
 import java.nio.file.Path;
 
 public class CrawledDomainReader {
-    private static final Gson gson = GsonFactory.get();
-
-    public CrawledDomainReader() {
-    }
-
-    public enum CompatibilityLevel {
-        /** Data order emulates the ordering of the new format. This is slower */
-        COMPATIBLE,
-        /** Data order is not compatible with the new format, but the data itself is */
-        FAST,
-        /** Alias for FAST */
-        ANY
-    }
 
     /** An iterator-like access to domain data. This must be closed otherwise it will leak off-heap memory! */
-    public static SerializableCrawlDataStream createDataStream(CompatibilityLevel compatibilityLevel,
-                                                               Path fullPath) throws IOException
+    public static SerializableCrawlDataStream createDataStream(Path fullPath) throws IOException
     {
         String fileName = fullPath.getFileName().toString();
 
-        if (fileName.endsWith(".zstd")) {
-            if (compatibilityLevel == CompatibilityLevel.COMPATIBLE)
-                return new CompatibleLegacySerializableCrawlDataStream(gson, fullPath.toFile());
-            else // if (compatibilityLevel == CompatibilityLevel.FAST or ANY)
-                return new FastLegacySerializableCrawlDataStream(gson, fullPath.toFile());
-        }
-        else if (fileName.endsWith(".parquet")) {
+        if (fileName.endsWith(".parquet")) {
             return new ParquetSerializableCrawlDataStream(fullPath);
         }
         else {
@@ -44,14 +21,14 @@ public class CrawledDomainReader {
     }
 
     /** An iterator-like access to domain data. This must be closed otherwise it will leak off-heap memory! 
*/ - public static SerializableCrawlDataStream createDataStream(CompatibilityLevel level, Path basePath, String domain, String id) throws IOException { + public static SerializableCrawlDataStream createDataStream(Path basePath, String domain, String id) throws IOException { Path parquetPath = CrawlerOutputFile.getParquetPath(basePath, id, domain); if (Files.exists(parquetPath)) { - return createDataStream(level, parquetPath); + return createDataStream(parquetPath); } else { - return createDataStream(level, CrawlerOutputFile.getLegacyOutputFile(basePath, id, domain)); + throw new FileNotFoundException("No such file: " + parquetPath); } } diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/io/CrawledDomainWriter.java b/code/process-models/crawling-model/java/nu/marginalia/crawling/io/CrawledDomainWriter.java deleted file mode 100644 index f21715ee..00000000 --- a/code/process-models/crawling-model/java/nu/marginalia/crawling/io/CrawledDomainWriter.java +++ /dev/null @@ -1,66 +0,0 @@ -package nu.marginalia.crawling.io; - -import com.github.luben.zstd.RecyclingBufferPool; -import com.github.luben.zstd.ZstdOutputStream; -import com.google.gson.Gson; -import lombok.SneakyThrows; -import nu.marginalia.crawling.model.SerializableCrawlData; -import nu.marginalia.model.gson.GsonFactory; - -import java.io.BufferedOutputStream; -import java.io.IOException; -import java.io.OutputStreamWriter; -import java.io.Writer; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.StandardCopyOption; -import java.nio.file.StandardOpenOption; - -public class CrawledDomainWriter implements AutoCloseable { - private final Path outputDir; - private final Gson gson = GsonFactory.get(); - private final Writer writer; - private final Path tmpFile; - private final Path actualFile; - - public CrawledDomainWriter(Path outputDir, String domain, String id) throws IOException { - this.outputDir = outputDir; - - if (!Files.isDirectory(outputDir)) { - throw new IllegalArgumentException("Output dir " + outputDir + " does not exist"); - } - - - // Do the actual writing to a temporary file first, then move it to the actual file when close() is invoked - // this lets us read the old file and compare its contents while writing the new file. It also guards against - // half-written files if the process is killed. 
- - tmpFile = getOutputFile(id, domain + "_tmp"); - actualFile = getOutputFile(id, domain); - writer = new OutputStreamWriter(new ZstdOutputStream(new BufferedOutputStream(Files.newOutputStream(tmpFile, - StandardOpenOption.CREATE, StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING)), - RecyclingBufferPool.INSTANCE)); - } - - public Path getOutputFile() { - return actualFile; - } - - @SneakyThrows - public void accept(SerializableCrawlData data) { - writer.write(data.getSerialIdentifier()); - writer.write('\n'); - gson.toJson(data, writer); - writer.write('\n'); - } - - private Path getOutputFile(String id, String name) throws IOException { - return CrawlerOutputFile.createLegacyOutputPath(outputDir, id, name); - } - - @Override - public void close() throws IOException { - Files.move(tmpFile, actualFile, StandardCopyOption.REPLACE_EXISTING); - writer.close(); - } -} diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/io/CrawlerOutputFile.java b/code/process-models/crawling-model/java/nu/marginalia/crawling/io/CrawlerOutputFile.java index 25673f13..05c4797e 100644 --- a/code/process-models/crawling-model/java/nu/marginalia/crawling/io/CrawlerOutputFile.java +++ b/code/process-models/crawling-model/java/nu/marginalia/crawling/io/CrawlerOutputFile.java @@ -8,33 +8,6 @@ import java.nio.file.Path; public class CrawlerOutputFile { - /** Return the Path to a file for the given id and name */ - public static Path getLegacyOutputFile(Path base, String id, String name) { - id = padId(id); - - String first = id.substring(0, 2); - String second = id.substring(2, 4); - - Path destDir = base.resolve(first).resolve(second); - return destDir.resolve(STR."\{id}-\{filesystemSafeName(name)}.zstd"); - } - - /** Return the Path to a file for the given id and name, creating the prerequisite - * directory structure as necessary. */ - public static Path createLegacyOutputPath(Path base, String id, String name) throws IOException { - id = padId(id); - - String first = id.substring(0, 2); - String second = id.substring(2, 4); - - Path destDir = base.resolve(first).resolve(second); - if (!Files.exists(destDir)) { - Files.createDirectories(destDir); - } - return destDir.resolve(STR."\{id}-\{filesystemSafeName(name)}.zstd"); - } - - private static String filesystemSafeName(String name) { StringBuilder nameSaneBuilder = new StringBuilder(); diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/io/format/CompatibleLegacySerializableCrawlDataStream.java b/code/process-models/crawling-model/java/nu/marginalia/crawling/io/format/CompatibleLegacySerializableCrawlDataStream.java deleted file mode 100644 index 76ecf7e7..00000000 --- a/code/process-models/crawling-model/java/nu/marginalia/crawling/io/format/CompatibleLegacySerializableCrawlDataStream.java +++ /dev/null @@ -1,113 +0,0 @@ -package nu.marginalia.crawling.io.format; - -import com.github.luben.zstd.RecyclingBufferPool; -import com.github.luben.zstd.ZstdInputStream; -import com.google.gson.Gson; -import nu.marginalia.crawling.io.SerializableCrawlDataStream; -import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.crawling.model.CrawledDomain; -import nu.marginalia.crawling.model.SerializableCrawlData; - -import java.io.*; -import java.nio.file.Path; - -import static java.util.Objects.*; - -/** This class is used to read the old format of crawl data, which was zstd-compressed JSON - * with type delimiters between records. It does its best to preserve the semantics of the - * new format. 
This is slow. - */ -public class CompatibleLegacySerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream { - private final Gson gson; - private final BufferedReader bufferedReader; - - private CrawledDomain domain; - private SerializableCrawlData next; - - private final Path path; - private int sizeHint; - - public CompatibleLegacySerializableCrawlDataStream(Gson gson, File file) throws IOException { - this.gson = gson; - path = file.toPath(); - domain = findDomain(file); - - bufferedReader = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(file), RecyclingBufferPool.INSTANCE))); - } - - @Override - public int sizeHint() { - return sizeHint; - } - - /** Scan through the file and find the domain record */ - private CrawledDomain findDomain(File file) throws IOException { - try (var br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(file), RecyclingBufferPool.INSTANCE)))) { - for (;;sizeHint++) { - String identifierLine = - requireNonNull(br.readLine(), "No identifier line found"); - String dataLine = - requireNonNull(br.readLine(), "No data line found"); - - if (identifierLine.equals(CrawledDomain.SERIAL_IDENTIFIER)) { - return gson.fromJson(dataLine, CrawledDomain.class); - } - } - } - } - - @Override - public Path path() { - return path; - } - - @Override - public SerializableCrawlData next() throws IOException { - if (hasNext()) { - if (domain != null) { - var ret = domain; - domain = null; - return ret; - } - else { - var ret = next; - next = null; - return ret; - } - } - throw new IllegalStateException("No more data"); - } - - @Override - public boolean hasNext() throws IOException { - if (domain != null || next != null) { - return true; - } - - String identifier = bufferedReader.readLine(); - if (identifier == null) { - bufferedReader.close(); - return false; - } - String data = bufferedReader.readLine(); - if (data == null) { - bufferedReader.close(); - return false; - } - - if (identifier.equals(CrawledDomain.SERIAL_IDENTIFIER)) { - next = null; - return false; // last record is expected to be the domain, so we're done - } else if (identifier.equals(CrawledDocument.SERIAL_IDENTIFIER)) { - next = gson.fromJson(data, CrawledDocument.class); - } else { - throw new IllegalStateException("Unknown identifier: " + identifier); - } - return true; - } - - @Override - public void close() throws Exception { - bufferedReader.close(); - } -} diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/io/format/FastLegacySerializableCrawlDataStream.java b/code/process-models/crawling-model/java/nu/marginalia/crawling/io/format/FastLegacySerializableCrawlDataStream.java deleted file mode 100644 index 09871cf4..00000000 --- a/code/process-models/crawling-model/java/nu/marginalia/crawling/io/format/FastLegacySerializableCrawlDataStream.java +++ /dev/null @@ -1,74 +0,0 @@ -package nu.marginalia.crawling.io.format; - -import com.github.luben.zstd.RecyclingBufferPool; -import com.github.luben.zstd.ZstdInputStream; -import com.google.gson.Gson; -import nu.marginalia.crawling.io.SerializableCrawlDataStream; -import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.crawling.model.CrawledDomain; -import nu.marginalia.crawling.model.SerializableCrawlData; - -import java.io.*; -import java.nio.file.Path; - -/** This class is used to read the old format of crawl data, which was zstd-compressed JSON - * with type delimiters between records. 
It does not preserve the semantics of the new format, - * but it is faster. - */ -public class FastLegacySerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream { - private final Gson gson; - private final BufferedReader bufferedReader; - private SerializableCrawlData next = null; - - private final Path path; - public FastLegacySerializableCrawlDataStream(Gson gson, File file) throws IOException { - this.gson = gson; - bufferedReader = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(file), RecyclingBufferPool.INSTANCE))); - path = file.toPath(); - } - - @Override - public Path path() { - return path; - } - @Override - public SerializableCrawlData next() throws IOException { - if (hasNext()) { - var ret = next; - next = null; - return ret; - } - throw new IllegalStateException("No more data"); - } - - @Override - public boolean hasNext() throws IOException { - if (next != null) - return true; - - String identifier = bufferedReader.readLine(); - if (identifier == null) { - bufferedReader.close(); - return false; - } - String data = bufferedReader.readLine(); - if (data == null) { - bufferedReader.close(); - return false; - } - - if (identifier.equals(CrawledDomain.SERIAL_IDENTIFIER)) { - next = gson.fromJson(data, CrawledDomain.class); - } else if (identifier.equals(CrawledDocument.SERIAL_IDENTIFIER)) { - next = gson.fromJson(data, CrawledDocument.class); - } else { - throw new IllegalStateException("Unknown identifier: " + identifier); - } - return true; - } - - @Override - public void close() throws Exception { - bufferedReader.close(); - } -} diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java b/code/process-models/crawling-model/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java index a5fa2d0d..e676e351 100644 --- a/code/process-models/crawling-model/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java +++ b/code/process-models/crawling-model/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java @@ -29,7 +29,6 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial public ParquetSerializableCrawlDataStream(Path file) throws IOException { path = file; - backingIterator = CrawledDocumentParquetRecordFileReader.stream(file).iterator(); } @@ -79,6 +78,10 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial String statusReason = ""; String redirectDomain = null; + + // The advisory content types are used to signal various states of the crawl + // that are not actual crawled documents. 
+ if (parquetRecord.contentType.equals("x-marginalia/advisory;state=redirect")) { EdgeUrl crawledUrl = new EdgeUrl(parquetRecord.url); redirectDomain = crawledUrl.getDomain().toString(); @@ -103,8 +106,6 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial )); } - private CrawledDocumentParquetRecord previousRecord = null; - private void createDocumentRecord(CrawledDocumentParquetRecord nextRecord) { String bodyString = ""; CrawlerDocumentStatus status = CrawlerDocumentStatus.OK; @@ -115,7 +116,8 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial else if (nextRecord.contentType.startsWith("x-marginalia/advisory;state=robots-txt-skipped")) { status = CrawlerDocumentStatus.ROBOTS_TXT; } - else if (nextRecord.contentType.startsWith("x-marginalia/advisory")) { // other advisory stuff we don't want + else if (nextRecord.contentType.startsWith("x-marginalia/advisory")) { + // we don't care about the other advisory content types here return; } else if (nextRecord.body != null) { @@ -135,21 +137,6 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial String etag = nextRecord.etagHeader; String lastModified = nextRecord.lastModifiedHeader; - // If we have a previous record, and it was a 304, and this one is a 200, we'll use the ETag and Last-Modified - // from the previous record, as it's not guaranteed the reference copy will have the same headers due to a bug - // in the crawler. The bug is fixed, but we still need to support old crawls. - // - // This was added in 2024-01-18, so we can remove it in a few months. - - if (previousRecord != null - && previousRecord.url.equals(nextRecord.url) - && previousRecord.httpStatus == 304 - && nextRecord.httpStatus == 200) - { - etag = previousRecord.etagHeader; - lastModified = previousRecord.lastModifiedHeader; - } - nextQ.add(new CrawledDocument("", nextRecord.url, nextRecord.contentType, @@ -166,13 +153,9 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial nextRecord.cookies, lastModified, etag)); - - previousRecord = nextRecord; } - public void close() throws IOException { - previousRecord = null; - } + public void close() throws IOException {} @Override public SerializableCrawlData next() throws IOException { diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/model/CrawledDocument.java b/code/process-models/crawling-model/java/nu/marginalia/crawling/model/CrawledDocument.java index bb344dfb..c809682a 100644 --- a/code/process-models/crawling-model/java/nu/marginalia/crawling/model/CrawledDocument.java +++ b/code/process-models/crawling-model/java/nu/marginalia/crawling/model/CrawledDocument.java @@ -87,12 +87,6 @@ public class CrawledDocument implements SerializableCrawlData { return getHeader("Last-Modified"); } - public static final String SERIAL_IDENTIFIER = "// DOCUMENT"; - @Override - public String getSerialIdentifier() { - return SERIAL_IDENTIFIER; - } - @Override public String getDomain() { if (url == null) diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/model/CrawledDomain.java b/code/process-models/crawling-model/java/nu/marginalia/crawling/model/CrawledDomain.java index 3add3b8d..adb59bda 100644 --- a/code/process-models/crawling-model/java/nu/marginalia/crawling/model/CrawledDomain.java +++ b/code/process-models/crawling-model/java/nu/marginalia/crawling/model/CrawledDomain.java @@ -27,13 +27,4 @@ public class CrawledDomain implements SerializableCrawlData { return 
doc.size(); } - public boolean hasCookies() { - return cookies != null && !cookies.isEmpty(); - } - - public static final String SERIAL_IDENTIFIER = "// DOMAIN"; - @Override - public String getSerialIdentifier() { - return SERIAL_IDENTIFIER; - } } diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/model/SerializableCrawlData.java b/code/process-models/crawling-model/java/nu/marginalia/crawling/model/SerializableCrawlData.java index 48b3f65d..01ecaf8d 100644 --- a/code/process-models/crawling-model/java/nu/marginalia/crawling/model/SerializableCrawlData.java +++ b/code/process-models/crawling-model/java/nu/marginalia/crawling/model/SerializableCrawlData.java @@ -1,6 +1,5 @@ package nu.marginalia.crawling.model; public interface SerializableCrawlData { - String getSerialIdentifier(); String getDomain(); } diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileReader.java b/code/process-models/crawling-model/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileReader.java index 31d644ec..362eb561 100644 --- a/code/process-models/crawling-model/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileReader.java +++ b/code/process-models/crawling-model/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileReader.java @@ -35,7 +35,6 @@ public class CrawledDocumentParquetRecordFileReader { public Integer finish(Integer target) { return target; } }), List.of("statusCode")) - .mapToInt(Integer::valueOf) .count(); } } diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java b/code/process-models/crawling-model/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java index 02f33efc..539ff28d 100644 --- a/code/process-models/crawling-model/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java +++ b/code/process-models/crawling-model/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java @@ -134,8 +134,6 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { return; } - - byte[] bodyBytes; String contentType; diff --git a/code/process-models/crawling-model/java/plan/CrawlPlan.java b/code/process-models/crawling-model/java/plan/CrawlPlan.java deleted file mode 100644 index 02164b60..00000000 --- a/code/process-models/crawling-model/java/plan/CrawlPlan.java +++ /dev/null @@ -1,105 +0,0 @@ -package plan; - -import lombok.AllArgsConstructor; -import lombok.NoArgsConstructor; -import lombok.ToString; -import nu.marginalia.crawling.io.CrawledDomainReader; -import nu.marginalia.crawling.io.SerializableCrawlDataStream; -import nu.marginalia.crawling.model.CrawledDomain; -import nu.marginalia.process.log.WorkLog; -import org.apache.logging.log4j.util.Strings; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.function.Predicate; -import java.util.Optional; - -@AllArgsConstructor @NoArgsConstructor @ToString -public class CrawlPlan { - private final Logger logger = LoggerFactory.getLogger(getClass()); - public String jobSpec; - public WorkDir crawl; - public WorkDir process; - - private final static String rootDirRewrite = System.getProperty("crawl.rootDirRewrite"); - - public Path getJobSpec() { - return Path.of(rewrite(jobSpec)); - } - - @AllArgsConstructor @NoArgsConstructor @ToString - public static 
class WorkDir {
-        public String dir;
-        public String logName;
-
-        public Path getDir() {
-            return Path.of(rewrite(dir));
-        }
-        public Path getLogFile() {
-            return Path.of(rewrite(dir)).resolve(logName);
-        }
-    }
-
-    private static String rewrite(String dir) {
-        if (rootDirRewrite == null) {
-            return dir;
-        }
-        String[] parts = rootDirRewrite.split(":");
-
-        return dir.replaceFirst(parts[0], parts[1]);
-    }
-
-    public Path getCrawledFilePath(String fileName) {
-        int sp = fileName.lastIndexOf('/');
-
-        // Normalize the filename
-        if (sp >= 0 && sp + 1< fileName.length())
-            fileName = fileName.substring(sp + 1);
-        if (fileName.length() < 4)
-            fileName = Strings.repeat("0", 4 - fileName.length()) + fileName;
-
-        String sp1 = fileName.substring(0, 2);
-        String sp2 = fileName.substring(2, 4);
-        return crawl.getDir().resolve(sp1).resolve(sp2).resolve(fileName);
-    }
-
-    public int countCrawledDomains() {
-        int count = 0;
-        for (var ignored : WorkLog.iterable(crawl.getLogFile())) {
-            count++;
-        }
-        return count;
-    }
-
-    @Deprecated
-    public Iterable<CrawledDomain> domainsIterable() {
-        // This is no longer supported
-        throw new UnsupportedOperationException();
-    }
-
-    public Iterable<SerializableCrawlDataStream> crawlDataIterable(Predicate<String> idPredicate) {
-        return WorkLog.iterableMap(crawl.getLogFile(),
-                entry -> {
-                    if (!idPredicate.test(entry.id())) {
-                        return Optional.empty();
-                    }
-
-                    var path = getCrawledFilePath(entry.path());
-
-                    if (!Files.exists(path)) {
-                        logger.warn("File not found: {}", path);
-                        return Optional.empty();
-                    }
-
-                    try {
-                        return Optional.of(CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.COMPATIBLE, path));
-                    }
-                    catch (IOException ex) {
-                        return Optional.empty();
-                    }
-                });
-    }
-}
diff --git a/code/process-models/crawling-model/java/plan/CrawlPlanLoader.java b/code/process-models/crawling-model/java/plan/CrawlPlanLoader.java
deleted file mode 100644
index cc7aae3f..00000000
--- a/code/process-models/crawling-model/java/plan/CrawlPlanLoader.java
+++ /dev/null
@@ -1,25 +0,0 @@
-package plan;
-
-import org.yaml.snakeyaml.Yaml;
-
-import java.io.FileReader;
-import java.io.IOException;
-import java.nio.file.Path;
-
-public class CrawlPlanLoader {
-    private final Yaml yaml;
-
-    public CrawlPlanLoader() {
-        yaml = new Yaml();
-    }
-
-    public CrawlPlan load(Path yamlFile) throws IOException {
-        try (var reader = new FileReader(yamlFile.toFile())) {
-            return yaml.loadAs(reader, CrawlPlan.class);
-        }
-        catch (IOException ex) {
-            throw new IOException("Failed to load crawl plan " + yamlFile, ex);
-        }
-    }
-
-}
diff --git a/code/process-models/crawling-model/test/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java b/code/process-models/crawling-model/test/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java
index 17a8ad73..a0352f29 100644
--- a/code/process-models/crawling-model/test/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java
+++ b/code/process-models/crawling-model/test/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java
@@ -31,6 +31,7 @@ class CrawledDocumentParquetRecordFileWriterTest {
 
     @Test
     void testWriteRead() throws IOException {
+        // Create a record
        var original = new CrawledDocumentParquetRecord("www.marginalia.nu",
                 "https://www.marginalia.nu/",
                 "127.0.0.1",
@@ -41,22 +42,26 @@ class CrawledDocumentParquetRecordFileWriterTest {
                 "hello world".getBytes(),
                 null, null);
 
+        // Write the record to a file
         try (var writer = new CrawledDocumentParquetRecordFileWriter(tempFile)) {
writer.write(original);
         }
 
+        // Read the file back
         var items = new ArrayList<SerializableCrawlData>();
-
         try (var stream = new ParquetSerializableCrawlDataStream(tempFile)) {
             while (stream.hasNext()) {
                 items.add(stream.next());
             }
         }
 
+        // Verify the contents, we should have a domain and a document
         assertEquals(2, items.size());
 
+        // Verify the domain
         var firstItem = items.get(0);
         assertInstanceOf(CrawledDomain.class, firstItem);
+
         var domain = (CrawledDomain) firstItem;
         assertEquals("www.marginalia.nu", domain.domain);
         assertNull(domain.redirectDomain);
@@ -65,6 +70,7 @@
         assertEquals(new ArrayList<>(), domain.doc);
         assertEquals(new ArrayList<>(), domain.cookies);
 
+        // Verify the document
         var secondItem = items.get(1);
         assertInstanceOf(CrawledDocument.class, secondItem);
 
@@ -75,5 +81,31 @@
         assertEquals(200, document.httpStatus);
     }
 
+    // This is an inspection hatch test that reads a file from the oddduck.neocities.org domain that didn't load properly;
+    // leaving it as-is in case we need to look into other files in the future
+    @Test
+    public void testOdduck() {
+        Path testPath = Path.of("/home/vlofgren/Exports/22efad51-oddduck.neocities.org.parquet");
+
+        // Skip if the file doesn't exist
+        if (!Files.exists(testPath)) {
+            return;
+        }
+
+        // Read the file
+        try (var stream = new ParquetSerializableCrawlDataStream(testPath)) {
+            while (stream.hasNext()) {
+                var item = stream.next();
+                if (item instanceof CrawledDocument doc) {
+                    System.out.println(doc.url);
+                    System.out.println(doc.contentType);
+                    System.out.println(doc.httpStatus);
+                    System.out.println(doc.documentBody.length());
+                }
+            }
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+    }
 }
\ No newline at end of file
diff --git a/code/processes/converting-process/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/java/nu/marginalia/converting/ConverterMain.java
index a570e72d..c74dd5fd 100644
--- a/code/processes/converting-process/java/nu/marginalia/converting/ConverterMain.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/ConverterMain.java
@@ -11,6 +11,10 @@ import nu.marginalia.converting.sideload.SideloadSourceFactory;
 import nu.marginalia.converting.writer.ConverterBatchWritableIf;
 import nu.marginalia.converting.writer.ConverterBatchWriter;
 import nu.marginalia.converting.writer.ConverterWriter;
+import nu.marginalia.crawling.io.CrawledDomainReader;
+import nu.marginalia.crawling.io.SerializableCrawlDataStream;
+import nu.marginalia.process.log.WorkLog;
+import nu.marginalia.process.log.WorkLogEntry;
 import nu.marginalia.service.ProcessMainClass;
 import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.mq.MessageQueueFactory;
@@ -23,11 +27,15 @@ import nu.marginalia.service.module.DatabaseModule;
 import nu.marginalia.util.SimpleBlockingThreadPool;
 import nu.marginalia.worklog.BatchingWorkLog;
 import nu.marginalia.worklog.BatchingWorkLogImpl;
-import plan.CrawlPlan;
+import org.apache.logging.log4j.util.Strings;
+import nu.marginalia.converting.model.CrawlPlan;
 import nu.marginalia.converting.processor.DomainProcessor;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+import nu.marginalia.converting.model.WorkDir;
 
+import java.io.IOException;
+import java.nio.file.Files;
 import java.nio.file.Path;
 import java.sql.SQLException;
 import java.util.Collection;
@@ -36,6 +44,7 @@ import java.util.Optional;
 import java.util.UUID;
 import java.util.concurrent.TimeUnit;
 import 
java.util.concurrent.atomic.AtomicInteger;
+import java.util.function.Function;
 
 import static nu.marginalia.mqapi.ProcessInboxNames.CONVERTER_INBOX;
 
@@ -118,7 +127,8 @@ public class ConverterMain extends ProcessMainClass {
         }
     }
 
-    public void convert(CrawlPlan plan) throws Exception {
+    public void convert(int totalDomains, WorkDir crawlDir, WorkDir processedDir) throws Exception {
+
         final int defaultPoolSize =
             Boolean.getBoolean("system.conserveMemory")
             ? Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 1, 4) // <-- conserve memory
@@ -126,12 +136,11 @@
 
         final int maxPoolSize = Integer.getInteger("converter.poolSize", defaultPoolSize);
 
-        try (BatchingWorkLog batchingWorkLog = new BatchingWorkLogImpl(plan.process.getLogFile());
-             ConverterWriter converterWriter = new ConverterWriter(batchingWorkLog, plan.process.getDir()))
+        try (BatchingWorkLog batchingWorkLog = new BatchingWorkLogImpl(processedDir.getLogFile());
+             ConverterWriter converterWriter = new ConverterWriter(batchingWorkLog, processedDir.getDir()))
         {
             var pool = new SimpleBlockingThreadPool("ConverterThread", maxPoolSize, 2);
 
-            int totalDomains = plan.countCrawledDomains();
             AtomicInteger processedDomains = new AtomicInteger(0);
             logger.info("Processing {} domains", totalDomains);
 
@@ -139,7 +148,8 @@
             processedDomains.set(batchingWorkLog.size());
             heartbeat.setProgress(processedDomains.get() / (double) totalDomains);
 
-            for (var domain : plan.crawlDataIterable(id -> !batchingWorkLog.isItemProcessed(id)))
+            for (var domain : WorkLog.iterableMap(crawlDir.getLogFile(),
+                    new CrawlDataLocator(crawlDir.getDir(), batchingWorkLog)))
             {
                 pool.submit(() -> {
                     try {
@@ -165,6 +175,52 @@
         }
     }
 
+    private static class CrawlDataLocator implements Function<WorkLogEntry, Optional<SerializableCrawlDataStream>> {
+
+        private final Path crawlRootDir;
+        private final BatchingWorkLog batchingWorkLog;
+
+        CrawlDataLocator(Path crawlRootDir, BatchingWorkLog workLog) {
+            this.crawlRootDir = crawlRootDir;
+            this.batchingWorkLog = workLog;
+        }
+
+        @Override
+        public Optional<SerializableCrawlDataStream> apply(WorkLogEntry entry) {
+            if (batchingWorkLog.isItemProcessed(entry.id())) {
+                return Optional.empty();
+            }
+
+            var path = getCrawledFilePath(crawlRootDir, entry.path());
+
+            if (!Files.exists(path)) {
+                logger.warn("File not found: {}", path);
+                return Optional.empty();
+            }
+
+            try {
+                return Optional.of(CrawledDomainReader.createDataStream(path));
+            }
+            catch (IOException ex) {
+                return Optional.empty();
+            }
+        }
+
+        private Path getCrawledFilePath(Path crawlDir, String fileName) {
+            int sp = fileName.lastIndexOf('/');
+
+            // Normalize the filename
+            if (sp >= 0 && sp + 1< fileName.length())
+                fileName = fileName.substring(sp + 1);
+            if (fileName.length() < 4)
+                fileName = Strings.repeat("0", 4 - fileName.length()) + fileName;
+
+            String sp1 = fileName.substring(0, 2);
+            String sp2 = fileName.substring(2, 4);
+            return crawlDir.resolve(sp1).resolve(sp2).resolve(fileName);
+        }
+    }
+
     private abstract static class ConvertRequest {
         private final MqMessage message;
         private final MqSingleShotInbox inbox;
@@ -196,6 +252,7 @@ public class ConverterMain extends ProcessMainClass {
             this.sideloadSources = List.of(sideloadSource);
             this.workDir = workDir;
         }
+
         SideloadAction(Collection<? extends SideloadSource> sideloadSources,
                        Path workDir,
                        MqMessage message, MqSingleShotInbox inbox) {
@@ -227,7 +284,7 @@ public class ConverterMain extends ProcessMainClass {
 
         @Override
         public void execute(ConverterMain 
converterMain) throws Exception { try { - converterMain.convert(plan); + converterMain.convert(plan.countCrawledDomains(), plan.crawl(), plan.process()); ok(); } catch (Exception ex) { @@ -256,8 +313,9 @@ public class ConverterMain extends ProcessMainClass { var processData = fileStorageService.getStorage(request.processedDataStorage); var plan = new CrawlPlan(null, - new CrawlPlan.WorkDir(crawlData.path(), "crawler.log"), - new CrawlPlan.WorkDir(processData.path(), "processor.log")); + new WorkDir(crawlData.path(), "crawler.log"), + new WorkDir(processData.path(), "processor.log") + ); yield new ConvertCrawlDataAction(plan, msg, inbox); } diff --git a/code/processes/converting-process/java/nu/marginalia/converting/model/CrawlPlan.java b/code/processes/converting-process/java/nu/marginalia/converting/model/CrawlPlan.java new file mode 100644 index 00000000..3b929039 --- /dev/null +++ b/code/processes/converting-process/java/nu/marginalia/converting/model/CrawlPlan.java @@ -0,0 +1,15 @@ +package nu.marginalia.converting.model; + +import nu.marginalia.process.log.WorkLog; + +public record CrawlPlan(String jobSpec, WorkDir crawl, WorkDir process) { + + public int countCrawledDomains() { + int count = 0; + for (var ignored : WorkLog.iterable(crawl.getLogFile())) { + count++; + } + return count; + } + +} diff --git a/code/processes/converting-process/java/nu/marginalia/converting/model/WorkDir.java b/code/processes/converting-process/java/nu/marginalia/converting/model/WorkDir.java new file mode 100644 index 00000000..2444aa2d --- /dev/null +++ b/code/processes/converting-process/java/nu/marginalia/converting/model/WorkDir.java @@ -0,0 +1,13 @@ +package nu.marginalia.converting.model; + +import java.nio.file.Path; + +public record WorkDir(String dir, String logName) { + public Path getDir() { + return Path.of(dir); + } + + public Path getLogFile() { + return Path.of(dir).resolve(logName); + } +} diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java index 4461a85a..be152d38 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java @@ -284,7 +284,7 @@ public class CrawlerMain extends ProcessMainClass { private CrawlDataReference getReference() { try { - return new CrawlDataReference(CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.ANY, outputDir, domain, id)); + return new CrawlDataReference(CrawledDomainReader.createDataStream(outputDir, domain, id)); } catch (IOException e) { logger.debug("Failed to read previous crawl data for {}", specification.domain); return new CrawlDataReference(); diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index b1abf3e1..efae36aa 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -272,6 +272,7 @@ public class CrawlerRetreiver implements AutoCloseable { } } + // Parse the document and enqueue links try { if (fetchedDoc instanceof HttpFetchResult.ResultOk ok) { var docOpt = ok.parseDocument(); diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java 
b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java
index 94494402..77dc6463 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java
@@ -143,7 +143,6 @@ public class HttpFetcherImpl implements HttpFetcher {
     public HttpFetchResult fetchContent(EdgeUrl url,
                                         WarcRecorder warcRecorder,
                                         ContentTags contentTags)
-            throws RateLimitException
     {
 
         // We don't want to waste time and resources on URLs that are not HTML, so if the file ending
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java
index a21a06df..4c091302 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java
@@ -1,5 +1,6 @@
 package nu.marginalia.crawl.retreival.revisit;
 
+import com.google.common.base.Strings;
 import crawlercommons.robots.SimpleRobotRules;
 import nu.marginalia.crawl.retreival.CrawlDataReference;
 import nu.marginalia.crawl.retreival.CrawlDelayTimer;
@@ -48,23 +49,32 @@ public class CrawlerRevisitor {
                 continue;
             var url = urlMaybe.get();
 
-            // If we've previously 404'd on this URL, we'll refrain from trying to fetch it again
+            // If we've previously 404'd on this URL, we'll refrain from trying to fetch it again,
+            // since it's likely to 404 again. It will be forgotten by the next crawl though, so
+            // we'll eventually try again.
+
             if (doc.httpStatus == 404) {
                 crawlFrontier.addVisited(url);
                 continue;
             }
 
+            // If the reference document is empty or the HTTP status is not 200, we'll skip it since it's
+            // unlikely to produce anything meaningful for us. 
if (doc.httpStatus != 200) continue; + if (Strings.isNullOrEmpty(doc.documentBody)) + continue; + + if (!crawlFrontier.filterLink(url)) + continue; + + if (!crawlFrontier.addVisited(url)) + continue; if (!robotsRules.isAllowed(url.toString())) { warcRecorder.flagAsRobotsTxtError(url); continue; } - if (!crawlFrontier.filterLink(url)) - continue; - if (!crawlFrontier.addVisited(url)) - continue; if (recrawled > 5 @@ -79,10 +89,7 @@ public class CrawlerRevisitor { crawlFrontier.addVisited(url); // Hoover up any links from the document - if (doc.httpStatus == 200 && doc.documentBody != null) { - var parsedDoc = Jsoup.parse(doc.documentBody); - crawlFrontier.enqueueLinksFromDocument(url, parsedDoc); - } + crawlFrontier.enqueueLinksFromDocument(url, Jsoup.parse(doc.documentBody)); // Add a WARC record so we don't repeat this warcRecorder.writeReferenceCopy(url, @@ -97,7 +104,8 @@ public class CrawlerRevisitor { // providing etag and last-modified headers, so we can recycle the // document if it hasn't changed without actually downloading it - var reference = new DocumentWithReference(doc, oldCrawlData); + DocumentWithReference reference = new DocumentWithReference(doc, oldCrawlData); + var result = crawlerRetreiver.fetchWriteAndSleep(url, delayTimer, reference); if (reference.isSame(result)) { diff --git a/code/processes/crawling-process/test/nu/marginalia/crawling/CrawlPlanLoaderTest.java b/code/processes/crawling-process/test/nu/marginalia/crawling/CrawlPlanLoaderTest.java deleted file mode 100644 index 086529d6..00000000 --- a/code/processes/crawling-process/test/nu/marginalia/crawling/CrawlPlanLoaderTest.java +++ /dev/null @@ -1,51 +0,0 @@ -package nu.marginalia.crawling; - -import plan.CrawlPlanLoader; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; - -import static org.junit.jupiter.api.Assertions.assertEquals; - -class CrawlPlanLoaderTest { - - Path tempFile; - - @BeforeEach - public void setUp() throws IOException { - tempFile = Files.createTempFile(getClass().getSimpleName(), ".yaml"); - } - @AfterEach - public void tearDown() throws IOException { - Files.delete(tempFile); - } - - @Test - void load() throws IOException { - Files.writeString(tempFile, """ - jobSpec: "job.spec" - crawl: - dir: "/foo" - logName: "foo.log" - process: - dir: "/bar" - logName: "bar.log" - """); - var loader = new CrawlPlanLoader(); - var ret = loader.load(tempFile); - - assertEquals(Path.of("job.spec"), ret.getJobSpec()); - - assertEquals(Path.of("/foo"), ret.crawl.getDir()); - assertEquals(Path.of("/foo/foo.log"), ret.crawl.getLogFile()); - - assertEquals(Path.of("/bar"), ret.process.getDir()); - assertEquals(Path.of("/bar/bar.log"), ret.process.getLogFile()); - - System.out.println(ret); - } -} \ No newline at end of file diff --git a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java index 811200cc..aa1f00e7 100644 --- a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java +++ b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java @@ -183,7 +183,7 @@ class CrawlerRetreiverTest { convertToParquet(tempFileWarc1, tempFileParquet1); - try (var stream = 
CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.ANY, tempFileParquet1)) { + try (var stream = CrawledDomainReader.createDataStream(tempFileParquet1)) { while (stream.hasNext()) { if (stream.next() instanceof CrawledDocument doc) { data.add(doc); @@ -236,7 +236,7 @@ class CrawlerRetreiverTest { convertToParquet(tempFileWarc1, tempFileParquet1); - try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.ANY, tempFileParquet1)) { + try (var stream = CrawledDomainReader.createDataStream(tempFileParquet1)) { while (stream.hasNext()) { if (stream.next() instanceof CrawledDocument doc) { data.add(doc); @@ -284,7 +284,7 @@ class CrawlerRetreiverTest { doCrawl(tempFileWarc1, specs); convertToParquet(tempFileWarc1, tempFileParquet1); - try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.ANY, tempFileParquet1)) { + try (var stream = CrawledDomainReader.createDataStream(tempFileParquet1)) { while (stream.hasNext()) { if (stream.next() instanceof CrawledDocument doc) { data.add(doc); @@ -331,7 +331,7 @@ class CrawlerRetreiverTest { doCrawl(tempFileWarc1, specs); convertToParquet(tempFileWarc1, tempFileParquet1); doCrawlWithReferenceStream(specs, - CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.ANY, tempFileParquet1) + CrawledDomainReader.createDataStream(tempFileParquet1) ); convertToParquet(tempFileWarc2, tempFileParquet2); @@ -352,7 +352,7 @@ class CrawlerRetreiverTest { }); } - try (var ds = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.ANY, tempFileParquet2)) { + try (var ds = CrawledDomainReader.createDataStream(tempFileParquet2)) { while (ds.hasNext()) { var doc = ds.next(); if (doc instanceof CrawledDomain dr) { @@ -395,7 +395,7 @@ class CrawlerRetreiverTest { convertToParquet(tempFileWarc1, tempFileParquet1); - try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.ANY, tempFileParquet1)) { + try (var stream = CrawledDomainReader.createDataStream(tempFileParquet1)) { while (stream.hasNext()) { var doc = stream.next(); data.computeIfAbsent(doc.getClass(), c -> new ArrayList<>()).add(doc); @@ -404,7 +404,7 @@ class CrawlerRetreiverTest { throw new RuntimeException(e); } - var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.ANY, tempFileParquet1); + var stream = CrawledDomainReader.createDataStream(tempFileParquet1); System.out.println("---"); @@ -444,7 +444,7 @@ class CrawlerRetreiverTest { }); } - try (var ds = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.ANY, tempFileParquet2)) { + try (var ds = CrawledDomainReader.createDataStream(tempFileParquet2)) { while (ds.hasNext()) { var doc = ds.next(); if (doc instanceof CrawledDomain dr) { diff --git a/code/tools/crawl-data-unfcker/build.gradle b/code/tools/crawl-data-unfcker/build.gradle deleted file mode 100644 index 755fba5e..00000000 --- a/code/tools/crawl-data-unfcker/build.gradle +++ /dev/null @@ -1,57 +0,0 @@ -plugins { - id 'java' - - id 'application' - - id 'jvm-test-suite' -} - -java { - toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) - } -} - -application { - mainClass = 'nu.marginalia.tools.CrawlDataUnfcker' - applicationName = 'crawl-data-unfcker' -} - -tasks.distZip.enabled = false - -apply from: "$rootProject.projectDir/srcsets.gradle" - -dependencies { - implementation project(':third-party:rdrpostagger') - implementation 
project(':third-party:porterstemmer') - implementation project(':third-party:monkey-patch-opennlp') - implementation project(':code:common:model') - implementation project(':code:common:config') - implementation project(':code:common:process') - implementation project(':code:common:service') - implementation project(':code:libraries:language-processing') - implementation project(':code:libraries:term-frequency-dict') - implementation project(':code:libraries:big-string') - implementation project(':code:processes:converting-process') - implementation project(':code:process-models:crawling-model') - - implementation project(':code:features-convert:adblock') - implementation project(':code:features-convert:topic-detection') - implementation project(':code:features-convert:keyword-extraction') - - implementation libs.bundles.slf4j - implementation libs.notnull - - implementation libs.guice - implementation libs.jsoup - implementation libs.trove - implementation libs.fastutil - - implementation libs.bundles.nlp - implementation libs.commons.lang3 - - testImplementation libs.bundles.slf4j.test - testImplementation libs.bundles.junit - testImplementation libs.mockito -} - diff --git a/code/tools/crawl-data-unfcker/java/nu/marginalia/tools/CrawlDataUnfcker.java b/code/tools/crawl-data-unfcker/java/nu/marginalia/tools/CrawlDataUnfcker.java deleted file mode 100644 index 0101de12..00000000 --- a/code/tools/crawl-data-unfcker/java/nu/marginalia/tools/CrawlDataUnfcker.java +++ /dev/null @@ -1,75 +0,0 @@ -package nu.marginalia.tools; - -import nu.marginalia.crawling.io.CrawlerOutputFile; -import nu.marginalia.crawling.model.CrawledDomain; -import nu.marginalia.process.log.WorkLog; -import nu.marginalia.crawling.io.CrawledDomainReader; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Optional; - -public class CrawlDataUnfcker { - public static void main(String... 
args) {
-        if (args.length != 2) {
-            System.out.println("Usage: crawl-data-unfcker input output");
-            return;
-        }
-
-        Path input = Path.of(args[0]);
-        Path output = Path.of(args[1]);
-
-        if (!Files.isDirectory(input)) {
-            System.err.println("Input directory is not valid");
-            return;
-        }
-        if (!Files.isDirectory(output)) {
-            System.err.println("Output directory is not valid");
-            return;
-        }
-
-        try (var wl = new WorkLog(output.resolve("crawler.log"))) {
-            for (var inputItem : WorkLog.iterable(input.resolve("crawler.log"))) {
-                Path inputPath = input.resolve(inputItem.relPath());
-
-                var domainMaybe = readDomain(inputPath).map(CrawledDomain::getDomain);
-                if (domainMaybe.isEmpty())
-                    continue;
-                var domain = domainMaybe.get();
-
-                // Generate conformant ID
-                String newId = Integer.toHexString(domain.hashCode());
-
-                var outputPath = CrawlerOutputFile.createLegacyOutputPath(output, newId, domain);
-                var outputFileName = outputPath.toFile().getName();
-
-                System.out.println(inputPath + " -> " + outputPath);
-                Files.move(inputPath, outputPath);
-
-                wl.setJobToFinished(domain, outputFileName, inputItem.cnt());
-            }
-        } catch (IOException e) {
-            throw new RuntimeException(e);
-        }
-    }
-
-    static Optional<CrawledDomain> readDomain(Path file) {
-        if (!Files.exists(file)) {
-            System.out.println("Missing file " + file);
-            return Optional.empty();
-        }
-
-        try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.FAST, file)) {
-            while (stream.hasNext()) {
-                if (stream.next() instanceof CrawledDomain domain) {
-                    return Optional.of(domain);
-                }
-            }
-        }
-        catch (Exception ex) {
-            ex.printStackTrace();
-        }
-        return Optional.empty();
-    }
-}
diff --git a/code/tools/crawl-data-unfcker/readme.md b/code/tools/crawl-data-unfcker/readme.md
deleted file mode 100644
index 9c870953..00000000
--- a/code/tools/crawl-data-unfcker/readme.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# Crawl Data Unfcker
-
-This is a migration tool that patches the generated ID of crawl data.
\ No newline at end of file
diff --git a/code/tools/experiment-runner/java/nu/marginalia/tools/ExperimentRunnerMain.java b/code/tools/experiment-runner/java/nu/marginalia/tools/ExperimentRunnerMain.java
index d58bf778..668a25a9 100644
--- a/code/tools/experiment-runner/java/nu/marginalia/tools/ExperimentRunnerMain.java
+++ b/code/tools/experiment-runner/java/nu/marginalia/tools/ExperimentRunnerMain.java
@@ -48,7 +48,7 @@ public class ExperimentRunnerMain {
         Path basePath = Path.of(args[0]);
         for (var item : WorkLog.iterable(basePath.resolve("crawler.log"))) {
             Path crawlDataPath = basePath.resolve(item.relPath());
-            try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.FAST, crawlDataPath)) {
+            try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) {
                 experiment.process(stream);
             }
             catch (Exception ex) {
diff --git a/settings.gradle b/settings.gradle
index 6571020c..13622d9c 100644
--- a/settings.gradle
+++ b/settings.gradle
@@ -94,7 +94,6 @@ include 'code:process-models:processed-data'
 include 'code:tools:experiment-runner'
 include 'code:tools:screenshot-capture-tool'
 include 'code:tools:load-test'
-include 'code:tools:crawl-data-unfcker'
 
 include 'third-party:porterstemmer'
 include 'third-party:symspell'