diff --git a/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java b/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java index 8c6e92d2..f3c6227d 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java @@ -65,8 +65,7 @@ public class SideloadSourceFactory { public SideloadSource sideloadReddit(Path pathToDbFiles) throws IOException { return sideload(pathToDbFiles, new PathSuffixPredicate(".db"), - (List<Path> paths) -> new RedditSideloader(paths, - anchorTagsSourceFactory, anchorTextKeywords, sideloaderProcessing)); + (List<Path> paths) -> new RedditSideloader(paths, anchorTextKeywords, sideloaderProcessing)); } public Collection<? extends SideloadSource> sideloadStackexchange(Path pathToDbFileRoot) throws IOException { diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlDataReference.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlDataReference.java index b0b2c014..98133bcf 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlDataReference.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlDataReference.java @@ -1,5 +1,6 @@ package nu.marginalia.crawl.retreival; +import nu.marginalia.ContentTypes; import nu.marginalia.io.SerializableCrawlDataStream; import nu.marginalia.lsh.EasyLSH; import nu.marginalia.model.crawldata.CrawledDocument; @@ -43,6 +44,9 @@ public class CrawlDataReference implements AutoCloseable { try { while (data.hasNext()) { if (data.next() instanceof CrawledDocument doc) { + if (!ContentTypes.isAccepted(doc.contentType)) + continue; + return doc; } } diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java 
b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index c6b426b3..ace2059b 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -317,26 +317,24 @@ public class CrawlerRetreiver implements AutoCloseable { long probeStart = System.currentTimeMillis(); - /* - probing is on probation for now while we evaluate how much the added delays slows down the crawler - if (probeType == HttpFetcher.ProbeType.FULL) { + retryLoop: for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) { try { var probeResult = fetcher.probeContentType(url, warcRecorder, contentTags); - if (probeResult instanceof HttpFetcher.ContentTypeProbeResult.Ok ok) { - url = ok.resolvedUrl(); // If we were redirected while probing, use the final URL for fetching - break; - } else if (probeResult instanceof HttpFetcher.ContentTypeProbeResult.BadContentType badContentType) { - return new HttpFetchResult.ResultNone(); - } else if (probeResult instanceof HttpFetcher.ContentTypeProbeResult.BadContentType.Timeout timeout) { - return new HttpFetchResult.ResultException(timeout.ex()); - } else if (probeResult instanceof HttpFetcher.ContentTypeProbeResult.Exception exception) { - return new HttpFetchResult.ResultException(exception.ex()); - } - else { // should be unreachable - throw new IllegalStateException("Unknown probe result"); + switch (probeResult) { + case HttpFetcher.ContentTypeProbeResult.Ok(EdgeUrl resolvedUrl): + url = resolvedUrl; // If we were redirected while probing, use the final URL for fetching + break retryLoop; + case HttpFetcher.ContentTypeProbeResult.BadContentType badContentType: + return new HttpFetchResult.ResultNone(); + case HttpFetcher.ContentTypeProbeResult.BadContentType.Timeout timeout: + return new HttpFetchResult.ResultException(timeout.ex()); + case HttpFetcher.ContentTypeProbeResult.Exception exception: + 
return new HttpFetchResult.ResultException(exception.ex()); + default: // should be unreachable + throw new IllegalStateException("Unknown probe result"); } } catch (HttpFetcherImpl.RateLimitException ex) { @@ -348,8 +346,8 @@ public class CrawlerRetreiver implements AutoCloseable { } } - timer.waitFetchDelay(System.currentTimeMillis() - probeStart); - }*/ + timer.waitFetchDelay(System.currentTimeMillis() - probeStart); + } for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) { diff --git a/code/processes/crawling-process/model/java/nu/marginalia/ContentTypes.java b/code/processes/crawling-process/model/java/nu/marginalia/ContentTypes.java new file mode 100644 index 00000000..dbc1989c --- /dev/null +++ b/code/processes/crawling-process/model/java/nu/marginalia/ContentTypes.java @@ -0,0 +1,22 @@ +package nu.marginalia; + +import java.util.Set; + +public class ContentTypes { + public static final Set<String> acceptedContentTypes = Set.of("application/xhtml+xml", + "application/xhtml", + "text/html", + "image/x-icon", + "text/plain"); + + public static boolean isAccepted(String contentTypeHeader) { + String lcHeader = contentTypeHeader.toLowerCase(); + for (var type : acceptedContentTypes) { + if (lcHeader.startsWith(type)) { + return true; + } + } + return false; + } + +} diff --git a/code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecordFileWriter.java b/code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecordFileWriter.java index f231c703..9474c2ff 100644 --- a/code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecordFileWriter.java +++ b/code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecordFileWriter.java @@ -1,6 +1,7 @@ package nu.marginalia.parquet.crawldata; import blue.strategic.parquet.ParquetWriter; +import nu.marginalia.ContentTypes; import nu.marginalia.UserAgent; import 
nu.marginalia.model.body.DocumentBodyExtractor; import nu.marginalia.model.body.DocumentBodyResult; @@ -62,6 +63,8 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { } } + + /** Return true if the WarcResponse should be excluded from conversion */ private static boolean filterResponse(String uaString, WarcResponse response) throws IOException { @@ -74,14 +77,25 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { return false; } - var robotsTags = response.http().headers().all("X-Robots-Tag"); + var headers = response.http().headers(); + var robotsTags = headers.all("X-Robots-Tag"); + if (!isXRobotsTagsPermitted(robotsTags, uaString)) { return false; } + // Strip out responses with content types we aren't interested in + // (though ideally we wouldn't download these at all) + String contentType = headers.first("Content-Type").orElse("text/plain").toLowerCase(); + + if (!ContentTypes.isAccepted(contentType)) { + return false; + } + return true; } + private void write(String domain, WarcXEntityRefused refused) throws IOException { URI profile = refused.profile(); diff --git a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java b/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java index d6d407bf..b2a0f2bc 100644 --- a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java +++ b/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java @@ -157,10 +157,10 @@ class WarcRecorderTest { fileNameParquet); var urls = CrawledDocumentParquetRecordFileReader.stream(fileNameParquet).map(doc -> doc.url).toList(); - assertEquals(3, urls.size()); + assertEquals(2, urls.size()); assertEquals("https://www.marginalia.nu/", urls.get(0)); assertEquals("https://www.marginalia.nu/log/", urls.get(1)); - assertEquals("https://www.marginalia.nu/sanic.png", 
urls.get(2)); + // sanic.png gets filtered out for its bad mime type }