diff --git a/code/processes/export-task-process/java/nu/marginalia/extractor/AtagExporter.java b/code/processes/export-task-process/java/nu/marginalia/extractor/AtagExporter.java index cee07513..9c66b882 100644 --- a/code/processes/export-task-process/java/nu/marginalia/extractor/AtagExporter.java +++ b/code/processes/export-task-process/java/nu/marginalia/extractor/AtagExporter.java @@ -89,6 +89,8 @@ public class AtagExporter implements ExporterIf { continue; if (null == doc.documentBody) continue; + if (!doc.contentType.toLowerCase().startsWith("text/html")) + continue; var baseUrl = new EdgeUrl(doc.url); var parsed = Jsoup.parse(doc.documentBody); diff --git a/code/processes/export-task-process/java/nu/marginalia/extractor/FeedExporter.java b/code/processes/export-task-process/java/nu/marginalia/extractor/FeedExporter.java index 050b11f6..abe5c708 100644 --- a/code/processes/export-task-process/java/nu/marginalia/extractor/FeedExporter.java +++ b/code/processes/export-task-process/java/nu/marginalia/extractor/FeedExporter.java @@ -83,6 +83,8 @@ public class FeedExporter implements ExporterIf { continue; if (null == doc.documentBody) continue; + if (!doc.contentType.toLowerCase().startsWith("text/html")) + continue; var baseUrl = new EdgeUrl(doc.url); var parsed = Jsoup.parse(doc.documentBody); diff --git a/code/processes/export-task-process/java/nu/marginalia/extractor/TermFrequencyExporter.java b/code/processes/export-task-process/java/nu/marginalia/extractor/TermFrequencyExporter.java index d5d8184c..3255edf2 100644 --- a/code/processes/export-task-process/java/nu/marginalia/extractor/TermFrequencyExporter.java +++ b/code/processes/export-task-process/java/nu/marginalia/extractor/TermFrequencyExporter.java @@ -111,7 +111,7 @@ public class TermFrequencyExporter implements ExporterIf { if (!(stream.next() instanceof CrawledDocument doc)) continue; if (doc.documentBody == null) continue; - if (!doc.contentType.startsWith("text/html")) + if (!doc.contentType.toLowerCase().startsWith("text/html")) continue; docCount.incrementAndGet();