From 3ec9c4c5faca2ece67513705cd4c4719bbb78064 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 25 Nov 2024 15:06:42 +0100 Subject: [PATCH] (export) Filter non-HTML documents in exporters Add a check to ensure only documents with a "text/html" content type are processed in FeedExporter and AtagExporter, and make the existing check in TermFrequencyExporter case-insensitive. This prevents non-HTML documents from being parsed, which helps maintain data consistency and keeps memory usage down. --- .../java/nu/marginalia/extractor/AtagExporter.java | 2 ++ .../java/nu/marginalia/extractor/FeedExporter.java | 2 ++ .../java/nu/marginalia/extractor/TermFrequencyExporter.java | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/code/processes/export-task-process/java/nu/marginalia/extractor/AtagExporter.java b/code/processes/export-task-process/java/nu/marginalia/extractor/AtagExporter.java index cee07513..9c66b882 100644 --- a/code/processes/export-task-process/java/nu/marginalia/extractor/AtagExporter.java +++ b/code/processes/export-task-process/java/nu/marginalia/extractor/AtagExporter.java @@ -89,6 +89,8 @@ public class AtagExporter implements ExporterIf { continue; if (null == doc.documentBody) continue; + if (!doc.contentType.toLowerCase().startsWith("text/html")) + continue; var baseUrl = new EdgeUrl(doc.url); var parsed = Jsoup.parse(doc.documentBody); diff --git a/code/processes/export-task-process/java/nu/marginalia/extractor/FeedExporter.java b/code/processes/export-task-process/java/nu/marginalia/extractor/FeedExporter.java index 050b11f6..abe5c708 100644 --- a/code/processes/export-task-process/java/nu/marginalia/extractor/FeedExporter.java +++ b/code/processes/export-task-process/java/nu/marginalia/extractor/FeedExporter.java @@ -83,6 +83,8 @@ public class FeedExporter implements ExporterIf { continue; if (null == doc.documentBody) continue; + if (!doc.contentType.toLowerCase().startsWith("text/html")) + continue; var baseUrl = new EdgeUrl(doc.url); var parsed = 
Jsoup.parse(doc.documentBody); diff --git a/code/processes/export-task-process/java/nu/marginalia/extractor/TermFrequencyExporter.java b/code/processes/export-task-process/java/nu/marginalia/extractor/TermFrequencyExporter.java index d5d8184c..3255edf2 100644 --- a/code/processes/export-task-process/java/nu/marginalia/extractor/TermFrequencyExporter.java +++ b/code/processes/export-task-process/java/nu/marginalia/extractor/TermFrequencyExporter.java @@ -111,7 +111,7 @@ public class TermFrequencyExporter implements ExporterIf { if (!(stream.next() instanceof CrawledDocument doc)) continue; if (doc.documentBody == null) continue; - if (!doc.contentType.startsWith("text/html")) + if (!doc.contentType.toLowerCase().startsWith("text/html")) continue; docCount.incrementAndGet();