(export) Filter non-HTML documents in exporters

Add a check to ensure that only documents with a "text/html" content type are processed in FeedExporter and AtagExporter, and make the existing check in TermFrequencyExporter case-insensitive. This prevents non-HTML documents from being parsed, which helps maintain data consistency and keeps memory usage down.
This commit is contained in:
Viktor Lofgren 2024-11-25 15:06:42 +01:00
parent 0b6b5dab07
commit 3ec9c4c5fa
3 changed files with 5 additions and 1 deletions

View File

@ -89,6 +89,8 @@ public class AtagExporter implements ExporterIf {
continue; continue;
if (null == doc.documentBody) if (null == doc.documentBody)
continue; continue;
if (!doc.contentType.toLowerCase().startsWith("text/html"))
continue;
var baseUrl = new EdgeUrl(doc.url); var baseUrl = new EdgeUrl(doc.url);
var parsed = Jsoup.parse(doc.documentBody); var parsed = Jsoup.parse(doc.documentBody);

View File

@ -83,6 +83,8 @@ public class FeedExporter implements ExporterIf {
continue; continue;
if (null == doc.documentBody) if (null == doc.documentBody)
continue; continue;
if (!doc.contentType.toLowerCase().startsWith("text/html"))
continue;
var baseUrl = new EdgeUrl(doc.url); var baseUrl = new EdgeUrl(doc.url);
var parsed = Jsoup.parse(doc.documentBody); var parsed = Jsoup.parse(doc.documentBody);

View File

@ -111,7 +111,7 @@ public class TermFrequencyExporter implements ExporterIf {
if (!(stream.next() instanceof CrawledDocument doc)) continue; if (!(stream.next() instanceof CrawledDocument doc)) continue;
if (doc.documentBody == null) continue; if (doc.documentBody == null) continue;
if (!doc.contentType.startsWith("text/html")) if (!doc.contentType.toLowerCase().startsWith("text/html"))
continue; continue;
docCount.incrementAndGet(); docCount.incrementAndGet();