mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
(export) Filter non-HTML documents in exporters
Add a check to ensure that only documents with a "text/html" content type are processed in FeedExporter and AtagExporter, and make the existing check in TermFrequencyExporter case-insensitive. This prevents non-HTML documents from being parsed, which helps maintain data consistency and keeps memory usage down.
This commit is contained in:
parent
0b6b5dab07
commit
3ec9c4c5fa
@ -89,6 +89,8 @@ public class AtagExporter implements ExporterIf {
|
|||||||
continue;
|
continue;
|
||||||
if (null == doc.documentBody)
|
if (null == doc.documentBody)
|
||||||
continue;
|
continue;
|
||||||
|
if (!doc.contentType.toLowerCase().startsWith("text/html"))
|
||||||
|
continue;
|
||||||
|
|
||||||
var baseUrl = new EdgeUrl(doc.url);
|
var baseUrl = new EdgeUrl(doc.url);
|
||||||
var parsed = Jsoup.parse(doc.documentBody);
|
var parsed = Jsoup.parse(doc.documentBody);
|
||||||
|
@ -83,6 +83,8 @@ public class FeedExporter implements ExporterIf {
|
|||||||
continue;
|
continue;
|
||||||
if (null == doc.documentBody)
|
if (null == doc.documentBody)
|
||||||
continue;
|
continue;
|
||||||
|
if (!doc.contentType.toLowerCase().startsWith("text/html"))
|
||||||
|
continue;
|
||||||
|
|
||||||
var baseUrl = new EdgeUrl(doc.url);
|
var baseUrl = new EdgeUrl(doc.url);
|
||||||
var parsed = Jsoup.parse(doc.documentBody);
|
var parsed = Jsoup.parse(doc.documentBody);
|
||||||
|
@ -111,7 +111,7 @@ public class TermFrequencyExporter implements ExporterIf {
|
|||||||
|
|
||||||
if (!(stream.next() instanceof CrawledDocument doc)) continue;
|
if (!(stream.next() instanceof CrawledDocument doc)) continue;
|
||||||
if (doc.documentBody == null) continue;
|
if (doc.documentBody == null) continue;
|
||||||
if (!doc.contentType.startsWith("text/html"))
|
if (!doc.contentType.toLowerCase().startsWith("text/html"))
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
docCount.incrementAndGet();
|
docCount.incrementAndGet();
|
||||||
|
Loading…
Reference in New Issue
Block a user