diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/dom/DomPruningFilter.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/dom/DomPruningFilter.java index 51264400..68819ecf 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/dom/DomPruningFilter.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/dom/DomPruningFilter.java @@ -50,6 +50,12 @@ public class DomPruningFilter implements NodeFilter { } } + if (node instanceof Element el) { + if (shouldAlwaysPurge(el)) { + return FilterResult.REMOVE; + } + } + data.put(node, dataForNode); if (dataForNode.depth <= 1) @@ -62,11 +68,6 @@ public class DomPruningFilter implements NodeFilter { && dataForNode.treeSize > 3) return FilterResult.REMOVE; - if (node instanceof Element el) { - if (shouldAlwaysPurge(el)) { - return FilterResult.REMOVE; - } - } return FilterResult.CONTINUE; } @@ -98,6 +99,8 @@ public class DomPruningFilter implements NodeFilter { return true; if ("iframe".equalsIgnoreCase(tagName)) return true; + if ("noscript".equalsIgnoreCase(tagName)) + return true; if ("footer".equalsIgnoreCase(tagName)) return true; if ("header".equalsIgnoreCase(tagName)) diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java index 101462ef..ccb8a383 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java @@ -135,8 +135,8 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin throw new DisqualifiedException(DisqualificationReason.IRRELEVANT); } - DocumentLanguageData dld = - sentenceExtractorProvider.get().extractSentences(specialization.prune(doc)); + var prunedDoc = specialization.prune(doc); + DocumentLanguageData dld = sentenceExtractorProvider.get().extractSentences(prunedDoc); checkDocumentLanguage(dld); @@ -174,7 +174,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, linkTexts, url); - ret.description = specialization.getSummary(doc, words.importantWords); + ret.description = specialization.getSummary(prunedDoc, words.importantWords); ret.generator = generatorParts.type(); var tagWords = new MetaTagsBuilder() diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/BlogSpecialization.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/BlogSpecialization.java index feeb2126..9a699a68 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/BlogSpecialization.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/BlogSpecialization.java @@ -3,10 +3,10 @@ package nu.marginalia.converting.processor.plugin.specialization; import ca.rmen.porterstemmer.PorterStemmer; import com.google.inject.Inject; import com.google.inject.Singleton; +import nu.marginalia.converting.processor.summary.SummaryExtractor; import nu.marginalia.keyword.model.DocumentKeywordsBuilder; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.idx.WordFlags; -import nu.marginalia.converting.processor.summary.SummaryExtractor; import org.apache.logging.log4j.util.Strings; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; @@ -122,6 +122,11 @@ public class BlogSpecialization extends DefaultSpecialization { String classes = el.attr("class"); String id = el.id(); + String tagName = el.tagName(); + + if (tagName.equalsIgnoreCase("noscript")) + return FilterResult.REMOVE; + for (String badClassElement : badClassElements) { if (classes.contains(badClassElement)) { return FilterResult.REMOVE; diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/summary/heuristic/DomFilterHeuristic.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/summary/heuristic/DomFilterHeuristic.java index 7a1c2be3..30d9ccc9 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/summary/heuristic/DomFilterHeuristic.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/summary/heuristic/DomFilterHeuristic.java @@ -20,7 +20,7 @@ public class DomFilterHeuristic implements SummaryHeuristic { var filter = new SummarizingDOMFilter(); - doc.filter(filter); + doc.body().filter(filter); return filter.getSummary( maxSummaryLength+32,