(summary) Fix a few cases where the contents of noscript tags would be used for the document summary

This commit is contained in:
Viktor Lofgren 2024-09-04 15:00:40 +02:00
parent 50ba8fd099
commit dc67c81f99
4 changed files with 18 additions and 10 deletions

View File

@ -50,6 +50,12 @@ public class DomPruningFilter implements NodeFilter {
} }
} }
if (node instanceof Element el) {
if (shouldAlwaysPurge(el)) {
return FilterResult.REMOVE;
}
}
data.put(node, dataForNode); data.put(node, dataForNode);
if (dataForNode.depth <= 1) if (dataForNode.depth <= 1)
@ -62,11 +68,6 @@ public class DomPruningFilter implements NodeFilter {
&& dataForNode.treeSize > 3) && dataForNode.treeSize > 3)
return FilterResult.REMOVE; return FilterResult.REMOVE;
if (node instanceof Element el) {
if (shouldAlwaysPurge(el)) {
return FilterResult.REMOVE;
}
}
return FilterResult.CONTINUE; return FilterResult.CONTINUE;
} }
@ -98,6 +99,8 @@ public class DomPruningFilter implements NodeFilter {
return true; return true;
if ("iframe".equalsIgnoreCase(tagName)) if ("iframe".equalsIgnoreCase(tagName))
return true; return true;
if ("noscript".equalsIgnoreCase(tagName))
return true;
if ("footer".equalsIgnoreCase(tagName)) if ("footer".equalsIgnoreCase(tagName))
return true; return true;
if ("header".equalsIgnoreCase(tagName)) if ("header".equalsIgnoreCase(tagName))

View File

@ -135,8 +135,8 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
throw new DisqualifiedException(DisqualificationReason.IRRELEVANT); throw new DisqualifiedException(DisqualificationReason.IRRELEVANT);
} }
DocumentLanguageData dld = var prunedDoc = specialization.prune(doc);
sentenceExtractorProvider.get().extractSentences(specialization.prune(doc)); DocumentLanguageData dld = sentenceExtractorProvider.get().extractSentences(prunedDoc);
checkDocumentLanguage(dld); checkDocumentLanguage(dld);
@ -174,7 +174,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, linkTexts, url); DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, linkTexts, url);
ret.description = specialization.getSummary(doc, words.importantWords); ret.description = specialization.getSummary(prunedDoc, words.importantWords);
ret.generator = generatorParts.type(); ret.generator = generatorParts.type();
var tagWords = new MetaTagsBuilder() var tagWords = new MetaTagsBuilder()

View File

@ -3,10 +3,10 @@ package nu.marginalia.converting.processor.plugin.specialization;
import ca.rmen.porterstemmer.PorterStemmer; import ca.rmen.porterstemmer.PorterStemmer;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.Singleton; import com.google.inject.Singleton;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder; import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.idx.WordFlags; import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
import org.apache.logging.log4j.util.Strings; import org.apache.logging.log4j.util.Strings;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
@ -122,6 +122,11 @@ public class BlogSpecialization extends DefaultSpecialization {
String classes = el.attr("class"); String classes = el.attr("class");
String id = el.id(); String id = el.id();
String tagName = el.tagName();
if (tagName.equalsIgnoreCase("noscript"))
return FilterResult.REMOVE;
for (String badClassElement : badClassElements) { for (String badClassElement : badClassElements) {
if (classes.contains(badClassElement)) { if (classes.contains(badClassElement)) {
return FilterResult.REMOVE; return FilterResult.REMOVE;

View File

@ -20,7 +20,7 @@ public class DomFilterHeuristic implements SummaryHeuristic {
var filter = new SummarizingDOMFilter(); var filter = new SummarizingDOMFilter();
doc.filter(filter); doc.body().filter(filter);
return filter.getSummary( return filter.getSummary(
maxSummaryLength+32, maxSummaryLength+32,