(summary) Fix cases where the contents of noscript tags could be used for the document summary

This commit is contained in:
Viktor Lofgren 2024-09-04 15:00:40 +02:00
parent 50ba8fd099
commit dc67c81f99
4 changed files with 18 additions and 10 deletions

View File

@@ -50,6 +50,12 @@ public class DomPruningFilter implements NodeFilter {
}
}
if (node instanceof Element el) {
if (shouldAlwaysPurge(el)) {
return FilterResult.REMOVE;
}
}
data.put(node, dataForNode);
if (dataForNode.depth <= 1)
@@ -62,11 +68,6 @@ public class DomPruningFilter implements NodeFilter {
&& dataForNode.treeSize > 3)
return FilterResult.REMOVE;
if (node instanceof Element el) {
if (shouldAlwaysPurge(el)) {
return FilterResult.REMOVE;
}
}
return FilterResult.CONTINUE;
}
@@ -98,6 +99,8 @@ public class DomPruningFilter implements NodeFilter {
return true;
if ("iframe".equalsIgnoreCase(tagName))
return true;
if ("noscript".equalsIgnoreCase(tagName))
return true;
if ("footer".equalsIgnoreCase(tagName))
return true;
if ("header".equalsIgnoreCase(tagName))

View File

@@ -135,8 +135,8 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
throw new DisqualifiedException(DisqualificationReason.IRRELEVANT);
}
DocumentLanguageData dld =
sentenceExtractorProvider.get().extractSentences(specialization.prune(doc));
var prunedDoc = specialization.prune(doc);
DocumentLanguageData dld = sentenceExtractorProvider.get().extractSentences(prunedDoc);
checkDocumentLanguage(dld);
@@ -174,7 +174,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, linkTexts, url);
ret.description = specialization.getSummary(doc, words.importantWords);
ret.description = specialization.getSummary(prunedDoc, words.importantWords);
ret.generator = generatorParts.type();
var tagWords = new MetaTagsBuilder()

View File

@@ -3,10 +3,10 @@ package nu.marginalia.converting.processor.plugin.specialization;
import ca.rmen.porterstemmer.PorterStemmer;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
import org.apache.logging.log4j.util.Strings;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
@@ -122,6 +122,11 @@ public class BlogSpecialization extends DefaultSpecialization {
String classes = el.attr("class");
String id = el.id();
String tagName = el.tagName();
if (tagName.equalsIgnoreCase("noscript"))
return FilterResult.REMOVE;
for (String badClassElement : badClassElements) {
if (classes.contains(badClassElement)) {
return FilterResult.REMOVE;

View File

@@ -20,7 +20,7 @@ public class DomFilterHeuristic implements SummaryHeuristic {
var filter = new SummarizingDOMFilter();
doc.filter(filter);
doc.body().filter(filter);
return filter.getSummary(
maxSummaryLength+32,