mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(summary) Fix a few cases where the contents of noscript tags would sometimes be used for the document summary
This commit is contained in:
parent
50ba8fd099
commit
dc67c81f99
@ -50,6 +50,12 @@ public class DomPruningFilter implements NodeFilter {
|
||||
}
|
||||
}
|
||||
|
||||
if (node instanceof Element el) {
|
||||
if (shouldAlwaysPurge(el)) {
|
||||
return FilterResult.REMOVE;
|
||||
}
|
||||
}
|
||||
|
||||
data.put(node, dataForNode);
|
||||
|
||||
if (dataForNode.depth <= 1)
|
||||
@ -62,11 +68,6 @@ public class DomPruningFilter implements NodeFilter {
|
||||
&& dataForNode.treeSize > 3)
|
||||
return FilterResult.REMOVE;
|
||||
|
||||
if (node instanceof Element el) {
|
||||
if (shouldAlwaysPurge(el)) {
|
||||
return FilterResult.REMOVE;
|
||||
}
|
||||
}
|
||||
|
||||
return FilterResult.CONTINUE;
|
||||
}
|
||||
@ -98,6 +99,8 @@ public class DomPruningFilter implements NodeFilter {
|
||||
return true;
|
||||
if ("iframe".equalsIgnoreCase(tagName))
|
||||
return true;
|
||||
if ("noscript".equalsIgnoreCase(tagName))
|
||||
return true;
|
||||
if ("footer".equalsIgnoreCase(tagName))
|
||||
return true;
|
||||
if ("header".equalsIgnoreCase(tagName))
|
||||
|
@ -135,8 +135,8 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
throw new DisqualifiedException(DisqualificationReason.IRRELEVANT);
|
||||
}
|
||||
|
||||
DocumentLanguageData dld =
|
||||
sentenceExtractorProvider.get().extractSentences(specialization.prune(doc));
|
||||
var prunedDoc = specialization.prune(doc);
|
||||
DocumentLanguageData dld = sentenceExtractorProvider.get().extractSentences(prunedDoc);
|
||||
|
||||
checkDocumentLanguage(dld);
|
||||
|
||||
@ -174,7 +174,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
|
||||
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, linkTexts, url);
|
||||
|
||||
ret.description = specialization.getSummary(doc, words.importantWords);
|
||||
ret.description = specialization.getSummary(prunedDoc, words.importantWords);
|
||||
ret.generator = generatorParts.type();
|
||||
|
||||
var tagWords = new MetaTagsBuilder()
|
||||
|
@ -3,10 +3,10 @@ package nu.marginalia.converting.processor.plugin.specialization;
|
||||
import ca.rmen.porterstemmer.PorterStemmer;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.converting.processor.summary.SummaryExtractor;
|
||||
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.idx.WordFlags;
|
||||
import nu.marginalia.converting.processor.summary.SummaryExtractor;
|
||||
import org.apache.logging.log4j.util.Strings;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
@ -122,6 +122,11 @@ public class BlogSpecialization extends DefaultSpecialization {
|
||||
String classes = el.attr("class");
|
||||
String id = el.id();
|
||||
|
||||
String tagName = el.tagName();
|
||||
|
||||
if (tagName.equalsIgnoreCase("noscript"))
|
||||
return FilterResult.REMOVE;
|
||||
|
||||
for (String badClassElement : badClassElements) {
|
||||
if (classes.contains(badClassElement)) {
|
||||
return FilterResult.REMOVE;
|
||||
|
@ -20,7 +20,7 @@ public class DomFilterHeuristic implements SummaryHeuristic {
|
||||
|
||||
var filter = new SummarizingDOMFilter();
|
||||
|
||||
doc.filter(filter);
|
||||
doc.body().filter(filter);
|
||||
|
||||
return filter.getSummary(
|
||||
maxSummaryLength+32,
|
||||
|
Loading…
Reference in New Issue
Block a user