mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
(summary) Fix a few cases where noscript tags would sometimes be used for document summary
This commit is contained in:
parent
50ba8fd099
commit
dc67c81f99
@@ -50,6 +50,12 @@ public class DomPruningFilter implements NodeFilter {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (node instanceof Element el) {
|
||||||
|
if (shouldAlwaysPurge(el)) {
|
||||||
|
return FilterResult.REMOVE;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
data.put(node, dataForNode);
|
data.put(node, dataForNode);
|
||||||
|
|
||||||
if (dataForNode.depth <= 1)
|
if (dataForNode.depth <= 1)
|
||||||
@@ -62,11 +68,6 @@ public class DomPruningFilter implements NodeFilter {
|
|||||||
&& dataForNode.treeSize > 3)
|
&& dataForNode.treeSize > 3)
|
||||||
return FilterResult.REMOVE;
|
return FilterResult.REMOVE;
|
||||||
|
|
||||||
if (node instanceof Element el) {
|
|
||||||
if (shouldAlwaysPurge(el)) {
|
|
||||||
return FilterResult.REMOVE;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return FilterResult.CONTINUE;
|
return FilterResult.CONTINUE;
|
||||||
}
|
}
|
||||||
@@ -98,6 +99,8 @@ public class DomPruningFilter implements NodeFilter {
|
|||||||
return true;
|
return true;
|
||||||
if ("iframe".equalsIgnoreCase(tagName))
|
if ("iframe".equalsIgnoreCase(tagName))
|
||||||
return true;
|
return true;
|
||||||
|
if ("noscript".equalsIgnoreCase(tagName))
|
||||||
|
return true;
|
||||||
if ("footer".equalsIgnoreCase(tagName))
|
if ("footer".equalsIgnoreCase(tagName))
|
||||||
return true;
|
return true;
|
||||||
if ("header".equalsIgnoreCase(tagName))
|
if ("header".equalsIgnoreCase(tagName))
|
||||||
|
@@ -135,8 +135,8 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
throw new DisqualifiedException(DisqualificationReason.IRRELEVANT);
|
throw new DisqualifiedException(DisqualificationReason.IRRELEVANT);
|
||||||
}
|
}
|
||||||
|
|
||||||
DocumentLanguageData dld =
|
var prunedDoc = specialization.prune(doc);
|
||||||
sentenceExtractorProvider.get().extractSentences(specialization.prune(doc));
|
DocumentLanguageData dld = sentenceExtractorProvider.get().extractSentences(prunedDoc);
|
||||||
|
|
||||||
checkDocumentLanguage(dld);
|
checkDocumentLanguage(dld);
|
||||||
|
|
||||||
@@ -174,7 +174,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
|
|
||||||
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, linkTexts, url);
|
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, linkTexts, url);
|
||||||
|
|
||||||
ret.description = specialization.getSummary(doc, words.importantWords);
|
ret.description = specialization.getSummary(prunedDoc, words.importantWords);
|
||||||
ret.generator = generatorParts.type();
|
ret.generator = generatorParts.type();
|
||||||
|
|
||||||
var tagWords = new MetaTagsBuilder()
|
var tagWords = new MetaTagsBuilder()
|
||||||
|
@@ -3,10 +3,10 @@ package nu.marginalia.converting.processor.plugin.specialization;
|
|||||||
import ca.rmen.porterstemmer.PorterStemmer;
|
import ca.rmen.porterstemmer.PorterStemmer;
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
|
import nu.marginalia.converting.processor.summary.SummaryExtractor;
|
||||||
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
|
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.model.idx.WordFlags;
|
import nu.marginalia.model.idx.WordFlags;
|
||||||
import nu.marginalia.converting.processor.summary.SummaryExtractor;
|
|
||||||
import org.apache.logging.log4j.util.Strings;
|
import org.apache.logging.log4j.util.Strings;
|
||||||
import org.jsoup.nodes.Document;
|
import org.jsoup.nodes.Document;
|
||||||
import org.jsoup.nodes.Element;
|
import org.jsoup.nodes.Element;
|
||||||
@@ -122,6 +122,11 @@ public class BlogSpecialization extends DefaultSpecialization {
|
|||||||
String classes = el.attr("class");
|
String classes = el.attr("class");
|
||||||
String id = el.id();
|
String id = el.id();
|
||||||
|
|
||||||
|
String tagName = el.tagName();
|
||||||
|
|
||||||
|
if (tagName.equalsIgnoreCase("noscript"))
|
||||||
|
return FilterResult.REMOVE;
|
||||||
|
|
||||||
for (String badClassElement : badClassElements) {
|
for (String badClassElement : badClassElements) {
|
||||||
if (classes.contains(badClassElement)) {
|
if (classes.contains(badClassElement)) {
|
||||||
return FilterResult.REMOVE;
|
return FilterResult.REMOVE;
|
||||||
|
@@ -20,7 +20,7 @@ public class DomFilterHeuristic implements SummaryHeuristic {
|
|||||||
|
|
||||||
var filter = new SummarizingDOMFilter();
|
var filter = new SummarizingDOMFilter();
|
||||||
|
|
||||||
doc.filter(filter);
|
doc.body().filter(filter);
|
||||||
|
|
||||||
return filter.getSummary(
|
return filter.getSummary(
|
||||||
maxSummaryLength+32,
|
maxSummaryLength+32,
|
||||||
|
Loading…
Reference in New Issue
Block a user