Patch summary (free text ahead of the first "diff --git" header).

1. DocumentFlags: three placeholder constants are renamed in place, keeping
   their positions in the enum:
     UnusedBit1 -> Javascript, UnusedBit2 -> Ads, UnusedBit3 -> Tracking.
2. HtmlDocumentProcessorPlugin: DocumentMetadata is no longer constructed with
   an always-empty EnumSet.noneOf(DocumentFlags.class); it now receives the
   result of the new private helper htmlFeatures2DocumentFlags(ret.features),
   which maps
     HtmlFeature.ADVERTISEMENT -> DocumentFlags.Ads
     HtmlFeature.JS            -> DocumentFlags.Javascript
     HtmlFeature.TRACKING      -> DocumentFlags.Tracking
   and leaves every other flag unset.

NOTE(review): the added lines declare EnumSet and Set without type arguments.
  This is either a raw-type use or angle-bracket text lost when this patch was
  extracted -- confirm against the repository before applying.
NOTE(review): this patch arrived with its line breaks collapsed. Indentation
  and the placement of blank context lines below were reconstructed to match
  the hunk line counts -- verify before applying.

diff --git a/code/common/model/src/main/java/nu/marginalia/model/idx/DocumentFlags.java b/code/common/model/src/main/java/nu/marginalia/model/idx/DocumentFlags.java
index 099d2da8..c0d097df 100644
--- a/code/common/model/src/main/java/nu/marginalia/model/idx/DocumentFlags.java
+++ b/code/common/model/src/main/java/nu/marginalia/model/idx/DocumentFlags.java
@@ -3,10 +3,10 @@ package nu.marginalia.model.idx;
 import java.util.EnumSet;
 
 public enum DocumentFlags {
-    UnusedBit1,
+    Javascript,
     PlainText,
-    UnusedBit2,
-    UnusedBit3,
+    Ads,
+    Tracking,
     UnusedBit4,
     UnusedBit5,
     UnusedBit6,
diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java
index 26379cf6..d080b535 100644
--- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java
+++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java
@@ -5,6 +5,7 @@ import com.google.inject.name.Named;
 import nu.marginalia.converting.processor.MetaRobotsTag;
 import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
 import nu.marginalia.converting.processor.logic.links.LinkProcessor;
+import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.summary.SummaryExtractor;
 import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.crawling.model.CrawledDocument;
@@ -125,7 +126,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         ret.hashCode = dld.localitySensitiveHashCode();
 
         PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, ret.standard, true);
-        ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.noneOf(DocumentFlags.class));
+        EnumSet documentFlags = htmlFeatures2DocumentFlags(ret.features);
+
+        ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, documentFlags);
 
         DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url);
 
@@ -148,6 +151,22 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         return new DetailsWithWords(ret, words);
     }
 
+    private EnumSet htmlFeatures2DocumentFlags(Set features) {
+        EnumSet flags = EnumSet.noneOf(DocumentFlags.class);
+
+        if (features.contains(HtmlFeature.ADVERTISEMENT)) {
+            flags.add(DocumentFlags.Ads);
+        }
+        if (features.contains(HtmlFeature.JS)) {
+            flags.add(DocumentFlags.Javascript);
+        }
+        if (features.contains(HtmlFeature.TRACKING)) {
+            flags.add(DocumentFlags.Tracking);
+        }
+
+        return flags;
+    }
+
     private Document prune(Document doc) {
         final var prunedDoc = doc.clone();
 