mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
Make use of DocumentFlags' flags
This commit is contained in:
parent
1bb1248ab0
commit
ca22c287a5
@ -3,10 +3,10 @@ package nu.marginalia.model.idx;
|
||||
import java.util.EnumSet;
|
||||
|
||||
public enum DocumentFlags {
|
||||
UnusedBit1,
|
||||
Javascript,
|
||||
PlainText,
|
||||
UnusedBit2,
|
||||
UnusedBit3,
|
||||
Ads,
|
||||
Tracking,
|
||||
UnusedBit4,
|
||||
UnusedBit5,
|
||||
UnusedBit6,
|
||||
|
@ -5,6 +5,7 @@ import com.google.inject.name.Named;
|
||||
import nu.marginalia.converting.processor.MetaRobotsTag;
|
||||
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
|
||||
import nu.marginalia.converting.processor.logic.links.LinkProcessor;
|
||||
import nu.marginalia.model.crawl.HtmlFeature;
|
||||
import nu.marginalia.summary.SummaryExtractor;
|
||||
import nu.marginalia.link_parser.LinkParser;
|
||||
import nu.marginalia.crawling.model.CrawledDocument;
|
||||
@ -125,7 +126,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
ret.hashCode = dld.localitySensitiveHashCode();
|
||||
|
||||
PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, ret.standard, true);
|
||||
ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.noneOf(DocumentFlags.class));
|
||||
EnumSet<DocumentFlags> documentFlags = htmlFeatures2DocumentFlags(ret.features);
|
||||
|
||||
ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, documentFlags);
|
||||
|
||||
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url);
|
||||
|
||||
@ -148,6 +151,22 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
return new DetailsWithWords(ret, words);
|
||||
}
|
||||
|
||||
private EnumSet<DocumentFlags> htmlFeatures2DocumentFlags(Set<HtmlFeature> features) {
|
||||
EnumSet<DocumentFlags> flags = EnumSet.noneOf(DocumentFlags.class);
|
||||
|
||||
if (features.contains(HtmlFeature.ADVERTISEMENT)) {
|
||||
flags.add(DocumentFlags.Ads);
|
||||
}
|
||||
if (features.contains(HtmlFeature.JS)) {
|
||||
flags.add(DocumentFlags.Javascript);
|
||||
}
|
||||
if (features.contains(HtmlFeature.TRACKING)) {
|
||||
flags.add(DocumentFlags.Tracking);
|
||||
}
|
||||
|
||||
return flags;
|
||||
}
|
||||
|
||||
private Document prune(Document doc) {
|
||||
final var prunedDoc = doc.clone();
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user