mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
Cleaned up HTML features code a bit.
This commit is contained in:
parent
2b83e0d754
commit
7dea94d36d
@@ -163,7 +163,6 @@ public class DocumentProcessor {
|
||||
var edgeDomain = url.domain;
|
||||
tagWords.add("format:"+ret.standard.toString().toLowerCase());
|
||||
|
||||
|
||||
tagWords.add("site:" + edgeDomain.toString().toLowerCase());
|
||||
if (!Objects.equals(edgeDomain.toString(), edgeDomain.domain)) {
|
||||
tagWords.add("site:" + edgeDomain.domain.toLowerCase());
|
||||
@@ -172,18 +171,7 @@ public class DocumentProcessor {
|
||||
tagWords.add("proto:"+url.proto.toLowerCase());
|
||||
tagWords.add("js:" + Boolean.toString(ret.features.contains(HtmlFeature.JS)).toLowerCase());
|
||||
|
||||
if (ret.features.contains(HtmlFeature.MEDIA)) {
|
||||
tagWords.add("special:media");
|
||||
}
|
||||
if (ret.features.contains(HtmlFeature.TRACKING)) {
|
||||
tagWords.add("special:tracking");
|
||||
}
|
||||
if (ret.features.contains(HtmlFeature.AFFILIATE_LINK)) {
|
||||
tagWords.add("special:affiliate");
|
||||
}
|
||||
if (ret.features.contains(HtmlFeature.COOKIES)) {
|
||||
tagWords.add("special:cookies");
|
||||
}
|
||||
ret.features.stream().map(HtmlFeature::getKeyword).forEach(tagWords::add);
|
||||
|
||||
words.append(IndexBlock.Meta, tagWords);
|
||||
words.append(IndexBlock.Words, tagWords);
|
||||
@@ -201,7 +189,9 @@ public class DocumentProcessor {
|
||||
for (var frame : doc.getElementsByTag("frame")) {
|
||||
linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept);
|
||||
}
|
||||
|
||||
for (var frame : doc.getElementsByTag("iframe")) {
|
||||
linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept);
|
||||
}
|
||||
for (var link : doc.select("link[rel=alternate]")) {
|
||||
feedExtractor
|
||||
.getFeedFromAlternateTag(baseUrl, link)
|
||||
|
@@ -3,17 +3,23 @@ package nu.marginalia.wmsa.edge.converting.processor.logic;
|
||||
import java.util.Collection;
|
||||
|
||||
public enum HtmlFeature {
|
||||
MEDIA(0),
|
||||
JS(1),
|
||||
AFFILIATE_LINK(2),
|
||||
TRACKING(3),
|
||||
COOKIES(4)
|
||||
MEDIA(0, "special:media"),
|
||||
JS(1, "special:scripts"),
|
||||
AFFILIATE_LINK(2, "special:affiliate"),
|
||||
TRACKING(3, "special:tracking"),
|
||||
COOKIES(4, "special:cookies")
|
||||
;
|
||||
|
||||
public final int bit;
|
||||
private final String keyword;
|
||||
|
||||
HtmlFeature(int bit) {
|
||||
HtmlFeature(int bit, String keyword) {
|
||||
this.bit = bit;
|
||||
this.keyword = keyword;
|
||||
}
|
||||
|
||||
public String getKeyword() {
|
||||
return keyword;
|
||||
}
|
||||
|
||||
public static int encode(Collection<HtmlFeature> featuresAll) {
|
||||
|
Loading…
Reference in New Issue
Block a user