Cleaned up HTML features code a bit.

This commit is contained in:
vlofgren 2022-07-08 17:25:16 +02:00
parent 2b83e0d754
commit 7dea94d36d
2 changed files with 16 additions and 20 deletions

View File

@@ -163,7 +163,6 @@ public class DocumentProcessor {
var edgeDomain = url.domain;
tagWords.add("format:"+ret.standard.toString().toLowerCase());
tagWords.add("site:" + edgeDomain.toString().toLowerCase());
if (!Objects.equals(edgeDomain.toString(), edgeDomain.domain)) {
tagWords.add("site:" + edgeDomain.domain.toLowerCase());
@@ -172,18 +171,7 @@ public class DocumentProcessor {
tagWords.add("proto:"+url.proto.toLowerCase());
tagWords.add("js:" + Boolean.toString(ret.features.contains(HtmlFeature.JS)).toLowerCase());
if (ret.features.contains(HtmlFeature.MEDIA)) {
tagWords.add("special:media");
}
if (ret.features.contains(HtmlFeature.TRACKING)) {
tagWords.add("special:tracking");
}
if (ret.features.contains(HtmlFeature.AFFILIATE_LINK)) {
tagWords.add("special:affiliate");
}
if (ret.features.contains(HtmlFeature.COOKIES)) {
tagWords.add("special:cookies");
}
ret.features.stream().map(HtmlFeature::getKeyword).forEach(tagWords::add);
words.append(IndexBlock.Meta, tagWords);
words.append(IndexBlock.Words, tagWords);
@@ -201,7 +189,9 @@ public class DocumentProcessor {
for (var frame : doc.getElementsByTag("frame")) {
linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept);
}
for (var frame : doc.getElementsByTag("iframe")) {
linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept);
}
for (var link : doc.select("link[rel=alternate]")) {
feedExtractor
.getFeedFromAlternateTag(baseUrl, link)

View File

@@ -3,17 +3,23 @@ package nu.marginalia.wmsa.edge.converting.processor.logic;
import java.util.Collection;
public enum HtmlFeature {
MEDIA(0),
JS(1),
AFFILIATE_LINK(2),
TRACKING(3),
COOKIES(4)
MEDIA(0, "special:media"),
JS(1, "special:scripts"),
AFFILIATE_LINK(2, "special:affiliate"),
TRACKING(3, "special:tracking"),
COOKIES(4, "special:cookies")
;
public final int bit;
private final String keyword;
HtmlFeature(int bit) {
HtmlFeature(int bit, String keyword) {
this.bit = bit;
this.keyword = keyword;
}
public String getKeyword() {
return keyword;
}
public static int encode(Collection<HtmlFeature> featuresAll) {