Cleaned up HTML features code a bit.

This commit is contained in:
vlofgren 2022-07-08 17:25:16 +02:00
parent 2b83e0d754
commit 7dea94d36d
2 changed files with 16 additions and 20 deletions

View File

@@ -163,7 +163,6 @@ public class DocumentProcessor {
var edgeDomain = url.domain; var edgeDomain = url.domain;
tagWords.add("format:"+ret.standard.toString().toLowerCase()); tagWords.add("format:"+ret.standard.toString().toLowerCase());
tagWords.add("site:" + edgeDomain.toString().toLowerCase()); tagWords.add("site:" + edgeDomain.toString().toLowerCase());
if (!Objects.equals(edgeDomain.toString(), edgeDomain.domain)) { if (!Objects.equals(edgeDomain.toString(), edgeDomain.domain)) {
tagWords.add("site:" + edgeDomain.domain.toLowerCase()); tagWords.add("site:" + edgeDomain.domain.toLowerCase());
@@ -172,18 +171,7 @@ public class DocumentProcessor {
tagWords.add("proto:"+url.proto.toLowerCase()); tagWords.add("proto:"+url.proto.toLowerCase());
tagWords.add("js:" + Boolean.toString(ret.features.contains(HtmlFeature.JS)).toLowerCase()); tagWords.add("js:" + Boolean.toString(ret.features.contains(HtmlFeature.JS)).toLowerCase());
if (ret.features.contains(HtmlFeature.MEDIA)) { ret.features.stream().map(HtmlFeature::getKeyword).forEach(tagWords::add);
tagWords.add("special:media");
}
if (ret.features.contains(HtmlFeature.TRACKING)) {
tagWords.add("special:tracking");
}
if (ret.features.contains(HtmlFeature.AFFILIATE_LINK)) {
tagWords.add("special:affiliate");
}
if (ret.features.contains(HtmlFeature.COOKIES)) {
tagWords.add("special:cookies");
}
words.append(IndexBlock.Meta, tagWords); words.append(IndexBlock.Meta, tagWords);
words.append(IndexBlock.Words, tagWords); words.append(IndexBlock.Words, tagWords);
@@ -201,7 +189,9 @@ public class DocumentProcessor {
for (var frame : doc.getElementsByTag("frame")) { for (var frame : doc.getElementsByTag("frame")) {
linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept); linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept);
} }
for (var frame : doc.getElementsByTag("iframe")) {
linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept);
}
for (var link : doc.select("link[rel=alternate]")) { for (var link : doc.select("link[rel=alternate]")) {
feedExtractor feedExtractor
.getFeedFromAlternateTag(baseUrl, link) .getFeedFromAlternateTag(baseUrl, link)

View File

@@ -3,17 +3,23 @@ package nu.marginalia.wmsa.edge.converting.processor.logic;
import java.util.Collection; import java.util.Collection;
public enum HtmlFeature { public enum HtmlFeature {
MEDIA(0), MEDIA(0, "special:media"),
JS(1), JS(1, "special:scripts"),
AFFILIATE_LINK(2), AFFILIATE_LINK(2, "special:affiliate"),
TRACKING(3), TRACKING(3, "special:tracking"),
COOKIES(4) COOKIES(4, "special:cookies")
; ;
public final int bit; public final int bit;
private final String keyword;
HtmlFeature(int bit) { HtmlFeature(int bit, String keyword) {
this.bit = bit; this.bit = bit;
this.keyword = keyword;
}
public String getKeyword() {
return keyword;
} }
public static int encode(Collection<HtmlFeature> featuresAll) { public static int encode(Collection<HtmlFeature> featuresAll) {