mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
Cleaned up HTML features code a bit.
This commit is contained in:
parent
2b83e0d754
commit
7dea94d36d
@ -163,7 +163,6 @@ public class DocumentProcessor {
|
|||||||
var edgeDomain = url.domain;
|
var edgeDomain = url.domain;
|
||||||
tagWords.add("format:"+ret.standard.toString().toLowerCase());
|
tagWords.add("format:"+ret.standard.toString().toLowerCase());
|
||||||
|
|
||||||
|
|
||||||
tagWords.add("site:" + edgeDomain.toString().toLowerCase());
|
tagWords.add("site:" + edgeDomain.toString().toLowerCase());
|
||||||
if (!Objects.equals(edgeDomain.toString(), edgeDomain.domain)) {
|
if (!Objects.equals(edgeDomain.toString(), edgeDomain.domain)) {
|
||||||
tagWords.add("site:" + edgeDomain.domain.toLowerCase());
|
tagWords.add("site:" + edgeDomain.domain.toLowerCase());
|
||||||
@ -172,18 +171,7 @@ public class DocumentProcessor {
|
|||||||
tagWords.add("proto:"+url.proto.toLowerCase());
|
tagWords.add("proto:"+url.proto.toLowerCase());
|
||||||
tagWords.add("js:" + Boolean.toString(ret.features.contains(HtmlFeature.JS)).toLowerCase());
|
tagWords.add("js:" + Boolean.toString(ret.features.contains(HtmlFeature.JS)).toLowerCase());
|
||||||
|
|
||||||
if (ret.features.contains(HtmlFeature.MEDIA)) {
|
ret.features.stream().map(HtmlFeature::getKeyword).forEach(tagWords::add);
|
||||||
tagWords.add("special:media");
|
|
||||||
}
|
|
||||||
if (ret.features.contains(HtmlFeature.TRACKING)) {
|
|
||||||
tagWords.add("special:tracking");
|
|
||||||
}
|
|
||||||
if (ret.features.contains(HtmlFeature.AFFILIATE_LINK)) {
|
|
||||||
tagWords.add("special:affiliate");
|
|
||||||
}
|
|
||||||
if (ret.features.contains(HtmlFeature.COOKIES)) {
|
|
||||||
tagWords.add("special:cookies");
|
|
||||||
}
|
|
||||||
|
|
||||||
words.append(IndexBlock.Meta, tagWords);
|
words.append(IndexBlock.Meta, tagWords);
|
||||||
words.append(IndexBlock.Words, tagWords);
|
words.append(IndexBlock.Words, tagWords);
|
||||||
@ -201,7 +189,9 @@ public class DocumentProcessor {
|
|||||||
for (var frame : doc.getElementsByTag("frame")) {
|
for (var frame : doc.getElementsByTag("frame")) {
|
||||||
linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept);
|
linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept);
|
||||||
}
|
}
|
||||||
|
for (var frame : doc.getElementsByTag("iframe")) {
|
||||||
|
linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept);
|
||||||
|
}
|
||||||
for (var link : doc.select("link[rel=alternate]")) {
|
for (var link : doc.select("link[rel=alternate]")) {
|
||||||
feedExtractor
|
feedExtractor
|
||||||
.getFeedFromAlternateTag(baseUrl, link)
|
.getFeedFromAlternateTag(baseUrl, link)
|
||||||
|
@ -3,17 +3,23 @@ package nu.marginalia.wmsa.edge.converting.processor.logic;
|
|||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
|
|
||||||
public enum HtmlFeature {
|
public enum HtmlFeature {
|
||||||
MEDIA(0),
|
MEDIA(0, "special:media"),
|
||||||
JS(1),
|
JS(1, "special:scripts"),
|
||||||
AFFILIATE_LINK(2),
|
AFFILIATE_LINK(2, "special:affiliate"),
|
||||||
TRACKING(3),
|
TRACKING(3, "special:tracking"),
|
||||||
COOKIES(4)
|
COOKIES(4, "special:cookies")
|
||||||
;
|
;
|
||||||
|
|
||||||
public final int bit;
|
public final int bit;
|
||||||
|
private final String keyword;
|
||||||
|
|
||||||
HtmlFeature(int bit) {
|
HtmlFeature(int bit, String keyword) {
|
||||||
this.bit = bit;
|
this.bit = bit;
|
||||||
|
this.keyword = keyword;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getKeyword() {
|
||||||
|
return keyword;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static int encode(Collection<HtmlFeature> featuresAll) {
|
public static int encode(Collection<HtmlFeature> featuresAll) {
|
||||||
|
Loading…
Reference in New Issue
Block a user