mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
Preparation for conversion
This commit is contained in:
parent
a04d27692e
commit
ccf79f47b0
@ -7,7 +7,7 @@ import nu.marginalia.util.language.conf.LanguageModels;
|
|||||||
import nu.marginalia.util.language.processing.SentenceExtractor;
|
import nu.marginalia.util.language.processing.SentenceExtractor;
|
||||||
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
||||||
import nu.marginalia.wmsa.configuration.WmsaHome;
|
import nu.marginalia.wmsa.configuration.WmsaHome;
|
||||||
import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruner;
|
import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruningFilter;
|
||||||
import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
|
import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
|
||||||
import org.jsoup.Jsoup;
|
import org.jsoup.Jsoup;
|
||||||
import org.jsoup.nodes.Document;
|
import org.jsoup.nodes.Document;
|
||||||
@ -87,7 +87,6 @@ public class TermFrequencyDict {
|
|||||||
var plan = new CrawlPlanLoader().load(Path.of(args[0]));
|
var plan = new CrawlPlanLoader().load(Path.of(args[0]));
|
||||||
|
|
||||||
ThreadLocal<SentenceExtractor> se = ThreadLocal.withInitial(() -> new SentenceExtractor(WmsaHome.getLanguageModels()));
|
ThreadLocal<SentenceExtractor> se = ThreadLocal.withInitial(() -> new SentenceExtractor(WmsaHome.getLanguageModels()));
|
||||||
DomPruner pruner = new DomPruner();
|
|
||||||
LanguageFilter lf = new LanguageFilter();
|
LanguageFilter lf = new LanguageFilter();
|
||||||
|
|
||||||
TLongIntHashMap counts = new TLongIntHashMap(100_000_000, 0.7f, -1, -1);
|
TLongIntHashMap counts = new TLongIntHashMap(100_000_000, 0.7f, -1, -1);
|
||||||
@ -108,7 +107,7 @@ public class TermFrequencyDict {
|
|||||||
docCount.incrementAndGet();
|
docCount.incrementAndGet();
|
||||||
|
|
||||||
Document parsed = Jsoup.parse(doc.documentBody);
|
Document parsed = Jsoup.parse(doc.documentBody);
|
||||||
pruner.prune(parsed, 0.5);
|
parsed.body().filter(new DomPruningFilter(0.5));
|
||||||
|
|
||||||
DocumentLanguageData dld = se.get().extractSentences(parsed);
|
DocumentLanguageData dld = se.get().extractSentences(parsed);
|
||||||
|
|
||||||
|
@ -171,16 +171,15 @@ public class DocumentProcessor {
|
|||||||
throw new DisqualifiedException(DisqualificationReason.FORBIDDEN);
|
throw new DisqualifiedException(DisqualificationReason.FORBIDDEN);
|
||||||
}
|
}
|
||||||
|
|
||||||
DomPruner domPruner = new DomPruner();
|
|
||||||
Document prunedDoc = doc.clone();
|
Document prunedDoc = doc.clone();
|
||||||
domPruner.prune(prunedDoc, 0.5);
|
prunedDoc.body().filter(new DomPruningFilter(0.5));
|
||||||
|
|
||||||
var dld = sentenceExtractor.extractSentences(prunedDoc);
|
var dld = sentenceExtractor.extractSentences(prunedDoc);
|
||||||
|
|
||||||
checkDocumentLanguage(dld);
|
checkDocumentLanguage(dld);
|
||||||
|
|
||||||
var ret = new ProcessedDocumentDetails();
|
var ret = new ProcessedDocumentDetails();
|
||||||
|
|
||||||
|
|
||||||
ret.length = getLength(doc);
|
ret.length = getLength(doc);
|
||||||
ret.standard = getHtmlStandard(doc);
|
ret.standard = getHtmlStandard(doc);
|
||||||
ret.title = titleExtractor.getTitleAbbreviated(doc, dld, crawledDocument.url);
|
ret.title = titleExtractor.getTitleAbbreviated(doc, dld, crawledDocument.url);
|
||||||
@ -246,13 +245,12 @@ public class DocumentProcessor {
|
|||||||
if (linkParser.shouldIndexLink(atag)) {
|
if (linkParser.shouldIndexLink(atag)) {
|
||||||
linkOpt.ifPresent(lp::accept);
|
linkOpt.ifPresent(lp::accept);
|
||||||
}
|
}
|
||||||
else if (linkOpt.isPresent()) {
|
else {
|
||||||
if (linkParser.hasBinarySuffix(linkOpt.get().toString())) {
|
linkOpt
|
||||||
linkOpt.ifPresent(lp::acceptNonIndexable);
|
.filter(url -> linkParser.hasBinarySuffix(url.path.toLowerCase()))
|
||||||
|
.ifPresent(lp::acceptNonIndexable);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
|
||||||
for (var frame : doc.getElementsByTag("frame")) {
|
for (var frame : doc.getElementsByTag("frame")) {
|
||||||
linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept);
|
linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept);
|
||||||
}
|
}
|
||||||
@ -271,21 +269,20 @@ public class DocumentProcessor {
|
|||||||
linkTerms.add("links:"+fd.toString().toLowerCase());
|
linkTerms.add("links:"+fd.toString().toLowerCase());
|
||||||
linkTerms.add("links:"+fd.getDomain().toLowerCase());
|
linkTerms.add("links:"+fd.getDomain().toLowerCase());
|
||||||
}
|
}
|
||||||
|
|
||||||
words.append(IndexBlock.Meta, linkTerms);
|
words.append(IndexBlock.Meta, linkTerms);
|
||||||
|
|
||||||
Set<String> fileKeywords = new HashSet<>(100);
|
Set<String> fileKeywords = new HashSet<>(100);
|
||||||
for (var link : lp.getNonIndexableUrls()) {
|
for (var link : lp.getNonIndexableUrls()) {
|
||||||
|
|
||||||
if (!Objects.equals(domain, link.domain)) {
|
if (!domain.hasSameTopDomain(link.domain)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
synthesizeFilenameKeyword(fileKeywords, link);
|
synthesizeFilenameKeyword(fileKeywords, link);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
words.append(IndexBlock.Artifacts, fileKeywords);
|
words.append(IndexBlock.Artifacts, fileKeywords);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private void synthesizeFilenameKeyword(Set<String> fileKeywords, EdgeUrl link) {
|
private void synthesizeFilenameKeyword(Set<String> fileKeywords, EdgeUrl link) {
|
||||||
|
@ -1,6 +1,5 @@
|
|||||||
package nu.marginalia.wmsa.edge.converting.processor.logic;
|
package nu.marginalia.wmsa.edge.converting.processor.logic;
|
||||||
|
|
||||||
import org.jsoup.nodes.Document;
|
|
||||||
import org.jsoup.nodes.Element;
|
import org.jsoup.nodes.Element;
|
||||||
import org.jsoup.nodes.Node;
|
import org.jsoup.nodes.Node;
|
||||||
import org.jsoup.nodes.TextNode;
|
import org.jsoup.nodes.TextNode;
|
||||||
@ -9,22 +8,14 @@ import org.jsoup.select.NodeFilter;
|
|||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
public class DomPruner {
|
public class DomPruningFilter implements NodeFilter {
|
||||||
|
|
||||||
public void prune(Document document, double pruneThreshold) {
|
private final double pruneThreshold;
|
||||||
document.filter(new PruningFilter(pruneThreshold));
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
class PruningFilter implements NodeFilter {
|
|
||||||
|
|
||||||
private final Map<Node, NodeData> data = new HashMap<>();
|
private final Map<Node, NodeData> data = new HashMap<>();
|
||||||
private final NodeData dummy = new NodeData(Integer.MAX_VALUE, 1, 0);
|
private final NodeData dummy = new NodeData(Integer.MAX_VALUE, 1, 0);
|
||||||
private double pruneThreshold;
|
|
||||||
|
|
||||||
public PruningFilter(double pruneThreshold) {
|
public DomPruningFilter(double pruneThreshold) {
|
||||||
this.pruneThreshold = pruneThreshold;
|
this.pruneThreshold = pruneThreshold;
|
||||||
}
|
}
|
||||||
|
|
@ -19,10 +19,14 @@ import java.util.regex.Pattern;
|
|||||||
|
|
||||||
public class LinkParser {
|
public class LinkParser {
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
|
||||||
private final List<String> blockPrefixList = List.of(
|
private final List<String> blockPrefixList = List.of(
|
||||||
"mailto:", "javascript:", "tel:", "itpc:", "#", "file:");
|
"mailto:", "javascript:", "tel:", "itpc:", "#", "file:");
|
||||||
private final List<String> blockSuffixList = List.of(
|
|
||||||
|
private final List<String> binarySuffixList = List.of(
|
||||||
".pdf", ".mp3", ".wmv", ".avi", ".zip", ".7z",
|
".pdf", ".mp3", ".wmv", ".avi", ".zip", ".7z",
|
||||||
|
".mpv", ".mp4", ".avi", ".mkv", ".tiff", ".dat", ".tar",
|
||||||
|
".com", ".bat", ".sh",
|
||||||
".bin", ".exe", ".tar.gz", ".tar.bz2", ".xml", ".swf",
|
".bin", ".exe", ".tar.gz", ".tar.bz2", ".xml", ".swf",
|
||||||
".wav", ".ogg", ".jpg", ".jpeg", ".png", ".gif", ".webp",
|
".wav", ".ogg", ".jpg", ".jpeg", ".png", ".gif", ".webp",
|
||||||
".webm", ".bmp", ".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx",
|
".webm", ".bmp", ".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx",
|
||||||
@ -33,7 +37,7 @@ public class LinkParser {
|
|||||||
return Optional.of(l)
|
return Optional.of(l)
|
||||||
.filter(this::shouldIndexLink)
|
.filter(this::shouldIndexLink)
|
||||||
.map(this::getUrl)
|
.map(this::getUrl)
|
||||||
.map(link -> resolveUrl(relativeBaseUrl, link))
|
.map(link -> resolveRelativeUrl(relativeBaseUrl, link))
|
||||||
.flatMap(this::createURI)
|
.flatMap(this::createURI)
|
||||||
.map(URI::normalize)
|
.map(URI::normalize)
|
||||||
.map(this::renormalize)
|
.map(this::renormalize)
|
||||||
@ -44,7 +48,7 @@ public class LinkParser {
|
|||||||
public Optional<EdgeUrl> parseLinkPermissive(EdgeUrl relativeBaseUrl, Element l) {
|
public Optional<EdgeUrl> parseLinkPermissive(EdgeUrl relativeBaseUrl, Element l) {
|
||||||
return Optional.of(l)
|
return Optional.of(l)
|
||||||
.map(this::getUrl)
|
.map(this::getUrl)
|
||||||
.map(link -> resolveUrl(relativeBaseUrl, link))
|
.map(link -> resolveRelativeUrl(relativeBaseUrl, link))
|
||||||
.flatMap(this::createURI)
|
.flatMap(this::createURI)
|
||||||
.map(URI::normalize)
|
.map(URI::normalize)
|
||||||
.map(this::renormalize)
|
.map(this::renormalize)
|
||||||
@ -74,7 +78,7 @@ public class LinkParser {
|
|||||||
@Contract(pure=true)
|
@Contract(pure=true)
|
||||||
public Optional<EdgeUrl> parseLink(EdgeUrl baseUrl, String str) {
|
public Optional<EdgeUrl> parseLink(EdgeUrl baseUrl, String str) {
|
||||||
return Optional.of(str)
|
return Optional.of(str)
|
||||||
.map(link -> resolveUrl(baseUrl, link))
|
.map(link -> resolveRelativeUrl(baseUrl, link))
|
||||||
.flatMap(this::createURI)
|
.flatMap(this::createURI)
|
||||||
.map(URI::normalize)
|
.map(URI::normalize)
|
||||||
.map(this::renormalize)
|
.map(this::renormalize)
|
||||||
@ -85,7 +89,7 @@ public class LinkParser {
|
|||||||
public Optional<EdgeUrl> parseFrame(EdgeUrl baseUrl, Element frame) {
|
public Optional<EdgeUrl> parseFrame(EdgeUrl baseUrl, Element frame) {
|
||||||
return Optional.of(frame)
|
return Optional.of(frame)
|
||||||
.map(l -> l.attr("src"))
|
.map(l -> l.attr("src"))
|
||||||
.map(link -> resolveUrl(baseUrl, link))
|
.map(link -> resolveRelativeUrl(baseUrl, link))
|
||||||
.flatMap(this::createURI)
|
.flatMap(this::createURI)
|
||||||
.map(URI::normalize)
|
.map(URI::normalize)
|
||||||
.map(this::renormalize)
|
.map(this::renormalize)
|
||||||
@ -95,10 +99,10 @@ public class LinkParser {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
private URI renormalize(URI uri) {
|
private URI renormalize(URI uri) {
|
||||||
if (uri.getPath() == null) {
|
if (uri.getPath() == null) {
|
||||||
return renormalize(new URI(uri.getScheme(), uri.getHost(), "/", uri.getFragment()));
|
return renormalize(new URI(uri.getScheme(), uri.getHost(), "/", uri.getQuery(), uri.getFragment()));
|
||||||
}
|
}
|
||||||
if (uri.getPath().startsWith("/../")) {
|
if (uri.getPath().startsWith("/../")) {
|
||||||
return renormalize(new URI(uri.getScheme(), uri.getHost(), uri.getPath().substring(3), uri.getFragment()));
|
return renormalize(new URI(uri.getScheme(), uri.getHost(), uri.getPath().substring(3), uri.getQuery(), uri.getFragment()));
|
||||||
}
|
}
|
||||||
return uri;
|
return uri;
|
||||||
}
|
}
|
||||||
@ -117,10 +121,10 @@ public class LinkParser {
|
|||||||
private static final Pattern paramSeparatorPattern = Pattern.compile("\\?");
|
private static final Pattern paramSeparatorPattern = Pattern.compile("\\?");
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
private String resolveUrl(EdgeUrl baseUrl, String s) {
|
private String resolveRelativeUrl(EdgeUrl baseUrl, String s) {
|
||||||
|
|
||||||
// url looks like http://www.marginalia.nu/
|
// url looks like http://www.marginalia.nu/
|
||||||
if (isAbsoluteDomain(s)) {
|
if (doesUrlStringHaveProtocol(s)) {
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -154,8 +158,15 @@ public class LinkParser {
|
|||||||
return url.path.substring(0, lastSlash+1);
|
return url.path.substring(0, lastSlash+1);
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean isAbsoluteDomain(String s) {
|
private boolean doesUrlStringHaveProtocol(String s) {
|
||||||
return s.matches("^[a-zA-Z]+:.*$");
|
int i = 0;
|
||||||
|
for (; i < s.length(); i++) {
|
||||||
|
if (!Character.isAlphabetic(s.charAt(i)))
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (i == 0 || i == s.length())
|
||||||
|
return false;
|
||||||
|
return ':' == s.charAt(i);
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean shouldIndexLink(Element link) {
|
public boolean shouldIndexLink(Element link) {
|
||||||
@ -168,26 +179,29 @@ public class LinkParser {
|
|||||||
return !"noindex".equalsIgnoreCase(rel);
|
return !"noindex".equalsIgnoreCase(rel);
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean hasBinarySuffix(String href) {
|
|
||||||
return blockSuffixList.stream().anyMatch(href::endsWith);
|
|
||||||
}
|
|
||||||
|
|
||||||
private boolean isUrlRelevant(String href) {
|
private boolean isUrlRelevant(String href) {
|
||||||
if (null == href || "".equals(href)) {
|
if (null == href || "".equals(href)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
if (href.length() > 128) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
href = href.toLowerCase();
|
||||||
|
|
||||||
if (blockPrefixList.stream().anyMatch(href::startsWith)) {
|
if (blockPrefixList.stream().anyMatch(href::startsWith)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (hasBinarySuffix(href)) {
|
if (hasBinarySuffix(href)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (href.length() > 128) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public boolean hasBinarySuffix(String str) {
|
||||||
|
return binarySuffixList.stream().anyMatch(str::endsWith);
|
||||||
|
}
|
||||||
|
|
||||||
@Nullable
|
@Nullable
|
||||||
public EdgeUrl getBaseLink(Document parsed, EdgeUrl documentUrl) {
|
public EdgeUrl getBaseLink(Document parsed, EdgeUrl documentUrl) {
|
||||||
var baseTags = parsed.getElementsByTag("base");
|
var baseTags = parsed.getElementsByTag("base");
|
||||||
@ -196,7 +210,7 @@ public class LinkParser {
|
|||||||
for (var tag : baseTags) {
|
for (var tag : baseTags) {
|
||||||
String href = tag.attr("href");
|
String href = tag.attr("href");
|
||||||
if (!Strings.isNullOrEmpty(href)) {
|
if (!Strings.isNullOrEmpty(href)) {
|
||||||
return new EdgeUrl(resolveUrl(documentUrl, href));
|
return new EdgeUrl(resolveRelativeUrl(documentUrl, href));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -9,7 +9,7 @@ import java.util.regex.Pattern;
|
|||||||
|
|
||||||
@AllArgsConstructor
|
@AllArgsConstructor
|
||||||
@Getter @Setter @Builder
|
@Getter @Setter @Builder
|
||||||
public class EdgeDomain implements WideHashable {
|
public class EdgeDomain {
|
||||||
|
|
||||||
private static final Predicate<String> ipPatternTest = Pattern.compile("[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}").asMatchPredicate();
|
private static final Predicate<String> ipPatternTest = Pattern.compile("[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}").asMatchPredicate();
|
||||||
private static final Predicate<String> govListTest = Pattern.compile(".*\\.(ac|co|org|gov|edu|com)\\.[a-z]{2}").asMatchPredicate();
|
private static final Predicate<String> govListTest = Pattern.compile(".*\\.(ac|co|org|gov|edu|com)\\.[a-z]{2}").asMatchPredicate();
|
||||||
@ -23,6 +23,8 @@ public class EdgeDomain implements WideHashable {
|
|||||||
public EdgeDomain(String host) {
|
public EdgeDomain(String host) {
|
||||||
Objects.requireNonNull(host, "domain name must not be null");
|
Objects.requireNonNull(host, "domain name must not be null");
|
||||||
|
|
||||||
|
host = host.toLowerCase();
|
||||||
|
|
||||||
var dot = host.lastIndexOf('.');
|
var dot = host.lastIndexOf('.');
|
||||||
|
|
||||||
if (dot < 0 || ipPatternTest.test(host)) { // IPV6 >.>
|
if (dot < 0 || ipPatternTest.test(host)) { // IPV6 >.>
|
||||||
@ -99,9 +101,11 @@ public class EdgeDomain implements WideHashable {
|
|||||||
return ret.toString().toLowerCase();
|
return ret.toString().toLowerCase();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public long wideHash() {
|
public boolean hasSameTopDomain(EdgeDomain other) {
|
||||||
return ((long) Objects.hash(domain, subDomain) << 32) | toString().hashCode();
|
if (other == null) return false;
|
||||||
|
|
||||||
|
return domain.equalsIgnoreCase(other.domain);
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean equals(final Object o) {
|
public boolean equals(final Object o) {
|
||||||
|
@ -9,33 +9,16 @@ import java.util.List;
|
|||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
public enum EdgeSearchProfile {
|
public enum EdgeSearchProfile {
|
||||||
DEFAULT("default",
|
|
||||||
List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Link,
|
DEFAULT("default", SearchOrder.DEFAULT_ORDER, 0, 1),
|
||||||
IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus
|
MODERN("modern", SearchOrder.DEFAULT_ORDER, 2),
|
||||||
),
|
CORPO("corpo", SearchOrder.DEFAULT_ORDER, 4, 5, 7),
|
||||||
0, 1),
|
YOLO("yolo", SearchOrder.DEFAULT_ORDER, 0, 2, 1, 3, 4, 6),
|
||||||
MODERN("modern",
|
CORPO_CLEAN("corpo-clean", SearchOrder.DEFAULT_ORDER, 4, 5),
|
||||||
List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Link, IndexBlock.NamesWords,
|
ACADEMIA("academia", SearchOrder.DEFAULT_ORDER, 3),
|
||||||
IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus
|
|
||||||
),
|
FOOD("food", SearchOrder.DEFAULT_ORDER, 2, 0),
|
||||||
2),
|
CRAFTS("crafts", SearchOrder.DEFAULT_ORDER, 2, 0),
|
||||||
CORPO("corpo",
|
|
||||||
List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords,
|
|
||||||
IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus),
|
|
||||||
4, 5, 7),
|
|
||||||
YOLO("yolo",
|
|
||||||
List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords,
|
|
||||||
IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus),
|
|
||||||
0, 2, 1, 3, 4, 6),
|
|
||||||
CORPO_CLEAN("corpo-clean",
|
|
||||||
List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords),
|
|
||||||
4, 5),
|
|
||||||
ACADEMIA("academia",
|
|
||||||
List.of( IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords),
|
|
||||||
3),
|
|
||||||
FOOD("food",
|
|
||||||
List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.Words_1, IndexBlock.NamesWords),
|
|
||||||
2, 0),
|
|
||||||
;
|
;
|
||||||
|
|
||||||
|
|
||||||
@ -55,12 +38,14 @@ public enum EdgeSearchProfile {
|
|||||||
if (null == param) {
|
if (null == param) {
|
||||||
return YOLO;
|
return YOLO;
|
||||||
}
|
}
|
||||||
|
|
||||||
return switch (param) {
|
return switch (param) {
|
||||||
case "modern" -> MODERN;
|
case "modern" -> MODERN;
|
||||||
case "default" -> DEFAULT;
|
case "default" -> DEFAULT;
|
||||||
case "corpo" -> CORPO;
|
case "corpo" -> CORPO;
|
||||||
case "academia" -> ACADEMIA;
|
case "academia" -> ACADEMIA;
|
||||||
case "food" -> FOOD;
|
case "food" -> FOOD;
|
||||||
|
case "crafts" -> CRAFTS;
|
||||||
default -> YOLO;
|
default -> YOLO;
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
@ -69,6 +54,14 @@ public enum EdgeSearchProfile {
|
|||||||
if (this == FOOD) {
|
if (this == FOOD) {
|
||||||
subquery.searchTermsInclude.add(HtmlFeature.CATEGORY_FOOD.getKeyword());
|
subquery.searchTermsInclude.add(HtmlFeature.CATEGORY_FOOD.getKeyword());
|
||||||
}
|
}
|
||||||
|
if (this == CRAFTS) {
|
||||||
|
subquery.searchTermsInclude.add(HtmlFeature.CATEGORY_CRAFTS.getKeyword());
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
class SearchOrder {
|
||||||
|
static List<IndexBlock> DEFAULT_ORDER = List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Link,
|
||||||
|
IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus);
|
||||||
|
}
|
@ -7,7 +7,7 @@ import nu.marginalia.util.language.processing.SentenceExtractor;
|
|||||||
import nu.marginalia.wmsa.configuration.WmsaHome;
|
import nu.marginalia.wmsa.configuration.WmsaHome;
|
||||||
import nu.marginalia.wmsa.edge.converting.ConverterModule;
|
import nu.marginalia.wmsa.edge.converting.ConverterModule;
|
||||||
import nu.marginalia.wmsa.edge.converting.processor.DomainProcessor;
|
import nu.marginalia.wmsa.edge.converting.processor.DomainProcessor;
|
||||||
import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruner;
|
import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruningFilter;
|
||||||
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.RecipeDetector;
|
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.RecipeDetector;
|
||||||
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.TextileCraftDetector;
|
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.TextileCraftDetector;
|
||||||
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.WoodworkingDetector;
|
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.WoodworkingDetector;
|
||||||
@ -25,7 +25,6 @@ public class ConverterLogicTestTool {
|
|||||||
|
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
|
||||||
DomPruner domPruner = new DomPruner();
|
|
||||||
RecipeDetector recipeDetector = new RecipeDetector();
|
RecipeDetector recipeDetector = new RecipeDetector();
|
||||||
WoodworkingDetector woodworkingDetector = new WoodworkingDetector();
|
WoodworkingDetector woodworkingDetector = new WoodworkingDetector();
|
||||||
TextileCraftDetector textileCraftDetector = new TextileCraftDetector();
|
TextileCraftDetector textileCraftDetector = new TextileCraftDetector();
|
||||||
@ -64,7 +63,7 @@ public class ConverterLogicTestTool {
|
|||||||
Runnable task = () -> {
|
Runnable task = () -> {
|
||||||
var parsed = Jsoup.parse(doc.documentBody);
|
var parsed = Jsoup.parse(doc.documentBody);
|
||||||
|
|
||||||
domPruner.prune(parsed, 0.5);
|
parsed.body().filter(new DomPruningFilter(0.5));
|
||||||
var dld = se.extractSentences(parsed);
|
var dld = se.extractSentences(parsed);
|
||||||
|
|
||||||
if (dld.totalNumWords() < 250)
|
if (dld.totalNumWords() < 250)
|
||||||
|
@ -7,12 +7,18 @@
|
|||||||
</div>
|
</div>
|
||||||
<div class="settings">
|
<div class="settings">
|
||||||
<select name="profile" id="profile">
|
<select name="profile" id="profile">
|
||||||
|
<optgroup label="General Search">
|
||||||
<option {{#eq profile "default"}}selected{{/eq}} value="default">Popular Sites</option>
|
<option {{#eq profile "default"}}selected{{/eq}} value="default">Popular Sites</option>
|
||||||
<option {{#eq profile "modern"}}selected{{/eq}} value="modern">Blogs and Personal Websites</option>
|
<option {{#eq profile "modern"}}selected{{/eq}} value="modern">Blogs and Personal Websites</option>
|
||||||
<option {{#eq profile "academia"}}selected{{/eq}} value="academia">Academia, Forums, Big Websites</option>
|
<option {{#eq profile "academia"}}selected{{/eq}} value="academia">Academia, Forums, Big Websites</option>
|
||||||
<option {{#eq profile "yolo"}}selected{{/eq}} value="yolo">Default Ranking Algorithm</option>
|
<option {{#eq profile "yolo"}}selected{{/eq}} value="yolo">Default Ranking Algorithm</option>
|
||||||
|
<option {{#eq profile "corpo"}}selected{{/eq}} value="corpo">Everything</option>
|
||||||
|
</optgroup>
|
||||||
|
<optgroup label="Topics Search">
|
||||||
<option {{#eq profile "food"}}selected{{/eq}} value="food">Recipes 🍳</option>
|
<option {{#eq profile "food"}}selected{{/eq}} value="food">Recipes 🍳</option>
|
||||||
<option {{#eq profile "corpo"}}selected{{/eq}} value="corpo">Experimental</option>
|
<option {{#eq profile "crafts"}}selected{{/eq}} value="crafts">Crafts 🪡🔨 (WIP; mostly textile-craft)</option>
|
||||||
|
</optgroup>
|
||||||
|
|
||||||
</select>
|
</select>
|
||||||
<select name="js" id="js">
|
<select name="js" id="js">
|
||||||
<option {{#eq js "default"}}selected{{/eq}} value="default">Allow JS</option>
|
<option {{#eq js "default"}}selected{{/eq}} value="default">Allow JS</option>
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
{{#if scripts}}<abbr title="scripts" class="meta">🏭️</abbr>{{/if}}
|
{{#if scripts}}<abbr title="scripts" class="meta">🏭️</abbr>{{/if}}
|
||||||
{{#if tracking}}<abbr title="analytics or tracking" class="meta">🕵️️</abbr>{{/if}}
|
{{#if tracking}}<abbr title="analytics or tracking" class="meta">🕵️️</abbr>{{/if}}
|
||||||
{{#if media}}<abbr title="audio or video" class="meta">🎞️</abbr>{{/if}}
|
{{#if media}}<abbr title="audio or video" class="meta">🎞️</abbr>{{/if}}
|
||||||
{{#if affiliate}}<abbr title="possible amazon affiliate link (experimental; unreliable)" class="meta">💳️</abbr>{{/if}}
|
{{#if affiliate}}<abbr title="possible amazon affiliate link" class="meta">💳️</abbr>{{/if}}
|
||||||
{{#if cookies}}<abbr title="cookies" class="meta">👁️️</abbr>{{/if}}
|
{{#if cookies}}<abbr title="cookies" class="meta">👁️️</abbr>{{/if}}
|
||||||
{{#if ads}}<abbr title="ads (experimental)" class="meta">⚠️️️</abbr>{{/if}}
|
{{#if ads}}<abbr title="ads (experimental)" class="meta">⚠️️️</abbr>{{/if}}
|
||||||
<span class="meta">{{format}}</span>
|
<span class="meta">{{format}}</span>
|
||||||
|
@ -4,7 +4,7 @@ import org.junit.jupiter.api.Test;
|
|||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
class DomPrunerTest {
|
class DomPruningFilterTest {
|
||||||
@Test
|
@Test
|
||||||
public void test() throws IOException {
|
public void test() throws IOException {
|
||||||
|
|
Loading…
Reference in New Issue
Block a user