Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-23 13:09:00 +00:00)

Commit ccf79f47b0: Preparation for conversion
Parent: a04d27692e
@@ -7,7 +7,7 @@ import nu.marginalia.util.language.conf.LanguageModels;
 import nu.marginalia.util.language.processing.SentenceExtractor;
 import nu.marginalia.util.language.processing.model.DocumentLanguageData;
 import nu.marginalia.wmsa.configuration.WmsaHome;
-import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruner;
+import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruningFilter;
 import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
@@ -87,7 +87,6 @@ public class TermFrequencyDict {
         var plan = new CrawlPlanLoader().load(Path.of(args[0]));
 
         ThreadLocal<SentenceExtractor> se = ThreadLocal.withInitial(() -> new SentenceExtractor(WmsaHome.getLanguageModels()));
-        DomPruner pruner = new DomPruner();
         LanguageFilter lf = new LanguageFilter();
 
         TLongIntHashMap counts = new TLongIntHashMap(100_000_000, 0.7f, -1, -1);
@@ -108,7 +107,7 @@ public class TermFrequencyDict {
             docCount.incrementAndGet();
 
             Document parsed = Jsoup.parse(doc.documentBody);
-            pruner.prune(parsed, 0.5);
+            parsed.body().filter(new DomPruningFilter(0.5));
 
             DocumentLanguageData dld = se.get().extractSentences(parsed);
 
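Note: the change above replaces the DomPruner wrapper with direct use of jsoup's NodeFilter hook. A minimal sketch of the new call pattern, assuming DomPruningFilter is on the classpath; the HTML string is illustrative:

import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruningFilter;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

class PruneSketch {
    public static void main(String[] args) {
        // Parse a document, then prune low-value subtrees in place.
        // DomPruningFilter implements org.jsoup.select.NodeFilter, so it
        // plugs straight into Element.filter(); 0.5 is the same threshold
        // the commit uses at every call site.
        Document parsed = Jsoup.parse("<html><body><p>example text</p></body></html>");
        parsed.body().filter(new DomPruningFilter(0.5));
        System.out.println(parsed.body().text());
    }
}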
@@ -171,16 +171,15 @@ public class DocumentProcessor {
             throw new DisqualifiedException(DisqualificationReason.FORBIDDEN);
         }
 
-        DomPruner domPruner = new DomPruner();
         Document prunedDoc = doc.clone();
-        domPruner.prune(prunedDoc, 0.5);
+        prunedDoc.body().filter(new DomPruningFilter(0.5));
 
         var dld = sentenceExtractor.extractSentences(prunedDoc);
 
         checkDocumentLanguage(dld);
 
         var ret = new ProcessedDocumentDetails();
 
         ret.length = getLength(doc);
         ret.standard = getHtmlStandard(doc);
         ret.title = titleExtractor.getTitleAbbreviated(doc, dld, crawledDocument.url);
@@ -246,12 +245,11 @@ public class DocumentProcessor {
             if (linkParser.shouldIndexLink(atag)) {
                 linkOpt.ifPresent(lp::accept);
             }
-            else if (linkOpt.isPresent()) {
-                if (linkParser.hasBinarySuffix(linkOpt.get().toString())) {
-                    linkOpt.ifPresent(lp::acceptNonIndexable);
-                }
+            else {
+                linkOpt
+                        .filter(url -> linkParser.hasBinarySuffix(url.path.toLowerCase()))
+                        .ifPresent(lp::acceptNonIndexable);
             }
-
         }
         for (var frame : doc.getElementsByTag("frame")) {
             linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept);
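Note: the rewritten else-branch folds the isPresent()/get() pair into an Optional chain, and it now lowercases the path before the suffix test, so uppercase extensions like ".PDF" also match. A standalone sketch of the idiom, with hypothetical stand-ins for EdgeUrl and the link processor:

import java.util.Optional;

class OptionalChainSketch {
    record Url(String path) {}  // hypothetical stand-in for EdgeUrl

    static boolean hasBinarySuffix(String s) { return s.endsWith(".pdf"); }

    public static void main(String[] args) {
        Optional<Url> linkOpt = Optional.of(new Url("/files/manual.PDF"));

        // Old shape: if (linkOpt.isPresent()) { if (hasBinarySuffix(linkOpt.get()...)) ... }
        // New shape: filter() carries the condition, ifPresent() the action.
        linkOpt
                .filter(url -> hasBinarySuffix(url.path().toLowerCase()))
                .ifPresent(url -> System.out.println("non-indexable: " + url.path()));
    }
}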
@@ -271,21 +269,20 @@ public class DocumentProcessor {
             linkTerms.add("links:"+fd.toString().toLowerCase());
             linkTerms.add("links:"+fd.getDomain().toLowerCase());
         }
 
         words.append(IndexBlock.Meta, linkTerms);
 
         Set<String> fileKeywords = new HashSet<>(100);
         for (var link : lp.getNonIndexableUrls()) {
-            if (!Objects.equals(domain, link.domain)) {
+            if (!domain.hasSameTopDomain(link.domain)) {
                 continue;
             }
 
             synthesizeFilenameKeyword(fileKeywords, link);
-
         }
 
         words.append(IndexBlock.Artifacts, fileKeywords);
 
     }
 
     private void synthesizeFilenameKeyword(Set<String> fileKeywords, EdgeUrl link) {
@@ -1,6 +1,5 @@
 package nu.marginalia.wmsa.edge.converting.processor.logic;
 
-import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.nodes.Node;
 import org.jsoup.nodes.TextNode;
@@ -9,22 +8,14 @@ import org.jsoup.select.NodeFilter;
 import java.util.HashMap;
 import java.util.Map;
 
-public class DomPruner {
+public class DomPruningFilter implements NodeFilter {
 
-    public void prune(Document document, double pruneThreshold) {
-        document.filter(new PruningFilter(pruneThreshold));
-    }
-
-}
-
-
-class PruningFilter implements NodeFilter {
+    private final double pruneThreshold;
 
     private final Map<Node, NodeData> data = new HashMap<>();
     private final NodeData dummy = new NodeData(Integer.MAX_VALUE, 1, 0);
-    private double pruneThreshold;
 
-    public PruningFilter(double pruneThreshold) {
+    public DomPruningFilter(double pruneThreshold) {
         this.pruneThreshold = pruneThreshold;
     }
 
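Note: the class now implements jsoup's NodeFilter directly instead of delegating to a package-private PruningFilter. For context, a minimal NodeFilter implementation; this is illustrative only, not the actual pruning heuristic, which this hunk does not show:

import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeFilter;

// jsoup walks the DOM calling head()/tail() on every node; returning
// FilterResult.REMOVE deletes the node and its subtree. DomPruningFilter
// uses the same hook, scoring subtrees against its pruneThreshold.
class BlankTextFilter implements NodeFilter {
    @Override
    public FilterResult head(Node node, int depth) {
        if (node instanceof TextNode text && text.isBlank())
            return FilterResult.REMOVE;
        return FilterResult.CONTINUE;
    }

    @Override
    public FilterResult tail(Node node, int depth) {
        return FilterResult.CONTINUE;
    }
}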
@@ -19,10 +19,14 @@ import java.util.regex.Pattern;
 
 public class LinkParser {
     private final Logger logger = LoggerFactory.getLogger(getClass());
 
     private final List<String> blockPrefixList = List.of(
             "mailto:", "javascript:", "tel:", "itpc:", "#", "file:");
-    private final List<String> blockSuffixList = List.of(
+
+    private final List<String> binarySuffixList = List.of(
             ".pdf", ".mp3", ".wmv", ".avi", ".zip", ".7z",
+            ".mpv", ".mp4", ".avi", ".mkv", ".tiff", ".dat", ".tar",
+            ".com", ".bat", ".sh",
             ".bin", ".exe", ".tar.gz", ".tar.bz2", ".xml", ".swf",
             ".wav", ".ogg", ".jpg", ".jpeg", ".png", ".gif", ".webp",
             ".webm", ".bmp", ".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx",
@@ -33,7 +37,7 @@ public class LinkParser {
         return Optional.of(l)
                 .filter(this::shouldIndexLink)
                 .map(this::getUrl)
-                .map(link -> resolveUrl(relativeBaseUrl, link))
+                .map(link -> resolveRelativeUrl(relativeBaseUrl, link))
                 .flatMap(this::createURI)
                 .map(URI::normalize)
                 .map(this::renormalize)
@@ -44,7 +48,7 @@ public class LinkParser {
     public Optional<EdgeUrl> parseLinkPermissive(EdgeUrl relativeBaseUrl, Element l) {
         return Optional.of(l)
                 .map(this::getUrl)
-                .map(link -> resolveUrl(relativeBaseUrl, link))
+                .map(link -> resolveRelativeUrl(relativeBaseUrl, link))
                 .flatMap(this::createURI)
                 .map(URI::normalize)
                 .map(this::renormalize)
@@ -74,7 +78,7 @@ public class LinkParser {
     @Contract(pure=true)
     public Optional<EdgeUrl> parseLink(EdgeUrl baseUrl, String str) {
         return Optional.of(str)
-                .map(link -> resolveUrl(baseUrl, link))
+                .map(link -> resolveRelativeUrl(baseUrl, link))
                 .flatMap(this::createURI)
                 .map(URI::normalize)
                 .map(this::renormalize)
@@ -85,7 +89,7 @@ public class LinkParser {
     public Optional<EdgeUrl> parseFrame(EdgeUrl baseUrl, Element frame) {
         return Optional.of(frame)
                 .map(l -> l.attr("src"))
-                .map(link -> resolveUrl(baseUrl, link))
+                .map(link -> resolveRelativeUrl(baseUrl, link))
                 .flatMap(this::createURI)
                 .map(URI::normalize)
                 .map(this::renormalize)
@@ -95,10 +99,10 @@ public class LinkParser {
     @SneakyThrows
     private URI renormalize(URI uri) {
         if (uri.getPath() == null) {
-            return renormalize(new URI(uri.getScheme(), uri.getHost(), "/", uri.getFragment()));
+            return renormalize(new URI(uri.getScheme(), uri.getHost(), "/", uri.getQuery(), uri.getFragment()));
        }
        if (uri.getPath().startsWith("/../")) {
-            return renormalize(new URI(uri.getScheme(), uri.getHost(), uri.getPath().substring(3), uri.getFragment()));
+            return renormalize(new URI(uri.getScheme(), uri.getHost(), uri.getPath().substring(3), uri.getQuery(), uri.getFragment()));
        }
        return uri;
    }
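Note: both changed lines fix the same bug. java.net.URI's four-argument constructor (scheme, host, path, fragment) has no query slot, so renormalize silently dropped query strings; the five-argument form (scheme, authority, path, query, fragment) preserves them. A quick illustration with made-up values:

import java.net.URI;
import java.net.URISyntaxException;

class RenormalizeSketch {
    public static void main(String[] args) throws URISyntaxException {
        // 4-arg constructor: (scheme, host, path, fragment), no query slot.
        URI before = new URI("https", "example.com", "/", "top");
        // 5-arg constructor: (scheme, authority, path, query, fragment).
        URI after = new URI("https", "example.com", "/", "q=hello", "top");

        System.out.println(before); // https://example.com/#top        (query lost)
        System.out.println(after);  // https://example.com/?q=hello#top
    }
}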
@@ -117,10 +121,10 @@ public class LinkParser {
     private static final Pattern paramSeparatorPattern = Pattern.compile("\\?");
 
     @SneakyThrows
-    private String resolveUrl(EdgeUrl baseUrl, String s) {
+    private String resolveRelativeUrl(EdgeUrl baseUrl, String s) {
 
         // url looks like http://www.marginalia.nu/
-        if (isAbsoluteDomain(s)) {
+        if (doesUrlStringHaveProtocol(s)) {
             return s;
         }
 
@@ -154,8 +158,15 @@ public class LinkParser {
         return url.path.substring(0, lastSlash+1);
     }
 
-    private boolean isAbsoluteDomain(String s) {
-        return s.matches("^[a-zA-Z]+:.*$");
+    private boolean doesUrlStringHaveProtocol(String s) {
+        int i = 0;
+        for (; i < s.length(); i++) {
+            if (!Character.isAlphabetic(s.charAt(i)))
+                break;
+        }
+        if (i == 0 || i == s.length())
+            return false;
+        return ':' == s.charAt(i);
     }
 
     public boolean shouldIndexLink(Element link) {
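Note: doesUrlStringHaveProtocol replaces the per-call regex s.matches("^[a-zA-Z]+:.*$") with a hand-rolled scan, avoiding a Pattern compilation on every link, and the new name states the intent better than isAbsoluteDomain did. The two are near-equivalent (Character.isAlphabetic also accepts non-ASCII letters). A hypothetical comparison harness:

class ProtocolCheckSketch {
    // The new scan from the diff: a leading run of letters followed by ':'.
    static boolean doesUrlStringHaveProtocol(String s) {
        int i = 0;
        for (; i < s.length(); i++) {
            if (!Character.isAlphabetic(s.charAt(i)))
                break;
        }
        if (i == 0 || i == s.length())
            return false;
        return ':' == s.charAt(i);
    }

    public static void main(String[] args) {
        String[] samples = { "https://marginalia.nu/", "mailto:a@b.c", "/relative/path", "page.html", ":oops" };
        for (String s : samples) {
            System.out.printf("%-25s regex=%-5b scan=%b%n",
                    s, s.matches("^[a-zA-Z]+:.*$"), doesUrlStringHaveProtocol(s));
        }
    }
}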
@@ -168,26 +179,29 @@ public class LinkParser {
         return !"noindex".equalsIgnoreCase(rel);
     }
 
-    public boolean hasBinarySuffix(String href) {
-        return blockSuffixList.stream().anyMatch(href::endsWith);
-    }
-
     private boolean isUrlRelevant(String href) {
         if (null == href || "".equals(href)) {
             return false;
         }
-        if (href.length() > 128) {
-            return false;
-        }
         href = href.toLowerCase();
 
         if (blockPrefixList.stream().anyMatch(href::startsWith)) {
             return false;
         }
+        if (hasBinarySuffix(href)) {
+            return false;
+        }
+        if (href.length() > 128) {
+            return false;
+        }
 
         return true;
     }
 
+    public boolean hasBinarySuffix(String str) {
+        return binarySuffixList.stream().anyMatch(str::endsWith);
+    }
+
     @Nullable
     public EdgeUrl getBaseLink(Document parsed, EdgeUrl documentUrl) {
         var baseTags = parsed.getElementsByTag("base");
@@ -9,7 +9,7 @@ import java.util.regex.Pattern;
 
 @AllArgsConstructor
 @Getter @Setter @Builder
-public class EdgeDomain implements WideHashable {
+public class EdgeDomain {
 
     private static final Predicate<String> ipPatternTest = Pattern.compile("[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}").asMatchPredicate();
     private static final Predicate<String> govListTest = Pattern.compile(".*\\.(ac|co|org|gov|edu|com)\\.[a-z]{2}").asMatchPredicate();
@@ -23,6 +23,8 @@ public class EdgeDomain implements WideHashable {
     public EdgeDomain(String host) {
         Objects.requireNonNull(host, "domain name must not be null");
 
+        host = host.toLowerCase();
+
         var dot = host.lastIndexOf('.');
 
         if (dot < 0 || ipPatternTest.test(host)) { // IPV6 >.>
@@ -99,9 +101,11 @@ public class EdgeDomain implements WideHashable {
         return ret.toString().toLowerCase();
     }
 
-    @Override
-    public long wideHash() {
-        return ((long) Objects.hash(domain, subDomain) << 32) | toString().hashCode();
-    }
+    public boolean hasSameTopDomain(EdgeDomain other) {
+        if (other == null) return false;
+
+        return domain.equalsIgnoreCase(other.domain);
+    }
 
     public boolean equals(final Object o) {
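Note: combined with the DocumentProcessor change above (Objects.equals replaced by hasSameTopDomain), sibling subdomains now count as the same site when synthesizing filename keywords. A hedged sketch, assuming the constructor splits a host like "search.marginalia.nu" into subDomain "search" and domain "marginalia.nu" as its code suggests:

class TopDomainSketch {
    public static void main(String[] args) {
        // Hypothetical values; hasSameTopDomain compares only the
        // registered (top) domain, ignoring the subdomain part.
        EdgeDomain a = new EdgeDomain("www.marginalia.nu");
        EdgeDomain b = new EdgeDomain("search.marginalia.nu");

        System.out.println(a.hasSameTopDomain(b));          // true: both marginalia.nu
        System.out.println(java.util.Objects.equals(a, b)); // false: subdomains differ
    }
}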
@@ -9,33 +9,16 @@ import java.util.List;
 import java.util.stream.Collectors;
 
 public enum EdgeSearchProfile {
-    DEFAULT("default",
-            List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Link,
-                    IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus
-            ),
-            0, 1),
-    MODERN("modern",
-            List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Link, IndexBlock.NamesWords,
-                    IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus
-            ),
-            2),
-    CORPO("corpo",
-            List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords,
-                    IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus),
-            4, 5, 7),
-    YOLO("yolo",
-            List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords,
-                    IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus),
-            0, 2, 1, 3, 4, 6),
-    CORPO_CLEAN("corpo-clean",
-            List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords),
-            4, 5),
-    ACADEMIA("academia",
-            List.of( IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords),
-            3),
-    FOOD("food",
-            List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.Words_1, IndexBlock.NamesWords),
-            2, 0),
+    DEFAULT("default", SearchOrder.DEFAULT_ORDER, 0, 1),
+    MODERN("modern", SearchOrder.DEFAULT_ORDER, 2),
+    CORPO("corpo", SearchOrder.DEFAULT_ORDER, 4, 5, 7),
+    YOLO("yolo", SearchOrder.DEFAULT_ORDER, 0, 2, 1, 3, 4, 6),
+    CORPO_CLEAN("corpo-clean", SearchOrder.DEFAULT_ORDER, 4, 5),
+    ACADEMIA("academia", SearchOrder.DEFAULT_ORDER, 3),
+
+    FOOD("food", SearchOrder.DEFAULT_ORDER, 2, 0),
+    CRAFTS("crafts", SearchOrder.DEFAULT_ORDER, 2, 0),
     ;
 
 
@@ -55,12 +38,14 @@ public enum EdgeSearchProfile {
         if (null == param) {
             return YOLO;
         }
 
         return switch (param) {
             case "modern" -> MODERN;
             case "default" -> DEFAULT;
             case "corpo" -> CORPO;
             case "academia" -> ACADEMIA;
+            case "food" -> FOOD;
+            case "crafts" -> CRAFTS;
             default -> YOLO;
         };
     }
@@ -69,6 +54,14 @@ public enum EdgeSearchProfile {
         if (this == FOOD) {
             subquery.searchTermsInclude.add(HtmlFeature.CATEGORY_FOOD.getKeyword());
         }
+        if (this == CRAFTS) {
+            subquery.searchTermsInclude.add(HtmlFeature.CATEGORY_CRAFTS.getKeyword());
+        }
 
     }
 }
+
+class SearchOrder {
+    static List<IndexBlock> DEFAULT_ORDER = List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Link,
+            IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus);
+}
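Note: the repeated List.of(...) blocks collapse into one shared constant. Putting DEFAULT_ORDER in a separate package-private SearchOrder class rather than on the enum itself sidesteps a Java rule: enum constants are constructed before the enum's own static fields are initialized, so a constant like DEFAULT("default", DEFAULT_ORDER, 0, 1) could not reference a static field declared inside EdgeSearchProfile. A minimal illustration with placeholder types:

import java.util.List;

// Holder class, mirroring what the diff does with SearchOrder.
class OrderHolder {
    static final List<String> DEFAULT_ORDER = List.of("Title", "Tfidf_Top", "Link");
}

enum ProfileSketch {
    // Referencing OrderHolder.DEFAULT_ORDER is fine; referencing a static
    // field of ProfileSketch itself here would be a compile-time error.
    DEFAULT("default", OrderHolder.DEFAULT_ORDER),
    MODERN("modern", OrderHolder.DEFAULT_ORDER);

    final String name;
    final List<String> order;

    ProfileSketch(String name, List<String> order) {
        this.name = name;
        this.order = order;
    }
}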
@@ -7,7 +7,7 @@ import nu.marginalia.util.language.processing.SentenceExtractor;
 import nu.marginalia.wmsa.configuration.WmsaHome;
 import nu.marginalia.wmsa.edge.converting.ConverterModule;
 import nu.marginalia.wmsa.edge.converting.processor.DomainProcessor;
-import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruner;
+import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruningFilter;
 import nu.marginalia.wmsa.edge.converting.processor.logic.topic.RecipeDetector;
 import nu.marginalia.wmsa.edge.converting.processor.logic.topic.TextileCraftDetector;
 import nu.marginalia.wmsa.edge.converting.processor.logic.topic.WoodworkingDetector;
@@ -25,7 +25,6 @@ public class ConverterLogicTestTool {
 
     private final Logger logger = LoggerFactory.getLogger(getClass());
 
-    DomPruner domPruner = new DomPruner();
     RecipeDetector recipeDetector = new RecipeDetector();
     WoodworkingDetector woodworkingDetector = new WoodworkingDetector();
     TextileCraftDetector textileCraftDetector = new TextileCraftDetector();
@@ -64,7 +63,7 @@ public class ConverterLogicTestTool {
         Runnable task = () -> {
             var parsed = Jsoup.parse(doc.documentBody);
 
-            domPruner.prune(parsed, 0.5);
+            parsed.body().filter(new DomPruningFilter(0.5));
             var dld = se.extractSentences(parsed);
 
             if (dld.totalNumWords() < 250)
@@ -7,12 +7,18 @@
     </div>
     <div class="settings">
         <select name="profile" id="profile">
+            <optgroup label="General Search">
             <option {{#eq profile "default"}}selected{{/eq}} value="default">Popular Sites</option>
             <option {{#eq profile "modern"}}selected{{/eq}} value="modern">Blogs and Personal Websites</option>
             <option {{#eq profile "academia"}}selected{{/eq}} value="academia">Academia, Forums, Big Websites</option>
             <option {{#eq profile "yolo"}}selected{{/eq}} value="yolo">Default Ranking Algorithm</option>
+            <option {{#eq profile "corpo"}}selected{{/eq}} value="corpo">Everything</option>
+            </optgroup>
+            <optgroup label="Topics Search">
             <option {{#eq profile "food"}}selected{{/eq}} value="food">Recipes 🍳</option>
-            <option {{#eq profile "corpo"}}selected{{/eq}} value="corpo">Experimental</option>
+            <option {{#eq profile "crafts"}}selected{{/eq}} value="crafts">Crafts 🪡🔨 (WIP; mostly textile-craft)</option>
+            </optgroup>
 
         </select>
         <select name="js" id="js">
             <option {{#eq js "default"}}selected{{/eq}} value="default">Allow JS</option>
@@ -1,7 +1,7 @@
 {{#if scripts}}<abbr title="scripts" class="meta">🏭️</abbr>{{/if}}
 {{#if tracking}}<abbr title="analytics or tracking" class="meta">🕵️️</abbr>{{/if}}
 {{#if media}}<abbr title="audio or video" class="meta">🎞️</abbr>{{/if}}
-{{#if affiliate}}<abbr title="possible amazon affiliate link (experimental; unreliable)" class="meta">💳️</abbr>{{/if}}
+{{#if affiliate}}<abbr title="possible amazon affiliate link" class="meta">💳️</abbr>{{/if}}
 {{#if cookies}}<abbr title="cookies" class="meta">👁️️</abbr>{{/if}}
 {{#if ads}}<abbr title="ads (experimental)" class="meta">⚠️️️</abbr>{{/if}}
 <span class="meta">{{format}}</span>
@@ -4,7 +4,7 @@ import org.junit.jupiter.api.Test;
 
 import java.io.IOException;
 
-class DomPrunerTest {
+class DomPruningFilterTest {
     @Test
     public void test() throws IOException {