Preparation for conversion

This commit is contained in:
vlofgren 2022-09-02 14:51:11 +02:00
parent a04d27692e
commit ccf79f47b0
10 changed files with 85 additions and 82 deletions

View File

@ -7,7 +7,7 @@ import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruner;
import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruningFilter;
import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
@ -87,7 +87,6 @@ public class TermFrequencyDict {
var plan = new CrawlPlanLoader().load(Path.of(args[0]));
ThreadLocal<SentenceExtractor> se = ThreadLocal.withInitial(() -> new SentenceExtractor(WmsaHome.getLanguageModels()));
DomPruner pruner = new DomPruner();
LanguageFilter lf = new LanguageFilter();
TLongIntHashMap counts = new TLongIntHashMap(100_000_000, 0.7f, -1, -1);
@ -108,7 +107,7 @@ public class TermFrequencyDict {
docCount.incrementAndGet();
Document parsed = Jsoup.parse(doc.documentBody);
pruner.prune(parsed, 0.5);
parsed.body().filter(new DomPruningFilter(0.5));
DocumentLanguageData dld = se.get().extractSentences(parsed);

View File

@ -171,16 +171,15 @@ public class DocumentProcessor {
throw new DisqualifiedException(DisqualificationReason.FORBIDDEN);
}
DomPruner domPruner = new DomPruner();
Document prunedDoc = doc.clone();
domPruner.prune(prunedDoc, 0.5);
prunedDoc.body().filter(new DomPruningFilter(0.5));
var dld = sentenceExtractor.extractSentences(prunedDoc);
checkDocumentLanguage(dld);
var ret = new ProcessedDocumentDetails();
ret.length = getLength(doc);
ret.standard = getHtmlStandard(doc);
ret.title = titleExtractor.getTitleAbbreviated(doc, dld, crawledDocument.url);
@ -246,12 +245,11 @@ public class DocumentProcessor {
if (linkParser.shouldIndexLink(atag)) {
linkOpt.ifPresent(lp::accept);
}
else if (linkOpt.isPresent()) {
if (linkParser.hasBinarySuffix(linkOpt.get().toString())) {
linkOpt.ifPresent(lp::acceptNonIndexable);
}
else {
linkOpt
.filter(url -> linkParser.hasBinarySuffix(url.path.toLowerCase()))
.ifPresent(lp::acceptNonIndexable);
}
}
for (var frame : doc.getElementsByTag("frame")) {
linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept);
@ -271,21 +269,20 @@ public class DocumentProcessor {
linkTerms.add("links:"+fd.toString().toLowerCase());
linkTerms.add("links:"+fd.getDomain().toLowerCase());
}
words.append(IndexBlock.Meta, linkTerms);
Set<String> fileKeywords = new HashSet<>(100);
for (var link : lp.getNonIndexableUrls()) {
if (!Objects.equals(domain, link.domain)) {
if (!domain.hasSameTopDomain(link.domain)) {
continue;
}
synthesizeFilenameKeyword(fileKeywords, link);
}
words.append(IndexBlock.Artifacts, fileKeywords);
}
private void synthesizeFilenameKeyword(Set<String> fileKeywords, EdgeUrl link) {

View File

@ -1,6 +1,5 @@
package nu.marginalia.wmsa.edge.converting.processor.logic;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
@ -9,22 +8,14 @@ import org.jsoup.select.NodeFilter;
import java.util.HashMap;
import java.util.Map;
public class DomPruner {
public class DomPruningFilter implements NodeFilter {
public void prune(Document document, double pruneThreshold) {
document.filter(new PruningFilter(pruneThreshold));
}
}
class PruningFilter implements NodeFilter {
private final double pruneThreshold;
private final Map<Node, NodeData> data = new HashMap<>();
private final NodeData dummy = new NodeData(Integer.MAX_VALUE, 1, 0);
private double pruneThreshold;
public PruningFilter(double pruneThreshold) {
public DomPruningFilter(double pruneThreshold) {
this.pruneThreshold = pruneThreshold;
}

View File

@ -19,10 +19,14 @@ import java.util.regex.Pattern;
public class LinkParser {
private final Logger logger = LoggerFactory.getLogger(getClass());
private final List<String> blockPrefixList = List.of(
"mailto:", "javascript:", "tel:", "itpc:", "#", "file:");
private final List<String> blockSuffixList = List.of(
private final List<String> binarySuffixList = List.of(
".pdf", ".mp3", ".wmv", ".avi", ".zip", ".7z",
".mpv", ".mp4", ".avi", ".mkv", ".tiff", ".dat", ".tar",
".com", ".bat", ".sh",
".bin", ".exe", ".tar.gz", ".tar.bz2", ".xml", ".swf",
".wav", ".ogg", ".jpg", ".jpeg", ".png", ".gif", ".webp",
".webm", ".bmp", ".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx",
@ -33,7 +37,7 @@ public class LinkParser {
return Optional.of(l)
.filter(this::shouldIndexLink)
.map(this::getUrl)
.map(link -> resolveUrl(relativeBaseUrl, link))
.map(link -> resolveRelativeUrl(relativeBaseUrl, link))
.flatMap(this::createURI)
.map(URI::normalize)
.map(this::renormalize)
@ -44,7 +48,7 @@ public class LinkParser {
public Optional<EdgeUrl> parseLinkPermissive(EdgeUrl relativeBaseUrl, Element l) {
return Optional.of(l)
.map(this::getUrl)
.map(link -> resolveUrl(relativeBaseUrl, link))
.map(link -> resolveRelativeUrl(relativeBaseUrl, link))
.flatMap(this::createURI)
.map(URI::normalize)
.map(this::renormalize)
@ -74,7 +78,7 @@ public class LinkParser {
@Contract(pure=true)
public Optional<EdgeUrl> parseLink(EdgeUrl baseUrl, String str) {
return Optional.of(str)
.map(link -> resolveUrl(baseUrl, link))
.map(link -> resolveRelativeUrl(baseUrl, link))
.flatMap(this::createURI)
.map(URI::normalize)
.map(this::renormalize)
@ -85,7 +89,7 @@ public class LinkParser {
public Optional<EdgeUrl> parseFrame(EdgeUrl baseUrl, Element frame) {
return Optional.of(frame)
.map(l -> l.attr("src"))
.map(link -> resolveUrl(baseUrl, link))
.map(link -> resolveRelativeUrl(baseUrl, link))
.flatMap(this::createURI)
.map(URI::normalize)
.map(this::renormalize)
@ -95,10 +99,10 @@ public class LinkParser {
/**
 * Repairs a URI whose path is missing or still starts with a {@code "/.."}
 * segment (callers in this class invoke it right after {@link URI#normalize()}).
 * <p>
 * A {@code null} path is replaced with {@code "/"}; a leading {@code "/.."} is
 * stripped. The method recurses until neither rule applies. Scheme, authority,
 * query and fragment are carried over to the rebuilt URI. The authority is used
 * instead of the bare host so that an explicit port or user-info is not
 * silently dropped during the rebuild.
 *
 * @param uri the URI to repair
 * @return {@code uri} itself when no repair was needed, otherwise a rebuilt URI
 */
@SneakyThrows
private URI renormalize(URI uri) {
    final String path = uri.getPath();

    if (path == null) {
        return renormalize(new URI(uri.getScheme(), uri.getAuthority(), "/", uri.getQuery(), uri.getFragment()));
    }
    if (path.startsWith("/../")) {
        // substring(3) removes "/.." and keeps the slash that follows it
        return renormalize(new URI(uri.getScheme(), uri.getAuthority(), path.substring(3), uri.getQuery(), uri.getFragment()));
    }
    return uri;
}
@ -117,10 +121,10 @@ public class LinkParser {
private static final Pattern paramSeparatorPattern = Pattern.compile("\\?");
@SneakyThrows
private String resolveUrl(EdgeUrl baseUrl, String s) {
private String resolveRelativeUrl(EdgeUrl baseUrl, String s) {
// url looks like http://www.marginalia.nu/
if (isAbsoluteDomain(s)) {
if (doesUrlStringHaveProtocol(s)) {
return s;
}
@ -154,8 +158,15 @@ public class LinkParser {
return url.path.substring(0, lastSlash+1);
}
private boolean isAbsoluteDomain(String s) {
return s.matches("^[a-zA-Z]+:.*$");
/**
 * Tells whether the string starts with one or more ASCII letters immediately
 * followed by a colon, e.g. {@code "http:"} or {@code "mailto:"}.
 * <p>
 * Only {@code a-z} / {@code A-Z} count as letters: URI schemes are ASCII, so a
 * prefix such as {@code "école:"} must not be mistaken for a protocol.
 *
 * @param s the link text to inspect; must not be null
 * @return true if a letters-only prefix is directly followed by {@code ':'}
 */
private boolean doesUrlStringHaveProtocol(String s) {
    int i = 0;
    while (i < s.length()) {
        final char c = s.charAt(i);
        final boolean asciiLetter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
        if (!asciiLetter) {
            break;
        }
        i++;
    }

    // Need at least one leading letter, and a character after the run to test
    if (i == 0 || i == s.length()) {
        return false;
    }
    return s.charAt(i) == ':';
}
public boolean shouldIndexLink(Element link) {
@ -168,26 +179,29 @@ public class LinkParser {
return !"noindex".equalsIgnoreCase(rel);
}
public boolean hasBinarySuffix(String href) {
return blockSuffixList.stream().anyMatch(href::endsWith);
}
/**
 * Screens an href string against a series of rejection rules.
 * Returns false for a null or empty string, for a string longer than 128
 * characters, for a string that (after lower-casing) starts with an entry of
 * blockPrefixList, and for a string for which hasBinarySuffix reports true.
 *
 * @param href the raw href value; may be null
 * @return true if none of the rejection rules matched
 */
private boolean isUrlRelevant(String href) {
if (null == href || "".equals(href)) {
return false;
}
// Length guard, applied to the string before it is lower-cased
if (href.length() > 128) {
return false;
}
// Prefix and suffix lists are compared against the lower-cased form
href = href.toLowerCase();
if (blockPrefixList.stream().anyMatch(href::startsWith)) {
return false;
}
if (hasBinarySuffix(href)) {
return false;
}
// NOTE(review): repeats the 128-character guard above. This listing is a
// rendered diff, so one of the two copies is presumably the pre-change
// line — confirm against the real file.
if (href.length() > 128) {
return false;
}
return true;
}
/**
 * Checks the given string against every entry of {@code binarySuffixList}.
 * The comparison is case-sensitive, exactly as {@link String#endsWith(String)} is.
 *
 * @param str the string to test; must not be null
 * @return true if {@code str} ends with at least one listed suffix
 */
public boolean hasBinarySuffix(String str) {
    for (String suffix : binarySuffixList) {
        if (str.endsWith(suffix)) {
            return true;
        }
    }
    return false;
}
@Nullable
public EdgeUrl getBaseLink(Document parsed, EdgeUrl documentUrl) {
var baseTags = parsed.getElementsByTag("base");
@ -196,7 +210,7 @@ public class LinkParser {
for (var tag : baseTags) {
String href = tag.attr("href");
if (!Strings.isNullOrEmpty(href)) {
return new EdgeUrl(resolveUrl(documentUrl, href));
return new EdgeUrl(resolveRelativeUrl(documentUrl, href));
}
}
}

View File

@ -9,7 +9,7 @@ import java.util.regex.Pattern;
@AllArgsConstructor
@Getter @Setter @Builder
public class EdgeDomain implements WideHashable {
public class EdgeDomain {
private static final Predicate<String> ipPatternTest = Pattern.compile("[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}").asMatchPredicate();
private static final Predicate<String> govListTest = Pattern.compile(".*\\.(ac|co|org|gov|edu|com)\\.[a-z]{2}").asMatchPredicate();
@ -23,6 +23,8 @@ public class EdgeDomain implements WideHashable {
public EdgeDomain(String host) {
Objects.requireNonNull(host, "domain name must not be null");
host = host.toLowerCase();
var dot = host.lastIndexOf('.');
if (dot < 0 || ipPatternTest.test(host)) { // IPV6 >.>
@ -99,9 +101,11 @@ public class EdgeDomain implements WideHashable {
return ret.toString().toLowerCase();
}
@Override
public long wideHash() {
return ((long) Objects.hash(domain, subDomain) << 32) | toString().hashCode();
/**
 * Compares the {@code domain} field of this instance with that of another
 * instance, ignoring case.
 *
 * @param other the instance to compare with; may be null
 * @return false when {@code other} is null, otherwise whether the two
 *         {@code domain} values are equal ignoring case
 */
public boolean hasSameTopDomain(EdgeDomain other) {
    return other != null && domain.equalsIgnoreCase(other.domain);
}
public boolean equals(final Object o) {

View File

@ -9,33 +9,16 @@ import java.util.List;
import java.util.stream.Collectors;
public enum EdgeSearchProfile {
DEFAULT("default",
List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Link,
IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus
),
0, 1),
MODERN("modern",
List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Link, IndexBlock.NamesWords,
IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus
),
2),
CORPO("corpo",
List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords,
IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus),
4, 5, 7),
YOLO("yolo",
List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords,
IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus),
0, 2, 1, 3, 4, 6),
CORPO_CLEAN("corpo-clean",
List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords),
4, 5),
ACADEMIA("academia",
List.of( IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords),
3),
FOOD("food",
List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.Words_1, IndexBlock.NamesWords),
2, 0),
DEFAULT("default", SearchOrder.DEFAULT_ORDER, 0, 1),
MODERN("modern", SearchOrder.DEFAULT_ORDER, 2),
CORPO("corpo", SearchOrder.DEFAULT_ORDER, 4, 5, 7),
YOLO("yolo", SearchOrder.DEFAULT_ORDER, 0, 2, 1, 3, 4, 6),
CORPO_CLEAN("corpo-clean", SearchOrder.DEFAULT_ORDER, 4, 5),
ACADEMIA("academia", SearchOrder.DEFAULT_ORDER, 3),
FOOD("food", SearchOrder.DEFAULT_ORDER, 2, 0),
CRAFTS("crafts", SearchOrder.DEFAULT_ORDER, 2, 0),
;
@ -55,12 +38,14 @@ public enum EdgeSearchProfile {
if (null == param) {
return YOLO;
}
return switch (param) {
case "modern" -> MODERN;
case "default" -> DEFAULT;
case "corpo" -> CORPO;
case "academia" -> ACADEMIA;
case "food" -> FOOD;
case "crafts" -> CRAFTS;
default -> YOLO;
};
}
@ -69,6 +54,14 @@ public enum EdgeSearchProfile {
if (this == FOOD) {
subquery.searchTermsInclude.add(HtmlFeature.CATEGORY_FOOD.getKeyword());
}
if (this == CRAFTS) {
subquery.searchTermsInclude.add(HtmlFeature.CATEGORY_CRAFTS.getKeyword());
}
}
}
/** Holds the index-block list that the EdgeSearchProfile constants are constructed with. */
class SearchOrder {
    /**
     * Block list passed to the profile constants. Declared {@code final} so the
     * shared reference cannot be reassigned at runtime; the list returned by
     * {@code List.of} is itself unmodifiable.
     */
    static final List<IndexBlock> DEFAULT_ORDER = List.of(
            IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Link,
            IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus);
}

View File

@ -7,7 +7,7 @@ import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.edge.converting.ConverterModule;
import nu.marginalia.wmsa.edge.converting.processor.DomainProcessor;
import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruner;
import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruningFilter;
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.RecipeDetector;
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.TextileCraftDetector;
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.WoodworkingDetector;
@ -25,7 +25,6 @@ public class ConverterLogicTestTool {
private final Logger logger = LoggerFactory.getLogger(getClass());
DomPruner domPruner = new DomPruner();
RecipeDetector recipeDetector = new RecipeDetector();
WoodworkingDetector woodworkingDetector = new WoodworkingDetector();
TextileCraftDetector textileCraftDetector = new TextileCraftDetector();
@ -64,7 +63,7 @@ public class ConverterLogicTestTool {
Runnable task = () -> {
var parsed = Jsoup.parse(doc.documentBody);
domPruner.prune(parsed, 0.5);
parsed.body().filter(new DomPruningFilter(0.5));
var dld = se.extractSentences(parsed);
if (dld.totalNumWords() < 250)

View File

@ -7,12 +7,18 @@
</div>
<div class="settings">
<select name="profile" id="profile">
<optgroup label="General Search">
<option {{#eq profile "default"}}selected{{/eq}} value="default">Popular Sites</option>
<option {{#eq profile "modern"}}selected{{/eq}} value="modern">Blogs and Personal Websites</option>
<option {{#eq profile "academia"}}selected{{/eq}} value="academia">Academia, Forums, Big Websites</option>
<option {{#eq profile "yolo"}}selected{{/eq}} value="yolo">Default Ranking Algorithm</option>
<option {{#eq profile "corpo"}}selected{{/eq}} value="corpo">Everything</option>
</optgroup>
<optgroup label="Topics Search">
<option {{#eq profile "food"}}selected{{/eq}} value="food">Recipes &#127859;</option>
<option {{#eq profile "corpo"}}selected{{/eq}} value="corpo">Experimental</option>
<option {{#eq profile "crafts"}}selected{{/eq}} value="crafts">Crafts &#129697;&#128296; (WIP; mostly textile-craft)</option>
</optgroup>
</select>
<select name="js" id="js">
<option {{#eq js "default"}}selected{{/eq}} value="default">Allow JS</option>

View File

@ -1,7 +1,7 @@
{{#if scripts}}<abbr title="scripts" class="meta">🏭️</abbr>{{/if}}
{{#if tracking}}<abbr title="analytics or tracking" class="meta">🕵️️</abbr>{{/if}}
{{#if media}}<abbr title="audio or video" class="meta">🎞️</abbr>{{/if}}
{{#if affiliate}}<abbr title="possible amazon affiliate link (experimental; unreliable)" class="meta">💳️</abbr>{{/if}}
{{#if affiliate}}<abbr title="possible amazon affiliate link" class="meta">💳️</abbr>{{/if}}
{{#if cookies}}<abbr title="cookies" class="meta">👁️️</abbr>{{/if}}
{{#if ads}}<abbr title="ads (experimental)" class="meta">⚠️️️</abbr>{{/if}}
<span class="meta">{{format}}</span>

View File

@ -4,7 +4,7 @@ import org.junit.jupiter.api.Test;
import java.io.IOException;
class DomPrunerTest {
class DomPruningFilterTest {
@Test
public void test() throws IOException {