diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/TermFrequencyDict.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/TermFrequencyDict.java index d219b30d..2a29e2a4 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/TermFrequencyDict.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/TermFrequencyDict.java @@ -7,7 +7,7 @@ import nu.marginalia.util.language.conf.LanguageModels; import nu.marginalia.util.language.processing.SentenceExtractor; import nu.marginalia.util.language.processing.model.DocumentLanguageData; import nu.marginalia.wmsa.configuration.WmsaHome; -import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruner; +import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruningFilter; import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; @@ -87,7 +87,6 @@ public class TermFrequencyDict { var plan = new CrawlPlanLoader().load(Path.of(args[0])); ThreadLocal se = ThreadLocal.withInitial(() -> new SentenceExtractor(WmsaHome.getLanguageModels())); - DomPruner pruner = new DomPruner(); LanguageFilter lf = new LanguageFilter(); TLongIntHashMap counts = new TLongIntHashMap(100_000_000, 0.7f, -1, -1); @@ -108,7 +107,7 @@ public class TermFrequencyDict { docCount.incrementAndGet(); Document parsed = Jsoup.parse(doc.documentBody); - pruner.prune(parsed, 0.5); + parsed.body().filter(new DomPruningFilter(0.5)); DocumentLanguageData dld = se.get().extractSentences(parsed); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java index 60d1071d..5037c791 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java @@ -171,16 +171,15 @@ public class DocumentProcessor { throw new DisqualifiedException(DisqualificationReason.FORBIDDEN); } - DomPruner domPruner = new DomPruner(); Document prunedDoc = doc.clone(); - domPruner.prune(prunedDoc, 0.5); + prunedDoc.body().filter(new DomPruningFilter(0.5)); + var dld = sentenceExtractor.extractSentences(prunedDoc); checkDocumentLanguage(dld); var ret = new ProcessedDocumentDetails(); - ret.length = getLength(doc); ret.standard = getHtmlStandard(doc); ret.title = titleExtractor.getTitleAbbreviated(doc, dld, crawledDocument.url); @@ -246,12 +245,11 @@ public class DocumentProcessor { if (linkParser.shouldIndexLink(atag)) { linkOpt.ifPresent(lp::accept); } - else if (linkOpt.isPresent()) { - if (linkParser.hasBinarySuffix(linkOpt.get().toString())) { - linkOpt.ifPresent(lp::acceptNonIndexable); - } + else { + linkOpt + .filter(url -> linkParser.hasBinarySuffix(url.path.toLowerCase())) + .ifPresent(lp::acceptNonIndexable); } - } for (var frame : doc.getElementsByTag("frame")) { linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept); @@ -271,21 +269,20 @@ public class DocumentProcessor { linkTerms.add("links:"+fd.toString().toLowerCase()); linkTerms.add("links:"+fd.getDomain().toLowerCase()); } - words.append(IndexBlock.Meta, linkTerms); Set fileKeywords = new HashSet<>(100); for (var link : lp.getNonIndexableUrls()) { - if (!Objects.equals(domain, link.domain)) { + if (!domain.hasSameTopDomain(link.domain)) { continue; } synthesizeFilenameKeyword(fileKeywords, link); } - words.append(IndexBlock.Artifacts, fileKeywords); + } private void synthesizeFilenameKeyword(Set fileKeywords, EdgeUrl link) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DomPruner.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DomPruningFilter.java similarity index 89% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DomPruner.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DomPruningFilter.java index beb23977..1e68125f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DomPruner.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DomPruningFilter.java @@ -1,6 +1,5 @@ package nu.marginalia.wmsa.edge.converting.processor.logic; -import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; import org.jsoup.nodes.TextNode; @@ -9,22 +8,14 @@ import org.jsoup.select.NodeFilter; import java.util.HashMap; import java.util.Map; -public class DomPruner { +public class DomPruningFilter implements NodeFilter { - public void prune(Document document, double pruneThreshold) { - document.filter(new PruningFilter(pruneThreshold)); - } - -} - - -class PruningFilter implements NodeFilter { + private final double pruneThreshold; private final Map data = new HashMap<>(); private final NodeData dummy = new NodeData(Integer.MAX_VALUE, 1, 0); - private double pruneThreshold; - public PruningFilter(double pruneThreshold) { + public DomPruningFilter(double pruneThreshold) { this.pruneThreshold = pruneThreshold; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java index 98be5315..06313f1d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java @@ -19,10 +19,14 @@ import java.util.regex.Pattern; public class LinkParser { private final Logger logger = LoggerFactory.getLogger(getClass()); + private final List blockPrefixList = List.of( "mailto:", "javascript:", "tel:", "itpc:", "#", "file:"); - private final List blockSuffixList = List.of( + + private final List binarySuffixList = List.of( ".pdf", ".mp3", ".wmv", ".avi", ".zip", ".7z", + ".mpv", ".mp4", ".avi", ".mkv", ".tiff", ".dat", ".tar", + ".com", ".bat", ".sh", ".bin", ".exe", ".tar.gz", ".tar.bz2", ".xml", ".swf", ".wav", ".ogg", ".jpg", ".jpeg", ".png", ".gif", ".webp", ".webm", ".bmp", ".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx", @@ -33,7 +37,7 @@ public class LinkParser { return Optional.of(l) .filter(this::shouldIndexLink) .map(this::getUrl) - .map(link -> resolveUrl(relativeBaseUrl, link)) + .map(link -> resolveRelativeUrl(relativeBaseUrl, link)) .flatMap(this::createURI) .map(URI::normalize) .map(this::renormalize) @@ -44,7 +48,7 @@ public class LinkParser { public Optional parseLinkPermissive(EdgeUrl relativeBaseUrl, Element l) { return Optional.of(l) .map(this::getUrl) - .map(link -> resolveUrl(relativeBaseUrl, link)) + .map(link -> resolveRelativeUrl(relativeBaseUrl, link)) .flatMap(this::createURI) .map(URI::normalize) .map(this::renormalize) @@ -74,7 +78,7 @@ public class LinkParser { @Contract(pure=true) public Optional parseLink(EdgeUrl baseUrl, String str) { return Optional.of(str) - .map(link -> resolveUrl(baseUrl, link)) + .map(link -> resolveRelativeUrl(baseUrl, link)) .flatMap(this::createURI) .map(URI::normalize) .map(this::renormalize) @@ -85,7 +89,7 @@ public class LinkParser { public Optional parseFrame(EdgeUrl baseUrl, Element frame) { return Optional.of(frame) .map(l -> l.attr("src")) - .map(link -> resolveUrl(baseUrl, link)) + .map(link -> resolveRelativeUrl(baseUrl, link)) .flatMap(this::createURI) .map(URI::normalize) .map(this::renormalize) @@ -95,10 +99,10 @@ public class LinkParser { @SneakyThrows private URI renormalize(URI uri) { if (uri.getPath() == null) { - return renormalize(new URI(uri.getScheme(), uri.getHost(), "/", uri.getFragment())); + return renormalize(new URI(uri.getScheme(), uri.getHost(), "/", uri.getQuery(), uri.getFragment())); } if (uri.getPath().startsWith("/../")) { - return renormalize(new URI(uri.getScheme(), uri.getHost(), uri.getPath().substring(3), uri.getFragment())); + return renormalize(new URI(uri.getScheme(), uri.getHost(), uri.getPath().substring(3), uri.getQuery(), uri.getFragment())); } return uri; } @@ -117,10 +121,10 @@ public class LinkParser { private static final Pattern paramSeparatorPattern = Pattern.compile("\\?"); @SneakyThrows - private String resolveUrl(EdgeUrl baseUrl, String s) { + private String resolveRelativeUrl(EdgeUrl baseUrl, String s) { // url looks like http://www.marginalia.nu/ - if (isAbsoluteDomain(s)) { + if (doesUrlStringHaveProtocol(s)) { return s; } @@ -154,8 +158,15 @@ public class LinkParser { return url.path.substring(0, lastSlash+1); } - private boolean isAbsoluteDomain(String s) { - return s.matches("^[a-zA-Z]+:.*$"); + private boolean doesUrlStringHaveProtocol(String s) { + int i = 0; + for (; i < s.length(); i++) { + if (!Character.isAlphabetic(s.charAt(i))) + break; + } + if (i == 0 || i == s.length()) + return false; + return ':' == s.charAt(i); } public boolean shouldIndexLink(Element link) { @@ -168,26 +179,29 @@ public class LinkParser { return !"noindex".equalsIgnoreCase(rel); } - public boolean hasBinarySuffix(String href) { - return blockSuffixList.stream().anyMatch(href::endsWith); - } - private boolean isUrlRelevant(String href) { if (null == href || "".equals(href)) { return false; } + if (href.length() > 128) { + return false; + } + href = href.toLowerCase(); + if (blockPrefixList.stream().anyMatch(href::startsWith)) { return false; } if (hasBinarySuffix(href)) { return false; } - if (href.length() > 128) { - return false; - } + return true; } + public boolean hasBinarySuffix(String str) { + return binarySuffixList.stream().anyMatch(str::endsWith); + } + @Nullable public EdgeUrl getBaseLink(Document parsed, EdgeUrl documentUrl) { var baseTags = parsed.getElementsByTag("base"); @@ -196,7 +210,7 @@ public class LinkParser { for (var tag : baseTags) { String href = tag.attr("href"); if (!Strings.isNullOrEmpty(href)) { - return new EdgeUrl(resolveUrl(documentUrl, href)); + return new EdgeUrl(resolveRelativeUrl(documentUrl, href)); } } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java index 658184c0..58a78e58 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java @@ -9,7 +9,7 @@ import java.util.regex.Pattern; @AllArgsConstructor @Getter @Setter @Builder -public class EdgeDomain implements WideHashable { +public class EdgeDomain { private static final Predicate ipPatternTest = Pattern.compile("[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}").asMatchPredicate(); private static final Predicate govListTest = Pattern.compile(".*\\.(ac|co|org|gov|edu|com)\\.[a-z]{2}").asMatchPredicate(); @@ -23,6 +23,8 @@ public class EdgeDomain implements WideHashable { public EdgeDomain(String host) { Objects.requireNonNull(host, "domain name must not be null"); + host = host.toLowerCase(); + var dot = host.lastIndexOf('.'); if (dot < 0 || ipPatternTest.test(host)) { // IPV6 >.> @@ -99,9 +101,11 @@ public class EdgeDomain implements WideHashable { return ret.toString().toLowerCase(); } - @Override - public long wideHash() { - return ((long) Objects.hash(domain, subDomain) << 32) | toString().hashCode(); + + public boolean hasSameTopDomain(EdgeDomain other) { + if (other == null) return false; + + return domain.equalsIgnoreCase(other.domain); } public boolean equals(final Object o) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java index df19ef16..ca62c74d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java @@ -9,33 +9,16 @@ import java.util.List; import java.util.stream.Collectors; public enum EdgeSearchProfile { - DEFAULT("default", - List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Link, - IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus - ), - 0, 1), - MODERN("modern", - List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Link, IndexBlock.NamesWords, - IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus - ), - 2), - CORPO("corpo", - List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords, - IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus), - 4, 5, 7), - YOLO("yolo", - List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords, - IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus), - 0, 2, 1, 3, 4, 6), - CORPO_CLEAN("corpo-clean", - List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords), - 4, 5), - ACADEMIA("academia", - List.of( IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords), - 3), - FOOD("food", - List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.Words_1, IndexBlock.NamesWords), - 2, 0), + + DEFAULT("default", SearchOrder.DEFAULT_ORDER, 0, 1), + MODERN("modern", SearchOrder.DEFAULT_ORDER, 2), + CORPO("corpo", SearchOrder.DEFAULT_ORDER, 4, 5, 7), + YOLO("yolo", SearchOrder.DEFAULT_ORDER, 0, 2, 1, 3, 4, 6), + CORPO_CLEAN("corpo-clean", SearchOrder.DEFAULT_ORDER, 4, 5), + ACADEMIA("academia", SearchOrder.DEFAULT_ORDER, 3), + + FOOD("food", SearchOrder.DEFAULT_ORDER, 2, 0), + CRAFTS("crafts", SearchOrder.DEFAULT_ORDER, 2, 0), ; @@ -55,12 +38,14 @@ public enum EdgeSearchProfile { if (null == param) { return YOLO; } + return switch (param) { case "modern" -> MODERN; case "default" -> DEFAULT; case "corpo" -> CORPO; case "academia" -> ACADEMIA; case "food" -> FOOD; + case "crafts" -> CRAFTS; default -> YOLO; }; } @@ -69,6 +54,14 @@ public enum EdgeSearchProfile { if (this == FOOD) { subquery.searchTermsInclude.add(HtmlFeature.CATEGORY_FOOD.getKeyword()); } + if (this == CRAFTS) { + subquery.searchTermsInclude.add(HtmlFeature.CATEGORY_CRAFTS.getKeyword()); + } } } + +class SearchOrder { + static List DEFAULT_ORDER = List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Link, + IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus); +} \ No newline at end of file diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ConverterLogicTestTool.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ConverterLogicTestTool.java index c22f5ddd..afcb22ed 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ConverterLogicTestTool.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ConverterLogicTestTool.java @@ -7,7 +7,7 @@ import nu.marginalia.util.language.processing.SentenceExtractor; import nu.marginalia.wmsa.configuration.WmsaHome; import nu.marginalia.wmsa.edge.converting.ConverterModule; import nu.marginalia.wmsa.edge.converting.processor.DomainProcessor; -import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruner; +import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruningFilter; import nu.marginalia.wmsa.edge.converting.processor.logic.topic.RecipeDetector; import nu.marginalia.wmsa.edge.converting.processor.logic.topic.TextileCraftDetector; import nu.marginalia.wmsa.edge.converting.processor.logic.topic.WoodworkingDetector; @@ -25,7 +25,6 @@ public class ConverterLogicTestTool { private final Logger logger = LoggerFactory.getLogger(getClass()); - DomPruner domPruner = new DomPruner(); RecipeDetector recipeDetector = new RecipeDetector(); WoodworkingDetector woodworkingDetector = new WoodworkingDetector(); TextileCraftDetector textileCraftDetector = new TextileCraftDetector(); @@ -64,7 +63,7 @@ public class ConverterLogicTestTool { Runnable task = () -> { var parsed = Jsoup.parse(doc.documentBody); - domPruner.prune(parsed, 0.5); + parsed.body().filter(new DomPruningFilter(0.5)); var dld = se.extractSentences(parsed); if (dld.totalNumWords() < 250) diff --git a/marginalia_nu/src/main/resources/templates/edge/parts/search-form.hdb b/marginalia_nu/src/main/resources/templates/edge/parts/search-form.hdb index 839b7934..89e72d22 100644 --- a/marginalia_nu/src/main/resources/templates/edge/parts/search-form.hdb +++ b/marginalia_nu/src/main/resources/templates/edge/parts/search-form.hdb @@ -7,12 +7,18 @@