From 80b3ac3dd8f30cc2b54c0fb1562906066f923ee2 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Sat, 16 Jul 2022 21:19:13 +0200 Subject: [PATCH] Tweaking the URL block list to exclude git noise better --- .../edge/crawling/blocklist/UrlBlocklist.java | 30 ++++++++++++++----- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java index a7dce9ed..b8064952 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java @@ -4,15 +4,26 @@ import nu.marginalia.wmsa.edge.model.EdgeUrl; import java.util.ArrayList; import java.util.List; +import java.util.Set; import java.util.function.Predicate; import java.util.regex.Pattern; public class UrlBlocklist { private final List<Predicate<String>> patterns = new ArrayList<>(); + // domains that have a lot of links but we know we don't want to crawl + private final Set<String> badDomains = Set.of("t.co", "facebook.com", + "instagram.com", "youtube.com", + "youtu.be", "amzn.to"); + public UrlBlocklist() { - patterns.add(Pattern.compile(".*/[a-f0-9]{40}(/|$)").asPredicate()); // git - patterns.add(Pattern.compile(".*/[a-f0-9]{64}(/|$)").asPredicate()); // fossil SCM + // Don't deep-crawl git repos + patterns.add(Pattern.compile("\\.git/.+").asPredicate()); + + // long base64-strings in URLs are typically git hashes or the like, rarely worth crawling + patterns.add(Pattern.compile(".*/[^/]*[a-f0-9]{32,}(/|$)").asPredicate()); + + // link farms &c patterns.add(Pattern.compile("/download(-([A-Za-z]+|[0-9]+)){4,}\\.(htm|html|php)$").asPredicate()); patterns.add(Pattern.compile("/permalink/[a-z]+(-([A-Za-z]+|[0-9]+)){3,}\\.(htm|html|php)$").asPredicate()); 
patterns.add(Pattern.compile("(webrx3|lib|pdf|book|720p).*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$").asPredicate()); @@ -22,15 +33,23 @@ public class UrlBlocklist { public boolean isUrlBlocked(EdgeUrl url) { try { + if (badDomains.contains(url.domain.domain)) { + return true; + } + if ("github.com".equals(url.domain.domain)) { return url.path.chars().filter(c -> c == '/').count() > 2; } - return patterns.stream().anyMatch(p -> p.test(url.path)); + for (var p : patterns) { + if (p.test(url.path)) + return true; + } } catch (StackOverflowError ex) { return true; } + return false; } public boolean isMailingListLink(EdgeUrl linkUrl) { @@ -38,12 +57,9 @@ public class UrlBlocklist { if (path.startsWith("/lists/")) { return true; } - if (path.startsWith("mailinglist")) { + if (path.contains("mailinglist")) { return true; } return false; } - - - }