From fba466d6e20e65bdc8f66de0e5b25bc420d96c2c Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 10 Jul 2023 18:04:43 +0200 Subject: [PATCH] (crawler) Update URL blocklist * Don't crawl MDN mirrors * More mailing list variants --- .../java/nu/marginalia/ip_blocklist/UrlBlocklist.java | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/UrlBlocklist.java b/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/UrlBlocklist.java index b8d6a596..f3574b87 100644 --- a/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/UrlBlocklist.java +++ b/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/UrlBlocklist.java @@ -71,6 +71,11 @@ public class UrlBlocklist { return true; } + // MDN is nice, but we don't need to crawl a bunch of MDN mirrors >.> + if (url.path.contains("developer.mozilla.org")) { + return true; + } + if ("github.com".equals(url.domain.domain)) { return url.path.chars().filter(c -> c == '/').count() > 2; } @@ -94,6 +99,12 @@ public class UrlBlocklist { if (path.contains("mailinglist")) { return true; } + if (path.contains("mail-archive")) { + return true; + } + if (path.contains("mailman")) { + return true; + } return false; } }