(crawler) Update URL blocklist

* Don't crawl MDN mirrors
* Block more mailing list URL variants (mail-archive, mailman)
This commit is contained in:
Viktor Lofgren 2023-07-10 18:04:43 +02:00
parent cbbf60a599
commit fba466d6e2

View File

@@ -71,6 +71,11 @@ public class UrlBlocklist {
return true;
}
// MDN is nice, but we don't need to crawl a bunch of MDN mirrors >.>
if (url.path.contains("developer.mozilla.org")) {
return true;
}
if ("github.com".equals(url.domain.domain)) {
return url.path.chars().filter(c -> c == '/').count() > 2;
}
@@ -94,6 +99,12 @@ public class UrlBlocklist {
if (path.contains("mailinglist")) {
return true;
}
if (path.contains("mail-archive")) {
return true;
}
if (path.contains("mailman")) {
return true;
}
return false;
}
}