mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
(crawler) Update URL blocklist
* Don't crawl MDN mirrors
* More mailing list variants
This commit is contained in:
parent
cbbf60a599
commit
fba466d6e2
@ -71,6 +71,11 @@ public class UrlBlocklist {
|
||||
return true;
|
||||
}
|
||||
|
||||
// MDN is nice, but we don't need to crawl a bunch of MDN mirrors >.>
|
||||
if (url.path.contains("developer.mozilla.org")) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if ("github.com".equals(url.domain.domain)) {
|
||||
return url.path.chars().filter(c -> c == '/').count() > 2;
|
||||
}
|
||||
@ -94,6 +99,12 @@ public class UrlBlocklist {
|
||||
if (path.contains("mailinglist")) {
|
||||
return true;
|
||||
}
|
||||
if (path.contains("mail-archive")) {
|
||||
return true;
|
||||
}
|
||||
if (path.contains("mailman")) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user