mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
(crawler) Update URL blocklist
* Don't crawl MDN mirrors
* More mailing list variants
This commit is contained in:
parent
0f9b90eb1c
commit
74644d59f3
@ -71,6 +71,11 @@ public class UrlBlocklist {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// MDN is nice, but we don't need to crawl a bunch of MDN mirrors >.>
|
||||||
|
if (url.path.contains("developer.mozilla.org")) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
if ("github.com".equals(url.domain.domain)) {
|
if ("github.com".equals(url.domain.domain)) {
|
||||||
return url.path.chars().filter(c -> c == '/').count() > 2;
|
return url.path.chars().filter(c -> c == '/').count() > 2;
|
||||||
}
|
}
|
||||||
@ -94,6 +99,12 @@ public class UrlBlocklist {
|
|||||||
if (path.contains("mailinglist")) {
|
if (path.contains("mailinglist")) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
if (path.contains("mail-archive")) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (path.contains("mailman")) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user