Exclude wp-content/uploads from crawling

This commit is contained in:
vlofgren 2022-08-18 19:05:03 +02:00
parent 4e3a977049
commit a1eb8375a2
2 changed files with 2 additions and 0 deletions

View File

@@ -19,6 +19,7 @@ public class UrlBlocklist {
public UrlBlocklist() {
// Don't deep-crawl git repos
patterns.add(Pattern.compile("\\.git/.+").asPredicate());
patterns.add(Pattern.compile("wp-content/upload").asPredicate());
// long hexadecimal strings (32+ chars of a-f, 0-9) in URLs are typically git hashes or the like, rarely worth crawling
patterns.add(Pattern.compile(".*/[^/]*[a-f0-9]{32,}(/|$)").asPredicate());

View File

@@ -16,6 +16,7 @@ class UrlBlocklistTest {
UrlBlocklist blocklist = new UrlBlocklist();
assertTrue(blocklist.isUrlBlocked(new EdgeUrl("https://memex.marginalia.nu/ghc/ghc/blob/1b1067d14b656bbbfa7c47f156ec2700c9751549/compiler/main/UpdateCafInfos.hs")));
assertTrue(blocklist.isUrlBlocked(new EdgeUrl("https://memex.marginalia.nu//gn/+/d62642c920e6a0d1756316d225a90fd6faa9e21e")));
assertTrue(blocklist.isUrlBlocked(new EdgeUrl("http://www.marginalia.nu/wp-content/uploads/test.jpg")));
assertTrue(blocklist.isUrlBlocked(new EdgeUrl("http://yelenasimone.com/pdf/download-a-course-in-algebra.html")));
assertFalse(blocklist.isUrlBlocked(new EdgeUrl("http://yelenasimone.com/nope/x-a-course-in-algebra.html")));
assertTrue(blocklist.isUrlBlocked(new EdgeUrl("http://yelenasimone.com/_module/slide/pqPan/library/american-sour-beer-innovative-techniques-for-mixed-fermentations/")));