diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java index b70e4ab0..a7dce9ed 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java @@ -11,8 +11,8 @@ public class UrlBlocklist { private final List> patterns = new ArrayList<>(); public UrlBlocklist() { - patterns.add(Pattern.compile(".*/[a-f0-9]{40}(/|$)").asPredicate()); - patterns.add(Pattern.compile("/download(-([A-Za-z]+|[0-9]+)){4,}\\.(htm|html|php)$").asPredicate()); + patterns.add(Pattern.compile(".*/[a-f0-9]{40}(/|$)").asPredicate()); // git + patterns.add(Pattern.compile(".*/[a-f0-9]{64}(/|$)").asPredicate()); // fossil SCM patterns.add(Pattern.compile("/download(-([A-Za-z]+|[0-9]+)){4,}\\.(htm|html|php)$").asPredicate()); patterns.add(Pattern.compile("/permalink/[a-z]+(-([A-Za-z]+|[0-9]+)){3,}\\.(htm|html|php)$").asPredicate()); patterns.add(Pattern.compile("(webrx3|lib|pdf|book|720p).*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$").asPredicate()); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/UrlBlocklistTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/UrlBlocklistTest.java index c357f83c..c93e1ffb 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/UrlBlocklistTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/UrlBlocklistTest.java @@ -6,7 +6,8 @@ import org.junit.jupiter.api.Test; import java.net.URISyntaxException; -import static org.junit.jupiter.api.Assertions.*; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; class UrlBlocklistTest { @@ -19,5 +20,6 @@ class UrlBlocklistTest { assertFalse(blocklist.isUrlBlocked(new EdgeUrl("http://yelenasimone.com/nope/x-a-course-in-algebra.html"))); assertTrue(blocklist.isUrlBlocked(new EdgeUrl("http://yelenasimone.com/_module/slide/pqPan/library/american-sour-beer-innovative-techniques-for-mixed-fermentations/"))); assertTrue(blocklist.isUrlBlocked(new EdgeUrl("http://w-m-p.de/images/book/download-firstborn-starcraft-dark-templar-book-1.php"))); + assertTrue(blocklist.isUrlBlocked(new EdgeUrl("https://sqlite.org/src/info/6376abec766e9a0785178b1823b5a587e9f1ccbc"))); } } \ No newline at end of file