Tweaking the URL block list to exclude git noise better

This commit is contained in:
vlofgren 2022-07-16 21:19:13 +02:00
parent c71cc3d43a
commit 80b3ac3dd8

View File

@ -4,15 +4,26 @@ import nu.marginalia.wmsa.edge.model.EdgeUrl;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.function.Predicate;
import java.util.regex.Pattern;
public class UrlBlocklist {
// predicates tested against a URL's path; isUrlBlocked treats the URL as blocked if any of them matches
private final List<Predicate<String>> patterns = new ArrayList<>();
// domains that have a lot of links but we know we don't want to crawl
private final Set<String> badDomains = Set.of("t.co", "facebook.com",
"instagram.com", "youtube.com",
"youtu.be", "amzn.to");
public UrlBlocklist() {
patterns.add(Pattern.compile(".*/[a-f0-9]{40}(/|$)").asPredicate()); // git
patterns.add(Pattern.compile(".*/[a-f0-9]{64}(/|$)").asPredicate()); // fossil SCM
// Don't deep-crawl git repos
patterns.add(Pattern.compile("\\.git/.+").asPredicate());
// long hex strings in URLs are typically git hashes or the like, rarely worth crawling
patterns.add(Pattern.compile(".*/[^/]*[a-f0-9]{32,}(/|$)").asPredicate());
// link farms &c
patterns.add(Pattern.compile("/download(-([A-Za-z]+|[0-9]+)){4,}\\.(htm|html|php)$").asPredicate());
patterns.add(Pattern.compile("/permalink/[a-z]+(-([A-Za-z]+|[0-9]+)){3,}\\.(htm|html|php)$").asPredicate());
patterns.add(Pattern.compile("(webrx3|lib|pdf|book|720p).*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$").asPredicate());
@ -22,15 +33,23 @@ public class UrlBlocklist {
/**
 * Decides whether a URL should be kept out of the crawl.
 *
 * <p>A URL is blocked when its domain is one of {@code badDomains}, when it is a
 * github.com URL whose path contains more than two {@code '/'} characters, or when
 * its path matches any of the registered {@code patterns}.
 *
 * @param url the URL to check; its {@code domain.domain} and {@code path} are read
 * @return {@code true} if the URL is blocked, {@code false} otherwise
 */
public boolean isUrlBlocked(EdgeUrl url) {
    try {
        if (badDomains.contains(url.domain.domain)) {
            return true;
        }
        if ("github.com".equals(url.domain.domain)) {
            // On github.com only shallow paths are allowed: anything with more
            // than two slashes in the path is blocked.
            return url.path.chars().filter(c -> c == '/').count() > 2;
        }
        // Plain loop with early exit; the first matching pattern blocks the URL.
        for (var p : patterns) {
            if (p.test(url.path)) {
                return true;
            }
        }
    }
    catch (StackOverflowError ex) {
        // If evaluating the checks overflows the stack (presumably the regex
        // engine recursing on a pathological path), treat the URL as blocked
        // rather than letting the error propagate.
        return true;
    }
    return false;
}
public boolean isMailingListLink(EdgeUrl linkUrl) {
@ -38,12 +57,9 @@ public class UrlBlocklist {
if (path.startsWith("/lists/")) {
return true;
}
if (path.startsWith("mailinglist")) {
if (path.contains("mailinglist")) {
return true;
}
return false;
}
}