mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
Tweaking the URL block list to exclude git noise better
This commit is contained in:
parent
c71cc3d43a
commit
80b3ac3dd8
@ -4,15 +4,26 @@ import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class UrlBlocklist {
|
||||
private final List<Predicate<String>> patterns = new ArrayList<>();
|
||||
|
||||
// domains that have a lot of links but we know we don't want to crawl
|
||||
private final Set<String> badDomains = Set.of("t.co", "facebook.com",
|
||||
"instagram.com", "youtube.com",
|
||||
"youtu.be", "amzn.to");
|
||||
|
||||
public UrlBlocklist() {
|
||||
patterns.add(Pattern.compile(".*/[a-f0-9]{40}(/|$)").asPredicate()); // git
|
||||
patterns.add(Pattern.compile(".*/[a-f0-9]{64}(/|$)").asPredicate()); // fossil SCM
|
||||
// Don't deep-crawl git repos
|
||||
patterns.add(Pattern.compile("\\.git/.+").asPredicate());
|
||||
|
||||
// long hex strings in URLs are typically git hashes or the like, rarely worth crawling
|
||||
patterns.add(Pattern.compile(".*/[^/]*[a-f0-9]{32,}(/|$)").asPredicate());
|
||||
|
||||
// link farms &c
|
||||
patterns.add(Pattern.compile("/download(-([A-Za-z]+|[0-9]+)){4,}\\.(htm|html|php)$").asPredicate());
|
||||
patterns.add(Pattern.compile("/permalink/[a-z]+(-([A-Za-z]+|[0-9]+)){3,}\\.(htm|html|php)$").asPredicate());
|
||||
patterns.add(Pattern.compile("(webrx3|lib|pdf|book|720p).*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$").asPredicate());
|
||||
@ -22,15 +33,23 @@ public class UrlBlocklist {
|
||||
|
||||
public boolean isUrlBlocked(EdgeUrl url) {
|
||||
try {
|
||||
if (badDomains.contains(url.domain.domain)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if ("github.com".equals(url.domain.domain)) {
|
||||
return url.path.chars().filter(c -> c == '/').count() > 2;
|
||||
}
|
||||
|
||||
return patterns.stream().anyMatch(p -> p.test(url.path));
|
||||
for (var p : patterns) {
|
||||
if (p.test(url.path))
|
||||
return true;
|
||||
}
|
||||
}
|
||||
catch (StackOverflowError ex) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public boolean isMailingListLink(EdgeUrl linkUrl) {
|
||||
@ -38,12 +57,9 @@ public class UrlBlocklist {
|
||||
if (path.startsWith("/lists/")) {
|
||||
return true;
|
||||
}
|
||||
if (path.startsWith("mailinglist")) {
|
||||
if (path.contains("mailinglist")) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user