mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
Remove poorly guarded regex in UrlBlocklist
This commit is contained in:
parent
28214ad770
commit
1b53a5389d
@ -24,7 +24,7 @@ public class UrlBlocklist {
|
|||||||
patterns.add(s -> s.contains("-download-free"));
|
patterns.add(s -> s.contains("-download-free"));
|
||||||
|
|
||||||
// long base64-strings in URLs are typically git hashes or the like, rarely worth crawling
|
// long base64-strings in URLs are typically git hashes or the like, rarely worth crawling
|
||||||
patterns.add(GuardedRegexFactory.minLength(48, ".*/[^/]*[a-f0-9]{32,}(/|$)"));
|
patterns.add(this::hashTest);
|
||||||
|
|
||||||
// link farms &c
|
// link farms &c
|
||||||
patterns.add(GuardedRegexFactory.contains("/download", "/download(-([A-Za-z]+|[0-9]+)){4,}\\.(htm|html|php)$"));
|
patterns.add(GuardedRegexFactory.contains("/download", "/download(-([A-Za-z]+|[0-9]+)){4,}\\.(htm|html|php)$"));
|
||||||
@ -38,6 +38,33 @@ public class UrlBlocklist {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public boolean hashTest(String path) {
|
||||||
|
// look for strings might be a git hash (i.e. long hexadecimal strings)
|
||||||
|
// there is no good guard for a regular expression for this so hand-rolling this
|
||||||
|
// is necessary
|
||||||
|
|
||||||
|
int runLength = 0;
|
||||||
|
int minLength = 32;
|
||||||
|
|
||||||
|
if (path.length() <= minLength + 2)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
for (int i = 0; i < path.length(); i++) {
|
||||||
|
int c = path.charAt(i);
|
||||||
|
|
||||||
|
if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f')) {
|
||||||
|
runLength++;
|
||||||
|
}
|
||||||
|
else if (runLength >= minLength) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
runLength = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return runLength >= minLength;
|
||||||
|
}
|
||||||
|
|
||||||
public boolean isUrlBlocked(EdgeUrl url) {
|
public boolean isUrlBlocked(EdgeUrl url) {
|
||||||
try {
|
try {
|
||||||
if (badDomains.contains(url.domain.domain)) {
|
if (badDomains.contains(url.domain.domain)) {
|
||||||
|
Loading…
Reference in New Issue
Block a user