mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
Performance optimizations in EdgeDomain's parsing, reduce the number of unguarded regular expressions
This commit is contained in:
parent
4854f40447
commit
618582dc74
@ -11,8 +11,6 @@ import java.util.regex.Pattern;
|
|||||||
@Getter @Setter @Builder
|
@Getter @Setter @Builder
|
||||||
public class EdgeDomain {
|
public class EdgeDomain {
|
||||||
|
|
||||||
private static final Predicate<String> ipPatternTest = Pattern.compile("[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}").asMatchPredicate();
|
|
||||||
private static final Predicate<String> govListTest = Pattern.compile(".*\\.(ac|co|org|gov|edu|com)\\.[a-z]{2}").asMatchPredicate();
|
|
||||||
|
|
||||||
@Nonnull
|
@Nonnull
|
||||||
public final String subDomain;
|
public final String subDomain;
|
||||||
@ -27,7 +25,7 @@ public class EdgeDomain {
|
|||||||
|
|
||||||
var dot = host.lastIndexOf('.');
|
var dot = host.lastIndexOf('.');
|
||||||
|
|
||||||
if (dot < 0 || ipPatternTest.test(host)) { // IPV6 >.>
|
if (dot < 0 || looksLikeAnIp(host)) { // IPV6 >.>
|
||||||
subDomain = "";
|
subDomain = "";
|
||||||
domain = host;
|
domain = host;
|
||||||
}
|
}
|
||||||
@ -38,7 +36,7 @@ public class EdgeDomain {
|
|||||||
domain = host;
|
domain = host;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
if (govListTest.test(host))
|
if (looksLikeGovTld(host))
|
||||||
{ // Capture .ac.jp, .co.uk
|
{ // Capture .ac.jp, .co.uk
|
||||||
int dot3 = host.substring(0, dot2).lastIndexOf('.');
|
int dot3 = host.substring(0, dot2).lastIndexOf('.');
|
||||||
if (dot3 >= 0) {
|
if (dot3 >= 0) {
|
||||||
@ -59,6 +57,35 @@ public class EdgeDomain {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Full-string match for hosts that end in ".(ac|co|org|gov|edu|com)." followed by exactly two lowercase letters a-z, e.g. "example.co.uk".
private static final Predicate<String> govListTest = Pattern.compile(".*\\.(ac|co|org|gov|edu|com)\\.[a-z]{2}").asMatchPredicate();
|
||||||
|
private boolean looksLikeGovTld(String host) {
|
||||||
|
if (host.length() < 8)
|
||||||
|
return false;
|
||||||
|
int cnt = 0;
|
||||||
|
for (int i = host.length() - 7; i < host.length(); i++) {
|
||||||
|
if (host.charAt(i) == '.')
|
||||||
|
cnt++;
|
||||||
|
}
|
||||||
|
return cnt >= 2 && govListTest.test(host);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Full-string match for four dot-separated groups of one to three digits; the groups are not range-checked, so "999.1.1.1" also matches.
private static final Predicate<String> ipPatternTest = Pattern.compile("[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}").asMatchPredicate();
|
||||||
|
|
||||||
|
private boolean looksLikeAnIp(String host) {
|
||||||
|
if (host.length() < 7)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
char firstChar = host.charAt(0);
|
||||||
|
int lastChar = host.charAt(host.length() - 1);
|
||||||
|
|
||||||
|
return Character.isDigit(firstChar)
|
||||||
|
&& Character.isDigit(lastChar)
|
||||||
|
&& ipPatternTest.test(host);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
public EdgeUrl toRootUrl() {
|
public EdgeUrl toRootUrl() {
|
||||||
// Set default protocol to http, as most https websites redirect http->https, but few http websites redirect https->http
|
// Set default protocol to http, as most https websites redirect http->https, but few http websites redirect https->http
|
||||||
return new EdgeUrl("http", this, null, "/", null);
|
return new EdgeUrl("http", this, null, "/", null);
|
||||||
|
Loading…
Reference in New Issue
Block a user