From 618582dc7458dab6d90d713b62d192e278dc11e7 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 30 Jan 2023 09:23:11 +0100 Subject: [PATCH] Performance optimizations in EdgeDomain's parsing, reduce the number of unguarded regular expressions --- .../wmsa/edge/model/EdgeDomain.java | 35 ++++++++++++++++--- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java index e5b0526c..79e65476 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java @@ -11,8 +11,6 @@ import java.util.regex.Pattern; @Getter @Setter @Builder public class EdgeDomain { - private static final Predicate ipPatternTest = Pattern.compile("[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}").asMatchPredicate(); - private static final Predicate govListTest = Pattern.compile(".*\\.(ac|co|org|gov|edu|com)\\.[a-z]{2}").asMatchPredicate(); @Nonnull public final String subDomain; @@ -27,7 +25,7 @@ public class EdgeDomain { var dot = host.lastIndexOf('.'); - if (dot < 0 || ipPatternTest.test(host)) { // IPV6 >.> + if (dot < 0 || looksLikeAnIp(host)) { // IPV6 >.> subDomain = ""; domain = host; } @@ -38,7 +36,7 @@ public class EdgeDomain { domain = host; } else { - if (govListTest.test(host)) + if (looksLikeGovTld(host)) { // Capture .ac.jp, .co.uk int dot3 = host.substring(0, dot2).lastIndexOf('.'); if (dot3 >= 0) { @@ -59,6 +57,35 @@ public class EdgeDomain { } } + private static final Predicate govListTest = Pattern.compile(".*\\.(ac|co|org|gov|edu|com)\\.[a-z]{2}").asMatchPredicate(); + /* Cheap pre-filter in front of govListTest: returns true only when host fully matches the two-level suffix pattern (e.g. .co.uk, .ac.jp). The shortest string the pattern accepts has six characters (".ac.jp"), and both dots of the suffix always fall within the last seven characters, so shorter hosts and hosts with fewer than two dots in that window are rejected without running the regex. The result is always identical to govListTest.test(host). */ private boolean looksLikeGovTld(String host) { + if (host.length() < 6) /* below the shortest possible match */ + return false; + int cnt = 0; + for (int i = Math.max(0, host.length() - 7); /* clamp: a six-character host would otherwise start at index -1 */ i < host.length(); i++) { + if (host.charAt(i) == '.') + cnt++; + } + return cnt >= 2 && govListTest.test(host); + } +
+ + private static final Predicate ipPatternTest = Pattern.compile("[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}").asMatchPredicate(); + + /* Cheap pre-filter in front of ipPatternTest: a host can only match the dotted-quad pattern if it has at least seven characters ("d.d.d.d") and both starts and ends with a digit; the regex is evaluated only when those checks pass. */ private boolean looksLikeAnIp(String host) { + final int length = host.length(); + if (length < 7) { + return false; + } + + final boolean digitAtStart = Character.isDigit(host.charAt(0)); + final boolean digitAtEnd = Character.isDigit(host.charAt(length - 1)); + + return digitAtStart && digitAtEnd && ipPatternTest.test(host); + } + + + public EdgeUrl toRootUrl() { // Set default protocol to http, as most https websites redirect http->https, but few http websites redirect https->http return new EdgeUrl("http", this, null, "/", null);