diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java index c12f992a..951dc274 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java @@ -3,7 +3,6 @@ package nu.marginalia.wmsa.edge.converting.processor; import com.google.common.base.Strings; import com.google.inject.Inject; import com.google.inject.name.Named; -import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument; import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain; import nu.marginalia.wmsa.edge.converting.processor.logic.CommonKeywordExtractor; import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument; @@ -110,22 +109,6 @@ public class DomainProcessor { } - private double getAverageQuality(List documents) { - int n = 0; - double q = 0.; - for (var doc : documents) { - if (doc.quality().isPresent()) { - n++; - q += doc.quality().getAsDouble(); - } - } - - if (n > 0) { - return q / n; - } - return -5.; - } - private EdgeDomainIndexingState getState(String crawlerStatus) { return switch (CrawlerDomainStatus.valueOf(crawlerStatus)) { case OK -> EdgeDomainIndexingState.ACTIVE; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java index 2ba9234c..957769bf 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java @@ -41,24 +41,35 @@ public class EdgeUrl implements WideHashable { private static Pattern badCharPattern = Pattern.compile("[ \t\n\"<>\\[\\]()',|]"); + /* Java's URI parser is a bit too strict in throwing exceptions when there's an error. 
+ + Here on the Internet, standards are like the picture on the box of the frozen pizza, + and what you get is more like what's on the inside; we try to patch things instead, + just give it a best-effort attempt at cleaning out broken or unnecessary constructions + like bad or missing URLEncoding + */ public static String urlencodeFixer(String url) throws URISyntaxException { var s = new StringBuilder(); String goodChars = "&.?:/-;+$#"; String hexChars = "0123456789abcdefABCDEF"; int pathIdx = findPathIdx(url); - if (pathIdx < 0) { - return url; + if (pathIdx < 0) { // url looks like http://marginalia.nu + return url + "/"; } s.append(url, 0, pathIdx); - for (int i = pathIdx; i < url.length(); i++) { + // We don't want the fragment, and multiple fragments break the Java URIParser for some reason + int end = url.indexOf("#"); + if (end < 0) end = url.length(); + + for (int i = pathIdx; i < end; i++) { int c = url.charAt(i); if (goodChars.indexOf(c) >= 0 || (c >= 'A' && c <='Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) { s.appendCodePoint(c); } - else if (c == '%' && i+2= 0 && hexChars.indexOf(cnn) >= 0) { diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/EdgeUrlTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/EdgeUrlTest.java index 09498160..61444c69 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/EdgeUrlTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/EdgeUrlTest.java @@ -27,6 +27,7 @@ class EdgeUrlTest { } @Test void urlencodeFixer() throws URISyntaxException { + System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/#heredoc")); System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%-sign")); System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%22-sign")); System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/\n \"huh\""));