diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java index 792dac6f..156dbdaa 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java @@ -144,7 +144,8 @@ public class LinkKeywordExtractorMain { try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) { AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(crawledDomains::contains, - url -> crawledUrls.contains(url.toString().hashCode()), + url -> url.params != null, /* NOTE(review): replaces the crawled-URL check (left commented out on the following added line) with a has-query-parameters check; looks experimental, confirm before merging */ + //url -> crawledUrls.contains(url.toString().hashCode()), output::write); logger.info("Reading files"); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java index c96fd400..c44e7f18 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java @@ -74,9 +74,6 @@ public class AnchorTextExtractor { if (!isInterestingAnchorText(text)) { return; } - if (href.contains("?")) { - return; - } var optLinkUrl = linkParser.parseLink(documentUrl, href); if (optLinkUrl.isEmpty()) return; @@ -92,13 +89,16 @@ public class AnchorTextExtractor { continue; word = word.toLowerCase(); - if (!WordPatterns.filter(word)) + if (!WordPatterns.filter(word)) { continue; + } - if (!linkUrl.domain.equals(documentUrl.domain)) { - if (isNewKeywordForLink(word, linkUrl.toString())) { - linkKeywordConsumer.accept(linkUrl, word); - } + if (linkUrl.domain.equals(documentUrl.domain)) { /* links pointing at the document's own domain are skipped */ + continue; + } + + if (isNewKeywordForLink(word, linkUrl.toString())) { +
linkKeywordConsumer.accept(linkUrl, word); } } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java index ba9ae43a..04c9735f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java @@ -30,6 +30,7 @@ public class SqlLoadUrls { IN DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci, IN PORT INT, IN PATH VARCHAR(255), + IN PARAM VARCHAR(255), IN PATH_HASH BIGINT ) BEGIN @@ -45,8 +46,8 @@ public class SqlLoadUrls { public void load(LoaderData data, EdgeUrl[] urls) { try (var conn = dataSource.getConnection(); - var insertCall = conn.prepareCall("CALL INSERT_URL(?,?,?,?, ?)"); - var queryCall = conn.prepareStatement("SELECT ID, PROTO, PATH FROM EC_URL WHERE DOMAIN_ID=?") + var insertCall = conn.prepareCall("CALL INSERT_URL(?,?,?,?,?,?)"); + var queryCall = conn.prepareStatement("SELECT ID, PROTO, PATH, PARAM FROM EC_URL WHERE DOMAIN_ID=?") ) { conn.setAutoCommit(false); @@ -61,7 +62,8 @@ public class SqlLoadUrls { insertCall.setNull(3, Types.INTEGER); } insertCall.setString(4, url.path); - insertCall.setLong(5, hashPath(url.path)); + insertCall.setString(5, url.params); + insertCall.setLong(6, hashPath(url.params == null ? url.path : url.path + "?" + url.params)); /* hash path and query together: EC_URL is UNIQUE on (DOMAIN_ID, PATH_HASH), so URLs that differ only by query would otherwise collide; param-less URLs hash exactly as before */ insertCall.addBatch(); } var ret = insertCall.executeBatch(); @@ -84,8 +86,9 @@ public class SqlLoadUrls { int urlId = rsp.getInt(1); String proto = rsp.getString(2); String path = rsp.getString(3); + String param = rsp.getString(4); - data.addUrl(new EdgeUrl(proto, targetDomain, null, path), urlId); + data.addUrl(new EdgeUrl(proto, targetDomain, null, path, param), urlId); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java index 0a2bdf45..c14e31cb 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java @@ -13,9 +13,12 @@ import org.slf4j.LoggerFactory; import java.net.URI; import java.net.URISyntaxException; +import java.util.Arrays; import java.util.List; import java.util.Optional; +import java.util.function.Predicate; import java.util.regex.Pattern; +import java.util.stream.Collectors; public class LinkParser { private final Logger logger = LoggerFactory.getLogger(getClass()); @@ -107,21 +110,30 @@ public class LinkParser { @SneakyThrows private String resolveUrl(EdgeUrl baseUrl, String s) { - s = paramRegex.matcher(s).replaceAll(""); // url looks like http://www.marginalia.nu/ if (isAbsoluteDomain(s)) { return s; } - // url looks like /my-page - if (s.startsWith("/")) { - return baseUrl.withPath(s).toString(); + String[] parts = s.split("\\?", 2); + String path = parts[0]; + String param; + if (parts.length > 1) { + param = queryParamsSanitizer(parts[1]); + } + else { + param = null; } - final String partFromNewLink = spaceRegex.matcher(s).replaceAll("%20"); + // url looks like /my-page + if (path.startsWith("/")) { + return baseUrl.withPathAndParam(path, param).toString(); + } - return baseUrl.withPath(relativeNavigation(baseUrl) + partFromNewLink).toString(); + final String partFromNewLink = spaceRegex.matcher(path).replaceAll("%20"); + + return baseUrl.withPathAndParam(relativeNavigation(baseUrl) + partFromNewLink, param).toString(); } // for a relative url that looks like /foo or /foo/bar; return / or /foo @@ -183,4 +195,28 @@ public class LinkParser { return documentUrl; } + + private static final Pattern paramSplitterPattern = Pattern.compile("&"); + private static final Predicate<String> paramPatternPredicate =
Pattern.compile("((id|i|p|t|v|m|name|view|post)=[a-zA-Z\\d]+)|(view=(/[a-zA-Z\\d\\-]+)+)").asMatchPredicate(); /* segment quantifier moved inside the group so that multi-character path segments such as view=/foo/bar are accepted */ + + /** + * Reduces a raw query string to the key=value pairs accepted by paramPatternPredicate. + * The surviving pairs are sorted and re-joined with an ampersand. + * + * @param queryParams raw query string without the leading question mark, may be null + * @return the sanitized query string, or null if the input is null or no pair survives + */ + public static String queryParamsSanitizer(String queryParams) { + if (queryParams == null) { + return null; + } + + var ret = Arrays.stream(paramSplitterPattern.split(queryParams)) + .filter(paramPatternPredicate) + .sorted() + .collect(Collectors.joining("&")); + if (ret.isBlank()) + return null; + return ret; + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java index 2b27ed4d..c275ad6f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java @@ -63,7 +63,7 @@ public class CrawlerRetreiver { if (queue.peek() != null) { var fst = queue.peek(); - var root = new EdgeUrl(fst.proto, fst.domain, fst.port, "/"); + var root = new EdgeUrl(fst.proto, fst.domain, fst.port, "/", null); if (known.add(root)) queue.addFirst(root); } @@ -110,7 +110,7 @@ public class CrawlerRetreiver { .build()); } - var fetchResult = fetcher.probeDomain(new EdgeUrl(fst.proto, fst.domain, fst.port, "/")); + var fetchResult = fetcher.probeDomain(fst.withPathAndParam("/", null)); /* keep the queued URL's proto and port, as the replaced line did; toRootUrl() would force http and drop the port */ if (!fetchResult.ok()) { logger.debug("Bad status on {}", domain); return Optional.of(createErrorPostFromStatus(fetchResult)); @@ -232,7 +232,7 @@ public class CrawlerRetreiver { } private Optional findCanonicalUrl(EdgeUrl baseUrl, Document parsed) { - baseUrl = baseUrl.withPath("/"); + baseUrl = baseUrl.withPathAndParam("/", null); /* same root as the replaced withPath("/"): proto and port are preserved */ for (var link : parsed.select("link[rel=canonical]")) { return linkParser.parseLink(baseUrl, link); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java index
40728294..53180137 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java @@ -109,7 +109,7 @@ public class HttpFetcher { @SneakyThrows public FetchResult probeDomain(EdgeUrl url) { var head = new Request.Builder().head().addHeader("User-agent", userAgent) - .url(new EdgeUrl(url.proto, url.domain, url.port, "/").toString()) + .url(url.withPathAndParam("/", null).toString()) /* probe the root on the caller's proto and port, as the replaced line did; toRootUrl() would force http and drop the port */ .build(); var call = client.newCall(head); @@ -293,7 +293,7 @@ public class HttpFetcher { private Optional fetchRobotsForProto(String proto, EdgeDomain domain) { try { - var url = new EdgeUrl(proto, domain, null, "/robots.txt"); + var url = new EdgeUrl(proto, domain, null, "/robots.txt", null); return Optional.of(parseRobotsTxt(fetchContent(url))); } catch (Exception ex) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostsReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostsReader.java index 0fecf63a..88921be1 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostsReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostsReader.java @@ -64,7 +64,7 @@ public class StackOverflowPostsReader extends DefaultHandler { } private StackOverflowPost createPost(StackOverflowQuestionData data) { - EdgeUrl url = new EdgeUrl("https", domain, null, "/questions/"+data.getId()); + EdgeUrl url = new EdgeUrl("https", domain, null, "/questions/"+data.getId(), null); StringBuilder body = new StringBuilder(); body.append(data.getQuestion()); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaReader.java index 12bfec3f..fa5904c9
100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaReader.java @@ -37,7 +37,7 @@ public class WikipediaReader { } private EdgeUrl synthesizeUrl(String originalUrl) { - return new EdgeUrl("https", domain, null, "/wiki/"+originalUrl); + return new EdgeUrl("https", domain, null, "/wiki/"+originalUrl, null); } public void join() throws InterruptedException { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java index d1945c9e..658184c0 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java @@ -59,7 +59,7 @@ public class EdgeDomain implements WideHashable { public EdgeUrl toRootUrl() { // Set default protocol to http, as most https websites redirect http->https, but few http websites redirect https->http - return new EdgeUrl("http", this, null, "/"); + return new EdgeUrl("http", this, null, "/", null); } public String toString() { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java index e82d4b7c..b7681951 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java @@ -4,6 +4,7 @@ import lombok.Builder; import lombok.EqualsAndHashCode; import lombok.Getter; import lombok.Setter; +import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser; import java.net.URI; import java.net.URISyntaxException; @@ -15,12 +16,14 @@ public class EdgeUrl implements WideHashable { public final EdgeDomain domain; public final Integer port; public final String path; + public final String params; /* query string without the leading question mark; null when the URL carries none */ - public EdgeUrl(String
proto, EdgeDomain domain, Integer port, String path) { + public EdgeUrl(String proto, EdgeDomain domain, Integer port, String path, String params) { this.proto = proto; this.domain = domain; this.port = port(port, proto); this.path = path; + this.params = params; /* stored as given; this constructor does not run the query sanitizer */ } public EdgeUrl(String url) throws URISyntaxException { @@ -77,8 +80,10 @@ public class EdgeUrl implements WideHashable { this.path = URI.getPath().isEmpty() ? "/" : URI.getPath(); this.proto = URI.getScheme().toLowerCase(); this.port = port(URI.getPort(), proto); + this.params = LinkParser.queryParamsSanitizer(URI.getQuery()); /* the parsed query is filtered through LinkParser.queryParamsSanitizer; the result may be null */ } + private static Integer port(Integer port, String protocol) { if (null == port || port < 1) { return null; @@ -94,8 +99,9 @@ public class EdgeUrl implements WideHashable { public String toString() { String portPart = port == null ? "" : (":" + port); + String queryPart = params == null ? "" : ("?" + params); /* the query is appended only when present */ - return proto + "://" + domain + portPart + "" + path; + return proto + "://" + domain + portPart + path + queryPart; } public String dir() { @@ -115,7 +121,7 @@ public class EdgeUrl implements WideHashable { return (int) path.chars().filter(c -> c=='/').count(); } - public EdgeUrl withPath(String s) { - return new EdgeUrl(proto, domain, port, s); + public EdgeUrl withPathAndParam(String path, String param) { /* returns a copy with path and query replaced; proto, domain and port are carried over */ + return new EdgeUrl(proto, domain, port, path, param); } } diff --git a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql index 36ab040a..120a1ce2 100644 --- a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql +++ b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql @@ -46,20 +46,23 @@ COLLATE utf8mb4_unicode_ci; CREATE TABLE IF NOT EXISTS EC_URL ( ID INT PRIMARY KEY AUTO_INCREMENT, DOMAIN_ID INT NOT NULL, - PROTO ENUM('http','https','gemini') NOT NULL, - PATH VARCHAR(255) NOT NULL COLLATE utf8mb4_bin, + + PROTO ENUM('http','https','gemini') NOT NULL COLLATE utf8mb4_unicode_ci,
+ PATH VARCHAR(255) NOT NULL, PORT INT, + PARAM VARCHAR(255), PATH_HASH BIGINT NOT NULL COMMENT "Hash of PATH for uniqueness check by domain", + VISITED BOOLEAN NOT NULL DEFAULT FALSE, - STATE ENUM('ok', 'redirect', 'dead', 'archived', 'disqualified') NOT NULL DEFAULT 'ok', + STATE ENUM('ok', 'redirect', 'dead', 'archived', 'disqualified') NOT NULL DEFAULT 'ok' COLLATE utf8mb4_unicode_ci, CONSTRAINT CONS UNIQUE (DOMAIN_ID, PATH_HASH), FOREIGN KEY (DOMAIN_ID) REFERENCES EC_DOMAIN(ID) ON DELETE CASCADE ) CHARACTER SET utf8mb4 -COLLATE utf8mb4_unicode_ci; +COLLATE utf8mb4_bin; CREATE TABLE IF NOT EXISTS EC_PAGE_DATA ( ID INT PRIMARY KEY AUTO_INCREMENT, @@ -113,10 +116,13 @@ CREATE TABLE IF NOT EXISTS EC_DOMAIN_LINK ( CREATE OR REPLACE VIEW EC_URL_VIEW AS SELECT - IF(PORT IS NULL, - CONCAT(EC_URL.PROTO, "://", EC_DOMAIN.DOMAIN_NAME, EC_URL.PATH), - CONCAT(EC_URL.PROTO, "://", EC_DOMAIN.DOMAIN_NAME, ":", EC_URL.PORT, EC_URL.PATH)) - AS URL, + CONCAT(EC_URL.PROTO, + '://', + EC_DOMAIN.DOMAIN_NAME, + IF(EC_URL.PORT IS NULL, '', CONCAT(':', EC_URL.PORT)), + EC_URL.PATH, + IF(EC_URL.PARAM IS NULL, '', CONCAT('?', EC_URL.PARAM)) + ) AS URL, EC_URL.PATH_HASH AS PATH_HASH, EC_URL.PATH AS PATH, EC_DOMAIN.DOMAIN_NAME AS DOMAIN_NAME, diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/EdgeUrlTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/EdgeUrlTest.java index dac8dd97..c16f1f08 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/EdgeUrlTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/EdgeUrlTest.java @@ -17,4 +17,13 @@ class EdgeUrlTest { System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%22-sign")); System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/\n \"huh\"")); } + + @Test + void testParms() throws URISyntaxException { /* whitelisted query keys must survive parsing, any other key must be dropped */ + org.junit.jupiter.api.Assertions.assertEquals("id=123", new EdgeUrl("https://search.marginalia.nu/?id=123").params); +
org.junit.jupiter.api.Assertions.assertEquals("t=123", new EdgeUrl("https://search.marginalia.nu/?t=123").params); + org.junit.jupiter.api.Assertions.assertEquals("v=123", new EdgeUrl("https://search.marginalia.nu/?v=123").params); + org.junit.jupiter.api.Assertions.assertEquals("m=123", new EdgeUrl("https://search.marginalia.nu/?m=123").params); + org.junit.jupiter.api.Assertions.assertNull(new EdgeUrl("https://search.marginalia.nu/?follow=123").params); /* "follow" is not on the whitelist */ + } } \ No newline at end of file