WIP: Selective URL param strings

This commit is contained in:
vlofgren 2022-07-04 14:47:16 +02:00
parent ee07c4d94a
commit 853108028e
12 changed files with 93 additions and 39 deletions

View File

@ -144,7 +144,8 @@ public class LinkKeywordExtractorMain {
try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) { try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) {
AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(crawledDomains::contains, AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(crawledDomains::contains,
url -> crawledUrls.contains(url.toString().hashCode()), url -> url.params != null,
//url -> crawledUrls.contains(url.toString().hashCode()),
output::write); output::write);
logger.info("Reading files"); logger.info("Reading files");

View File

@ -74,9 +74,6 @@ public class AnchorTextExtractor {
if (!isInterestingAnchorText(text)) { if (!isInterestingAnchorText(text)) {
return; return;
} }
if (href.contains("?")) {
return;
}
var optLinkUrl = linkParser.parseLink(documentUrl, href); var optLinkUrl = linkParser.parseLink(documentUrl, href);
if (optLinkUrl.isEmpty()) return; if (optLinkUrl.isEmpty()) return;
@ -92,16 +89,19 @@ public class AnchorTextExtractor {
continue; continue;
word = word.toLowerCase(); word = word.toLowerCase();
if (!WordPatterns.filter(word)) if (!WordPatterns.filter(word)) {
continue; continue;
}
if (linkUrl.domain.equals(documentUrl.domain)) {
continue;
}
if (!linkUrl.domain.equals(documentUrl.domain)) {
if (isNewKeywordForLink(word, linkUrl.toString())) { if (isNewKeywordForLink(word, linkUrl.toString())) {
linkKeywordConsumer.accept(linkUrl, word); linkKeywordConsumer.accept(linkUrl, word);
} }
} }
} }
}
// This pattern doesn't need to perfectly capture all anchor texts that are URLs, if it gets 95% that's fine // This pattern doesn't need to perfectly capture all anchor texts that are URLs, if it gets 95% that's fine
private final Predicate<String> looksLikeAnURL = Pattern.compile("(\\p{Alpha}+://)?[\\p{Alnum}.]+(/[^/]+)+").asMatchPredicate(); private final Predicate<String> looksLikeAnURL = Pattern.compile("(\\p{Alpha}+://)?[\\p{Alnum}.]+(/[^/]+)+").asMatchPredicate();

View File

@ -30,6 +30,7 @@ public class SqlLoadUrls {
IN DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci, IN DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,
IN PORT INT, IN PORT INT,
IN PATH VARCHAR(255), IN PATH VARCHAR(255),
IN PARAM VARCHAR(255),
IN PATH_HASH BIGINT IN PATH_HASH BIGINT
) )
BEGIN BEGIN
@ -45,8 +46,8 @@ public class SqlLoadUrls {
public void load(LoaderData data, EdgeUrl[] urls) { public void load(LoaderData data, EdgeUrl[] urls) {
try (var conn = dataSource.getConnection(); try (var conn = dataSource.getConnection();
var insertCall = conn.prepareCall("CALL INSERT_URL(?,?,?,?, ?)"); var insertCall = conn.prepareCall("CALL INSERT_URL(?,?,?,?,?,?)");
var queryCall = conn.prepareStatement("SELECT ID, PROTO, PATH FROM EC_URL WHERE DOMAIN_ID=?") var queryCall = conn.prepareStatement("SELECT ID, PROTO, PATH, PARAM FROM EC_URL WHERE DOMAIN_ID=?")
) )
{ {
conn.setAutoCommit(false); conn.setAutoCommit(false);
@ -61,7 +62,8 @@ public class SqlLoadUrls {
insertCall.setNull(3, Types.INTEGER); insertCall.setNull(3, Types.INTEGER);
} }
insertCall.setString(4, url.path); insertCall.setString(4, url.path);
insertCall.setLong(5, hashPath(url.path)); insertCall.setString(5, url.params);
insertCall.setLong(6, hashPath(url.path));
insertCall.addBatch(); insertCall.addBatch();
} }
var ret = insertCall.executeBatch(); var ret = insertCall.executeBatch();
@ -84,8 +86,9 @@ public class SqlLoadUrls {
int urlId = rsp.getInt(1); int urlId = rsp.getInt(1);
String proto = rsp.getString(2); String proto = rsp.getString(2);
String path = rsp.getString(3); String path = rsp.getString(3);
String param = rsp.getString(4);
data.addUrl(new EdgeUrl(proto, targetDomain, null, path), urlId); data.addUrl(new EdgeUrl(proto, targetDomain, null, path, param), urlId);
} }
} }

View File

@ -13,9 +13,12 @@ import org.slf4j.LoggerFactory;
import java.net.URI; import java.net.URI;
import java.net.URISyntaxException; import java.net.URISyntaxException;
import java.util.Arrays;
import java.util.List; import java.util.List;
import java.util.Optional; import java.util.Optional;
import java.util.function.Predicate;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import java.util.stream.Collectors;
public class LinkParser { public class LinkParser {
private final Logger logger = LoggerFactory.getLogger(getClass()); private final Logger logger = LoggerFactory.getLogger(getClass());
@ -107,21 +110,30 @@ public class LinkParser {
@SneakyThrows @SneakyThrows
private String resolveUrl(EdgeUrl baseUrl, String s) { private String resolveUrl(EdgeUrl baseUrl, String s) {
s = paramRegex.matcher(s).replaceAll("");
// url looks like http://www.marginalia.nu/ // url looks like http://www.marginalia.nu/
if (isAbsoluteDomain(s)) { if (isAbsoluteDomain(s)) {
return s; return s;
} }
// url looks like /my-page String[] parts = s.split("\\?", 2);
if (s.startsWith("/")) { String path = parts[0];
return baseUrl.withPath(s).toString(); String param;
if (parts.length > 1) {
param = queryParamsSanitizer(parts[1]);
}
else {
param = null;
} }
final String partFromNewLink = spaceRegex.matcher(s).replaceAll("%20"); // url looks like /my-page
if (path.startsWith("/")) {
return baseUrl.withPathAndParam(path, param).toString();
}
return baseUrl.withPath(relativeNavigation(baseUrl) + partFromNewLink).toString(); final String partFromNewLink = spaceRegex.matcher(path).replaceAll("%20");
return baseUrl.withPathAndParam(relativeNavigation(baseUrl) + partFromNewLink, param).toString();
} }
// for a relative url that looks like /foo or /foo/bar; return / or /foo // for a relative url that looks like /foo or /foo/bar; return / or /foo
@ -183,4 +195,21 @@ public class LinkParser {
return documentUrl; return documentUrl;
} }
// Query strings are split into individual "key=value" parameters on '&'
private static final Pattern paramSplitterPattern = Pattern.compile("&");

// Whitelist of parameters that are kept; a parameter must match in its entirety.
//  - one of a fixed set of keys with a purely alphanumeric value, or
//  - view=<path>, where the path is one or more '/'-prefixed segments of
//    letters, digits and dashes (each segment may be more than one character long)
private static final Predicate<String> paramPatternPredicate = Pattern.compile("((id|i|p|t|v|m|name|view|post)=[a-zA-Z\\d]+)|(view=(/[a-zA-Z\\d\\-]+)+)").asMatchPredicate();

/**
 * Reduces a raw query string to the whitelisted parameters only.
 *
 * <p>The input is split on {@code &}; parameters that do not fully match the whitelist
 * pattern are dropped, and the remaining ones are sorted in natural string order and
 * re-joined with {@code &}, so that the same set of parameters always yields the same
 * string regardless of the order they appeared in.
 *
 * @param queryParams the query string without the leading {@code ?}, may be {@code null}
 * @return the sanitized query string, or {@code null} if the input was {@code null}
 *         or no parameter survived the filtering
 */
public static String queryParamsSanitizer(String queryParams) {
    if (queryParams == null) {
        return null;
    }

    var ret = Arrays.stream(paramSplitterPattern.split(queryParams))
            .filter(paramPatternPredicate)
            .sorted()
            .collect(Collectors.joining("&"));

    // Nothing survived the whitelist: report "no parameters" as null rather than ""
    if (ret.isBlank()) {
        return null;
    }

    return ret;
}
} }

View File

@ -63,7 +63,7 @@ public class CrawlerRetreiver {
if (queue.peek() != null) { if (queue.peek() != null) {
var fst = queue.peek(); var fst = queue.peek();
var root = new EdgeUrl(fst.proto, fst.domain, fst.port, "/"); var root = new EdgeUrl(fst.proto, fst.domain, fst.port, "/", null);
if (known.add(root)) if (known.add(root))
queue.addFirst(root); queue.addFirst(root);
} }
@ -110,7 +110,7 @@ public class CrawlerRetreiver {
.build()); .build());
} }
var fetchResult = fetcher.probeDomain(new EdgeUrl(fst.proto, fst.domain, fst.port, "/")); var fetchResult = fetcher.probeDomain(fst.domain.toRootUrl());
if (!fetchResult.ok()) { if (!fetchResult.ok()) {
logger.debug("Bad status on {}", domain); logger.debug("Bad status on {}", domain);
return Optional.of(createErrorPostFromStatus(fetchResult)); return Optional.of(createErrorPostFromStatus(fetchResult));
@ -232,7 +232,7 @@ public class CrawlerRetreiver {
} }
private Optional<EdgeUrl> findCanonicalUrl(EdgeUrl baseUrl, Document parsed) { private Optional<EdgeUrl> findCanonicalUrl(EdgeUrl baseUrl, Document parsed) {
baseUrl = baseUrl.withPath("/"); baseUrl = baseUrl.domain.toRootUrl();
for (var link : parsed.select("link[rel=canonical]")) { for (var link : parsed.select("link[rel=canonical]")) {
return linkParser.parseLink(baseUrl, link); return linkParser.parseLink(baseUrl, link);

View File

@ -109,7 +109,7 @@ public class HttpFetcher {
@SneakyThrows @SneakyThrows
public FetchResult probeDomain(EdgeUrl url) { public FetchResult probeDomain(EdgeUrl url) {
var head = new Request.Builder().head().addHeader("User-agent", userAgent) var head = new Request.Builder().head().addHeader("User-agent", userAgent)
.url(new EdgeUrl(url.proto, url.domain, url.port, "/").toString()) .url(url.domain.toRootUrl().toString())
.build(); .build();
var call = client.newCall(head); var call = client.newCall(head);
@ -293,7 +293,7 @@ public class HttpFetcher {
private Optional<SimpleRobotRules> fetchRobotsForProto(String proto, EdgeDomain domain) { private Optional<SimpleRobotRules> fetchRobotsForProto(String proto, EdgeDomain domain) {
try { try {
var url = new EdgeUrl(proto, domain, null, "/robots.txt"); var url = new EdgeUrl(proto, domain, null, "/robots.txt", null);
return Optional.of(parseRobotsTxt(fetchContent(url))); return Optional.of(parseRobotsTxt(fetchContent(url)));
} }
catch (Exception ex) { catch (Exception ex) {

View File

@ -64,7 +64,7 @@ public class StackOverflowPostsReader extends DefaultHandler {
} }
private StackOverflowPost createPost(StackOverflowQuestionData data) { private StackOverflowPost createPost(StackOverflowQuestionData data) {
EdgeUrl url = new EdgeUrl("https", domain, null, "/questions/"+data.getId()); EdgeUrl url = new EdgeUrl("https", domain, null, "/questions/"+data.getId(), null);
StringBuilder body = new StringBuilder(); StringBuilder body = new StringBuilder();
body.append(data.getQuestion()); body.append(data.getQuestion());

View File

@ -37,7 +37,7 @@ public class WikipediaReader {
} }
private EdgeUrl synthesizeUrl(String originalUrl) { private EdgeUrl synthesizeUrl(String originalUrl) {
return new EdgeUrl("https", domain, null, "/wiki/"+originalUrl); return new EdgeUrl("https", domain, null, "/wiki/"+originalUrl, null);
} }
public void join() throws InterruptedException { public void join() throws InterruptedException {

View File

@ -59,7 +59,7 @@ public class EdgeDomain implements WideHashable {
public EdgeUrl toRootUrl() { public EdgeUrl toRootUrl() {
// Set default protocol to http, as most https websites redirect http->https, but few http websites redirect https->http // Set default protocol to http, as most https websites redirect http->https, but few http websites redirect https->http
return new EdgeUrl("http", this, null, "/"); return new EdgeUrl("http", this, null, "/", null);
} }
public String toString() { public String toString() {

View File

@ -4,6 +4,7 @@ import lombok.Builder;
import lombok.EqualsAndHashCode; import lombok.EqualsAndHashCode;
import lombok.Getter; import lombok.Getter;
import lombok.Setter; import lombok.Setter;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import java.net.URI; import java.net.URI;
import java.net.URISyntaxException; import java.net.URISyntaxException;
@ -15,12 +16,14 @@ public class EdgeUrl implements WideHashable {
public final EdgeDomain domain; public final EdgeDomain domain;
public final Integer port; public final Integer port;
public final String path; public final String path;
public final String params;
public EdgeUrl(String proto, EdgeDomain domain, Integer port, String path) { public EdgeUrl(String proto, EdgeDomain domain, Integer port, String path, String params) {
this.proto = proto; this.proto = proto;
this.domain = domain; this.domain = domain;
this.port = port(port, proto); this.port = port(port, proto);
this.path = path; this.path = path;
this.params = params;
} }
public EdgeUrl(String url) throws URISyntaxException { public EdgeUrl(String url) throws URISyntaxException {
@ -77,8 +80,10 @@ public class EdgeUrl implements WideHashable {
this.path = URI.getPath().isEmpty() ? "/" : URI.getPath(); this.path = URI.getPath().isEmpty() ? "/" : URI.getPath();
this.proto = URI.getScheme().toLowerCase(); this.proto = URI.getScheme().toLowerCase();
this.port = port(URI.getPort(), proto); this.port = port(URI.getPort(), proto);
this.params = LinkParser.queryParamsSanitizer(URI.getQuery());
} }
private static Integer port(Integer port, String protocol) { private static Integer port(Integer port, String protocol) {
if (null == port || port < 1) { if (null == port || port < 1) {
return null; return null;
@ -94,8 +99,9 @@ public class EdgeUrl implements WideHashable {
public String toString() { public String toString() {
String portPart = port == null ? "" : (":" + port); String portPart = port == null ? "" : (":" + port);
String queryPart = params == null ? "" : ("?" + params);
return proto + "://" + domain + portPart + "" + path; return proto + "://" + domain + portPart + path + queryPart;
} }
public String dir() { public String dir() {
@ -115,7 +121,7 @@ public class EdgeUrl implements WideHashable {
return (int) path.chars().filter(c -> c=='/').count(); return (int) path.chars().filter(c -> c=='/').count();
} }
public EdgeUrl withPath(String s) { public EdgeUrl withPathAndParam(String path, String param) {
return new EdgeUrl(proto, domain, port, s); return new EdgeUrl(proto, domain, port, path, param);
} }
} }

View File

@ -46,20 +46,23 @@ COLLATE utf8mb4_unicode_ci;
CREATE TABLE IF NOT EXISTS EC_URL ( CREATE TABLE IF NOT EXISTS EC_URL (
ID INT PRIMARY KEY AUTO_INCREMENT, ID INT PRIMARY KEY AUTO_INCREMENT,
DOMAIN_ID INT NOT NULL, DOMAIN_ID INT NOT NULL,
PROTO ENUM('http','https','gemini') NOT NULL,
PATH VARCHAR(255) NOT NULL COLLATE utf8mb4_bin, PROTO ENUM('http','https','gemini') NOT NULL COLLATE utf8mb4_unicode_ci,
PATH VARCHAR(255) NOT NULL,
PORT INT, PORT INT,
PARAM VARCHAR(255),
PATH_HASH BIGINT NOT NULL COMMENT "Hash of PATH for uniqueness check by domain", PATH_HASH BIGINT NOT NULL COMMENT "Hash of PATH for uniqueness check by domain",
VISITED BOOLEAN NOT NULL DEFAULT FALSE, VISITED BOOLEAN NOT NULL DEFAULT FALSE,
STATE ENUM('ok', 'redirect', 'dead', 'archived', 'disqualified') NOT NULL DEFAULT 'ok', STATE ENUM('ok', 'redirect', 'dead', 'archived', 'disqualified') NOT NULL DEFAULT 'ok' COLLATE utf8mb4_unicode_ci,
CONSTRAINT CONS UNIQUE (DOMAIN_ID, PATH_HASH), CONSTRAINT CONS UNIQUE (DOMAIN_ID, PATH_HASH),
FOREIGN KEY (DOMAIN_ID) REFERENCES EC_DOMAIN(ID) ON DELETE CASCADE FOREIGN KEY (DOMAIN_ID) REFERENCES EC_DOMAIN(ID) ON DELETE CASCADE
) )
CHARACTER SET utf8mb4 CHARACTER SET utf8mb4
COLLATE utf8mb4_unicode_ci; COLLATE utf8mb4_bin;
CREATE TABLE IF NOT EXISTS EC_PAGE_DATA ( CREATE TABLE IF NOT EXISTS EC_PAGE_DATA (
ID INT PRIMARY KEY AUTO_INCREMENT, ID INT PRIMARY KEY AUTO_INCREMENT,
@ -113,10 +116,13 @@ CREATE TABLE IF NOT EXISTS EC_DOMAIN_LINK (
CREATE OR REPLACE VIEW EC_URL_VIEW AS CREATE OR REPLACE VIEW EC_URL_VIEW AS
SELECT SELECT
IF(PORT IS NULL, CONCAT(EC_URL.PROTO,
CONCAT(EC_URL.PROTO, "://", EC_DOMAIN.DOMAIN_NAME, EC_URL.PATH), '://',
CONCAT(EC_URL.PROTO, "://", EC_DOMAIN.DOMAIN_NAME, ":", EC_URL.PORT, EC_URL.PATH)) EC_DOMAIN.DOMAIN_NAME,
AS URL, IF(EC_URL.PORT IS NULL, '', CONCAT(':', EC_URL.PORT)),
EC_URL.PATH,
IF(EC_URL.PARAM IS NULL, '', CONCAT('?', EC_URL.PARAM))
) AS URL,
EC_URL.PATH_HASH AS PATH_HASH, EC_URL.PATH_HASH AS PATH_HASH,
EC_URL.PATH AS PATH, EC_URL.PATH AS PATH,
EC_DOMAIN.DOMAIN_NAME AS DOMAIN_NAME, EC_DOMAIN.DOMAIN_NAME AS DOMAIN_NAME,

View File

@ -17,4 +17,13 @@ class EdgeUrlTest {
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%22-sign")); System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%22-sign"));
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/\n \"huh\"")); System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/\n \"huh\""));
} }
@Test
void testParms() throws URISyntaxException {
    // Parse a handful of URLs that differ only in their query parameter and
    // print the resulting EdgeUrl for manual inspection (no assertions are made).
    String[] urls = {
            "https://search.marginalia.nu/?id=123",
            "https://search.marginalia.nu/?t=123",
            "https://search.marginalia.nu/?v=123",
            "https://search.marginalia.nu/?m=123",
            "https://search.marginalia.nu/?follow=123"
    };

    for (String url : urls) {
        System.out.println(new EdgeUrl(url));
    }
}
} }