WIP: Selective URL param strings

This commit is contained in:
vlofgren 2022-07-04 14:47:16 +02:00
parent ee07c4d94a
commit 853108028e
12 changed files with 93 additions and 39 deletions

View File

@ -144,7 +144,8 @@ public class LinkKeywordExtractorMain {
try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) {
AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(crawledDomains::contains,
url -> crawledUrls.contains(url.toString().hashCode()),
url -> url.params != null,
//url -> crawledUrls.contains(url.toString().hashCode()),
output::write);
logger.info("Reading files");

View File

@ -74,9 +74,6 @@ public class AnchorTextExtractor {
if (!isInterestingAnchorText(text)) {
return;
}
if (href.contains("?")) {
return;
}
var optLinkUrl = linkParser.parseLink(documentUrl, href);
if (optLinkUrl.isEmpty()) return;
@ -92,13 +89,16 @@ public class AnchorTextExtractor {
continue;
word = word.toLowerCase();
if (!WordPatterns.filter(word))
if (!WordPatterns.filter(word)) {
continue;
}
if (!linkUrl.domain.equals(documentUrl.domain)) {
if (isNewKeywordForLink(word, linkUrl.toString())) {
linkKeywordConsumer.accept(linkUrl, word);
}
if (linkUrl.domain.equals(documentUrl.domain)) {
continue;
}
if (isNewKeywordForLink(word, linkUrl.toString())) {
linkKeywordConsumer.accept(linkUrl, word);
}
}
}

View File

@ -30,6 +30,7 @@ public class SqlLoadUrls {
IN DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,
IN PORT INT,
IN PATH VARCHAR(255),
IN PARAM VARCHAR(255),
IN PATH_HASH BIGINT
)
BEGIN
@ -45,8 +46,8 @@ public class SqlLoadUrls {
public void load(LoaderData data, EdgeUrl[] urls) {
try (var conn = dataSource.getConnection();
var insertCall = conn.prepareCall("CALL INSERT_URL(?,?,?,?, ?)");
var queryCall = conn.prepareStatement("SELECT ID, PROTO, PATH FROM EC_URL WHERE DOMAIN_ID=?")
var insertCall = conn.prepareCall("CALL INSERT_URL(?,?,?,?,?,?)");
var queryCall = conn.prepareStatement("SELECT ID, PROTO, PATH, PARAM FROM EC_URL WHERE DOMAIN_ID=?")
)
{
conn.setAutoCommit(false);
@ -61,7 +62,8 @@ public class SqlLoadUrls {
insertCall.setNull(3, Types.INTEGER);
}
insertCall.setString(4, url.path);
insertCall.setLong(5, hashPath(url.path));
insertCall.setString(5, url.params);
insertCall.setLong(6, hashPath(url.path));
insertCall.addBatch();
}
var ret = insertCall.executeBatch();
@ -84,8 +86,9 @@ public class SqlLoadUrls {
int urlId = rsp.getInt(1);
String proto = rsp.getString(2);
String path = rsp.getString(3);
String param = rsp.getString(4);
data.addUrl(new EdgeUrl(proto, targetDomain, null, path), urlId);
data.addUrl(new EdgeUrl(proto, targetDomain, null, path, param), urlId);
}
}

View File

@ -13,9 +13,12 @@ import org.slf4j.LoggerFactory;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;
import java.util.function.Predicate;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
public class LinkParser {
private final Logger logger = LoggerFactory.getLogger(getClass());
@ -107,21 +110,30 @@ public class LinkParser {
@SneakyThrows
private String resolveUrl(EdgeUrl baseUrl, String s) {
s = paramRegex.matcher(s).replaceAll("");
// url looks like http://www.marginalia.nu/
if (isAbsoluteDomain(s)) {
return s;
}
// url looks like /my-page
if (s.startsWith("/")) {
return baseUrl.withPath(s).toString();
String[] parts = s.split("\\?", 2);
String path = parts[0];
String param;
if (parts.length > 1) {
param = queryParamsSanitizer(parts[1]);
}
else {
param = null;
}
final String partFromNewLink = spaceRegex.matcher(s).replaceAll("%20");
// url looks like /my-page
if (path.startsWith("/")) {
return baseUrl.withPathAndParam(path, param).toString();
}
return baseUrl.withPath(relativeNavigation(baseUrl) + partFromNewLink).toString();
final String partFromNewLink = spaceRegex.matcher(path).replaceAll("%20");
return baseUrl.withPathAndParam(relativeNavigation(baseUrl) + partFromNewLink, param).toString();
}
// for a relative url that looks like /foo or /foo/bar; return / or /foo
@ -183,4 +195,21 @@ public class LinkParser {
return documentUrl;
}
// Separator between the individual key=value pairs of a query string.
private static final Pattern paramSplitterPattern = Pattern.compile("&");

// Whitelist for single key=value pairs (the whole pair must match):
//  - one of the listed keys with a purely alphanumeric value, or
//  - view=<path>, where <path> is one or more "/segment" parts and each
//    segment consists of letters, digits or '-'.
// The segment character class carries its own '+', so multi-character
// segments such as view=/foo/bar are accepted and not just view=/a/b.
private static final Predicate<String> paramPatternPredicate = Pattern.compile(
        "((id|i|p|t|v|m|name|view|post)=[a-zA-Z\\d]+)|(view=(/[a-zA-Z\\d\\-]+)+)"
).asMatchPredicate();

/**
 * Reduces a raw query string to the whitelisted parameters only.
 *
 * <p>The input is split on {@code &}, every pair that does not fully match
 * the whitelist pattern is dropped, and the remaining pairs are sorted and
 * joined with {@code &} again, so the result does not depend on the order
 * of the parameters in the input.
 *
 * @param queryParams the query part of a URL without the leading {@code ?};
 *                    may be {@code null}
 * @return the sanitized query string, or {@code null} if the input was
 *         {@code null} or no parameter passed the whitelist
 */
public static String queryParamsSanitizer(String queryParams) {
    if (queryParams == null) {
        return null;
    }

    String ret = Arrays.stream(paramSplitterPattern.split(queryParams))
            .filter(paramPatternPredicate)
            .sorted()
            .collect(Collectors.joining("&"));

    // Nothing survived the filter: report "no parameters" as null rather than "".
    if (ret.isBlank()) {
        return null;
    }
    return ret;
}
}

View File

@ -63,7 +63,7 @@ public class CrawlerRetreiver {
if (queue.peek() != null) {
var fst = queue.peek();
var root = new EdgeUrl(fst.proto, fst.domain, fst.port, "/");
var root = new EdgeUrl(fst.proto, fst.domain, fst.port, "/", null);
if (known.add(root))
queue.addFirst(root);
}
@ -110,7 +110,7 @@ public class CrawlerRetreiver {
.build());
}
var fetchResult = fetcher.probeDomain(new EdgeUrl(fst.proto, fst.domain, fst.port, "/"));
var fetchResult = fetcher.probeDomain(fst.domain.toRootUrl());
if (!fetchResult.ok()) {
logger.debug("Bad status on {}", domain);
return Optional.of(createErrorPostFromStatus(fetchResult));
@ -232,7 +232,7 @@ public class CrawlerRetreiver {
}
private Optional<EdgeUrl> findCanonicalUrl(EdgeUrl baseUrl, Document parsed) {
baseUrl = baseUrl.withPath("/");
baseUrl = baseUrl.domain.toRootUrl();
for (var link : parsed.select("link[rel=canonical]")) {
return linkParser.parseLink(baseUrl, link);

View File

@ -109,7 +109,7 @@ public class HttpFetcher {
@SneakyThrows
public FetchResult probeDomain(EdgeUrl url) {
var head = new Request.Builder().head().addHeader("User-agent", userAgent)
.url(new EdgeUrl(url.proto, url.domain, url.port, "/").toString())
.url(url.domain.toRootUrl().toString())
.build();
var call = client.newCall(head);
@ -293,7 +293,7 @@ public class HttpFetcher {
private Optional<SimpleRobotRules> fetchRobotsForProto(String proto, EdgeDomain domain) {
try {
var url = new EdgeUrl(proto, domain, null, "/robots.txt");
var url = new EdgeUrl(proto, domain, null, "/robots.txt", null);
return Optional.of(parseRobotsTxt(fetchContent(url)));
}
catch (Exception ex) {

View File

@ -64,7 +64,7 @@ public class StackOverflowPostsReader extends DefaultHandler {
}
private StackOverflowPost createPost(StackOverflowQuestionData data) {
EdgeUrl url = new EdgeUrl("https", domain, null, "/questions/"+data.getId());
EdgeUrl url = new EdgeUrl("https", domain, null, "/questions/"+data.getId(), null);
StringBuilder body = new StringBuilder();
body.append(data.getQuestion());

View File

@ -37,7 +37,7 @@ public class WikipediaReader {
}
private EdgeUrl synthesizeUrl(String originalUrl) {
return new EdgeUrl("https", domain, null, "/wiki/"+originalUrl);
return new EdgeUrl("https", domain, null, "/wiki/"+originalUrl, null);
}
public void join() throws InterruptedException {

View File

@ -59,7 +59,7 @@ public class EdgeDomain implements WideHashable {
public EdgeUrl toRootUrl() {
// Set default protocol to http, as most https websites redirect http->https, but few http websites redirect https->http
return new EdgeUrl("http", this, null, "/");
return new EdgeUrl("http", this, null, "/", null);
}
public String toString() {

View File

@ -4,6 +4,7 @@ import lombok.Builder;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.Setter;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import java.net.URI;
import java.net.URISyntaxException;
@ -15,12 +16,14 @@ public class EdgeUrl implements WideHashable {
public final EdgeDomain domain;
public final Integer port;
public final String path;
public final String params;
public EdgeUrl(String proto, EdgeDomain domain, Integer port, String path) {
public EdgeUrl(String proto, EdgeDomain domain, Integer port, String path, String params) {
this.proto = proto;
this.domain = domain;
this.port = port(port, proto);
this.path = path;
this.params = params;
}
public EdgeUrl(String url) throws URISyntaxException {
@ -77,8 +80,10 @@ public class EdgeUrl implements WideHashable {
this.path = URI.getPath().isEmpty() ? "/" : URI.getPath();
this.proto = URI.getScheme().toLowerCase();
this.port = port(URI.getPort(), proto);
this.params = LinkParser.queryParamsSanitizer(URI.getQuery());
}
private static Integer port(Integer port, String protocol) {
if (null == port || port < 1) {
return null;
@ -94,8 +99,9 @@ public class EdgeUrl implements WideHashable {
public String toString() {
String portPart = port == null ? "" : (":" + port);
String queryPart = params == null ? "" : ("?" + params);
return proto + "://" + domain + portPart + "" + path;
return proto + "://" + domain + portPart + path + queryPart;
}
public String dir() {
@ -115,7 +121,7 @@ public class EdgeUrl implements WideHashable {
return (int) path.chars().filter(c -> c=='/').count();
}
public EdgeUrl withPath(String s) {
return new EdgeUrl(proto, domain, port, s);
public EdgeUrl withPathAndParam(String path, String param) {
return new EdgeUrl(proto, domain, port, path, param);
}
}

View File

@ -46,20 +46,23 @@ COLLATE utf8mb4_unicode_ci;
CREATE TABLE IF NOT EXISTS EC_URL (
ID INT PRIMARY KEY AUTO_INCREMENT,
DOMAIN_ID INT NOT NULL,
PROTO ENUM('http','https','gemini') NOT NULL,
PATH VARCHAR(255) NOT NULL COLLATE utf8mb4_bin,
PROTO ENUM('http','https','gemini') NOT NULL COLLATE utf8mb4_unicode_ci,
PATH VARCHAR(255) NOT NULL,
PORT INT,
PARAM VARCHAR(255),
PATH_HASH BIGINT NOT NULL COMMENT "Hash of PATH for uniqueness check by domain",
VISITED BOOLEAN NOT NULL DEFAULT FALSE,
STATE ENUM('ok', 'redirect', 'dead', 'archived', 'disqualified') NOT NULL DEFAULT 'ok',
STATE ENUM('ok', 'redirect', 'dead', 'archived', 'disqualified') NOT NULL DEFAULT 'ok' COLLATE utf8mb4_unicode_ci,
CONSTRAINT CONS UNIQUE (DOMAIN_ID, PATH_HASH),
FOREIGN KEY (DOMAIN_ID) REFERENCES EC_DOMAIN(ID) ON DELETE CASCADE
)
CHARACTER SET utf8mb4
COLLATE utf8mb4_unicode_ci;
COLLATE utf8mb4_bin;
CREATE TABLE IF NOT EXISTS EC_PAGE_DATA (
ID INT PRIMARY KEY AUTO_INCREMENT,
@ -113,10 +116,13 @@ CREATE TABLE IF NOT EXISTS EC_DOMAIN_LINK (
CREATE OR REPLACE VIEW EC_URL_VIEW AS
SELECT
IF(PORT IS NULL,
CONCAT(EC_URL.PROTO, "://", EC_DOMAIN.DOMAIN_NAME, EC_URL.PATH),
CONCAT(EC_URL.PROTO, "://", EC_DOMAIN.DOMAIN_NAME, ":", EC_URL.PORT, EC_URL.PATH))
AS URL,
CONCAT(EC_URL.PROTO,
'://',
EC_DOMAIN.DOMAIN_NAME,
IF(EC_URL.PORT IS NULL, '', CONCAT(':', EC_URL.PORT)),
EC_URL.PATH,
IF(EC_URL.PARAM IS NULL, '', CONCAT('?', EC_URL.PARAM))
) AS URL,
EC_URL.PATH_HASH AS PATH_HASH,
EC_URL.PATH AS PATH,
EC_DOMAIN.DOMAIN_NAME AS DOMAIN_NAME,

View File

@ -17,4 +17,13 @@ class EdgeUrlTest {
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%22-sign"));
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/\n \"huh\""));
}
// Smoke test for query parameter handling: parses a handful of URLs that
// differ only in their query string and prints the resulting EdgeUrl.
// Nothing is asserted here; the printed output has to be inspected by hand.
// NOTE(review): "follow" is not among the whitelisted parameter names, so the
// last URL is presumably printed without a query part - confirm from the output.
@Test
void testParms() throws URISyntaxException {
System.out.println(new EdgeUrl("https://search.marginalia.nu/?id=123"));
System.out.println(new EdgeUrl("https://search.marginalia.nu/?t=123"));
System.out.println(new EdgeUrl("https://search.marginalia.nu/?v=123"));
System.out.println(new EdgeUrl("https://search.marginalia.nu/?m=123"));
System.out.println(new EdgeUrl("https://search.marginalia.nu/?follow=123"));
}
}