From f3be865293a792c8cc343e5f32f240b9b7e46733 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Fri, 8 Jul 2022 16:36:09 +0200 Subject: [PATCH] Allow query params for *some* (path, param) combinations, in order to allow crawling of forums. --- .../converting/LinkKeywordExtractorMain.java | 2 +- .../converting/atags/AnchorTextExtractor.java | 4 +- .../edge/converting/loader/SqlLoadUrls.java | 17 +++++-- .../processor/logic/LinkParser.java | 22 +------- .../processor/logic/LinkProcessor.java | 2 +- .../processor/logic/QueryParams.java | 50 +++++++++++++++++++ .../edge/crawling/blocklist/UrlBlocklist.java | 8 +-- .../crawling/retreival/CrawlerRetreiver.java | 10 ++-- .../edge/index/lexicon/KeywordLexicon.java | 6 +-- .../marginalia/wmsa/edge/model/EdgeUrl.java | 12 ++--- 10 files changed, 84 insertions(+), 49 deletions(-) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/QueryParams.java diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java index 156dbdaa..99c93740 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java @@ -144,7 +144,7 @@ public class LinkKeywordExtractorMain { try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) { AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(crawledDomains::contains, - url -> url.params != null, + url -> url.param != null, //url -> crawledUrls.contains(url.toString().hashCode()), output::write); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java index c44e7f18..8c5fc6c1 100644 ---
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java @@ -138,8 +138,8 @@ public class AnchorTextExtractor { private boolean isNewKeywordForLink(String href, String text) { long hash = 0; - hash ^= hashFunction.hashString(href, StandardCharsets.UTF_8).asLong(); - hash ^= hashFunction.hashString(text, StandardCharsets.UTF_8).asLong(); + hash ^= hashFunction.hashString(href, StandardCharsets.UTF_8).padToLong(); + hash ^= hashFunction.hashString(text, StandardCharsets.UTF_8).padToLong(); // Remove sign bit because we don't want a negative index in deduplicateHashBitset hash &= 0x7FFF_FFFF_FFFF_FFFFL; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java index 04c9735f..d09fac4a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java @@ -1,5 +1,6 @@ package nu.marginalia.wmsa.edge.converting.loader; +import com.google.common.hash.HashFunction; import com.google.common.hash.Hashing; import com.google.inject.Inject; import com.zaxxer.hikari.HikariDataSource; @@ -62,8 +63,8 @@ public class SqlLoadUrls { insertCall.setNull(3, Types.INTEGER); } insertCall.setString(4, url.path); - insertCall.setString(5, url.params); - insertCall.setLong(6, hashPath(url.path)); + insertCall.setString(5, url.param); + insertCall.setLong(6, hashPath(url.path, url.param)); insertCall.addBatch(); } var ret = insertCall.executeBatch(); @@ -97,7 +98,15 @@ public class SqlLoadUrls { } } - private long hashPath(String path) { - return Hashing.murmur3_128().hashString(path, StandardCharsets.UTF_8).asLong(); + private static final HashFunction murmur3_128 = Hashing.murmur3_128(); + private long 
hashPath(String path, String queryParam) { + long pathHash = murmur3_128.hashString(path, StandardCharsets.UTF_8).padToLong(); + + if (queryParam == null) { + return pathHash; + } + else { + return pathHash + murmur3_128.hashString(queryParam, StandardCharsets.UTF_8).padToLong(); + } } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java index c14e31cb..d58b15bf 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java @@ -13,12 +13,9 @@ import org.slf4j.LoggerFactory; import java.net.URI; import java.net.URISyntaxException; -import java.util.Arrays; import java.util.List; import java.util.Optional; -import java.util.function.Predicate; import java.util.regex.Pattern; -import java.util.stream.Collectors; public class LinkParser { private final Logger logger = LoggerFactory.getLogger(getClass()); @@ -105,7 +102,6 @@ public class LinkParser { return url; } - private static final Pattern paramRegex = Pattern.compile("\\?.*$"); private static final Pattern spaceRegex = Pattern.compile(" "); @SneakyThrows @@ -120,7 +116,7 @@ public class LinkParser { String path = parts[0]; String param; if (parts.length > 1) { - param = queryParamsSanitizer(parts[1]); + param = QueryParams.queryParamsSanitizer(parts[0], parts[1]); } else { param = null; @@ -196,20 +192,4 @@ public class LinkParser { return documentUrl; } - private static final Pattern paramSplitterPattern = Pattern.compile("&"); - private static final Predicate paramPatternPredicate = Pattern.compile("((id|i|p|t|v|m|name|view|post)=[a-zA-Z\\d]+)|(view=(/[a-zA-Z\\d\\-])+)").asMatchPredicate(); - - public static String queryParamsSanitizer(String queryParams) { - if (queryParams == null) { - return null; - } - - var ret = 
Arrays.stream(paramSplitterPattern.split(queryParams)) - .filter(paramPatternPredicate) - .sorted() - .collect(Collectors.joining("&")); - if (ret.isBlank()) - return null; - return ret; - } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkProcessor.java index 24c9229d..54c47e4c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkProcessor.java @@ -72,7 +72,7 @@ public class LinkProcessor { return false; } - if (urlBlocklist.isForumLink(link)) { + if (urlBlocklist.isMailingListLink(link)) { return false; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/QueryParams.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/QueryParams.java new file mode 100644 index 00000000..ad52e347 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/QueryParams.java @@ -0,0 +1,50 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic; + +import javax.annotation.Nullable; +import java.util.Arrays; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + +public class QueryParams { + + private static final Pattern paramSplitterPattern = Pattern.compile("&"); + + @Nullable + public static String queryParamsSanitizer(String path, @Nullable String queryParams) { + if (queryParams == null) { + return null; + } + + var ret = Arrays.stream(paramSplitterPattern.split(queryParams)) + .filter(param -> QueryParams.isPermittedParam(path, param)) + .sorted() + .collect(Collectors.joining("&")); + + if (ret.isBlank()) + return null; + + return ret; + } + + public static boolean isPermittedParam(String path, String param) { + if (path.endsWith("index.php")) { + if 
(param.startsWith("showtopic")) + return true; + if (param.startsWith("showforum")) + return true; + } + if (path.endsWith("viewtopic.php")) { + return (param.startsWith("t=") || param.startsWith("p=")); + } + if (path.endsWith("viewforum.php")) { + return param.startsWith("v="); + } + if (path.endsWith("showthread.php")) { + return (param.startsWith("t=") || param.startsWith("p=")); + } + if (path.endsWith("showforum.php")) { + return param.startsWith("v="); + } + return false; + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java index f81ca0db..b70e4ab0 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java @@ -33,20 +33,14 @@ public class UrlBlocklist { } } - public boolean isForumLink(EdgeUrl linkUrl) { + public boolean isMailingListLink(EdgeUrl linkUrl) { var path = linkUrl.path; - if (path.startsWith("/forum")) { - return true; - } if (path.startsWith("/lists/")) { return true; } if (path.startsWith("mailinglist")) { return true; } - if (path.contains("phpbb")) { - return true; - } return false; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java index c275ad6f..b9fb79c5 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java @@ -63,7 +63,7 @@ public class CrawlerRetreiver { if (queue.peek() != null) { var fst = queue.peek(); - var root = new EdgeUrl(fst.proto, fst.domain, fst.port, "/", null); + var root = fst.domain.toRootUrl(); if (known.add(root)) queue.addFirst(root); } @@ 
-121,6 +121,8 @@ public class CrawlerRetreiver { private CrawledDomain crawlDomain() { String ip = findIp(domain); + assert !queue.isEmpty(); + var robotsRules = fetcher.fetchRobotRules(queue.peek().domain); long crawlDelay = robotsRules.getCrawlDelay(); @@ -209,7 +211,7 @@ public class CrawlerRetreiver { linkParser.parseLink(baseUrl, link) .filter(this::isSameDomain) .filter(u -> !urlBlocklist.isUrlBlocked(u)) - .filter(u -> !urlBlocklist.isForumLink(u)) + .filter(u -> !urlBlocklist.isMailingListLink(u)) .filter(known::add) .ifPresent(queue::addLast); } @@ -217,7 +219,7 @@ public class CrawlerRetreiver { linkParser.parseFrame(baseUrl, link) .filter(this::isSameDomain) .filter(u -> !urlBlocklist.isUrlBlocked(u)) - .filter(u -> !urlBlocklist.isForumLink(u)) + .filter(u -> !urlBlocklist.isMailingListLink(u)) .filter(known::add) .ifPresent(queue::addLast); } @@ -225,7 +227,7 @@ public class CrawlerRetreiver { linkParser.parseFrame(baseUrl, link) .filter(this::isSameDomain) .filter(u -> !urlBlocklist.isUrlBlocked(u)) - .filter(u -> !urlBlocklist.isForumLink(u)) + .filter(u -> !urlBlocklist.isMailingListLink(u)) .filter(known::add) .ifPresent(queue::addLast); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java index 6485f381..8d15f8f3 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java @@ -46,13 +46,13 @@ public class KeywordLexicon implements AutoCloseable { } private void loadJournalEntry(byte[] bytes) { - final long key = hashFunction.hashBytes(bytes).asLong(); + final long key = hashFunction.hashBytes(bytes).padToLong(); reverseIndex.put(key); } @SneakyThrows public int getOrInsert(String macroWord) { - final long key = hashFunction.hashBytes(macroWord.getBytes()).asLong(); + final long key = 
hashFunction.hashBytes(macroWord.getBytes()).padToLong(); int idx = getReadOnly(key); if (idx >= 0) @@ -78,7 +78,7 @@ public class KeywordLexicon implements AutoCloseable { } public int getReadOnly(String word) { - return getReadOnly(hashFunction.hashBytes(word.getBytes()).asLong()); + return getReadOnly(hashFunction.hashBytes(word.getBytes()).padToLong()); } public int getReadOnly(long hashedKey) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java index b7681951..123bd95a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java @@ -4,7 +4,7 @@ import lombok.Builder; import lombok.EqualsAndHashCode; import lombok.Getter; import lombok.Setter; -import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser; +import nu.marginalia.wmsa.edge.converting.processor.logic.QueryParams; import java.net.URI; import java.net.URISyntaxException; @@ -16,14 +16,14 @@ public class EdgeUrl implements WideHashable { public final EdgeDomain domain; public final Integer port; public final String path; - public final String params; + public final String param; - public EdgeUrl(String proto, EdgeDomain domain, Integer port, String path, String params) { + public EdgeUrl(String proto, EdgeDomain domain, Integer port, String path, String param) { this.proto = proto; this.domain = domain; this.port = port(port, proto); this.path = path; - this.params = params; + this.param = param; } public EdgeUrl(String url) throws URISyntaxException { @@ -80,7 +80,7 @@ public class EdgeUrl implements WideHashable { this.path = URI.getPath().isEmpty() ? 
"/" : URI.getPath(); this.proto = URI.getScheme().toLowerCase(); this.port = port(URI.getPort(), proto); - this.params = LinkParser.queryParamsSanitizer(URI.getQuery()); + this.param = QueryParams.queryParamsSanitizer(this.path, URI.getQuery()); } @@ -99,7 +99,7 @@ public class EdgeUrl implements WideHashable { public String toString() { String portPart = port == null ? "" : (":" + port); - String queryPart = params == null ? "" : ("?" + params); + String queryPart = param == null ? "" : ("?" + param); return proto + "://" + domain + portPart + path + queryPart; }