Allow query params for *some* (path, param) combinations, in order to allow crawling of forums.

This commit is contained in:
vlofgren 2022-07-08 16:36:09 +02:00
parent 853108028e
commit f3be865293
10 changed files with 84 additions and 49 deletions

View File

@ -144,7 +144,7 @@ public class LinkKeywordExtractorMain {
try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) {
AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(crawledDomains::contains,
url -> url.params != null,
url -> url.param != null,
//url -> crawledUrls.contains(url.toString().hashCode()),
output::write);

View File

@ -138,8 +138,8 @@ public class AnchorTextExtractor {
private boolean isNewKeywordForLink(String href, String text) {
long hash = 0;
hash ^= hashFunction.hashString(href, StandardCharsets.UTF_8).asLong();
hash ^= hashFunction.hashString(text, StandardCharsets.UTF_8).asLong();
hash ^= hashFunction.hashString(href, StandardCharsets.UTF_8).padToLong();
hash ^= hashFunction.hashString(text, StandardCharsets.UTF_8).padToLong();
// Remove sign bit because we don't want a negative index in deduplicateHashBitset
hash &= 0x7FFF_FFFF_FFFF_FFFFL;

View File

@ -1,5 +1,6 @@
package nu.marginalia.wmsa.edge.converting.loader;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
@ -62,8 +63,8 @@ public class SqlLoadUrls {
insertCall.setNull(3, Types.INTEGER);
}
insertCall.setString(4, url.path);
insertCall.setString(5, url.params);
insertCall.setLong(6, hashPath(url.path));
insertCall.setString(5, url.param);
insertCall.setLong(6, hashPath(url.path, url.param));
insertCall.addBatch();
}
var ret = insertCall.executeBatch();
@ -97,7 +98,15 @@ public class SqlLoadUrls {
}
}
private long hashPath(String path) {
return Hashing.murmur3_128().hashString(path, StandardCharsets.UTF_8).asLong();
private static final HashFunction murmur3_128 = Hashing.murmur3_128();
/**
 * Combines the murmur3_128 hashes of a URL's path and its query parameter
 * into a single {@code long}.
 *
 * @param path       the URL path; always hashed
 * @param queryParam the sanitized query string, or {@code null} when the URL has none
 * @return the path hash alone when {@code queryParam} is {@code null},
 *         otherwise the sum of the path hash and the query parameter hash
 */
private long hashPath(String path, String queryParam) {
    final long pathPart = murmur3_128.hashString(path, StandardCharsets.UTF_8).padToLong();

    // A missing query string contributes nothing, so the result is the bare path hash.
    final long paramPart = (queryParam == null)
            ? 0L
            : murmur3_128.hashString(queryParam, StandardCharsets.UTF_8).padToLong();

    return pathPart + paramPart;
}
}

View File

@ -13,12 +13,9 @@ import org.slf4j.LoggerFactory;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;
import java.util.function.Predicate;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
public class LinkParser {
private final Logger logger = LoggerFactory.getLogger(getClass());
@ -105,7 +102,6 @@ public class LinkParser {
return url;
}
private static final Pattern paramRegex = Pattern.compile("\\?.*$");
private static final Pattern spaceRegex = Pattern.compile(" ");
@SneakyThrows
@ -120,7 +116,7 @@ public class LinkParser {
String path = parts[0];
String param;
if (parts.length > 1) {
param = queryParamsSanitizer(parts[1]);
param = QueryParams.queryParamsSanitizer(parts[0], parts[1]);
}
else {
param = null;
@ -196,20 +192,4 @@ public class LinkParser {
return documentUrl;
}
private static final Pattern paramSplitterPattern = Pattern.compile("&");
private static final Predicate<String> paramPatternPredicate = Pattern.compile("((id|i|p|t|v|m|name|view|post)=[a-zA-Z\\d]+)|(view=(/[a-zA-Z\\d\\-])+)").asMatchPredicate();
public static String queryParamsSanitizer(String queryParams) {
if (queryParams == null) {
return null;
}
var ret = Arrays.stream(paramSplitterPattern.split(queryParams))
.filter(paramPatternPredicate)
.sorted()
.collect(Collectors.joining("&"));
if (ret.isBlank())
return null;
return ret;
}
}

View File

@ -72,7 +72,7 @@ public class LinkProcessor {
return false;
}
if (urlBlocklist.isForumLink(link)) {
if (urlBlocklist.isMailingListLink(link)) {
return false;
}

View File

@ -0,0 +1,50 @@
package nu.marginalia.wmsa.edge.converting.processor.logic;
import javax.annotation.Nullable;
import java.util.Arrays;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
 * Allow-list based sanitizer for URL query strings.
 *
 * <p>A query parameter is kept only when the combination of the URL path's
 * suffix and the parameter's prefix is explicitly permitted by
 * {@link #isPermittedParam(String, String)}; everything else is dropped.
 */
public class QueryParams {

    /** Separator between individual {@code key=value} pairs in a query string. */
    private static final Pattern AMPERSAND = Pattern.compile("&");

    /**
     * Reduces a raw query string to the parameters permitted for the given path.
     *
     * <p>The surviving parameters are sorted in natural string order and
     * re-joined with {@code &}.
     *
     * @param path        the URL path the query string belongs to
     * @param queryParams the raw query string (without the leading {@code ?}), may be {@code null}
     * @return the sanitized query string, or {@code null} if {@code queryParams}
     *         is {@code null} or no parameter survives the filtering
     */
    @Nullable
    public static String queryParamsSanitizer(String path, @Nullable String queryParams) {
        if (queryParams == null) {
            return null;
        }

        String[] permitted = Arrays.stream(AMPERSAND.split(queryParams))
                .filter(candidate -> isPermittedParam(path, candidate))
                .toArray(String[]::new);

        // Sorting makes the result independent of the order the parameters appeared in.
        Arrays.sort(permitted);

        String sanitized = String.join("&", permitted);
        return sanitized.isBlank() ? null : sanitized;
    }

    /**
     * Decides whether a single query parameter may be kept for a given path.
     *
     * <p>The decision is made on the path's suffix (the script name) and the
     * parameter's prefix. Any path that matches none of the known script
     * names permits no parameters at all.
     *
     * @param path  the URL path
     * @param param a single query parameter, e.g. {@code t=123}
     * @return {@code true} if the parameter is permitted for this path
     */
    public static boolean isPermittedParam(String path, String param) {
        if (path.endsWith("index.php")) {
            return startsWithAny(param, "showtopic", "showforum");
        }
        if (path.endsWith("viewtopic.php") || path.endsWith("showthread.php")) {
            return startsWithAny(param, "t=", "p=");
        }
        // NOTE(review): phpBB-style viewforum.php links typically carry the forum id
        // as "f=" rather than "v=" -- confirm that "v=" is the intended key here.
        if (path.endsWith("viewforum.php") || path.endsWith("showforum.php")) {
            return startsWithAny(param, "v=");
        }
        return false;
    }

    /** Returns {@code true} if {@code param} begins with at least one of {@code prefixes}. */
    private static boolean startsWithAny(String param, String... prefixes) {
        for (String prefix : prefixes) {
            if (param.startsWith(prefix)) {
                return true;
            }
        }
        return false;
    }
}

View File

@ -33,20 +33,14 @@ public class UrlBlocklist {
}
}
public boolean isForumLink(EdgeUrl linkUrl) {
public boolean isMailingListLink(EdgeUrl linkUrl) {
var path = linkUrl.path;
if (path.startsWith("/forum")) {
return true;
}
if (path.startsWith("/lists/")) {
return true;
}
if (path.startsWith("mailinglist")) {
return true;
}
if (path.contains("phpbb")) {
return true;
}
return false;
}

View File

@ -63,7 +63,7 @@ public class CrawlerRetreiver {
if (queue.peek() != null) {
var fst = queue.peek();
var root = new EdgeUrl(fst.proto, fst.domain, fst.port, "/", null);
var root = fst.domain.toRootUrl();
if (known.add(root))
queue.addFirst(root);
}
@ -121,6 +121,8 @@ public class CrawlerRetreiver {
private CrawledDomain crawlDomain() {
String ip = findIp(domain);
assert !queue.isEmpty();
var robotsRules = fetcher.fetchRobotRules(queue.peek().domain);
long crawlDelay = robotsRules.getCrawlDelay();
@ -209,7 +211,7 @@ public class CrawlerRetreiver {
linkParser.parseLink(baseUrl, link)
.filter(this::isSameDomain)
.filter(u -> !urlBlocklist.isUrlBlocked(u))
.filter(u -> !urlBlocklist.isForumLink(u))
.filter(u -> !urlBlocklist.isMailingListLink(u))
.filter(known::add)
.ifPresent(queue::addLast);
}
@ -217,7 +219,7 @@ public class CrawlerRetreiver {
linkParser.parseFrame(baseUrl, link)
.filter(this::isSameDomain)
.filter(u -> !urlBlocklist.isUrlBlocked(u))
.filter(u -> !urlBlocklist.isForumLink(u))
.filter(u -> !urlBlocklist.isMailingListLink(u))
.filter(known::add)
.ifPresent(queue::addLast);
}
@ -225,7 +227,7 @@ public class CrawlerRetreiver {
linkParser.parseFrame(baseUrl, link)
.filter(this::isSameDomain)
.filter(u -> !urlBlocklist.isUrlBlocked(u))
.filter(u -> !urlBlocklist.isForumLink(u))
.filter(u -> !urlBlocklist.isMailingListLink(u))
.filter(known::add)
.ifPresent(queue::addLast);
}

View File

@ -46,13 +46,13 @@ public class KeywordLexicon implements AutoCloseable {
}
private void loadJournalEntry(byte[] bytes) {
final long key = hashFunction.hashBytes(bytes).asLong();
final long key = hashFunction.hashBytes(bytes).padToLong();
reverseIndex.put(key);
}
@SneakyThrows
public int getOrInsert(String macroWord) {
final long key = hashFunction.hashBytes(macroWord.getBytes()).asLong();
final long key = hashFunction.hashBytes(macroWord.getBytes()).padToLong();
int idx = getReadOnly(key);
if (idx >= 0)
@ -78,7 +78,7 @@ public class KeywordLexicon implements AutoCloseable {
}
public int getReadOnly(String word) {
return getReadOnly(hashFunction.hashBytes(word.getBytes()).asLong());
return getReadOnly(hashFunction.hashBytes(word.getBytes()).padToLong());
}
public int getReadOnly(long hashedKey) {

View File

@ -4,7 +4,7 @@ import lombok.Builder;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.Setter;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import nu.marginalia.wmsa.edge.converting.processor.logic.QueryParams;
import java.net.URI;
import java.net.URISyntaxException;
@ -16,14 +16,14 @@ public class EdgeUrl implements WideHashable {
public final EdgeDomain domain;
public final Integer port;
public final String path;
public final String params;
public final String param;
public EdgeUrl(String proto, EdgeDomain domain, Integer port, String path, String params) {
public EdgeUrl(String proto, EdgeDomain domain, Integer port, String path, String param) {
this.proto = proto;
this.domain = domain;
this.port = port(port, proto);
this.path = path;
this.params = params;
this.param = param;
}
public EdgeUrl(String url) throws URISyntaxException {
@ -80,7 +80,7 @@ public class EdgeUrl implements WideHashable {
this.path = URI.getPath().isEmpty() ? "/" : URI.getPath();
this.proto = URI.getScheme().toLowerCase();
this.port = port(URI.getPort(), proto);
this.params = LinkParser.queryParamsSanitizer(URI.getQuery());
this.param = QueryParams.queryParamsSanitizer(this.path, URI.getQuery());
}
@ -99,7 +99,7 @@ public class EdgeUrl implements WideHashable {
public String toString() {
String portPart = port == null ? "" : (":" + port);
String queryPart = params == null ? "" : ("?" + params);
String queryPart = param == null ? "" : ("?" + param);
return proto + "://" + domain + portPart + path + queryPart;
}