mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
Allow query params for *some* path+param combinations, targeted at allowing the crawling of forums.
This commit is contained in:
parent
853108028e
commit
f3be865293
@ -144,7 +144,7 @@ public class LinkKeywordExtractorMain {
|
||||
|
||||
try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) {
|
||||
AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(crawledDomains::contains,
|
||||
url -> url.params != null,
|
||||
url -> url.param != null,
|
||||
//url -> crawledUrls.contains(url.toString().hashCode()),
|
||||
output::write);
|
||||
|
||||
|
@ -138,8 +138,8 @@ public class AnchorTextExtractor {
|
||||
private boolean isNewKeywordForLink(String href, String text) {
|
||||
long hash = 0;
|
||||
|
||||
hash ^= hashFunction.hashString(href, StandardCharsets.UTF_8).asLong();
|
||||
hash ^= hashFunction.hashString(text, StandardCharsets.UTF_8).asLong();
|
||||
hash ^= hashFunction.hashString(href, StandardCharsets.UTF_8).padToLong();
|
||||
hash ^= hashFunction.hashString(text, StandardCharsets.UTF_8).padToLong();
|
||||
|
||||
// Remove sign bit because we don't want a negative index in deduplicateHashBitset
|
||||
hash &= 0x7FFF_FFFF_FFFF_FFFFL;
|
||||
|
@ -1,5 +1,6 @@
|
||||
package nu.marginalia.wmsa.edge.converting.loader;
|
||||
|
||||
import com.google.common.hash.HashFunction;
|
||||
import com.google.common.hash.Hashing;
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
@ -62,8 +63,8 @@ public class SqlLoadUrls {
|
||||
insertCall.setNull(3, Types.INTEGER);
|
||||
}
|
||||
insertCall.setString(4, url.path);
|
||||
insertCall.setString(5, url.params);
|
||||
insertCall.setLong(6, hashPath(url.path));
|
||||
insertCall.setString(5, url.param);
|
||||
insertCall.setLong(6, hashPath(url.path, url.param));
|
||||
insertCall.addBatch();
|
||||
}
|
||||
var ret = insertCall.executeBatch();
|
||||
@ -97,7 +98,15 @@ public class SqlLoadUrls {
|
||||
}
|
||||
}
|
||||
|
||||
private long hashPath(String path) {
|
||||
return Hashing.murmur3_128().hashString(path, StandardCharsets.UTF_8).asLong();
|
||||
private static final HashFunction murmur3_128 = Hashing.murmur3_128();
|
||||
private long hashPath(String path, String queryParam) {
|
||||
long pathHash = murmur3_128.hashString(path, StandardCharsets.UTF_8).padToLong();
|
||||
|
||||
if (queryParam == null) {
|
||||
return pathHash;
|
||||
}
|
||||
else {
|
||||
return pathHash + murmur3_128.hashString(queryParam, StandardCharsets.UTF_8).padToLong();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -13,12 +13,9 @@ import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.net.URI;
|
||||
import java.net.URISyntaxException;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class LinkParser {
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
@ -105,7 +102,6 @@ public class LinkParser {
|
||||
return url;
|
||||
}
|
||||
|
||||
private static final Pattern paramRegex = Pattern.compile("\\?.*$");
|
||||
private static final Pattern spaceRegex = Pattern.compile(" ");
|
||||
|
||||
@SneakyThrows
|
||||
@ -120,7 +116,7 @@ public class LinkParser {
|
||||
String path = parts[0];
|
||||
String param;
|
||||
if (parts.length > 1) {
|
||||
param = queryParamsSanitizer(parts[1]);
|
||||
param = QueryParams.queryParamsSanitizer(parts[0], parts[1]);
|
||||
}
|
||||
else {
|
||||
param = null;
|
||||
@ -196,20 +192,4 @@ public class LinkParser {
|
||||
return documentUrl;
|
||||
}
|
||||
|
||||
private static final Pattern paramSplitterPattern = Pattern.compile("&");
|
||||
private static final Predicate<String> paramPatternPredicate = Pattern.compile("((id|i|p|t|v|m|name|view|post)=[a-zA-Z\\d]+)|(view=(/[a-zA-Z\\d\\-])+)").asMatchPredicate();
|
||||
|
||||
public static String queryParamsSanitizer(String queryParams) {
|
||||
if (queryParams == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
var ret = Arrays.stream(paramSplitterPattern.split(queryParams))
|
||||
.filter(paramPatternPredicate)
|
||||
.sorted()
|
||||
.collect(Collectors.joining("&"));
|
||||
if (ret.isBlank())
|
||||
return null;
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
@ -72,7 +72,7 @@ public class LinkProcessor {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (urlBlocklist.isForumLink(link)) {
|
||||
if (urlBlocklist.isMailingListLink(link)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -0,0 +1,50 @@
|
||||
package nu.marginalia.wmsa.edge.converting.processor.logic;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.util.Arrays;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class QueryParams {

    // Compiled once and reused; splits "a=1&b=2" into its individual parameters.
    private static final Pattern paramSplitter = Pattern.compile("&");

    /**
     * Strips a query string down to the parameters explicitly permitted for the
     * given path, keeping crawlable forum URLs while discarding session ids and
     * other junk parameters.
     *
     * @param path        the URL path the query string was attached to
     * @param queryParams the raw query string; may be null
     * @return the permitted parameters, sorted and re-joined with '&';
     *         null when the input was null or nothing survived the filter
     */
    public static String queryParamsSanitizer(String path, String queryParams) {
        if (null == queryParams) {
            return null;
        }

        String sanitized = Arrays.stream(paramSplitter.split(queryParams))
                .filter(p -> isPermittedParam(path, p))
                .sorted()
                .collect(Collectors.joining("&"));

        return sanitized.isBlank() ? null : sanitized;
    }

    /**
     * Whitelist of path-suffix / parameter-prefix combinations matching common
     * forum software URL shapes (topic pages and forum listing pages).
     *
     * @param path  the URL path, used to select the applicable rule set
     * @param param a single "key=value" query parameter
     * @return true when this parameter may be kept for this path
     */
    public static boolean isPermittedParam(String path, String param) {
        if (path.endsWith("index.php")) {
            // Invision-style board: index.php?showtopic=..., index.php?showforum=...
            return param.startsWith("showtopic") || param.startsWith("showforum");
        }
        if (path.endsWith("viewtopic.php") || path.endsWith("showthread.php")) {
            // phpBB / vBulletin topic pages: t = topic id, p = post id
            return param.startsWith("t=") || param.startsWith("p=");
        }
        if (path.endsWith("viewforum.php") || path.endsWith("showforum.php")) {
            return param.startsWith("v=");
        }
        return false;
    }
}
|
@ -33,20 +33,14 @@ public class UrlBlocklist {
|
||||
}
|
||||
}
|
||||
|
||||
public boolean isForumLink(EdgeUrl linkUrl) {
|
||||
public boolean isMailingListLink(EdgeUrl linkUrl) {
|
||||
var path = linkUrl.path;
|
||||
if (path.startsWith("/forum")) {
|
||||
return true;
|
||||
}
|
||||
if (path.startsWith("/lists/")) {
|
||||
return true;
|
||||
}
|
||||
if (path.startsWith("mailinglist")) {
|
||||
return true;
|
||||
}
|
||||
if (path.contains("phpbb")) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -63,7 +63,7 @@ public class CrawlerRetreiver {
|
||||
|
||||
if (queue.peek() != null) {
|
||||
var fst = queue.peek();
|
||||
var root = new EdgeUrl(fst.proto, fst.domain, fst.port, "/", null);
|
||||
var root = fst.domain.toRootUrl();
|
||||
if (known.add(root))
|
||||
queue.addFirst(root);
|
||||
}
|
||||
@ -121,6 +121,8 @@ public class CrawlerRetreiver {
|
||||
private CrawledDomain crawlDomain() {
|
||||
String ip = findIp(domain);
|
||||
|
||||
assert !queue.isEmpty();
|
||||
|
||||
var robotsRules = fetcher.fetchRobotRules(queue.peek().domain);
|
||||
long crawlDelay = robotsRules.getCrawlDelay();
|
||||
|
||||
@ -209,7 +211,7 @@ public class CrawlerRetreiver {
|
||||
linkParser.parseLink(baseUrl, link)
|
||||
.filter(this::isSameDomain)
|
||||
.filter(u -> !urlBlocklist.isUrlBlocked(u))
|
||||
.filter(u -> !urlBlocklist.isForumLink(u))
|
||||
.filter(u -> !urlBlocklist.isMailingListLink(u))
|
||||
.filter(known::add)
|
||||
.ifPresent(queue::addLast);
|
||||
}
|
||||
@ -217,7 +219,7 @@ public class CrawlerRetreiver {
|
||||
linkParser.parseFrame(baseUrl, link)
|
||||
.filter(this::isSameDomain)
|
||||
.filter(u -> !urlBlocklist.isUrlBlocked(u))
|
||||
.filter(u -> !urlBlocklist.isForumLink(u))
|
||||
.filter(u -> !urlBlocklist.isMailingListLink(u))
|
||||
.filter(known::add)
|
||||
.ifPresent(queue::addLast);
|
||||
}
|
||||
@ -225,7 +227,7 @@ public class CrawlerRetreiver {
|
||||
linkParser.parseFrame(baseUrl, link)
|
||||
.filter(this::isSameDomain)
|
||||
.filter(u -> !urlBlocklist.isUrlBlocked(u))
|
||||
.filter(u -> !urlBlocklist.isForumLink(u))
|
||||
.filter(u -> !urlBlocklist.isMailingListLink(u))
|
||||
.filter(known::add)
|
||||
.ifPresent(queue::addLast);
|
||||
}
|
||||
|
@ -46,13 +46,13 @@ public class KeywordLexicon implements AutoCloseable {
|
||||
}
|
||||
|
||||
/** Hashes one raw journal entry and records the hash in the in-memory reverse index. */
private void loadJournalEntry(byte[] bytes) {
    // padToLong() rather than asLong() so a hash narrower than 64 bits
    // would be zero-extended instead of throwing.
    final long key = hashFunction.hashBytes(bytes).padToLong();
    reverseIndex.put(key);
}
|
||||
|
||||
@SneakyThrows
|
||||
public int getOrInsert(String macroWord) {
|
||||
final long key = hashFunction.hashBytes(macroWord.getBytes()).asLong();
|
||||
final long key = hashFunction.hashBytes(macroWord.getBytes()).padToLong();
|
||||
|
||||
int idx = getReadOnly(key);
|
||||
if (idx >= 0)
|
||||
@ -78,7 +78,7 @@ public class KeywordLexicon implements AutoCloseable {
|
||||
}
|
||||
|
||||
public int getReadOnly(String word) {
|
||||
return getReadOnly(hashFunction.hashBytes(word.getBytes()).asLong());
|
||||
return getReadOnly(hashFunction.hashBytes(word.getBytes()).padToLong());
|
||||
}
|
||||
|
||||
public int getReadOnly(long hashedKey) {
|
||||
|
@ -4,7 +4,7 @@ import lombok.Builder;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.QueryParams;
|
||||
|
||||
import java.net.URI;
|
||||
import java.net.URISyntaxException;
|
||||
@ -16,14 +16,14 @@ public class EdgeUrl implements WideHashable {
|
||||
public final EdgeDomain domain;
|
||||
public final Integer port;
|
||||
public final String path;
|
||||
public final String params;
|
||||
public final String param;
|
||||
|
||||
/**
 * Constructs an EdgeUrl from its component parts.
 *
 * @param proto  URL scheme, e.g. "http" or "https"
 * @param domain the domain portion of the URL
 * @param port   explicit port, or null; passed through port(port, proto),
 *               which presumably normalizes default ports -- confirm
 * @param path   the URL path
 * @param param  the sanitized query string, or null when there is none
 */
public EdgeUrl(String proto, EdgeDomain domain, Integer port, String path, String param) {
    this.proto = proto;
    this.domain = domain;
    this.port = port(port, proto);
    this.path = path;
    this.param = param;
}
|
||||
|
||||
public EdgeUrl(String url) throws URISyntaxException {
|
||||
@ -80,7 +80,7 @@ public class EdgeUrl implements WideHashable {
|
||||
this.path = URI.getPath().isEmpty() ? "/" : URI.getPath();
|
||||
this.proto = URI.getScheme().toLowerCase();
|
||||
this.port = port(URI.getPort(), proto);
|
||||
this.params = LinkParser.queryParamsSanitizer(URI.getQuery());
|
||||
this.param = QueryParams.queryParamsSanitizer(this.path, URI.getQuery());
|
||||
}
|
||||
|
||||
|
||||
@ -99,7 +99,7 @@ public class EdgeUrl implements WideHashable {
|
||||
|
||||
public String toString() {
|
||||
String portPart = port == null ? "" : (":" + port);
|
||||
String queryPart = params == null ? "" : ("?" + params);
|
||||
String queryPart = param == null ? "" : ("?" + param);
|
||||
|
||||
return proto + "://" + domain + portPart + path + queryPart;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user