mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
WIP: Selective URL param strings
This commit is contained in:
parent
ee07c4d94a
commit
853108028e
@ -144,7 +144,8 @@ public class LinkKeywordExtractorMain {
|
||||
|
||||
try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) {
|
||||
AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(crawledDomains::contains,
|
||||
url -> crawledUrls.contains(url.toString().hashCode()),
|
||||
url -> url.params != null,
|
||||
//url -> crawledUrls.contains(url.toString().hashCode()),
|
||||
output::write);
|
||||
|
||||
logger.info("Reading files");
|
||||
|
@ -74,9 +74,6 @@ public class AnchorTextExtractor {
|
||||
if (!isInterestingAnchorText(text)) {
|
||||
return;
|
||||
}
|
||||
if (href.contains("?")) {
|
||||
return;
|
||||
}
|
||||
|
||||
var optLinkUrl = linkParser.parseLink(documentUrl, href);
|
||||
if (optLinkUrl.isEmpty()) return;
|
||||
@ -92,13 +89,16 @@ public class AnchorTextExtractor {
|
||||
continue;
|
||||
|
||||
word = word.toLowerCase();
|
||||
if (!WordPatterns.filter(word))
|
||||
if (!WordPatterns.filter(word)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!linkUrl.domain.equals(documentUrl.domain)) {
|
||||
if (isNewKeywordForLink(word, linkUrl.toString())) {
|
||||
linkKeywordConsumer.accept(linkUrl, word);
|
||||
}
|
||||
if (linkUrl.domain.equals(documentUrl.domain)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (isNewKeywordForLink(word, linkUrl.toString())) {
|
||||
linkKeywordConsumer.accept(linkUrl, word);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -30,6 +30,7 @@ public class SqlLoadUrls {
|
||||
IN DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,
|
||||
IN PORT INT,
|
||||
IN PATH VARCHAR(255),
|
||||
IN PARAM VARCHAR(255),
|
||||
IN PATH_HASH BIGINT
|
||||
)
|
||||
BEGIN
|
||||
@ -45,8 +46,8 @@ public class SqlLoadUrls {
|
||||
|
||||
public void load(LoaderData data, EdgeUrl[] urls) {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var insertCall = conn.prepareCall("CALL INSERT_URL(?,?,?,?, ?)");
|
||||
var queryCall = conn.prepareStatement("SELECT ID, PROTO, PATH FROM EC_URL WHERE DOMAIN_ID=?")
|
||||
var insertCall = conn.prepareCall("CALL INSERT_URL(?,?,?,?,?,?)");
|
||||
var queryCall = conn.prepareStatement("SELECT ID, PROTO, PATH, PARAM FROM EC_URL WHERE DOMAIN_ID=?")
|
||||
)
|
||||
{
|
||||
conn.setAutoCommit(false);
|
||||
@ -61,7 +62,8 @@ public class SqlLoadUrls {
|
||||
insertCall.setNull(3, Types.INTEGER);
|
||||
}
|
||||
insertCall.setString(4, url.path);
|
||||
insertCall.setLong(5, hashPath(url.path));
|
||||
insertCall.setString(5, url.params);
|
||||
insertCall.setLong(6, hashPath(url.path));
|
||||
insertCall.addBatch();
|
||||
}
|
||||
var ret = insertCall.executeBatch();
|
||||
@ -84,8 +86,9 @@ public class SqlLoadUrls {
|
||||
int urlId = rsp.getInt(1);
|
||||
String proto = rsp.getString(2);
|
||||
String path = rsp.getString(3);
|
||||
String param = rsp.getString(4);
|
||||
|
||||
data.addUrl(new EdgeUrl(proto, targetDomain, null, path), urlId);
|
||||
data.addUrl(new EdgeUrl(proto, targetDomain, null, path, param), urlId);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -13,9 +13,12 @@ import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.net.URI;
|
||||
import java.net.URISyntaxException;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class LinkParser {
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
@ -107,21 +110,30 @@ public class LinkParser {
|
||||
|
||||
@SneakyThrows
|
||||
private String resolveUrl(EdgeUrl baseUrl, String s) {
|
||||
s = paramRegex.matcher(s).replaceAll("");
|
||||
|
||||
// url looks like http://www.marginalia.nu/
|
||||
if (isAbsoluteDomain(s)) {
|
||||
return s;
|
||||
}
|
||||
|
||||
// url looks like /my-page
|
||||
if (s.startsWith("/")) {
|
||||
return baseUrl.withPath(s).toString();
|
||||
String[] parts = s.split("\\?", 2);
|
||||
String path = parts[0];
|
||||
String param;
|
||||
if (parts.length > 1) {
|
||||
param = queryParamsSanitizer(parts[1]);
|
||||
}
|
||||
else {
|
||||
param = null;
|
||||
}
|
||||
|
||||
final String partFromNewLink = spaceRegex.matcher(s).replaceAll("%20");
|
||||
// url looks like /my-page
|
||||
if (path.startsWith("/")) {
|
||||
return baseUrl.withPathAndParam(path, param).toString();
|
||||
}
|
||||
|
||||
return baseUrl.withPath(relativeNavigation(baseUrl) + partFromNewLink).toString();
|
||||
final String partFromNewLink = spaceRegex.matcher(path).replaceAll("%20");
|
||||
|
||||
return baseUrl.withPathAndParam(relativeNavigation(baseUrl) + partFromNewLink, param).toString();
|
||||
}
|
||||
|
||||
// for a relative url that looks like /foo or /foo/bar; return / or /foo
|
||||
@ -183,4 +195,21 @@ public class LinkParser {
|
||||
|
||||
return documentUrl;
|
||||
}
|
||||
|
||||
private static final Pattern paramSplitterPattern = Pattern.compile("&");
|
||||
private static final Predicate<String> paramPatternPredicate = Pattern.compile("((id|i|p|t|v|m|name|view|post)=[a-zA-Z\\d]+)|(view=(/[a-zA-Z\\d\\-])+)").asMatchPredicate();
|
||||
|
||||
public static String queryParamsSanitizer(String queryParams) {
|
||||
if (queryParams == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
var ret = Arrays.stream(paramSplitterPattern.split(queryParams))
|
||||
.filter(paramPatternPredicate)
|
||||
.sorted()
|
||||
.collect(Collectors.joining("&"));
|
||||
if (ret.isBlank())
|
||||
return null;
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
@ -63,7 +63,7 @@ public class CrawlerRetreiver {
|
||||
|
||||
if (queue.peek() != null) {
|
||||
var fst = queue.peek();
|
||||
var root = new EdgeUrl(fst.proto, fst.domain, fst.port, "/");
|
||||
var root = new EdgeUrl(fst.proto, fst.domain, fst.port, "/", null);
|
||||
if (known.add(root))
|
||||
queue.addFirst(root);
|
||||
}
|
||||
@ -110,7 +110,7 @@ public class CrawlerRetreiver {
|
||||
.build());
|
||||
}
|
||||
|
||||
var fetchResult = fetcher.probeDomain(new EdgeUrl(fst.proto, fst.domain, fst.port, "/"));
|
||||
var fetchResult = fetcher.probeDomain(fst.domain.toRootUrl());
|
||||
if (!fetchResult.ok()) {
|
||||
logger.debug("Bad status on {}", domain);
|
||||
return Optional.of(createErrorPostFromStatus(fetchResult));
|
||||
@ -232,7 +232,7 @@ public class CrawlerRetreiver {
|
||||
}
|
||||
|
||||
private Optional<EdgeUrl> findCanonicalUrl(EdgeUrl baseUrl, Document parsed) {
|
||||
baseUrl = baseUrl.withPath("/");
|
||||
baseUrl = baseUrl.domain.toRootUrl();
|
||||
|
||||
for (var link : parsed.select("link[rel=canonical]")) {
|
||||
return linkParser.parseLink(baseUrl, link);
|
||||
|
@ -109,7 +109,7 @@ public class HttpFetcher {
|
||||
@SneakyThrows
|
||||
public FetchResult probeDomain(EdgeUrl url) {
|
||||
var head = new Request.Builder().head().addHeader("User-agent", userAgent)
|
||||
.url(new EdgeUrl(url.proto, url.domain, url.port, "/").toString())
|
||||
.url(url.domain.toRootUrl().toString())
|
||||
.build();
|
||||
|
||||
var call = client.newCall(head);
|
||||
@ -293,7 +293,7 @@ public class HttpFetcher {
|
||||
|
||||
private Optional<SimpleRobotRules> fetchRobotsForProto(String proto, EdgeDomain domain) {
|
||||
try {
|
||||
var url = new EdgeUrl(proto, domain, null, "/robots.txt");
|
||||
var url = new EdgeUrl(proto, domain, null, "/robots.txt", null);
|
||||
return Optional.of(parseRobotsTxt(fetchContent(url)));
|
||||
}
|
||||
catch (Exception ex) {
|
||||
|
@ -64,7 +64,7 @@ public class StackOverflowPostsReader extends DefaultHandler {
|
||||
}
|
||||
|
||||
private StackOverflowPost createPost(StackOverflowQuestionData data) {
|
||||
EdgeUrl url = new EdgeUrl("https", domain, null, "/questions/"+data.getId());
|
||||
EdgeUrl url = new EdgeUrl("https", domain, null, "/questions/"+data.getId(), null);
|
||||
|
||||
StringBuilder body = new StringBuilder();
|
||||
body.append(data.getQuestion());
|
||||
|
@ -37,7 +37,7 @@ public class WikipediaReader {
|
||||
}
|
||||
|
||||
private EdgeUrl synthesizeUrl(String originalUrl) {
|
||||
return new EdgeUrl("https", domain, null, "/wiki/"+originalUrl);
|
||||
return new EdgeUrl("https", domain, null, "/wiki/"+originalUrl, null);
|
||||
}
|
||||
|
||||
public void join() throws InterruptedException {
|
||||
|
@ -59,7 +59,7 @@ public class EdgeDomain implements WideHashable {
|
||||
|
||||
public EdgeUrl toRootUrl() {
|
||||
// Set default protocol to http, as most https websites redirect http->https, but few http websites redirect https->http
|
||||
return new EdgeUrl("http", this, null, "/");
|
||||
return new EdgeUrl("http", this, null, "/", null);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
|
@ -4,6 +4,7 @@ import lombok.Builder;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
|
||||
|
||||
import java.net.URI;
|
||||
import java.net.URISyntaxException;
|
||||
@ -15,12 +16,14 @@ public class EdgeUrl implements WideHashable {
|
||||
public final EdgeDomain domain;
|
||||
public final Integer port;
|
||||
public final String path;
|
||||
public final String params;
|
||||
|
||||
public EdgeUrl(String proto, EdgeDomain domain, Integer port, String path) {
|
||||
public EdgeUrl(String proto, EdgeDomain domain, Integer port, String path, String params) {
|
||||
this.proto = proto;
|
||||
this.domain = domain;
|
||||
this.port = port(port, proto);
|
||||
this.path = path;
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
public EdgeUrl(String url) throws URISyntaxException {
|
||||
@ -77,8 +80,10 @@ public class EdgeUrl implements WideHashable {
|
||||
this.path = URI.getPath().isEmpty() ? "/" : URI.getPath();
|
||||
this.proto = URI.getScheme().toLowerCase();
|
||||
this.port = port(URI.getPort(), proto);
|
||||
this.params = LinkParser.queryParamsSanitizer(URI.getQuery());
|
||||
}
|
||||
|
||||
|
||||
private static Integer port(Integer port, String protocol) {
|
||||
if (null == port || port < 1) {
|
||||
return null;
|
||||
@ -94,8 +99,9 @@ public class EdgeUrl implements WideHashable {
|
||||
|
||||
public String toString() {
|
||||
String portPart = port == null ? "" : (":" + port);
|
||||
String queryPart = params == null ? "" : ("?" + params);
|
||||
|
||||
return proto + "://" + domain + portPart + "" + path;
|
||||
return proto + "://" + domain + portPart + path + queryPart;
|
||||
}
|
||||
|
||||
public String dir() {
|
||||
@ -115,7 +121,7 @@ public class EdgeUrl implements WideHashable {
|
||||
return (int) path.chars().filter(c -> c=='/').count();
|
||||
}
|
||||
|
||||
public EdgeUrl withPath(String s) {
|
||||
return new EdgeUrl(proto, domain, port, s);
|
||||
public EdgeUrl withPathAndParam(String path, String param) {
|
||||
return new EdgeUrl(proto, domain, port, path, param);
|
||||
}
|
||||
}
|
||||
|
@ -46,20 +46,23 @@ COLLATE utf8mb4_unicode_ci;
|
||||
CREATE TABLE IF NOT EXISTS EC_URL (
|
||||
ID INT PRIMARY KEY AUTO_INCREMENT,
|
||||
DOMAIN_ID INT NOT NULL,
|
||||
PROTO ENUM('http','https','gemini') NOT NULL,
|
||||
PATH VARCHAR(255) NOT NULL COLLATE utf8mb4_bin,
|
||||
|
||||
PROTO ENUM('http','https','gemini') NOT NULL COLLATE utf8mb4_unicode_ci,
|
||||
PATH VARCHAR(255) NOT NULL,
|
||||
PORT INT,
|
||||
PARAM VARCHAR(255),
|
||||
|
||||
PATH_HASH BIGINT NOT NULL COMMENT "Hash of PATH for uniqueness check by domain",
|
||||
|
||||
VISITED BOOLEAN NOT NULL DEFAULT FALSE,
|
||||
|
||||
STATE ENUM('ok', 'redirect', 'dead', 'archived', 'disqualified') NOT NULL DEFAULT 'ok',
|
||||
STATE ENUM('ok', 'redirect', 'dead', 'archived', 'disqualified') NOT NULL DEFAULT 'ok' COLLATE utf8mb4_unicode_ci,
|
||||
|
||||
CONSTRAINT CONS UNIQUE (DOMAIN_ID, PATH_HASH),
|
||||
FOREIGN KEY (DOMAIN_ID) REFERENCES EC_DOMAIN(ID) ON DELETE CASCADE
|
||||
)
|
||||
CHARACTER SET utf8mb4
|
||||
COLLATE utf8mb4_unicode_ci;
|
||||
COLLATE utf8mb4_bin;
|
||||
|
||||
CREATE TABLE IF NOT EXISTS EC_PAGE_DATA (
|
||||
ID INT PRIMARY KEY AUTO_INCREMENT,
|
||||
@ -113,10 +116,13 @@ CREATE TABLE IF NOT EXISTS EC_DOMAIN_LINK (
|
||||
|
||||
CREATE OR REPLACE VIEW EC_URL_VIEW AS
|
||||
SELECT
|
||||
IF(PORT IS NULL,
|
||||
CONCAT(EC_URL.PROTO, "://", EC_DOMAIN.DOMAIN_NAME, EC_URL.PATH),
|
||||
CONCAT(EC_URL.PROTO, "://", EC_DOMAIN.DOMAIN_NAME, ":", EC_URL.PORT, EC_URL.PATH))
|
||||
AS URL,
|
||||
CONCAT(EC_URL.PROTO,
|
||||
'://',
|
||||
EC_DOMAIN.DOMAIN_NAME,
|
||||
IF(EC_URL.PORT IS NULL, '', CONCAT(':', EC_URL.PORT)),
|
||||
EC_URL.PATH,
|
||||
IF(EC_URL.PARAM IS NULL, '', CONCAT('?', EC_URL.PARAM))
|
||||
) AS URL,
|
||||
EC_URL.PATH_HASH AS PATH_HASH,
|
||||
EC_URL.PATH AS PATH,
|
||||
EC_DOMAIN.DOMAIN_NAME AS DOMAIN_NAME,
|
||||
|
@ -17,4 +17,13 @@ class EdgeUrlTest {
|
||||
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%22-sign"));
|
||||
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/\n \"huh\""));
|
||||
}
|
||||
|
||||
@Test
|
||||
void testParms() throws URISyntaxException {
|
||||
System.out.println(new EdgeUrl("https://search.marginalia.nu/?id=123"));
|
||||
System.out.println(new EdgeUrl("https://search.marginalia.nu/?t=123"));
|
||||
System.out.println(new EdgeUrl("https://search.marginalia.nu/?v=123"));
|
||||
System.out.println(new EdgeUrl("https://search.marginalia.nu/?m=123"));
|
||||
System.out.println(new EdgeUrl("https://search.marginalia.nu/?follow=123"));
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user