From 1de63f225d9d425ee89741e5a3fa1b00893c5c5b Mon Sep 17 00:00:00 2001 From: vlofgren Date: Tue, 14 Jun 2022 17:55:14 +0200 Subject: [PATCH] Added support for -style tags. --- .../processor/DocumentProcessor.java | 17 +++++----- .../processor/logic/LinkParser.java | 34 ++++++++++++++++--- .../crawling/retreival/CrawlerRetreiver.java | 14 ++++---- .../wmsa/edge/model/EdgeDomain.java | 1 + .../marginalia/wmsa/edge/model/EdgeUrl.java | 9 ++--- .../wmsa/edge/crawling/LinkParserTest.java | 34 +++++++++++++++++-- 6 files changed, 81 insertions(+), 28 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java index ce6393f2..b205cdea 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java @@ -185,26 +185,25 @@ public class DocumentProcessor { } private void getLinks(EdgeUrl baseUrl, ProcessedDocumentDetails ret, Document doc, EdgePageWordSet words) { - var links = doc.getElementsByTag("a"); - var frames = doc.getElementsByTag("frame"); - var feeds = doc.select("link[rel=alternate]"); - LinkProcessor lp = new LinkProcessor(ret, baseUrl); + final LinkProcessor lp = new LinkProcessor(ret, baseUrl); - for (var atag : links) { + baseUrl = linkParser.getBaseLink(doc, baseUrl); + + for (var atag : doc.getElementsByTag("a")) { linkParser.parseLink(baseUrl, atag).ifPresent(lp::accept); } - for (var frame : frames) { + for (var frame : doc.getElementsByTag("frame")) { linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept); } - for (var link : feeds) { + for (var link : doc.select("link[rel=alternate]")) { feedExtractor - .getFeedFromAlternateTag(baseUrl, link) + .getFeedFromAlternateTag(baseUrl, link) .ifPresent(lp::acceptFeed); } - Set linkTerms = new HashSet<>(); + final Set linkTerms = new HashSet<>(); for (var domain : lp.getForeignDomains()) { linkTerms.add("links:"+domain.toString().toLowerCase()); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java index aedaf0f7..378182f2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java @@ -1,9 +1,12 @@ package nu.marginalia.wmsa.edge.converting.processor.logic; import com.google.common.base.CharMatcher; +import com.google.common.base.Strings; import lombok.SneakyThrows; import nu.marginalia.wmsa.edge.model.EdgeUrl; import org.jetbrains.annotations.Contract; +import org.jetbrains.annotations.Nullable; +import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -26,11 +29,11 @@ public class LinkParser { ".gz", ".asc", ".md5", ".asf", ".mov", ".sig", ".pub", ".iso"); @Contract(pure=true) - public Optional parseLink(EdgeUrl baseUrl, Element l) { + public Optional parseLink(EdgeUrl relativeBaseUrl, Element l) { return Optional.of(l) .filter(this::shouldIndexLink) .map(this::getUrl) - .map(link -> resolveUrl(baseUrl, link)) + .map(link -> resolveUrl(relativeBaseUrl, link)) .flatMap(this::createURI) .map(URI::normalize) .map(this::renormalize) @@ -100,6 +103,8 @@ public class LinkParser { } private static final Pattern paramRegex = Pattern.compile("\\?.*$"); + private static final Pattern spaceRegex = Pattern.compile(" "); + @SneakyThrows private String resolveUrl(EdgeUrl baseUrl, String s) { s = paramRegex.matcher(s).replaceAll(""); @@ -111,10 +116,12 @@ public class LinkParser { // url looks like /my-page if (s.startsWith("/")) { - return baseUrl.sibling(s).toString(); + return baseUrl.withPath(s).toString(); } - return baseUrl.sibling(relativeNavigation(baseUrl) + s.replaceAll(" ", "%20")).toString(); + final String partFromNewLink = spaceRegex.matcher(s).replaceAll("%20"); + + return baseUrl.withPath(relativeNavigation(baseUrl) + partFromNewLink).toString(); } // for a relative url that looks like /foo or /foo/bar; return / or /foo @@ -162,4 +169,23 @@ public class LinkParser { } return true; } + + @Nullable + public EdgeUrl getBaseLink(Document parsed, EdgeUrl documentUrl) { + var baseTags = parsed.getElementsByTag("base"); + + try { + for (var tag : baseTags) { + String href = tag.attr("href"); + if (!Strings.isNullOrEmpty(href)) { + return new EdgeUrl(resolveUrl(documentUrl, href)); + } + } + } + catch (Exception ex) { + logger.warn("Failed to parse , falling back to document url"); + } + + return documentUrl; + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java index a7c08a24..2b27ed4d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java @@ -202,10 +202,11 @@ public class CrawlerRetreiver { return domain.equals(url.domain.toString().toLowerCase()); } - private void findLinks(EdgeUrl url, Document parsed) { + private void findLinks(EdgeUrl baseUrl, Document parsed) { + baseUrl = linkParser.getBaseLink(parsed, baseUrl); for (var link : parsed.getElementsByTag("a")) { - linkParser.parseLink(url, link) + linkParser.parseLink(baseUrl, link) .filter(this::isSameDomain) .filter(u -> !urlBlocklist.isUrlBlocked(u)) .filter(u -> !urlBlocklist.isForumLink(u)) @@ -213,7 +214,7 @@ public class CrawlerRetreiver { .ifPresent(queue::addLast); } for (var link : parsed.getElementsByTag("frame")) { - linkParser.parseFrame(url, link) + linkParser.parseFrame(baseUrl, link) .filter(this::isSameDomain) .filter(u -> !urlBlocklist.isUrlBlocked(u)) .filter(u -> !urlBlocklist.isForumLink(u)) @@ -221,7 +222,7 @@ public class CrawlerRetreiver { .ifPresent(queue::addLast); } for (var link : parsed.getElementsByTag("iframe")) { - linkParser.parseFrame(url, link) + linkParser.parseFrame(baseUrl, link) .filter(this::isSameDomain) .filter(u -> !urlBlocklist.isUrlBlocked(u)) .filter(u -> !urlBlocklist.isForumLink(u)) @@ -230,10 +231,11 @@ public class CrawlerRetreiver { } } - private Optional findCanonicalUrl(EdgeUrl url, Document parsed) { + private Optional findCanonicalUrl(EdgeUrl baseUrl, Document parsed) { + baseUrl = baseUrl.withPath("/"); for (var link : parsed.select("link[rel=canonical]")) { - return linkParser.parseLink(url, link); + return linkParser.parseLink(baseUrl, link); } return Optional.empty(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java index cb778947..53740c95 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java @@ -21,6 +21,7 @@ public class EdgeDomain implements WideHashable { @SneakyThrows public EdgeDomain(String host) { + Objects.requireNonNull(host, "domain name must not be null"); var dot = host.lastIndexOf('.'); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java index 39bc475b..e82d4b7c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java @@ -79,11 +79,6 @@ public class EdgeUrl implements WideHashable { this.port = port(URI.getPort(), proto); } - public EdgeUrl sibling(String newPath) { - return new EdgeUrl(proto, domain, port, newPath); - } - - private static Integer port(Integer port, String protocol) { if (null == port || port < 1) { return null; @@ -120,5 +115,7 @@ public class EdgeUrl implements WideHashable { return (int) path.chars().filter(c -> c=='/').count(); } - + public EdgeUrl withPath(String s) { + return new EdgeUrl(proto, domain, port, s); + } } diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/LinkParserTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/LinkParserTest.java index 80c62153..d4a7e428 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/LinkParserTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/LinkParserTest.java @@ -11,9 +11,8 @@ import static org.junit.jupiter.api.Assertions.*; class LinkParserTest { - private String parseLink(String href, String base) throws URISyntaxException { - var url = new EdgeUrl("http://www.marginalia.nu/" + base); - var domain = url.domain; + private String parseLink(String href, String relBase) throws URISyntaxException { + var url = new EdgeUrl("http://www.marginalia.nu/" + relBase); var parser = new LinkParser(); var stuff = Jsoup.parseBodyFragment("test"); var lnk = parser.parseLink( @@ -43,6 +42,7 @@ class LinkParserTest { void testAnchor() throws URISyntaxException { assertNull(parseLink("#test", "/")); } + @Test void testRelative() throws URISyntaxException { assertEquals("http://www.marginalia.nu/test", parseLink("../test", "/")); @@ -51,4 +51,32 @@ class LinkParserTest { assertEquals("http://www.marginalia.nu/test", parseLink("../test", "/foo/index.html")); assertEquals("http://www.marginalia.nu/test", parseLink("/test", "/foo/index.html")); } + + private EdgeUrl getBaseUrl(String href, EdgeUrl documentUrl) { + LinkParser lp = new LinkParser(); + + return lp.getBaseLink(Jsoup.parse(""), documentUrl); + } + + @Test + public void getBaseUrlTest() throws URISyntaxException { + assertEquals(new EdgeUrl("https://www.marginalia.nu/base"), + getBaseUrl("/base", new EdgeUrl("https://www.marginalia.nu/test/foo.bar"))); + + assertEquals(new EdgeUrl("https://memex.marginalia.nu/base"), + getBaseUrl("https://memex.marginalia.nu/base", new EdgeUrl("https://www.marginalia.nu/test/foo.bar"))); + + assertEquals(new EdgeUrl("https://www.marginalia.nu/test/base"), + getBaseUrl("base", new EdgeUrl("https://www.marginalia.nu/test/foo.bar"))); + } + + @Test + public void testParseBadBaseLink() throws URISyntaxException { + LinkParser lp = new LinkParser(); + var url = new EdgeUrl("https://memex.marginalia.nu/"); + + assertEquals(url, lp.getBaseLink(Jsoup.parse(""), url)); + assertEquals(url, lp.getBaseLink(Jsoup.parse(""), url)); + assertEquals(url, lp.getBaseLink(Jsoup.parse(""), url)); + } } \ No newline at end of file