mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
Added support for <base href>-style tags.
This commit is contained in:
parent
389818c6c3
commit
1de63f225d
@ -185,26 +185,25 @@ public class DocumentProcessor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private void getLinks(EdgeUrl baseUrl, ProcessedDocumentDetails ret, Document doc, EdgePageWordSet words) {
|
private void getLinks(EdgeUrl baseUrl, ProcessedDocumentDetails ret, Document doc, EdgePageWordSet words) {
|
||||||
var links = doc.getElementsByTag("a");
|
|
||||||
var frames = doc.getElementsByTag("frame");
|
|
||||||
var feeds = doc.select("link[rel=alternate]");
|
|
||||||
|
|
||||||
LinkProcessor lp = new LinkProcessor(ret, baseUrl);
|
final LinkProcessor lp = new LinkProcessor(ret, baseUrl);
|
||||||
|
|
||||||
for (var atag : links) {
|
baseUrl = linkParser.getBaseLink(doc, baseUrl);
|
||||||
|
|
||||||
|
for (var atag : doc.getElementsByTag("a")) {
|
||||||
linkParser.parseLink(baseUrl, atag).ifPresent(lp::accept);
|
linkParser.parseLink(baseUrl, atag).ifPresent(lp::accept);
|
||||||
}
|
}
|
||||||
for (var frame : frames) {
|
for (var frame : doc.getElementsByTag("frame")) {
|
||||||
linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept);
|
linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (var link : feeds) {
|
for (var link : doc.select("link[rel=alternate]")) {
|
||||||
feedExtractor
|
feedExtractor
|
||||||
.getFeedFromAlternateTag(baseUrl, link)
|
.getFeedFromAlternateTag(baseUrl, link)
|
||||||
.ifPresent(lp::acceptFeed);
|
.ifPresent(lp::acceptFeed);
|
||||||
}
|
}
|
||||||
|
|
||||||
Set<String> linkTerms = new HashSet<>();
|
final Set<String> linkTerms = new HashSet<>();
|
||||||
|
|
||||||
for (var domain : lp.getForeignDomains()) {
|
for (var domain : lp.getForeignDomains()) {
|
||||||
linkTerms.add("links:"+domain.toString().toLowerCase());
|
linkTerms.add("links:"+domain.toString().toLowerCase());
|
||||||
|
@ -1,9 +1,12 @@
|
|||||||
package nu.marginalia.wmsa.edge.converting.processor.logic;
|
package nu.marginalia.wmsa.edge.converting.processor.logic;
|
||||||
|
|
||||||
import com.google.common.base.CharMatcher;
|
import com.google.common.base.CharMatcher;
|
||||||
|
import com.google.common.base.Strings;
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||||
import org.jetbrains.annotations.Contract;
|
import org.jetbrains.annotations.Contract;
|
||||||
|
import org.jetbrains.annotations.Nullable;
|
||||||
|
import org.jsoup.nodes.Document;
|
||||||
import org.jsoup.nodes.Element;
|
import org.jsoup.nodes.Element;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
@ -26,11 +29,11 @@ public class LinkParser {
|
|||||||
".gz", ".asc", ".md5", ".asf", ".mov", ".sig", ".pub", ".iso");
|
".gz", ".asc", ".md5", ".asf", ".mov", ".sig", ".pub", ".iso");
|
||||||
|
|
||||||
@Contract(pure=true)
|
@Contract(pure=true)
|
||||||
public Optional<EdgeUrl> parseLink(EdgeUrl baseUrl, Element l) {
|
public Optional<EdgeUrl> parseLink(EdgeUrl relativeBaseUrl, Element l) {
|
||||||
return Optional.of(l)
|
return Optional.of(l)
|
||||||
.filter(this::shouldIndexLink)
|
.filter(this::shouldIndexLink)
|
||||||
.map(this::getUrl)
|
.map(this::getUrl)
|
||||||
.map(link -> resolveUrl(baseUrl, link))
|
.map(link -> resolveUrl(relativeBaseUrl, link))
|
||||||
.flatMap(this::createURI)
|
.flatMap(this::createURI)
|
||||||
.map(URI::normalize)
|
.map(URI::normalize)
|
||||||
.map(this::renormalize)
|
.map(this::renormalize)
|
||||||
@ -100,6 +103,8 @@ public class LinkParser {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private static final Pattern paramRegex = Pattern.compile("\\?.*$");
|
private static final Pattern paramRegex = Pattern.compile("\\?.*$");
|
||||||
|
private static final Pattern spaceRegex = Pattern.compile(" ");
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
private String resolveUrl(EdgeUrl baseUrl, String s) {
|
private String resolveUrl(EdgeUrl baseUrl, String s) {
|
||||||
s = paramRegex.matcher(s).replaceAll("");
|
s = paramRegex.matcher(s).replaceAll("");
|
||||||
@ -111,10 +116,12 @@ public class LinkParser {
|
|||||||
|
|
||||||
// url looks like /my-page
|
// url looks like /my-page
|
||||||
if (s.startsWith("/")) {
|
if (s.startsWith("/")) {
|
||||||
return baseUrl.sibling(s).toString();
|
return baseUrl.withPath(s).toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
return baseUrl.sibling(relativeNavigation(baseUrl) + s.replaceAll(" ", "%20")).toString();
|
final String partFromNewLink = spaceRegex.matcher(s).replaceAll("%20");
|
||||||
|
|
||||||
|
return baseUrl.withPath(relativeNavigation(baseUrl) + partFromNewLink).toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
// for a relative url that looks like /foo or /foo/bar; return / or /foo
|
// for a relative url that looks like /foo or /foo/bar; return / or /foo
|
||||||
@ -162,4 +169,23 @@ public class LinkParser {
|
|||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Nullable
|
||||||
|
public EdgeUrl getBaseLink(Document parsed, EdgeUrl documentUrl) {
|
||||||
|
var baseTags = parsed.getElementsByTag("base");
|
||||||
|
|
||||||
|
try {
|
||||||
|
for (var tag : baseTags) {
|
||||||
|
String href = tag.attr("href");
|
||||||
|
if (!Strings.isNullOrEmpty(href)) {
|
||||||
|
return new EdgeUrl(resolveUrl(documentUrl, href));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (Exception ex) {
|
||||||
|
logger.warn("Failed to parse <base href=...>, falling back to document url");
|
||||||
|
}
|
||||||
|
|
||||||
|
return documentUrl;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -202,10 +202,11 @@ public class CrawlerRetreiver {
|
|||||||
return domain.equals(url.domain.toString().toLowerCase());
|
return domain.equals(url.domain.toString().toLowerCase());
|
||||||
}
|
}
|
||||||
|
|
||||||
private void findLinks(EdgeUrl url, Document parsed) {
|
private void findLinks(EdgeUrl baseUrl, Document parsed) {
|
||||||
|
baseUrl = linkParser.getBaseLink(parsed, baseUrl);
|
||||||
|
|
||||||
for (var link : parsed.getElementsByTag("a")) {
|
for (var link : parsed.getElementsByTag("a")) {
|
||||||
linkParser.parseLink(url, link)
|
linkParser.parseLink(baseUrl, link)
|
||||||
.filter(this::isSameDomain)
|
.filter(this::isSameDomain)
|
||||||
.filter(u -> !urlBlocklist.isUrlBlocked(u))
|
.filter(u -> !urlBlocklist.isUrlBlocked(u))
|
||||||
.filter(u -> !urlBlocklist.isForumLink(u))
|
.filter(u -> !urlBlocklist.isForumLink(u))
|
||||||
@ -213,7 +214,7 @@ public class CrawlerRetreiver {
|
|||||||
.ifPresent(queue::addLast);
|
.ifPresent(queue::addLast);
|
||||||
}
|
}
|
||||||
for (var link : parsed.getElementsByTag("frame")) {
|
for (var link : parsed.getElementsByTag("frame")) {
|
||||||
linkParser.parseFrame(url, link)
|
linkParser.parseFrame(baseUrl, link)
|
||||||
.filter(this::isSameDomain)
|
.filter(this::isSameDomain)
|
||||||
.filter(u -> !urlBlocklist.isUrlBlocked(u))
|
.filter(u -> !urlBlocklist.isUrlBlocked(u))
|
||||||
.filter(u -> !urlBlocklist.isForumLink(u))
|
.filter(u -> !urlBlocklist.isForumLink(u))
|
||||||
@ -221,7 +222,7 @@ public class CrawlerRetreiver {
|
|||||||
.ifPresent(queue::addLast);
|
.ifPresent(queue::addLast);
|
||||||
}
|
}
|
||||||
for (var link : parsed.getElementsByTag("iframe")) {
|
for (var link : parsed.getElementsByTag("iframe")) {
|
||||||
linkParser.parseFrame(url, link)
|
linkParser.parseFrame(baseUrl, link)
|
||||||
.filter(this::isSameDomain)
|
.filter(this::isSameDomain)
|
||||||
.filter(u -> !urlBlocklist.isUrlBlocked(u))
|
.filter(u -> !urlBlocklist.isUrlBlocked(u))
|
||||||
.filter(u -> !urlBlocklist.isForumLink(u))
|
.filter(u -> !urlBlocklist.isForumLink(u))
|
||||||
@ -230,10 +231,11 @@ public class CrawlerRetreiver {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private Optional<EdgeUrl> findCanonicalUrl(EdgeUrl url, Document parsed) {
|
private Optional<EdgeUrl> findCanonicalUrl(EdgeUrl baseUrl, Document parsed) {
|
||||||
|
baseUrl = baseUrl.withPath("/");
|
||||||
|
|
||||||
for (var link : parsed.select("link[rel=canonical]")) {
|
for (var link : parsed.select("link[rel=canonical]")) {
|
||||||
return linkParser.parseLink(url, link);
|
return linkParser.parseLink(baseUrl, link);
|
||||||
}
|
}
|
||||||
|
|
||||||
return Optional.empty();
|
return Optional.empty();
|
||||||
|
@ -21,6 +21,7 @@ public class EdgeDomain implements WideHashable {
|
|||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public EdgeDomain(String host) {
|
public EdgeDomain(String host) {
|
||||||
|
Objects.requireNonNull(host, "domain name must not be null");
|
||||||
|
|
||||||
var dot = host.lastIndexOf('.');
|
var dot = host.lastIndexOf('.');
|
||||||
|
|
||||||
|
@ -79,11 +79,6 @@ public class EdgeUrl implements WideHashable {
|
|||||||
this.port = port(URI.getPort(), proto);
|
this.port = port(URI.getPort(), proto);
|
||||||
}
|
}
|
||||||
|
|
||||||
public EdgeUrl sibling(String newPath) {
|
|
||||||
return new EdgeUrl(proto, domain, port, newPath);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private static Integer port(Integer port, String protocol) {
|
private static Integer port(Integer port, String protocol) {
|
||||||
if (null == port || port < 1) {
|
if (null == port || port < 1) {
|
||||||
return null;
|
return null;
|
||||||
@ -120,5 +115,7 @@ public class EdgeUrl implements WideHashable {
|
|||||||
return (int) path.chars().filter(c -> c=='/').count();
|
return (int) path.chars().filter(c -> c=='/').count();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public EdgeUrl withPath(String s) {
|
||||||
|
return new EdgeUrl(proto, domain, port, s);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -11,9 +11,8 @@ import static org.junit.jupiter.api.Assertions.*;
|
|||||||
|
|
||||||
class LinkParserTest {
|
class LinkParserTest {
|
||||||
|
|
||||||
private String parseLink(String href, String base) throws URISyntaxException {
|
private String parseLink(String href, String relBase) throws URISyntaxException {
|
||||||
var url = new EdgeUrl("http://www.marginalia.nu/" + base);
|
var url = new EdgeUrl("http://www.marginalia.nu/" + relBase);
|
||||||
var domain = url.domain;
|
|
||||||
var parser = new LinkParser();
|
var parser = new LinkParser();
|
||||||
var stuff = Jsoup.parseBodyFragment("<a href='"+href+"''>test</a>");
|
var stuff = Jsoup.parseBodyFragment("<a href='"+href+"''>test</a>");
|
||||||
var lnk = parser.parseLink(
|
var lnk = parser.parseLink(
|
||||||
@ -43,6 +42,7 @@ class LinkParserTest {
|
|||||||
void testAnchor() throws URISyntaxException {
|
void testAnchor() throws URISyntaxException {
|
||||||
assertNull(parseLink("#test", "/"));
|
assertNull(parseLink("#test", "/"));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void testRelative() throws URISyntaxException {
|
void testRelative() throws URISyntaxException {
|
||||||
assertEquals("http://www.marginalia.nu/test", parseLink("../test", "/"));
|
assertEquals("http://www.marginalia.nu/test", parseLink("../test", "/"));
|
||||||
@ -51,4 +51,32 @@ class LinkParserTest {
|
|||||||
assertEquals("http://www.marginalia.nu/test", parseLink("../test", "/foo/index.html"));
|
assertEquals("http://www.marginalia.nu/test", parseLink("../test", "/foo/index.html"));
|
||||||
assertEquals("http://www.marginalia.nu/test", parseLink("/test", "/foo/index.html"));
|
assertEquals("http://www.marginalia.nu/test", parseLink("/test", "/foo/index.html"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private EdgeUrl getBaseUrl(String href, EdgeUrl documentUrl) {
|
||||||
|
LinkParser lp = new LinkParser();
|
||||||
|
|
||||||
|
return lp.getBaseLink(Jsoup.parse("<base href=\"" + href + "\" />"), documentUrl);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void getBaseUrlTest() throws URISyntaxException {
|
||||||
|
assertEquals(new EdgeUrl("https://www.marginalia.nu/base"),
|
||||||
|
getBaseUrl("/base", new EdgeUrl("https://www.marginalia.nu/test/foo.bar")));
|
||||||
|
|
||||||
|
assertEquals(new EdgeUrl("https://memex.marginalia.nu/base"),
|
||||||
|
getBaseUrl("https://memex.marginalia.nu/base", new EdgeUrl("https://www.marginalia.nu/test/foo.bar")));
|
||||||
|
|
||||||
|
assertEquals(new EdgeUrl("https://www.marginalia.nu/test/base"),
|
||||||
|
getBaseUrl("base", new EdgeUrl("https://www.marginalia.nu/test/foo.bar")));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testParseBadBaseLink() throws URISyntaxException {
|
||||||
|
LinkParser lp = new LinkParser();
|
||||||
|
var url = new EdgeUrl("https://memex.marginalia.nu/");
|
||||||
|
|
||||||
|
assertEquals(url, lp.getBaseLink(Jsoup.parse("<base href/>"), url));
|
||||||
|
assertEquals(url, lp.getBaseLink(Jsoup.parse("<base target=\"foo\"/>"), url));
|
||||||
|
assertEquals(url, lp.getBaseLink(Jsoup.parse("<base href=\"http://\"/>"), url));
|
||||||
|
}
|
||||||
}
|
}
|
Loading…
Reference in New Issue
Block a user