From 785d8deaddb07b507809e3a1e8055be1fff86a26 Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Thu, 1 Feb 2024 20:30:43 +0100
Subject: [PATCH] (crawler) Improve meta-tag redirect handling, add tests for
 redirects.

Wrote a new test to examine the redirect behavior of the crawler, ensuring
that the redirect URL is the URL that is reported in the parquet file. This
works as intended.

Noticed in the course of this that the crawler doesn't add links from
meta-tag redirects to the crawl frontier. Added logic to handle this case,
and amended the test case to verify the new behavior.

Added the meta-redirect case to the HtmlDocumentProcessorPlugin as well, so
that we consider it a link between documents in the unlikely case that a
meta redirect points to another domain.
---
(An illustrative sketch of the meta-refresh parsing follows the diff.)

 .../nu/marginalia/link_parser/LinkParser.java | 30 +++++++-
 .../plugin/HtmlDocumentProcessorPlugin.java   |  3 +
 .../crawl/retreival/CrawlerRetreiver.java     |  5 ++
 .../crawl/retreival/DomainCrawlFrontier.java  |  7 ++
 .../retreival/CrawlerRetreiverTest.java       | 69 +++++++++++++++++--
 5 files changed, 108 insertions(+), 6 deletions(-)

diff --git a/code/features-crawl/link-parser/src/main/java/nu/marginalia/link_parser/LinkParser.java b/code/features-crawl/link-parser/src/main/java/nu/marginalia/link_parser/LinkParser.java
index e4286eab..08ccc221 100644
--- a/code/features-crawl/link-parser/src/main/java/nu/marginalia/link_parser/LinkParser.java
+++ b/code/features-crawl/link-parser/src/main/java/nu/marginalia/link_parser/LinkParser.java
@@ -20,9 +20,11 @@ import java.util.regex.Pattern;
 public class LinkParser {
     private final Logger logger = LoggerFactory.getLogger(getClass());
 
-    private final List<String> blockPrefixList = List.of(
+    // These are schemas that we don't want to try to index
+    private final List<String> blockedSchemaList = List.of(
             "mailto:", "javascript:", "tel:", "itpc:", "#", "file:");
 
+    // These are file suffixes we suspect may be a binary file
     private final List<String> binarySuffixList = List.of(
             ".pdf", ".mp3", ".wmv", ".avi", ".zip", ".7z",
             ".mpv", ".mp4", ".avi", ".mkv", ".tiff", ".dat", ".tar",
@@ -96,6 +98,30 @@ public class LinkParser {
                 .flatMap(this::createEdgeUrl);
     }
 
+    @Contract(pure=true)
+    public Optional<EdgeUrl> parseMetaRedirect(EdgeUrl baseUrl, Element meta) {
+        return Optional.of(meta)
+                .map(l -> l.attr("content"))
+                .flatMap(this::getMetaRedirectUrl)
+                .map(link -> resolveRelativeUrl(baseUrl, link))
+                .flatMap(this::createURI)
+                .map(URI::normalize)
+                .map(this::renormalize)
+                .flatMap(this::createEdgeUrl);
+    }
+
+    // Matches the format of a meta http-equiv=refresh content tag, e.g. '10; url=http://example.com/'
+    private static Pattern metaRedirectPattern = Pattern.compile("^\\d+\\s*;\\s*url=(\\S+)\\s*$");
+    /** Parse the URL from a meta refresh tag, returning only the URL part and
+     * discarding the rest. Returns Optional.empty() on parse error.
+     */
+    private Optional<String> getMetaRedirectUrl(String content) {
+        var matcher = metaRedirectPattern.matcher(content);
+        if (!matcher.find())
+            return Optional.empty();
+        return Optional.ofNullable(matcher.group(1));
+    }
+
     @SneakyThrows
     private URI renormalize(URI uri) {
         if (uri.getPath() == null) {
@@ -191,7 +217,7 @@ public class LinkParser {
         }
 
         href = href.toLowerCase();
-        if (blockPrefixList.stream().anyMatch(href::startsWith)) {
+        if (blockedSchemaList.stream().anyMatch(href::startsWith)) {
             return false;
         }
         if (hasBinarySuffix(href)) {
diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java
index b430f0bf..43c2952a 100644
--- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java
+++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java
@@ -284,6 +284,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         for (var frame : doc.getElementsByTag("iframe")) {
             linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept);
         }
+        for (var meta : doc.select("meta[http-equiv=refresh]")) {
+            linkParser.parseMetaRedirect(baseUrl, meta).ifPresent(lp::accept);
+        }
         for (var link : doc.select("link[rel=alternate]")) {
             feedExtractor
                     .getFeedFromAlternateTag(baseUrl, link)
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
index 49760046..b1abf3e1 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
@@ -79,6 +79,11 @@ public class CrawlerRetreiver implements AutoCloseable {
         }
     }
 
+    // For testing
+    public DomainCrawlFrontier getCrawlFrontier() {
+        return crawlFrontier;
+    }
+
     public int fetch() {
         return fetch(new DomainLinks(), new CrawlDataReference());
     }
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java
index 4b501826..0d0dfc03 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java
@@ -111,6 +111,10 @@ public class DomainCrawlFrontier {
         long hashCode = hasher.hashNearlyASCII(url.toString());
         return visited.contains(hashCode);
     }
+    public boolean isKnown(EdgeUrl url) {
+        long hashCode = hasher.hashNearlyASCII(url.toString());
+        return known.contains(hashCode);
+    }
 
     public boolean filterLink(EdgeUrl url) {
         return linkFilter.test(url);
@@ -162,6 +166,9 @@ public class DomainCrawlFrontier {
         for (var link : parsed.getElementsByTag("frame")) {
             linkParser.parseFrame(baseUrl, link).ifPresent(this::addToQueue);
         }
+        for (var meta : parsed.select("meta[http-equiv=refresh]")) {
+            linkParser.parseMetaRedirect(baseUrl, meta).ifPresent(this::addToQueue);
+        }
         for (var link : parsed.getElementsByTag("iframe")) {
             linkParser.parseFrame(baseUrl, link).ifPresent(this::addToQueue);
         }
diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
index 27d0ec27..811200cc 100644
--- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
+++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
@@ -17,18 +17,19 @@ import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawlspec.CrawlSpecRecord;
+import org.jetbrains.annotations.NotNull;
 import org.junit.jupiter.api.*;
 import org.netpreserve.jwarc.*;
 
 import java.io.IOException;
 import java.io.RandomAccessFile;
+import java.net.URISyntaxException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.*;
 import java.util.stream.Collectors;
 
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.junit.jupiter.api.Assertions.*;
 
 @Tag("slow")
 class CrawlerRetreiverTest {
@@ -209,6 +210,62 @@
 
     }
 
+    @Test
+    public void testRedirect() throws IOException, URISyntaxException {
+        var specs = CrawlSpecRecord
+                .builder()
+                .crawlDepth(3)
+                .domain("www.marginalia.nu")
+                .urls(List.of(
+                        "https://www.marginalia.nu/log/06-optimization.gmi"
+                ))
+                .build();
+
+        List<SerializableCrawlData> data = new ArrayList<>();
+
+        tempFileWarc1 = Files.createTempFile("crawling-process", ".warc");
+
+        DomainCrawlFrontier frontier = doCrawl(tempFileWarc1, specs);
+
+        assertTrue(frontier.isVisited(new EdgeUrl("https://www.marginalia.nu/log/06-optimization.gmi/")));
+        assertTrue(frontier.isVisited(new EdgeUrl("https://www.marginalia.nu/log/06-optimization.gmi")));
+        assertTrue(frontier.isVisited(new EdgeUrl("https://www.marginalia.nu/")));
+
+        assertFalse(frontier.isVisited(new EdgeUrl("https://www.marginalia.nu/log/06-optimization/")));
+        assertTrue(frontier.isKnown(new EdgeUrl("https://www.marginalia.nu/log/06-optimization/")));
+
+        convertToParquet(tempFileWarc1, tempFileParquet1);
+
+        try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.ANY, tempFileParquet1)) {
+            while (stream.hasNext()) {
+                if (stream.next() instanceof CrawledDocument doc) {
+                    data.add(doc);
+                }
+            }
+        } catch (Exception e) {
+            throw new RuntimeException(e);
+        }
+
+        // The URL https://www.marginalia.nu/log/06-optimization.gmi
+        // redirects to https://www.marginalia.nu/log/06-optimization.gmi/ (note the trailing slash)
+        //
+        // Ensure that the redirect is followed, and that the trailing slash is added
+        // to the url as reported in the parquet file.
+
+        var fetchedUrls =
+                data.stream()
+                        .filter(CrawledDocument.class::isInstance)
+                        .map(CrawledDocument.class::cast)
+                        .peek(doc -> System.out.println(doc.url))
+                        .map(doc -> doc.url)
+                        .collect(Collectors.toSet());
+
+        assertEquals(Set.of("https://www.marginalia.nu/",
+                        "https://www.marginalia.nu/log/06-optimization.gmi/"),
+                fetchedUrls);
+
+    }
+
     @Test
     public void testEmptySet() throws IOException {
 
@@ -418,11 +475,15 @@
         }
     }
 
-    private void doCrawl(Path tempFileWarc1, CrawlSpecRecord specs) {
+    @NotNull
+    private DomainCrawlFrontier doCrawl(Path tempFileWarc1, CrawlSpecRecord specs) {
         try (var recorder = new WarcRecorder(tempFileWarc1)) {
-            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch();
+            var crawler = new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder);
+            crawler.fetch();
+            return crawler.getCrawlFrontier();
         } catch (IOException ex) {
             Assertions.fail(ex);
+            return null; // unreachable
         }
     }
 }
\ No newline at end of file
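
Note (illustration, not part of the patch): the sketch below shows the kind of
input the new parseMetaRedirect() path is meant to handle. It reuses the
metaRedirectPattern regex and the "meta[http-equiv=refresh]" selector verbatim
from the diff above, but wraps them in a hypothetical standalone class
(MetaRedirectExample with a helper redirectTarget()) rather than calling
LinkParser itself, since LinkParser's construction and dependencies are not
shown here. It assumes jsoup is on the classpath, which the surrounding code
already implies via Element and doc.select().

    import org.jsoup.Jsoup;

    import java.util.Optional;
    import java.util.regex.Pattern;

    class MetaRedirectExample {
        // Same pattern the patch adds to LinkParser: a delay in seconds,
        // a semicolon, then "url=<target>".
        private static final Pattern META_REDIRECT =
                Pattern.compile("^\\d+\\s*;\\s*url=(\\S+)\\s*$");

        static Optional<String> redirectTarget(String html) {
            // Select the same elements the patch feeds to parseMetaRedirect()
            for (var meta : Jsoup.parse(html).select("meta[http-equiv=refresh]")) {
                var matcher = META_REDIRECT.matcher(meta.attr("content"));
                if (matcher.find()) {
                    return Optional.of(matcher.group(1));
                }
            }
            return Optional.empty();
        }

        public static void main(String[] args) {
            String html = "<html><head>"
                    + "<meta http-equiv=\"refresh\" content=\"10; url=https://www.marginalia.nu/log/06-optimization.gmi/\">"
                    + "</head></html>";

            // Prints the extracted redirect target, or "no redirect"
            System.out.println(redirectTarget(html).orElse("no redirect"));
        }
    }

A tag parsed this way is treated both as a crawl-frontier link by
DomainCrawlFrontier and as a document-to-document link by
HtmlDocumentProcessorPlugin, per the hunks above; the actual LinkParser
additionally resolves the extracted URL against the page's base URL and
normalizes it before producing an EdgeUrl.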