Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-24 13:19:02 +00:00
(crawler) Improve meta-tag redirect handling, add tests for redirects.
Wrote a new test to examine the redirect behavior of the crawler, ensuring that the redirect URL is the URL reported in the parquet file. This works as intended.

Noticed in the course of this that the crawler doesn't add links from meta-tag redirects to the crawl frontier. Added logic to handle this case, and amended the test case to verify the new behavior.

Added the meta-redirect case to HtmlDocumentProcessorPlugin as well, so that it is considered a link between documents in the unlikely case that a meta redirect points to another domain.
parent 93a2d5afbf
commit 785d8deadd
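For context, the kind of tag this commit handles is an HTML meta refresh: a head element whose content attribute carries a delay and a target URL. Below is a minimal standalone sketch of selecting such a tag — the HTML sample and class name are invented for illustration; jsoup is the parser the crawler itself uses:

import org.jsoup.Jsoup;

// Minimal sketch of the tag this commit handles (HTML sample invented for
// illustration); jsoup is the parser the crawler itself uses.
public class MetaRedirectDemo {
    public static void main(String[] args) {
        var doc = Jsoup.parse("""
                <html><head>
                <meta http-equiv="refresh" content="5; url=https://www.example.com/moved">
                </head><body></body></html>
                """);

        // The same selector the commit adds to both the crawl frontier
        // and HtmlDocumentProcessorPlugin
        for (var meta : doc.select("meta[http-equiv=refresh]")) {
            System.out.println(meta.attr("content"));
            // prints: 5; url=https://www.example.com/moved
        }
    }
}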
LinkParser.java
@@ -20,9 +20,11 @@ import java.util.regex.Pattern;
 public class LinkParser {
     private final Logger logger = LoggerFactory.getLogger(getClass());
 
-    private final List<String> blockPrefixList = List.of(
+    // These are schemas that we don't want to try to index
+    private final List<String> blockedSchemaList = List.of(
             "mailto:", "javascript:", "tel:", "itpc:", "#", "file:");
 
+    // These are file suffixes we suspect may be a binary file
     private final List<String> binarySuffixList = List.of(
             ".pdf", ".mp3", ".wmv", ".avi", ".zip", ".7z",
             ".mpv", ".mp4", ".avi", ".mkv", ".tiff", ".dat", ".tar",
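The renamed field drives a simple prefix check (shown in a later hunk). A quick standalone illustration of how it classifies a few sample hrefs — the samples are invented, and note that the real code lowercases the href before running this check:

import java.util.List;

// Standalone illustration of the prefix-based schema filter; sample hrefs are
// invented, and the real code lowercases the href before this check.
public class SchemaFilterDemo {
    public static void main(String[] args) {
        List<String> blockedSchemaList = List.of(
                "mailto:", "javascript:", "tel:", "itpc:", "#", "file:");

        List<String> samples = List.of(
                "mailto:hello@example.com",  // blocked: mailto schema
                "#top",                      // blocked: fragment-only link
                "https://example.com/page"); // allowed

        for (String href : samples) {
            boolean blocked = blockedSchemaList.stream().anyMatch(href::startsWith);
            System.out.println(href + " -> blocked=" + blocked);
        }
    }
}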
@@ -96,6 +98,30 @@ public class LinkParser {
                 .flatMap(this::createEdgeUrl);
     }
 
+    @Contract(pure=true)
+    public Optional<EdgeUrl> parseMetaRedirect(EdgeUrl baseUrl, Element meta) {
+        return Optional.of(meta)
+                .map(l -> l.attr("content"))
+                .flatMap(this::getMetaRedirectUrl)
+                .map(link -> resolveRelativeUrl(baseUrl, link))
+                .flatMap(this::createURI)
+                .map(URI::normalize)
+                .map(this::renormalize)
+                .flatMap(this::createEdgeUrl);
+    }
+
+    // Matches the format of a meta http-equiv=refresh content tag, e.g. '10; url=http://example.com/'
+    private static Pattern metaRedirectPattern = Pattern.compile("^\\d+\\s*;\\s*url=(\\S+)\\s*$");
+
+    /** Parse the URL from a meta refresh tag, returning only the URL part and
+     * discarding the rest. Returns Optional.empty() on parse error. */
+    private Optional<String> getMetaRedirectUrl(String content) {
+        var matcher = metaRedirectPattern.matcher(content);
+
+        if (!matcher.find())
+            return Optional.empty();
+        return Optional.ofNullable(matcher.group(1));
+    }
+
     @SneakyThrows
     private URI renormalize(URI uri) {
         if (uri.getPath() == null) {
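To make the anchored pattern's behavior concrete, here is getMetaRedirectUrl lifted into a small standalone program and run against a few invented content values:

import java.util.Optional;
import java.util.regex.Pattern;

// Standalone check of the redirect-content pattern added above, run against
// a few sample values (samples invented for illustration).
public class MetaRedirectPatternDemo {
    private static final Pattern metaRedirectPattern =
            Pattern.compile("^\\d+\\s*;\\s*url=(\\S+)\\s*$");

    static Optional<String> getMetaRedirectUrl(String content) {
        var matcher = metaRedirectPattern.matcher(content);
        if (!matcher.find())
            return Optional.empty();
        return Optional.ofNullable(matcher.group(1));
    }

    public static void main(String[] args) {
        // Matches: numeric delay, semicolon, url=...
        System.out.println(getMetaRedirectUrl("10; url=http://example.com/"));
        // Optional[http://example.com/]

        // No numeric delay up front, so the anchored pattern rejects it
        System.out.println(getMetaRedirectUrl("url=http://example.com/"));
        // Optional.empty

        // A plain delay with no URL is also rejected
        System.out.println(getMetaRedirectUrl("30"));
        // Optional.empty
    }
}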
@@ -191,7 +217,7 @@ public class LinkParser {
         }
         href = href.toLowerCase();
 
-        if (blockPrefixList.stream().anyMatch(href::startsWith)) {
+        if (blockedSchemaList.stream().anyMatch(href::startsWith)) {
             return false;
         }
         if (hasBinarySuffix(href)) {
HtmlDocumentProcessorPlugin.java
@@ -284,6 +284,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         for (var frame : doc.getElementsByTag("iframe")) {
             linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept);
         }
+        for (var meta : doc.select("meta[http-equiv=refresh]")) {
+            linkParser.parseMetaRedirect(baseUrl, meta).ifPresent(lp::accept);
+        }
         for (var link : doc.select("link[rel=alternate]")) {
             feedExtractor
                     .getFeedFromAlternateTag(baseUrl, link)
CrawlerRetreiver.java
@@ -79,6 +79,11 @@ public class CrawlerRetreiver implements AutoCloseable {
         }
     }
 
+    // For testing
+    public DomainCrawlFrontier getCrawlFrontier() {
+        return crawlFrontier;
+    }
+
     public int fetch() {
         return fetch(new DomainLinks(), new CrawlDataReference());
     }
DomainCrawlFrontier.java
@@ -111,6 +111,10 @@ public class DomainCrawlFrontier {
         long hashCode = hasher.hashNearlyASCII(url.toString());
         return visited.contains(hashCode);
     }
+    public boolean isKnown(EdgeUrl url) {
+        long hashCode = hasher.hashNearlyASCII(url.toString());
+        return known.contains(hashCode);
+    }
 
     public boolean filterLink(EdgeUrl url) {
         return linkFilter.test(url);
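The new isKnown complements isVisited: judging by the names and the assertions in the new test further down, a URL becomes known as soon as it enters the frontier, but visited only once it has actually been fetched. A rough sketch of that bookkeeping, with a plain HashSet<Long> and a generic stand-in hash in place of the project's hashNearlyASCII:

import java.util.HashSet;
import java.util.Set;

// Illustrative stand-in for DomainCrawlFrontier's bookkeeping; the hash below
// is a generic polynomial hash, not the project's hashNearlyASCII, and the
// known/visited semantics are inferred from the new test's assertions.
class FrontierSketch {
    private final Set<Long> known = new HashSet<>();
    private final Set<Long> visited = new HashSet<>();

    private long hash(String url) {
        long h = 1125899906842597L;
        for (int i = 0; i < url.length(); i++)
            h = 31 * h + url.charAt(i);
        return h;
    }

    // Discovered links enter the frontier: known, but not yet visited
    public void addToQueue(String url) { known.add(hash(url)); }

    // Fetched documents are both known and visited
    public void markVisited(String url) { known.add(hash(url)); visited.add(hash(url)); }

    public boolean isKnown(String url)   { return known.contains(hash(url)); }
    public boolean isVisited(String url) { return visited.contains(hash(url)); }

    public static void main(String[] args) {
        var frontier = new FrontierSketch();
        frontier.markVisited("https://www.example.com/");
        frontier.addToQueue("https://www.example.com/about");

        System.out.println(frontier.isVisited("https://www.example.com/about")); // false
        System.out.println(frontier.isKnown("https://www.example.com/about"));   // true
    }
}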
@@ -162,6 +166,9 @@ public class DomainCrawlFrontier {
         for (var link : parsed.getElementsByTag("frame")) {
             linkParser.parseFrame(baseUrl, link).ifPresent(this::addToQueue);
         }
+        for (var meta : parsed.select("meta[http-equiv=refresh]")) {
+            linkParser.parseMetaRedirect(baseUrl, meta).ifPresent(this::addToQueue);
+        }
         for (var link : parsed.getElementsByTag("iframe")) {
             linkParser.parseFrame(baseUrl, link).ifPresent(this::addToQueue);
         }
CrawlerRetreiverTest.java
@@ -17,18 +17,19 @@ import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawlspec.CrawlSpecRecord;
+import org.jetbrains.annotations.NotNull;
 import org.junit.jupiter.api.*;
 import org.netpreserve.jwarc.*;
 
 import java.io.IOException;
 import java.io.RandomAccessFile;
+import java.net.URISyntaxException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.*;
 import java.util.stream.Collectors;
 
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.junit.jupiter.api.Assertions.*;
 
 @Tag("slow")
 class CrawlerRetreiverTest {
@@ -209,6 +210,62 @@ class CrawlerRetreiverTest {
 
     }
 
+    @Test
+    public void testRedirect() throws IOException, URISyntaxException {
+        var specs = CrawlSpecRecord
+                .builder()
+                .crawlDepth(3)
+                .domain("www.marginalia.nu")
+                .urls(List.of(
+                        "https://www.marginalia.nu/log/06-optimization.gmi"
+                ))
+                .build();
+
+        List<SerializableCrawlData> data = new ArrayList<>();
+
+        tempFileWarc1 = Files.createTempFile("crawling-process", ".warc");
+
+        DomainCrawlFrontier frontier = doCrawl(tempFileWarc1, specs);
+
+        assertTrue(frontier.isVisited(new EdgeUrl("https://www.marginalia.nu/log/06-optimization.gmi/")));
+        assertTrue(frontier.isVisited(new EdgeUrl("https://www.marginalia.nu/log/06-optimization.gmi")));
+        assertTrue(frontier.isVisited(new EdgeUrl("https://www.marginalia.nu/")));
+
+        assertFalse(frontier.isVisited(new EdgeUrl("https://www.marginalia.nu/log/06-optimization/")));
+        assertTrue(frontier.isKnown(new EdgeUrl("https://www.marginalia.nu/log/06-optimization/")));
+
+        convertToParquet(tempFileWarc1, tempFileParquet1);
+
+        try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.ANY, tempFileParquet1)) {
+            while (stream.hasNext()) {
+                if (stream.next() instanceof CrawledDocument doc) {
+                    data.add(doc);
+                }
+            }
+        } catch (Exception e) {
+            throw new RuntimeException(e);
+        }
+
+        // The URL https://www.marginalia.nu/log/06-optimization.gmi
+        // redirects to https://www.marginalia.nu/log/06-optimization.gmi/ (note the trailing slash)
+        //
+        // Ensure that the redirect is followed, and that the trailing slash is added
+        // to the url as reported in the parquet file.
+
+        var fetchedUrls =
+                data.stream()
+                        .filter(CrawledDocument.class::isInstance)
+                        .map(CrawledDocument.class::cast)
+                        .peek(doc -> System.out.println(doc.url))
+                        .map(doc -> doc.url)
+                        .collect(Collectors.toSet());
+
+        assertEquals(Set.of("https://www.marginalia.nu/",
+                        "https://www.marginalia.nu/log/06-optimization.gmi/"),
+                fetchedUrls);
+
+    }
+
     @Test
     public void testEmptySet() throws IOException {
 
@@ -418,11 +475,15 @@ class CrawlerRetreiverTest {
         }
     }
 
-    private void doCrawl(Path tempFileWarc1, CrawlSpecRecord specs) {
+    @NotNull
+    private DomainCrawlFrontier doCrawl(Path tempFileWarc1, CrawlSpecRecord specs) {
         try (var recorder = new WarcRecorder(tempFileWarc1)) {
-            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch();
+            var crawler = new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder);
+            crawler.fetch();
+            return crawler.getCrawlFrontier();
         } catch (IOException ex) {
             Assertions.fail(ex);
+            return null; // unreachable
         }
     }
 }