From eb60ddb729503aae916034a634f61989b3530795 Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Sat, 5 Oct 2024 17:49:39 +0200
Subject: [PATCH] (crawler) Properly enqueue links from the root document in the crawler

---
Reviewer notes (this section is ignored when the patch is applied):

* CrawlerRetreiver: after the link filter is chosen from the sample
  document, the same document is now also passed to
  crawlFrontier.enqueueLinksFromDocument(url, doc).
* CrawlerRetreiverTest.testWtf: runs doCrawl() with a spec for domain
  "indigo.re" (crawlDepth 5), reads the resulting WARC file back, collects
  the targets of request and response records into two sets, and asserts
  that the sets are equal and that one specific URL was requested.
* NOTE(review): the asserted URL is on www.marginalia.nu while the crawl
  spec targets indigo.re -- confirm this is intended.
* NOTE(review): the test presumably needs live network access (doCrawl is
  defined outside this patch -- confirm), and it prints every record to
  stdout; the name "testWtf" does not describe what is verified.
* NOTE(review): this copy of the patch was recovered from a whitespace-
  collapsed extract. Indentation and the type arguments in "Set<String>"
  and "Optional<EdgeUrl>" were reconstructed; "Set<String>" follows from
  requests.contains("https://..."), "Optional<EdgeUrl>" is an assumption --
  TODO confirm against the target file before applying. The author email
  on the From: line is missing from the extract and was not restored.

 .../crawl/retreival/CrawlerRetreiver.java     |  1 +
 .../retreival/CrawlerRetreiverTest.java       | 42 +++++++++++++++++++
 2 files changed, 43 insertions(+)

diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
index 3f180222..e204d9c9 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
@@ -217,6 +217,7 @@ public class CrawlerRetreiver implements AutoCloseable {
             // Sniff the software based on the sample document
             var doc = optDoc.get();
             crawlFrontier.setLinkFilter(linkFilterSelector.selectFilter(doc));
+            crawlFrontier.enqueueLinksFromDocument(url, doc);
 
             EdgeUrl faviconUrl = url.withPathAndParam("/favicon.ico", null);
             Optional<EdgeUrl> sitemapUrl = Optional.empty();
diff --git a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
index 88604f57..a5c0d124 100644
--- a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
@@ -74,6 +74,48 @@ class CrawlerRetreiverTest {
         }
     }
 
+    @Test
+    public void testWtf() throws IOException {
+        var specs = CrawlSpecProvider.CrawlSpecRecord
+                .builder()
+                .crawlDepth(5)
+                .domain("indigo.re")
+                .urls(List.of("https://indigo.re/"))
+                .build();
+        Path tempFile = null;
+        try {
+            tempFile = Files.createTempFile("crawling-process", "warc");
+
+            doCrawl(tempFile, specs);
+
+            Set<String> requests = new HashSet<>();
+            Set<String> responses = new HashSet<>();
+
+            try (var reader = new WarcReader(tempFile)) {
+                reader.forEach(record -> {
+                    if (record instanceof WarcRequest req) {
+                        requests.add(req.target());
+                        System.out.println(req.type() + ":" + req.target());
+                    }
+                    else if (record instanceof WarcResponse rsp) {
+                        responses.add(rsp.target());
+                        System.out.println(rsp.type() + ":" + rsp.target());
+                    }
+                    else {
+                        System.out.println(record.type());
+                    }
+                });
+            }
+
+            assertTrue(requests.contains("https://www.marginalia.nu/misc/debian-laptop-install-log/"));
+            assertEquals(requests, responses);
+        }
+        finally {
+            if (tempFile != null)
+                Files.deleteIfExists(tempFile);
+        }
+    }
+
     @Test
     public void testWarcOutput() throws IOException {
         var specs = CrawlSpecProvider.CrawlSpecRecord