From fe800b3af70a3c50419543fc1e7b7f80fdb59f02 Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Sat, 5 Oct 2024 19:04:49 +0200
Subject: [PATCH] (crawler) Properly enqueue links from the root document in the crawler

---
 .../retreival/CrawlerRetreiverTest.java       | 50 +++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
index 88604f57..d82976f2 100644
--- a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
@@ -116,6 +116,56 @@ class CrawlerRetreiverTest {
         }
     }
 
+    /**
+     * Runs {@code doCrawl} for www.marginalia.nu with crawl depth 5 and no
+     * known URLs, then reads the WARC file back and verifies that more than
+     * five distinct response targets were recorded and that the request and
+     * response target sets are equal.
+     */
+    @Test
+    public void testWarcOutputNoKnownUrls() throws IOException {
+        var specs = CrawlSpecProvider.CrawlSpecRecord
+                .builder()
+                .crawlDepth(5)
+                .domain("www.marginalia.nu")
+                .urls(List.of())
+                .build();
+        Path tempFile = null;
+        try {
+            tempFile = Files.createTempFile("crawling-process", "warc");
+
+            doCrawl(tempFile, specs);
+
+            // Distinct targets seen in request and response records, respectively
+            Set<String> requests = new HashSet<>();
+            Set<String> responses = new HashSet<>();
+
+            try (var reader = new WarcReader(tempFile)) {
+                reader.forEach(record -> {
+                    if (record instanceof WarcRequest req) {
+                        requests.add(req.target());
+                        System.out.println(req.type() + ":" + req.target());
+                    }
+                    else if (record instanceof WarcResponse rsp) {
+                        responses.add(rsp.target());
+                        System.out.println(rsp.type() + ":" + rsp.target());
+                    }
+                    else {
+                        System.out.println(record.type());
+                    }
+                });
+            }
+
+            assertTrue(responses.size() > 5, "Should have fetched more than 5 URLs");
+            assertEquals(requests, responses);
+        }
+        finally {
+            // Always remove the temp file, even if the crawl or an assertion failed
+            if (tempFile != null)
+                Files.deleteIfExists(tempFile);
+        }
+    }
+
     @SneakyThrows
     @Test
     public void testResync() throws IOException {