(crawler) Properly enqueue links from the root document in the crawler

Viktor Lofgren 2024-10-05 19:04:49 +02:00
parent 2a1077ff43
commit fe800b3af7


@@ -116,6 +116,48 @@ class CrawlerRetreiverTest {
        }
    }

    @Test
    public void testWarcOutputNoKnownUrls() throws IOException {
        var specs = CrawlSpecProvider.CrawlSpecRecord
                .builder()
                .crawlDepth(5)
                .domain("www.marginalia.nu")
                .urls(List.of())
                .build();

        Path tempFile = null;
        try {
            tempFile = Files.createTempFile("crawling-process", "warc");

            doCrawl(tempFile, specs);

            Set<String> requests = new HashSet<>();
            Set<String> responses = new HashSet<>();

            try (var reader = new WarcReader(tempFile)) {
                reader.forEach(record -> {
                    if (record instanceof WarcRequest req) {
                        requests.add(req.target());
                        System.out.println(req.type() + ":" + req.target());
                    }
                    else if (record instanceof WarcResponse rsp) {
                        responses.add(rsp.target());
                        System.out.println(rsp.type() + ":" + rsp.target());
                    }
                    else {
                        System.out.println(record.type());
                    }
                });
            }

            assertTrue(responses.size() > 5, "Should have fetched more than 5 URLs");
            assertEquals(requests, responses);
        }
        finally {
            if (tempFile != null)
                Files.deleteIfExists(tempFile);
        }
    }

    @SneakyThrows
    @Test
    public void testResync() throws IOException {