(crawler) Properly enqueue links from the root document in the crawler

Viktor Lofgren 2024-10-05 17:49:39 +02:00
parent db5faeceee
commit eb60ddb729
2 changed files with 43 additions and 0 deletions


@@ -217,6 +217,7 @@ public class CrawlerRetreiver implements AutoCloseable {
        // Sniff the software based on the sample document
        var doc = optDoc.get();
        crawlFrontier.setLinkFilter(linkFilterSelector.selectFilter(doc));
        crawlFrontier.enqueueLinksFromDocument(url, doc);

        EdgeUrl faviconUrl = url.withPathAndParam("/favicon.ico", null);
        Optional<EdgeUrl> sitemapUrl = Optional.empty();
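
This one added line is the whole fix: the root document is already fetched and parsed so the crawler can sniff the site's software, but per the commit message its links were previously not enqueued properly. The crawl frontier itself is not part of this diff, so the following is only a rough sketch of what enqueueLinksFromDocument presumably does, assuming the sniffed document is a parsed Jsoup Document; the class, queue, and filter names are made up for illustration.

// Illustrative sketch only -- not the actual crawl frontier implementation.
import org.jsoup.nodes.Document;

import java.util.ArrayDeque;
import java.util.Queue;
import java.util.function.Predicate;

class CrawlFrontierSketch {
    private final Queue<String> queue = new ArrayDeque<>();
    private Predicate<String> linkFilter = url -> true;

    void setLinkFilter(Predicate<String> filter) {
        this.linkFilter = filter;
    }

    void enqueueLinksFromDocument(String baseUrl, Document doc) {
        // Make relative hrefs resolve against the page they were found on
        doc.setBaseUri(baseUrl);

        // Queue the absolute URL of every anchor in the document,
        // subject to the currently configured link filter
        for (var aTag : doc.select("a[href]")) {
            String href = aTag.absUrl("href");
            if (!href.isBlank() && linkFilter.test(href)) {
                queue.add(href);
            }
        }
    }
}

The test added below exercises the end-to-end effect: it crawls a small live site and inspects the request and response records written to the resulting WARC file.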


@@ -74,6 +74,48 @@ class CrawlerRetreiverTest {
        }
    }

    @Test
    public void testWtf() throws IOException {
        var specs = CrawlSpecProvider.CrawlSpecRecord
                .builder()
                .crawlDepth(5)
                .domain("indigo.re")
                .urls(List.of("https://indigo.re/"))
                .build();

        Path tempFile = null;
        try {
            tempFile = Files.createTempFile("crawling-process", "warc");

            doCrawl(tempFile, specs);

            Set<String> requests = new HashSet<>();
            Set<String> responses = new HashSet<>();

            try (var reader = new WarcReader(tempFile)) {
                reader.forEach(record -> {
                    if (record instanceof WarcRequest req) {
                        requests.add(req.target());
                        System.out.println(req.type() + ":" + req.target());
                    }
                    else if (record instanceof WarcResponse rsp) {
                        responses.add(rsp.target());
                        System.out.println(rsp.type() + ":" + rsp.target());
                    }
                    else {
                        System.out.println(record.type());
                    }
                });
            }

            // The crawl is seeded with only the root document; the links it contains
            // should have been enqueued and fetched as well
            assertTrue(requests.contains("https://indigo.re/"));
            assertTrue(requests.size() > 1);

            assertEquals(requests, responses);
        }
        finally {
            if (tempFile != null)
                Files.deleteIfExists(tempFile);
        }
    }

    @Test
    public void testWarcOutput() throws IOException {
        var specs = CrawlSpecProvider.CrawlSpecRecord