Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git
(crawler) Properly enqueue links from the root document in the crawler
parent db5faeceee
commit eb60ddb729
@@ -217,6 +217,7 @@ public class CrawlerRetreiver implements AutoCloseable {
         // Sniff the software based on the sample document
         var doc = optDoc.get();
         crawlFrontier.setLinkFilter(linkFilterSelector.selectFilter(doc));
+        crawlFrontier.enqueueLinksFromDocument(url, doc);

         EdgeUrl faviconUrl = url.withPathAndParam("/favicon.ico", null);
         Optional<EdgeUrl> sitemapUrl = Optional.empty();
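For context, the added call hands the root document's outgoing links to the crawl frontier instead of discarding them. A minimal sketch of what such an enqueue step typically involves, assuming a jsoup Document and an illustrative frontier class (SimpleCrawlFrontier and its fields are hypothetical, not Marginalia's actual frontier implementation):

import org.jsoup.nodes.Document;

import java.util.ArrayDeque;
import java.util.HashSet;
import java.util.Queue;
import java.util.Set;
import java.util.function.Predicate;

// Illustrative sketch only: a deduplicating frontier that applies a link filter
// before admitting URLs extracted from a parsed document.
class SimpleCrawlFrontier {
    private final Queue<String> queue = new ArrayDeque<>();
    private final Set<String> seen = new HashSet<>();
    private Predicate<String> linkFilter = url -> true;

    void setLinkFilter(Predicate<String> filter) {
        this.linkFilter = filter;
    }

    // Resolve each anchor to an absolute URL and enqueue it once,
    // subject to the currently configured link filter.
    void enqueueLinksFromDocument(Document doc) {
        for (var anchor : doc.select("a[href]")) {
            String url = anchor.absUrl("href");
            if (!url.isEmpty() && linkFilter.test(url) && seen.add(url)) {
                queue.add(url);
            }
        }
    }

    String next() {
        return queue.poll();
    }
}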
@@ -74,6 +74,48 @@ class CrawlerRetreiverTest {
         }
     }

+    @Test
+    public void testWtf() throws IOException {
+        var specs = CrawlSpecProvider.CrawlSpecRecord
+                .builder()
+                .crawlDepth(5)
+                .domain("indigo.re")
+                .urls(List.of("https://indigo.re/"))
+                .build();
+        Path tempFile = null;
+        try {
+            tempFile = Files.createTempFile("crawling-process", "warc");
+
+            doCrawl(tempFile, specs);
+
+            Set<String> requests = new HashSet<>();
+            Set<String> responses = new HashSet<>();
+
+            try (var reader = new WarcReader(tempFile)) {
+                reader.forEach(record -> {
+                    if (record instanceof WarcRequest req) {
+                        requests.add(req.target());
+                        System.out.println(req.type() + ":" + req.target());
+                    }
+                    else if (record instanceof WarcResponse rsp) {
+                        responses.add(rsp.target());
+                        System.out.println(rsp.type() + ":" + rsp.target());
+                    }
+                    else {
+                        System.out.println(record.type());
+                    }
+                });
+            }
+
+            assertTrue(requests.contains("https://www.marginalia.nu/misc/debian-laptop-install-log/"));
+            assertEquals(requests, responses);
+        }
+        finally {
+            if (tempFile != null)
+                Files.deleteIfExists(tempFile);
+        }
+    }
+
     @Test
     public void testWarcOutput() throws IOException {
         var specs = CrawlSpecProvider.CrawlSpecRecord
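The new test replays the crawler's WARC output with jwarc's WarcReader and checks that every recorded request has a matching response. Along the same lines, a small standalone sketch that tallies the HTTP status code per fetched URL from such a WARC file (the class name and default file path are placeholders, not part of this commit):

import org.netpreserve.jwarc.WarcReader;
import org.netpreserve.jwarc.WarcResponse;

import java.io.IOException;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.Map;

// Standalone sketch: summarize HTTP status codes per URL in a crawler-produced WARC.
public class WarcStatusSummary {
    public static void main(String[] args) throws IOException {
        Path warcFile = Path.of(args.length > 0 ? args[0] : "crawl.warc");

        Map<String, Integer> statusByUrl = new HashMap<>();
        try (var reader = new WarcReader(warcFile)) {
            for (var record : reader) {
                if (record instanceof WarcResponse rsp) {
                    // http() parses the captured HTTP response headers
                    statusByUrl.put(rsp.target(), rsp.http().status());
                }
            }
        }

        statusByUrl.forEach((url, status) -> System.out.println(status + " " + url));
    }
}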