mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-23 21:18:58 +00:00)
(crawler) Properly enqueue links from the root document in the crawler
This commit is contained in:
parent db5faeceee
commit eb60ddb729
@@ -217,6 +217,7 @@ public class CrawlerRetreiver implements AutoCloseable {
         // Sniff the software based on the sample document
         var doc = optDoc.get();
         crawlFrontier.setLinkFilter(linkFilterSelector.selectFilter(doc));
+        crawlFrontier.enqueueLinksFromDocument(url, doc);

         EdgeUrl faviconUrl = url.withPathAndParam("/favicon.ico", null);
         Optional<EdgeUrl> sitemapUrl = Optional.empty();
@@ -74,6 +74,48 @@ class CrawlerRetreiverTest {
         }
     }

+    @Test
+    public void testWtf() throws IOException {
+        var specs = CrawlSpecProvider.CrawlSpecRecord
+                .builder()
+                .crawlDepth(5)
+                .domain("indigo.re")
+                .urls(List.of("https://indigo.re/"))
+                .build();
+        Path tempFile = null;
+        try {
+            tempFile = Files.createTempFile("crawling-process", "warc");
+
+            doCrawl(tempFile, specs);
+
+            Set<String> requests = new HashSet<>();
+            Set<String> responses = new HashSet<>();
+
+            try (var reader = new WarcReader(tempFile)) {
+                reader.forEach(record -> {
+                    if (record instanceof WarcRequest req) {
+                        requests.add(req.target());
+                        System.out.println(req.type() + ":" + req.target());
+                    }
+                    else if (record instanceof WarcResponse rsp) {
+                        responses.add(rsp.target());
+                        System.out.println(rsp.type() + ":" + rsp.target());
+                    }
+                    else {
+                        System.out.println(record.type());
+                    }
+                });
+            }
+
+            assertTrue(requests.contains("https://www.marginalia.nu/misc/debian-laptop-install-log/"));
+            assertEquals(requests, responses);
+        }
+        finally {
+            if (tempFile != null)
+                Files.deleteIfExists(tempFile);
+        }
+    }
+
     @Test
     public void testWarcOutput() throws IOException {
         var specs = CrawlSpecProvider.CrawlSpecRecord
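
Note on the change: the crawlFrontier.enqueueLinksFromDocument(url, doc) call added in the first hunk is what feeds the root document's outgoing links into the crawl frontier; the commit message and the diff suggest that, previously, the sampled root document was only used to sniff the software and select a link filter, so links found on the root page were not being queued from this code path. Below is a minimal, self-contained sketch of that kind of link extraction using Jsoup. The class and method names are hypothetical; this illustrates the idea, not the project's actual crawl frontier implementation.

// Illustrative sketch only, not the project's crawl frontier code: shows the
// kind of work a method like enqueueLinksFromDocument is expected to do, i.e.
// collect the anchors of an already-parsed document as absolute URLs so they
// can be handed to the crawl queue. Assumes the Document was parsed with a
// base URI (e.g. Jsoup.parse(html, url)) so that abs:href can resolve
// relative links.
import org.jsoup.nodes.Document;

import java.util.ArrayList;
import java.util.List;

class RootLinkExtractionSketch {
    static List<String> linksFrom(Document rootDoc) {
        List<String> links = new ArrayList<>();
        for (var anchor : rootDoc.select("a[href]")) {
            String resolved = anchor.attr("abs:href"); // absolute form of the href attribute
            if (!resolved.isBlank())
                links.add(resolved);
        }
        return links;
    }
}

The real frontier presumably also applies the link filter selected just above the added call and deduplicates URLs against pages already visited or queued; the sketch leaves that out.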