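Reduce log spam in crawler retriever (demote queue-size and link-filter messages from info to debug) and skip root-document sniffing when the sample has no document body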

This commit is contained in:
Viktor Lofgren 2023-06-30 17:08:24 +02:00
parent 8274e8a953
commit 11c26e700e

View File

@ -227,12 +227,12 @@ public class CrawlerRetreiver {
crawlFrontier.addAllToQueue(sitemap); crawlFrontier.addAllToQueue(sitemap);
} }
logger.info("Queue is now {}", crawlFrontier.queueSize()); logger.debug("Queue is now {}", crawlFrontier.queueSize());
} }
private void sniffRootDocument() { private void sniffRootDocument() {
try { try {
logger.info("Configuring link filter"); logger.debug("Configuring link filter");
var url = crawlFrontier.peek(); var url = crawlFrontier.peek();
@ -241,6 +241,9 @@ public class CrawlerRetreiver {
return; return;
var sample = maybeSample.get(); var sample = maybeSample.get();
if (sample.documentBody == null)
return;
// Sniff the software based on the sample document // Sniff the software based on the sample document
var doc = Jsoup.parse(sample.documentBody.decode()); var doc = Jsoup.parse(sample.documentBody.decode());
crawlFrontier.setLinkFilter(linkFilterSelector.selectFilter(doc)); crawlFrontier.setLinkFilter(linkFilterSelector.selectFilter(doc));