diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index b81ea431..09352765 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -227,12 +227,12 @@ public class CrawlerRetreiver { crawlFrontier.addAllToQueue(sitemap); } - logger.info("Queue is now {}", crawlFrontier.queueSize()); + logger.debug("Queue is now {}", crawlFrontier.queueSize()); } private void sniffRootDocument() { try { - logger.info("Configuring link filter"); + logger.debug("Configuring link filter"); var url = crawlFrontier.peek(); @@ -241,6 +241,9 @@ public class CrawlerRetreiver { return; var sample = maybeSample.get(); + if (sample.documentBody == null) + return; + // Sniff the software based on the sample document var doc = Jsoup.parse(sample.documentBody.decode()); crawlFrontier.setLinkFilter(linkFilterSelector.selectFilter(doc));