From 11c26e700e60708e55495285195b955b37f56a05 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 30 Jun 2023 17:08:24 +0200 Subject: [PATCH] Remove annoying log spam in crawler retriever --- .../nu/marginalia/crawl/retreival/CrawlerRetreiver.java | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index b81ea431..09352765 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -227,12 +227,12 @@ public class CrawlerRetreiver { crawlFrontier.addAllToQueue(sitemap); } - logger.info("Queue is now {}", crawlFrontier.queueSize()); + logger.debug("Queue is now {}", crawlFrontier.queueSize()); } private void sniffRootDocument() { try { - logger.info("Configuring link filter"); + logger.debug("Configuring link filter"); var url = crawlFrontier.peek(); @@ -241,6 +241,9 @@ public class CrawlerRetreiver { return; var sample = maybeSample.get(); + if (sample.documentBody == null) + return; + // Sniff the software based on the sample document var doc = Jsoup.parse(sample.documentBody.decode()); crawlFrontier.setLinkFilter(linkFilterSelector.selectFilter(doc));