From c5aab7e8db7c9e411a624ef1c953d68fc693793d Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 25 Dec 2023 00:54:38 +0100 Subject: [PATCH] (warc) Fix NPE in WarcRecorder --- .../nu/marginalia/crawl/retreival/CrawlerRetreiver.java | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index 18035d52..aa7a8f7d 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -289,10 +289,12 @@ public class CrawlerRetreiver implements AutoCloseable { new ContentType(doc.contentType, "UTF-8"), doc.documentBody); - var parsed = Jsoup.parse(doc.documentBody); + if (doc.documentBody != null) { + var parsed = Jsoup.parse(doc.documentBody); - crawlFrontier.enqueueLinksFromDocument(top, parsed); - crawlFrontier.addVisited(top); + crawlFrontier.enqueueLinksFromDocument(top, parsed); + crawlFrontier.addVisited(top); + } } else if (fetchedDoc instanceof HttpFetchResult.ResultException ex) { errorCount ++;