From e9e8580913d8e63ed0dd0c5fd47c8a4d4f20ba44 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 25 Sep 2024 12:10:26 +0200 Subject: [PATCH] (converter) Fix NPE bugs in converter due to the reintroduction of CrawledDocument.headers --- .../converting/processor/AcceptableAds.java | 5 ++++- .../plugin/HtmlDocumentProcessorPlugin.java | 12 ++++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/AcceptableAds.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/AcceptableAds.java index f75c35ad..7bb26e7d 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/AcceptableAds.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/AcceptableAds.java @@ -17,6 +17,9 @@ public class AcceptableAds { } public static boolean hasAcceptableAdsHeader(CrawledDocument document) { - return document.headers.contains("X-Adblock-Key"); + if (document.headers != null) { + return document.headers.contains("X-Adblock-Key"); + } + return false; } } diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java index ccb8a383..3e93e5cd 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java @@ -39,6 +39,7 @@ import org.slf4j.LoggerFactory; import java.net.URISyntaxException; import java.util.EnumSet; import java.util.HashSet; +import java.util.Objects; import java.util.Set; import static nu.marginalia.converting.model.DisqualifiedException.DisqualificationReason; @@ -127,7 +128,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin final EdgeUrl url = new EdgeUrl(crawledDocument.url); - final var generatorParts = documentGeneratorExtractor.detectGenerator(doc, crawledDocument.headers); + final var generatorParts = documentGeneratorExtractor.detectGenerator(doc, + Objects.requireNonNullElse(crawledDocument.headers, "") + ); final var specialization = htmlProcessorSpecializations.select(generatorParts, url); @@ -162,7 +165,12 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin throw new DisqualifiedException(DisqualificationReason.QUALITY); } - PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, standard, true); + PubDate pubDate = pubDateSniffer.getPubDate( + Objects.requireNonNullElse(crawledDocument.headers, ""), + url, + doc, + standard, + true); EnumSet documentFlags = documentFlags(features, generatorParts.type());