From 84563b0d4683ed878dea58b3879047fada5d072f Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 25 Dec 2023 00:55:05 +0100 Subject: [PATCH] (crawler) Be a bit more conservative about pulling etags and so on if the previous fetch wasn't OK --- .../crawl/retreival/revisit/DocumentWithReference.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java index a1533480..c604ff5b 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java @@ -49,6 +49,9 @@ public record DocumentWithReference( if (null == doc) return ContentTags.empty(); + if (doc.documentBody == null || doc.httpStatus != 200) + return ContentTags.empty(); + String lastmod = doc.getLastModified(); String etag = doc.getEtag();