From 214551f1df65ab29b12fd32a1e6377c460954d32 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 19 Apr 2024 20:36:01 +0200 Subject: [PATCH] (converter) Stopgap fix for some cases of lost crawl data due to HTTP 304. The root cause needs further investigation. --- .../nu/marginalia/converting/processor/DomainProcessor.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java index ac10bcb9..7ec0bf29 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -193,6 +193,8 @@ public class DomainProcessor { continue; if (doc.url == null) continue; + if (doc.documentBody.isBlank()) + continue; if (!processedUrls.add(doc.url)) continue;