From 25d086c4e1754288aa3c8e30bde9db9cf5970e64 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 25 Dec 2023 15:07:36 +0100 Subject: [PATCH] (crawler) Clean up stale warc files We should probably have an option to keep them, but not by default! --- .../src/main/java/nu/marginalia/crawl/CrawlerMain.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java index f4b5b1e9..12aad1ed 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java @@ -254,13 +254,14 @@ public class CrawlerMain { logger.info("Fetched {}", domain); } catch (Exception e) { logger.error("Error fetching domain " + domain, e); - Files.deleteIfExists(newWarcFile); - Files.deleteIfExists(tempFile); } finally { // We don't need to double-count these; it's also kept int he workLog processingIds.remove(domain); Thread.currentThread().setName("[idle]"); + + Files.deleteIfExists(newWarcFile); + Files.deleteIfExists(tempFile); } }