From a86b59689736cf804f368db468b5251f06a3f36e Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 22 Apr 2024 15:37:35 +0200 Subject: [PATCH] (crawler) Code quality --- .../java/nu/marginalia/crawl/CrawlerMain.java | 10 ++++++---- .../crawl/retreival/revisit/CrawlerRevisitor.java | 2 -- .../java/nu/marginalia/crawl/warc/WarcArchiverIf.java | 1 + 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java index be152d38..1b04c0f9 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java @@ -69,11 +69,11 @@ public class CrawlerMain extends ProcessMainClass { private final Map processingIds = new ConcurrentHashMap<>(); - final AbortMonitor abortMonitor = AbortMonitor.getInstance(); + private final AbortMonitor abortMonitor = AbortMonitor.getInstance(); + private final AtomicInteger tasksDone = new AtomicInteger(0); + private final HttpFetcherImpl fetcher; - volatile int totalTasks; - final AtomicInteger tasksDone = new AtomicInteger(0); - private HttpFetcherImpl fetcher; + private volatile int totalTasks; @Inject public CrawlerMain(UserAgent userAgent, @@ -263,6 +263,8 @@ public class CrawlerMain extends ProcessMainClass { CrawledDocumentParquetRecordFileWriter .convertWarc(domain, userAgent, newWarcFile, parquetFile); + // Optionally archive the WARC file if full retention is enabled, + // otherwise delete it: warcArchiver.consumeWarc(newWarcFile, domain); workLog.setJobToFinished(domain, parquetFile.toString(), size); diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java index 4c091302..55dbb3c2 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java @@ -86,8 +86,6 @@ public class CrawlerRevisitor { // fashion to make sure we eventually catch changes over time // and ensure we discover new links - crawlFrontier.addVisited(url); - // Hoover up any links from the document crawlFrontier.enqueueLinksFromDocument(url, Jsoup.parse(doc.documentBody)); diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/warc/WarcArchiverIf.java b/code/processes/crawling-process/java/nu/marginalia/crawl/warc/WarcArchiverIf.java index 80e64d7a..cc9eb8e8 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/warc/WarcArchiverIf.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/warc/WarcArchiverIf.java @@ -3,6 +3,7 @@ package nu.marginalia.crawl.warc; import java.io.IOException; import java.nio.file.Path; +/** Interface for archiving warc files. */ public interface WarcArchiverIf extends AutoCloseable { /** Process the warc file. After processing, the warc file is deleted. * Processing may be a no-op, depending on the implementation.