(crawler) Code quality

This commit is contained in:
Viktor Lofgren 2024-04-22 15:37:35 +02:00
parent 6dd87b0378
commit a86b596897
3 changed files with 7 additions and 6 deletions

View File

@ -69,11 +69,11 @@ public class CrawlerMain extends ProcessMainClass {
private final Map<String, String> processingIds = new ConcurrentHashMap<>();
final AbortMonitor abortMonitor = AbortMonitor.getInstance();
private final AbortMonitor abortMonitor = AbortMonitor.getInstance();
private final AtomicInteger tasksDone = new AtomicInteger(0);
private final HttpFetcherImpl fetcher;
volatile int totalTasks;
final AtomicInteger tasksDone = new AtomicInteger(0);
private HttpFetcherImpl fetcher;
private volatile int totalTasks;
@Inject
public CrawlerMain(UserAgent userAgent,
@ -263,6 +263,8 @@ public class CrawlerMain extends ProcessMainClass {
CrawledDocumentParquetRecordFileWriter
.convertWarc(domain, userAgent, newWarcFile, parquetFile);
// Optionally archive the WARC file if full retention is enabled,
// otherwise delete it:
warcArchiver.consumeWarc(newWarcFile, domain);
workLog.setJobToFinished(domain, parquetFile.toString(), size);

View File

@ -86,8 +86,6 @@ public class CrawlerRevisitor {
// fashion to make sure we eventually catch changes over time
// and ensure we discover new links
crawlFrontier.addVisited(url);
// Hoover up any links from the document
crawlFrontier.enqueueLinksFromDocument(url, Jsoup.parse(doc.documentBody));

View File

@ -3,6 +3,7 @@ package nu.marginalia.crawl.warc;
import java.io.IOException;
import java.nio.file.Path;
/** Interface for archiving warc files. */
public interface WarcArchiverIf extends AutoCloseable {
/** Process the warc file. After processing, the warc file is deleted.
* Processing may be a no-op, depending on the implementation.