mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(crawler) Code quality
This commit is contained in:
parent
6dd87b0378
commit
a86b596897
@ -69,11 +69,11 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
|
||||
private final Map<String, String> processingIds = new ConcurrentHashMap<>();
|
||||
|
||||
final AbortMonitor abortMonitor = AbortMonitor.getInstance();
|
||||
private final AbortMonitor abortMonitor = AbortMonitor.getInstance();
|
||||
private final AtomicInteger tasksDone = new AtomicInteger(0);
|
||||
private final HttpFetcherImpl fetcher;
|
||||
|
||||
volatile int totalTasks;
|
||||
final AtomicInteger tasksDone = new AtomicInteger(0);
|
||||
private HttpFetcherImpl fetcher;
|
||||
private volatile int totalTasks;
|
||||
|
||||
@Inject
|
||||
public CrawlerMain(UserAgent userAgent,
|
||||
@ -263,6 +263,8 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
CrawledDocumentParquetRecordFileWriter
|
||||
.convertWarc(domain, userAgent, newWarcFile, parquetFile);
|
||||
|
||||
// Optionally archive the WARC file if full retention is enabled,
|
||||
// otherwise delete it:
|
||||
warcArchiver.consumeWarc(newWarcFile, domain);
|
||||
|
||||
workLog.setJobToFinished(domain, parquetFile.toString(), size);
|
||||
|
@ -86,8 +86,6 @@ public class CrawlerRevisitor {
|
||||
// fashion to make sure we eventually catch changes over time
|
||||
// and ensure we discover new links
|
||||
|
||||
crawlFrontier.addVisited(url);
|
||||
|
||||
// Hoover up any links from the document
|
||||
crawlFrontier.enqueueLinksFromDocument(url, Jsoup.parse(doc.documentBody));
|
||||
|
||||
|
@ -3,6 +3,7 @@ package nu.marginalia.crawl.warc;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
|
||||
/** Interface for archiving warc files. */
|
||||
public interface WarcArchiverIf extends AutoCloseable {
|
||||
/** Process the warc file. After processing, the warc file is deleted.
|
||||
* Processing may be a no-op, depending on the implementation.
|
||||
|
Loading…
Reference in New Issue
Block a user