From 35b29e4f9e49cdda78b71b57b8942f5fd25217ac Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 23 Jul 2023 19:06:37 +0200 Subject: [PATCH] (crawler) Clean up and refactor the code a bit --- .../nu/marginalia/crawling/io/CrawledDomainWriter.java | 7 ++++--- .../src/main/java/nu/marginalia/crawl/CrawlerMain.java | 2 +- .../crawling/retreival/CrawlerRetreiverTest.java | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainWriter.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainWriter.java index 83582212..1598428a 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainWriter.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainWriter.java @@ -4,6 +4,7 @@ import com.github.luben.zstd.ZstdOutputStream; import com.google.gson.Gson; import lombok.SneakyThrows; import nu.marginalia.crawling.model.SerializableCrawlData; +import nu.marginalia.crawling.model.spec.CrawlingSpecification; import nu.marginalia.model.gson.GsonFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -25,15 +26,15 @@ public class CrawledDomainWriter implements AutoCloseable { private final Path tmpFile; private final Path outputFile; - public CrawledDomainWriter(Path outputDir, String name, String id) throws IOException { + public CrawledDomainWriter(Path outputDir, CrawlingSpecification spec) throws IOException { this.outputDir = outputDir; if (!Files.isDirectory(outputDir)) { throw new IllegalArgumentException("Output dir " + outputDir + " does not exist"); } - tmpFile = getOutputFile(id, name + "_tmp"); - outputFile = getOutputFile(id, name); + tmpFile = getOutputFile(spec.id, spec.domain + "_tmp"); + outputFile = getOutputFile(spec.id, spec.domain); writer = new OutputStreamWriter(new ZstdOutputStream(new BufferedOutputStream(Files.newOutputStream(tmpFile, StandardOpenOption.CREATE, StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING)))); } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java index 2c7e6e41..f633a294 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java @@ -199,7 +199,7 @@ public class CrawlerMain implements AutoCloseable { HttpFetcher fetcher = new HttpFetcherImpl(userAgent.uaString(), dispatcher, connectionPool); - try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id)) { + try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification)) { var retreiver = new CrawlerRetreiver(fetcher, specification, writer::accept); CrawlDataReference reference = getReference(specification); diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java index bb4dd6f4..96f475a9 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java @@ -118,7 +118,7 @@ class CrawlerRetreiverTest { Path out = Files.createTempDirectory("crawling-process"); - var writer = new CrawledDomainWriter(out, specs.domain, specs.id); + var writer = new CrawledDomainWriter(out, specs); Map, List> data = new HashMap<>(); new CrawlerRetreiver(httpFetcher, specs, d -> {