(crawler) Clean up and refactor the code a bit

Viktor Lofgren 2023-07-23 19:06:37 +02:00
parent 69f333c0bf
commit 35b29e4f9e
3 changed files with 6 additions and 5 deletions

CrawledDomainWriter.java

@@ -4,6 +4,7 @@ import com.github.luben.zstd.ZstdOutputStream;
 import com.google.gson.Gson;
 import lombok.SneakyThrows;
 import nu.marginalia.crawling.model.SerializableCrawlData;
+import nu.marginalia.crawling.model.spec.CrawlingSpecification;
 import nu.marginalia.model.gson.GsonFactory;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -25,15 +26,15 @@ public class CrawledDomainWriter implements AutoCloseable {
     private final Path tmpFile;
     private final Path outputFile;
 
-    public CrawledDomainWriter(Path outputDir, String name, String id) throws IOException {
+    public CrawledDomainWriter(Path outputDir, CrawlingSpecification spec) throws IOException {
         this.outputDir = outputDir;
 
         if (!Files.isDirectory(outputDir)) {
             throw new IllegalArgumentException("Output dir " + outputDir + " does not exist");
         }
 
-        tmpFile = getOutputFile(id, name + "_tmp");
-        outputFile = getOutputFile(id, name);
+        tmpFile = getOutputFile(spec.id, spec.domain + "_tmp");
+        outputFile = getOutputFile(spec.id, spec.domain);
         writer = new OutputStreamWriter(new ZstdOutputStream(new BufferedOutputStream(Files.newOutputStream(tmpFile,
                 StandardOpenOption.CREATE, StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING))));
     }
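
For context: the new constructor reads the file-naming inputs straight off the spec instead of taking them as loose strings. A minimal sketch of the CrawlingSpecification shape this assumes (only the members dereferenced above, as public fields per the diff; the real class likely carries further crawl parameters):

    // Sketch, not the project's actual class: only the fields
    // CrawledDomainWriter touches (spec.id, spec.domain) are shown.
    public class CrawlingSpecification {
        public String id;       // forms part of the output file name
        public String domain;   // likewise; suffixed with "_tmp" while writing
    }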

CrawlerMain.java

@@ -199,7 +199,7 @@ public class CrawlerMain implements AutoCloseable {
         HttpFetcher fetcher = new HttpFetcherImpl(userAgent.uaString(), dispatcher, connectionPool);
 
-        try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id)) {
+        try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification)) {
             var retreiver = new CrawlerRetreiver(fetcher, specification, writer::accept);
 
             CrawlDataReference reference = getReference(specification);
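
Because CrawledDomainWriter implements AutoCloseable, the try-with-resources above closes it even when retrieval fails. The close() implementation is not part of this diff; given the tmpFile/outputFile pair, a plausible sketch (an assumption, not the project's actual code):

    // Assumed contract: finish the zstd stream, then move the completed
    // temp file into place so readers never see a half-written crawl file.
    @Override
    public void close() throws IOException {
        writer.close();
        Files.move(tmpFile, outputFile,
                StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.ATOMIC_MOVE);
    }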

CrawlerRetreiverTest.java

@@ -118,7 +118,7 @@ class CrawlerRetreiverTest {
         Path out = Files.createTempDirectory("crawling-process");
 
-        var writer = new CrawledDomainWriter(out, specs.domain, specs.id);
+        var writer = new CrawledDomainWriter(out, specs);
         Map<Class<? extends SerializableCrawlData>, List<SerializableCrawlData>> data = new HashMap<>();
 
         new CrawlerRetreiver(httpFetcher, specs, d -> {
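
Taken together, the new call pattern looks like this (a sketch; the spec values are hypothetical placeholders, field names as in the diff):

    CrawlingSpecification spec = new CrawlingSpecification();
    spec.id = "000042";              // hypothetical id
    spec.domain = "www.example.com"; // hypothetical domain

    Path out = Files.createTempDirectory("crawling-process");
    try (var writer = new CrawledDomainWriter(out, spec)) {
        // writer::accept receives SerializableCrawlData records and
        // streams them, zstd-compressed, into the writer's temp file
    }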