MarginaliaSearch/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java

package plan;

import lombok.AllArgsConstructor;
import lombok.NoArgsConstructor;
import lombok.ToString;

import nu.marginalia.crawling.io.CrawledDomainReader;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.crawling.model.spec.CrawlerSpecificationLoader;
import nu.marginalia.crawling.model.spec.CrawlingSpecification;
import nu.marginalia.process.log.WorkLog;
import nu.marginalia.process.log.WorkLogEntry;
import org.jetbrains.annotations.NotNull;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.file.Path;
import java.util.Iterator;
import java.util.function.Consumer;
import java.util.function.Predicate;
import java.util.stream.Stream;
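/**
 * Describes a crawl plan: the location of the crawler job specification, and the
 * working directories holding crawl data and processed output. Illustrative usage,
 * assuming a fully populated plan:
 * <pre>{@code
 * CrawlPlan plan = ...; // e.g. loaded from a crawl plan file
 * plan.forEachCrawledDomain(domain -> {
 *     // process each crawled domain
 * });
 * }</pre>
 */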
@AllArgsConstructor @NoArgsConstructor @ToString
public class CrawlPlan {
    private final Logger logger = LoggerFactory.getLogger(getClass());

    public String jobSpec;
    public WorkDir crawl;
    public WorkDir process;

    private static String rootDirRewrite = System.getProperty("crawl.rootDirRewrite");

    public Path getJobSpec() {
        return Path.of(jobSpec);
    }
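    /**
     * A working directory together with the name of the work log file kept inside it.
     */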
    @AllArgsConstructor @NoArgsConstructor @ToString
    public static class WorkDir {
        public String dir;
        public String logName;

        public Path getDir() {
            return Path.of(rewrite(dir));
        }

        public Path getLogFile() {
            return Path.of(rewrite(dir)).resolve(logName);
        }
    }
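    /**
     * Optionally remaps the configured directories via the {@code crawl.rootDirRewrite}
     * system property, whose value takes the form {@code from:to}. For example
     * (illustrative values), {@code -Dcrawl.rootDirRewrite=/crawl:/mnt/storage} maps
     * {@code /crawl/data} to {@code /mnt/storage/data}. When the property is unset,
     * paths are used as-is.
     */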
    private static String rewrite(String dir) {
        if (rootDirRewrite == null) {
            return dir;
        }
        String[] parts = rootDirRewrite.split(":");
        return dir.replace(parts[0], parts[1]);
    }
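    /**
     * Crawled and processed files are fanned out into two levels of subdirectories
     * keyed on the first four characters of the file name; e.g. a file named
     * {@code aabbccdd.zstd} (illustrative) resolves to {@code <dir>/aa/bb/aabbccdd.zstd}.
     */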
    public Path getCrawledFilePath(String fileName) {
        String sp1 = fileName.substring(0, 2);
        String sp2 = fileName.substring(2, 4);
        return crawl.getDir().resolve(sp1).resolve(sp2).resolve(fileName);
    }

    public Path getProcessedFilePath(String fileName) {
        String sp1 = fileName.substring(0, 2);
        String sp2 = fileName.substring(2, 4);
        return process.getDir().resolve(sp1).resolve(sp2).resolve(fileName);
    }
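    /**
     * The work logs record completed entries for the crawl and processing stages,
     * which lets a restarted run skip work that has already been done.
     */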
    public WorkLog createCrawlWorkLog() throws IOException {
        return new WorkLog(crawl.getLogFile());
    }

    public WorkLog createProcessWorkLog() throws IOException {
        return new WorkLog(process.getLogFile());
    }

    public void forEachCrawlingSpecification(Consumer<CrawlingSpecification> consumer) {
        CrawlerSpecificationLoader.readInputSpec(getJobSpec(), consumer);
    }

    public void forEachCrawlingLogEntry(Consumer<WorkLogEntry> consumer) throws FileNotFoundException {
        WorkLog.readLog(this.crawl.getLogFile(), consumer);
    }

    public void forEachProcessingLogEntry(Consumer<WorkLogEntry> consumer) throws FileNotFoundException {
        WorkLog.readLog(this.process.getLogFile(), consumer);
    }
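    /**
     * Reads every crawled domain by walking the crawl work log and loading each
     * file it references. Read failures are logged and rethrown as unchecked.
     */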
    public void forEachCrawledDomain(Consumer<CrawledDomain> consumer) {
        final CrawledDomainReader reader = new CrawledDomainReader();

        try (Stream<WorkLogEntry> entryStream = WorkLog.streamLog(crawl.getLogFile())) {
            entryStream
                    .map(WorkLogEntry::path)
                    .map(this::getCrawledFilePath)
                    .map(reader::readRuntimeExcept)
                    .forEach(consumer);
        }
        catch (IOException ex) {
            logger.warn("Failed to read domains", ex);

            throw new RuntimeException(ex);
        }
    }
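    /**
     * Like {@link #forEachCrawledDomain(Consumer)}, but only reads entries whose
     * work log id passes the given predicate.
     */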
    public void forEachCrawledDomain(Predicate<String> idReadPredicate, Consumer<CrawledDomain> consumer) {
        final CrawledDomainReader reader = new CrawledDomainReader();

        try (Stream<WorkLogEntry> entryStream = WorkLog.streamLog(crawl.getLogFile())) {
            entryStream
                    .filter(entry -> idReadPredicate.test(entry.id()))
                    .map(WorkLogEntry::path)
                    .map(this::getCrawledFilePath)
                    .map(reader::readRuntimeExcept)
                    .forEach(consumer);
        }
        catch (IOException ex) {
            logger.warn("Failed to read domains", ex);

            throw new RuntimeException(ex);
        }
    }
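    /**
     * Returns an {@link Iterable} view over the crawled domains, backed by a lazily
     * consumed stream. The iterator may only be obtained once, and the caller must
     * close the iterable to release the underlying stream. Illustrative usage:
     * <pre>{@code
     * try (CrawlPlan.DomainsIterable domains = plan.domainsIterable()) {
     *     for (CrawledDomain domain : domains) {
     *         // process the domain
     *     }
     * }
     * }</pre>
     */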
    public DomainsIterable domainsIterable() throws IOException {
        return new DomainsIterable();
    }
    public class DomainsIterable implements Iterable<CrawledDomain>, AutoCloseable {
        private final Stream<CrawledDomain> stream;

        DomainsIterable() throws IOException {
            final CrawledDomainReader reader = new CrawledDomainReader();

            stream = WorkLog.streamLog(crawl.getLogFile())
                    .map(WorkLogEntry::path)
                    .map(CrawlPlan.this::getCrawledFilePath)
                    .map(reader::readRuntimeExcept);
        }

        @Override
        public void close() {
            stream.close();
        }

        @NotNull
        @Override
        public Iterator<CrawledDomain> iterator() {
            return stream.iterator();
        }
    }
}