package plan;

import lombok.AllArgsConstructor;
import lombok.NoArgsConstructor;
import lombok.ToString;
import nu.marginalia.crawling.io.CrawledDomainReader;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.crawling.model.spec.CrawlerSpecificationLoader;
import nu.marginalia.crawling.model.spec.CrawlingSpecification;
import nu.marginalia.process.log.WorkLog;
import nu.marginalia.process.log.WorkLogEntry;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.file.Path;
import java.util.Iterator;
import java.util.function.Consumer;
import java.util.function.Predicate;
import java.util.stream.Stream;
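
/**
 * Describes a crawl plan: the crawl specification to use and the work
 * directories where crawl and processing output is kept.
 */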
@AllArgsConstructor @NoArgsConstructor @ToString
public class CrawlPlan {
    private final Logger logger = LoggerFactory.getLogger(getClass());

    public String jobSpec;
    public WorkDir crawl;
    public WorkDir process;

    private static String rootDirRewrite = System.getProperty("crawl.rootDirRewrite");

    public Path getJobSpec() {
        return Path.of(rewrite(jobSpec));
    }
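
    /**
     * A work directory and the name of the work log kept within it.
     */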
    @AllArgsConstructor @NoArgsConstructor @ToString
    public static class WorkDir {
        public String dir;
        public String logName;

        public Path getDir() {
            return Path.of(rewrite(dir));
        }

        public Path getLogFile() {
            return Path.of(rewrite(dir)).resolve(logName);
        }
    }
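
    /**
     * Applies the optional root directory rewrite configured via the
     * {@code crawl.rootDirRewrite} system property, expected on the
     * form {@code from:to}.
     */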
    private static String rewrite(String dir) {
        if (rootDirRewrite == null) {
            return dir;
        }

        String[] parts = rootDirRewrite.split(":");

        return dir.replaceFirst(parts[0], parts[1]);
    }
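
    /**
     * Resolves the location of a crawl data file within the crawl directory.
     * Files are sharded into nested subdirectories named after the first two
     * and the following two characters of the file name.
     */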
    public Path getCrawledFilePath(String fileName) {
        String sp1 = fileName.substring(0, 2);
        String sp2 = fileName.substring(2, 4);
        return crawl.getDir().resolve(sp1).resolve(sp2).resolve(fileName);
    }

    public Path getProcessedFilePath(String fileName) {
        String sp1 = fileName.substring(0, 2);
        String sp2 = fileName.substring(2, 4);
        return process.getDir().resolve(sp1).resolve(sp2).resolve(fileName);
    }

    public WorkLog createCrawlWorkLog() throws IOException {
        return new WorkLog(crawl.getLogFile());
    }

    public WorkLog createProcessWorkLog() throws IOException {
        return new WorkLog(process.getLogFile());
    }
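
    /**
     * Reads the crawl specification file and passes each crawling
     * specification to the consumer.
     */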
    public void forEachCrawlingSpecification(Consumer<CrawlingSpecification> consumer) {
        CrawlerSpecificationLoader.readInputSpec(getJobSpec(), consumer);
    }

    public void forEachCrawlingLogEntry(Consumer<WorkLogEntry> consumer) throws FileNotFoundException {
        WorkLog.readLog(this.crawl.getLogFile(), consumer);
    }

    public void forEachProcessingLogEntry(Consumer<WorkLogEntry> consumer) throws FileNotFoundException {
        WorkLog.readLog(this.process.getLogFile(), consumer);
    }
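
    /**
     * Reads each crawled domain listed in the crawl work log and passes it
     * to the consumer.
     */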
    public void forEachCrawledDomain(Consumer<CrawledDomain> consumer) {
        final CrawledDomainReader reader = new CrawledDomainReader();

        try (Stream<WorkLogEntry> entryStream = WorkLog.streamLog(crawl.getLogFile())) {
            entryStream
                    .map(WorkLogEntry::path)
                    .map(this::getCrawledFilePath)
                    .map(reader::readRuntimeExcept)
                    .forEach(consumer);
        }
        catch (IOException ex) {
            logger.warn("Failed to read domains", ex);

            throw new RuntimeException(ex);
        }
    }
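
    /**
     * Like {@link #forEachCrawledDomain(Consumer)}, but only reads entries
     * whose id is accepted by the predicate.
     */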
    public void forEachCrawledDomain(Predicate<String> idReadPredicate, Consumer<CrawledDomain> consumer) {
        final CrawledDomainReader reader = new CrawledDomainReader();

        try (Stream<WorkLogEntry> entryStream = WorkLog.streamLog(crawl.getLogFile())) {
            entryStream
                    .filter(entry -> idReadPredicate.test(entry.id()))
                    .map(WorkLogEntry::path)
                    .map(this::getCrawledFilePath)
                    .map(reader::readRuntimeExcept)
                    .forEach(consumer);
        }
        catch (IOException ex) {
            logger.warn("Failed to read domains", ex);

            throw new RuntimeException(ex);
        }
    }

    public DomainsIterable domainsIterable() throws IOException {
        return new DomainsIterable();
    }
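
    /**
     * Iterable view over all crawled domains. Holds an open stream over the
     * crawl work log, and must be closed when iteration is done.
     */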
    public class DomainsIterable implements Iterable<CrawledDomain>, AutoCloseable {
        private final Stream<CrawledDomain> stream;

        DomainsIterable() throws IOException {
            final CrawledDomainReader reader = new CrawledDomainReader();

            stream = WorkLog.streamLog(crawl.getLogFile())
                    .map(WorkLogEntry::path)
                    .map(CrawlPlan.this::getCrawledFilePath)
                    .map(reader::readRuntimeExcept);
        }

        @Override
        public void close() {
            stream.close();
        }

        @NotNull
        @Override
        public Iterator<CrawledDomain> iterator() {
            return stream.iterator();
        }
    }
}