MarginaliaSearch/code/process-models/crawling-model/java/plan/CrawlPlan.java

106 lines
3.2 KiB
Java
Raw Normal View History

package plan;
2022-05-19 15:45:26 +00:00
2022-08-08 13:18:04 +00:00
import lombok.AllArgsConstructor;
import lombok.NoArgsConstructor;
import lombok.ToString;
2023-03-04 12:19:01 +00:00
import nu.marginalia.crawling.io.CrawledDomainReader;
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
2023-03-04 12:19:01 +00:00
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.process.log.WorkLog;
import org.apache.logging.log4j.util.Strings;
2023-03-04 12:19:01 +00:00
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
2022-08-08 13:18:04 +00:00
import java.io.IOException;
import java.nio.file.Files;
2022-05-19 15:45:26 +00:00
import java.nio.file.Path;
2022-09-02 07:34:20 +00:00
import java.util.function.Predicate;
import java.util.Optional;
2022-05-19 15:45:26 +00:00
@AllArgsConstructor @NoArgsConstructor @ToString
public class CrawlPlan {
2023-03-04 12:19:01 +00:00
private final Logger logger = LoggerFactory.getLogger(getClass());
2022-05-19 15:45:26 +00:00
public String jobSpec;
public WorkDir crawl;
public WorkDir process;
private final static String rootDirRewrite = System.getProperty("crawl.rootDirRewrite");
2023-03-04 12:19:01 +00:00
2022-05-19 15:45:26 +00:00
public Path getJobSpec() {
return Path.of(rewrite(jobSpec));
2022-05-19 15:45:26 +00:00
}
@AllArgsConstructor @NoArgsConstructor @ToString
public static class WorkDir {
public String dir;
public String logName;
public Path getDir() {
2023-03-04 12:19:01 +00:00
return Path.of(rewrite(dir));
2022-05-19 15:45:26 +00:00
}
public Path getLogFile() {
2023-03-04 12:19:01 +00:00
return Path.of(rewrite(dir)).resolve(logName);
2022-05-19 15:45:26 +00:00
}
}
2023-03-04 12:19:01 +00:00
private static String rewrite(String dir) {
if (rootDirRewrite == null) {
return dir;
}
String[] parts = rootDirRewrite.split(":");
return dir.replaceFirst(parts[0], parts[1]);
2023-03-04 12:19:01 +00:00
}
2022-06-22 10:57:58 +00:00
public Path getCrawledFilePath(String fileName) {
int sp = fileName.lastIndexOf('/');
// Normalize the filename
if (sp >= 0 && sp + 1< fileName.length())
fileName = fileName.substring(sp + 1);
if (fileName.length() < 4)
fileName = Strings.repeat("0", 4 - fileName.length()) + fileName;
2022-06-22 10:57:58 +00:00
String sp1 = fileName.substring(0, 2);
String sp2 = fileName.substring(2, 4);
return crawl.getDir().resolve(sp1).resolve(sp2).resolve(fileName);
2022-08-10 15:03:58 +00:00
}
public int countCrawledDomains() {
int count = 0;
for (var ignored : WorkLog.iterable(crawl.getLogFile())) {
count++;
}
return count;
}
@Deprecated
public Iterable<CrawledDomain> domainsIterable() {
// This is no longer supported
throw new UnsupportedOperationException();
}
public Iterable<SerializableCrawlDataStream> crawlDataIterable(Predicate<String> idPredicate) {
return WorkLog.iterableMap(crawl.getLogFile(),
entry -> {
if (!idPredicate.test(entry.id())) {
return Optional.empty();
}
var path = getCrawledFilePath(entry.path());
if (!Files.exists(path)) {
logger.warn("File not found: {}", path);
return Optional.empty();
}
try {
return Optional.of(CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.COMPATIBLE, path));
}
catch (IOException ex) {
return Optional.empty();
}
});
}
2022-05-19 15:45:26 +00:00
}