MarginaliaSearch/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java

package plan;

import lombok.AllArgsConstructor;
import lombok.NoArgsConstructor;
import lombok.ToString;

import nu.marginalia.crawling.io.CrawledDomainReader;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.crawling.model.spec.CrawlerSpecificationLoader;
import nu.marginalia.crawling.model.spec.CrawlingSpecification;
import nu.marginalia.process.log.WorkLog;
import nu.marginalia.process.log.WorkLogEntry;
import org.jetbrains.annotations.NotNull;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.file.Path;
import java.util.Iterator;
import java.util.function.Consumer;
import java.util.function.Predicate;
import java.util.stream.Stream;
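/**
 * Describes a crawl plan: the location of the crawler job specification, and the
 * working directories holding crawl data and processed output. Illustrative usage,
 * assuming a fully populated plan:
 * <pre>{@code
 * CrawlPlan plan = ...; // e.g. loaded from a crawl plan file
 * plan.forEachCrawledDomain(domain -> {
 *     // process each crawled domain
 * });
 * }</pre>
 */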
@AllArgsConstructor @NoArgsConstructor @ToString
public class CrawlPlan {
    private final Logger logger = LoggerFactory.getLogger(getClass());

    public String jobSpec;
    public WorkDir crawl;
    public WorkDir process;

    private static String rootDirRewrite = System.getProperty("crawl.rootDirRewrite");

    public Path getJobSpec() {
        return Path.of(jobSpec);
    }
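    /**
     * A working directory together with the name of the work log file kept inside it.
     */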
    @AllArgsConstructor @NoArgsConstructor @ToString
    public static class WorkDir {
        public String dir;
        public String logName;

        public Path getDir() {
            return Path.of(rewrite(dir));
        }

        public Path getLogFile() {
            return Path.of(rewrite(dir)).resolve(logName);
        }
    }
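    /**
     * Optionally remaps the configured directories via the {@code crawl.rootDirRewrite}
     * system property, whose value takes the form {@code from:to}. For example
     * (illustrative values), {@code -Dcrawl.rootDirRewrite=/crawl:/mnt/storage} maps
     * {@code /crawl/data} to {@code /mnt/storage/data}. When the property is unset,
     * paths are used as-is.
     */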
    private static String rewrite(String dir) {
        if (rootDirRewrite == null) {
            return dir;
        }
        String[] parts = rootDirRewrite.split(":");
        return dir.replace(parts[0], parts[1]);
    }
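    /**
     * Crawled and processed files are fanned out into two levels of subdirectories
     * keyed on the first four characters of the file name; e.g. a file named
     * {@code aabbccdd.zstd} (illustrative) resolves to {@code <dir>/aa/bb/aabbccdd.zstd}.
     */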
    public Path getCrawledFilePath(String fileName) {
        String sp1 = fileName.substring(0, 2);
        String sp2 = fileName.substring(2, 4);
        return crawl.getDir().resolve(sp1).resolve(sp2).resolve(fileName);
    }

    public Path getProcessedFilePath(String fileName) {
        String sp1 = fileName.substring(0, 2);
        String sp2 = fileName.substring(2, 4);
        return process.getDir().resolve(sp1).resolve(sp2).resolve(fileName);
    }
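    /**
     * The work logs record completed entries for the crawl and processing stages,
     * which lets a restarted run skip work that has already been done.
     */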
    public WorkLog createCrawlWorkLog() throws IOException {
        return new WorkLog(crawl.getLogFile());
    }

    public WorkLog createProcessWorkLog() throws IOException {
        return new WorkLog(process.getLogFile());
    }

    public void forEachCrawlingSpecification(Consumer<CrawlingSpecification> consumer) {
        CrawlerSpecificationLoader.readInputSpec(getJobSpec(), consumer);
    }

    public void forEachCrawlingLogEntry(Consumer<WorkLogEntry> consumer) throws FileNotFoundException {
        WorkLog.readLog(this.crawl.getLogFile(), consumer);
    }

    public void forEachProcessingLogEntry(Consumer<WorkLogEntry> consumer) throws FileNotFoundException {
        WorkLog.readLog(this.process.getLogFile(), consumer);
    }
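    /**
     * Reads every crawled domain by walking the crawl work log and loading each
     * file it references. Read failures are logged and rethrown as unchecked.
     */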
    public void forEachCrawledDomain(Consumer<CrawledDomain> consumer) {
        final CrawledDomainReader reader = new CrawledDomainReader();

        try (Stream<WorkLogEntry> entryStream = WorkLog.streamLog(crawl.getLogFile())) {
            entryStream
                    .map(WorkLogEntry::path)
                    .map(this::getCrawledFilePath)
                    .map(reader::readRuntimeExcept)
                    .forEach(consumer);
        }
        catch (IOException ex) {
            logger.warn("Failed to read domains", ex);

            throw new RuntimeException(ex);
        }
    }
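    /**
     * Like {@link #forEachCrawledDomain(Consumer)}, but only reads entries whose
     * work log id passes the given predicate.
     */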
    public void forEachCrawledDomain(Predicate<String> idReadPredicate, Consumer<CrawledDomain> consumer) {
        final CrawledDomainReader reader = new CrawledDomainReader();

        try (Stream<WorkLogEntry> entryStream = WorkLog.streamLog(crawl.getLogFile())) {
            entryStream
                    .filter(entry -> idReadPredicate.test(entry.id()))
                    .map(WorkLogEntry::path)
                    .map(this::getCrawledFilePath)
                    .map(reader::readRuntimeExcept)
                    .forEach(consumer);
        }
        catch (IOException ex) {
            logger.warn("Failed to read domains", ex);

            throw new RuntimeException(ex);
        }
    }
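    /**
     * Returns an {@link Iterable} view over the crawled domains, backed by a lazily
     * consumed stream. The iterator may only be obtained once, and the caller must
     * close the iterable to release the underlying stream. Illustrative usage:
     * <pre>{@code
     * try (CrawlPlan.DomainsIterable domains = plan.domainsIterable()) {
     *     for (CrawledDomain domain : domains) {
     *         // process the domain
     *     }
     * }
     * }</pre>
     */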
    public DomainsIterable domainsIterable() throws IOException {
        return new DomainsIterable();
    }
    public class DomainsIterable implements Iterable<CrawledDomain>, AutoCloseable {
        private final Stream<CrawledDomain> stream;

        DomainsIterable() throws IOException {
            final CrawledDomainReader reader = new CrawledDomainReader();

            stream = WorkLog.streamLog(crawl.getLogFile())
                    .map(WorkLogEntry::path)
                    .map(CrawlPlan.this::getCrawledFilePath)
                    .map(reader::readRuntimeExcept);
        }

        @Override
        public void close() {
            stream.close();
        }

        @NotNull
        @Override
        public Iterator<CrawledDomain> iterator() {
            return stream.iterator();
        }
    }
}