The refactoring will continue until morale improves.

This commit is contained in:
Viktor Lofgren 2023-03-12 11:42:07 +01:00
parent 73eaa0865d
commit 8b8fc49901
313 changed files with 275 additions and 105 deletions

View File

@ -14,6 +14,7 @@ java {
 dependencies {
 implementation project(':code:common:model')
 implementation project(':code:common:config')
+implementation project(':code:features-crawl:work-log')
 implementation project(':code:libraries:guarded-regex')
 implementation project(':code:crawl-models:crawling-model')

View File

@ -1,3 +1,3 @@
-# Crawl/Common
+# Crawl Common
 Contains model classes shared by the whole crawl-process-load ecosystem.

View File

@ -4,9 +4,9 @@ import com.google.errorprone.annotations.MustBeClosed;
 import lombok.AllArgsConstructor;
 import lombok.NoArgsConstructor;
 import lombok.ToString;
-import nu.marginalia.crawling.common.WorkLog;
+import nu.marginalia.work_log.WorkLog;
 import nu.marginalia.crawling.io.CrawledDomainReader;
-import nu.marginalia.crawling.model.CrawlLogEntry;
+import nu.marginalia.work_log.WorkLogEntry;
 import nu.marginalia.crawling.model.CrawledDomain;
 import nu.marginalia.crawling.model.CrawlingSpecification;
 import org.jetbrains.annotations.NotNull;
@ -22,7 +22,7 @@ import java.util.function.Predicate;
 import java.util.stream.Stream;
 @AllArgsConstructor @NoArgsConstructor @ToString
-public class EdgeCrawlPlan {
+public class CrawlPlan {
 private final Logger logger = LoggerFactory.getLogger(getClass());
 public String jobSpec;
 public WorkDir crawl;
@ -80,19 +80,19 @@ public class EdgeCrawlPlan {
 CrawlerSpecificationLoader.readInputSpec(getJobSpec(), consumer);
 }
-public void forEachCrawlingLogEntry(Consumer<CrawlLogEntry> consumer) throws FileNotFoundException {
+public void forEachCrawlingLogEntry(Consumer<WorkLogEntry> consumer) throws FileNotFoundException {
 WorkLog.readLog(this.crawl.getLogFile(), consumer);
 }
-public void forEachProcessingLogEntry(Consumer<CrawlLogEntry> consumer) throws FileNotFoundException {
+public void forEachProcessingLogEntry(Consumer<WorkLogEntry> consumer) throws FileNotFoundException {
 WorkLog.readLog(this.process.getLogFile(), consumer);
 }
 public void forEachCrawledDomain(Consumer<CrawledDomain> consumer) {
 final CrawledDomainReader reader = new CrawledDomainReader();
-try (Stream<CrawlLogEntry> entryStream = WorkLog.streamLog(crawl.getLogFile())) {
+try (Stream<WorkLogEntry> entryStream = WorkLog.streamLog(crawl.getLogFile())) {
 entryStream
-.map(CrawlLogEntry::path)
+.map(WorkLogEntry::path)
 .map(this::getCrawledFilePath)
 .map(reader::readRuntimeExcept)
 .forEach(consumer);
@ -106,10 +106,10 @@ public class EdgeCrawlPlan {
 public void forEachCrawledDomain(Predicate<String> idReadPredicate, Consumer<CrawledDomain> consumer) {
 final CrawledDomainReader reader = new CrawledDomainReader();
-try (Stream<CrawlLogEntry> entryStream = WorkLog.streamLog(crawl.getLogFile())) {
+try (Stream<WorkLogEntry> entryStream = WorkLog.streamLog(crawl.getLogFile())) {
 entryStream
 .filter(entry -> idReadPredicate.test(entry.id()))
-.map(CrawlLogEntry::path)
+.map(WorkLogEntry::path)
 .map(this::getCrawledFilePath)
 .map(reader::readRuntimeExcept)
 .forEach(consumer);
@ -132,8 +132,8 @@ public class EdgeCrawlPlan {
 final CrawledDomainReader reader = new CrawledDomainReader();
 stream = WorkLog.streamLog(crawl.getLogFile())
-.map(CrawlLogEntry::path)
-.map(EdgeCrawlPlan.this::getCrawledFilePath)
+.map(WorkLogEntry::path)
+.map(CrawlPlan.this::getCrawledFilePath)
 .map(reader::readRuntimeExcept);
 }

View File

@ -13,9 +13,9 @@ public class CrawlPlanLoader {
 yaml = new Yaml();
 }
-public EdgeCrawlPlan load(Path yamlFile) throws IOException {
+public CrawlPlan load(Path yamlFile) throws IOException {
 try (var reader = new FileReader(yamlFile.toFile())) {
-return yaml.loadAs(reader, EdgeCrawlPlan.class);
+return yaml.loadAs(reader, CrawlPlan.class);
 }
 catch (IOException ex) {
 throw new IOException("Failed to load crawl plan " + yamlFile, ex);

View File

@ -3,11 +3,8 @@ package nu.marginalia.crawling.common.plan;
 import com.github.luben.zstd.ZstdInputStream;
 import com.google.gson.Gson;
 import com.google.gson.JsonStreamParser;
-import com.google.gson.stream.JsonReader;
-import nu.marginalia.crawling.common.AbortMonitor;
 import nu.marginalia.crawling.model.CrawlingSpecification;
 import nu.marginalia.model.gson.GsonFactory;
-import org.apache.logging.log4j.util.Strings;
 import java.io.BufferedReader;
 import java.io.FileInputStream;

View File

@ -1,4 +1,4 @@
 # Converting Models
-Contains models shared by the [converting-process](../../crawl/converting-process/) and
-[loading-process](../../crawl/loading-process/).
+Contains models shared by the [converting-process](../../crawl-processes/converting-process/) and
+[loading-process](../../crawl-processes/loading-process/).

View File

@ -1,7 +1,7 @@
 # Crawling Models
-Contains models shared by the [crawling-process](../../crawl/crawling-process/) and
-[converting-process](../../crawl/converting-process/).
+Contains models shared by the [crawling-process](../../crawl-processes/crawling-process/) and
+[converting-process](../../crawl-processes/converting-process/).
 ## Central Classes

View File

@ -1,4 +0,0 @@
-package nu.marginalia.crawling.model;
-public record CrawlLogEntry(String id, String ts, String path, int cnt) {
-}

View File

@ -40,7 +40,9 @@ dependencies {
 implementation project(':code:features-crawl:adblock')
 implementation project(':code:features-crawl:pubdate')
 implementation project(':code:features-crawl:topic-detection')
+implementation project(':code:features-crawl:crawl-blocklist')
+implementation project(':code:features-crawl:link-parser')
+implementation project(':code:features-crawl:work-log')
 implementation libs.lombok
 annotationProcessor libs.lombok
 implementation libs.bundles.slf4j

View File

@ -4,9 +4,9 @@ import com.google.gson.Gson;
 import com.google.inject.Guice;
 import com.google.inject.Inject;
 import com.google.inject.Injector;
-import nu.marginalia.crawling.common.WorkLog;
+import nu.marginalia.work_log.WorkLog;
 import nu.marginalia.crawling.common.plan.CrawlPlanLoader;
-import nu.marginalia.crawling.common.plan.EdgeCrawlPlan;
+import nu.marginalia.crawling.common.plan.CrawlPlan;
 import nu.marginalia.converting.compiler.InstructionsCompiler;
 import nu.marginalia.converting.instruction.Instruction;
 import nu.marginalia.converting.processor.DomainProcessor;
@ -41,7 +41,7 @@ public class ConverterMain {
 @Inject
 public ConverterMain(
-EdgeCrawlPlan plan,
+CrawlPlan plan,
 DomainProcessor processor,
 InstructionsCompiler compiler,
 Gson gson

View File

@ -5,19 +5,19 @@ import com.google.inject.AbstractModule;
 import com.google.inject.name.Names;
 import nu.marginalia.LanguageModels;
 import nu.marginalia.WmsaHome;
-import nu.marginalia.crawling.common.plan.EdgeCrawlPlan;
+import nu.marginalia.crawling.common.plan.CrawlPlan;
 import nu.marginalia.model.gson.GsonFactory;
 public class ConverterModule extends AbstractModule {
-private final EdgeCrawlPlan plan;
+private final CrawlPlan plan;
-public ConverterModule(EdgeCrawlPlan plan) {
+public ConverterModule(CrawlPlan plan) {
 this.plan = plan;
 }
 public void configure() {
-bind(EdgeCrawlPlan.class).toInstance(plan);
+bind(CrawlPlan.class).toInstance(plan);
 bind(Gson.class).toInstance(createGson());

View File

@ -1,6 +1,6 @@
 package nu.marginalia.converting.processor.logic;
-import nu.marginalia.crawling.common.link.LinkParser;
+import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.model.EdgeUrl;
 import org.jsoup.nodes.Element;
 import org.slf4j.Logger;

View File

@ -1,7 +1,7 @@
 package nu.marginalia.converting.processor.logic.links;
 import nu.marginalia.converting.model.ProcessedDocumentDetails;
-import nu.marginalia.crawling.common.blocklist.UrlBlocklist;
+import nu.marginalia.ip_blocklist.UrlBlocklist;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;

View File

@ -4,7 +4,7 @@ import com.google.inject.Inject;
 import com.google.inject.name.Named;
 import nu.marginalia.converting.processor.logic.links.LinkProcessor;
 import nu.marginalia.converting.processor.logic.summary.SummaryExtractor;
-import nu.marginalia.crawling.common.link.LinkParser;
+import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawledDomain;
 import nu.marginalia.converting.processor.keywords.DocumentKeywordExtractor;

Some files were not shown because too many files have changed in this diff Show More