The refactoring will continue until morale improves.

Viktor Lofgren 2023-03-12 11:42:07 +01:00
parent 73eaa0865d
commit 8b8fc49901
313 changed files with 275 additions and 105 deletions


@@ -14,6 +14,7 @@ java {
 dependencies {
 
     implementation project(':code:common:model')
     implementation project(':code:common:config')
+    implementation project(':code:features-crawl:work-log')
     implementation project(':code:libraries:guarded-regex')
     implementation project(':code:crawl-models:crawling-model')


@@ -1,3 +1,3 @@
-# Crawl/Common
+# Crawl Common
 
 Contains model classes shared by the whole crawl-process-load ecosystem.


@@ -4,9 +4,9 @@ import com.google.errorprone.annotations.MustBeClosed;
 import lombok.AllArgsConstructor;
 import lombok.NoArgsConstructor;
 import lombok.ToString;
-import nu.marginalia.crawling.common.WorkLog;
+import nu.marginalia.work_log.WorkLog;
 import nu.marginalia.crawling.io.CrawledDomainReader;
-import nu.marginalia.crawling.model.CrawlLogEntry;
+import nu.marginalia.work_log.WorkLogEntry;
 import nu.marginalia.crawling.model.CrawledDomain;
 import nu.marginalia.crawling.model.CrawlingSpecification;
 import org.jetbrains.annotations.NotNull;
@@ -22,7 +22,7 @@ import java.util.function.Predicate;
 import java.util.stream.Stream;
 
 @AllArgsConstructor @NoArgsConstructor @ToString
-public class EdgeCrawlPlan {
+public class CrawlPlan {
     private final Logger logger = LoggerFactory.getLogger(getClass());
     public String jobSpec;
     public WorkDir crawl;
@@ -80,19 +80,19 @@ public class EdgeCrawlPlan {
         CrawlerSpecificationLoader.readInputSpec(getJobSpec(), consumer);
     }
 
-    public void forEachCrawlingLogEntry(Consumer<CrawlLogEntry> consumer) throws FileNotFoundException {
+    public void forEachCrawlingLogEntry(Consumer<WorkLogEntry> consumer) throws FileNotFoundException {
         WorkLog.readLog(this.crawl.getLogFile(), consumer);
     }
 
-    public void forEachProcessingLogEntry(Consumer<CrawlLogEntry> consumer) throws FileNotFoundException {
+    public void forEachProcessingLogEntry(Consumer<WorkLogEntry> consumer) throws FileNotFoundException {
         WorkLog.readLog(this.process.getLogFile(), consumer);
     }
 
     public void forEachCrawledDomain(Consumer<CrawledDomain> consumer) {
         final CrawledDomainReader reader = new CrawledDomainReader();
-        try (Stream<CrawlLogEntry> entryStream = WorkLog.streamLog(crawl.getLogFile())) {
+        try (Stream<WorkLogEntry> entryStream = WorkLog.streamLog(crawl.getLogFile())) {
             entryStream
-                .map(CrawlLogEntry::path)
+                .map(WorkLogEntry::path)
                 .map(this::getCrawledFilePath)
                 .map(reader::readRuntimeExcept)
                 .forEach(consumer);
@@ -106,10 +106,10 @@ public class EdgeCrawlPlan {
 
     public void forEachCrawledDomain(Predicate<String> idReadPredicate, Consumer<CrawledDomain> consumer) {
         final CrawledDomainReader reader = new CrawledDomainReader();
-        try (Stream<CrawlLogEntry> entryStream = WorkLog.streamLog(crawl.getLogFile())) {
+        try (Stream<WorkLogEntry> entryStream = WorkLog.streamLog(crawl.getLogFile())) {
             entryStream
                 .filter(entry -> idReadPredicate.test(entry.id()))
-                .map(CrawlLogEntry::path)
+                .map(WorkLogEntry::path)
                 .map(this::getCrawledFilePath)
                 .map(reader::readRuntimeExcept)
                 .forEach(consumer);
@@ -132,8 +132,8 @@ public class EdgeCrawlPlan {
 
             final CrawledDomainReader reader = new CrawledDomainReader();
             stream = WorkLog.streamLog(crawl.getLogFile())
-                    .map(CrawlLogEntry::path)
-                    .map(EdgeCrawlPlan.this::getCrawledFilePath)
+                    .map(WorkLogEntry::path)
+                    .map(CrawlPlan.this::getCrawledFilePath)
                     .map(reader::readRuntimeExcept);
         }
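Taken together, the CrawlPlan hunks above amount to two renames: EdgeCrawlPlan becomes CrawlPlan, and the log-iteration methods now traffic in WorkLogEntry from the new nu.marginalia.work_log package. A minimal caller-side sketch of the renamed API (the wrapper class and its name are hypothetical; only the CrawlPlan methods and the id()/path() accessors are taken from the diff):

```java
import nu.marginalia.crawling.common.plan.CrawlPlan;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.work_log.WorkLogEntry;

import java.io.FileNotFoundException;

public class CrawlPlanUsageSketch {
    // Assumes a CrawlPlan instance obtained elsewhere, e.g. via CrawlPlanLoader.
    static void printWorkLog(CrawlPlan plan) throws FileNotFoundException {
        // The consumer type changed from CrawlLogEntry to WorkLogEntry;
        // id() and path() are the accessors CrawlPlan itself relies on.
        plan.forEachCrawlingLogEntry((WorkLogEntry entry) ->
                System.out.println(entry.id() + " -> " + entry.path()));

        // Unchanged signature: still hands back CrawledDomain instances.
        plan.forEachCrawledDomain((CrawledDomain domain) ->
                System.out.println(domain));
    }
}
```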


@@ -13,9 +13,9 @@ public class CrawlPlanLoader {
         yaml = new Yaml();
     }
 
-    public EdgeCrawlPlan load(Path yamlFile) throws IOException {
+    public CrawlPlan load(Path yamlFile) throws IOException {
         try (var reader = new FileReader(yamlFile.toFile())) {
-            return yaml.loadAs(reader, EdgeCrawlPlan.class);
+            return yaml.loadAs(reader, CrawlPlan.class);
         }
         catch (IOException ex) {
             throw new IOException("Failed to load crawl plan " + yamlFile, ex);
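Loading is unchanged apart from the return type: SnakeYAML deserializes the file straight into the renamed class. A usage sketch, assuming a hypothetical plan file name:

```java
import nu.marginalia.crawling.common.plan.CrawlPlan;
import nu.marginalia.crawling.common.plan.CrawlPlanLoader;

import java.io.IOException;
import java.nio.file.Path;

public class CrawlPlanLoadSketch {
    public static void main(String[] args) throws IOException {
        // Hypothetical file name; load() wraps any IOException with the path.
        CrawlPlan plan = new CrawlPlanLoader().load(Path.of("crawl-plan.yaml"));
        System.out.println("Loaded plan with job spec " + plan.jobSpec);
    }
}
```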


@@ -3,11 +3,8 @@ package nu.marginalia.crawling.common.plan;
 import com.github.luben.zstd.ZstdInputStream;
 import com.google.gson.Gson;
 import com.google.gson.JsonStreamParser;
-import com.google.gson.stream.JsonReader;
-import nu.marginalia.crawling.common.AbortMonitor;
 import nu.marginalia.crawling.model.CrawlingSpecification;
 import nu.marginalia.model.gson.GsonFactory;
-import org.apache.logging.log4j.util.Strings;
 
 import java.io.BufferedReader;
 import java.io.FileInputStream;


@@ -1,4 +1,4 @@
 # Converting Models
 
-Contains models shared by the [converting-process](../../crawl/converting-process/) and
-[loading-process](../../crawl/loading-process/).
+Contains models shared by the [converting-process](../../crawl-processes/converting-process/) and
+[loading-process](../../crawl-processes/loading-process/).


@@ -1,7 +1,7 @@
 # Crawling Models
 
-Contains models shared by the [crawling-process](../../crawl/crawling-process/) and
-[converting-process](../../crawl/converting-process/).
+Contains models shared by the [crawling-process](../../crawl-processes/crawling-process/) and
+[converting-process](../../crawl-processes/converting-process/).
 
 ## Central Classes


@@ -1,4 +0,0 @@
-package nu.marginalia.crawling.model;
-
-public record CrawlLogEntry(String id, String ts, String path, int cnt) {
-}
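The CrawlLogEntry record is deleted outright; WorkLogEntry in the new work-log module takes over its role. The hunks above confirm only that WorkLogEntry exposes id() and path(); assuming it mirrors the deleted record field for field, it would look roughly like this:

```java
package nu.marginalia.work_log;

// Sketch of the replacement record. Only id() and path() are confirmed by
// the CrawlPlan hunks above; ts and cnt are assumptions carried over from
// the deleted CrawlLogEntry.
public record WorkLogEntry(String id, String ts, String path, int cnt) {
}
```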


@@ -40,7 +40,9 @@ dependencies {
     implementation project(':code:features-crawl:adblock')
     implementation project(':code:features-crawl:pubdate')
     implementation project(':code:features-crawl:topic-detection')
+    implementation project(':code:features-crawl:crawl-blocklist')
+    implementation project(':code:features-crawl:link-parser')
+    implementation project(':code:features-crawl:work-log')
 
     implementation libs.lombok
     annotationProcessor libs.lombok
     implementation libs.bundles.slf4j


@@ -4,9 +4,9 @@ import com.google.gson.Gson;
 import com.google.inject.Guice;
 import com.google.inject.Inject;
 import com.google.inject.Injector;
-import nu.marginalia.crawling.common.WorkLog;
+import nu.marginalia.work_log.WorkLog;
 import nu.marginalia.crawling.common.plan.CrawlPlanLoader;
-import nu.marginalia.crawling.common.plan.EdgeCrawlPlan;
+import nu.marginalia.crawling.common.plan.CrawlPlan;
 import nu.marginalia.converting.compiler.InstructionsCompiler;
 import nu.marginalia.converting.instruction.Instruction;
 import nu.marginalia.converting.processor.DomainProcessor;
@@ -41,7 +41,7 @@ public class ConverterMain {
 
     @Inject
     public ConverterMain(
-            EdgeCrawlPlan plan,
+            CrawlPlan plan,
             DomainProcessor processor,
             InstructionsCompiler compiler,
             Gson gson


@@ -5,19 +5,19 @@ import com.google.inject.AbstractModule;
 import com.google.inject.name.Names;
 import nu.marginalia.LanguageModels;
 import nu.marginalia.WmsaHome;
-import nu.marginalia.crawling.common.plan.EdgeCrawlPlan;
+import nu.marginalia.crawling.common.plan.CrawlPlan;
 import nu.marginalia.model.gson.GsonFactory;
 
 public class ConverterModule extends AbstractModule {
 
-    private final EdgeCrawlPlan plan;
+    private final CrawlPlan plan;
 
-    public ConverterModule(EdgeCrawlPlan plan) {
+    public ConverterModule(CrawlPlan plan) {
         this.plan = plan;
     }
 
     public void configure() {
-        bind(EdgeCrawlPlan.class).toInstance(plan);
+        bind(CrawlPlan.class).toInstance(plan);
         bind(Gson.class).toInstance(createGson());
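Since ConverterModule now binds CrawlPlan.class, anything with an @Inject constructor that asks for a CrawlPlan, such as ConverterMain above, receives the instance through Guice. A minimal wiring sketch (the package of ConverterModule and the plan file name are assumptions, not shown in the diff):

```java
import com.google.inject.Guice;
import com.google.inject.Injector;
import nu.marginalia.converting.ConverterModule; // package assumed
import nu.marginalia.crawling.common.plan.CrawlPlan;
import nu.marginalia.crawling.common.plan.CrawlPlanLoader;

import java.io.IOException;
import java.nio.file.Path;

public class ConverterWiringSketch {
    public static void main(String[] args) throws IOException {
        // Hypothetical plan location.
        CrawlPlan plan = new CrawlPlanLoader().load(Path.of("crawl-plan.yaml"));

        // configure() binds CrawlPlan.class to this instance, so the
        // injector can satisfy ConverterMain's renamed constructor parameter.
        Injector injector = Guice.createInjector(new ConverterModule(plan));
        CrawlPlan bound = injector.getInstance(CrawlPlan.class);
        System.out.println("Bound plan: " + bound.jobSpec);
    }
}
```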


@@ -1,6 +1,6 @@
 package nu.marginalia.converting.processor.logic;
 
-import nu.marginalia.crawling.common.link.LinkParser;
+import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.model.EdgeUrl;
 import org.jsoup.nodes.Element;
 import org.slf4j.Logger;


@@ -1,7 +1,7 @@
 package nu.marginalia.converting.processor.logic.links;
 
 import nu.marginalia.converting.model.ProcessedDocumentDetails;
 
-import nu.marginalia.crawling.common.blocklist.UrlBlocklist;
+import nu.marginalia.ip_blocklist.UrlBlocklist;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;


@@ -4,7 +4,7 @@ import com.google.inject.Inject;
 import com.google.inject.name.Named;
 import nu.marginalia.converting.processor.logic.links.LinkProcessor;
 import nu.marginalia.converting.processor.logic.summary.SummaryExtractor;
-import nu.marginalia.crawling.common.link.LinkParser;
+import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawledDomain;
 import nu.marginalia.converting.processor.keywords.DocumentKeywordExtractor;

Some files were not shown because too many files have changed in this diff.