The refactoring will continue until morale improves.
parent 73eaa0865d
commit 8b8fc49901
@@ -14,6 +14,7 @@ java {
 dependencies {
     implementation project(':code:common:model')
     implementation project(':code:common:config')
+    implementation project(':code:features-crawl:work-log')
     implementation project(':code:libraries:guarded-regex')
     implementation project(':code:crawl-models:crawling-model')
 
@@ -1,3 +1,3 @@
-# Crawl/Common
+# Crawl Common
 
 Contains model classes shared by the whole crawl-process-load ecosystem.
@@ -4,9 +4,9 @@ import com.google.errorprone.annotations.MustBeClosed;
 import lombok.AllArgsConstructor;
 import lombok.NoArgsConstructor;
 import lombok.ToString;
-import nu.marginalia.crawling.common.WorkLog;
+import nu.marginalia.work_log.WorkLog;
 import nu.marginalia.crawling.io.CrawledDomainReader;
-import nu.marginalia.crawling.model.CrawlLogEntry;
+import nu.marginalia.work_log.WorkLogEntry;
 import nu.marginalia.crawling.model.CrawledDomain;
 import nu.marginalia.crawling.model.CrawlingSpecification;
 import org.jetbrains.annotations.NotNull;
@@ -22,7 +22,7 @@ import java.util.function.Predicate;
 import java.util.stream.Stream;
 
 @AllArgsConstructor @NoArgsConstructor @ToString
-public class EdgeCrawlPlan {
+public class CrawlPlan {
     private final Logger logger = LoggerFactory.getLogger(getClass());
     public String jobSpec;
     public WorkDir crawl;
@@ -80,19 +80,19 @@ public class EdgeCrawlPlan {
         CrawlerSpecificationLoader.readInputSpec(getJobSpec(), consumer);
     }
 
-    public void forEachCrawlingLogEntry(Consumer<CrawlLogEntry> consumer) throws FileNotFoundException {
+    public void forEachCrawlingLogEntry(Consumer<WorkLogEntry> consumer) throws FileNotFoundException {
         WorkLog.readLog(this.crawl.getLogFile(), consumer);
     }
-    public void forEachProcessingLogEntry(Consumer<CrawlLogEntry> consumer) throws FileNotFoundException {
+    public void forEachProcessingLogEntry(Consumer<WorkLogEntry> consumer) throws FileNotFoundException {
         WorkLog.readLog(this.process.getLogFile(), consumer);
     }
 
     public void forEachCrawledDomain(Consumer<CrawledDomain> consumer) {
         final CrawledDomainReader reader = new CrawledDomainReader();
 
-        try (Stream<CrawlLogEntry> entryStream = WorkLog.streamLog(crawl.getLogFile())) {
+        try (Stream<WorkLogEntry> entryStream = WorkLog.streamLog(crawl.getLogFile())) {
             entryStream
-                    .map(CrawlLogEntry::path)
+                    .map(WorkLogEntry::path)
                     .map(this::getCrawledFilePath)
                     .map(reader::readRuntimeExcept)
                     .forEach(consumer);
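For context, a minimal sketch of how the renamed log API reads after this change. WorkLog.streamLog, WorkLogEntry::path, and the new work_log package all appear in the hunks above; the demo class, the log file location, and the assumption that streamLog accepts a Path and may throw IOException are mine, not the commit's.

    import nu.marginalia.work_log.WorkLog;
    import nu.marginalia.work_log.WorkLogEntry;

    import java.io.IOException;
    import java.nio.file.Path;
    import java.util.stream.Stream;

    // Hypothetical demo class, not part of this commit.
    class WorkLogDemo {
        public static void main(String[] args) throws IOException {
            Path logFile = Path.of("crawl-data/crawler.log"); // assumed location
            // Stream the work log and print the path recorded for each entry,
            // mirroring the map(WorkLogEntry::path) step in CrawlPlan above.
            try (Stream<WorkLogEntry> entries = WorkLog.streamLog(logFile)) {
                entries.map(WorkLogEntry::path).forEach(System.out::println);
            }
        }
    }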
@@ -106,10 +106,10 @@ public class EdgeCrawlPlan {
     public void forEachCrawledDomain(Predicate<String> idReadPredicate, Consumer<CrawledDomain> consumer) {
         final CrawledDomainReader reader = new CrawledDomainReader();
 
-        try (Stream<CrawlLogEntry> entryStream = WorkLog.streamLog(crawl.getLogFile())) {
+        try (Stream<WorkLogEntry> entryStream = WorkLog.streamLog(crawl.getLogFile())) {
             entryStream
                     .filter(entry -> idReadPredicate.test(entry.id()))
-                    .map(CrawlLogEntry::path)
+                    .map(WorkLogEntry::path)
                     .map(this::getCrawledFilePath)
                     .map(reader::readRuntimeExcept)
                     .forEach(consumer);
@@ -132,8 +132,8 @@ public class EdgeCrawlPlan {
             final CrawledDomainReader reader = new CrawledDomainReader();
 
             stream = WorkLog.streamLog(crawl.getLogFile())
-                    .map(CrawlLogEntry::path)
-                    .map(EdgeCrawlPlan.this::getCrawledFilePath)
+                    .map(WorkLogEntry::path)
+                    .map(CrawlPlan.this::getCrawledFilePath)
                     .map(reader::readRuntimeExcept);
         }
 
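The EdgeCrawlPlan.this::getCrawledFilePath line is the one non-obvious edit in this rename: inside a non-static inner class, a method reference to the enclosing instance must name the outer class explicitly, so renaming the class forces this line to change too. A hypothetical minimal reproduction (class and path names are illustrative, not from the commit):

    import java.nio.file.Path;
    import java.util.stream.Stream;

    class CrawlPlanLike {
        Path getCrawledFilePath(String relative) {
            return Path.of("crawl-data").resolve(relative); // assumed layout
        }

        // A non-static inner class holds a reference to the enclosing
        // CrawlPlanLike instance, addressed as CrawlPlanLike.this.
        class DomainIterator {
            Stream<Path> resolveAll(Stream<String> names) {
                return names.map(CrawlPlanLike.this::getCrawledFilePath);
            }
        }
    }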
@@ -13,9 +13,9 @@ public class CrawlPlanLoader {
         yaml = new Yaml();
     }
 
-    public EdgeCrawlPlan load(Path yamlFile) throws IOException {
+    public CrawlPlan load(Path yamlFile) throws IOException {
         try (var reader = new FileReader(yamlFile.toFile())) {
-            return yaml.loadAs(reader, EdgeCrawlPlan.class);
+            return yaml.loadAs(reader, CrawlPlan.class);
         }
         catch (IOException ex) {
             throw new IOException("Failed to load crawl plan " + yamlFile, ex);
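Worth noting: SnakeYAML's loadAs instantiates the target class reflectively, which is why CrawlPlan carries Lombok's @NoArgsConstructor in the hunk further up. A usage sketch for the renamed loader; the demo class and the plan file name are assumptions, while CrawlPlanLoader.load(Path) and the public jobSpec field both appear in this diff:

    import nu.marginalia.crawling.common.plan.CrawlPlan;
    import nu.marginalia.crawling.common.plan.CrawlPlanLoader;

    import java.io.IOException;
    import java.nio.file.Path;

    // Hypothetical demo class, not part of this commit.
    class LoadPlanDemo {
        public static void main(String[] args) throws IOException {
            CrawlPlan plan = new CrawlPlanLoader().load(Path.of("crawl-plan.yaml"));
            System.out.println("Job spec: " + plan.jobSpec); // public field per the diff
        }
    }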
@@ -3,11 +3,8 @@ package nu.marginalia.crawling.common.plan;
 import com.github.luben.zstd.ZstdInputStream;
 import com.google.gson.Gson;
-import com.google.gson.JsonStreamParser;
-import com.google.gson.stream.JsonReader;
 import nu.marginalia.crawling.common.AbortMonitor;
 import nu.marginalia.crawling.model.CrawlingSpecification;
 import nu.marginalia.model.gson.GsonFactory;
-import org.apache.logging.log4j.util.Strings;
 
 import java.io.BufferedReader;
 import java.io.FileInputStream;
@@ -1,4 +1,4 @@
 # Converting Models
 
-Contains models shared by the [converting-process](../../crawl/converting-process/) and
-[loading-process](../../crawl/loading-process/).
+Contains models shared by the [converting-process](../../crawl-processes/converting-process/) and
+[loading-process](../../crawl-processes/loading-process/).
@@ -1,7 +1,7 @@
 # Crawling Models
 
-Contains models shared by the [crawling-process](../../crawl/crawling-process/) and
-[converting-process](../../crawl/converting-process/).
+Contains models shared by the [crawling-process](../../crawl-processes/crawling-process/) and
+[converting-process](../../crawl-processes/converting-process/).
 
 ## Central Classes
 
@@ -1,4 +0,0 @@
-package nu.marginalia.crawling.model;
-
-public record CrawlLogEntry(String id, String ts, String path, int cnt) {
-}
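The deleted record's replacement is not shown in this diff, but the new nu.marginalia.work_log imports and the id()/path() calls in the hunks above imply a near-identical record in the new work-log module. A presumed sketch, with the field set assumed to carry over unchanged:

    package nu.marginalia.work_log;

    // Presumed shape of the record that supersedes CrawlLogEntry: the
    // id() and path() accessors are exercised elsewhere in this diff;
    // the ts and cnt fields are assumed to carry over from the deleted record.
    public record WorkLogEntry(String id, String ts, String path, int cnt) {
    }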
@@ -40,7 +40,9 @@ dependencies {
     implementation project(':code:features-crawl:adblock')
     implementation project(':code:features-crawl:pubdate')
     implementation project(':code:features-crawl:topic-detection')
-
+    implementation project(':code:features-crawl:crawl-blocklist')
+    implementation project(':code:features-crawl:link-parser')
+    implementation project(':code:features-crawl:work-log')
     implementation libs.lombok
     annotationProcessor libs.lombok
     implementation libs.bundles.slf4j
@@ -4,9 +4,9 @@ import com.google.gson.Gson;
 import com.google.inject.Guice;
 import com.google.inject.Inject;
 import com.google.inject.Injector;
-import nu.marginalia.crawling.common.WorkLog;
+import nu.marginalia.work_log.WorkLog;
 import nu.marginalia.crawling.common.plan.CrawlPlanLoader;
-import nu.marginalia.crawling.common.plan.EdgeCrawlPlan;
+import nu.marginalia.crawling.common.plan.CrawlPlan;
 import nu.marginalia.converting.compiler.InstructionsCompiler;
 import nu.marginalia.converting.instruction.Instruction;
 import nu.marginalia.converting.processor.DomainProcessor;
@@ -41,7 +41,7 @@ public class ConverterMain {
 
     @Inject
     public ConverterMain(
-            EdgeCrawlPlan plan,
+            CrawlPlan plan,
             DomainProcessor processor,
             InstructionsCompiler compiler,
             Gson gson
@@ -5,19 +5,19 @@ import com.google.inject.AbstractModule;
 import com.google.inject.name.Names;
 import nu.marginalia.LanguageModels;
 import nu.marginalia.WmsaHome;
-import nu.marginalia.crawling.common.plan.EdgeCrawlPlan;
+import nu.marginalia.crawling.common.plan.CrawlPlan;
 import nu.marginalia.model.gson.GsonFactory;
 
 public class ConverterModule extends AbstractModule {
 
-    private final EdgeCrawlPlan plan;
+    private final CrawlPlan plan;
 
-    public ConverterModule(EdgeCrawlPlan plan) {
+    public ConverterModule(CrawlPlan plan) {
         this.plan = plan;
     }
 
     public void configure() {
-        bind(EdgeCrawlPlan.class).toInstance(plan);
+        bind(CrawlPlan.class).toInstance(plan);
 
         bind(Gson.class).toInstance(createGson());
 
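For orientation, this is roughly how the binding above meets ConverterMain's @Inject constructor: the plan is loaded, handed to the module, and Guice supplies it wherever a CrawlPlan is injected. A sketch using standard Guice calls; the bootstrap class and argument handling are assumptions, and the imports for ConverterMain/ConverterModule are omitted because their packages are not shown in this diff:

    import com.google.inject.Guice;
    import com.google.inject.Injector;
    import nu.marginalia.crawling.common.plan.CrawlPlan;
    import nu.marginalia.crawling.common.plan.CrawlPlanLoader;

    import java.io.IOException;
    import java.nio.file.Path;

    // Hypothetical bootstrap, not part of this commit.
    class ConverterBootstrapSketch {
        public static void main(String[] args) throws IOException {
            // Load the plan and hand it to the module, which binds the
            // CrawlPlan instance so Guice can satisfy ConverterMain's
            // @Inject constructor (see the hunks above).
            CrawlPlan plan = new CrawlPlanLoader().load(Path.of(args[0]));
            Injector injector = Guice.createInjector(new ConverterModule(plan));
            ConverterMain converter = injector.getInstance(ConverterMain.class);
        }
    }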
@@ -1,6 +1,6 @@
 package nu.marginalia.converting.processor.logic;
 
-import nu.marginalia.crawling.common.link.LinkParser;
+import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.model.EdgeUrl;
 import org.jsoup.nodes.Element;
 import org.slf4j.Logger;
@@ -1,7 +1,7 @@
 package nu.marginalia.converting.processor.logic.links;
 
 import nu.marginalia.converting.model.ProcessedDocumentDetails;
-import nu.marginalia.crawling.common.blocklist.UrlBlocklist;
+import nu.marginalia.ip_blocklist.UrlBlocklist;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 
@@ -4,7 +4,7 @@ import com.google.inject.Inject;
 import com.google.inject.name.Named;
 import nu.marginalia.converting.processor.logic.links.LinkProcessor;
 import nu.marginalia.converting.processor.logic.summary.SummaryExtractor;
-import nu.marginalia.crawling.common.link.LinkParser;
+import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawledDomain;
 import nu.marginalia.converting.processor.keywords.DocumentKeywordExtractor;
Some files were not shown because too many files have changed in this diff.