diff --git a/code/common/config/java/nu/marginalia/storage/model/FileStorageType.java b/code/common/config/java/nu/marginalia/storage/model/FileStorageType.java
index e3d48731..2a160b61 100644
--- a/code/common/config/java/nu/marginalia/storage/model/FileStorageType.java
+++ b/code/common/config/java/nu/marginalia/storage/model/FileStorageType.java
@@ -1,7 +1,9 @@
 package nu.marginalia.storage.model;
 
 public enum FileStorageType {
-    CRAWL_SPEC,
+    @Deprecated
+    CRAWL_SPEC, //
+
     CRAWL_DATA,
     PROCESSED_DATA,
     BACKUP,
diff --git a/code/execution/api/java/nu/marginalia/executor/client/ExecutorCrawlClient.java b/code/execution/api/java/nu/marginalia/executor/client/ExecutorCrawlClient.java
index 25610892..b170463a 100644
--- a/code/execution/api/java/nu/marginalia/executor/client/ExecutorCrawlClient.java
+++ b/code/execution/api/java/nu/marginalia/executor/client/ExecutorCrawlClient.java
@@ -10,9 +10,10 @@ import nu.marginalia.service.discovery.property.ServicePartition;
 import nu.marginalia.storage.model.FileStorageId;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+
 import java.util.List;
 
-import static nu.marginalia.functions.execution.api.ExecutorCrawlApiGrpc.*;
+import static nu.marginalia.functions.execution.api.ExecutorCrawlApiGrpc.ExecutorCrawlApiBlockingStub;
 
 @Singleton
 public class ExecutorCrawlClient {
@@ -36,14 +37,6 @@ public class ExecutorCrawlClient {
                 .build());
     }
 
-    public void triggerRecrawl(int node, FileStorageId fid) {
-        channelPool.call(ExecutorCrawlApiBlockingStub::triggerRecrawl)
-                .forNode(node)
-                .run(RpcFileStorageId.newBuilder()
-                        .setFileStorageId(fid.id())
-                        .build());
-    }
-
     public void triggerRecrawlSingleDomain(int node, FileStorageId fid, String domainName) {
         channelPool.call(ExecutorCrawlApiBlockingStub::triggerSingleDomainRecrawl)
                 .forNode(node)
diff --git a/code/execution/api/src/main/protobuf/executor-api.proto b/code/execution/api/src/main/protobuf/executor-api.proto
index 2858d60b..667fe70d 100644
--- a/code/execution/api/src/main/protobuf/executor-api.proto
+++ b/code/execution/api/src/main/protobuf/executor-api.proto
@@ -21,7 +21,6 @@ service ExecutorApi {
 
 service ExecutorCrawlApi {
   rpc triggerCrawl(RpcFileStorageId) returns (Empty) {}
-  rpc triggerRecrawl(RpcFileStorageId) returns (Empty) {}
   rpc triggerSingleDomainRecrawl(RpcFileStorageIdWithDomainName) returns (Empty) {}
   rpc triggerConvert(RpcFileStorageId) returns (Empty) {}
   rpc triggerConvertAndLoad(RpcFileStorageId) returns (Empty) {}
diff --git a/code/execution/data-extractors/java/nu/marginalia/extractor/AtagExporter.java b/code/execution/data-extractors/java/nu/marginalia/extractor/AtagExporter.java
index d2f2c91b..3cf001d3 100644
--- a/code/execution/data-extractors/java/nu/marginalia/extractor/AtagExporter.java
+++ b/code/execution/data-extractors/java/nu/marginalia/extractor/AtagExporter.java
@@ -4,8 +4,8 @@ import com.google.inject.Inject;
 import gnu.trove.set.hash.TLongHashSet;
 import lombok.SneakyThrows;
 import nu.marginalia.hash.MurmurHash3_128;
-import nu.marginalia.io.crawldata.CrawledDomainReader;
-import nu.marginalia.io.crawldata.SerializableCrawlDataStream;
+import nu.marginalia.io.CrawledDomainReader;
+import nu.marginalia.io.SerializableCrawlDataStream;
 import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
diff --git a/code/execution/data-extractors/java/nu/marginalia/extractor/FeedExporter.java b/code/execution/data-extractors/java/nu/marginalia/extractor/FeedExporter.java
index 547b810b..999fc734 100644
--- a/code/execution/data-extractors/java/nu/marginalia/extractor/FeedExporter.java
+++ b/code/execution/data-extractors/java/nu/marginalia/extractor/FeedExporter.java
@@ -2,8 +2,8 @@ package nu.marginalia.extractor;
 
 import com.google.inject.Inject;
 import lombok.SneakyThrows;
-import nu.marginalia.io.crawldata.CrawledDomainReader;
-import nu.marginalia.io.crawldata.SerializableCrawlDataStream;
+import nu.marginalia.io.CrawledDomainReader;
+import nu.marginalia.io.SerializableCrawlDataStream;
 import nu.marginalia.link_parser.FeedExtractor;
 import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.model.EdgeDomain;
diff --git a/code/execution/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java b/code/execution/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java
index 2545d666..d5d8184c 100644
--- a/code/execution/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java
+++ b/code/execution/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java
@@ -5,7 +5,7 @@ import gnu.trove.map.hash.TLongIntHashMap;
 import gnu.trove.set.hash.TLongHashSet;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
-import nu.marginalia.io.crawldata.CrawledDomainReader;
+import nu.marginalia.io.CrawledDomainReader;
 import nu.marginalia.language.filter.LanguageFilter;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.language.sentence.SentenceExtractor;
diff --git a/code/execution/java/nu/marginalia/actor/ExecutorActorControlService.java b/code/execution/java/nu/marginalia/actor/ExecutorActorControlService.java
index 5975a54a..836382b2 100644
--- a/code/execution/java/nu/marginalia/actor/ExecutorActorControlService.java
+++ b/code/execution/java/nu/marginalia/actor/ExecutorActorControlService.java
@@ -33,7 +33,6 @@ public class ExecutorActorControlService {
                                        ConvertActor convertActor,
                                        ConvertAndLoadActor convertAndLoadActor,
                                        CrawlActor crawlActor,
-                                       RecrawlActor recrawlActor,
                                        RecrawlSingleDomainActor recrawlSingleDomainActor,
                                        RestoreBackupActor restoreBackupActor,
                                        ConverterMonitorActor converterMonitorFSM,
@@ -43,7 +42,6 @@ public class ExecutorActorControlService {
                                        FileStorageMonitorActor fileStorageMonitorActor,
                                        IndexConstructorMonitorActor indexConstructorMonitorActor,
                                        TriggerAdjacencyCalculationActor triggerAdjacencyCalculationActor,
-                                       CrawlJobExtractorActor crawlJobExtractorActor,
                                        ExportDataActor exportDataActor,
                                        ExportAtagsActor exportAtagsActor,
                                        ExportFeedsActor exportFeedsActor,
@@ -59,7 +57,6 @@ public class ExecutorActorControlService {
        this.node = baseServiceParams.configuration.node();
 
        register(ExecutorActor.CRAWL, crawlActor);
-       register(ExecutorActor.RECRAWL, recrawlActor);
        register(ExecutorActor.RECRAWL_SINGLE_DOMAIN, recrawlSingleDomainActor);
 
        register(ExecutorActor.CONVERT, convertActor);
@@ -75,7 +72,6 @@ public class ExecutorActorControlService {
        register(ExecutorActor.MONITOR_FILE_STORAGE, fileStorageMonitorActor);
        register(ExecutorActor.ADJACENCY_CALCULATION, triggerAdjacencyCalculationActor);
 
-       register(ExecutorActor.CRAWL_JOB_EXTRACTOR, crawlJobExtractorActor);
        register(ExecutorActor.EXPORT_DATA, exportDataActor);
 
        register(ExecutorActor.EXPORT_ATAGS, exportAtagsActor);
diff --git a/code/execution/java/nu/marginalia/actor/task/CrawlActor.java b/code/execution/java/nu/marginalia/actor/task/CrawlActor.java
index 0a742888..8b48200e 100644
--- a/code/execution/java/nu/marginalia/actor/task/CrawlActor.java
+++ b/code/execution/java/nu/marginalia/actor/task/CrawlActor.java
@@ -3,86 +3,118 @@ package nu.marginalia.actor.task;
 import com.google.gson.Gson;
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
+import nu.marginalia.actor.ExecutorActor;
+import nu.marginalia.actor.ExecutorActorStateMachines;
 import nu.marginalia.actor.prototype.RecordActorPrototype;
 import nu.marginalia.actor.state.ActorResumeBehavior;
 import nu.marginalia.actor.state.ActorStep;
 import nu.marginalia.actor.state.Resume;
+import nu.marginalia.mq.MqMessageState;
+import nu.marginalia.mq.outbox.MqOutbox;
+import nu.marginalia.mqapi.crawling.CrawlRequest;
 import nu.marginalia.process.ProcessOutboxes;
 import nu.marginalia.process.ProcessService;
 import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.storage.model.FileStorageId;
 import nu.marginalia.storage.model.FileStorageType;
-import nu.marginalia.mq.MqMessageState;
-import nu.marginalia.mq.outbox.MqOutbox;
-import nu.marginalia.mqapi.crawling.CrawlRequest;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+import nu.marginalia.svc.DomainListRefreshService;
 
-import java.util.List;
+import java.nio.file.Files;
+import java.nio.file.Path;
 
 @Singleton
 public class CrawlActor extends RecordActorPrototype {
 
     private final MqOutbox mqCrawlerOutbox;
     private final FileStorageService storageService;
-    private final Logger logger = LoggerFactory.getLogger(getClass());
-
+    private final DomainListRefreshService refreshService;
     private final ActorProcessWatcher processWatcher;
+    private final ExecutorActorStateMachines executorActorStateMachines;
 
-    public record Initial(FileStorageId storageId) implements ActorStep {}
+    /** Initial step
+     * @param storageId - the id of the storage to recrawl
+     * @param cascadeLoad - whether to automatically start the convert and load actor after the crawl
+     */
+    public record Initial(FileStorageId storageId, boolean cascadeLoad) implements ActorStep {}
+
+    /** The action step */
     @Resume(behavior = ActorResumeBehavior.RETRY)
-    public record Crawl(long messageId) implements ActorStep {}
+    public record Crawl(long messageId, FileStorageId fid, boolean cascadeLoad) implements ActorStep {}
 
     @Override
     public ActorStep transition(ActorStep self) throws Exception {
         return switch (self) {
-            case Initial (FileStorageId fid) -> {
-                var storage = storageService.getStorage(fid);
-
-                if (storage == null) yield new Error("Bad storage id");
-                if (storage.type() != FileStorageType.CRAWL_SPEC) yield new Error("Bad storage type " + storage.type());
-
+            case Initial (FileStorageId fid, boolean cascadeLoad) when fid.id() < 0 -> {
+                // Allocate a new storage area if we aren't given one
                 var dataArea = storageService.allocateStorage(
                         FileStorageType.CRAWL_DATA,
                         "crawl-data",
-                        storage.description());
+                        "Crawl data");
 
-                storageService.relateFileStorages(storage.id(), dataArea.id());
-
-                // Send convert request
-                long msgId = mqCrawlerOutbox.sendAsync(
-                        CrawlRequest.forSpec(fid, dataArea.id())
-                );
-
-                yield new Crawl(msgId);
+                // Now we can jump to the main track
+                yield new Initial(dataArea.id(), cascadeLoad);
             }
-            case Crawl(long msgId) -> {
-                var rsp = processWatcher.waitResponse(mqCrawlerOutbox, ProcessService.ProcessId.CRAWLER, msgId);
+            case Initial (FileStorageId fid, boolean cascadeLoad) -> {
+                var crawlStorage = storageService.getStorage(fid);
 
-                if (rsp.state() != MqMessageState.OK)
+                if (crawlStorage == null) yield new Error("Bad storage id");
+                if (crawlStorage.type() != FileStorageType.CRAWL_DATA) yield new Error("Bad storage type " + crawlStorage.type());
+
+                Path crawlLogPath = crawlStorage.asPath().resolve("crawler.log");
+                if (Files.exists(crawlLogPath)) {
+                    // Save the old crawl log
+                    Path crawlLogBackup = crawlStorage.asPath().resolve("crawler.log-" + System.currentTimeMillis());
+                    Files.move(crawlLogPath, crawlLogBackup);
+                }
+
+                refreshService.synchronizeDomainList();
+
+                long id = mqCrawlerOutbox.sendAsync(CrawlRequest.forFullCrawl(fid));
+
+                yield new Crawl(id, fid, cascadeLoad);
+            }
+            case Crawl (long msgId, FileStorageId fid, boolean cascadeLoad) -> {
+                var rsp = processWatcher.waitResponse(
+                        mqCrawlerOutbox,
+                        ProcessService.ProcessId.CRAWLER,
+                        msgId);
+
+                if (rsp.state() != MqMessageState.OK) {
                     yield new Error("Crawler failed");
+                }
+
+                if (cascadeLoad) {
+                    // Spawn the convert and load actor
+                    executorActorStateMachines.initFrom(ExecutorActor.CONVERT_AND_LOAD,
+                            new ConvertAndLoadActor.Initial(fid));
+                }
 
                 yield new End();
             }
-            default -> new Error();
+            default -> new End();
         };
     }
 
     @Override
     public String describe() {
-        return "Run the crawler with the given crawl spec using no previous crawl data for a reference";
+        return "Run the crawler with the given crawl spec using previous crawl data for a reference";
     }
 
     @Inject
-    public CrawlActor(ProcessOutboxes processOutboxes,
+    public CrawlActor(ActorProcessWatcher processWatcher,
+                      ProcessOutboxes processOutboxes,
                       FileStorageService storageService,
-                      Gson gson,
-                      ActorProcessWatcher processWatcher)
+                      DomainListRefreshService refreshService,
+                      ExecutorActorStateMachines executorActorStateMachines,
+                      Gson gson)
     {
         super(gson);
+
+        this.processWatcher = processWatcher;
         this.mqCrawlerOutbox = processOutboxes.getCrawlerOutbox();
         this.storageService = storageService;
-        this.processWatcher = processWatcher;
+        this.refreshService = refreshService;
+        this.executorActorStateMachines = executorActorStateMachines;
     }
 }
diff --git a/code/execution/java/nu/marginalia/actor/task/CrawlJobExtractorActor.java b/code/execution/java/nu/marginalia/actor/task/CrawlJobExtractorActor.java
deleted file mode 100644
index 27ed6a08..00000000
--- a/code/execution/java/nu/marginalia/actor/task/CrawlJobExtractorActor.java
+++ /dev/null
@@ -1,83 +0,0 @@
-package nu.marginalia.actor.task;
-
-import com.google.gson.Gson;
-import com.google.inject.Inject;
-import com.google.inject.Singleton;
-import nu.marginalia.actor.prototype.RecordActorPrototype;
-import nu.marginalia.actor.state.ActorStep;
-import nu.marginalia.crawlspec.CrawlSpecFileNames;
-import nu.marginalia.storage.FileStorageService;
-import nu.marginalia.storage.model.FileStorageType;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.net.URL;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.StandardOpenOption;
-
-import static nu.marginalia.crawlspec.CrawlSpecGenerator.*;
-
-@Singleton
-public class CrawlJobExtractorActor extends RecordActorPrototype {
-
-    private static final int INITIAL_URLS_PER_DOMAIN = Integer.getInteger("crawler.initialUrlsPerDomain", 250);
-    private final Logger logger = LoggerFactory.getLogger(getClass());
-    private final FileStorageService fileStorageService;
-
-    @Inject
-    public CrawlJobExtractorActor(Gson gson,
-                                  FileStorageService fileStorageService
-                                  ) {
-        super(gson);
-        this.fileStorageService = fileStorageService;
-    }
-
-    public record CreateFromUrl(String description, String url) implements ActorStep {}
-
-    @Override
-    public ActorStep transition(ActorStep self) throws Exception {
-        return switch (self) {
-            case CreateFromUrl(String description, String url) -> {
-                var storage = fileStorageService.allocateStorage(FileStorageType.CRAWL_SPEC, "crawl-spec", description);
-
-                Path urlsTxt = storage.asPath().resolve("urls.txt");
-
-                try {
-                    downloadToFile(url, urlsTxt);
-                }
-                catch (IOException ex) {
-                    fileStorageService.flagFileForDeletion(storage.id());
-                    yield new Error("Error downloading " + url);
-                }
-
-                final Path path = CrawlSpecFileNames.resolve(storage);
-
-                generateCrawlSpec(
-                        path,
-                        DomainSource.fromFile(urlsTxt),
-                        KnownUrlsCountSource.fixed(INITIAL_URLS_PER_DOMAIN),
-                        KnownUrlsListSource.justIndex()
-                );
-
-                yield new End();
-            }
-            default -> new Error();
-        };
-    }
-
-    private void downloadToFile(String url, Path urlsTxt) throws IOException {
-        try (var os = Files.newOutputStream(urlsTxt, StandardOpenOption.CREATE_NEW);
-             var is = new URL(url).openStream())
-        {
-            is.transferTo(os);
-        }
-    }
-
-    @Override
-    public String describe() {
-        return "Run the crawler job extractor process";
-    }
-
-}
diff --git a/code/execution/java/nu/marginalia/actor/task/RecrawlActor.java b/code/execution/java/nu/marginalia/actor/task/RecrawlActor.java
deleted file mode 100644
index 0eefd4ef..00000000
--- a/code/execution/java/nu/marginalia/actor/task/RecrawlActor.java
+++ /dev/null
@@ -1,110 +0,0 @@
-package nu.marginalia.actor.task;
-
-import com.google.gson.Gson;
-import com.google.inject.Inject;
-import com.google.inject.Singleton;
-import nu.marginalia.actor.ExecutorActor;
-import nu.marginalia.actor.ExecutorActorStateMachines;
-import nu.marginalia.actor.prototype.RecordActorPrototype;
-import nu.marginalia.actor.state.ActorResumeBehavior;
-import nu.marginalia.actor.state.ActorStep;
-import nu.marginalia.actor.state.Resume;
-import nu.marginalia.process.ProcessOutboxes;
-import nu.marginalia.process.ProcessService;
-import nu.marginalia.storage.FileStorageService;
-import nu.marginalia.storage.model.FileStorageId;
-import nu.marginalia.storage.model.FileStorageType;
-import nu.marginalia.mq.MqMessageState;
-import nu.marginalia.mq.outbox.MqOutbox;
-import nu.marginalia.mqapi.crawling.CrawlRequest;
-import nu.marginalia.svc.DomainListRefreshService;
-
-import java.nio.file.Files;
-import java.nio.file.Path;
-
-@Singleton
-public class RecrawlActor extends RecordActorPrototype {
-
-    private final MqOutbox mqCrawlerOutbox;
-    private final FileStorageService storageService;
-    private final DomainListRefreshService refreshService;
-    private final ActorProcessWatcher processWatcher;
-    private final ExecutorActorStateMachines executorActorStateMachines;
-
-    /** Initial step
-     * @param storageId - the id of the storage to recrawl
-     * @param cascadeLoad - whether to automatically start the convert and load actor after the crawl
-     */
-    public record Initial(FileStorageId storageId, boolean cascadeLoad) implements ActorStep {}
-
-    /** The action step */
-    @Resume(behavior = ActorResumeBehavior.RETRY)
-    public record Crawl(long messageId, FileStorageId fid, boolean cascadeLoad) implements ActorStep {}
-
-    @Override
-    public ActorStep transition(ActorStep self) throws Exception {
-        return switch (self) {
-            case Initial (FileStorageId fid, boolean cascadeLoad) -> {
-                var crawlStorage = storageService.getStorage(fid);
-
-                if (crawlStorage == null) yield new Error("Bad storage id");
-                if (crawlStorage.type() != FileStorageType.CRAWL_DATA) yield new Error("Bad storage type " + crawlStorage.type());
-
-                Path crawlLogPath = crawlStorage.asPath().resolve("crawler.log");
-                if (Files.exists(crawlLogPath)) {
-                    // Save the old crawl log
-                    Path crawlLogBackup = crawlStorage.asPath().resolve("crawler.log-" + System.currentTimeMillis());
-                    Files.move(crawlLogPath, crawlLogBackup);
-                }
-
-                refreshService.synchronizeDomainList();
-
-                long id = mqCrawlerOutbox.sendAsync(CrawlRequest.forRecrawl(fid));
-
-                yield new Crawl(id, fid, cascadeLoad);
-            }
-            case Crawl (long msgId, FileStorageId fid, boolean cascadeLoad) -> {
-                var rsp = processWatcher.waitResponse(
-                        mqCrawlerOutbox,
-                        ProcessService.ProcessId.CRAWLER,
-                        msgId);
-
-                if (rsp.state() != MqMessageState.OK) {
-                    yield new Error("Crawler failed");
-                }
-
-                if (cascadeLoad) {
-                    // Spawn the convert and load actor
-                    executorActorStateMachines.initFrom(ExecutorActor.CONVERT_AND_LOAD,
-                            new ConvertAndLoadActor.Initial(fid));
-                }
-
-                yield new End();
-            }
-            default -> new End();
-        };
-    }
-
-    @Override
-    public String describe() {
-        return "Run the crawler with the given crawl spec using previous crawl data for a reference";
-    }
-
-    @Inject
-    public RecrawlActor(ActorProcessWatcher processWatcher,
-                        ProcessOutboxes processOutboxes,
-                        FileStorageService storageService,
-                        DomainListRefreshService refreshService,
-                        ExecutorActorStateMachines executorActorStateMachines,
-                        Gson gson)
-    {
-        super(gson);
-
-        this.processWatcher = processWatcher;
-        this.mqCrawlerOutbox = processOutboxes.getCrawlerOutbox();
-        this.storageService = storageService;
-        this.refreshService = refreshService;
-        this.executorActorStateMachines = executorActorStateMachines;
-    }
-
-}
diff --git a/code/execution/java/nu/marginalia/execution/ExecutorCrawlGrpcService.java b/code/execution/java/nu/marginalia/execution/ExecutorCrawlGrpcService.java
index e8f35f2e..b4895c87 100644
--- a/code/execution/java/nu/marginalia/execution/ExecutorCrawlGrpcService.java
+++ b/code/execution/java/nu/marginalia/execution/ExecutorCrawlGrpcService.java
@@ -4,7 +4,10 @@ import com.google.inject.Inject;
 import io.grpc.stub.StreamObserver;
 import nu.marginalia.actor.ExecutorActor;
 import nu.marginalia.actor.ExecutorActorControlService;
-import nu.marginalia.actor.task.*;
+import nu.marginalia.actor.task.ConvertActor;
+import nu.marginalia.actor.task.ConvertAndLoadActor;
+import nu.marginalia.actor.task.CrawlActor;
+import nu.marginalia.actor.task.RecrawlSingleDomainActor;
 import nu.marginalia.functions.execution.api.*;
 import nu.marginalia.service.server.DiscoverableService;
 import nu.marginalia.storage.model.FileStorageId;
@@ -27,21 +30,7 @@ public class ExecutorCrawlGrpcService
     public void triggerCrawl(RpcFileStorageId request, StreamObserver<Empty> responseObserver) {
         try {
             actorControlService.startFrom(ExecutorActor.CRAWL,
-                    new CrawlActor.Initial(FileStorageId.of(request.getFileStorageId())));
-
-            responseObserver.onNext(Empty.getDefaultInstance());
-            responseObserver.onCompleted();
-        }
-        catch (Exception e) {
-            responseObserver.onError(e);
-        }
-    }
-
-    @Override
-    public void triggerRecrawl(RpcFileStorageId request, StreamObserver<Empty> responseObserver) {
-        try {
-            actorControlService.startFrom(ExecutorActor.RECRAWL,
-                    new RecrawlActor.Initial(FileStorageId.of(request.getFileStorageId()), false));
+                    new CrawlActor.Initial(FileStorageId.of(request.getFileStorageId()), false));
 
             responseObserver.onNext(Empty.getDefaultInstance());
             responseObserver.onCompleted();
@@ -113,21 +102,4 @@ public class ExecutorCrawlGrpcService
         }
     }
 
-    @Override
-    public void createCrawlSpecFromDownload(RpcCrawlSpecFromDownload request, StreamObserver<Empty> responseObserver) {
-        try {
-            actorControlService.startFrom(ExecutorActor.CRAWL_JOB_EXTRACTOR,
-                    new CrawlJobExtractorActor.CreateFromUrl(
-                            request.getDescription(),
-                            request.getUrl())
-            );
-
-            responseObserver.onNext(Empty.getDefaultInstance());
-            responseObserver.onCompleted();
-        }
-        catch (Exception e) {
-            responseObserver.onError(e);
-        }
-    }
-
 }
diff --git a/code/processes/converting-process/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/java/nu/marginalia/converting/ConverterMain.java
index c45967bc..60a66102 100644
--- a/code/processes/converting-process/java/nu/marginalia/converting/ConverterMain.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/ConverterMain.java
@@ -14,8 +14,8 @@ import nu.marginalia.converting.sideload.SideloadSourceFactory;
 import nu.marginalia.converting.writer.ConverterBatchWritableIf;
 import nu.marginalia.converting.writer.ConverterBatchWriter;
 import nu.marginalia.converting.writer.ConverterWriter;
-import nu.marginalia.io.crawldata.CrawledDomainReader;
-import nu.marginalia.io.crawldata.SerializableCrawlDataStream;
+import nu.marginalia.io.CrawledDomainReader;
+import nu.marginalia.io.SerializableCrawlDataStream;
 import nu.marginalia.mq.MessageQueueFactory;
 import nu.marginalia.mq.MqMessage;
 import nu.marginalia.mq.inbox.MqInboxResponse;
diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java
index 0328709c..3a3286e5 100644
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java
@@ -16,7 +16,7 @@ import nu.marginalia.converting.writer.ConverterBatchWritableIf;
 import nu.marginalia.converting.writer.ConverterBatchWriter;
 import nu.marginalia.geoip.GeoIpDictionary;
 import nu.marginalia.geoip.sources.AsnTable;
-import nu.marginalia.io.crawldata.SerializableCrawlDataStream;
+import nu.marginalia.io.SerializableCrawlDataStream;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.crawl.DomainIndexingState;
 import nu.marginalia.model.crawldata.CrawledDocument;
diff --git a/code/processes/converting-process/test/nu/marginalia/converting/ConvertingIntegrationTest.java b/code/processes/converting-process/test/nu/marginalia/converting/ConvertingIntegrationTest.java
index 06b839eb..a2770f36 100644
--- a/code/processes/converting-process/test/nu/marginalia/converting/ConvertingIntegrationTest.java
+++ b/code/processes/converting-process/test/nu/marginalia/converting/ConvertingIntegrationTest.java
@@ -5,7 +5,7 @@ import com.google.inject.Guice;
 import com.google.inject.Injector;
 import nu.marginalia.converting.model.ProcessedDocument;
 import nu.marginalia.converting.processor.DomainProcessor;
-import nu.marginalia.io.crawldata.SerializableCrawlDataStream;
+import nu.marginalia.io.SerializableCrawlDataStream;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.crawl.DomainIndexingState;
 import nu.marginalia.model.crawl.PubDate;
diff --git a/code/processes/converting-process/test/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java b/code/processes/converting-process/test/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java
index 6ab40a90..df6a4c2c 100644
--- a/code/processes/converting-process/test/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java
+++ b/code/processes/converting-process/test/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java
@@ -12,13 +12,13 @@ import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
 import nu.marginalia.crawl.retreival.DomainProber;
+import nu.marginalia.crawl.spec.CrawlSpecProvider;
 import nu.marginalia.io.crawldata.format.ParquetSerializableCrawlDataStream;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.crawl.DomainIndexingState;
 import nu.marginalia.model.crawldata.CrawledDocument;
 import nu.marginalia.model.crawldata.CrawledDomain;
 import nu.marginalia.model.crawldata.SerializableCrawlData;
-import nu.marginalia.model.crawlspec.CrawlSpecRecord;
 import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
 import org.junit.jupiter.api.*;
 import org.slf4j.Logger;
@@ -77,11 +77,7 @@ public class CrawlingThenConvertingIntegrationTest {
     @Test
     public void testInvalidDomain() throws IOException {
         // Attempt to fetch an invalid domain
-        var specs = CrawlSpecRecord.builder()
-                .domain("invalid.invalid.invalid")
-                .crawlDepth(10)
-                .urls(List.of()) // add specific URLs to crawl here
-                .build();
+        var specs = new CrawlSpecProvider.CrawlSpecRecord("invalid.invalid.invalid", 10);
 
         CrawledDomain crawlData = crawl(specs);
 
@@ -97,11 +93,7 @@ public class CrawlingThenConvertingIntegrationTest {
     @Test
     public void testRedirectingDomain() throws IOException {
         // Attempt to fetch an invalid domain
-        var specs = CrawlSpecRecord.builder()
-                .domain("memex.marginalia.nu")
-                .crawlDepth(10)
-                .urls(List.of()) // add specific URLs to crawl here
-                .build();
+        var specs = new CrawlSpecProvider.CrawlSpecRecord("memex.marginalia.nu", 10);
 
         CrawledDomain crawlData = crawl(specs);
 
@@ -120,11 +112,7 @@ public class CrawlingThenConvertingIntegrationTest {
     @Test
     public void testBlockedDomain() throws IOException {
         // Attempt to fetch an invalid domain
-        var specs = CrawlSpecRecord.builder()
-                .domain("search.marginalia.nu")
-                .crawlDepth(10)
-                .urls(List.of()) // add specific URLs to crawl here
-                .build();
+        var specs = new CrawlSpecProvider.CrawlSpecRecord("search.marginalia.nu", 10);
 
         CrawledDomain crawlData = crawl(specs, d->false); // simulate blocking by blacklisting everything
 
@@ -140,11 +128,7 @@ public class CrawlingThenConvertingIntegrationTest {
 
     @Test
     public void crawlSunnyDay() throws IOException {
-        var specs = CrawlSpecRecord.builder()
-                .domain("www.marginalia.nu")
-                .crawlDepth(10)
-                .urls(List.of()) // add specific URLs to crawl here
-                .build();
+        var specs = new CrawlSpecProvider.CrawlSpecRecord("www.marginalia.nu", 10);
 
         CrawledDomain domain = crawl(specs);
         assertFalse(domain.doc.isEmpty());
@@ -177,14 +161,12 @@ public class CrawlingThenConvertingIntegrationTest {
 
     @Test
     public void crawlContentTypes() throws IOException {
-        var specs = CrawlSpecRecord.builder()
-                .domain("www.marginalia.nu")
-                .crawlDepth(5)
-                .urls(List.of(
+        var specs = new CrawlSpecProvider.CrawlSpecRecord("www.marginalia.nu", 10,
+                List.of(
                         "https://www.marginalia.nu/sanic.png",
                         "https://www.marginalia.nu/invalid"
-                ))
-                .build();
+                )
+        );
 
         CrawledDomain domain = crawl(specs);
         assertFalse(domain.doc.isEmpty());
@@ -217,13 +199,9 @@ public class CrawlingThenConvertingIntegrationTest {
 
     @Test
     public void crawlRobotsTxt() throws IOException {
-        var specs = CrawlSpecRecord.builder()
-                .domain("search.marginalia.nu")
-                .crawlDepth(5)
-                .urls(List.of(
-                        "https://search.marginalia.nu/search?q=hello+world"
-                ))
-                .build();
+        var specs = new CrawlSpecProvider.CrawlSpecRecord("search.marginalia.nu", 5,
+                List.of("https://search.marginalia.nu/search?q=hello+world")
+        );
 
         CrawledDomain domain = crawl(specs);
         assertFalse(domain.doc.isEmpty());
@@ -260,18 +238,18 @@ public class CrawlingThenConvertingIntegrationTest {
             return null; // unreachable
         }
     }
-    private CrawledDomain crawl(CrawlSpecRecord specs) throws IOException {
+    private CrawledDomain crawl(CrawlSpecProvider.CrawlSpecRecord specs) throws IOException {
         return crawl(specs, domain -> true);
     }
 
-    private CrawledDomain crawl(CrawlSpecRecord specs, Predicate<EdgeDomain> domainBlacklist) throws IOException {
+    private CrawledDomain crawl(CrawlSpecProvider.CrawlSpecRecord specs, Predicate<EdgeDomain> domainBlacklist) throws IOException {
         List<SerializableCrawlData> data = new ArrayList<>();
 
         try (var recorder = new WarcRecorder(fileName)) {
            new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, recorder).crawlDomain();
        }
 
-        CrawledDocumentParquetRecordFileWriter.convertWarc(specs.domain,
+        CrawledDocumentParquetRecordFileWriter.convertWarc(specs.domain(),
                new UserAgent("test", "test"),
                fileName, fileName2);
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java
index 81ebc3b7..4307952d 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java
@@ -17,15 +17,11 @@ import nu.marginalia.crawl.retreival.CrawlDataReference;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
 import nu.marginalia.crawl.retreival.DomainProber;
 import nu.marginalia.crawl.spec.CrawlSpecProvider;
-import nu.marginalia.crawl.spec.DbCrawlSpecProvider;
-import nu.marginalia.crawl.spec.ParquetCrawlSpecProvider;
 import nu.marginalia.crawl.warc.WarcArchiverFactory;
 import nu.marginalia.crawl.warc.WarcArchiverIf;
-import nu.marginalia.crawlspec.CrawlSpecFileNames;
-import nu.marginalia.io.crawldata.CrawledDomainReader;
-import nu.marginalia.io.crawldata.CrawlerOutputFile;
+import nu.marginalia.io.CrawledDomainReader;
+import nu.marginalia.io.CrawlerOutputFile;
 import nu.marginalia.model.EdgeDomain;
-import nu.marginalia.model.crawlspec.CrawlSpecRecord;
 import nu.marginalia.mq.MessageQueueFactory;
 import nu.marginalia.mq.MqMessage;
 import nu.marginalia.mq.inbox.MqInboxResponse;
@@ -66,7 +62,7 @@ public class CrawlerMain extends ProcessMainClass {
     private final MessageQueueFactory messageQueueFactory;
     private final DomainProber domainProber;
     private final FileStorageService fileStorageService;
-    private final DbCrawlSpecProvider dbCrawlSpecProvider;
+    private final CrawlSpecProvider crawlSpecProvider;
     private final AnchorTagsSourceFactory anchorTagsSourceFactory;
     private final WarcArchiverFactory warcArchiverFactory;
     private final Gson gson;
@@ -89,7 +85,7 @@ public class CrawlerMain extends ProcessMainClass {
                        MessageQueueFactory messageQueueFactory, DomainProber domainProber,
                        FileStorageService fileStorageService, ProcessConfiguration processConfiguration,
-                       DbCrawlSpecProvider dbCrawlSpecProvider,
+                       CrawlSpecProvider crawlSpecProvider,
                        AnchorTagsSourceFactory anchorTagsSourceFactory,
                        WarcArchiverFactory warcArchiverFactory,
                        Gson gson) {
@@ -98,7 +94,7 @@ public class CrawlerMain extends ProcessMainClass {
         this.messageQueueFactory = messageQueueFactory;
         this.domainProber = domainProber;
         this.fileStorageService = fileStorageService;
-        this.dbCrawlSpecProvider = dbCrawlSpecProvider;
+        this.crawlSpecProvider = crawlSpecProvider;
         this.anchorTagsSourceFactory = anchorTagsSourceFactory;
         this.warcArchiverFactory = warcArchiverFactory;
         this.gson = gson;
@@ -148,7 +144,7 @@ public class CrawlerMain extends ProcessMainClass {
                 crawler.runForSingleDomain(instructions.targetDomainName, instructions.outputDir);
             }
             else {
-                crawler.run(instructions.specProvider, instructions.outputDir);
+                crawler.run(instructions.outputDir);
             }
             instructions.ok();
         } catch (Exception ex) {
@@ -164,12 +160,12 @@ public class CrawlerMain extends ProcessMainClass {
         System.exit(0);
     }
 
-    public void run(CrawlSpecProvider specProvider, Path outputDir) throws Exception {
+    public void run(Path outputDir) throws Exception {
 
         heartbeat.start();
 
         // First a validation run to ensure the file is all good to parse
-        totalTasks = specProvider.totalCount();
+        totalTasks = crawlSpecProvider.totalCount();
         if (totalTasks == 0) {
             // This is an error state, and we should make noise about it
             throw new IllegalStateException("No crawl tasks found, refusing to continue");
@@ -178,18 +174,18 @@ public class CrawlerMain extends ProcessMainClass {
 
         try (WorkLog workLog = new WorkLog(outputDir.resolve("crawler.log"));
              WarcArchiverIf warcArchiver = warcArchiverFactory.get(outputDir);
-             AnchorTagsSource anchorTagsSource = anchorTagsSourceFactory.create(specProvider.getDomains())
+             AnchorTagsSource anchorTagsSource = anchorTagsSourceFactory.create(crawlSpecProvider.getDomains())
         ) {
             // Set the number of tasks done to the number of tasks that are already finished,
             // (this happens when the process is restarted after a crash or a shutdown)
             tasksDone.set(workLog.countFinishedJobs());
 
             // Process the crawl tasks
-            try (var specStream = specProvider.stream()) {
+            try (var specStream = crawlSpecProvider.stream()) {
                 specStream
                         .takeWhile((e) -> abortMonitor.isAlive())
-                        .filter(e -> !workLog.isJobFinished(e.domain))
-                        .filter(e -> processingIds.put(e.domain, "") == null)
+                        .filter(e -> !workLog.isJobFinished(e.domain()))
+                        .filter(e -> processingIds.put(e.domain(), "") == null)
                        .map(e -> new CrawlTask(e, anchorTagsSource, outputDir, warcArchiver, workLog))
                        .forEach(pool::submitQuietly);
            }
@@ -226,7 +222,7 @@ public class CrawlerMain extends ProcessMainClass {
             WarcArchiverIf warcArchiver = warcArchiverFactory.get(outputDir);
             AnchorTagsSource anchorTagsSource = anchorTagsSourceFactory.create(List.of(new EdgeDomain(targetDomainName)))
        ) {
-            var spec = new CrawlSpecRecord(targetDomainName, 1000, null);
+            var spec = new CrawlSpecProvider.CrawlSpecRecord(targetDomainName, 1000, null);
            var task = new CrawlTask(spec, anchorTagsSource, outputDir, warcArchiver, workLog);
            task.run();
        }
@@ -240,7 +236,7 @@ public class CrawlerMain extends ProcessMainClass {
 
     class CrawlTask implements SimpleBlockingThreadPool.Task {
 
-        private final CrawlSpecRecord specification;
+        private final CrawlSpecProvider.CrawlSpecRecord specification;
 
         private final String domain;
         private final String id;
@@ -250,7 +246,7 @@ public class CrawlerMain extends ProcessMainClass {
         private final WarcArchiverIf warcArchiver;
         private final WorkLog workLog;
 
-        CrawlTask(CrawlSpecRecord specification,
+        CrawlTask(CrawlSpecProvider.CrawlSpecRecord specification,
                  AnchorTagsSource anchorTagsSource,
                  Path outputDir,
                  WarcArchiverIf warcArchiver,
@@ -262,7 +258,7 @@ public class CrawlerMain extends ProcessMainClass {
             this.warcArchiver = warcArchiver;
             this.workLog = workLog;
 
-            this.domain = specification.domain;
+            this.domain = specification.domain();
 
            this.id = Integer.toHexString(domain.hashCode());
        }
@@ -280,7 +276,7 @@ public class CrawlerMain extends ProcessMainClass {
                 Files.deleteIfExists(tempFile);
             }
 
-            var domainLock = domainLocks.getSemaphore(new EdgeDomain(specification.domain));
+            var domainLock = domainLocks.getSemaphore(new EdgeDomain(specification.domain()));
 
            try (var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now
                 var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, warcRecorder);
@@ -336,7 +332,7 @@ public class CrawlerMain extends ProcessMainClass {
             try {
                 return new CrawlDataReference(CrawledDomainReader.createDataStream(outputDir, domain, id));
             } catch (IOException e) {
-                logger.debug("Failed to read previous crawl data for {}", specification.domain);
+                logger.debug("Failed to read previous crawl data for {}", specification.domain());
                return new CrawlDataReference();
            }
        }
@@ -346,22 +342,19 @@ public class CrawlerMain extends ProcessMainClass {
 
     private static class CrawlRequest {
 
-        private final CrawlSpecProvider specProvider;
         private final Path outputDir;
         private final MqMessage message;
         private final MqSingleShotInbox inbox;
 
         private final String targetDomainName;
 
-        CrawlRequest(CrawlSpecProvider specProvider,
-                     String targetDomainName,
+        CrawlRequest(String targetDomainName,
                     Path outputDir,
                     MqMessage message,
                     MqSingleShotInbox inbox)
        {
            this.message = message;
            this.inbox = inbox;
-            this.specProvider = specProvider;
            this.outputDir = outputDir;
            this.targetDomainName = targetDomainName;
        }
@@ -387,26 +380,9 @@ public class CrawlerMain extends ProcessMainClass {
 
         var request = gson.fromJson(msg.payload(), nu.marginalia.mqapi.crawling.CrawlRequest.class);
 
-        CrawlSpecProvider specProvider;
-
-        if (request.specStorage != null) {
-            var specData = fileStorageService.getStorage(request.specStorage);
-            var parquetProvider = new ParquetCrawlSpecProvider(CrawlSpecFileNames.resolve(specData));
-
-            // Ensure the parquet domains are loaded into the database to avoid
-            // rare data-loss scenarios
-            dbCrawlSpecProvider.ensureParquetDomainsLoaded(parquetProvider);
-
-            specProvider = parquetProvider;
-        }
-        else {
-            specProvider = dbCrawlSpecProvider;
-        }
-
         var crawlData = fileStorageService.getStorage(request.crawlStorage);
 
         return new CrawlRequest(
-                specProvider,
                request.targetDomainName,
                crawlData.asPath(),
                msg,
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlDataReference.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlDataReference.java
index 8b34cb77..b208867c 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlDataReference.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlDataReference.java
@@ -2,7 +2,7 @@ package nu.marginalia.crawl.retreival;
 
 import com.google.common.hash.HashFunction;
 import com.google.common.hash.Hashing;
-import nu.marginalia.io.crawldata.SerializableCrawlDataStream;
+import nu.marginalia.io.SerializableCrawlDataStream;
 import nu.marginalia.lsh.EasyLSH;
 import nu.marginalia.model.crawldata.CrawledDocument;
 import org.slf4j.Logger;
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
index ab6456de..3f180222 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
@@ -11,12 +11,12 @@ import nu.marginalia.crawl.logic.LinkFilterSelector;
 import nu.marginalia.crawl.retreival.revisit.CrawlerRevisitor;
 import nu.marginalia.crawl.retreival.revisit.DocumentWithReference;
 import nu.marginalia.crawl.retreival.sitemap.SitemapFetcher;
+import nu.marginalia.crawl.spec.CrawlSpecProvider;
 import nu.marginalia.ip_blocklist.UrlBlocklist;
 import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.HttpFetchResult;
-import nu.marginalia.model.crawlspec.CrawlSpecRecord;
 import org.jsoup.Jsoup;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -26,7 +26,6 @@ import java.net.InetAddress;
 import java.net.UnknownHostException;
 import java.nio.file.Path;
 import java.util.List;
-import java.util.Objects;
 import java.util.Optional;
 import java.util.concurrent.TimeUnit;
 
@@ -55,16 +54,16 @@ public class CrawlerRetreiver implements AutoCloseable {
 
     public CrawlerRetreiver(HttpFetcher fetcher,
                             DomainProber domainProber,
-                            CrawlSpecRecord specs,
+                            CrawlSpecProvider.CrawlSpecRecord specs,
                             WarcRecorder warcRecorder)
     {
         this.warcRecorder = warcRecorder;
         this.fetcher = fetcher;
         this.domainProber = domainProber;
 
-        domain = specs.domain;
+        domain = specs.domain();
 
-        crawlFrontier = new DomainCrawlFrontier(new EdgeDomain(domain), Objects.requireNonNullElse(specs.urls, List.of()), specs.crawlDepth);
+        crawlFrontier = new DomainCrawlFrontier(new EdgeDomain(domain), specs.urls(), specs.crawlDepth());
         crawlerRevisitor = new CrawlerRevisitor(crawlFrontier, this, warcRecorder);
 
         sitemapFetcher = new SitemapFetcher(crawlFrontier, fetcher.createSitemapRetriever());
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/spec/CrawlSpecProvider.java b/code/processes/crawling-process/java/nu/marginalia/crawl/spec/CrawlSpecProvider.java
index 240b4232..d19aa98c 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/spec/CrawlSpecProvider.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/spec/CrawlSpecProvider.java
@@ -1,16 +1,143 @@
 package nu.marginalia.crawl.spec;
 
+import com.google.inject.Inject;
+import com.zaxxer.hikari.HikariDataSource;
+import lombok.Builder;
+import lombok.SneakyThrows;
+import nu.marginalia.ProcessConfiguration;
+import nu.marginalia.db.DomainBlacklist;
 import nu.marginalia.model.EdgeDomain;
-import nu.marginalia.model.crawlspec.CrawlSpecRecord;
+import org.jetbrains.annotations.NotNull;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
+import java.util.ArrayList;
+import java.util.Collections;
 import java.util.List;
 import java.util.stream.Stream;
 
-public interface CrawlSpecProvider {
-    int totalCount() throws Exception;
-    Stream<CrawlSpecRecord> stream();
+public class CrawlSpecProvider {
+    private final HikariDataSource dataSource;
+    private final ProcessConfiguration processConfiguration;
+    private final DomainBlacklist blacklist;
 
-    default List<EdgeDomain> getDomains() {
-        return stream().map(CrawlSpecRecord::getDomain).map(EdgeDomain::new).toList();
+    private List<CrawlSpecRecord> domains;
+
+    private static final Logger logger = LoggerFactory.getLogger(CrawlSpecProvider.class);
+
+    private static final double URL_GROWTH_FACTOR = Double.parseDouble(System.getProperty("crawler.crawlSetGrowthFactor", "1.25"));
+    private static final int MIN_URLS_PER_DOMAIN = Integer.getInteger("crawler.minUrlsPerDomain", 100);
+    private static final int MID_URLS_PER_DOMAIN = Integer.getInteger("crawler.minUrlsPerDomain", 2_000);
+    private static final int MAX_URLS_PER_DOMAIN = Integer.getInteger("crawler.maxUrlsPerDomain", 10_000);
+
+    @Inject
+    public CrawlSpecProvider(HikariDataSource dataSource,
+                             ProcessConfiguration processConfiguration,
+                             DomainBlacklist blacklist
+                             ) {
+        this.dataSource = dataSource;
+        this.processConfiguration = processConfiguration;
+        this.blacklist = blacklist;
+    }
+
+    // Load the domains into memory to ensure the crawler is resilient to database blips
+    private List<CrawlSpecRecord> loadData() throws Exception {
+        var domains = new ArrayList<CrawlSpecRecord>();
+
+        logger.info("Loading domains to be crawled");
+
+        blacklist.waitUntilLoaded();
+
+        List<Integer> domainIds = new ArrayList<>(10_000);
+
+        try (var conn = dataSource.getConnection();
+             var assignFreeDomains = conn.prepareStatement("UPDATE EC_DOMAIN SET NODE_AFFINITY=? WHERE NODE_AFFINITY=0");
+             var query = conn.prepareStatement("""
+                     SELECT DOMAIN_NAME, COALESCE(KNOWN_URLS, 0), EC_DOMAIN.ID
+                     FROM EC_DOMAIN
+                     LEFT JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
+                     WHERE NODE_AFFINITY=? OR NODE_AFFINITY=0
+                     """)
+        )
+        {
+
+            // Assign any domains with node_affinity=0 to this node.  We must do this now, before we start crawling
+            // to avoid race conditions with other crawl runs.  We don't want multiple crawlers to crawl the same domain.
+            assignFreeDomains.setInt(1, processConfiguration.node());
+            assignFreeDomains.executeUpdate();
+
+            // Fetch the domains to be crawled
+            query.setInt(1, processConfiguration.node());
+            query.setFetchSize(10_000);
+            var rs = query.executeQuery();
+
+            while (rs.next()) {
+                // Skip blacklisted domains
+                int id = rs.getInt(3);
+                if (blacklist.isBlacklisted(id))
+                    continue;
+                domainIds.add(id);
+
+                int urls = rs.getInt(2);
+                double growthFactor;
+
+                if (urls < MID_URLS_PER_DOMAIN) {
+                    growthFactor = Math.max(2.5, URL_GROWTH_FACTOR);
+                }
+                else {
+                    growthFactor = URL_GROWTH_FACTOR;
+                }
+
+                int urlsToFetch = Math.clamp((int) (growthFactor * rs.getInt(2)), MIN_URLS_PER_DOMAIN, MAX_URLS_PER_DOMAIN);
+
+                var record = new CrawlSpecRecord(
+                        rs.getString(1),
+                        urlsToFetch,
+                        List.of()
+                );
+
+                domains.add(record);
+            }
+
+        }
+
+        logger.info("Loaded {} domains", domains.size());
+
+        // Shuffle the domains to ensure we get a good mix of domains in each crawl,
+        // so that e.g. the big domains don't get all crawled at once, or we end up
+        // crawling the same server in parallel from different subdomains...
+        Collections.shuffle(domains);
+
+        return domains;
+    }
+
+    public List<EdgeDomain> getDomains() {
+        return stream().map(CrawlSpecRecord::domain).map(EdgeDomain::new).toList();
+    }
+
+    public int totalCount() throws Exception {
+        if (domains == null) {
+            domains = loadData();
+        }
+        return domains.size();
+    }
+
+    @SneakyThrows
+    public Stream<CrawlSpecRecord> stream() {
+        if (domains == null) {
+            domains = loadData();
+        }
+
+        return domains.stream();
+    }
+
+
+    @Builder
+    public record CrawlSpecRecord(@NotNull String domain,
+                                  int crawlDepth,
+                                  @NotNull List<String> urls) {
+        public CrawlSpecRecord(String domain, int crawlDepth) {
+            this(domain, crawlDepth, List.of());
+        }
     }
 }
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/spec/DbCrawlSpecProvider.java b/code/processes/crawling-process/java/nu/marginalia/crawl/spec/DbCrawlSpecProvider.java
deleted file mode 100644
index b482182f..00000000
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/spec/DbCrawlSpecProvider.java
+++ /dev/null
@@ -1,164 +0,0 @@
-package nu.marginalia.crawl.spec;
-
-import com.google.inject.Inject;
-import com.zaxxer.hikari.HikariDataSource;
-import lombok.SneakyThrows;
-import nu.marginalia.ProcessConfiguration;
-import nu.marginalia.db.DomainBlacklist;
-import nu.marginalia.model.EdgeDomain;
-import nu.marginalia.model.crawlspec.CrawlSpecRecord;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.stream.Stream;
-
-public class DbCrawlSpecProvider implements CrawlSpecProvider {
-    private final HikariDataSource dataSource;
-    private final ProcessConfiguration processConfiguration;
-    private final DomainBlacklist blacklist;
-    private List<CrawlSpecRecord> domains;
-
-    private static final Logger logger = LoggerFactory.getLogger(DbCrawlSpecProvider.class);
-
-    private static final double URL_GROWTH_FACTOR = Double.parseDouble(System.getProperty("crawler.crawlSetGrowthFactor", "1.25"));
-    private static final int MIN_URLS_PER_DOMAIN = Integer.getInteger("crawler.minUrlsPerDomain", 100);
-    private static final int MID_URLS_PER_DOMAIN = Integer.getInteger("crawler.minUrlsPerDomain", 2_000);
-    private static final int MAX_URLS_PER_DOMAIN = Integer.getInteger("crawler.maxUrlsPerDomain", 10_000);
-
-    @Inject
-    public DbCrawlSpecProvider(HikariDataSource dataSource,
-                               ProcessConfiguration processConfiguration,
-                               DomainBlacklist blacklist
-                               ) {
-        this.dataSource = dataSource;
-        this.processConfiguration = processConfiguration;
-        this.blacklist = blacklist;
-    }
-
-    // Load the domains into memory to ensure the crawler is resilient to database blips
-    private List<CrawlSpecRecord> loadData() throws Exception {
-        var domains = new ArrayList<CrawlSpecRecord>();
-
-        logger.info("Loading domains to be crawled");
-
-        blacklist.waitUntilLoaded();
-
-        List<Integer> domainIds = new ArrayList<>(10_000);
-
-        try (var conn = dataSource.getConnection();
-             var assignFreeDomains = conn.prepareStatement("UPDATE EC_DOMAIN SET NODE_AFFINITY=? WHERE NODE_AFFINITY=0");
-             var query = conn.prepareStatement("""
-                     SELECT DOMAIN_NAME, COALESCE(KNOWN_URLS, 0), EC_DOMAIN.ID
-                     FROM EC_DOMAIN
-                     LEFT JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
-                     WHERE NODE_AFFINITY=? OR NODE_AFFINITY=0
-                     """)
-        )
-        {
-
-            // Assign any domains with node_affinity=0 to this node.  We must do this now, before we start crawling
-            // to avoid race conditions with other crawl runs.  We don't want multiple crawlers to crawl the same domain.
-            assignFreeDomains.setInt(1, processConfiguration.node());
-            assignFreeDomains.executeUpdate();
-
-            // Fetch the domains to be crawled
-            query.setInt(1, processConfiguration.node());
-            query.setFetchSize(10_000);
-            var rs = query.executeQuery();
-
-            while (rs.next()) {
-                // Skip blacklisted domains
-                int id = rs.getInt(3);
-                if (blacklist.isBlacklisted(id))
-                    continue;
-                domainIds.add(id);
-
-                int urls = rs.getInt(2);
-                double growthFactor;
-
-                if (urls < MID_URLS_PER_DOMAIN) {
-                    growthFactor = Math.max(2.5, URL_GROWTH_FACTOR);
-                }
-                else {
-                    growthFactor = URL_GROWTH_FACTOR;
-                }
-
-                int urlsToFetch = Math.clamp((int) (growthFactor * rs.getInt(2)), MIN_URLS_PER_DOMAIN, MAX_URLS_PER_DOMAIN);
-
-                var record = new CrawlSpecRecord(
-                        rs.getString(1),
-                        urlsToFetch,
-                        List.of()
-                );
-
-                domains.add(record);
-            }
-
-        }
-
-        logger.info("Loaded {} domains", domains.size());
-
-        // Shuffle the domains to ensure we get a good mix of domains in each crawl,
-        // so that e.g. the big domains don't get all crawled at once, or we end up
-        // crawling the same server in parallel from different subdomains...
-        Collections.shuffle(domains);
-
-        return domains;
-    }
-
-    /** Ensure that the domains in the parquet spec provider are loaded into
-     * the database. This mitigates the risk of certain footguns, such
-     * re-crawling before performing the 'Load' operation, which would
-     * otherwise result in the crawler not being able to find the domain
-     * in the database through the DbCrawlSpecProvider, and thus not
-     * being able to crawl it.
-     * */
-    public void ensureParquetDomainsLoaded(ParquetCrawlSpecProvider parquetCrawlSpecProvider) throws Exception {
-
-        // This is a bit of an unhealthy mix of concerns, but it's for the Greater Good (TM)
-
-        try (var conn = dataSource.getConnection();
-             var stmt = conn.prepareStatement("""
-                     INSERT IGNORE INTO EC_DOMAIN(DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY)
-                     VALUES (?, ?, ?)
-                     """))
-        {
-            parquetCrawlSpecProvider.stream().forEach(record -> {
-                try {
-                    var domain = new EdgeDomain(record.getDomain());
-                    stmt.setString(1, record.domain);
-                    stmt.setString(2, domain.topDomain);
-                    stmt.setInt(3, processConfiguration.node());
-                    stmt.addBatch();
-                } catch (Exception e) {
-                    throw new RuntimeException(e);
-                }
-            });
-
-            stmt.executeBatch();
-        }
-    }
-
-    @Override
-    public int totalCount() throws Exception {
-        if (domains == null) {
-            domains = loadData();
-        }
-        return domains.size();
-    }
-
-    @SneakyThrows
-    @Override
-    public Stream<CrawlSpecRecord> stream() {
-        if (domains == null) {
-            domains = loadData();
-        }
-
-        return domains.stream();
-    }
-
-
-}
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/spec/ParquetCrawlSpecProvider.java b/code/processes/crawling-process/java/nu/marginalia/crawl/spec/ParquetCrawlSpecProvider.java
deleted file mode 100644
index db272623..00000000
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/spec/ParquetCrawlSpecProvider.java
+++ /dev/null
@@ -1,37 +0,0 @@
-package nu.marginalia.crawl.spec;
-
-import lombok.SneakyThrows;
-import nu.marginalia.io.crawlspec.CrawlSpecRecordParquetFileReader;
-import nu.marginalia.model.crawlspec.CrawlSpecRecord;
-
-import java.io.IOException;
-import java.nio.file.Path;
-import java.util.List;
-import java.util.stream.Stream;
-
-public class ParquetCrawlSpecProvider implements CrawlSpecProvider {
-    private final List<Path> files;
-
-    public ParquetCrawlSpecProvider(List<Path> files) {
-        this.files = files;
-    }
-
-    @Override
-    public int totalCount() throws IOException {
-        int total = 0;
-        for (var specs : files) {
-            total += CrawlSpecRecordParquetFileReader.count(specs);
-        }
-        return total;
-    }
-
-    @Override
-    public Stream<CrawlSpecRecord> stream() {
-        return files.stream().flatMap(this::streamQuietly);
-    }
-
-    @SneakyThrows
-    private Stream<CrawlSpecRecord> streamQuietly(Path file) {
-        return CrawlSpecRecordParquetFileReader.stream(file);
-    }
-}
diff --git a/code/processes/crawling-process/model/java/nu/marginalia/crawlspec/CrawlSpecFileNames.java b/code/processes/crawling-process/model/java/nu/marginalia/crawlspec/CrawlSpecFileNames.java
deleted file mode 100644
index ec715b97..00000000
--- a/code/processes/crawling-process/model/java/nu/marginalia/crawlspec/CrawlSpecFileNames.java
+++ /dev/null
@@ -1,34 +0,0 @@
-package nu.marginalia.crawlspec;
-
-import nu.marginalia.storage.model.FileStorage;
-import nu.marginalia.storage.model.FileStorageType;
-
-import java.nio.file.Path;
-import java.util.ArrayList;
-import java.util.List;
-
-public class CrawlSpecFileNames {
-    public static Path resolve(Path base) {
-        return base.resolve("crawl-spec.parquet");
-    }
-
-    public static Path resolve(FileStorage storage) {
-        if (storage.type() != FileStorageType.CRAWL_SPEC)
-            throw new IllegalArgumentException("Provided file storage is of unexpected type " +
-                    storage.type() + ", expected CRAWL_SPEC");
-
-        return resolve(storage.asPath());
-    }
-
-    public static List<Path> resolve(List<FileStorage> storageList) {
-        List<Path> ret = new ArrayList<>();
-        for (var storage : storageList) {
-            if (storage.type() != FileStorageType.CRAWL_SPEC)
-                throw new IllegalArgumentException("Provided file storage is of unexpected type " +
-                        storage.type() + ", expected CRAWL_SPEC");
-            ret.add(resolve(storage));
-        }
-
-        return ret;
-    }
-}
diff --git a/code/processes/crawling-process/model/java/nu/marginalia/crawlspec/CrawlSpecGenerator.java b/code/processes/crawling-process/model/java/nu/marginalia/crawlspec/CrawlSpecGenerator.java
deleted file mode 100644
index abd831e5..00000000
--- a/code/processes/crawling-process/model/java/nu/marginalia/crawlspec/CrawlSpecGenerator.java
+++ /dev/null
@@ -1,71 +0,0 @@
-package nu.marginalia.crawlspec;
-
-import nu.marginalia.io.crawlspec.CrawlSpecRecordParquetFileWriter;
-import nu.marginalia.model.crawlspec.CrawlSpecRecord;
-
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.sql.SQLException;
-import java.util.List;
-
-public class CrawlSpecGenerator {
-
-    public static void generateCrawlSpec(Path output,
-                                         DomainSource domains,
-                                         KnownUrlsCountSource counts,
-                                         KnownUrlsListSource listSource)
-            throws IOException, SQLException
-    {
-        try (var writer = new CrawlSpecRecordParquetFileWriter(output)) {
-            for (String domain : domains.getDomainNames()) {
-
-                domain = domain.toLowerCase();
-
-                writer.write(CrawlSpecRecord
-                        .builder()
-                        .crawlDepth(counts.getKnownUrlCount(domain))
-                        .urls(listSource.getKnownUrls(domain))
-                        .domain(domain)
-                        .build());
-            }
-        }
-    }
-
-    public interface DomainSource {
-        List<String> getDomainNames() throws IOException, SQLException;
-
-        static DomainSource fromFile(Path file) {
-            return () -> {
-                var lines = Files.readAllLines(file);
-                lines.replaceAll(s ->
-                        s.split("#", 2)[0]
-                                .trim()
-                                .toLowerCase()
-                );
-                lines.removeIf(String::isBlank);
-                return lines;
-            };
-        }
-
-    }
-
-    public interface KnownUrlsCountSource {
-        int getKnownUrlCount(String domainName) throws SQLException;
-
-        static KnownUrlsCountSource fixed(int value) {
-            return domainName -> value;
-        }
-    }
-
-    public interface KnownUrlsListSource {
-        List<String> getKnownUrls(String domainName) throws SQLException;
-
-        static KnownUrlsListSource justIndex() {
-            return domainName -> List.of(
-                    "http://" + domainName + "/",
-                    "https://" + domainName + "/"
-            );
-        }
-    }
-}
diff --git a/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/CrawledDomainReader.java b/code/processes/crawling-process/model/java/nu/marginalia/io/CrawledDomainReader.java
similarity index 97%
rename from code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/CrawledDomainReader.java
rename to code/processes/crawling-process/model/java/nu/marginalia/io/CrawledDomainReader.java
index 7e359814..95e69925 100644
--- a/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/CrawledDomainReader.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/io/CrawledDomainReader.java
@@ -1,4 +1,4 @@
-package nu.marginalia.io.crawldata;
+package nu.marginalia.io;
 
 import nu.marginalia.io.crawldata.format.ParquetSerializableCrawlDataStream;
 import org.slf4j.Logger;
diff --git a/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/CrawlerOutputFile.java b/code/processes/crawling-process/model/java/nu/marginalia/io/CrawlerOutputFile.java
similarity index 98%
rename from code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/CrawlerOutputFile.java
rename to code/processes/crawling-process/model/java/nu/marginalia/io/CrawlerOutputFile.java
index 266a7f24..ab54173a 100644
--- a/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/CrawlerOutputFile.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/io/CrawlerOutputFile.java
@@ -1,4 +1,4 @@
-package nu.marginalia.io.crawldata;
+package nu.marginalia.io;
 
 import org.apache.logging.log4j.util.Strings;
 
diff --git a/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/SerializableCrawlDataStream.java b/code/processes/crawling-process/model/java/nu/marginalia/io/SerializableCrawlDataStream.java
similarity index 97%
rename from code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/SerializableCrawlDataStream.java
rename to code/processes/crawling-process/model/java/nu/marginalia/io/SerializableCrawlDataStream.java
index 1ade3836..1d7006d9 100644
--- a/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/SerializableCrawlDataStream.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/io/SerializableCrawlDataStream.java
@@ -1,4 +1,4 @@
-package nu.marginalia.io.crawldata;
+package nu.marginalia.io;
 
 import nu.marginalia.model.crawldata.SerializableCrawlData;
 import org.jetbrains.annotations.Nullable;
diff --git a/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/ParquetSerializableCrawlDataStream.java b/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/ParquetSerializableCrawlDataStream.java
index dba7a326..05f744fb 100644
--- a/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/ParquetSerializableCrawlDataStream.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/ParquetSerializableCrawlDataStream.java
@@ -4,7 +4,7 @@ import lombok.SneakyThrows;
 import nu.marginalia.contenttype.ContentType;
 import nu.marginalia.contenttype.DocumentBodyToString;
 import nu.marginalia.hash.MurmurHash3_128;
-import nu.marginalia.io.crawldata.SerializableCrawlDataStream;
+import nu.marginalia.io.SerializableCrawlDataStream;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawldata.*;
 import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecord;
diff --git a/code/processes/crawling-process/model/java/nu/marginalia/io/crawlspec/CrawlSpecRecordParquetFileReader.java b/code/processes/crawling-process/model/java/nu/marginalia/io/crawlspec/CrawlSpecRecordParquetFileReader.java
deleted file mode 100644
index 9317f55b..00000000
--- a/code/processes/crawling-process/model/java/nu/marginalia/io/crawlspec/CrawlSpecRecordParquetFileReader.java
+++ /dev/null
@@ -1,26 +0,0 @@
-package nu.marginalia.io.crawlspec;
-
-import blue.strategic.parquet.HydratorSupplier;
-import blue.strategic.parquet.ParquetReader;
-import nu.marginalia.model.crawlspec.CrawlSpecRecord;
-import org.jetbrains.annotations.NotNull;
-
-import java.io.IOException;
-import java.nio.file.Path;
-import java.util.stream.Stream;
-
-public class CrawlSpecRecordParquetFileReader {
-    @NotNull
-    public static Stream<CrawlSpecRecord> stream(Path path) throws IOException {
-        return ParquetReader.streamContent(path.toFile(),
-                HydratorSupplier.constantly(CrawlSpecRecord.newHydrator()));
-    }
-
-    public static int count(Path path) throws IOException {
-        try (var stream = stream(path)) {
-            // FIXME This can be done in a more performant way by using another hydrator that only reads a single field
-            return (int) stream.count();
-        }
-    }
-
-}
diff --git a/code/processes/crawling-process/model/java/nu/marginalia/io/crawlspec/CrawlSpecRecordParquetFileWriter.java b/code/processes/crawling-process/model/java/nu/marginalia/io/crawlspec/CrawlSpecRecordParquetFileWriter.java
deleted file mode 100644
index 9a67f225..00000000
--- a/code/processes/crawling-process/model/java/nu/marginalia/io/crawlspec/CrawlSpecRecordParquetFileWriter.java
+++ /dev/null
@@ -1,24 +0,0 @@
-package nu.marginalia.io.crawlspec;
-
-import blue.strategic.parquet.ParquetWriter;
-import nu.marginalia.model.crawlspec.CrawlSpecRecord;
-
-import
java.io.IOException; -import java.nio.file.Path; - -public class CrawlSpecRecordParquetFileWriter implements AutoCloseable { - private final ParquetWriter writer; - - public CrawlSpecRecordParquetFileWriter(Path file) throws IOException { - writer = ParquetWriter.writeFile(CrawlSpecRecord.schema, - file.toFile(), CrawlSpecRecord.newDehydrator()); - } - - public void write(CrawlSpecRecord domainData) throws IOException { - writer.write(domainData); - } - - public void close() throws IOException { - writer.close(); - } -} diff --git a/code/processes/crawling-process/model/java/nu/marginalia/model/crawlspec/CrawlSpecRecord.java b/code/processes/crawling-process/model/java/nu/marginalia/model/crawlspec/CrawlSpecRecord.java deleted file mode 100644 index 02b881cb..00000000 --- a/code/processes/crawling-process/model/java/nu/marginalia/model/crawlspec/CrawlSpecRecord.java +++ /dev/null @@ -1,90 +0,0 @@ -package nu.marginalia.model.crawlspec; - -import blue.strategic.parquet.Dehydrator; -import blue.strategic.parquet.Hydrator; -import blue.strategic.parquet.ValueWriter; -import lombok.*; -import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.Types; -import org.jetbrains.annotations.NotNull; -import org.jetbrains.annotations.Nullable; - -import java.util.ArrayList; -import java.util.List; - -import static org.apache.parquet.schema.LogicalTypeAnnotation.stringType; -import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.*; - -@Getter -@Setter -@NoArgsConstructor -@AllArgsConstructor -@EqualsAndHashCode -@Builder -@ToString -public class CrawlSpecRecord { - @NotNull - public String domain; - - - /** Limit for how many documents will be crawled */ - public int crawlDepth; - - /** List of known URLs */ - @Nullable - public List urls; - - public static Hydrator newHydrator() { - return new CrawlSpecRecordHydrator(); - } - - public static Dehydrator newDehydrator() { - return CrawlSpecRecord::dehydrate; - } - - public static MessageType schema = new MessageType( - CrawlSpecRecord.class.getSimpleName(), - Types.required(BINARY).as(stringType()).named("domain"), - Types.required(INT32).named("crawlDepth"), - Types.repeated(BINARY).as(stringType()).named("urls") - ); - - public void dehydrate(ValueWriter valueWriter) { - valueWriter.write("domain", domain); - valueWriter.write("crawlDepth", crawlDepth); - valueWriter.writeList("urls", urls); - } - - public CrawlSpecRecord add(String heading, Object value) { - switch (heading) { - case "domain" -> domain = (String) value; - case "crawlDepth" -> crawlDepth = (Integer) value; - case "urls" -> { - if (urls == null) - urls = new ArrayList<>(); - urls.add((String) value); - } - } - - return this; - } -} - -class CrawlSpecRecordHydrator implements Hydrator { - - @Override - public CrawlSpecRecord start() { - return new CrawlSpecRecord(); - } - - @Override - public CrawlSpecRecord add(CrawlSpecRecord target, String heading, Object value) { - return target.add(heading, value); - } - - @Override - public CrawlSpecRecord finish(CrawlSpecRecord target) { - return target; - } - -} diff --git a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java index ffed5cd8..9b6312e6 100644 --- a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java +++ b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java @@ 
-9,13 +9,13 @@ import nu.marginalia.crawl.fetcher.SitemapRetriever; import nu.marginalia.crawl.fetcher.warc.WarcRecorder; import nu.marginalia.crawl.retreival.CrawlerRetreiver; import nu.marginalia.crawl.retreival.DomainProber; +import nu.marginalia.crawl.spec.CrawlSpecProvider; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.body.HttpFetchResult; import nu.marginalia.model.crawldata.CrawledDocument; import nu.marginalia.model.crawldata.CrawlerDocumentStatus; import nu.marginalia.model.crawldata.SerializableCrawlData; -import nu.marginalia.model.crawlspec.CrawlSpecRecord; import nu.marginalia.test.CommonTestData; import okhttp3.Headers; import org.junit.jupiter.api.AfterEach; @@ -68,7 +68,7 @@ public class CrawlerMockFetcherTest { } - void crawl(CrawlSpecRecord spec) throws IOException { + void crawl(CrawlSpecProvider.CrawlSpecRecord spec) throws IOException { try (var recorder = new WarcRecorder()) { new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), spec, recorder) .crawlDomain(); @@ -83,7 +83,7 @@ public class CrawlerMockFetcherTest { registerUrlClasspathData(new EdgeUrl("https://startrek.website/c/startrek"), "mock-crawl-data/lemmy/c_startrek.html"); registerUrlClasspathData(new EdgeUrl("https://startrek.website/post/108995"), "mock-crawl-data/lemmy/108995.html"); - crawl(new CrawlSpecRecord("startrek.website", 10, new ArrayList<>())); + crawl(new CrawlSpecProvider.CrawlSpecRecord("startrek.website", 10, new ArrayList<>())); } @Test @@ -92,7 +92,7 @@ public class CrawlerMockFetcherTest { registerUrlClasspathData(new EdgeUrl("https://en.wikipedia.org/"), "mock-crawl-data/mediawiki/index.html"); - crawl(new CrawlSpecRecord("en.wikipedia.org", 10, new ArrayList<>())); + crawl(new CrawlSpecProvider.CrawlSpecRecord("en.wikipedia.org", 10, new ArrayList<>())); } @Test @@ -103,7 +103,7 @@ public class CrawlerMockFetcherTest { registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/telegram-channel-to-idle-on/3501"), "mock-crawl-data/discourse/telegram.html"); registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/combined-mode-but-grid/4489"), "mock-crawl-data/discourse/grid.html"); - crawl(new CrawlSpecRecord("community.tt-rss.org", 10, new ArrayList<>())); + crawl(new CrawlSpecProvider.CrawlSpecRecord("community.tt-rss.org", 10, new ArrayList<>())); } class MockFetcher implements HttpFetcher { diff --git a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java index df0df41a..88604f57 100644 --- a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java +++ b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java @@ -8,14 +8,14 @@ import nu.marginalia.crawl.fetcher.HttpFetcher; import nu.marginalia.crawl.fetcher.HttpFetcherImpl; import nu.marginalia.crawl.fetcher.warc.WarcRecorder; import nu.marginalia.crawl.retreival.*; -import nu.marginalia.io.crawldata.CrawledDomainReader; -import nu.marginalia.io.crawldata.SerializableCrawlDataStream; +import nu.marginalia.crawl.spec.CrawlSpecProvider; +import nu.marginalia.io.CrawledDomainReader; +import nu.marginalia.io.SerializableCrawlDataStream; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.crawldata.CrawledDocument; import nu.marginalia.model.crawldata.CrawledDomain; import 
nu.marginalia.model.crawldata.SerializableCrawlData; -import nu.marginalia.model.crawlspec.CrawlSpecRecord; import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter; import org.jetbrains.annotations.NotNull; import org.junit.jupiter.api.*; @@ -76,7 +76,7 @@ class CrawlerRetreiverTest { @Test public void testWarcOutput() throws IOException { - var specs = CrawlSpecRecord + var specs = CrawlSpecProvider.CrawlSpecRecord .builder() .crawlDepth(5) .domain("www.marginalia.nu") @@ -119,7 +119,7 @@ class CrawlerRetreiverTest { @SneakyThrows @Test public void testResync() throws IOException { - var specs = CrawlSpecRecord + var specs = CrawlSpecProvider.CrawlSpecRecord .builder() .crawlDepth(5) .domain("www.marginalia.nu") @@ -168,7 +168,7 @@ class CrawlerRetreiverTest { @Test public void testWithKnownDomains() throws IOException { - var specs = CrawlSpecRecord + var specs = CrawlSpecProvider.CrawlSpecRecord .builder() .crawlDepth(5) .domain("www.marginalia.nu") @@ -212,7 +212,7 @@ class CrawlerRetreiverTest { @Test public void testRedirect() throws IOException, URISyntaxException { - var specs = CrawlSpecRecord + var specs = CrawlSpecProvider.CrawlSpecRecord .builder() .crawlDepth(3) .domain("www.marginalia.nu") @@ -270,7 +270,7 @@ class CrawlerRetreiverTest { @Test public void testEmptySet() throws IOException { - var specs = CrawlSpecRecord + var specs = CrawlSpecProvider.CrawlSpecRecord .builder() .crawlDepth(5) .domain("www.marginalia.nu") @@ -318,7 +318,7 @@ class CrawlerRetreiverTest { @Test public void testRecrawl() throws IOException { - var specs = CrawlSpecRecord + var specs = CrawlSpecProvider.CrawlSpecRecord .builder() .crawlDepth(12) .domain("www.marginalia.nu") @@ -378,7 +378,7 @@ class CrawlerRetreiverTest { @Test public void testRecrawlWithResync() throws IOException { - var specs = CrawlSpecRecord + var specs = CrawlSpecProvider.CrawlSpecRecord .builder() .crawlDepth(12) .domain("www.marginalia.nu") @@ -466,7 +466,7 @@ class CrawlerRetreiverTest { } } - private void doCrawlWithReferenceStream(CrawlSpecRecord specs, SerializableCrawlDataStream stream) { + private void doCrawlWithReferenceStream(CrawlSpecProvider.CrawlSpecRecord specs, SerializableCrawlDataStream stream) { try (var recorder = new WarcRecorder(tempFileWarc2)) { new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).crawlDomain(new DomainLinks(), new CrawlDataReference(stream)); @@ -477,7 +477,7 @@ class CrawlerRetreiverTest { } @NotNull - private DomainCrawlFrontier doCrawl(Path tempFileWarc1, CrawlSpecRecord specs) { + private DomainCrawlFrontier doCrawl(Path tempFileWarc1, CrawlSpecProvider.CrawlSpecRecord specs) { try (var recorder = new WarcRecorder(tempFileWarc1)) { var crawler = new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder); crawler.crawlDomain(); diff --git a/code/processes/process-mq-api/java/nu/marginalia/mqapi/crawling/CrawlRequest.java b/code/processes/process-mq-api/java/nu/marginalia/mqapi/crawling/CrawlRequest.java index ff090140..2ea2feb3 100644 --- a/code/processes/process-mq-api/java/nu/marginalia/mqapi/crawling/CrawlRequest.java +++ b/code/processes/process-mq-api/java/nu/marginalia/mqapi/crawling/CrawlRequest.java @@ -3,16 +3,9 @@ package nu.marginalia.mqapi.crawling; import lombok.AllArgsConstructor; import nu.marginalia.storage.model.FileStorageId; -import java.util.List; - /** A request to start a crawl */ @AllArgsConstructor public class CrawlRequest { - /** (optional) Crawl spec(s) for sourcing domains to crawl. 
If not set, - * the EC_DOMAIN table will be consulted and domains with the corresponding - * node affinity will be used. - */ - public List specStorage; /** (optional) Name of a single domain to be re-crawled */ public String targetDomainName; @@ -22,16 +15,12 @@ public class CrawlRequest { */ public FileStorageId crawlStorage; - public static CrawlRequest forSpec(FileStorageId specStorage, FileStorageId crawlStorage) { - return new CrawlRequest(List.of(specStorage), null, crawlStorage); - } - public static CrawlRequest forSingleDomain(String targetDomainName, FileStorageId crawlStorage) { - return new CrawlRequest(null, targetDomainName, crawlStorage); + return new CrawlRequest(targetDomainName, crawlStorage); } - public static CrawlRequest forRecrawl(FileStorageId crawlStorage) { - return new CrawlRequest(null, null, crawlStorage); + public static CrawlRequest forFullCrawl(FileStorageId crawlStorage) { + return new CrawlRequest(null, crawlStorage); } } diff --git a/code/services-core/control-service/java/nu/marginalia/control/node/model/FileStorageWithActions.java b/code/services-core/control-service/java/nu/marginalia/control/node/model/FileStorageWithActions.java index 534fe1f0..35970baf 100644 --- a/code/services-core/control-service/java/nu/marginalia/control/node/model/FileStorageWithActions.java +++ b/code/services-core/control-service/java/nu/marginalia/control/node/model/FileStorageWithActions.java @@ -10,9 +10,6 @@ import java.time.LocalDate; import java.time.format.DateTimeFormatter; public record FileStorageWithActions(FileStorage storage) { - public boolean isCrawlable() { - return storage.type() == FileStorageType.CRAWL_SPEC; - } public boolean isStatusNew() { return storage.state() == FileStorageState.NEW; } diff --git a/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlNodeActionsService.java b/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlNodeActionsService.java index c385e52e..cd0b3f2e 100644 --- a/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlNodeActionsService.java +++ b/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlNodeActionsService.java @@ -83,12 +83,9 @@ public class ControlNodeActionsService { Spark.post("/nodes/:node/actions/download-sample-data", this::downloadSampleData, redirectControl.renderRedirectAcknowledgement("Downloading", "..") ); - Spark.post("/nodes/:id/actions/new-crawl", this::triggerNewCrawl, + Spark.post("/nodes/:id/actions/new-crawl", this::triggerCrawl, redirectControl.renderRedirectAcknowledgement("Crawling", "..") ); - Spark.post("/nodes/:id/actions/recrawl", this::triggerAutoRecrawl, - redirectControl.renderRedirectAcknowledgement("Recrawling", "..") - ); Spark.post("/nodes/:id/actions/recrawl-single-domain", this::triggerSingleDomainRecrawl, redirectControl.renderRedirectAcknowledgement("Recrawling", "..") ); @@ -101,9 +98,6 @@ public class ControlNodeActionsService { Spark.post("/nodes/:id/actions/restore-backup", this::triggerRestoreBackup, redirectControl.renderRedirectAcknowledgement("Restoring", "..") ); - Spark.post("/nodes/:id/actions/new-crawl-specs", this::createNewSpecsAction, - redirectControl.renderRedirectAcknowledgement("Creating", "../actions?view=new-crawl") - ); Spark.post("/nodes/:id/actions/export-db-data", this::exportDbData, redirectControl.renderRedirectAcknowledgement("Exporting", "..") ); @@ -205,14 +199,14 @@ public class ControlNodeActionsService { return ""; } - private Object 
triggerAutoRecrawl(Request request, Response response) throws SQLException { + private Object triggerCrawl(Request request, Response response) throws SQLException { int nodeId = Integer.parseInt(request.params("id")); var toCrawl = parseSourceFileStorageId(request.queryParams("source")); changeActiveStorage(nodeId, FileStorageType.CRAWL_DATA, toCrawl); - crawlClient.triggerRecrawl( + crawlClient.triggerCrawl( nodeId, toCrawl ); @@ -235,18 +229,6 @@ public class ControlNodeActionsService { return ""; } - private Object triggerNewCrawl(Request request, Response response) throws SQLException { - int nodeId = Integer.parseInt(request.params("id")); - - var toCrawl = parseSourceFileStorageId(request.queryParams("source")); - - changeActiveStorage(nodeId, FileStorageType.CRAWL_SPEC, toCrawl); - - crawlClient.triggerCrawl(nodeId, toCrawl); - - return ""; - } - private Object triggerProcess(Request request, Response response) throws SQLException { int nodeId = Integer.parseInt(request.params("id")); boolean isAutoload = "on".equalsIgnoreCase(request.queryParams("autoload")); @@ -308,21 +290,6 @@ public class ControlNodeActionsService { } } - - private Object createNewSpecsAction(Request request, Response response) { - final String description = request.queryParams("description"); - final String url = request.queryParams("url"); - int nodeId = Integer.parseInt(request.params("id")); - - if (url == null || url.isBlank()) { - throw new ControlValidationError("No url specified", "A url must be specified", ".."); - } - - crawlClient.createCrawlSpecFromDownload(nodeId, description, url); - - return ""; - } - private Object exportDbData(Request req, Response rsp) { exportClient.exportData(Integer.parseInt(req.params("id"))); diff --git a/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlNodeService.java b/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlNodeService.java index 32267155..a0049fee 100644 --- a/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlNodeService.java +++ b/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlNodeService.java @@ -175,9 +175,7 @@ public class ControlNodeService { "allCrawlData", fileStorageService.getEachFileStorage(nodeId, FileStorageType.CRAWL_DATA), "allProcessedData", - fileStorageService.getEachFileStorage(nodeId, FileStorageType.PROCESSED_DATA), - "allCrawlSpecs", - fileStorageService.getEachFileStorage(nodeId, FileStorageType.CRAWL_SPEC) + fileStorageService.getEachFileStorage(nodeId, FileStorageType.PROCESSED_DATA) ); } @@ -201,7 +199,6 @@ public class ControlNodeService { case "backup" -> FileStorageType.BACKUP; case "crawl" -> FileStorageType.CRAWL_DATA; case "processed" -> FileStorageType.PROCESSED_DATA; - case "specs" -> FileStorageType.CRAWL_SPEC; case "exports" -> FileStorageType.EXPORT; default -> throw new IllegalArgumentException(view); }; diff --git a/code/services-core/control-service/resources/templates/control/app/domains.hdb b/code/services-core/control-service/resources/templates/control/app/domains.hdb index 339b9664..b1a80c1d 100644 --- a/code/services-core/control-service/resources/templates/control/app/domains.hdb +++ b/code/services-core/control-service/resources/templates/control/app/domains.hdb @@ -8,7 +8,10 @@ {{> control/partials/nav}}

Domains

+ This is a listing of all domains known to the search engine. Additional domains can be added via the + Add Domains utility.
diff --git a/code/services-core/control-service/resources/templates/control/node/actions/partial-new-crawl-specs.hdb b/code/services-core/control-service/resources/templates/control/node/actions/partial-new-crawl-specs.hdb deleted file mode 100644 index b3cfa762..00000000 --- a/code/services-core/control-service/resources/templates/control/node/actions/partial-new-crawl-specs.hdb +++ /dev/null @@ -1,28 +0,0 @@ -

Create New Specification


A crawling specification is a list of domain names to crawl.


- If you are just looking to test the software, feel free to use this - short list of marginalia-related websites, that are safe to crawl repeatedly without causing any problems. -


Warning Ensure downloaded sample data has not been loaded onto this instance - before performing this action, otherwise those domains will also be crawled while re-crawling in the future!

This is how you'll be able to find the - specification later so give it a good and descriptive name
- \ No newline at end of file diff --git a/code/services-core/control-service/resources/templates/control/node/actions/partial-new-crawl.hdb b/code/services-core/control-service/resources/templates/control/node/actions/partial-new-crawl.hdb index 76268e4e..f01f0116 100644 --- a/code/services-core/control-service/resources/templates/control/node/actions/partial-new-crawl.hdb +++ b/code/services-core/control-service/resources/templates/control/node/actions/partial-new-crawl.hdb @@ -1,56 +1,52 @@

New Crawl

-{{#unless allCrawlSpecs}} - No crawling specifications have been created. You must create a crawl spec before you can crawl. - Create a crawl spec. -{{/unless}} - -{{#if allCrawlSpecs}} -
- This will perform a new crawl on node {{node.id}} based on the crawl spec you select below. - Additional specifications can be created with this form. -

IMPORTANT! Be sure you've read and understood the - crawling documentation - before you begin a crawl. You will be accessing real servers from your connection, and you may end up on IP greylists - that temporarily block your access to those servers for up to a few weeks; on rare occasions permanently. The crawler - is generally polite and well-behaved, but it is still a crawler, and it will be accessing a lot of pages. -


- You run the risk of getting in trouble with your ISP, the law, your partner, your landlord, and/or your parents; - depending on terms of service, jurisdiction, marital status, living conditions and age. +

+ This will start a new crawl on node {{node.id}}. +

+ If existing crawl data is provided, the crawler may perform a smart crawl that uses the existing data + as a starting point, issuing conditional GETs for the documents it has already fetched. If a document has changed, + it will be re-crawled. If it has not changed, it will be skipped and the previous data will be retained. This is + both faster and easier on the target server.
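For readers unfamiliar with the mechanism described above: a conditional GET replays validators (ETag / Last-Modified) from the previous fetch so the server can answer 304 Not Modified instead of resending the body. A minimal, hypothetical sketch using java.net.http — not the project's actual fetcher classes, whose validator handling may differ — could look like this:

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

class ConditionalGetSketch {
    // etag and lastModified are assumed to come from the previously stored crawl data.
    static boolean isUnchanged(HttpClient client, String url, String etag, String lastModified) throws Exception {
        HttpRequest.Builder req = HttpRequest.newBuilder(URI.create(url));
        if (etag != null) req.header("If-None-Match", etag);                 // validator from the last crawl
        if (lastModified != null) req.header("If-Modified-Since", lastModified);

        HttpResponse<byte[]> rsp = client.send(req.GET().build(), HttpResponse.BodyHandlers.ofByteArray());

        // 304 Not Modified: keep the previously stored document; any other status means storing the new body.
        return rsp.statusCode() == 304;
    }
}

In practice a crawler also needs a fallback for servers that ignore these headers, for example comparing the fetched body against the stored copy before deciding whether to replace it.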

- This action is performed at your own risk. -


Warning: Ensure downloaded sample data + has not been loaded onto this instance before performing this action, otherwise those domains will also be crawled!


Use the Domains view to inspect and manage known domains.


Select the crawl data to use:

+ Use | Path | Description | Details
+ {{#each allCrawlData}}
+ {{description}} {{#if new}}[CREATING]{{/if}} {{#if delete}}[DELETING]{{/if}} [Details]
+ {{/each}}
- {{#each allCrawlSpecs}} - - - - {{description}} - {{#if new}}[CREATING]{{/if}} - {{#if delete}}[DELETING]{{/if}} - - [Details] - - {{/each}} - - - -{{/if}} \ No newline at end of file + + \ No newline at end of file diff --git a/code/services-core/control-service/resources/templates/control/node/actions/partial-recrawl.hdb b/code/services-core/control-service/resources/templates/control/node/actions/partial-recrawl.hdb deleted file mode 100644 index 1adef14b..00000000 --- a/code/services-core/control-service/resources/templates/control/node/actions/partial-recrawl.hdb +++ /dev/null @@ -1,53 +0,0 @@ -

Recrawl

- -{{#unless allCrawlData}} -

- No crawl data currently exists on this node. Recrawling is not possible. - First, create a crawl spec and do an initial crawl. Then, you can recrawl. -

-{{/unless}} - -{{#if allCrawlData}} - -

- This will perform a re-crawl on node {{node.id}}. -

- A re-crawl is a smart crawl that uses the existing crawl data - as a starting point. It will use the existing crawl data to perform conditional GETs on the domains in the - crawl spec. If the document has changed, it will be re-crawled. If it has not changed, it will be skipped, - and the previous data will be retained. This is both faster and easier on the target server. -


Warning Ensure downloaded sample data - has not been loaded onto this instance before performing this action, otherwise those domains will also be crawled!


Select the crawl data to use:

- Use | Path | Description | Details
- {{#each allCrawlData}}
- {{description}} {{#if new}}[CREATING]{{/if}} {{#if delete}}[DELETING]{{/if}} [Details]
- {{/each}}
-{{/if}} \ No newline at end of file diff --git a/code/services-core/control-service/resources/templates/control/node/node-actions.hdb b/code/services-core/control-service/resources/templates/control/node/node-actions.hdb index 1399f1f6..1dc7a80b 100644 --- a/code/services-core/control-service/resources/templates/control/node/node-actions.hdb +++ b/code/services-core/control-service/resources/templates/control/node/node-actions.hdb @@ -10,8 +10,6 @@ {{> control/node/partial-node-nav }} {{#if view.new-crawl}} {{> control/node/actions/partial-new-crawl }} {{/if}} - {{#if view.new-crawl-specs}} {{> control/node/actions/partial-new-crawl-specs }} {{/if}} - {{#if view.re-crawl}} {{> control/node/actions/partial-recrawl }} {{/if}} {{#if view.process}} {{> control/node/actions/partial-process }} {{/if}} {{#if view.load}} {{> control/node/actions/partial-load }} {{/if}} {{#if view.repartition}} {{> control/node/actions/partial-repartition }} {{/if}} diff --git a/code/services-core/control-service/resources/templates/control/node/partial-node-nav.hdb b/code/services-core/control-service/resources/templates/control/node/partial-node-nav.hdb index ff16507d..07c7ced3 100644 --- a/code/services-core/control-service/resources/templates/control/node/partial-node-nav.hdb +++ b/code/services-core/control-service/resources/templates/control/node/partial-node-nav.hdb @@ -14,7 +14,6 @@