From cae1bad274140411be837e64105dea69e8fd8f5e Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 25 Jan 2024 13:36:30 +0100 Subject: [PATCH] (*) Add download-sample action, refactor file storage This changeset adds an action for downloading a set of sample data from downloads.marginalia.nu. It also refactors some leaky abstractions out of FileStorageService. allocateTemporaryStorage has been renamed allocateStorage. The storage was never temporary in any scenario... It also doesn't take a storage base, as there was always only one valid option for this input. The allocateStorage method finds the appropriate base itself. --- .../executor/client/ExecutorClient.java | 6 +- .../storage/FileStorageService.java | 12 +- .../storage/model/FileStorageBaseType.java | 13 +- .../storage/FileStorageServiceTest.java | 7 +- .../node/svc/ControlNodeActionsService.java | 22 +++ .../actions/partial-download-sample-data.hdb | 47 +++++++ .../templates/control/node/node-actions.hdb | 1 + .../control/node/partial-node-nav.hdb | 1 + .../executor-service/build.gradle | 1 + .../nu/marginalia/actor/ExecutorActor.java | 4 +- .../actor/ExecutorActorControlService.java | 3 + .../marginalia/actor/task/ConvertActor.java | 16 +-- .../actor/task/ConvertAndLoadActor.java | 5 +- .../nu/marginalia/actor/task/CrawlActor.java | 5 +- .../actor/task/CrawlJobExtractorActor.java | 4 +- .../actor/task/DownloadSampleActor.java | 133 ++++++++++++++++++ .../actor/task/ExportAtagsActor.java | 3 +- .../actor/task/ExportDataActor.java | 4 +- .../actor/task/ExportFeedsActor.java | 4 +- .../actor/task/ExportSampleDataActor.java | 6 +- .../actor/task/ExportTermFreqActor.java | 4 +- .../nu/marginalia/executor/ExecutorSvc.java | 2 + .../executor/svc/SideloadService.java | 11 ++ .../executor/svc/TransferService.java | 4 +- .../java/nu/marginalia/svc/BackupService.java | 5 +- 25 files changed, 256 insertions(+), 67 deletions(-) create mode 100644 
code/services-core/control-service/src/main/resources/templates/control/node/actions/partial-download-sample-data.hdb create mode 100644 code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/DownloadSampleActor.java diff --git a/code/api/executor-api/src/main/java/nu/marginalia/executor/client/ExecutorClient.java b/code/api/executor-api/src/main/java/nu/marginalia/executor/client/ExecutorClient.java index b5554edd..df9c4072 100644 --- a/code/api/executor-api/src/main/java/nu/marginalia/executor/client/ExecutorClient.java +++ b/code/api/executor-api/src/main/java/nu/marginalia/executor/client/ExecutorClient.java @@ -71,7 +71,6 @@ public class ExecutorClient extends AbstractDynamicClient { post(ctx, node, "/sideload/encyclopedia?path="+ URLEncoder.encode(sourcePath.toString(), StandardCharsets.UTF_8) + "&baseUrl=" + URLEncoder.encode(baseUrl, StandardCharsets.UTF_8), "").blockingSubscribe(); - } public void sideloadDirtree(Context ctx, int node, Path sourcePath) { @@ -111,6 +110,10 @@ public class ExecutorClient extends AbstractDynamicClient { post(ctx, node, "/export/termfreq?fid="+fid, "").blockingSubscribe(); } + public void downloadSampleData(Context ctx, int node, String sampleSet) { + post(ctx, node, "/action/download-sample-data?set="+URLEncoder.encode(sampleSet, StandardCharsets.UTF_8), "").blockingSubscribe(); + } + public void exportData(Context ctx, int node) { post(ctx, node, "/export/data", "").blockingSubscribe(); } @@ -166,4 +169,5 @@ public class ExecutorClient extends AbstractDynamicClient { public void yieldDomain(Context context, int node, TransferItem item) { post(context, node, "/transfer/yield", item).blockingSubscribe(); } + } diff --git a/code/common/config/src/main/java/nu/marginalia/storage/FileStorageService.java b/code/common/config/src/main/java/nu/marginalia/storage/FileStorageService.java index 999a51a5..b2162664 100644 --- a/code/common/config/src/main/java/nu/marginalia/storage/FileStorageService.java +++ 
b/code/common/config/src/main/java/nu/marginalia/storage/FileStorageService.java @@ -223,14 +223,12 @@ public class FileStorageService { return maybePath; } - /** Allocate a temporary storage of the given type */ - public FileStorage allocateTemporaryStorage(FileStorageBase base, - FileStorageType type, - String prefix, - String description) throws IOException, SQLException + /** Allocate a storage area of the given type */ + public FileStorage allocateStorage(FileStorageType type, + String prefix, + String description) throws IOException, SQLException { - if (!base.type().permitsStorageType(type)) - throw new RuntimeException("Attempting to allocate storage of type " + type + " in base of type " + base.type()); + var base = getStorageBase(FileStorageBaseType.forFileStorageType(type)); Path newDir = allocateDirectory(base.asPath(), prefix); diff --git a/code/common/config/src/main/java/nu/marginalia/storage/model/FileStorageBaseType.java b/code/common/config/src/main/java/nu/marginalia/storage/model/FileStorageBaseType.java index 7bd1eb0d..d319786f 100644 --- a/code/common/config/src/main/java/nu/marginalia/storage/model/FileStorageBaseType.java +++ b/code/common/config/src/main/java/nu/marginalia/storage/model/FileStorageBaseType.java @@ -1,18 +1,17 @@ package nu.marginalia.storage.model; -import java.util.EnumSet; - public enum FileStorageBaseType { CURRENT, WORK, STORAGE, BACKUP; - public boolean permitsStorageType(FileStorageType type) { - return switch (this) { - case BACKUP -> FileStorageType.BACKUP.equals(type); - case STORAGE -> EnumSet.of(FileStorageType.EXPORT, FileStorageType.CRAWL_DATA, FileStorageType.PROCESSED_DATA, FileStorageType.CRAWL_SPEC).contains(type); - default -> false; + + public static FileStorageBaseType forFileStorageType(FileStorageType type) { + return switch (type) { + case EXPORT, CRAWL_DATA, PROCESSED_DATA, CRAWL_SPEC -> STORAGE; + case BACKUP -> BACKUP; }; } + } diff --git 
a/code/common/config/src/test/java/nu/marginalia/storage/FileStorageServiceTest.java b/code/common/config/src/test/java/nu/marginalia/storage/FileStorageServiceTest.java index 69fa9b1b..c9ffa309 100644 --- a/code/common/config/src/test/java/nu/marginalia/storage/FileStorageServiceTest.java +++ b/code/common/config/src/test/java/nu/marginalia/storage/FileStorageServiceTest.java @@ -13,18 +13,14 @@ import org.testcontainers.containers.MariaDBContainer; import org.testcontainers.junit.jupiter.Container; import org.testcontainers.junit.jupiter.Testcontainers; -import java.io.FileNotFoundException; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.sql.SQLException; import java.util.ArrayList; import java.util.List; -import java.util.Objects; import java.util.UUID; -import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD; - @Testcontainers @Execution(ExecutionMode.SAME_THREAD) @Tag("slow") @@ -124,8 +120,7 @@ public class FileStorageServiceTest { var storage = new FileStorageService(dataSource, 0); - var base = storage.createStorageBase(name, createTempDir(), FileStorageBaseType.STORAGE); - var fileStorage = storage.allocateTemporaryStorage(base, FileStorageType.CRAWL_DATA, "xyz", "thisShouldSucceed"); + var fileStorage = storage.allocateStorage(FileStorageType.CRAWL_DATA, "xyz", "thisShouldSucceed"); System.out.println("Allocated " + fileStorage.asPath()); Assertions.assertTrue(Files.exists(fileStorage.asPath())); tempDirs.add(fileStorage.asPath()); diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/node/svc/ControlNodeActionsService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/node/svc/ControlNodeActionsService.java index 16b06998..a707aefe 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/node/svc/ControlNodeActionsService.java +++ 
b/code/services-core/control-service/src/main/java/nu/marginalia/control/node/svc/ControlNodeActionsService.java @@ -13,6 +13,8 @@ import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.model.FileStorageId; import nu.marginalia.storage.model.FileStorageState; import nu.marginalia.storage.model.FileStorageType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import spark.Request; import spark.Response; import spark.Spark; @@ -21,9 +23,11 @@ import java.nio.file.Path; import java.sql.SQLException; import java.util.Arrays; import java.util.List; +import java.util.Set; @Singleton public class ControlNodeActionsService { + private static final Logger logger = LoggerFactory.getLogger(ControlNodeActionsService.class); private final IndexClient indexClient; private final RedirectControl redirectControl; private final FileStorageService fileStorageService; @@ -62,6 +66,9 @@ public class ControlNodeActionsService { Spark.post("/public/nodes/:node/actions/sideload-stackexchange", this::sideloadStackexchange, redirectControl.renderRedirectAcknowledgement("Sideloading", "..") ); + Spark.post("/public/nodes/:node/actions/download-sample-data", this::downloadSampleData, + redirectControl.renderRedirectAcknowledgement("Downloading", "..") + ); Spark.post("/public/nodes/:id/actions/new-crawl", this::triggerNewCrawl, redirectControl.renderRedirectAcknowledgement("Crawling", "..") ); @@ -91,6 +98,21 @@ public class ControlNodeActionsService { ); } + private Object downloadSampleData(Request request, Response response) { + String set = request.queryParams("sample"); + + if (set == null) + throw new ControlValidationError("No sample specified", "A sample data set must be specified", ".."); + if (!Set.of("sample-s", "sample-m", "sample-l", "sample-xl").contains(set)) + throw new ControlValidationError("Invalid sample specified", "A valid sample data set must be specified", ".."); + + executorClient.downloadSampleData(Context.fromRequest(request), 
Integer.parseInt(request.params("node")), set); + + logger.info("Downloading sample data set {}", set); + + return ""; + } + public Object sideloadEncyclopedia(Request request, Response response) { String source = request.queryParams("source"); diff --git a/code/services-core/control-service/src/main/resources/templates/control/node/actions/partial-download-sample-data.hdb b/code/services-core/control-service/src/main/resources/templates/control/node/actions/partial-download-sample-data.hdb new file mode 100644 index 00000000..3c432e65 --- /dev/null +++ b/code/services-core/control-service/src/main/resources/templates/control/node/actions/partial-download-sample-data.hdb @@ -0,0 +1,47 @@ +

Download Sample Data

+ +
+This will download sample crawl data from downloads.marginalia.nu onto Node {{node.id}}. +This is a sample of real crawl data. It is intended for demo, testing and development purposes. Several sets are available. +
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
UseSetDescription
1000 Domains. About 2 GB.
2000 Domains. About 6 GB. Recommended.
5000 Domains. About 20 GB.
50,000 Domains. Around 180 GB. Primarily intended for pre-production-like testing environments. + Expect hours of processing time.
+ +
\ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/node/node-actions.hdb b/code/services-core/control-service/src/main/resources/templates/control/node/node-actions.hdb index 736c7961..ea4502fa 100644 --- a/code/services-core/control-service/src/main/resources/templates/control/node/node-actions.hdb +++ b/code/services-core/control-service/src/main/resources/templates/control/node/node-actions.hdb @@ -23,6 +23,7 @@ {{#if view.export-from-crawl-data}} {{> control/node/actions/partial-export-from-crawl-data }} {{/if}} {{#if view.export-sample-data}} {{> control/node/actions/partial-export-sample-data }} {{/if}} {{#if view.restore-backup}} {{> control/node/actions/partial-restore-backup }} {{/if}} + {{#if view.download-sample-data}} {{> control/node/actions/partial-download-sample-data }} {{/if}}
 
diff --git a/code/services-core/control-service/src/main/resources/templates/control/node/partial-node-nav.hdb b/code/services-core/control-service/src/main/resources/templates/control/node/partial-node-nav.hdb index ddb5ff4e..de963ea3 100644 --- a/code/services-core/control-service/src/main/resources/templates/control/node/partial-node-nav.hdb +++ b/code/services-core/control-service/src/main/resources/templates/control/node/partial-node-nav.hdb @@ -24,6 +24,7 @@
  • Sideload Stackexchange
  • Sideload WARC Files
  • Sideload Dirtree
  • +
  • Download Sample Crawl Data
  • Export Database Data
  • Export Sample Crawl Data
  • diff --git a/code/services-core/executor-service/build.gradle b/code/services-core/executor-service/build.gradle index d26eadce..26f97808 100644 --- a/code/services-core/executor-service/build.gradle +++ b/code/services-core/executor-service/build.gradle @@ -61,6 +61,7 @@ dependencies { implementation libs.zstd implementation libs.jsoup implementation libs.commons.io + implementation libs.commons.compress implementation libs.commons.lang3 implementation libs.bundles.mariadb diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/ExecutorActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/ExecutorActor.java index d06549ba..ee7fb1d3 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/ExecutorActor.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/ExecutorActor.java @@ -17,7 +17,9 @@ public enum ExecutorActor { EXPORT_FEEDS, PROC_INDEX_CONSTRUCTOR_SPAWNER, CONVERT, - RESTORE_BACKUP, EXPORT_SAMPLE_DATA; + RESTORE_BACKUP, + EXPORT_SAMPLE_DATA, + DOWNLOAD_SAMPLE; public String id() { return "fsm:" + name().toLowerCase(); diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/ExecutorActorControlService.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/ExecutorActorControlService.java index 9ff2d1ed..53abdfe3 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/ExecutorActorControlService.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/ExecutorActorControlService.java @@ -47,6 +47,7 @@ public class ExecutorActorControlService { ExportFeedsActor exportFeedsActor, ExportSampleDataActor exportSampleDataActor, ExportTermFreqActor exportTermFrequenciesActor, + DownloadSampleActor downloadSampleActor, ExecutorActorStateMachines stateMachines) { this.messageQueueFactory = messageQueueFactory; this.eventLog = baseServiceParams.eventLog; @@ 
-75,6 +76,8 @@ public class ExecutorActorControlService { register(ExecutorActor.EXPORT_FEEDS, exportFeedsActor); register(ExecutorActor.EXPORT_SAMPLE_DATA, exportSampleDataActor); register(ExecutorActor.EXPORT_TERM_FREQUENCIES, exportTermFrequenciesActor); + + register(ExecutorActor.DOWNLOAD_SAMPLE, downloadSampleActor); } private void register(ExecutorActor process, RecordActorPrototype graph) { diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ConvertActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ConvertActor.java index 9cd08ea4..aed6d05a 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ConvertActor.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ConvertActor.java @@ -13,7 +13,6 @@ import nu.marginalia.process.ProcessService; import nu.marginalia.sideload.SideloadHelper; import nu.marginalia.sideload.StackExchangeSideloadHelper; import nu.marginalia.storage.FileStorageService; -import nu.marginalia.storage.model.FileStorageBaseType; import nu.marginalia.storage.model.FileStorageId; import nu.marginalia.storage.model.FileStorageState; import nu.marginalia.storage.model.FileStorageType; @@ -49,8 +48,7 @@ public class ConvertActor extends RecordActorPrototype { return switch (self) { case Convert (FileStorageId fid) -> { var toProcess = storageService.getStorage(fid); - var base = storageService.getStorageBase(FileStorageBaseType.STORAGE); - var processedArea = storageService.allocateTemporaryStorage(base, + var processedArea = storageService.allocateStorage( FileStorageType.PROCESSED_DATA, "processed-data", "Processed Data; " + toProcess.description()); @@ -69,8 +67,7 @@ public class ConvertActor extends RecordActorPrototype { String fileName = sourcePath.toFile().getName(); - var base = storageService.getStorageBase(FileStorageBaseType.STORAGE); - var processedArea = 
storageService.allocateTemporaryStorage(base, + var processedArea = storageService.allocateStorage( FileStorageType.PROCESSED_DATA, "processed-data", "Processed Dirtree Data; " + fileName); @@ -88,8 +85,7 @@ public class ConvertActor extends RecordActorPrototype { String fileName = sourcePath.toFile().getName(); - var base = storageService.getStorageBase(FileStorageBaseType.STORAGE); - var processedArea = storageService.allocateTemporaryStorage(base, + var processedArea = storageService.allocateStorage( FileStorageType.PROCESSED_DATA, "processed-data", "Processed Warc Data; " + fileName); @@ -121,8 +117,7 @@ public class ConvertActor extends RecordActorPrototype { String fileName = sourcePath.toFile().getName(); - var base = storageService.getStorageBase(FileStorageBaseType.STORAGE); - var processedArea = storageService.allocateTemporaryStorage(base, + var processedArea = storageService.allocateStorage( FileStorageType.PROCESSED_DATA, "processed-data", "Processed Encylopedia Data; " + fileName); @@ -171,8 +166,7 @@ public class ConvertActor extends RecordActorPrototype { String fileName = sourcePath.toFile().getName(); - var base = storageService.getStorageBase(FileStorageBaseType.STORAGE); - var processedArea = storageService.allocateTemporaryStorage(base, + var processedArea = storageService.allocateStorage( FileStorageType.PROCESSED_DATA, "processed-data", "Processed Stackexchange Data; " + fileName); diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ConvertAndLoadActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ConvertAndLoadActor.java index 87f9a8f0..86918c55 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ConvertAndLoadActor.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ConvertAndLoadActor.java @@ -17,14 +17,12 @@ import nu.marginalia.service.module.ServiceConfiguration; import 
nu.marginalia.storage.model.FileStorageState; import nu.marginalia.svc.BackupService; import nu.marginalia.storage.FileStorageService; -import nu.marginalia.storage.model.FileStorageBaseType; import nu.marginalia.storage.model.FileStorageId; import nu.marginalia.storage.model.FileStorageType; import nu.marginalia.index.client.IndexClient; import nu.marginalia.index.client.IndexMqEndpoints; import nu.marginalia.mq.MqMessageState; import nu.marginalia.mq.outbox.MqOutbox; -import nu.marginalia.mqapi.converting.ConvertAction; import nu.marginalia.mqapi.converting.ConvertRequest; import nu.marginalia.mqapi.index.CreateIndexRequest; import nu.marginalia.mqapi.index.IndexName; @@ -96,8 +94,7 @@ public class ConvertAndLoadActor extends RecordActorPrototype { if (storage.type() != FileStorageType.CRAWL_DATA) yield new Error("Bad storage type " + storage.type()); - var base = storageService.getStorageBase(FileStorageBaseType.STORAGE); - var processedArea = storageService.allocateTemporaryStorage(base, FileStorageType.PROCESSED_DATA, "processed-data", + var processedArea = storageService.allocateStorage(FileStorageType.PROCESSED_DATA, "processed-data", "Processed Data; " + storage.description()); storageService.setFileStorageState(processedArea.id(), FileStorageState.NEW); diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/CrawlActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/CrawlActor.java index 79d1c8ad..3e097554 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/CrawlActor.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/CrawlActor.java @@ -10,7 +10,6 @@ import nu.marginalia.actor.state.Resume; import nu.marginalia.process.ProcessOutboxes; import nu.marginalia.process.ProcessService; import nu.marginalia.storage.FileStorageService; -import nu.marginalia.storage.model.FileStorageBaseType; import 
nu.marginalia.storage.model.FileStorageId; import nu.marginalia.storage.model.FileStorageType; import nu.marginalia.mq.MqMessageState; @@ -43,9 +42,7 @@ public class CrawlActor extends RecordActorPrototype { if (storage == null) yield new Error("Bad storage id"); if (storage.type() != FileStorageType.CRAWL_SPEC) yield new Error("Bad storage type " + storage.type()); - var base = storageService.getStorageBase(FileStorageBaseType.STORAGE); - var dataArea = storageService.allocateTemporaryStorage( - base, + var dataArea = storageService.allocateStorage( FileStorageType.CRAWL_DATA, "crawl-data", storage.description()); diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/CrawlJobExtractorActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/CrawlJobExtractorActor.java index faba7b05..27ed6a08 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/CrawlJobExtractorActor.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/CrawlJobExtractorActor.java @@ -7,7 +7,6 @@ import nu.marginalia.actor.prototype.RecordActorPrototype; import nu.marginalia.actor.state.ActorStep; import nu.marginalia.crawlspec.CrawlSpecFileNames; import nu.marginalia.storage.FileStorageService; -import nu.marginalia.storage.model.FileStorageBaseType; import nu.marginalia.storage.model.FileStorageType; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -41,8 +40,7 @@ public class CrawlJobExtractorActor extends RecordActorPrototype { public ActorStep transition(ActorStep self) throws Exception { return switch (self) { case CreateFromUrl(String description, String url) -> { - var base = fileStorageService.getStorageBase(FileStorageBaseType.STORAGE); - var storage = fileStorageService.allocateTemporaryStorage(base, FileStorageType.CRAWL_SPEC, "crawl-spec", description); + var storage = fileStorageService.allocateStorage(FileStorageType.CRAWL_SPEC, "crawl-spec", 
description);
 
                 Path urlsTxt = storage.asPath().resolve("urls.txt");
 
diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/DownloadSampleActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/DownloadSampleActor.java
new file mode 100644
index 00000000..d554e7fe
--- /dev/null
+++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/DownloadSampleActor.java
@@ -0,0 +1,164 @@
+package nu.marginalia.actor.task;
+
+import com.google.gson.Gson;
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import nu.marginalia.actor.prototype.RecordActorPrototype;
+import nu.marginalia.actor.state.ActorStep;
+import nu.marginalia.storage.FileStorageService;
+import nu.marginalia.storage.model.*;
+import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
+import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.net.MalformedURLException;
+import java.net.URI;
+import java.net.URL;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+import java.nio.file.attribute.PosixFilePermissions;
+
+/** Downloads a sample tar archive and unpacks it into a newly allocated
+ *  CRAWL_DATA storage area.
+ */
+@Singleton
+public class DownloadSampleActor extends RecordActorPrototype {
+
+    private final FileStorageService storageService;
+    private final Logger logger = LoggerFactory.getLogger(getClass());
+
+    /** Entry step; setName selects which sample archive to download */
+    public record Run(String setName) implements ActorStep {}
+
+    @Override
+    public ActorStep transition(ActorStep self) throws Exception {
+        return switch(self) {
+            case Run(String setName) -> {
+                final FileStorage newStorage = storageService.allocateStorage(
+                        FileStorageType.CRAWL_DATA,
+                        "sample-crawl-data",
+                        "Sample " + setName);
+
+                storageService.setFileStorageState(newStorage.id(), FileStorageState.NEW);
+
+                URL downloadURI = getDownloadURL(setName);
+
+                try {
+                    downloadArchive(downloadURI, newStorage.asPath());
+                }
+                catch (IOException ex) {
+                    logger.error("Error downloading sample", ex);
+                    storageService.flagFileForDeletion(newStorage.id());
+                    yield new Error();
+                }
+                finally {
+                    // NOTE(review): this also runs after flagFileForDeletion() above; confirm
+                    // that resetting the state to UNSET does not undo the deletion flag
+                    storageService.setFileStorageState(newStorage.id(), FileStorageState.UNSET);
+                }
+
+                yield new End();
+            }
+            default -> new Error();
+        };
+    }
+
+    /** Streams the tar archive at downloadURI and extracts its non-directory
+     *  entries below outputPath, creating parent directories as needed.
+     *
+     * @throws IOException if the download or extraction fails, or if an entry
+     *                     would be written outside of outputPath
+     * @throws InterruptedException if the thread is interrupted between entries
+     */
+    private void downloadArchive(URL downloadURI, Path outputPath) throws IOException, InterruptedException {
+        // See the documentation for commons compress:
+        // https://commons.apache.org/proper/commons-compress/examples.html
+
+        // Entry names come from the archive and are not trusted; they are resolved
+        // against this normalized root and rejected if they escape it
+        final Path outputRoot = outputPath.toAbsolutePath().normalize();
+
+        try (var tar = new TarArchiveInputStream(downloadURI.openStream())) {
+            TarArchiveEntry nextEntry;
+            byte[] buffer = new byte[8192];
+
+            while ((nextEntry = tar.getNextEntry()) != null) {
+                // Poll for interruption, to ensure this can be cancelled
+                if (Thread.interrupted()) {
+                    throw new InterruptedException();
+                }
+
+                if (nextEntry.isDirectory()) {
+                    continue;
+                }
+
+                Path outputFile = outputRoot.resolve(nextEntry.getName()).normalize();
+                if (!outputFile.startsWith(outputRoot)) {
+                    throw new IOException("Tar entry escapes output directory: " + nextEntry.getName());
+                }
+
+                Files.createDirectories(outputFile.getParent(),
+                        PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rwxr-xr-x"))
+                );
+
+                long size = nextEntry.getSize();
+
+                // Extract tar entry
+                try (var fos = Files.newOutputStream(outputFile, StandardOpenOption.CREATE)) {
+                    transferBytes(tar, fos, buffer, size);
+                }
+
+                // Permissions go on the extracted file; applying "rw-r--r--" to the output
+                // directory would strip its search bit and break the following entries
+                Files.setPosixFilePermissions(outputFile, PosixFilePermissions.fromString("rw-r--r--"));
+            }
+        }
+    }
+
+    /** Copies size bytes from inputStream to outputStream, using buffer as scratch space.
+     *
+     * @throws IOException if the input ends before size bytes have been read
+     */
+    private void transferBytes(InputStream inputStream, OutputStream outputStream, byte[] buffer, long size)
+            throws IOException
+    {
+        long copiedSize = 0;
+
+        while (copiedSize < size) {
+            // Never request more than what remains of the promised length
+            int wanted = (int) Math.min(buffer.length, size - copiedSize);
+            int read = inputStream.read(buffer, 0, wanted);
+
+            if (read < 0) // We've been promised a file of length 'size', so this shouldn't happen, but just in case...
+                throw new IOException("Unexpected end of stream");
+
+            outputStream.write(buffer, 0, read);
+            copiedSize += read;
+        }
+    }
+
+
+    /** Builds the download URL for the given sample set name */
+    private URL getDownloadURL(String setName) throws MalformedURLException {
+        return URI.create(STR."https://downloads.marginalia.nu/samples/\{setName}.tar").toURL();
+    }
+
+    @Override
+    public String describe() {
+        return "Download a sample of crawl data from downloads.marginalia.nu";
+    }
+
+    @Inject
+    public DownloadSampleActor(Gson gson,
+                               FileStorageService storageService)
+    {
+        super(gson);
+        this.storageService = storageService;
+    }
+
+}
diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportAtagsActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportAtagsActor.java
index 3a06fecb..5323302b 100644
--- a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportAtagsActor.java
+++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportAtagsActor.java
@@ -23,8 +23,7 @@ public class ExportAtagsActor extends RecordActorPrototype {
     public ActorStep transition(ActorStep self) throws Exception {
         return switch(self) {
             case Export(FileStorageId crawlId) -> {
-                var storageBase = storageService.getStorageBase(FileStorageBaseType.STORAGE);
-                var storage = storageService.allocateTemporaryStorage(storageBase, FileStorageType.EXPORT, "atag-export", "Anchor Tags " + LocalDateTime.now());
+                var storage = storageService.allocateStorage(FileStorageType.EXPORT, "atag-export", "Anchor Tags " + LocalDateTime.now());
 
                 if (storage == null) yield new Error("Bad storage id");
                 yield new Run(crawlId, storage.id());
diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportDataActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportDataActor.java
index 572a85c5..042a3ec7 100644
--- 
a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportDataActor.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportDataActor.java @@ -8,7 +8,6 @@ import nu.marginalia.actor.prototype.RecordActorPrototype; import nu.marginalia.actor.state.ActorStep; import nu.marginalia.query.client.QueryClient; import nu.marginalia.storage.FileStorageService; -import nu.marginalia.storage.model.FileStorageBaseType; import nu.marginalia.storage.model.FileStorageId; import nu.marginalia.storage.model.FileStorageType; import org.slf4j.Logger; @@ -43,8 +42,7 @@ public class ExportDataActor extends RecordActorPrototype { public ActorStep transition(ActorStep self) throws Exception { return switch(self) { case Export() -> { - var storageBase = storageService.getStorageBase(FileStorageBaseType.STORAGE); - var storage = storageService.allocateTemporaryStorage(storageBase, FileStorageType.EXPORT, "db-export", "DB Exports " + LocalDateTime.now()); + var storage = storageService.allocateStorage(FileStorageType.EXPORT, "db-export", "DB Exports " + LocalDateTime.now()); if (storage == null) yield new Error("Bad storage id"); yield new ExportBlacklist(storage.id()); diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportFeedsActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportFeedsActor.java index 5df5b236..faaaf528 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportFeedsActor.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportFeedsActor.java @@ -8,7 +8,6 @@ import nu.marginalia.actor.state.ActorStep; import nu.marginalia.extractor.ExporterIf; import nu.marginalia.extractor.FeedExporter; import nu.marginalia.storage.FileStorageService; -import nu.marginalia.storage.model.FileStorageBaseType; import nu.marginalia.storage.model.FileStorageId; import 
nu.marginalia.storage.model.FileStorageState; import nu.marginalia.storage.model.FileStorageType; @@ -29,8 +28,7 @@ public class ExportFeedsActor extends RecordActorPrototype { public ActorStep transition(ActorStep self) throws Exception { return switch(self) { case Export(FileStorageId crawlId) -> { - var storageBase = storageService.getStorageBase(FileStorageBaseType.STORAGE); - var storage = storageService.allocateTemporaryStorage(storageBase, FileStorageType.EXPORT, "feed-export", "Feeds " + LocalDateTime.now()); + var storage = storageService.allocateStorage(FileStorageType.EXPORT, "feed-export", "Feeds " + LocalDateTime.now()); if (storage == null) yield new Error("Bad storage id"); yield new Run(crawlId, storage.id()); diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportSampleDataActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportSampleDataActor.java index 9954f619..fcc076a7 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportSampleDataActor.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportSampleDataActor.java @@ -5,11 +5,8 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.actor.prototype.RecordActorPrototype; import nu.marginalia.actor.state.ActorStep; -import nu.marginalia.extractor.ExporterIf; -import nu.marginalia.extractor.FeedExporter; import nu.marginalia.extractor.SampleDataExporter; import nu.marginalia.storage.FileStorageService; -import nu.marginalia.storage.model.FileStorageBaseType; import nu.marginalia.storage.model.FileStorageId; import nu.marginalia.storage.model.FileStorageState; import nu.marginalia.storage.model.FileStorageType; @@ -30,8 +27,7 @@ public class ExportSampleDataActor extends RecordActorPrototype { public ActorStep transition(ActorStep self) throws Exception { return switch(self) { case Export(FileStorageId crawlId, 
int size, String name) -> { - var storageBase = storageService.getStorageBase(FileStorageBaseType.STORAGE); - var storage = storageService.allocateTemporaryStorage(storageBase, FileStorageType.EXPORT, + var storage = storageService.allocateStorage(FileStorageType.EXPORT, "crawl-sample-export", STR."Crawl Data Sample \{name}/\{size} \{LocalDateTime.now()}" ); diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportTermFreqActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportTermFreqActor.java index d04b75d2..a47fcabd 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportTermFreqActor.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportTermFreqActor.java @@ -8,7 +8,6 @@ import nu.marginalia.actor.state.ActorStep; import nu.marginalia.extractor.ExporterIf; import nu.marginalia.extractor.TermFrequencyExporter; import nu.marginalia.storage.FileStorageService; -import nu.marginalia.storage.model.FileStorageBaseType; import nu.marginalia.storage.model.FileStorageId; import nu.marginalia.storage.model.FileStorageState; import nu.marginalia.storage.model.FileStorageType; @@ -25,8 +24,7 @@ public class ExportTermFreqActor extends RecordActorPrototype { public ActorStep transition(ActorStep self) throws Exception { return switch(self) { case Export(FileStorageId crawlId) -> { - var storageBase = storageService.getStorageBase(FileStorageBaseType.STORAGE); - var storage = storageService.allocateTemporaryStorage(storageBase, FileStorageType.EXPORT, "term-freq-export", "Term Frequencies " + LocalDateTime.now()); + var storage = storageService.allocateStorage(FileStorageType.EXPORT, "term-freq-export", "Term Frequencies " + LocalDateTime.now()); if (storage == null) yield new Error("Bad storage id"); yield new Run(crawlId, storage.id()); diff --git 
a/code/services-core/executor-service/src/main/java/nu/marginalia/executor/ExecutorSvc.java b/code/services-core/executor-service/src/main/java/nu/marginalia/executor/ExecutorSvc.java index d4420624..613ca18f 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/executor/ExecutorSvc.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/executor/ExecutorSvc.java @@ -71,6 +71,8 @@ public class ExecutorSvc extends Service { Spark.post("/sideload/stackexchange", sideloadService::sideloadStackexchange); Spark.post("/sideload/encyclopedia", sideloadService::sideloadEncyclopedia); + Spark.post("/action/download-sample-data", sideloadService::downloadSampleData); + Spark.post("/export/atags", exportService::exportAtags); Spark.post("/export/sample-data", exportService::exportSampleData); Spark.post("/export/feeds", exportService::exportFeeds); diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/executor/svc/SideloadService.java b/code/services-core/executor-service/src/main/java/nu/marginalia/executor/svc/SideloadService.java index dd766094..8ef39c12 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/executor/svc/SideloadService.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/executor/svc/SideloadService.java @@ -5,8 +5,11 @@ import nu.marginalia.WmsaHome; import nu.marginalia.actor.ExecutorActor; import nu.marginalia.actor.ExecutorActorControlService; import nu.marginalia.actor.task.ConvertActor; +import nu.marginalia.actor.task.DownloadSampleActor; import nu.marginalia.executor.upload.UploadDirContents; import nu.marginalia.executor.upload.UploadDirItem; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import spark.Request; import spark.Response; @@ -18,6 +21,7 @@ import java.util.List; public class SideloadService { private final ExecutorActorControlService actorControlService; + private static final Logger logger = 
LoggerFactory.getLogger(SideloadService.class); @Inject public SideloadService(ExecutorActorControlService actorControlService) { @@ -56,4 +60,11 @@ public class SideloadService { } + public Object downloadSampleData(Request request, Response response) throws Exception { + String sampleSet = request.queryParams("set"); + + actorControlService.startFrom(ExecutorActor.DOWNLOAD_SAMPLE, new DownloadSampleActor.Run(sampleSet)); + + return ""; + } } diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/executor/svc/TransferService.java b/code/services-core/executor-service/src/main/java/nu/marginalia/executor/svc/TransferService.java index 0700fbc9..ed84695f 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/executor/svc/TransferService.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/executor/svc/TransferService.java @@ -15,7 +15,6 @@ import nu.marginalia.mq.persistence.MqPersistence; import nu.marginalia.process.log.WorkLog; import nu.marginalia.service.module.ServiceConfiguration; import nu.marginalia.storage.FileStorageService; -import nu.marginalia.storage.model.FileStorageBaseType; import nu.marginalia.storage.model.FileStorageId; import nu.marginalia.storage.model.FileStorageType; import org.apache.commons.io.FileUtils; @@ -187,8 +186,7 @@ public class TransferService { // Ensure crawl data exists to receive into if (storages.isEmpty()) { - var storage = fileStorageService.allocateTemporaryStorage( - fileStorageService.getStorageBase(FileStorageBaseType.STORAGE), + var storage = fileStorageService.allocateStorage( FileStorageType.CRAWL_DATA, "crawl-data", "Crawl Data" diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/svc/BackupService.java b/code/services-core/executor-service/src/main/java/nu/marginalia/svc/BackupService.java index 2205e5e2..ec0f561b 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/svc/BackupService.java +++ 
b/code/services-core/executor-service/src/main/java/nu/marginalia/svc/BackupService.java @@ -5,7 +5,6 @@ import com.github.luben.zstd.ZstdOutputStream; import nu.marginalia.IndexLocations; import nu.marginalia.service.control.ServiceHeartbeat; import nu.marginalia.storage.FileStorageService; -import nu.marginalia.storage.model.FileStorageBaseType; import nu.marginalia.storage.model.FileStorageId; import nu.marginalia.storage.model.FileStorageType; import nu.marginallia.index.journal.IndexJournalFileNames; @@ -45,11 +44,9 @@ public class BackupService { * This backup can later be dehydrated and quickly loaded into _LIVE. * */ public void createBackupFromStaging(List associatedIds) throws SQLException, IOException { - var backupBase = storageService.getStorageBase(FileStorageBaseType.BACKUP); - String desc = "Pre-load backup snapshot " + LocalDateTime.now(); - var backupStorage = storageService.allocateTemporaryStorage(backupBase, + var backupStorage = storageService.allocateStorage( FileStorageType.BACKUP, "snapshot", desc); for (var associatedId : associatedIds) {