diff --git a/code/api/executor-api/src/main/java/nu/marginalia/executor/client/ExecutorClient.java b/code/api/executor-api/src/main/java/nu/marginalia/executor/client/ExecutorClient.java index b5554edd..df9c4072 100644 --- a/code/api/executor-api/src/main/java/nu/marginalia/executor/client/ExecutorClient.java +++ b/code/api/executor-api/src/main/java/nu/marginalia/executor/client/ExecutorClient.java @@ -71,7 +71,6 @@ public class ExecutorClient extends AbstractDynamicClient { post(ctx, node, "/sideload/encyclopedia?path="+ URLEncoder.encode(sourcePath.toString(), StandardCharsets.UTF_8) + "&baseUrl=" + URLEncoder.encode(baseUrl, StandardCharsets.UTF_8), "").blockingSubscribe(); - } public void sideloadDirtree(Context ctx, int node, Path sourcePath) { @@ -111,6 +110,10 @@ public class ExecutorClient extends AbstractDynamicClient { post(ctx, node, "/export/termfreq?fid="+fid, "").blockingSubscribe(); } + /** Asks the executor on the given node to download a sample data set: POSTs to /action/download-sample-data with the URL-encoded set name in the "set" query parameter and blocks on the response. */ public void downloadSampleData(Context ctx, int node, String sampleSet) { + post(ctx, node, "/action/download-sample-data?set="+URLEncoder.encode(sampleSet, StandardCharsets.UTF_8), "").blockingSubscribe(); + } + public void exportData(Context ctx, int node) { post(ctx, node, "/export/data", "").blockingSubscribe(); } @@ -166,4 +169,5 @@ public class ExecutorClient extends AbstractDynamicClient { public void yieldDomain(Context context, int node, TransferItem item) { post(context, node, "/transfer/yield", item).blockingSubscribe(); } + } diff --git a/code/common/config/src/main/java/nu/marginalia/storage/FileStorageService.java b/code/common/config/src/main/java/nu/marginalia/storage/FileStorageService.java index 999a51a5..b2162664 100644 --- a/code/common/config/src/main/java/nu/marginalia/storage/FileStorageService.java +++ b/code/common/config/src/main/java/nu/marginalia/storage/FileStorageService.java @@ -223,14 +223,12 @@ public class FileStorageService { return maybePath; } - /** Allocate a temporary storage of the given type */ - public FileStorage
allocateTemporaryStorage(FileStorageBase base, - FileStorageType type, - String prefix, - String description) throws IOException, SQLException + /** Allocate a storage area of the given type */ + public FileStorage allocateStorage(FileStorageType type, + String prefix, + String description) throws IOException, SQLException { - if (!base.type().permitsStorageType(type)) - throw new RuntimeException("Attempting to allocate storage of type " + type + " in base of type " + base.type()); + var base = getStorageBase(FileStorageBaseType.forFileStorageType(type)); Path newDir = allocateDirectory(base.asPath(), prefix); diff --git a/code/common/config/src/main/java/nu/marginalia/storage/model/FileStorageBaseType.java b/code/common/config/src/main/java/nu/marginalia/storage/model/FileStorageBaseType.java index 7bd1eb0d..d319786f 100644 --- a/code/common/config/src/main/java/nu/marginalia/storage/model/FileStorageBaseType.java +++ b/code/common/config/src/main/java/nu/marginalia/storage/model/FileStorageBaseType.java @@ -1,18 +1,17 @@ package nu.marginalia.storage.model; -import java.util.EnumSet; - public enum FileStorageBaseType { CURRENT, WORK, STORAGE, BACKUP; - public boolean permitsStorageType(FileStorageType type) { - return switch (this) { - case BACKUP -> FileStorageType.BACKUP.equals(type); - case STORAGE -> EnumSet.of(FileStorageType.EXPORT, FileStorageType.CRAWL_DATA, FileStorageType.PROCESSED_DATA, FileStorageType.CRAWL_SPEC).contains(type); - default -> false; + + /** Maps a file storage type to the storage base type it belongs in: EXPORT, CRAWL_DATA, PROCESSED_DATA and CRAWL_SPEC map to STORAGE; BACKUP maps to BACKUP. */ public static FileStorageBaseType forFileStorageType(FileStorageType type) { + return switch (type) { + case EXPORT, CRAWL_DATA, PROCESSED_DATA, CRAWL_SPEC -> STORAGE; + case BACKUP -> BACKUP; }; } + } diff --git a/code/common/config/src/test/java/nu/marginalia/storage/FileStorageServiceTest.java b/code/common/config/src/test/java/nu/marginalia/storage/FileStorageServiceTest.java index 69fa9b1b..c9ffa309 100644 --- a/code/common/config/src/test/java/nu/marginalia/storage/FileStorageServiceTest.java +++
b/code/common/config/src/test/java/nu/marginalia/storage/FileStorageServiceTest.java @@ -13,18 +13,14 @@ import org.testcontainers.containers.MariaDBContainer; import org.testcontainers.junit.jupiter.Container; import org.testcontainers.junit.jupiter.Testcontainers; -import java.io.FileNotFoundException; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.sql.SQLException; import java.util.ArrayList; import java.util.List; -import java.util.Objects; import java.util.UUID; -import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD; - @Testcontainers @Execution(ExecutionMode.SAME_THREAD) @Tag("slow") @@ -124,8 +120,7 @@ public class FileStorageServiceTest { var storage = new FileStorageService(dataSource, 0); - var base = storage.createStorageBase(name, createTempDir(), FileStorageBaseType.STORAGE); - var fileStorage = storage.allocateTemporaryStorage(base, FileStorageType.CRAWL_DATA, "xyz", "thisShouldSucceed"); + var fileStorage = storage.allocateStorage(FileStorageType.CRAWL_DATA, "xyz", "thisShouldSucceed"); System.out.println("Allocated " + fileStorage.asPath()); Assertions.assertTrue(Files.exists(fileStorage.asPath())); tempDirs.add(fileStorage.asPath()); diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/node/svc/ControlNodeActionsService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/node/svc/ControlNodeActionsService.java index 16b06998..a707aefe 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/node/svc/ControlNodeActionsService.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/node/svc/ControlNodeActionsService.java @@ -13,6 +13,8 @@ import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.model.FileStorageId; import nu.marginalia.storage.model.FileStorageState; import nu.marginalia.storage.model.FileStorageType; +import org.slf4j.Logger; +import 
org.slf4j.LoggerFactory; import spark.Request; import spark.Response; import spark.Spark; @@ -21,9 +23,11 @@ import java.nio.file.Path; import java.sql.SQLException; import java.util.Arrays; import java.util.List; +import java.util.Set; @Singleton public class ControlNodeActionsService { + private static final Logger logger = LoggerFactory.getLogger(ControlNodeActionsService.class); private final IndexClient indexClient; private final RedirectControl redirectControl; private final FileStorageService fileStorageService; @@ -62,6 +66,9 @@ public class ControlNodeActionsService { Spark.post("/public/nodes/:node/actions/sideload-stackexchange", this::sideloadStackexchange, redirectControl.renderRedirectAcknowledgement("Sideloading", "..") ); + Spark.post("/public/nodes/:node/actions/download-sample-data", this::downloadSampleData, + redirectControl.renderRedirectAcknowledgement("Downloading", "..") + ); Spark.post("/public/nodes/:id/actions/new-crawl", this::triggerNewCrawl, redirectControl.renderRedirectAcknowledgement("Crawling", "..") ); @@ -91,6 +98,21 @@ public class ControlNodeActionsService { ); } + /** Handler for the download-sample-data action: reads the "sample" query parameter, throws ControlValidationError when it is missing or not one of sample-s, sample-m, sample-l or sample-xl, then asks the executor of the node given by the ":node" path parameter to download that set. Returns an empty string. */ private Object downloadSampleData(Request request, Response response) { + String set = request.queryParams("sample"); + + if (set == null) + throw new ControlValidationError("No sample specified", "A sample data set must be specified", ".."); + if (!Set.of("sample-s", "sample-m", "sample-l", "sample-xl").contains(set)) + throw new ControlValidationError("Invalid sample specified", "A valid sample data set must be specified", ".."); + + executorClient.downloadSampleData(Context.fromRequest(request), Integer.parseInt(request.params("node")), set); + + logger.info("Downloading sample data set {}", set); + + return ""; + } + public Object sideloadEncyclopedia(Request request, Response response) { String source = request.queryParams("source"); diff --git a/code/services-core/control-service/src/main/resources/templates/control/node/actions/partial-download-sample-data.hdb
b/code/services-core/control-service/src/main/resources/templates/control/node/actions/partial-download-sample-data.hdb new file mode 100644 index 00000000..3c432e65 --- /dev/null +++ b/code/services-core/control-service/src/main/resources/templates/control/node/actions/partial-download-sample-data.hdb @@ -0,0 +1,47 @@ +

Download Sample Data

+ +
+This will download sample crawl data from downloads.marginalia.nu onto Node {{node.id}}. +This is a sample of real crawl data. It is intended for demo, testing and development purposes. Several sets are available. +
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
UseSetDescription
1000 Domains. About 2 GB.
2000 Domains. About 6 GB. Recommended.
5000 Domains. About 20 GB.
50,000 Domains. Around 180 GB. Primarily intended for pre-production-like testing environments. + Expect hours of processing time.
+ +
\ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/node/node-actions.hdb b/code/services-core/control-service/src/main/resources/templates/control/node/node-actions.hdb index 736c7961..ea4502fa 100644 --- a/code/services-core/control-service/src/main/resources/templates/control/node/node-actions.hdb +++ b/code/services-core/control-service/src/main/resources/templates/control/node/node-actions.hdb @@ -23,6 +23,7 @@ {{#if view.export-from-crawl-data}} {{> control/node/actions/partial-export-from-crawl-data }} {{/if}} {{#if view.export-sample-data}} {{> control/node/actions/partial-export-sample-data }} {{/if}} {{#if view.restore-backup}} {{> control/node/actions/partial-restore-backup }} {{/if}} + {{#if view.download-sample-data}} {{> control/node/actions/partial-download-sample-data }} {{/if}}
 
diff --git a/code/services-core/control-service/src/main/resources/templates/control/node/partial-node-nav.hdb b/code/services-core/control-service/src/main/resources/templates/control/node/partial-node-nav.hdb index ddb5ff4e..de963ea3 100644 --- a/code/services-core/control-service/src/main/resources/templates/control/node/partial-node-nav.hdb +++ b/code/services-core/control-service/src/main/resources/templates/control/node/partial-node-nav.hdb @@ -24,6 +24,7 @@
  • Sideload Stackexchange
  • Sideload WARC Files
  • Sideload Dirtree
  • +
  • Download Sample Crawl Data
  • Export Database Data
  • Export Sample Crawl Data
  • diff --git a/code/services-core/executor-service/build.gradle b/code/services-core/executor-service/build.gradle index d26eadce..26f97808 100644 --- a/code/services-core/executor-service/build.gradle +++ b/code/services-core/executor-service/build.gradle @@ -61,6 +61,7 @@ dependencies { implementation libs.zstd implementation libs.jsoup implementation libs.commons.io + implementation libs.commons.compress implementation libs.commons.lang3 implementation libs.bundles.mariadb diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/ExecutorActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/ExecutorActor.java index d06549ba..ee7fb1d3 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/ExecutorActor.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/ExecutorActor.java @@ -17,7 +17,9 @@ public enum ExecutorActor { EXPORT_FEEDS, PROC_INDEX_CONSTRUCTOR_SPAWNER, CONVERT, - RESTORE_BACKUP, EXPORT_SAMPLE_DATA; + RESTORE_BACKUP, + EXPORT_SAMPLE_DATA, + DOWNLOAD_SAMPLE; public String id() { return "fsm:" + name().toLowerCase(); diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/ExecutorActorControlService.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/ExecutorActorControlService.java index 9ff2d1ed..53abdfe3 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/ExecutorActorControlService.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/ExecutorActorControlService.java @@ -47,6 +47,7 @@ public class ExecutorActorControlService { ExportFeedsActor exportFeedsActor, ExportSampleDataActor exportSampleDataActor, ExportTermFreqActor exportTermFrequenciesActor, + DownloadSampleActor downloadSampleActor, ExecutorActorStateMachines stateMachines) { this.messageQueueFactory = messageQueueFactory; this.eventLog = baseServiceParams.eventLog; @@ 
-75,6 +76,8 @@ public class ExecutorActorControlService { register(ExecutorActor.EXPORT_FEEDS, exportFeedsActor); register(ExecutorActor.EXPORT_SAMPLE_DATA, exportSampleDataActor); register(ExecutorActor.EXPORT_TERM_FREQUENCIES, exportTermFrequenciesActor); + + register(ExecutorActor.DOWNLOAD_SAMPLE, downloadSampleActor); } private void register(ExecutorActor process, RecordActorPrototype graph) { diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ConvertActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ConvertActor.java index 9cd08ea4..aed6d05a 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ConvertActor.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ConvertActor.java @@ -13,7 +13,6 @@ import nu.marginalia.process.ProcessService; import nu.marginalia.sideload.SideloadHelper; import nu.marginalia.sideload.StackExchangeSideloadHelper; import nu.marginalia.storage.FileStorageService; -import nu.marginalia.storage.model.FileStorageBaseType; import nu.marginalia.storage.model.FileStorageId; import nu.marginalia.storage.model.FileStorageState; import nu.marginalia.storage.model.FileStorageType; @@ -49,8 +48,7 @@ public class ConvertActor extends RecordActorPrototype { return switch (self) { case Convert (FileStorageId fid) -> { var toProcess = storageService.getStorage(fid); - var base = storageService.getStorageBase(FileStorageBaseType.STORAGE); - var processedArea = storageService.allocateTemporaryStorage(base, + var processedArea = storageService.allocateStorage( FileStorageType.PROCESSED_DATA, "processed-data", "Processed Data; " + toProcess.description()); @@ -69,8 +67,7 @@ public class ConvertActor extends RecordActorPrototype { String fileName = sourcePath.toFile().getName(); - var base = storageService.getStorageBase(FileStorageBaseType.STORAGE); - var processedArea = 
storageService.allocateTemporaryStorage(base, + var processedArea = storageService.allocateStorage( FileStorageType.PROCESSED_DATA, "processed-data", "Processed Dirtree Data; " + fileName); @@ -88,8 +85,7 @@ public class ConvertActor extends RecordActorPrototype { String fileName = sourcePath.toFile().getName(); - var base = storageService.getStorageBase(FileStorageBaseType.STORAGE); - var processedArea = storageService.allocateTemporaryStorage(base, + var processedArea = storageService.allocateStorage( FileStorageType.PROCESSED_DATA, "processed-data", "Processed Warc Data; " + fileName); @@ -121,8 +117,7 @@ public class ConvertActor extends RecordActorPrototype { String fileName = sourcePath.toFile().getName(); - var base = storageService.getStorageBase(FileStorageBaseType.STORAGE); - var processedArea = storageService.allocateTemporaryStorage(base, + var processedArea = storageService.allocateStorage( FileStorageType.PROCESSED_DATA, "processed-data", "Processed Encylopedia Data; " + fileName); @@ -171,8 +166,7 @@ public class ConvertActor extends RecordActorPrototype { String fileName = sourcePath.toFile().getName(); - var base = storageService.getStorageBase(FileStorageBaseType.STORAGE); - var processedArea = storageService.allocateTemporaryStorage(base, + var processedArea = storageService.allocateStorage( FileStorageType.PROCESSED_DATA, "processed-data", "Processed Stackexchange Data; " + fileName); diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ConvertAndLoadActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ConvertAndLoadActor.java index 87f9a8f0..86918c55 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ConvertAndLoadActor.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ConvertAndLoadActor.java @@ -17,14 +17,12 @@ import nu.marginalia.service.module.ServiceConfiguration; import 
nu.marginalia.storage.model.FileStorageState; import nu.marginalia.svc.BackupService; import nu.marginalia.storage.FileStorageService; -import nu.marginalia.storage.model.FileStorageBaseType; import nu.marginalia.storage.model.FileStorageId; import nu.marginalia.storage.model.FileStorageType; import nu.marginalia.index.client.IndexClient; import nu.marginalia.index.client.IndexMqEndpoints; import nu.marginalia.mq.MqMessageState; import nu.marginalia.mq.outbox.MqOutbox; -import nu.marginalia.mqapi.converting.ConvertAction; import nu.marginalia.mqapi.converting.ConvertRequest; import nu.marginalia.mqapi.index.CreateIndexRequest; import nu.marginalia.mqapi.index.IndexName; @@ -96,8 +94,7 @@ public class ConvertAndLoadActor extends RecordActorPrototype { if (storage.type() != FileStorageType.CRAWL_DATA) yield new Error("Bad storage type " + storage.type()); - var base = storageService.getStorageBase(FileStorageBaseType.STORAGE); - var processedArea = storageService.allocateTemporaryStorage(base, FileStorageType.PROCESSED_DATA, "processed-data", + var processedArea = storageService.allocateStorage(FileStorageType.PROCESSED_DATA, "processed-data", "Processed Data; " + storage.description()); storageService.setFileStorageState(processedArea.id(), FileStorageState.NEW); diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/CrawlActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/CrawlActor.java index 79d1c8ad..3e097554 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/CrawlActor.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/CrawlActor.java @@ -10,7 +10,6 @@ import nu.marginalia.actor.state.Resume; import nu.marginalia.process.ProcessOutboxes; import nu.marginalia.process.ProcessService; import nu.marginalia.storage.FileStorageService; -import nu.marginalia.storage.model.FileStorageBaseType; import 
nu.marginalia.storage.model.FileStorageId; import nu.marginalia.storage.model.FileStorageType; import nu.marginalia.mq.MqMessageState; @@ -43,9 +42,7 @@ public class CrawlActor extends RecordActorPrototype { if (storage == null) yield new Error("Bad storage id"); if (storage.type() != FileStorageType.CRAWL_SPEC) yield new Error("Bad storage type " + storage.type()); - var base = storageService.getStorageBase(FileStorageBaseType.STORAGE); - var dataArea = storageService.allocateTemporaryStorage( - base, + var dataArea = storageService.allocateStorage( FileStorageType.CRAWL_DATA, "crawl-data", storage.description()); diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/CrawlJobExtractorActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/CrawlJobExtractorActor.java index faba7b05..27ed6a08 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/CrawlJobExtractorActor.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/CrawlJobExtractorActor.java @@ -7,7 +7,6 @@ import nu.marginalia.actor.prototype.RecordActorPrototype; import nu.marginalia.actor.state.ActorStep; import nu.marginalia.crawlspec.CrawlSpecFileNames; import nu.marginalia.storage.FileStorageService; -import nu.marginalia.storage.model.FileStorageBaseType; import nu.marginalia.storage.model.FileStorageType; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -41,8 +40,7 @@ public class CrawlJobExtractorActor extends RecordActorPrototype { public ActorStep transition(ActorStep self) throws Exception { return switch (self) { case CreateFromUrl(String description, String url) -> { - var base = fileStorageService.getStorageBase(FileStorageBaseType.STORAGE); - var storage = fileStorageService.allocateTemporaryStorage(base, FileStorageType.CRAWL_SPEC, "crawl-spec", description); + var storage = fileStorageService.allocateStorage(FileStorageType.CRAWL_SPEC, "crawl-spec", 
description); Path urlsTxt = storage.asPath().resolve("urls.txt");
diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/DownloadSampleActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/DownloadSampleActor.java
new file mode 100644
index 00000000..d554e7fe
--- /dev/null
+++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/DownloadSampleActor.java
@@ -0,0 +1,173 @@
+package nu.marginalia.actor.task;
+
+import com.google.gson.Gson;
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import nu.marginalia.actor.prototype.RecordActorPrototype;
+import nu.marginalia.actor.state.ActorStep;
+import nu.marginalia.storage.FileStorageService;
+import nu.marginalia.storage.model.*;
+import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
+import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.net.MalformedURLException;
+import java.net.URI;
+import java.net.URL;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+import java.nio.file.attribute.PosixFilePermissions;
+
+/**
+ * Actor that downloads a sample crawl data archive from downloads.marginalia.nu
+ * and unpacks it into a newly allocated CRAWL_DATA file storage.
+ */
+@Singleton
+public class DownloadSampleActor extends RecordActorPrototype {
+
+    private final FileStorageService storageService;
+    private final Logger logger = LoggerFactory.getLogger(getClass());
+
+    /** Step that downloads and unpacks the sample set with the given name. */
+    public record Run(String setName) implements ActorStep {}
+
+    /**
+     * Runs the download for a {@link Run} step; any other step yields an error.
+     * <p>
+     * The new storage is flagged for deletion if the download fails or is
+     * interrupted, and its state is reset to UNSET on every exit path.
+     */
+    @Override
+    public ActorStep transition(ActorStep self) throws Exception {
+        return switch(self) {
+            case Run(String setName) -> {
+                final FileStorage newStorage = storageService.allocateStorage(
+                        FileStorageType.CRAWL_DATA,
+                        "sample-crawl-data",
+                        "Sample " + setName);
+
+                storageService.setFileStorageState(newStorage.id(), FileStorageState.NEW);
+
+                try {
+                    downloadArchive(getDownloadURL(setName), newStorage.asPath());
+                }
+                catch (IOException ex) {
+                    logger.error("Error downloading sample", ex);
+                    storageService.flagFileForDeletion(newStorage.id());
+                    yield new Error("Error downloading sample: " + ex.getMessage());
+                }
+                catch (InterruptedException ex) {
+                    // Cancelled mid-download; don't leave a partial data set behind
+                    storageService.flagFileForDeletion(newStorage.id());
+                    throw ex;
+                }
+                finally {
+                    storageService.setFileStorageState(newStorage.id(), FileStorageState.UNSET);
+                }
+
+                yield new End();
+            }
+            default -> new Error();
+        };
+    }
+
+    /**
+     * Streams the tar archive at the given URL and extracts its regular file
+     * entries beneath outputPath.
+     *
+     * @throws IOException if the download or extraction fails, or if an entry
+     *                     would be written outside of outputPath
+     * @throws InterruptedException if the thread is interrupted between entries
+     */
+    private void downloadArchive(URL downloadURI, Path outputPath) throws IOException, InterruptedException {
+        // See the documentation for commons compress:
+        // https://commons.apache.org/proper/commons-compress/examples.html
+
+        final Path outputRoot = outputPath.normalize();
+
+        try (var tar = new TarArchiveInputStream(downloadURI.openStream())) {
+            TarArchiveEntry nextEntry;
+            byte[] buffer = new byte[8192];
+
+            while ((nextEntry = tar.getNextEntry()) != null) {
+                // Poll for interruption, to ensure this can be cancelled
+                if (Thread.interrupted()) {
+                    throw new InterruptedException();
+                }
+
+                if (nextEntry.isDirectory()) {
+                    continue;
+                }
+
+                // Refuse entries (e.g. "../x" or absolute names) that resolve outside the storage area
+                Path outputFile = outputRoot.resolve(nextEntry.getName()).normalize();
+                if (!outputFile.startsWith(outputRoot)) {
+                    throw new IOException("Tar entry outside of storage area: " + nextEntry.getName());
+                }
+
+                Files.createDirectories(outputFile.getParent(),
+                        PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rwxr-xr-x"))
+                );
+
+                long size = nextEntry.getSize();
+
+                // Extract tar entry
+                try (var fos = Files.newOutputStream(outputFile,
+                        StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)) {
+                    transferBytes(tar, fos, buffer, size);
+                }
+
+                // Apply the permissions to the extracted file, not the storage directory:
+                // rw-r--r-- on the directory would strip its execute bit and block later entries
+                Files.setPosixFilePermissions(outputFile, PosixFilePermissions.fromString("rw-r--r--"));
+            }
+        }
+    }
+
+    /**
+     * Copies exactly size bytes from inputStream to outputStream through buffer.
+     *
+     * @throws IOException if the stream ends before size bytes have been copied
+     */
+    private void transferBytes(InputStream inputStream, OutputStream outputStream, byte[] buffer, long size)
+            throws IOException
+    {
+        long copiedSize = 0;
+
+        while (copiedSize < size) {
+            // Never request more than what remains of the entry
+            int wanted = (int) Math.min(buffer.length, size - copiedSize);
+            int read = inputStream.read(buffer, 0, wanted);
+
+            if (read < 0) // We've been promised a file of length 'size', so this shouldn't happen, but just in case...
+                throw new IOException("Unexpected end of stream");
+
+            outputStream.write(buffer, 0, read);
+            copiedSize += read;
+        }
+    }
+
+    /** Builds the download URL of the named sample set on downloads.marginalia.nu. */
+    private URL getDownloadURL(String setName) throws MalformedURLException {
+        return URI.create(STR."https://downloads.marginalia.nu/samples/\{setName}.tar").toURL();
+    }
+
+    @Override
+    public String describe() {
+        return "Download a sample of crawl data from downloads.marginalia.nu";
+    }
+
+    @Inject
+    public DownloadSampleActor(Gson gson,
+                               FileStorageService storageService)
+    {
+        super(gson);
+        this.storageService = storageService;
+    }
+
+}
diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportAtagsActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportAtagsActor.java index 3a06fecb..5323302b 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportAtagsActor.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportAtagsActor.java @@ -23,8 +23,7 @@ public class ExportAtagsActor extends RecordActorPrototype { public ActorStep transition(ActorStep self) throws Exception { return switch(self) { case Export(FileStorageId crawlId) -> { - var storageBase = storageService.getStorageBase(FileStorageBaseType.STORAGE); - var storage = storageService.allocateTemporaryStorage(storageBase, FileStorageType.EXPORT, "atag-export", "Anchor Tags " + LocalDateTime.now()); + var storage = storageService.allocateStorage(FileStorageType.EXPORT, "atag-export", "Anchor Tags " + LocalDateTime.now()); if (storage == null) yield new Error("Bad storage id"); yield new Run(crawlId, storage.id()); diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportDataActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportDataActor.java index 572a85c5..042a3ec7 100644 ---
a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportDataActor.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportDataActor.java @@ -8,7 +8,6 @@ import nu.marginalia.actor.prototype.RecordActorPrototype; import nu.marginalia.actor.state.ActorStep; import nu.marginalia.query.client.QueryClient; import nu.marginalia.storage.FileStorageService; -import nu.marginalia.storage.model.FileStorageBaseType; import nu.marginalia.storage.model.FileStorageId; import nu.marginalia.storage.model.FileStorageType; import org.slf4j.Logger; @@ -43,8 +42,7 @@ public class ExportDataActor extends RecordActorPrototype { public ActorStep transition(ActorStep self) throws Exception { return switch(self) { case Export() -> { - var storageBase = storageService.getStorageBase(FileStorageBaseType.STORAGE); - var storage = storageService.allocateTemporaryStorage(storageBase, FileStorageType.EXPORT, "db-export", "DB Exports " + LocalDateTime.now()); + var storage = storageService.allocateStorage(FileStorageType.EXPORT, "db-export", "DB Exports " + LocalDateTime.now()); if (storage == null) yield new Error("Bad storage id"); yield new ExportBlacklist(storage.id()); diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportFeedsActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportFeedsActor.java index 5df5b236..faaaf528 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportFeedsActor.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportFeedsActor.java @@ -8,7 +8,6 @@ import nu.marginalia.actor.state.ActorStep; import nu.marginalia.extractor.ExporterIf; import nu.marginalia.extractor.FeedExporter; import nu.marginalia.storage.FileStorageService; -import nu.marginalia.storage.model.FileStorageBaseType; import nu.marginalia.storage.model.FileStorageId; import 
nu.marginalia.storage.model.FileStorageState; import nu.marginalia.storage.model.FileStorageType; @@ -29,8 +28,7 @@ public class ExportFeedsActor extends RecordActorPrototype { public ActorStep transition(ActorStep self) throws Exception { return switch(self) { case Export(FileStorageId crawlId) -> { - var storageBase = storageService.getStorageBase(FileStorageBaseType.STORAGE); - var storage = storageService.allocateTemporaryStorage(storageBase, FileStorageType.EXPORT, "feed-export", "Feeds " + LocalDateTime.now()); + var storage = storageService.allocateStorage(FileStorageType.EXPORT, "feed-export", "Feeds " + LocalDateTime.now()); if (storage == null) yield new Error("Bad storage id"); yield new Run(crawlId, storage.id()); diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportSampleDataActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportSampleDataActor.java index 9954f619..fcc076a7 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportSampleDataActor.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportSampleDataActor.java @@ -5,11 +5,8 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.actor.prototype.RecordActorPrototype; import nu.marginalia.actor.state.ActorStep; -import nu.marginalia.extractor.ExporterIf; -import nu.marginalia.extractor.FeedExporter; import nu.marginalia.extractor.SampleDataExporter; import nu.marginalia.storage.FileStorageService; -import nu.marginalia.storage.model.FileStorageBaseType; import nu.marginalia.storage.model.FileStorageId; import nu.marginalia.storage.model.FileStorageState; import nu.marginalia.storage.model.FileStorageType; @@ -30,8 +27,7 @@ public class ExportSampleDataActor extends RecordActorPrototype { public ActorStep transition(ActorStep self) throws Exception { return switch(self) { case Export(FileStorageId crawlId, 
int size, String name) -> { - var storageBase = storageService.getStorageBase(FileStorageBaseType.STORAGE); - var storage = storageService.allocateTemporaryStorage(storageBase, FileStorageType.EXPORT, + var storage = storageService.allocateStorage(FileStorageType.EXPORT, "crawl-sample-export", STR."Crawl Data Sample \{name}/\{size} \{LocalDateTime.now()}" ); diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportTermFreqActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportTermFreqActor.java index d04b75d2..a47fcabd 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportTermFreqActor.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportTermFreqActor.java @@ -8,7 +8,6 @@ import nu.marginalia.actor.state.ActorStep; import nu.marginalia.extractor.ExporterIf; import nu.marginalia.extractor.TermFrequencyExporter; import nu.marginalia.storage.FileStorageService; -import nu.marginalia.storage.model.FileStorageBaseType; import nu.marginalia.storage.model.FileStorageId; import nu.marginalia.storage.model.FileStorageState; import nu.marginalia.storage.model.FileStorageType; @@ -25,8 +24,7 @@ public class ExportTermFreqActor extends RecordActorPrototype { public ActorStep transition(ActorStep self) throws Exception { return switch(self) { case Export(FileStorageId crawlId) -> { - var storageBase = storageService.getStorageBase(FileStorageBaseType.STORAGE); - var storage = storageService.allocateTemporaryStorage(storageBase, FileStorageType.EXPORT, "term-freq-export", "Term Frequencies " + LocalDateTime.now()); + var storage = storageService.allocateStorage(FileStorageType.EXPORT, "term-freq-export", "Term Frequencies " + LocalDateTime.now()); if (storage == null) yield new Error("Bad storage id"); yield new Run(crawlId, storage.id()); diff --git 
a/code/services-core/executor-service/src/main/java/nu/marginalia/executor/ExecutorSvc.java b/code/services-core/executor-service/src/main/java/nu/marginalia/executor/ExecutorSvc.java index d4420624..613ca18f 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/executor/ExecutorSvc.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/executor/ExecutorSvc.java @@ -71,6 +71,8 @@ public class ExecutorSvc extends Service { Spark.post("/sideload/stackexchange", sideloadService::sideloadStackexchange); Spark.post("/sideload/encyclopedia", sideloadService::sideloadEncyclopedia); + Spark.post("/action/download-sample-data", sideloadService::downloadSampleData); + Spark.post("/export/atags", exportService::exportAtags); Spark.post("/export/sample-data", exportService::exportSampleData); Spark.post("/export/feeds", exportService::exportFeeds); diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/executor/svc/SideloadService.java b/code/services-core/executor-service/src/main/java/nu/marginalia/executor/svc/SideloadService.java index dd766094..8ef39c12 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/executor/svc/SideloadService.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/executor/svc/SideloadService.java @@ -5,8 +5,11 @@ import nu.marginalia.WmsaHome; import nu.marginalia.actor.ExecutorActor; import nu.marginalia.actor.ExecutorActorControlService; import nu.marginalia.actor.task.ConvertActor; +import nu.marginalia.actor.task.DownloadSampleActor; import nu.marginalia.executor.upload.UploadDirContents; import nu.marginalia.executor.upload.UploadDirItem; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import spark.Request; import spark.Response; @@ -18,6 +21,7 @@ import java.util.List; public class SideloadService { private final ExecutorActorControlService actorControlService; + private static final Logger logger = 
LoggerFactory.getLogger(SideloadService.class); @Inject public SideloadService(ExecutorActorControlService actorControlService) { @@ -56,4 +60,11 @@ public class SideloadService { } + public Object downloadSampleData(Request request, Response response) throws Exception { + String sampleSet = request.queryParams("set"); + + actorControlService.startFrom(ExecutorActor.DOWNLOAD_SAMPLE, new DownloadSampleActor.Run(sampleSet)); + + return ""; + } } diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/executor/svc/TransferService.java b/code/services-core/executor-service/src/main/java/nu/marginalia/executor/svc/TransferService.java index 0700fbc9..ed84695f 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/executor/svc/TransferService.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/executor/svc/TransferService.java @@ -15,7 +15,6 @@ import nu.marginalia.mq.persistence.MqPersistence; import nu.marginalia.process.log.WorkLog; import nu.marginalia.service.module.ServiceConfiguration; import nu.marginalia.storage.FileStorageService; -import nu.marginalia.storage.model.FileStorageBaseType; import nu.marginalia.storage.model.FileStorageId; import nu.marginalia.storage.model.FileStorageType; import org.apache.commons.io.FileUtils; @@ -187,8 +186,7 @@ public class TransferService { // Ensure crawl data exists to receive into if (storages.isEmpty()) { - var storage = fileStorageService.allocateTemporaryStorage( - fileStorageService.getStorageBase(FileStorageBaseType.STORAGE), + var storage = fileStorageService.allocateStorage( FileStorageType.CRAWL_DATA, "crawl-data", "Crawl Data" diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/svc/BackupService.java b/code/services-core/executor-service/src/main/java/nu/marginalia/svc/BackupService.java index 2205e5e2..ec0f561b 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/svc/BackupService.java +++ 
b/code/services-core/executor-service/src/main/java/nu/marginalia/svc/BackupService.java @@ -5,7 +5,6 @@ import com.github.luben.zstd.ZstdOutputStream; import nu.marginalia.IndexLocations; import nu.marginalia.service.control.ServiceHeartbeat; import nu.marginalia.storage.FileStorageService; -import nu.marginalia.storage.model.FileStorageBaseType; import nu.marginalia.storage.model.FileStorageId; import nu.marginalia.storage.model.FileStorageType; import nu.marginallia.index.journal.IndexJournalFileNames; @@ -45,11 +44,9 @@ public class BackupService { * This backup can later be dehydrated and quickly loaded into _LIVE. * */ public void createBackupFromStaging(List associatedIds) throws SQLException, IOException { - var backupBase = storageService.getStorageBase(FileStorageBaseType.BACKUP); - String desc = "Pre-load backup snapshot " + LocalDateTime.now(); - var backupStorage = storageService.allocateTemporaryStorage(backupBase, + var backupStorage = storageService.allocateStorage( FileStorageType.BACKUP, "snapshot", desc); for (var associatedId : associatedIds) {