(*) Add download-sample action, refactor file storage

This changeset adds an action for downloading a set of sample data from downloads.marginalia.nu.

It also refactors some leaky abstractions out of FileStorageService.  allocateTemporaryStorage has been renamed to allocateStorage, since the storage was never temporary in any scenario.

allocateStorage also no longer takes a storage base, as there was always only one valid option for this input.  The method finds the appropriate base itself.
This commit is contained in:
Viktor Lofgren 2024-01-25 13:36:30 +01:00
parent 1b8b97b8ec
commit cae1bad274
25 changed files with 256 additions and 67 deletions

View File

@ -71,7 +71,6 @@ public class ExecutorClient extends AbstractDynamicClient {
post(ctx, node, post(ctx, node,
"/sideload/encyclopedia?path="+ URLEncoder.encode(sourcePath.toString(), StandardCharsets.UTF_8) + "&baseUrl=" + URLEncoder.encode(baseUrl, StandardCharsets.UTF_8), "/sideload/encyclopedia?path="+ URLEncoder.encode(sourcePath.toString(), StandardCharsets.UTF_8) + "&baseUrl=" + URLEncoder.encode(baseUrl, StandardCharsets.UTF_8),
"").blockingSubscribe(); "").blockingSubscribe();
} }
public void sideloadDirtree(Context ctx, int node, Path sourcePath) { public void sideloadDirtree(Context ctx, int node, Path sourcePath) {
@ -111,6 +110,10 @@ public class ExecutorClient extends AbstractDynamicClient {
post(ctx, node, "/export/termfreq?fid="+fid, "").blockingSubscribe(); post(ctx, node, "/export/termfreq?fid="+fid, "").blockingSubscribe();
} }
/**
 * Requests that the given node download a sample data set.
 * <p>
 * Sends a POST with an empty body to {@code /action/download-sample-data},
 * passing the URL-encoded set name as the {@code set} query parameter, and
 * waits on the result via {@code blockingSubscribe()}.
 *
 * @param ctx       request context forwarded to {@code post}
 * @param node      id of the node the request is addressed to
 * @param sampleSet name of the sample set; URL-encoded as UTF-8 before sending
 */
public void downloadSampleData(Context ctx, int node, String sampleSet) {
    final String encodedSet = URLEncoder.encode(sampleSet, StandardCharsets.UTF_8);
    final String endpoint = "/action/download-sample-data?set=" + encodedSet;

    post(ctx, node, endpoint, "").blockingSubscribe();
}
public void exportData(Context ctx, int node) { public void exportData(Context ctx, int node) {
post(ctx, node, "/export/data", "").blockingSubscribe(); post(ctx, node, "/export/data", "").blockingSubscribe();
} }
@ -166,4 +169,5 @@ public class ExecutorClient extends AbstractDynamicClient {
public void yieldDomain(Context context, int node, TransferItem item) { public void yieldDomain(Context context, int node, TransferItem item) {
post(context, node, "/transfer/yield", item).blockingSubscribe(); post(context, node, "/transfer/yield", item).blockingSubscribe();
} }
} }

View File

@ -223,14 +223,12 @@ public class FileStorageService {
return maybePath; return maybePath;
} }
/** Allocate a temporary storage of the given type */ /** Allocate a storage area of the given type */
public FileStorage allocateTemporaryStorage(FileStorageBase base, public FileStorage allocateStorage(FileStorageType type,
FileStorageType type, String prefix,
String prefix, String description) throws IOException, SQLException
String description) throws IOException, SQLException
{ {
if (!base.type().permitsStorageType(type)) var base = getStorageBase(FileStorageBaseType.forFileStorageType(type));
throw new RuntimeException("Attempting to allocate storage of type " + type + " in base of type " + base.type());
Path newDir = allocateDirectory(base.asPath(), prefix); Path newDir = allocateDirectory(base.asPath(), prefix);

View File

@ -1,18 +1,17 @@
package nu.marginalia.storage.model; package nu.marginalia.storage.model;
import java.util.EnumSet;
public enum FileStorageBaseType { public enum FileStorageBaseType {
CURRENT, CURRENT,
WORK, WORK,
STORAGE, STORAGE,
BACKUP; BACKUP;
public boolean permitsStorageType(FileStorageType type) {
return switch (this) { public static FileStorageBaseType forFileStorageType(FileStorageType type) {
case BACKUP -> FileStorageType.BACKUP.equals(type); return switch (type) {
case STORAGE -> EnumSet.of(FileStorageType.EXPORT, FileStorageType.CRAWL_DATA, FileStorageType.PROCESSED_DATA, FileStorageType.CRAWL_SPEC).contains(type); case EXPORT, CRAWL_DATA, PROCESSED_DATA, CRAWL_SPEC -> STORAGE;
default -> false; case BACKUP -> BACKUP;
}; };
} }
} }

View File

@ -13,18 +13,14 @@ import org.testcontainers.containers.MariaDBContainer;
import org.testcontainers.junit.jupiter.Container; import org.testcontainers.junit.jupiter.Container;
import org.testcontainers.junit.jupiter.Testcontainers; import org.testcontainers.junit.jupiter.Testcontainers;
import java.io.FileNotFoundException;
import java.io.IOException; import java.io.IOException;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.sql.SQLException; import java.sql.SQLException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Objects;
import java.util.UUID; import java.util.UUID;
import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD;
@Testcontainers @Testcontainers
@Execution(ExecutionMode.SAME_THREAD) @Execution(ExecutionMode.SAME_THREAD)
@Tag("slow") @Tag("slow")
@ -124,8 +120,7 @@ public class FileStorageServiceTest {
var storage = new FileStorageService(dataSource, 0); var storage = new FileStorageService(dataSource, 0);
var base = storage.createStorageBase(name, createTempDir(), FileStorageBaseType.STORAGE); var fileStorage = storage.allocateStorage(FileStorageType.CRAWL_DATA, "xyz", "thisShouldSucceed");
var fileStorage = storage.allocateTemporaryStorage(base, FileStorageType.CRAWL_DATA, "xyz", "thisShouldSucceed");
System.out.println("Allocated " + fileStorage.asPath()); System.out.println("Allocated " + fileStorage.asPath());
Assertions.assertTrue(Files.exists(fileStorage.asPath())); Assertions.assertTrue(Files.exists(fileStorage.asPath()));
tempDirs.add(fileStorage.asPath()); tempDirs.add(fileStorage.asPath());

View File

@ -13,6 +13,8 @@ import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageId; import nu.marginalia.storage.model.FileStorageId;
import nu.marginalia.storage.model.FileStorageState; import nu.marginalia.storage.model.FileStorageState;
import nu.marginalia.storage.model.FileStorageType; import nu.marginalia.storage.model.FileStorageType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Request; import spark.Request;
import spark.Response; import spark.Response;
import spark.Spark; import spark.Spark;
@ -21,9 +23,11 @@ import java.nio.file.Path;
import java.sql.SQLException; import java.sql.SQLException;
import java.util.Arrays; import java.util.Arrays;
import java.util.List; import java.util.List;
import java.util.Set;
@Singleton @Singleton
public class ControlNodeActionsService { public class ControlNodeActionsService {
private static final Logger logger = LoggerFactory.getLogger(ControlNodeActionsService.class);
private final IndexClient indexClient; private final IndexClient indexClient;
private final RedirectControl redirectControl; private final RedirectControl redirectControl;
private final FileStorageService fileStorageService; private final FileStorageService fileStorageService;
@ -62,6 +66,9 @@ public class ControlNodeActionsService {
Spark.post("/public/nodes/:node/actions/sideload-stackexchange", this::sideloadStackexchange, Spark.post("/public/nodes/:node/actions/sideload-stackexchange", this::sideloadStackexchange,
redirectControl.renderRedirectAcknowledgement("Sideloading", "..") redirectControl.renderRedirectAcknowledgement("Sideloading", "..")
); );
Spark.post("/public/nodes/:node/actions/download-sample-data", this::downloadSampleData,
redirectControl.renderRedirectAcknowledgement("Downloading", "..")
);
Spark.post("/public/nodes/:id/actions/new-crawl", this::triggerNewCrawl, Spark.post("/public/nodes/:id/actions/new-crawl", this::triggerNewCrawl,
redirectControl.renderRedirectAcknowledgement("Crawling", "..") redirectControl.renderRedirectAcknowledgement("Crawling", "..")
); );
@ -91,6 +98,21 @@ public class ControlNodeActionsService {
); );
} }
/**
 * Spark handler for the download-sample-data node action.
 * <p>
 * Reads the {@code sample} query parameter, rejects the request with a
 * {@link ControlValidationError} when it is missing or not one of the known
 * set names, and otherwise forwards the request to the executor client for
 * the node given by the {@code :node} path parameter.
 *
 * @return an empty string in all successful cases
 */
private Object downloadSampleData(Request request, Response response) {
    final String requestedSet = request.queryParams("sample");

    if (requestedSet == null) {
        throw new ControlValidationError("No sample specified", "A sample data set must be specified", "..");
    }

    // Only these set names are accepted; anything else is rejected before it reaches the executor
    final Set<String> knownSets = Set.of("sample-s", "sample-m", "sample-l", "sample-xl");
    if (!knownSets.contains(requestedSet)) {
        throw new ControlValidationError("Invalid sample specified", "A valid sample data set must be specified", "..");
    }

    final Context context = Context.fromRequest(request);
    final int nodeId = Integer.parseInt(request.params("node"));
    executorClient.downloadSampleData(context, nodeId, requestedSet);

    logger.info("Downloading sample data set {}", requestedSet);

    return "";
}
public Object sideloadEncyclopedia(Request request, Response response) { public Object sideloadEncyclopedia(Request request, Response response) {
String source = request.queryParams("source"); String source = request.queryParams("source");

View File

@ -0,0 +1,47 @@
<h1 class="my-3">Download Sample Data</h1>
<div class="my-3 p-3 border bg-light">
This will download sample crawl data from <a href="https://downloads.marginalia.nu">downloads.marginalia.nu</a> onto Node {{node.id}}.
This is a sample of real crawl data. It is intended for demo, testing and development purposes. Several sets are available.
</div>
<form method="post" action="actions/download-sample-data">
<table class="table">
<tr>
<th>Use</th>
<th>Set</th>
<th>Description</th>
</tr>
<tr>
<td><input id="sample-s" value="sample-s" name="sample" class="form-check-input" type="radio"></td>
<td><label for="sample-s">Small</label></td>
<td>1000 Domains. About 2 GB. </td>
</tr>
<tr>
<td><input id="sample-m" value="sample-m" name="sample" class="form-check-input" type="radio"></td>
<td><label for="sample-m">Medium</label></td>
<td>2000 Domains. About 6 GB. Recommended.</td>
</tr>
<tr>
<td><input id="sample-l" value="sample-l" name="sample" class="form-check-input" type="radio"></td>
<td><label for="sample-l">Large</label></td>
<td>5000 Domains. About 20 GB.</td>
</tr>
<tr>
<td><input id="sample-xl" value="sample-xl" name="sample" class="form-check-input" type="radio"></td>
<td><label for="sample-xl">Huge</label></td>
<td>50,000 Domains. Around 180 GB. Primarily intended for pre-production like testing environments.
Expect hours of processing time. </td>
</tr>
</table>
<button
class="btn btn-primary me-md-2"
onclick="return confirm('Confirm downloading sample data onto node {{node.id}}');"
type="submit">
Start Download</button>
</form>

View File

@ -23,6 +23,7 @@
{{#if view.export-from-crawl-data}} {{> control/node/actions/partial-export-from-crawl-data }} {{/if}} {{#if view.export-from-crawl-data}} {{> control/node/actions/partial-export-from-crawl-data }} {{/if}}
{{#if view.export-sample-data}} {{> control/node/actions/partial-export-sample-data }} {{/if}} {{#if view.export-sample-data}} {{> control/node/actions/partial-export-sample-data }} {{/if}}
{{#if view.restore-backup}} {{> control/node/actions/partial-restore-backup }} {{/if}} {{#if view.restore-backup}} {{> control/node/actions/partial-restore-backup }} {{/if}}
{{#if view.download-sample-data}} {{> control/node/actions/partial-download-sample-data }} {{/if}}
<div class="mt-10">&nbsp;</div> <div class="mt-10">&nbsp;</div>
</div> </div>
</body> </body>

View File

@ -24,6 +24,7 @@
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=sideload-stackexchange">Sideload Stackexchange</a></li> <li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=sideload-stackexchange">Sideload Stackexchange</a></li>
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=sideload-warc">Sideload WARC Files</a></li> <li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=sideload-warc">Sideload WARC Files</a></li>
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=sideload-dirtree">Sideload Dirtree</a></li> <li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=sideload-dirtree">Sideload Dirtree</a></li>
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=download-sample-data">Download Sample Crawl Data</a></li>
<li><hr class="dropdown-divider"></li> <li><hr class="dropdown-divider"></li>
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=export-db-data">Export Database Data</a></li> <li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=export-db-data">Export Database Data</a></li>
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=export-sample-data">Export Sample Crawl Data</a></li> <li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=export-sample-data">Export Sample Crawl Data</a></li>

View File

@ -61,6 +61,7 @@ dependencies {
implementation libs.zstd implementation libs.zstd
implementation libs.jsoup implementation libs.jsoup
implementation libs.commons.io implementation libs.commons.io
implementation libs.commons.compress
implementation libs.commons.lang3 implementation libs.commons.lang3
implementation libs.bundles.mariadb implementation libs.bundles.mariadb

View File

@ -17,7 +17,9 @@ public enum ExecutorActor {
EXPORT_FEEDS, EXPORT_FEEDS,
PROC_INDEX_CONSTRUCTOR_SPAWNER, PROC_INDEX_CONSTRUCTOR_SPAWNER,
CONVERT, CONVERT,
RESTORE_BACKUP, EXPORT_SAMPLE_DATA; RESTORE_BACKUP,
EXPORT_SAMPLE_DATA,
DOWNLOAD_SAMPLE;
public String id() { public String id() {
return "fsm:" + name().toLowerCase(); return "fsm:" + name().toLowerCase();

View File

@ -47,6 +47,7 @@ public class ExecutorActorControlService {
ExportFeedsActor exportFeedsActor, ExportFeedsActor exportFeedsActor,
ExportSampleDataActor exportSampleDataActor, ExportSampleDataActor exportSampleDataActor,
ExportTermFreqActor exportTermFrequenciesActor, ExportTermFreqActor exportTermFrequenciesActor,
DownloadSampleActor downloadSampleActor,
ExecutorActorStateMachines stateMachines) { ExecutorActorStateMachines stateMachines) {
this.messageQueueFactory = messageQueueFactory; this.messageQueueFactory = messageQueueFactory;
this.eventLog = baseServiceParams.eventLog; this.eventLog = baseServiceParams.eventLog;
@ -75,6 +76,8 @@ public class ExecutorActorControlService {
register(ExecutorActor.EXPORT_FEEDS, exportFeedsActor); register(ExecutorActor.EXPORT_FEEDS, exportFeedsActor);
register(ExecutorActor.EXPORT_SAMPLE_DATA, exportSampleDataActor); register(ExecutorActor.EXPORT_SAMPLE_DATA, exportSampleDataActor);
register(ExecutorActor.EXPORT_TERM_FREQUENCIES, exportTermFrequenciesActor); register(ExecutorActor.EXPORT_TERM_FREQUENCIES, exportTermFrequenciesActor);
register(ExecutorActor.DOWNLOAD_SAMPLE, downloadSampleActor);
} }
private void register(ExecutorActor process, RecordActorPrototype graph) { private void register(ExecutorActor process, RecordActorPrototype graph) {

View File

@ -13,7 +13,6 @@ import nu.marginalia.process.ProcessService;
import nu.marginalia.sideload.SideloadHelper; import nu.marginalia.sideload.SideloadHelper;
import nu.marginalia.sideload.StackExchangeSideloadHelper; import nu.marginalia.sideload.StackExchangeSideloadHelper;
import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageBaseType;
import nu.marginalia.storage.model.FileStorageId; import nu.marginalia.storage.model.FileStorageId;
import nu.marginalia.storage.model.FileStorageState; import nu.marginalia.storage.model.FileStorageState;
import nu.marginalia.storage.model.FileStorageType; import nu.marginalia.storage.model.FileStorageType;
@ -49,8 +48,7 @@ public class ConvertActor extends RecordActorPrototype {
return switch (self) { return switch (self) {
case Convert (FileStorageId fid) -> { case Convert (FileStorageId fid) -> {
var toProcess = storageService.getStorage(fid); var toProcess = storageService.getStorage(fid);
var base = storageService.getStorageBase(FileStorageBaseType.STORAGE); var processedArea = storageService.allocateStorage(
var processedArea = storageService.allocateTemporaryStorage(base,
FileStorageType.PROCESSED_DATA, "processed-data", FileStorageType.PROCESSED_DATA, "processed-data",
"Processed Data; " + toProcess.description()); "Processed Data; " + toProcess.description());
@ -69,8 +67,7 @@ public class ConvertActor extends RecordActorPrototype {
String fileName = sourcePath.toFile().getName(); String fileName = sourcePath.toFile().getName();
var base = storageService.getStorageBase(FileStorageBaseType.STORAGE); var processedArea = storageService.allocateStorage(
var processedArea = storageService.allocateTemporaryStorage(base,
FileStorageType.PROCESSED_DATA, "processed-data", FileStorageType.PROCESSED_DATA, "processed-data",
"Processed Dirtree Data; " + fileName); "Processed Dirtree Data; " + fileName);
@ -88,8 +85,7 @@ public class ConvertActor extends RecordActorPrototype {
String fileName = sourcePath.toFile().getName(); String fileName = sourcePath.toFile().getName();
var base = storageService.getStorageBase(FileStorageBaseType.STORAGE); var processedArea = storageService.allocateStorage(
var processedArea = storageService.allocateTemporaryStorage(base,
FileStorageType.PROCESSED_DATA, "processed-data", FileStorageType.PROCESSED_DATA, "processed-data",
"Processed Warc Data; " + fileName); "Processed Warc Data; " + fileName);
@ -121,8 +117,7 @@ public class ConvertActor extends RecordActorPrototype {
String fileName = sourcePath.toFile().getName(); String fileName = sourcePath.toFile().getName();
var base = storageService.getStorageBase(FileStorageBaseType.STORAGE); var processedArea = storageService.allocateStorage(
var processedArea = storageService.allocateTemporaryStorage(base,
FileStorageType.PROCESSED_DATA, "processed-data", FileStorageType.PROCESSED_DATA, "processed-data",
"Processed Encylopedia Data; " + fileName); "Processed Encylopedia Data; " + fileName);
@ -171,8 +166,7 @@ public class ConvertActor extends RecordActorPrototype {
String fileName = sourcePath.toFile().getName(); String fileName = sourcePath.toFile().getName();
var base = storageService.getStorageBase(FileStorageBaseType.STORAGE); var processedArea = storageService.allocateStorage(
var processedArea = storageService.allocateTemporaryStorage(base,
FileStorageType.PROCESSED_DATA, "processed-data", FileStorageType.PROCESSED_DATA, "processed-data",
"Processed Stackexchange Data; " + fileName); "Processed Stackexchange Data; " + fileName);

View File

@ -17,14 +17,12 @@ import nu.marginalia.service.module.ServiceConfiguration;
import nu.marginalia.storage.model.FileStorageState; import nu.marginalia.storage.model.FileStorageState;
import nu.marginalia.svc.BackupService; import nu.marginalia.svc.BackupService;
import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageBaseType;
import nu.marginalia.storage.model.FileStorageId; import nu.marginalia.storage.model.FileStorageId;
import nu.marginalia.storage.model.FileStorageType; import nu.marginalia.storage.model.FileStorageType;
import nu.marginalia.index.client.IndexClient; import nu.marginalia.index.client.IndexClient;
import nu.marginalia.index.client.IndexMqEndpoints; import nu.marginalia.index.client.IndexMqEndpoints;
import nu.marginalia.mq.MqMessageState; import nu.marginalia.mq.MqMessageState;
import nu.marginalia.mq.outbox.MqOutbox; import nu.marginalia.mq.outbox.MqOutbox;
import nu.marginalia.mqapi.converting.ConvertAction;
import nu.marginalia.mqapi.converting.ConvertRequest; import nu.marginalia.mqapi.converting.ConvertRequest;
import nu.marginalia.mqapi.index.CreateIndexRequest; import nu.marginalia.mqapi.index.CreateIndexRequest;
import nu.marginalia.mqapi.index.IndexName; import nu.marginalia.mqapi.index.IndexName;
@ -96,8 +94,7 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
if (storage.type() != FileStorageType.CRAWL_DATA) yield new Error("Bad storage type " + storage.type()); if (storage.type() != FileStorageType.CRAWL_DATA) yield new Error("Bad storage type " + storage.type());
var base = storageService.getStorageBase(FileStorageBaseType.STORAGE); var processedArea = storageService.allocateStorage(FileStorageType.PROCESSED_DATA, "processed-data",
var processedArea = storageService.allocateTemporaryStorage(base, FileStorageType.PROCESSED_DATA, "processed-data",
"Processed Data; " + storage.description()); "Processed Data; " + storage.description());
storageService.setFileStorageState(processedArea.id(), FileStorageState.NEW); storageService.setFileStorageState(processedArea.id(), FileStorageState.NEW);

View File

@ -10,7 +10,6 @@ import nu.marginalia.actor.state.Resume;
import nu.marginalia.process.ProcessOutboxes; import nu.marginalia.process.ProcessOutboxes;
import nu.marginalia.process.ProcessService; import nu.marginalia.process.ProcessService;
import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageBaseType;
import nu.marginalia.storage.model.FileStorageId; import nu.marginalia.storage.model.FileStorageId;
import nu.marginalia.storage.model.FileStorageType; import nu.marginalia.storage.model.FileStorageType;
import nu.marginalia.mq.MqMessageState; import nu.marginalia.mq.MqMessageState;
@ -43,9 +42,7 @@ public class CrawlActor extends RecordActorPrototype {
if (storage == null) yield new Error("Bad storage id"); if (storage == null) yield new Error("Bad storage id");
if (storage.type() != FileStorageType.CRAWL_SPEC) yield new Error("Bad storage type " + storage.type()); if (storage.type() != FileStorageType.CRAWL_SPEC) yield new Error("Bad storage type " + storage.type());
var base = storageService.getStorageBase(FileStorageBaseType.STORAGE); var dataArea = storageService.allocateStorage(
var dataArea = storageService.allocateTemporaryStorage(
base,
FileStorageType.CRAWL_DATA, FileStorageType.CRAWL_DATA,
"crawl-data", "crawl-data",
storage.description()); storage.description());

View File

@ -7,7 +7,6 @@ import nu.marginalia.actor.prototype.RecordActorPrototype;
import nu.marginalia.actor.state.ActorStep; import nu.marginalia.actor.state.ActorStep;
import nu.marginalia.crawlspec.CrawlSpecFileNames; import nu.marginalia.crawlspec.CrawlSpecFileNames;
import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageBaseType;
import nu.marginalia.storage.model.FileStorageType; import nu.marginalia.storage.model.FileStorageType;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -41,8 +40,7 @@ public class CrawlJobExtractorActor extends RecordActorPrototype {
public ActorStep transition(ActorStep self) throws Exception { public ActorStep transition(ActorStep self) throws Exception {
return switch (self) { return switch (self) {
case CreateFromUrl(String description, String url) -> { case CreateFromUrl(String description, String url) -> {
var base = fileStorageService.getStorageBase(FileStorageBaseType.STORAGE); var storage = fileStorageService.allocateStorage(FileStorageType.CRAWL_SPEC, "crawl-spec", description);
var storage = fileStorageService.allocateTemporaryStorage(base, FileStorageType.CRAWL_SPEC, "crawl-spec", description);
Path urlsTxt = storage.asPath().resolve("urls.txt"); Path urlsTxt = storage.asPath().resolve("urls.txt");

View File

@ -0,0 +1,133 @@
package nu.marginalia.actor.task;
import com.google.gson.Gson;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.actor.prototype.RecordActorPrototype;
import nu.marginalia.actor.state.ActorStep;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.*;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.nio.file.attribute.PosixFilePermissions;
/**
 * Actor that downloads a tar archive of sample crawl data from
 * downloads.marginalia.nu and unpacks it into a newly allocated
 * {@code CRAWL_DATA} file storage area.
 */
@Singleton
public class DownloadSampleActor extends RecordActorPrototype {

    private final FileStorageService storageService;
    private final Logger logger = LoggerFactory.getLogger(getClass());

    /** Step that downloads and unpacks the sample set with the given name. */
    public record Run(String setName) implements ActorStep {}

    /**
     * Handles the {@link Run} step: allocates storage, downloads and extracts the
     * archive, and ends. An {@link IOException} during the download flags the new
     * storage for deletion and transitions to the error state. Any other step is
     * treated as an error.
     */
    @Override
    public ActorStep transition(ActorStep self) throws Exception {
        return switch(self) {
            case Run(String setName) -> {
                final FileStorage newStorage = storageService.allocateStorage(
                        FileStorageType.CRAWL_DATA,
                        "sample-crawl-data",
                        "Sample " + setName);

                // Mark the storage as NEW while the download is in progress
                storageService.setFileStorageState(newStorage.id(), FileStorageState.NEW);

                URL downloadURI = getDownloadURL(setName);

                try {
                    downloadArchive(downloadURI, newStorage.asPath());
                }
                catch (IOException ex) {
                    logger.error("Error downloading sample", ex);
                    storageService.flagFileForDeletion(newStorage.id());
                    yield new Error();
                }
                finally {
                    // NOTE(review): this also runs after flagFileForDeletion() above; confirm that
                    // resetting the state to UNSET does not undo the deletion flag.
                    storageService.setFileStorageState(newStorage.id(), FileStorageState.UNSET);
                }

                yield new End();
            }
            default -> new Error();
        };
    }

    /**
     * Streams the tar archive at {@code downloadURI} and extracts its regular
     * entries beneath {@code outputPath}.
     *
     * @param downloadURI location of the tar archive
     * @param outputPath  directory to extract into
     * @throws IOException          on download or extraction failure, or if an entry
     *                              name would resolve outside {@code outputPath}
     * @throws InterruptedException if the thread is interrupted between entries
     */
    private void downloadArchive(URL downloadURI, Path outputPath) throws IOException, InterruptedException {
        // See the documentation for commons compress:
        // https://commons.apache.org/proper/commons-compress/examples.html

        // Absolute, normalized root so the containment check below is reliable
        final Path root = outputPath.toAbsolutePath().normalize();

        try (var tar = new TarArchiveInputStream(downloadURI.openStream())) {
            TarArchiveEntry nextEntry;
            byte[] buffer = new byte[8192];

            while ((nextEntry = tar.getNextEntry()) != null) {
                // Poll for interruption, to ensure this can be cancelled
                if (Thread.interrupted()) {
                    throw new InterruptedException();
                }

                if (nextEntry.isDirectory()) {
                    continue;
                }

                // Guard against entry names such as "../x" or "/etc/x" escaping the storage area
                Path outputFile = root.resolve(nextEntry.getName()).normalize();
                if (!outputFile.startsWith(root)) {
                    throw new IOException("Tar entry resolves outside of the storage area: " + nextEntry.getName());
                }

                Files.createDirectories(outputFile.getParent(),
                        PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rwxr-xr-x"))
                );

                long size = nextEntry.getSize();

                // Extract tar entry
                try (var fos = Files.newOutputStream(outputFile, StandardOpenOption.CREATE)) {
                    transferBytes(tar, fos, buffer, size);
                }

                // Apply the file mode to the extracted file itself; applying it to the
                // directory would strip its execute bit and break subsequent extraction
                Files.setPosixFilePermissions(outputFile, PosixFilePermissions.fromString("rw-r--r--"));
            }
        }
    }

    /**
     * Copies exactly {@code size} bytes from {@code inputStream} to {@code outputStream}.
     *
     * @param buffer scratch buffer reused between calls
     * @throws IOException if the stream ends before {@code size} bytes have been read
     */
    private void transferBytes(InputStream inputStream, OutputStream outputStream, byte[] buffer, long size)
            throws IOException
    {
        long copiedSize = 0;

        while (copiedSize < size) {
            // Never request more than what remains of this entry
            int toRead = (int) Math.min(buffer.length, size - copiedSize);
            int read = inputStream.read(buffer, 0, toRead);

            if (read < 0) // We've been promised a file of length 'size', so this shouldn't happen, but just in case...
                throw new IOException("Unexpected end of stream");

            outputStream.write(buffer, 0, read);
            copiedSize += read;
        }
    }

    /**
     * Builds the archive URL for a sample set. The set name is interpolated as-is;
     * it is not validated or encoded here.
     */
    private URL getDownloadURL(String setName) throws MalformedURLException {
        return URI.create(STR."https://downloads.marginalia.nu/samples/\{setName}.tar").toURL();
    }

    @Override
    public String describe() {
        return "Download a sample of crawl data from downloads.marginalia.nu";
    }

    @Inject
    public DownloadSampleActor(Gson gson,
                               FileStorageService storageService)
    {
        super(gson);
        this.storageService = storageService;
    }

}

View File

@ -23,8 +23,7 @@ public class ExportAtagsActor extends RecordActorPrototype {
public ActorStep transition(ActorStep self) throws Exception { public ActorStep transition(ActorStep self) throws Exception {
return switch(self) { return switch(self) {
case Export(FileStorageId crawlId) -> { case Export(FileStorageId crawlId) -> {
var storageBase = storageService.getStorageBase(FileStorageBaseType.STORAGE); var storage = storageService.allocateStorage(FileStorageType.EXPORT, "atag-export", "Anchor Tags " + LocalDateTime.now());
var storage = storageService.allocateTemporaryStorage(storageBase, FileStorageType.EXPORT, "atag-export", "Anchor Tags " + LocalDateTime.now());
if (storage == null) yield new Error("Bad storage id"); if (storage == null) yield new Error("Bad storage id");
yield new Run(crawlId, storage.id()); yield new Run(crawlId, storage.id());

View File

@ -8,7 +8,6 @@ import nu.marginalia.actor.prototype.RecordActorPrototype;
import nu.marginalia.actor.state.ActorStep; import nu.marginalia.actor.state.ActorStep;
import nu.marginalia.query.client.QueryClient; import nu.marginalia.query.client.QueryClient;
import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageBaseType;
import nu.marginalia.storage.model.FileStorageId; import nu.marginalia.storage.model.FileStorageId;
import nu.marginalia.storage.model.FileStorageType; import nu.marginalia.storage.model.FileStorageType;
import org.slf4j.Logger; import org.slf4j.Logger;
@ -43,8 +42,7 @@ public class ExportDataActor extends RecordActorPrototype {
public ActorStep transition(ActorStep self) throws Exception { public ActorStep transition(ActorStep self) throws Exception {
return switch(self) { return switch(self) {
case Export() -> { case Export() -> {
var storageBase = storageService.getStorageBase(FileStorageBaseType.STORAGE); var storage = storageService.allocateStorage(FileStorageType.EXPORT, "db-export", "DB Exports " + LocalDateTime.now());
var storage = storageService.allocateTemporaryStorage(storageBase, FileStorageType.EXPORT, "db-export", "DB Exports " + LocalDateTime.now());
if (storage == null) yield new Error("Bad storage id"); if (storage == null) yield new Error("Bad storage id");
yield new ExportBlacklist(storage.id()); yield new ExportBlacklist(storage.id());

View File

@ -8,7 +8,6 @@ import nu.marginalia.actor.state.ActorStep;
import nu.marginalia.extractor.ExporterIf; import nu.marginalia.extractor.ExporterIf;
import nu.marginalia.extractor.FeedExporter; import nu.marginalia.extractor.FeedExporter;
import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageBaseType;
import nu.marginalia.storage.model.FileStorageId; import nu.marginalia.storage.model.FileStorageId;
import nu.marginalia.storage.model.FileStorageState; import nu.marginalia.storage.model.FileStorageState;
import nu.marginalia.storage.model.FileStorageType; import nu.marginalia.storage.model.FileStorageType;
@ -29,8 +28,7 @@ public class ExportFeedsActor extends RecordActorPrototype {
public ActorStep transition(ActorStep self) throws Exception { public ActorStep transition(ActorStep self) throws Exception {
return switch(self) { return switch(self) {
case Export(FileStorageId crawlId) -> { case Export(FileStorageId crawlId) -> {
var storageBase = storageService.getStorageBase(FileStorageBaseType.STORAGE); var storage = storageService.allocateStorage(FileStorageType.EXPORT, "feed-export", "Feeds " + LocalDateTime.now());
var storage = storageService.allocateTemporaryStorage(storageBase, FileStorageType.EXPORT, "feed-export", "Feeds " + LocalDateTime.now());
if (storage == null) yield new Error("Bad storage id"); if (storage == null) yield new Error("Bad storage id");
yield new Run(crawlId, storage.id()); yield new Run(crawlId, storage.id());

View File

@ -5,11 +5,8 @@ import com.google.inject.Inject;
import com.google.inject.Singleton; import com.google.inject.Singleton;
import nu.marginalia.actor.prototype.RecordActorPrototype; import nu.marginalia.actor.prototype.RecordActorPrototype;
import nu.marginalia.actor.state.ActorStep; import nu.marginalia.actor.state.ActorStep;
import nu.marginalia.extractor.ExporterIf;
import nu.marginalia.extractor.FeedExporter;
import nu.marginalia.extractor.SampleDataExporter; import nu.marginalia.extractor.SampleDataExporter;
import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageBaseType;
import nu.marginalia.storage.model.FileStorageId; import nu.marginalia.storage.model.FileStorageId;
import nu.marginalia.storage.model.FileStorageState; import nu.marginalia.storage.model.FileStorageState;
import nu.marginalia.storage.model.FileStorageType; import nu.marginalia.storage.model.FileStorageType;
@ -30,8 +27,7 @@ public class ExportSampleDataActor extends RecordActorPrototype {
public ActorStep transition(ActorStep self) throws Exception { public ActorStep transition(ActorStep self) throws Exception {
return switch(self) { return switch(self) {
case Export(FileStorageId crawlId, int size, String name) -> { case Export(FileStorageId crawlId, int size, String name) -> {
var storageBase = storageService.getStorageBase(FileStorageBaseType.STORAGE); var storage = storageService.allocateStorage(FileStorageType.EXPORT,
var storage = storageService.allocateTemporaryStorage(storageBase, FileStorageType.EXPORT,
"crawl-sample-export", "crawl-sample-export",
STR."Crawl Data Sample \{name}/\{size} \{LocalDateTime.now()}" STR."Crawl Data Sample \{name}/\{size} \{LocalDateTime.now()}"
); );

View File

@ -8,7 +8,6 @@ import nu.marginalia.actor.state.ActorStep;
import nu.marginalia.extractor.ExporterIf; import nu.marginalia.extractor.ExporterIf;
import nu.marginalia.extractor.TermFrequencyExporter; import nu.marginalia.extractor.TermFrequencyExporter;
import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageBaseType;
import nu.marginalia.storage.model.FileStorageId; import nu.marginalia.storage.model.FileStorageId;
import nu.marginalia.storage.model.FileStorageState; import nu.marginalia.storage.model.FileStorageState;
import nu.marginalia.storage.model.FileStorageType; import nu.marginalia.storage.model.FileStorageType;
@ -25,8 +24,7 @@ public class ExportTermFreqActor extends RecordActorPrototype {
public ActorStep transition(ActorStep self) throws Exception { public ActorStep transition(ActorStep self) throws Exception {
return switch(self) { return switch(self) {
case Export(FileStorageId crawlId) -> { case Export(FileStorageId crawlId) -> {
var storageBase = storageService.getStorageBase(FileStorageBaseType.STORAGE); var storage = storageService.allocateStorage(FileStorageType.EXPORT, "term-freq-export", "Term Frequencies " + LocalDateTime.now());
var storage = storageService.allocateTemporaryStorage(storageBase, FileStorageType.EXPORT, "term-freq-export", "Term Frequencies " + LocalDateTime.now());
if (storage == null) yield new Error("Bad storage id"); if (storage == null) yield new Error("Bad storage id");
yield new Run(crawlId, storage.id()); yield new Run(crawlId, storage.id());

View File

@ -71,6 +71,8 @@ public class ExecutorSvc extends Service {
Spark.post("/sideload/stackexchange", sideloadService::sideloadStackexchange); Spark.post("/sideload/stackexchange", sideloadService::sideloadStackexchange);
Spark.post("/sideload/encyclopedia", sideloadService::sideloadEncyclopedia); Spark.post("/sideload/encyclopedia", sideloadService::sideloadEncyclopedia);
Spark.post("/action/download-sample-data", sideloadService::downloadSampleData);
Spark.post("/export/atags", exportService::exportAtags); Spark.post("/export/atags", exportService::exportAtags);
Spark.post("/export/sample-data", exportService::exportSampleData); Spark.post("/export/sample-data", exportService::exportSampleData);
Spark.post("/export/feeds", exportService::exportFeeds); Spark.post("/export/feeds", exportService::exportFeeds);

View File

@ -5,8 +5,11 @@ import nu.marginalia.WmsaHome;
import nu.marginalia.actor.ExecutorActor; import nu.marginalia.actor.ExecutorActor;
import nu.marginalia.actor.ExecutorActorControlService; import nu.marginalia.actor.ExecutorActorControlService;
import nu.marginalia.actor.task.ConvertActor; import nu.marginalia.actor.task.ConvertActor;
import nu.marginalia.actor.task.DownloadSampleActor;
import nu.marginalia.executor.upload.UploadDirContents; import nu.marginalia.executor.upload.UploadDirContents;
import nu.marginalia.executor.upload.UploadDirItem; import nu.marginalia.executor.upload.UploadDirItem;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Request; import spark.Request;
import spark.Response; import spark.Response;
@ -18,6 +21,7 @@ import java.util.List;
public class SideloadService { public class SideloadService {
private final ExecutorActorControlService actorControlService; private final ExecutorActorControlService actorControlService;
private static final Logger logger = LoggerFactory.getLogger(SideloadService.class);
@Inject @Inject
public SideloadService(ExecutorActorControlService actorControlService) { public SideloadService(ExecutorActorControlService actorControlService) {
@ -56,4 +60,11 @@ public class SideloadService {
} }
/**
 * Spark route handler that starts the sample-data download actor.
 *
 * <p>Reads the {@code set} query parameter and starts
 * {@link ExecutorActor#DOWNLOAD_SAMPLE} from a {@link DownloadSampleActor.Run}
 * step carrying that value.
 *
 * @param request  incoming request; expected to carry a non-blank {@code set} query parameter
 * @param response outgoing response; its status is set to 400 when {@code set} is missing or blank
 * @return an empty body when the actor was started, otherwise a short error message
 * @throws Exception whatever {@code actorControlService.startFrom} propagates
 */
public Object downloadSampleData(Request request, Response response) throws Exception {
    String sampleSet = request.queryParams("set");

    // An absent query parameter comes back as null; refuse it (and blank values)
    // up front rather than starting the actor with a sample set it cannot use.
    if (sampleSet == null || sampleSet.isBlank()) {
        response.status(400);
        return "Missing required query parameter 'set'";
    }

    actorControlService.startFrom(ExecutorActor.DOWNLOAD_SAMPLE, new DownloadSampleActor.Run(sampleSet));
    return "";
}
} }

View File

@ -15,7 +15,6 @@ import nu.marginalia.mq.persistence.MqPersistence;
import nu.marginalia.process.log.WorkLog; import nu.marginalia.process.log.WorkLog;
import nu.marginalia.service.module.ServiceConfiguration; import nu.marginalia.service.module.ServiceConfiguration;
import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageBaseType;
import nu.marginalia.storage.model.FileStorageId; import nu.marginalia.storage.model.FileStorageId;
import nu.marginalia.storage.model.FileStorageType; import nu.marginalia.storage.model.FileStorageType;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
@ -187,8 +186,7 @@ public class TransferService {
// Ensure crawl data exists to receive into // Ensure crawl data exists to receive into
if (storages.isEmpty()) { if (storages.isEmpty()) {
var storage = fileStorageService.allocateTemporaryStorage( var storage = fileStorageService.allocateStorage(
fileStorageService.getStorageBase(FileStorageBaseType.STORAGE),
FileStorageType.CRAWL_DATA, FileStorageType.CRAWL_DATA,
"crawl-data", "crawl-data",
"Crawl Data" "Crawl Data"

View File

@ -5,7 +5,6 @@ import com.github.luben.zstd.ZstdOutputStream;
import nu.marginalia.IndexLocations; import nu.marginalia.IndexLocations;
import nu.marginalia.service.control.ServiceHeartbeat; import nu.marginalia.service.control.ServiceHeartbeat;
import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageBaseType;
import nu.marginalia.storage.model.FileStorageId; import nu.marginalia.storage.model.FileStorageId;
import nu.marginalia.storage.model.FileStorageType; import nu.marginalia.storage.model.FileStorageType;
import nu.marginallia.index.journal.IndexJournalFileNames; import nu.marginallia.index.journal.IndexJournalFileNames;
@ -45,11 +44,9 @@ public class BackupService {
* This backup can later be dehydrated and quickly loaded into _LIVE. * This backup can later be dehydrated and quickly loaded into _LIVE.
* */ * */
public void createBackupFromStaging(List<FileStorageId> associatedIds) throws SQLException, IOException { public void createBackupFromStaging(List<FileStorageId> associatedIds) throws SQLException, IOException {
var backupBase = storageService.getStorageBase(FileStorageBaseType.BACKUP);
String desc = "Pre-load backup snapshot " + LocalDateTime.now(); String desc = "Pre-load backup snapshot " + LocalDateTime.now();
var backupStorage = storageService.allocateTemporaryStorage(backupBase, var backupStorage = storageService.allocateStorage(
FileStorageType.BACKUP, "snapshot", desc); FileStorageType.BACKUP, "snapshot", desc);
for (var associatedId : associatedIds) { for (var associatedId : associatedIds) {