mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
(*) Add download-sample action, refactor file storage
This changeset adds an action for downloading a set of sample data from downloads.marginalia.nu. It also refactors some leaky abstractions out of FileStorageService. allocateTemporaryStorage has been renamed to allocateStorage, since the storage was never temporary in any scenario. The method also no longer takes a storage base, as there was only ever one valid option for this input; allocateStorage now finds the appropriate base itself.
This commit is contained in:
parent
1b8b97b8ec
commit
cae1bad274
@ -71,7 +71,6 @@ public class ExecutorClient extends AbstractDynamicClient {
|
||||
post(ctx, node,
|
||||
"/sideload/encyclopedia?path="+ URLEncoder.encode(sourcePath.toString(), StandardCharsets.UTF_8) + "&baseUrl=" + URLEncoder.encode(baseUrl, StandardCharsets.UTF_8),
|
||||
"").blockingSubscribe();
|
||||
|
||||
}
|
||||
|
||||
public void sideloadDirtree(Context ctx, int node, Path sourcePath) {
|
||||
@ -111,6 +110,10 @@ public class ExecutorClient extends AbstractDynamicClient {
|
||||
post(ctx, node, "/export/termfreq?fid="+fid, "").blockingSubscribe();
|
||||
}
|
||||
|
||||
public void downloadSampleData(Context ctx, int node, String sampleSet) {
|
||||
post(ctx, node, "/action/download-sample-data?set="+URLEncoder.encode(sampleSet, StandardCharsets.UTF_8), "").blockingSubscribe();
|
||||
}
|
||||
|
||||
public void exportData(Context ctx, int node) {
|
||||
post(ctx, node, "/export/data", "").blockingSubscribe();
|
||||
}
|
||||
@ -166,4 +169,5 @@ public class ExecutorClient extends AbstractDynamicClient {
|
||||
public void yieldDomain(Context context, int node, TransferItem item) {
|
||||
post(context, node, "/transfer/yield", item).blockingSubscribe();
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -223,14 +223,12 @@ public class FileStorageService {
|
||||
return maybePath;
|
||||
}
|
||||
|
||||
/** Allocate a temporary storage of the given type */
|
||||
public FileStorage allocateTemporaryStorage(FileStorageBase base,
|
||||
FileStorageType type,
|
||||
String prefix,
|
||||
String description) throws IOException, SQLException
|
||||
/** Allocate a storage area of the given type */
|
||||
public FileStorage allocateStorage(FileStorageType type,
|
||||
String prefix,
|
||||
String description) throws IOException, SQLException
|
||||
{
|
||||
if (!base.type().permitsStorageType(type))
|
||||
throw new RuntimeException("Attempting to allocate storage of type " + type + " in base of type " + base.type());
|
||||
var base = getStorageBase(FileStorageBaseType.forFileStorageType(type));
|
||||
|
||||
Path newDir = allocateDirectory(base.asPath(), prefix);
|
||||
|
||||
|
@ -1,18 +1,17 @@
|
||||
package nu.marginalia.storage.model;
|
||||
|
||||
import java.util.EnumSet;
|
||||
|
||||
public enum FileStorageBaseType {
|
||||
CURRENT,
|
||||
WORK,
|
||||
STORAGE,
|
||||
BACKUP;
|
||||
|
||||
public boolean permitsStorageType(FileStorageType type) {
|
||||
return switch (this) {
|
||||
case BACKUP -> FileStorageType.BACKUP.equals(type);
|
||||
case STORAGE -> EnumSet.of(FileStorageType.EXPORT, FileStorageType.CRAWL_DATA, FileStorageType.PROCESSED_DATA, FileStorageType.CRAWL_SPEC).contains(type);
|
||||
default -> false;
|
||||
|
||||
public static FileStorageBaseType forFileStorageType(FileStorageType type) {
|
||||
return switch (type) {
|
||||
case EXPORT, CRAWL_DATA, PROCESSED_DATA, CRAWL_SPEC -> STORAGE;
|
||||
case BACKUP -> BACKUP;
|
||||
};
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -13,18 +13,14 @@ import org.testcontainers.containers.MariaDBContainer;
|
||||
import org.testcontainers.junit.jupiter.Container;
|
||||
import org.testcontainers.junit.jupiter.Testcontainers;
|
||||
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.sql.SQLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.UUID;
|
||||
|
||||
import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD;
|
||||
|
||||
@Testcontainers
|
||||
@Execution(ExecutionMode.SAME_THREAD)
|
||||
@Tag("slow")
|
||||
@ -124,8 +120,7 @@ public class FileStorageServiceTest {
|
||||
|
||||
var storage = new FileStorageService(dataSource, 0);
|
||||
|
||||
var base = storage.createStorageBase(name, createTempDir(), FileStorageBaseType.STORAGE);
|
||||
var fileStorage = storage.allocateTemporaryStorage(base, FileStorageType.CRAWL_DATA, "xyz", "thisShouldSucceed");
|
||||
var fileStorage = storage.allocateStorage(FileStorageType.CRAWL_DATA, "xyz", "thisShouldSucceed");
|
||||
System.out.println("Allocated " + fileStorage.asPath());
|
||||
Assertions.assertTrue(Files.exists(fileStorage.asPath()));
|
||||
tempDirs.add(fileStorage.asPath());
|
||||
|
@ -13,6 +13,8 @@ import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
import nu.marginalia.storage.model.FileStorageState;
|
||||
import nu.marginalia.storage.model.FileStorageType;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
import spark.Spark;
|
||||
@ -21,9 +23,11 @@ import java.nio.file.Path;
|
||||
import java.sql.SQLException;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
@Singleton
|
||||
public class ControlNodeActionsService {
|
||||
private static final Logger logger = LoggerFactory.getLogger(ControlNodeActionsService.class);
|
||||
private final IndexClient indexClient;
|
||||
private final RedirectControl redirectControl;
|
||||
private final FileStorageService fileStorageService;
|
||||
@ -62,6 +66,9 @@ public class ControlNodeActionsService {
|
||||
Spark.post("/public/nodes/:node/actions/sideload-stackexchange", this::sideloadStackexchange,
|
||||
redirectControl.renderRedirectAcknowledgement("Sideloading", "..")
|
||||
);
|
||||
Spark.post("/public/nodes/:node/actions/download-sample-data", this::downloadSampleData,
|
||||
redirectControl.renderRedirectAcknowledgement("Downloading", "..")
|
||||
);
|
||||
Spark.post("/public/nodes/:id/actions/new-crawl", this::triggerNewCrawl,
|
||||
redirectControl.renderRedirectAcknowledgement("Crawling", "..")
|
||||
);
|
||||
@ -91,6 +98,21 @@ public class ControlNodeActionsService {
|
||||
);
|
||||
}
|
||||
|
||||
private Object downloadSampleData(Request request, Response response) {
|
||||
String set = request.queryParams("sample");
|
||||
|
||||
if (set == null)
|
||||
throw new ControlValidationError("No sample specified", "A sample data set must be specified", "..");
|
||||
if (!Set.of("sample-s", "sample-m", "sample-l", "sample-xl").contains(set))
|
||||
throw new ControlValidationError("Invalid sample specified", "A valid sample data set must be specified", "..");
|
||||
|
||||
executorClient.downloadSampleData(Context.fromRequest(request), Integer.parseInt(request.params("node")), set);
|
||||
|
||||
logger.info("Downloading sample data set {}", set);
|
||||
|
||||
return "";
|
||||
}
|
||||
|
||||
public Object sideloadEncyclopedia(Request request, Response response) {
|
||||
|
||||
String source = request.queryParams("source");
|
||||
|
@ -0,0 +1,47 @@
|
||||
<h1 class="my-3">Download Sample Data</h1>

<div class="my-3 p-3 border bg-light">
This will download sample crawl data from <a href="https://downloads.marginalia.nu">downloads.marginalia.nu</a> onto Node {{node.id}}.
This is a sample of real crawl data. It is intended for demo, testing and development purposes. Several sets are available.
</div>

{{! The radio group is marked required so the browser refuses to submit the
    form with no set selected, instead of round-tripping to a server-side
    validation error. }}
<form method="post" action="actions/download-sample-data">

    <table class="table">
        <tr>
            <th>Use</th>
            <th>Set</th>
            <th>Description</th>
        </tr>

        <tr>
            <td><input id="sample-s" value="sample-s" name="sample" class="form-check-input" type="radio" required></td>
            <td><label for="sample-s">Small</label></td>
            <td>1000 Domains. About 2 GB. </td>
        </tr>

        <tr>
            <td><input id="sample-m" value="sample-m" name="sample" class="form-check-input" type="radio" required></td>
            <td><label for="sample-m">Medium</label></td>
            <td>2000 Domains. About 6 GB. Recommended.</td>
        </tr>

        <tr>
            <td><input id="sample-l" value="sample-l" name="sample" class="form-check-input" type="radio" required></td>
            <td><label for="sample-l">Large</label></td>
            <td>5000 Domains. About 20 GB.</td>
        </tr>

        <tr>
            <td><input id="sample-xl" value="sample-xl" name="sample" class="form-check-input" type="radio" required></td>
            <td><label for="sample-xl">Huge</label></td>
            <td>50,000 Domains. Around 180 GB. Primarily intended for pre-production like testing environments.
                Expect hours of processing time. </td>
        </tr>
    </table>
    <button
            class="btn btn-primary me-md-2"
            onclick="return confirm('Confirm downloading sample data onto node {{node.id}}');"
            type="submit">
        Start Download</button>
</form>
|
@ -23,6 +23,7 @@
|
||||
{{#if view.export-from-crawl-data}} {{> control/node/actions/partial-export-from-crawl-data }} {{/if}}
|
||||
{{#if view.export-sample-data}} {{> control/node/actions/partial-export-sample-data }} {{/if}}
|
||||
{{#if view.restore-backup}} {{> control/node/actions/partial-restore-backup }} {{/if}}
|
||||
{{#if view.download-sample-data}} {{> control/node/actions/partial-download-sample-data }} {{/if}}
|
||||
<div class="mt-10"> </div>
|
||||
</div>
|
||||
</body>
|
||||
|
@ -24,6 +24,7 @@
|
||||
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=sideload-stackexchange">Sideload Stackexchange</a></li>
|
||||
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=sideload-warc">Sideload WARC Files</a></li>
|
||||
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=sideload-dirtree">Sideload Dirtree</a></li>
|
||||
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=download-sample-data">Download Sample Crawl Data</a></li>
|
||||
<li><hr class="dropdown-divider"></li>
|
||||
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=export-db-data">Export Database Data</a></li>
|
||||
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=export-sample-data">Export Sample Crawl Data</a></li>
|
||||
|
@ -61,6 +61,7 @@ dependencies {
|
||||
implementation libs.zstd
|
||||
implementation libs.jsoup
|
||||
implementation libs.commons.io
|
||||
implementation libs.commons.compress
|
||||
implementation libs.commons.lang3
|
||||
implementation libs.bundles.mariadb
|
||||
|
||||
|
@ -17,7 +17,9 @@ public enum ExecutorActor {
|
||||
EXPORT_FEEDS,
|
||||
PROC_INDEX_CONSTRUCTOR_SPAWNER,
|
||||
CONVERT,
|
||||
RESTORE_BACKUP, EXPORT_SAMPLE_DATA;
|
||||
RESTORE_BACKUP,
|
||||
EXPORT_SAMPLE_DATA,
|
||||
DOWNLOAD_SAMPLE;
|
||||
|
||||
public String id() {
|
||||
return "fsm:" + name().toLowerCase();
|
||||
|
@ -47,6 +47,7 @@ public class ExecutorActorControlService {
|
||||
ExportFeedsActor exportFeedsActor,
|
||||
ExportSampleDataActor exportSampleDataActor,
|
||||
ExportTermFreqActor exportTermFrequenciesActor,
|
||||
DownloadSampleActor downloadSampleActor,
|
||||
ExecutorActorStateMachines stateMachines) {
|
||||
this.messageQueueFactory = messageQueueFactory;
|
||||
this.eventLog = baseServiceParams.eventLog;
|
||||
@ -75,6 +76,8 @@ public class ExecutorActorControlService {
|
||||
register(ExecutorActor.EXPORT_FEEDS, exportFeedsActor);
|
||||
register(ExecutorActor.EXPORT_SAMPLE_DATA, exportSampleDataActor);
|
||||
register(ExecutorActor.EXPORT_TERM_FREQUENCIES, exportTermFrequenciesActor);
|
||||
|
||||
register(ExecutorActor.DOWNLOAD_SAMPLE, downloadSampleActor);
|
||||
}
|
||||
|
||||
private void register(ExecutorActor process, RecordActorPrototype graph) {
|
||||
|
@ -13,7 +13,6 @@ import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.sideload.SideloadHelper;
|
||||
import nu.marginalia.sideload.StackExchangeSideloadHelper;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageBaseType;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
import nu.marginalia.storage.model.FileStorageState;
|
||||
import nu.marginalia.storage.model.FileStorageType;
|
||||
@ -49,8 +48,7 @@ public class ConvertActor extends RecordActorPrototype {
|
||||
return switch (self) {
|
||||
case Convert (FileStorageId fid) -> {
|
||||
var toProcess = storageService.getStorage(fid);
|
||||
var base = storageService.getStorageBase(FileStorageBaseType.STORAGE);
|
||||
var processedArea = storageService.allocateTemporaryStorage(base,
|
||||
var processedArea = storageService.allocateStorage(
|
||||
FileStorageType.PROCESSED_DATA, "processed-data",
|
||||
"Processed Data; " + toProcess.description());
|
||||
|
||||
@ -69,8 +67,7 @@ public class ConvertActor extends RecordActorPrototype {
|
||||
|
||||
String fileName = sourcePath.toFile().getName();
|
||||
|
||||
var base = storageService.getStorageBase(FileStorageBaseType.STORAGE);
|
||||
var processedArea = storageService.allocateTemporaryStorage(base,
|
||||
var processedArea = storageService.allocateStorage(
|
||||
FileStorageType.PROCESSED_DATA, "processed-data",
|
||||
"Processed Dirtree Data; " + fileName);
|
||||
|
||||
@ -88,8 +85,7 @@ public class ConvertActor extends RecordActorPrototype {
|
||||
|
||||
String fileName = sourcePath.toFile().getName();
|
||||
|
||||
var base = storageService.getStorageBase(FileStorageBaseType.STORAGE);
|
||||
var processedArea = storageService.allocateTemporaryStorage(base,
|
||||
var processedArea = storageService.allocateStorage(
|
||||
FileStorageType.PROCESSED_DATA, "processed-data",
|
||||
"Processed Warc Data; " + fileName);
|
||||
|
||||
@ -121,8 +117,7 @@ public class ConvertActor extends RecordActorPrototype {
|
||||
|
||||
String fileName = sourcePath.toFile().getName();
|
||||
|
||||
var base = storageService.getStorageBase(FileStorageBaseType.STORAGE);
|
||||
var processedArea = storageService.allocateTemporaryStorage(base,
|
||||
var processedArea = storageService.allocateStorage(
|
||||
FileStorageType.PROCESSED_DATA, "processed-data",
|
||||
"Processed Encylopedia Data; " + fileName);
|
||||
|
||||
@ -171,8 +166,7 @@ public class ConvertActor extends RecordActorPrototype {
|
||||
|
||||
String fileName = sourcePath.toFile().getName();
|
||||
|
||||
var base = storageService.getStorageBase(FileStorageBaseType.STORAGE);
|
||||
var processedArea = storageService.allocateTemporaryStorage(base,
|
||||
var processedArea = storageService.allocateStorage(
|
||||
FileStorageType.PROCESSED_DATA, "processed-data",
|
||||
"Processed Stackexchange Data; " + fileName);
|
||||
|
||||
|
@ -17,14 +17,12 @@ import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import nu.marginalia.storage.model.FileStorageState;
|
||||
import nu.marginalia.svc.BackupService;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageBaseType;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
import nu.marginalia.storage.model.FileStorageType;
|
||||
import nu.marginalia.index.client.IndexClient;
|
||||
import nu.marginalia.index.client.IndexMqEndpoints;
|
||||
import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.outbox.MqOutbox;
|
||||
import nu.marginalia.mqapi.converting.ConvertAction;
|
||||
import nu.marginalia.mqapi.converting.ConvertRequest;
|
||||
import nu.marginalia.mqapi.index.CreateIndexRequest;
|
||||
import nu.marginalia.mqapi.index.IndexName;
|
||||
@ -96,8 +94,7 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
|
||||
if (storage.type() != FileStorageType.CRAWL_DATA) yield new Error("Bad storage type " + storage.type());
|
||||
|
||||
|
||||
var base = storageService.getStorageBase(FileStorageBaseType.STORAGE);
|
||||
var processedArea = storageService.allocateTemporaryStorage(base, FileStorageType.PROCESSED_DATA, "processed-data",
|
||||
var processedArea = storageService.allocateStorage(FileStorageType.PROCESSED_DATA, "processed-data",
|
||||
"Processed Data; " + storage.description());
|
||||
|
||||
storageService.setFileStorageState(processedArea.id(), FileStorageState.NEW);
|
||||
|
@ -10,7 +10,6 @@ import nu.marginalia.actor.state.Resume;
|
||||
import nu.marginalia.process.ProcessOutboxes;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageBaseType;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
import nu.marginalia.storage.model.FileStorageType;
|
||||
import nu.marginalia.mq.MqMessageState;
|
||||
@ -43,9 +42,7 @@ public class CrawlActor extends RecordActorPrototype {
|
||||
if (storage == null) yield new Error("Bad storage id");
|
||||
if (storage.type() != FileStorageType.CRAWL_SPEC) yield new Error("Bad storage type " + storage.type());
|
||||
|
||||
var base = storageService.getStorageBase(FileStorageBaseType.STORAGE);
|
||||
var dataArea = storageService.allocateTemporaryStorage(
|
||||
base,
|
||||
var dataArea = storageService.allocateStorage(
|
||||
FileStorageType.CRAWL_DATA,
|
||||
"crawl-data",
|
||||
storage.description());
|
||||
|
@ -7,7 +7,6 @@ import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||
import nu.marginalia.actor.state.ActorStep;
|
||||
import nu.marginalia.crawlspec.CrawlSpecFileNames;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageBaseType;
|
||||
import nu.marginalia.storage.model.FileStorageType;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@ -41,8 +40,7 @@ public class CrawlJobExtractorActor extends RecordActorPrototype {
|
||||
public ActorStep transition(ActorStep self) throws Exception {
|
||||
return switch (self) {
|
||||
case CreateFromUrl(String description, String url) -> {
|
||||
var base = fileStorageService.getStorageBase(FileStorageBaseType.STORAGE);
|
||||
var storage = fileStorageService.allocateTemporaryStorage(base, FileStorageType.CRAWL_SPEC, "crawl-spec", description);
|
||||
var storage = fileStorageService.allocateStorage(FileStorageType.CRAWL_SPEC, "crawl-spec", description);
|
||||
|
||||
Path urlsTxt = storage.asPath().resolve("urls.txt");
|
||||
|
||||
|
@ -0,0 +1,133 @@
|
||||
package nu.marginalia.actor.task;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||
import nu.marginalia.actor.state.ActorStep;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.*;
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URI;
|
||||
import java.net.URL;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.nio.file.attribute.PosixFilePermissions;
|
||||
|
||||
@Singleton
|
||||
public class DownloadSampleActor extends RecordActorPrototype {
|
||||
|
||||
private final FileStorageService storageService;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
public record Run(String setName) implements ActorStep {}
|
||||
@Override
|
||||
public ActorStep transition(ActorStep self) throws Exception {
|
||||
return switch(self) {
|
||||
case Run(String setName) -> {
|
||||
final FileStorage newStorage = storageService.allocateStorage(
|
||||
FileStorageType.CRAWL_DATA,
|
||||
"sample-crawl-data",
|
||||
"Sample " + setName);
|
||||
|
||||
storageService.setFileStorageState(newStorage.id(), FileStorageState.NEW);
|
||||
|
||||
URL downloadURI = getDownloadURL(setName);
|
||||
|
||||
try {
|
||||
downloadArchive(downloadURI, newStorage.asPath());
|
||||
}
|
||||
catch (IOException ex) {
|
||||
logger.error("Error downloading sample", ex);
|
||||
storageService.flagFileForDeletion(newStorage.id());
|
||||
yield new Error();
|
||||
}
|
||||
finally {
|
||||
storageService.setFileStorageState(newStorage.id(), FileStorageState.UNSET);
|
||||
}
|
||||
|
||||
yield new End();
|
||||
}
|
||||
default -> new Error();
|
||||
};
|
||||
}
|
||||
|
||||
private void downloadArchive(URL downloadURI, Path outputPath) throws IOException, InterruptedException {
|
||||
// See the documentation for commons compress:
|
||||
// https://commons.apache.org/proper/commons-compress/examples.html
|
||||
|
||||
try (var tar = new TarArchiveInputStream(downloadURI.openStream())) {
|
||||
TarArchiveEntry nextEntry;
|
||||
byte[] buffer = new byte[8192];
|
||||
|
||||
while ((nextEntry = tar.getNextEntry()) != null) {
|
||||
// Poll for interruption, to ensure this can be cancelled
|
||||
if (Thread.interrupted()) {
|
||||
throw new InterruptedException();
|
||||
}
|
||||
|
||||
if (nextEntry.isDirectory()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
Path outputFile = outputPath.resolve(nextEntry.getName());
|
||||
Files.createDirectories(outputFile.getParent(),
|
||||
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rwxr-xr-x"))
|
||||
);
|
||||
|
||||
long size = nextEntry.getSize();
|
||||
|
||||
// Extract tar entry
|
||||
try (var fos = Files.newOutputStream(outputFile, StandardOpenOption.CREATE)) {
|
||||
transferBytes(tar, fos, buffer, size);
|
||||
}
|
||||
|
||||
Files.setPosixFilePermissions(outputPath, PosixFilePermissions.fromString("rw-r--r--"));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void transferBytes(InputStream inputStream, OutputStream outputStream, byte[] buffer, long size)
|
||||
throws IOException
|
||||
{
|
||||
long copiedSize = 0;
|
||||
|
||||
while (copiedSize < size) {
|
||||
int read = inputStream.read(buffer);
|
||||
|
||||
if (read < 0) // We've been promised a file of length 'size', so this shouldn't happen, but just in case...
|
||||
throw new IOException("Unexpected end of stream");
|
||||
|
||||
outputStream.write(buffer, 0, read);
|
||||
copiedSize += read;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private URL getDownloadURL(String setName) throws MalformedURLException {
|
||||
return URI.create(STR."https://downloads.marginalia.nu/samples/\{setName}.tar").toURL();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String describe() {
|
||||
return "Download a sample of crawl data from downloads.marginalia.nu";
|
||||
}
|
||||
|
||||
@Inject
|
||||
public DownloadSampleActor(Gson gson,
|
||||
FileStorageService storageService)
|
||||
{
|
||||
super(gson);
|
||||
this.storageService = storageService;
|
||||
}
|
||||
|
||||
}
|
@ -23,8 +23,7 @@ public class ExportAtagsActor extends RecordActorPrototype {
|
||||
public ActorStep transition(ActorStep self) throws Exception {
|
||||
return switch(self) {
|
||||
case Export(FileStorageId crawlId) -> {
|
||||
var storageBase = storageService.getStorageBase(FileStorageBaseType.STORAGE);
|
||||
var storage = storageService.allocateTemporaryStorage(storageBase, FileStorageType.EXPORT, "atag-export", "Anchor Tags " + LocalDateTime.now());
|
||||
var storage = storageService.allocateStorage(FileStorageType.EXPORT, "atag-export", "Anchor Tags " + LocalDateTime.now());
|
||||
|
||||
if (storage == null) yield new Error("Bad storage id");
|
||||
yield new Run(crawlId, storage.id());
|
||||
|
@ -8,7 +8,6 @@ import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||
import nu.marginalia.actor.state.ActorStep;
|
||||
import nu.marginalia.query.client.QueryClient;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageBaseType;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
import nu.marginalia.storage.model.FileStorageType;
|
||||
import org.slf4j.Logger;
|
||||
@ -43,8 +42,7 @@ public class ExportDataActor extends RecordActorPrototype {
|
||||
public ActorStep transition(ActorStep self) throws Exception {
|
||||
return switch(self) {
|
||||
case Export() -> {
|
||||
var storageBase = storageService.getStorageBase(FileStorageBaseType.STORAGE);
|
||||
var storage = storageService.allocateTemporaryStorage(storageBase, FileStorageType.EXPORT, "db-export", "DB Exports " + LocalDateTime.now());
|
||||
var storage = storageService.allocateStorage(FileStorageType.EXPORT, "db-export", "DB Exports " + LocalDateTime.now());
|
||||
|
||||
if (storage == null) yield new Error("Bad storage id");
|
||||
yield new ExportBlacklist(storage.id());
|
||||
|
@ -8,7 +8,6 @@ import nu.marginalia.actor.state.ActorStep;
|
||||
import nu.marginalia.extractor.ExporterIf;
|
||||
import nu.marginalia.extractor.FeedExporter;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageBaseType;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
import nu.marginalia.storage.model.FileStorageState;
|
||||
import nu.marginalia.storage.model.FileStorageType;
|
||||
@ -29,8 +28,7 @@ public class ExportFeedsActor extends RecordActorPrototype {
|
||||
public ActorStep transition(ActorStep self) throws Exception {
|
||||
return switch(self) {
|
||||
case Export(FileStorageId crawlId) -> {
|
||||
var storageBase = storageService.getStorageBase(FileStorageBaseType.STORAGE);
|
||||
var storage = storageService.allocateTemporaryStorage(storageBase, FileStorageType.EXPORT, "feed-export", "Feeds " + LocalDateTime.now());
|
||||
var storage = storageService.allocateStorage(FileStorageType.EXPORT, "feed-export", "Feeds " + LocalDateTime.now());
|
||||
|
||||
if (storage == null) yield new Error("Bad storage id");
|
||||
yield new Run(crawlId, storage.id());
|
||||
|
@ -5,11 +5,8 @@ import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||
import nu.marginalia.actor.state.ActorStep;
|
||||
import nu.marginalia.extractor.ExporterIf;
|
||||
import nu.marginalia.extractor.FeedExporter;
|
||||
import nu.marginalia.extractor.SampleDataExporter;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageBaseType;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
import nu.marginalia.storage.model.FileStorageState;
|
||||
import nu.marginalia.storage.model.FileStorageType;
|
||||
@ -30,8 +27,7 @@ public class ExportSampleDataActor extends RecordActorPrototype {
|
||||
public ActorStep transition(ActorStep self) throws Exception {
|
||||
return switch(self) {
|
||||
case Export(FileStorageId crawlId, int size, String name) -> {
|
||||
var storageBase = storageService.getStorageBase(FileStorageBaseType.STORAGE);
|
||||
var storage = storageService.allocateTemporaryStorage(storageBase, FileStorageType.EXPORT,
|
||||
var storage = storageService.allocateStorage(FileStorageType.EXPORT,
|
||||
"crawl-sample-export",
|
||||
STR."Crawl Data Sample \{name}/\{size} \{LocalDateTime.now()}"
|
||||
);
|
||||
|
@ -8,7 +8,6 @@ import nu.marginalia.actor.state.ActorStep;
|
||||
import nu.marginalia.extractor.ExporterIf;
|
||||
import nu.marginalia.extractor.TermFrequencyExporter;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageBaseType;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
import nu.marginalia.storage.model.FileStorageState;
|
||||
import nu.marginalia.storage.model.FileStorageType;
|
||||
@ -25,8 +24,7 @@ public class ExportTermFreqActor extends RecordActorPrototype {
|
||||
public ActorStep transition(ActorStep self) throws Exception {
|
||||
return switch(self) {
|
||||
case Export(FileStorageId crawlId) -> {
|
||||
var storageBase = storageService.getStorageBase(FileStorageBaseType.STORAGE);
|
||||
var storage = storageService.allocateTemporaryStorage(storageBase, FileStorageType.EXPORT, "term-freq-export", "Term Frequencies " + LocalDateTime.now());
|
||||
var storage = storageService.allocateStorage(FileStorageType.EXPORT, "term-freq-export", "Term Frequencies " + LocalDateTime.now());
|
||||
|
||||
if (storage == null) yield new Error("Bad storage id");
|
||||
yield new Run(crawlId, storage.id());
|
||||
|
@ -71,6 +71,8 @@ public class ExecutorSvc extends Service {
|
||||
Spark.post("/sideload/stackexchange", sideloadService::sideloadStackexchange);
|
||||
Spark.post("/sideload/encyclopedia", sideloadService::sideloadEncyclopedia);
|
||||
|
||||
Spark.post("/action/download-sample-data", sideloadService::downloadSampleData);
|
||||
|
||||
Spark.post("/export/atags", exportService::exportAtags);
|
||||
Spark.post("/export/sample-data", exportService::exportSampleData);
|
||||
Spark.post("/export/feeds", exportService::exportFeeds);
|
||||
|
@ -5,8 +5,11 @@ import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.actor.ExecutorActor;
|
||||
import nu.marginalia.actor.ExecutorActorControlService;
|
||||
import nu.marginalia.actor.task.ConvertActor;
|
||||
import nu.marginalia.actor.task.DownloadSampleActor;
|
||||
import nu.marginalia.executor.upload.UploadDirContents;
|
||||
import nu.marginalia.executor.upload.UploadDirItem;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
|
||||
@ -18,6 +21,7 @@ import java.util.List;
|
||||
|
||||
public class SideloadService {
|
||||
private final ExecutorActorControlService actorControlService;
|
||||
private static final Logger logger = LoggerFactory.getLogger(SideloadService.class);
|
||||
|
||||
@Inject
|
||||
public SideloadService(ExecutorActorControlService actorControlService) {
|
||||
@ -56,4 +60,11 @@ public class SideloadService {
|
||||
|
||||
}
|
||||
|
||||
public Object downloadSampleData(Request request, Response response) throws Exception {
|
||||
String sampleSet = request.queryParams("set");
|
||||
|
||||
actorControlService.startFrom(ExecutorActor.DOWNLOAD_SAMPLE, new DownloadSampleActor.Run(sampleSet));
|
||||
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
@ -15,7 +15,6 @@ import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.process.log.WorkLog;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageBaseType;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
import nu.marginalia.storage.model.FileStorageType;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
@ -187,8 +186,7 @@ public class TransferService {
|
||||
|
||||
// Ensure crawl data exists to receive into
|
||||
if (storages.isEmpty()) {
|
||||
var storage = fileStorageService.allocateTemporaryStorage(
|
||||
fileStorageService.getStorageBase(FileStorageBaseType.STORAGE),
|
||||
var storage = fileStorageService.allocateStorage(
|
||||
FileStorageType.CRAWL_DATA,
|
||||
"crawl-data",
|
||||
"Crawl Data"
|
||||
|
@ -5,7 +5,6 @@ import com.github.luben.zstd.ZstdOutputStream;
|
||||
import nu.marginalia.IndexLocations;
|
||||
import nu.marginalia.service.control.ServiceHeartbeat;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageBaseType;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
import nu.marginalia.storage.model.FileStorageType;
|
||||
import nu.marginallia.index.journal.IndexJournalFileNames;
|
||||
@ -45,11 +44,9 @@ public class BackupService {
|
||||
* This backup can later be dehydrated and quickly loaded into _LIVE.
|
||||
* */
|
||||
public void createBackupFromStaging(List<FileStorageId> associatedIds) throws SQLException, IOException {
|
||||
var backupBase = storageService.getStorageBase(FileStorageBaseType.BACKUP);
|
||||
|
||||
String desc = "Pre-load backup snapshot " + LocalDateTime.now();
|
||||
|
||||
var backupStorage = storageService.allocateTemporaryStorage(backupBase,
|
||||
var backupStorage = storageService.allocateStorage(
|
||||
FileStorageType.BACKUP, "snapshot", desc);
|
||||
|
||||
for (var associatedId : associatedIds) {
|
||||
|
Loading…
Reference in New Issue
Block a user