(converter, control) Re-enable sideloading encyclopedia data

This commit is contained in:
Viktor Lofgren 2023-09-14 12:12:07 +02:00
parent 35996d0adb
commit 5e5aaf9a7e
5 changed files with 68 additions and 10 deletions

View File

@ -7,6 +7,7 @@ import com.google.inject.Injector;
import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.converting.sideload.SideloadSource;
import nu.marginalia.converting.sideload.SideloadSourceFactory;
import nu.marginalia.converting.writer.ConverterBatchWriter;
import nu.marginalia.converting.writer.ConverterWriter;
import nu.marginalia.db.storage.FileStorageService;
import nu.marginalia.mq.MessageQueueFactory;
@ -25,6 +26,7 @@ import nu.marginalia.converting.processor.DomainProcessor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.Optional;
@ -82,10 +84,10 @@ public class ConverterMain {
heartbeat.start();
}
public void convert(SideloadSource sideloadSource, Path writeDir) throws Exception {
int maxPoolSize = 16;
// FIXME
/**
 * Converts a single sideloaded source.
 *
 * <p>Opens a {@link ConverterBatchWriter} for batch number 0 under
 * {@code writeDir}, hands the source to it, and closes the writer when done.
 *
 * @param sideloadSource the source to write
 * @param writeDir       base path passed to the batch writer
 * @throws IOException if the writer reports an I/O failure
 */
public void convert(SideloadSource sideloadSource, Path writeDir) throws IOException {
    try (ConverterBatchWriter batchWriter = new ConverterBatchWriter(writeDir, 0)) {
        batchWriter.write(sideloadSource);
    }
}
public void convert(CrawlPlan plan) throws Exception {

View File

@ -2,7 +2,9 @@ package nu.marginalia.converting.writer;
import gnu.trove.list.TLongList;
import gnu.trove.list.array.TLongArrayList;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.converting.sideload.SideloadSource;
import nu.marginalia.io.processed.DocumentRecordParquetFileWriter;
import nu.marginalia.io.processed.DomainLinkRecordParquetFileWriter;
import nu.marginalia.io.processed.DomainRecordParquetFileWriter;
@ -24,14 +26,15 @@ import java.util.concurrent.Callable;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.Future;
public class ConverterBatchWriter {
/** Writer for a single batch of converter parquet files */
public class ConverterBatchWriter implements AutoCloseable {
private final DomainRecordParquetFileWriter domainWriter;
private final DomainLinkRecordParquetFileWriter domainLinkWriter;
private final DocumentRecordParquetFileWriter documentWriter;
private static final Logger logger = LoggerFactory.getLogger(ConverterBatchWriter.class);
ConverterBatchWriter(Path basePath, int batchNumber) throws IOException {
public ConverterBatchWriter(Path basePath, int batchNumber) throws IOException {
domainWriter = new DomainRecordParquetFileWriter(
ProcessedDataFileNames.domainFileName(basePath, batchNumber)
);
@ -43,6 +46,14 @@ public class ConverterBatchWriter {
);
}
/**
 * Writes a sideloaded source into this batch.
 *
 * <p>The domain obtained from the source is passed to {@code writeDomainData}
 * first; the source's document iterator is then passed to
 * {@code writeDocumentData} together with that domain's {@code domain} field.
 *
 * @param sideloadSource provider of the domain and its documents
 * @throws IOException propagated from the underlying write helpers
 */
public void write(SideloadSource sideloadSource) throws IOException {
    final var sideloadedDomain = sideloadSource.getDomain();

    writeDomainData(sideloadedDomain);
    writeDocumentData(
            sideloadedDomain.domain,
            sideloadSource.getDocumentsStream()
    );
}
public void write(ProcessedDomain domain) {
var results = ForkJoinPool.commonPool().invokeAll(
writeTasks(domain)
@ -67,10 +78,22 @@ public class ConverterBatchWriter {
if (domain.documents == null)
return this;
String domainName = domain.domain.toString();
writeDocumentData(domain.domain, domain.documents.iterator());
return this;
}
private void writeDocumentData(EdgeDomain domain,
Iterator<ProcessedDocument> documentIterator)
throws IOException
{
int ordinal = 0;
for (var document : domain.documents) {
String domainName = domain.toString();
while (documentIterator.hasNext()) {
var document = documentIterator.next();
if (document.details == null) {
new DocumentRecord(
domainName,
@ -119,7 +142,6 @@ public class ConverterBatchWriter {
ordinal++;
}
return this;
}
private Object writeLinkData(ProcessedDomain domain) throws IOException {

View File

@ -188,6 +188,7 @@ public class ControlService extends Service {
Spark.post("/public/actions/trigger-data-exports", controlActionsService::triggerDataExports, redirectToActors);
Spark.post("/public/actions/flush-api-caches", controlActionsService::flushApiCaches, redirectToActors);
Spark.post("/public/actions/truncate-links-database", controlActionsService::truncateLinkDatabase, redirectToActors);
Spark.post("/public/actions/sideload-encyclopedia", controlActionsService::sideloadEncyclopedia, redirectToActors);
// Review Random Domains
Spark.get("/public/review-random-domains", this::reviewRandomDomainsModel, reviewRandomDomainsRenderer::render);

View File

@ -4,18 +4,20 @@ import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.control.actor.ControlActors;
import nu.marginalia.control.actor.Actor;
import nu.marginalia.control.actor.task.ConvertActor;
import nu.marginalia.db.DomainTypes;
import nu.marginalia.index.client.IndexClient;
import nu.marginalia.index.client.IndexMqEndpoints;
import nu.marginalia.mq.MessageQueueFactory;
import nu.marginalia.mq.outbox.MqOutbox;
import nu.marginalia.search.client.SearchClient;
import nu.marginalia.service.control.ServiceEventLog;
import nu.marginalia.service.id.ServiceId;
import spark.Request;
import spark.Response;
import spark.Spark;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.UUID;
@Singleton
@ -97,6 +99,22 @@ public class ControlActionsService {
return "";
}
/**
 * Handles the "sideload encyclopedia" control action.
 *
 * <p>Reads the {@code source} request parameter as a file system path. If the
 * parameter is absent or blank the request is halted with status 400; if the
 * path does not exist it is halted with status 404. Otherwise the action is
 * recorded in the event log and the {@code CONVERT} actor is started from the
 * {@code CONVERT_ENCYCLOPEDIA} state with the path as its argument.
 *
 * @param request  the incoming request; expected to carry a {@code source} parameter
 * @param response the outgoing response (not used directly)
 * @return an empty string on success
 * @throws Exception propagated from starting the actor
 */
public Object sideloadEncyclopedia(Request request, Response response) throws Exception {
    String source = request.queryParams("source");

    if (source == null || source.isBlank()) {
        // Path.of(null) would throw NullPointerException and surface as a 500;
        // a missing parameter is a client error.
        Spark.halt(400, "Missing parameter: source");
    }

    Path sourcePath = Path.of(source);

    if (!Files.exists(sourcePath)) {
        // halt() aborts the request by throwing, so the message must be passed
        // to halt() itself; a return statement after it would never be reached.
        Spark.halt(404, "No such file " + sourcePath);
    }

    eventLog.logEvent("USER-ACTION", "SIDELOAD ENCYCLOPEDIA");

    actors.startFrom(Actor.CONVERT, ConvertActor.CONVERT_ENCYCLOPEDIA, sourcePath.toString());

    return "";
}
public Object triggerRepartition(Request request, Response response) throws Exception {
indexClient.outbox().sendAsync(IndexMqEndpoints.INDEX_REPARTITION, "");

View File

@ -24,6 +24,21 @@
</form>
</td>
</tr>
<tr>
<td><b>Sideload Encyclopedia</b><p>
This will load pre-digested encyclopedia data
from an encyclopedia.marginalia.nu-style database.
</td>
<td>
<form method="post" action="/actions/sideload-encyclopedia" onsubmit="return confirm('Confirm sideloading')">
<label for="source">articles.db location on server</label><br>
<input id="source" name="source" value="">
<br><br>
<input type="submit" value="Sideload Encyclopedia">
</form>
</td>
</tr>
<tr>
<td>
<b>Reload Blogs List</b>