mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
(converter, control) Re-enable sideloading encyclopedia data
This commit is contained in:
parent
35996d0adb
commit
5e5aaf9a7e
@ -7,6 +7,7 @@ import com.google.inject.Injector;
|
||||
import nu.marginalia.converting.model.ProcessedDomain;
|
||||
import nu.marginalia.converting.sideload.SideloadSource;
|
||||
import nu.marginalia.converting.sideload.SideloadSourceFactory;
|
||||
import nu.marginalia.converting.writer.ConverterBatchWriter;
|
||||
import nu.marginalia.converting.writer.ConverterWriter;
|
||||
import nu.marginalia.db.storage.FileStorageService;
|
||||
import nu.marginalia.mq.MessageQueueFactory;
|
||||
@ -25,6 +26,7 @@ import nu.marginalia.converting.processor.DomainProcessor;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
import java.sql.SQLException;
|
||||
import java.util.Optional;
|
||||
@ -82,10 +84,10 @@ public class ConverterMain {
|
||||
heartbeat.start();
|
||||
}
|
||||
|
||||
public void convert(SideloadSource sideloadSource, Path writeDir) throws Exception {
|
||||
int maxPoolSize = 16;
|
||||
|
||||
// FIXME
|
||||
public void convert(SideloadSource sideloadSource, Path writeDir) throws IOException {
|
||||
try (var writer = new ConverterBatchWriter(writeDir, 0)) {
|
||||
writer.write(sideloadSource);
|
||||
}
|
||||
}
|
||||
|
||||
public void convert(CrawlPlan plan) throws Exception {
|
||||
|
@ -2,7 +2,9 @@ package nu.marginalia.converting.writer;
|
||||
|
||||
import gnu.trove.list.TLongList;
|
||||
import gnu.trove.list.array.TLongArrayList;
|
||||
import nu.marginalia.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.converting.model.ProcessedDomain;
|
||||
import nu.marginalia.converting.sideload.SideloadSource;
|
||||
import nu.marginalia.io.processed.DocumentRecordParquetFileWriter;
|
||||
import nu.marginalia.io.processed.DomainLinkRecordParquetFileWriter;
|
||||
import nu.marginalia.io.processed.DomainRecordParquetFileWriter;
|
||||
@ -24,14 +26,15 @@ import java.util.concurrent.Callable;
|
||||
import java.util.concurrent.ForkJoinPool;
|
||||
import java.util.concurrent.Future;
|
||||
|
||||
public class ConverterBatchWriter {
|
||||
/** Writer for a single batch of converter parquet files */
|
||||
public class ConverterBatchWriter implements AutoCloseable {
|
||||
private final DomainRecordParquetFileWriter domainWriter;
|
||||
private final DomainLinkRecordParquetFileWriter domainLinkWriter;
|
||||
private final DocumentRecordParquetFileWriter documentWriter;
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(ConverterBatchWriter.class);
|
||||
|
||||
ConverterBatchWriter(Path basePath, int batchNumber) throws IOException {
|
||||
public ConverterBatchWriter(Path basePath, int batchNumber) throws IOException {
|
||||
domainWriter = new DomainRecordParquetFileWriter(
|
||||
ProcessedDataFileNames.domainFileName(basePath, batchNumber)
|
||||
);
|
||||
@ -43,6 +46,14 @@ public class ConverterBatchWriter {
|
||||
);
|
||||
}
|
||||
|
||||
public void write(SideloadSource sideloadSource) throws IOException {
|
||||
var domain = sideloadSource.getDomain();
|
||||
|
||||
writeDomainData(domain);
|
||||
|
||||
writeDocumentData(domain.domain, sideloadSource.getDocumentsStream());
|
||||
}
|
||||
|
||||
public void write(ProcessedDomain domain) {
|
||||
var results = ForkJoinPool.commonPool().invokeAll(
|
||||
writeTasks(domain)
|
||||
@ -67,10 +78,22 @@ public class ConverterBatchWriter {
|
||||
if (domain.documents == null)
|
||||
return this;
|
||||
|
||||
String domainName = domain.domain.toString();
|
||||
writeDocumentData(domain.domain, domain.documents.iterator());
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
private void writeDocumentData(EdgeDomain domain,
|
||||
Iterator<ProcessedDocument> documentIterator)
|
||||
throws IOException
|
||||
{
|
||||
|
||||
int ordinal = 0;
|
||||
|
||||
for (var document : domain.documents) {
|
||||
String domainName = domain.toString();
|
||||
|
||||
while (documentIterator.hasNext()) {
|
||||
var document = documentIterator.next();
|
||||
if (document.details == null) {
|
||||
new DocumentRecord(
|
||||
domainName,
|
||||
@ -119,7 +142,6 @@ public class ConverterBatchWriter {
|
||||
ordinal++;
|
||||
}
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
private Object writeLinkData(ProcessedDomain domain) throws IOException {
|
||||
|
@ -188,6 +188,7 @@ public class ControlService extends Service {
|
||||
Spark.post("/public/actions/trigger-data-exports", controlActionsService::triggerDataExports, redirectToActors);
|
||||
Spark.post("/public/actions/flush-api-caches", controlActionsService::flushApiCaches, redirectToActors);
|
||||
Spark.post("/public/actions/truncate-links-database", controlActionsService::truncateLinkDatabase, redirectToActors);
|
||||
Spark.post("/public/actions/sideload-encyclopedia", controlActionsService::sideloadEncyclopedia, redirectToActors);
|
||||
|
||||
// Review Random Domains
|
||||
Spark.get("/public/review-random-domains", this::reviewRandomDomainsModel, reviewRandomDomainsRenderer::render);
|
||||
|
@ -4,18 +4,20 @@ import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.control.actor.ControlActors;
|
||||
import nu.marginalia.control.actor.Actor;
|
||||
import nu.marginalia.control.actor.task.ConvertActor;
|
||||
import nu.marginalia.db.DomainTypes;
|
||||
import nu.marginalia.index.client.IndexClient;
|
||||
import nu.marginalia.index.client.IndexMqEndpoints;
|
||||
import nu.marginalia.mq.MessageQueueFactory;
|
||||
import nu.marginalia.mq.outbox.MqOutbox;
|
||||
import nu.marginalia.search.client.SearchClient;
|
||||
import nu.marginalia.service.control.ServiceEventLog;
|
||||
import nu.marginalia.service.id.ServiceId;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
import spark.Spark;
|
||||
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.UUID;
|
||||
|
||||
@Singleton
|
||||
@ -97,6 +99,22 @@ public class ControlActionsService {
|
||||
return "";
|
||||
}
|
||||
|
||||
public Object sideloadEncyclopedia(Request request, Response response) throws Exception {
|
||||
|
||||
Path sourcePath = Path.of(request.queryParams("source"));
|
||||
if (!Files.exists(sourcePath)) {
|
||||
Spark.halt(404);
|
||||
return "No such file " + sourcePath;
|
||||
}
|
||||
|
||||
eventLog.logEvent("USER-ACTION", "SIDELOAD ENCYCLOPEDIA");
|
||||
|
||||
actors.startFrom(Actor.CONVERT, ConvertActor.CONVERT_ENCYCLOPEDIA, sourcePath.toString());
|
||||
|
||||
return "";
|
||||
}
|
||||
|
||||
|
||||
public Object triggerRepartition(Request request, Response response) throws Exception {
|
||||
indexClient.outbox().sendAsync(IndexMqEndpoints.INDEX_REPARTITION, "");
|
||||
|
||||
|
@ -24,6 +24,21 @@
|
||||
</form>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><b>Sideload Encyclopedia</b><p>
|
||||
This will load pre-digested encyclopedia data
|
||||
from an encyclopedia.marginalia.nu-style database.
|
||||
</td>
|
||||
<td>
|
||||
<form method="post" action="/actions/sideload-encyclopedia" onsubmit="return confirm('Confirm sideloading')">
|
||||
<label for="source">articles.db location on server</label><br>
|
||||
<input id="source" name="source" value="">
|
||||
<br><br>
|
||||
|
||||
<input type="submit" value="Sideload Encyclopedia">
|
||||
</form>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<b>Reload Blogs List</b>
|
||||
|
Loading…
Reference in New Issue
Block a user