mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
(converter, control) Re-enable sideloading encyclopedia data
This commit is contained in:
parent
35996d0adb
commit
5e5aaf9a7e
@ -7,6 +7,7 @@ import com.google.inject.Injector;
|
|||||||
import nu.marginalia.converting.model.ProcessedDomain;
|
import nu.marginalia.converting.model.ProcessedDomain;
|
||||||
import nu.marginalia.converting.sideload.SideloadSource;
|
import nu.marginalia.converting.sideload.SideloadSource;
|
||||||
import nu.marginalia.converting.sideload.SideloadSourceFactory;
|
import nu.marginalia.converting.sideload.SideloadSourceFactory;
|
||||||
|
import nu.marginalia.converting.writer.ConverterBatchWriter;
|
||||||
import nu.marginalia.converting.writer.ConverterWriter;
|
import nu.marginalia.converting.writer.ConverterWriter;
|
||||||
import nu.marginalia.db.storage.FileStorageService;
|
import nu.marginalia.db.storage.FileStorageService;
|
||||||
import nu.marginalia.mq.MessageQueueFactory;
|
import nu.marginalia.mq.MessageQueueFactory;
|
||||||
@ -25,6 +26,7 @@ import nu.marginalia.converting.processor.DomainProcessor;
|
|||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.sql.SQLException;
|
import java.sql.SQLException;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
@ -82,10 +84,10 @@ public class ConverterMain {
|
|||||||
heartbeat.start();
|
heartbeat.start();
|
||||||
}
|
}
|
||||||
|
|
||||||
public void convert(SideloadSource sideloadSource, Path writeDir) throws Exception {
|
public void convert(SideloadSource sideloadSource, Path writeDir) throws IOException {
|
||||||
int maxPoolSize = 16;
|
try (var writer = new ConverterBatchWriter(writeDir, 0)) {
|
||||||
|
writer.write(sideloadSource);
|
||||||
// FIXME
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void convert(CrawlPlan plan) throws Exception {
|
public void convert(CrawlPlan plan) throws Exception {
|
||||||
|
@ -2,7 +2,9 @@ package nu.marginalia.converting.writer;
|
|||||||
|
|
||||||
import gnu.trove.list.TLongList;
|
import gnu.trove.list.TLongList;
|
||||||
import gnu.trove.list.array.TLongArrayList;
|
import gnu.trove.list.array.TLongArrayList;
|
||||||
|
import nu.marginalia.converting.model.ProcessedDocument;
|
||||||
import nu.marginalia.converting.model.ProcessedDomain;
|
import nu.marginalia.converting.model.ProcessedDomain;
|
||||||
|
import nu.marginalia.converting.sideload.SideloadSource;
|
||||||
import nu.marginalia.io.processed.DocumentRecordParquetFileWriter;
|
import nu.marginalia.io.processed.DocumentRecordParquetFileWriter;
|
||||||
import nu.marginalia.io.processed.DomainLinkRecordParquetFileWriter;
|
import nu.marginalia.io.processed.DomainLinkRecordParquetFileWriter;
|
||||||
import nu.marginalia.io.processed.DomainRecordParquetFileWriter;
|
import nu.marginalia.io.processed.DomainRecordParquetFileWriter;
|
||||||
@ -24,14 +26,15 @@ import java.util.concurrent.Callable;
|
|||||||
import java.util.concurrent.ForkJoinPool;
|
import java.util.concurrent.ForkJoinPool;
|
||||||
import java.util.concurrent.Future;
|
import java.util.concurrent.Future;
|
||||||
|
|
||||||
public class ConverterBatchWriter {
|
/** Writer for a single batch of converter parquet files */
|
||||||
|
public class ConverterBatchWriter implements AutoCloseable {
|
||||||
private final DomainRecordParquetFileWriter domainWriter;
|
private final DomainRecordParquetFileWriter domainWriter;
|
||||||
private final DomainLinkRecordParquetFileWriter domainLinkWriter;
|
private final DomainLinkRecordParquetFileWriter domainLinkWriter;
|
||||||
private final DocumentRecordParquetFileWriter documentWriter;
|
private final DocumentRecordParquetFileWriter documentWriter;
|
||||||
|
|
||||||
private static final Logger logger = LoggerFactory.getLogger(ConverterBatchWriter.class);
|
private static final Logger logger = LoggerFactory.getLogger(ConverterBatchWriter.class);
|
||||||
|
|
||||||
ConverterBatchWriter(Path basePath, int batchNumber) throws IOException {
|
public ConverterBatchWriter(Path basePath, int batchNumber) throws IOException {
|
||||||
domainWriter = new DomainRecordParquetFileWriter(
|
domainWriter = new DomainRecordParquetFileWriter(
|
||||||
ProcessedDataFileNames.domainFileName(basePath, batchNumber)
|
ProcessedDataFileNames.domainFileName(basePath, batchNumber)
|
||||||
);
|
);
|
||||||
@ -43,6 +46,14 @@ public class ConverterBatchWriter {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void write(SideloadSource sideloadSource) throws IOException {
|
||||||
|
var domain = sideloadSource.getDomain();
|
||||||
|
|
||||||
|
writeDomainData(domain);
|
||||||
|
|
||||||
|
writeDocumentData(domain.domain, sideloadSource.getDocumentsStream());
|
||||||
|
}
|
||||||
|
|
||||||
public void write(ProcessedDomain domain) {
|
public void write(ProcessedDomain domain) {
|
||||||
var results = ForkJoinPool.commonPool().invokeAll(
|
var results = ForkJoinPool.commonPool().invokeAll(
|
||||||
writeTasks(domain)
|
writeTasks(domain)
|
||||||
@ -67,10 +78,22 @@ public class ConverterBatchWriter {
|
|||||||
if (domain.documents == null)
|
if (domain.documents == null)
|
||||||
return this;
|
return this;
|
||||||
|
|
||||||
String domainName = domain.domain.toString();
|
writeDocumentData(domain.domain, domain.documents.iterator());
|
||||||
|
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void writeDocumentData(EdgeDomain domain,
|
||||||
|
Iterator<ProcessedDocument> documentIterator)
|
||||||
|
throws IOException
|
||||||
|
{
|
||||||
|
|
||||||
int ordinal = 0;
|
int ordinal = 0;
|
||||||
|
|
||||||
for (var document : domain.documents) {
|
String domainName = domain.toString();
|
||||||
|
|
||||||
|
while (documentIterator.hasNext()) {
|
||||||
|
var document = documentIterator.next();
|
||||||
if (document.details == null) {
|
if (document.details == null) {
|
||||||
new DocumentRecord(
|
new DocumentRecord(
|
||||||
domainName,
|
domainName,
|
||||||
@ -119,7 +142,6 @@ public class ConverterBatchWriter {
|
|||||||
ordinal++;
|
ordinal++;
|
||||||
}
|
}
|
||||||
|
|
||||||
return this;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private Object writeLinkData(ProcessedDomain domain) throws IOException {
|
private Object writeLinkData(ProcessedDomain domain) throws IOException {
|
||||||
|
@ -188,6 +188,7 @@ public class ControlService extends Service {
|
|||||||
Spark.post("/public/actions/trigger-data-exports", controlActionsService::triggerDataExports, redirectToActors);
|
Spark.post("/public/actions/trigger-data-exports", controlActionsService::triggerDataExports, redirectToActors);
|
||||||
Spark.post("/public/actions/flush-api-caches", controlActionsService::flushApiCaches, redirectToActors);
|
Spark.post("/public/actions/flush-api-caches", controlActionsService::flushApiCaches, redirectToActors);
|
||||||
Spark.post("/public/actions/truncate-links-database", controlActionsService::truncateLinkDatabase, redirectToActors);
|
Spark.post("/public/actions/truncate-links-database", controlActionsService::truncateLinkDatabase, redirectToActors);
|
||||||
|
Spark.post("/public/actions/sideload-encyclopedia", controlActionsService::sideloadEncyclopedia, redirectToActors);
|
||||||
|
|
||||||
// Review Random Domains
|
// Review Random Domains
|
||||||
Spark.get("/public/review-random-domains", this::reviewRandomDomainsModel, reviewRandomDomainsRenderer::render);
|
Spark.get("/public/review-random-domains", this::reviewRandomDomainsModel, reviewRandomDomainsRenderer::render);
|
||||||
|
@ -4,18 +4,20 @@ import com.google.inject.Inject;
|
|||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
import nu.marginalia.control.actor.ControlActors;
|
import nu.marginalia.control.actor.ControlActors;
|
||||||
import nu.marginalia.control.actor.Actor;
|
import nu.marginalia.control.actor.Actor;
|
||||||
|
import nu.marginalia.control.actor.task.ConvertActor;
|
||||||
import nu.marginalia.db.DomainTypes;
|
import nu.marginalia.db.DomainTypes;
|
||||||
import nu.marginalia.index.client.IndexClient;
|
import nu.marginalia.index.client.IndexClient;
|
||||||
import nu.marginalia.index.client.IndexMqEndpoints;
|
import nu.marginalia.index.client.IndexMqEndpoints;
|
||||||
import nu.marginalia.mq.MessageQueueFactory;
|
import nu.marginalia.mq.MessageQueueFactory;
|
||||||
import nu.marginalia.mq.outbox.MqOutbox;
|
import nu.marginalia.mq.outbox.MqOutbox;
|
||||||
import nu.marginalia.search.client.SearchClient;
|
|
||||||
import nu.marginalia.service.control.ServiceEventLog;
|
import nu.marginalia.service.control.ServiceEventLog;
|
||||||
import nu.marginalia.service.id.ServiceId;
|
import nu.marginalia.service.id.ServiceId;
|
||||||
import spark.Request;
|
import spark.Request;
|
||||||
import spark.Response;
|
import spark.Response;
|
||||||
import spark.Spark;
|
import spark.Spark;
|
||||||
|
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
import java.util.UUID;
|
import java.util.UUID;
|
||||||
|
|
||||||
@Singleton
|
@Singleton
|
||||||
@ -97,6 +99,22 @@ public class ControlActionsService {
|
|||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Object sideloadEncyclopedia(Request request, Response response) throws Exception {
|
||||||
|
|
||||||
|
Path sourcePath = Path.of(request.queryParams("source"));
|
||||||
|
if (!Files.exists(sourcePath)) {
|
||||||
|
Spark.halt(404);
|
||||||
|
return "No such file " + sourcePath;
|
||||||
|
}
|
||||||
|
|
||||||
|
eventLog.logEvent("USER-ACTION", "SIDELOAD ENCYCLOPEDIA");
|
||||||
|
|
||||||
|
actors.startFrom(Actor.CONVERT, ConvertActor.CONVERT_ENCYCLOPEDIA, sourcePath.toString());
|
||||||
|
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public Object triggerRepartition(Request request, Response response) throws Exception {
|
public Object triggerRepartition(Request request, Response response) throws Exception {
|
||||||
indexClient.outbox().sendAsync(IndexMqEndpoints.INDEX_REPARTITION, "");
|
indexClient.outbox().sendAsync(IndexMqEndpoints.INDEX_REPARTITION, "");
|
||||||
|
|
||||||
|
@ -24,6 +24,21 @@
|
|||||||
</form>
|
</form>
|
||||||
</td>
|
</td>
|
||||||
</tr>
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><b>Sideload Encyclopedia</b><p>
|
||||||
|
This will load pre-digested encyclopedia data
|
||||||
|
from an encyclopedia.marginalia.nu-style database.
|
||||||
|
</td>
|
||||||
|
<td>
|
||||||
|
<form method="post" action="/actions/sideload-encyclopedia" onsubmit="return confirm('Confirm sideloading')">
|
||||||
|
<label for="source">articles.db location on server</label><br>
|
||||||
|
<input id="source" name="source" value="">
|
||||||
|
<br><br>
|
||||||
|
|
||||||
|
<input type="submit" value="Sideload Encyclopedia">
|
||||||
|
</form>
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<td>
|
<td>
|
||||||
<b>Reload Blogs List</b>
|
<b>Reload Blogs List</b>
|
||||||
|
Loading…
Reference in New Issue
Block a user