From 62cc9df2063141b70e05868c3f9ed02c2ca3871b Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 3 Jul 2023 10:40:32 +0200 Subject: [PATCH 001/157] Embryo of new control process * New events and heartbeat tables in mariadb * Refactored to a cleaner Service interface --- .../sql/current/10-service-status.sql | 17 ++ .../sql/migrations/02-service-status.sql | 17 ++ .../marginalia/client/AbortingScheduler.java | 2 - .../nu/marginalia/client/ServiceMonitors.java | 133 ++++++++++++++++ .../service/SearchServiceDescriptors.java | 4 +- .../nu/marginalia/service/id/ServiceId.java | 21 +-- code/common/service/build.gradle | 1 + code/common/service/readme.md | 46 ++++++ .../java/nu/marginalia/service/MainClass.java | 3 + .../service/control/ServiceEventLog.java | 58 +++++++ .../service/control/ServiceHeartbeat.java | 145 ++++++++++++++++++ .../service/module/ConfigurationModule.java | 16 +- .../service/module/ServiceConfiguration.java | 27 ++++ .../service/server/BaseServiceParams.java | 30 ++++ .../service/server/Initialization.java | 25 +++ .../service/server/MetricsServer.java | 6 +- .../nu/marginalia/service/server/Service.java | 23 +-- .../assistant/AssistantService.java | 16 +- .../java/nu/marginalia/index/IndexModule.java | 20 ++- .../nu/marginalia/index/IndexService.java | 23 +-- .../marginalia/index/index/SearchIndex.java | 21 ++- ...ndexQueryServiceIntegrationTestModule.java | 22 ++- .../nu/marginalia/search/SearchService.java | 13 +- .../java/nu/marginalia/api/ApiService.java | 15 +- .../control-service/build.gradle | 64 ++++++++ .../nu/marginalia/control/ControlMain.java | 29 ++++ .../nu/marginalia/control/ControlService.java | 42 +++++ .../marginalia/control/HeartbeatService.java | 48 ++++++ .../control/model/ServiceHeartbeat.java | 11 ++ .../nu/marginalia/dating/DatingService.java | 15 +- .../marginalia/explorer/ExplorerService.java | 17 +- docker-compose.yml | 10 ++ run/nginx-site.conf | 3 + settings.gradle | 1 + 34 files changed, 835 insertions(+), 109 
deletions(-) create mode 100644 code/common/db/src/main/resources/sql/current/10-service-status.sql create mode 100644 code/common/db/src/main/resources/sql/migrations/02-service-status.sql create mode 100644 code/common/service-client/src/main/java/nu/marginalia/client/ServiceMonitors.java create mode 100644 code/common/service/src/main/java/nu/marginalia/service/control/ServiceEventLog.java create mode 100644 code/common/service/src/main/java/nu/marginalia/service/control/ServiceHeartbeat.java create mode 100644 code/common/service/src/main/java/nu/marginalia/service/module/ServiceConfiguration.java create mode 100644 code/common/service/src/main/java/nu/marginalia/service/server/BaseServiceParams.java create mode 100644 code/services-satellite/control-service/build.gradle create mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlMain.java create mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java create mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/HeartbeatService.java create mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ServiceHeartbeat.java diff --git a/code/common/db/src/main/resources/sql/current/10-service-status.sql b/code/common/db/src/main/resources/sql/current/10-service-status.sql new file mode 100644 index 00000000..ca934785 --- /dev/null +++ b/code/common/db/src/main/resources/sql/current/10-service-status.sql @@ -0,0 +1,17 @@ +CREATE TABLE PROC_SERVICE_HEARTBEAT( + SERVICE_NAME VARCHAR(255) PRIMARY KEY COMMENT 'Full name of the service, including node id if applicable, e.g. search-service:0', + SERVICE_BASE VARCHAR(255) NOT NULL COMMENT 'Base name of the service, e.g. 
search-service', + INSTANCE VARCHAR(255) NOT NULL COMMENT 'UUID of the service instance', + ALIVE BOOLEAN NOT NULL DEFAULT TRUE COMMENT 'Set to false when the service is doing an orderly shutdown', + HEARTBEAT_TIME TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6) COMMENT 'Service was last seen at this point' +); + +CREATE TABLE PROC_SERVICE_EVENTLOG( + ID BIGINT AUTO_INCREMENT PRIMARY KEY COMMENT 'Unique id', + SERVICE_NAME VARCHAR(255) NOT NULL COMMENT 'Full name of the service, including node id if applicable, e.g. search-service:0', + SERVICE_BASE VARCHAR(255) NOT NULL COMMENT 'Base name of the service, e.g. search-service', + INSTANCE VARCHAR(255) NOT NULL COMMENT 'UUID of the service instance', + EVENT_TIME TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6) COMMENT 'Event time', + EVENT_TYPE VARCHAR(255) NOT NULL COMMENT 'Event type', + EVENT_MESSAGE VARCHAR(255) NOT NULL COMMENT 'Event message' +); \ No newline at end of file diff --git a/code/common/db/src/main/resources/sql/migrations/02-service-status.sql b/code/common/db/src/main/resources/sql/migrations/02-service-status.sql new file mode 100644 index 00000000..acb9645a --- /dev/null +++ b/code/common/db/src/main/resources/sql/migrations/02-service-status.sql @@ -0,0 +1,17 @@ +CREATE TABLE PROC_SERVICE_HEARTBEAT( + SERVICE_NAME VARCHAR(255) PRIMARY KEY COMMENT "Full name of the service, including node id if applicable, e.g. search-service:0", + SERVICE_BASE VARCHAR(255) NOT NULL COMMENT "Base name of the service, e.g. 
search-service", + INSTANCE VARCHAR(255) NOT NULL COMMENT "UUID of the service instance", + ALIVE BOOLEAN NOT NULL DEFAULT TRUE COMMENT "Set to false when the service is doing an orderly shutdown", + HEARTBEAT_TIME TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6) COMMENT "Service was last seen at this point" +); + +CREATE TABLE PROC_SERVICE_EVENTLOG( + ID BIGINT AUTO_INCREMENT PRIMARY KEY COMMENT "Unique id", + SERVICE_NAME VARCHAR(255) NOT NULL COMMENT "Full name of the service, including node id if applicable, e.g. search-service:0", + SERVICE_BASE VARCHAR(255) NOT NULL COMMENT "Base name of the service, e.g. search-service", + INSTANCE VARCHAR(255) NOT NULL COMMENT "UUID of the service instance", + EVENT_TIME TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6) COMMENT "Event time", + EVENT_TYPE VARCHAR(255) NOT NULL COMMENT "Event type", + EVENT_MESSAGE VARCHAR(255) NOT NULL COMMENT "Event message" +); \ No newline at end of file diff --git a/code/common/service-client/src/main/java/nu/marginalia/client/AbortingScheduler.java b/code/common/service-client/src/main/java/nu/marginalia/client/AbortingScheduler.java index 2ec196e6..f190bfe4 100644 --- a/code/common/service-client/src/main/java/nu/marginalia/client/AbortingScheduler.java +++ b/code/common/service-client/src/main/java/nu/marginalia/client/AbortingScheduler.java @@ -13,7 +13,6 @@ import java.util.concurrent.Executors; import java.util.concurrent.ThreadFactory; public class AbortingScheduler { - private final String name; private final ThreadFactory threadFactory; private final Logger logger = LoggerFactory.getLogger(getClass()); @@ -22,7 +21,6 @@ public class AbortingScheduler { private ExecutorService executorService; public AbortingScheduler(String name) { - this.name = name; threadFactory = new ThreadFactoryBuilder() .setNameFormat(name+"client--%d") .setUncaughtExceptionHandler(this::handleException) diff --git a/code/common/service-client/src/main/java/nu/marginalia/client/ServiceMonitors.java 
b/code/common/service-client/src/main/java/nu/marginalia/client/ServiceMonitors.java new file mode 100644 index 00000000..1ce8ae0c --- /dev/null +++ b/code/common/service-client/src/main/java/nu/marginalia/client/ServiceMonitors.java @@ -0,0 +1,144 @@ +package nu.marginalia.client; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.service.id.ServiceId; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.sql.SQLException; +import java.util.*; +import java.util.concurrent.TimeUnit; + +/** Polls PROC_SERVICE_HEARTBEAT from a daemon thread and keeps track of which services + * have a recent heartbeat. Subscribers are notified when that set changes. + */ +@Singleton +public class ServiceMonitors { + private final HikariDataSource dataSource; + private final Logger logger = LoggerFactory.getLogger(getClass()); + + private final Set<String> runningServices = new HashSet<>(); + private final Set<Runnable> callbacks = new HashSet<>(); + + + private final int heartbeatInterval = Integer.getInteger("mcp.heartbeat.interval", 5); + + private volatile boolean running; + + @Inject + public ServiceMonitors(HikariDataSource dataSource) { + this.dataSource = dataSource; + + var runThread = new Thread(this::run, "service monitor"); + runThread.setDaemon(true); + runThread.start(); + } + + /** Registers a callback that is run whenever the set of running services changes. */ + public void subscribe(Runnable callback) { + synchronized (callbacks) { + callbacks.add(callback); + } + } + /** Removes a callback previously registered with subscribe(). */ + public void unsubscribe(Runnable callback) { + synchronized (callbacks) { + callbacks.remove(callback); + } + } + + /** Poll loop; returns immediately if a loop is already running. */ + public void run() { + if (running) { + return; + } + else { + running = true; + } + + while (running) { + if (updateRunningServices()) { + runCallbacks(); + } + + try { + TimeUnit.SECONDS.sleep(heartbeatInterval); + } + catch (InterruptedException ex) { + logger.warn("ServiceMonitors interrupted", ex); + running = false; + Thread.currentThread().interrupt(); // restore the interrupt status that sleep() cleared + } + } + } + + private void runCallbacks() { + synchronized (callbacks) { + for (var callback : callbacks) { + synchronized (runningServices) { + callback.run(); + } + } + } + } + + /** Refreshes runningServices from the database; returns true if the set changed, false otherwise (including on SQL errors). */ + private boolean updateRunningServices() { + try (var
conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT SERVICE_BASE, TIMESTAMPDIFF(SECOND, HEARTBEAT_TIME, CURRENT_TIMESTAMP(6)) + FROM PROC_SERVICE_HEARTBEAT + WHERE ALIVE=1 + """)) { + try (var rs = stmt.executeQuery()) { + Set<String> newRunningServices = new HashSet<>(10); + while (rs.next()) { + String svc = rs.getString(1); + int dtime = rs.getInt(2); + if (dtime < 2.5 * heartbeatInterval) { + newRunningServices.add(svc); + } + } + + boolean changed; + + synchronized (runningServices) { + changed = !Objects.equals(runningServices, newRunningServices); + + runningServices.clear(); + runningServices.addAll(newRunningServices); + } + + return changed; + } + } + catch (SQLException ex) { + logger.warn("Failed to update running services", ex); + } + + return false; + } + + /** Returns true if the given service had a recent heartbeat at the last poll. */ + public boolean isServiceUp(ServiceId serviceId) { + synchronized (runningServices) { + return runningServices.contains(serviceId.name); + } + } + + /** Returns the running services that map to a known ServiceId; unknown names are skipped. */ + public List<ServiceId> getRunningServices() { + List<ServiceId> ret = new ArrayList<>(ServiceId.values().length); + + synchronized (runningServices) { + for (var runningService : runningServices) { + ServiceId id = ServiceId.byName(runningService); + if (id != null) ret.add(id); // byName() returns null for names not in the enum + } + } + + return ret; + } +} diff --git a/code/common/service-discovery/src/main/java/nu/marginalia/service/SearchServiceDescriptors.java b/code/common/service-discovery/src/main/java/nu/marginalia/service/SearchServiceDescriptors.java index a1f2bf13..c082bedb 100644 --- a/code/common/service-discovery/src/main/java/nu/marginalia/service/SearchServiceDescriptors.java +++ b/code/common/service-discovery/src/main/java/nu/marginalia/service/SearchServiceDescriptors.java @@ -13,5 +13,7 @@ public class SearchServiceDescriptors { new ServiceDescriptor(ServiceId.Search, 5023), new ServiceDescriptor(ServiceId.Assistant, 5025), new ServiceDescriptor(ServiceId.Dating, 5070), - new ServiceDescriptor(ServiceId.Explorer, 5071))); + new ServiceDescriptor(ServiceId.Explorer, 5071), + new ServiceDescriptor(ServiceId.Control, 
5090) + )); } diff --git a/code/common/service-discovery/src/main/java/nu/marginalia/service/id/ServiceId.java b/code/common/service-discovery/src/main/java/nu/marginalia/service/id/ServiceId.java index 92ffb4a7..ad459d36 100644 --- a/code/common/service-discovery/src/main/java/nu/marginalia/service/id/ServiceId.java +++ b/code/common/service-discovery/src/main/java/nu/marginalia/service/id/ServiceId.java @@ -7,19 +7,22 @@ public enum ServiceId { Search("search-service"), Index("index-service"), + Control("control-service"), + Dating("dating-service"), - Explorer("explorer-service"), - - Other_Auth("auth"), - Other_Memex("memex"), - - - Other_ResourceStore("resource-store"), - Other_Renderer("renderer"), - Other_PodcastScraper("podcast-scraper"); + Explorer("explorer-service"); public final String name; ServiceId(String name) { this.name = name; } + + public static ServiceId byName(String name) { + for (ServiceId id : values()) { + if (id.name.equals(name)) { + return id; + } + } + return null; + } } diff --git a/code/common/service/build.gradle b/code/common/service/build.gradle index bad65877..f153500b 100644 --- a/code/common/service/build.gradle +++ b/code/common/service/build.gradle @@ -12,6 +12,7 @@ java { dependencies { implementation project(':code:common:service-client') implementation project(':code:common:service-discovery') + implementation project(':code:common:db') implementation libs.lombok annotationProcessor libs.lombok diff --git a/code/common/service/readme.md b/code/common/service/readme.md index 9077e8d4..91741dc0 100644 --- a/code/common/service/readme.md +++ b/code/common/service/readme.md @@ -3,6 +3,52 @@ Contains the base classes for the services. This is where port configuration, and common endpoints are set up. +## Creating a new Service + +The minimal service needs a `MainClass` and a `Service` class. 
+ +For proper initialization, the main class should look like this: + +```java +public class FoobarMain extends MainClass { + + @Inject + public FoobarMain(FoobarService service) {} + + public static void main(String... args) { + init(ServiceId.Foobar, args); + + Injector injector = Guice.createInjector( + new FoobarModule(), /* optional custom bindings go here */ + new DatabaseModule(), + new ConfigurationModule(SearchServiceDescriptors.descriptors, + ServiceId.Foobar)); + + injector.getInstance(FoobarMain.class); + + // set the service as ready so that delayed tasks can be started + injector.getInstance(Initialization.class).setReady(); + } +} +``` + +A service class has a boilerplate set-up that looks like this: + +```java +@Singleton +public class FoobarService extends Service { + + @Inject + public FoobarService(BaseServiceParams params) { + super(params); + + // set up Spark endpoints here + } +} +``` + +Furthermore, the new service needs to be added to the `ServiceId` enum in [service-discovery](../service-discovery). + ## Central Classes * [MainClass](src/main/java/nu/marginalia/service/MainClass.java) bootstraps all executables diff --git a/code/common/service/src/main/java/nu/marginalia/service/MainClass.java b/code/common/service/src/main/java/nu/marginalia/service/MainClass.java index 26343581..c935e282 100644 --- a/code/common/service/src/main/java/nu/marginalia/service/MainClass.java +++ b/code/common/service/src/main/java/nu/marginalia/service/MainClass.java @@ -11,6 +11,9 @@ import org.slf4j.LoggerFactory; import java.net.SocketTimeoutException; import java.net.UnknownHostException; +/** Each main class of a service should extend this class. + * They must also invoke init() in their main method. 
+ */ public abstract class MainClass { private final Logger logger = LoggerFactory.getLogger(getClass()); diff --git a/code/common/service/src/main/java/nu/marginalia/service/control/ServiceEventLog.java b/code/common/service/src/main/java/nu/marginalia/service/control/ServiceEventLog.java new file mode 100644 index 00000000..217e670e --- /dev/null +++ b/code/common/service/src/main/java/nu/marginalia/service/control/ServiceEventLog.java @@ -0,0 +1,62 @@ +package nu.marginalia.service.control; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.service.module.ServiceConfiguration; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.sql.SQLException; +import java.util.Objects; +import java.util.UUID; + +/** Writes events for this service instance to the PROC_SERVICE_EVENTLOG table. + * A START event is logged when the instance is constructed. */ +@Singleton +public class ServiceEventLog { + private final HikariDataSource dataSource; + + private final Logger logger = LoggerFactory.getLogger(ServiceEventLog.class); + + private final String serviceName; + private final UUID instanceUuid; + private final String serviceBase; + + @Inject + public ServiceEventLog(HikariDataSource dataSource, + ServiceConfiguration configuration + ) { + this.dataSource = dataSource; + + this.serviceName = configuration.serviceName() + ":" + configuration.node(); + this.instanceUuid = configuration.instanceUuid(); + this.serviceBase = configuration.serviceName(); + + logger.info("Starting service {} instance {}", serviceName, instanceUuid); + + logEvent("START", "Service starting"); + } + + /** Inserts an event row for this service instance; a null message is stored as the empty string. + * SQL failures are logged and otherwise ignored. */ + public void logEvent(String type, String message) { + + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + INSERT INTO PROC_SERVICE_EVENTLOG(SERVICE_NAME, SERVICE_BASE, INSTANCE, EVENT_TYPE, EVENT_MESSAGE) + VALUES (?, ?, ?, ?, ?) 
+ """)) { + stmt.setString(1, serviceName); + stmt.setString(2, serviceBase); + stmt.setString(3, instanceUuid.toString()); + stmt.setString(4, type); + stmt.setString(5, Objects.requireNonNullElse(message, "")); + + stmt.executeUpdate(); + } + catch (SQLException ex) { + logger.error("Failed to log event {}:{}", type, message, ex); + } + } +} diff --git a/code/common/service/src/main/java/nu/marginalia/service/control/ServiceHeartbeat.java b/code/common/service/src/main/java/nu/marginalia/service/control/ServiceHeartbeat.java new file mode 100644 index 00000000..8850ae7f --- /dev/null +++ b/code/common/service/src/main/java/nu/marginalia/service/control/ServiceHeartbeat.java @@ -0,0 +1,145 @@ +package nu.marginalia.service.control; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.service.module.ServiceConfiguration; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.sql.SQLException; +import java.util.concurrent.TimeUnit; + +/** This service sends a heartbeat to the database every 5 seconds. 
+ */ +@Singleton +public class ServiceHeartbeat { + private final Logger logger = LoggerFactory.getLogger(ServiceHeartbeat.class); + private final String serviceName; + private final String serviceBase; + private final String instanceUUID; + private final HikariDataSource dataSource; + + private final Thread runnerThread; + private final int heartbeatInterval = Integer.getInteger("mcp.heartbeat.interval", 5); + + private volatile boolean running = false; + + @Inject + public ServiceHeartbeat(ServiceConfiguration configuration, + HikariDataSource dataSource) + { + this.serviceName = configuration.serviceName() + ":" + configuration.node(); + this.serviceBase = configuration.serviceName(); + this.dataSource = dataSource; + this.instanceUUID = configuration.instanceUuid().toString(); + runnerThread = new Thread(this::run); + + Runtime.getRuntime().addShutdownHook(new Thread(this::shutDown)); + } + + /** Starts the heartbeat thread unless the heartbeat loop is already running. */ + public void start() { + if (!running) { + runnerThread.start(); + } + } + + /** Stops the heartbeat loop, waits for its thread to finish and marks this instance as not alive; does nothing if the loop is not running. */ + public void shutDown() { + if (!running) + return; + + running = false; + + try { + runnerThread.join(); + heartbeatStop(); + } + catch (InterruptedException|SQLException ex) { + logger.warn("ServiceHeartbeat shutdown failed", ex); + } + } + + private void run() { // body of runnerThread + if (!running) + running = true; + else + return; + + try { + heartbeatInit(); + + while (running) { + try { + heartbeatUpdate(); + } + catch (SQLException ex) { + logger.warn("ServiceHeartbeat failed to update", ex); + } + + TimeUnit.SECONDS.sleep(heartbeatInterval); + } + } + catch (InterruptedException|SQLException ex) { + logger.error("ServiceHeartbeat caught irrecoverable exception, killing service", ex); + // Clear the flag first: System.exit() blocks until the shutdown hook returns, and with running still set the hook would join() this thread forever + running = false; + System.exit(255); + } + } + + private void heartbeatInit() throws SQLException { + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement( + """ + INSERT INTO PROC_SERVICE_HEARTBEAT (SERVICE_NAME, SERVICE_BASE, INSTANCE, HEARTBEAT_TIME, ALIVE) + VALUES (?, ?, ?, 
CURRENT_TIMESTAMP(6), 1) + ON DUPLICATE KEY UPDATE + INSTANCE = ?, + HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), + ALIVE = 1 + """ + )) + { + stmt.setString(1, serviceName); + stmt.setString(2, serviceBase); + stmt.setString(3, instanceUUID); + stmt.setString(4, instanceUUID); + stmt.executeUpdate(); + } + } + } + + private void heartbeatUpdate() throws SQLException { // refreshes the timestamp of this instance's row while it is marked alive + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement( + """ + UPDATE PROC_SERVICE_HEARTBEAT + SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6) + WHERE INSTANCE = ? AND ALIVE = 1 + """) + ) + { + stmt.setString(1, instanceUUID); + stmt.executeUpdate(); + } + } + } + + private void heartbeatStop() throws SQLException { // marks this instance's row as no longer alive + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement( + """ + UPDATE PROC_SERVICE_HEARTBEAT + SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), ALIVE = 0 + WHERE INSTANCE = ? + """) + ) + { + stmt.setString(1, instanceUUID); + stmt.executeUpdate(); + } + } + } +} diff --git a/code/common/service/src/main/java/nu/marginalia/service/module/ConfigurationModule.java b/code/common/service/src/main/java/nu/marginalia/service/module/ConfigurationModule.java index a0d763d0..62d1f9ce 100644 --- a/code/common/service/src/main/java/nu/marginalia/service/module/ConfigurationModule.java +++ b/code/common/service/src/main/java/nu/marginalia/service/module/ConfigurationModule.java @@ -8,9 +8,9 @@ import nu.marginalia.service.descriptor.ServiceDescriptors; import nu.marginalia.service.id.ServiceId; import java.util.Objects; +import java.util.UUID; public class ConfigurationModule extends AbstractModule { - private static final String SERVICE_NAME = System.getProperty("service-name"); private final ServiceDescriptors descriptors; private final ServiceId id; @@ -21,15 +21,13 @@ public class ConfigurationModule extends AbstractModule { public void configure() { bind(ServiceDescriptors.class).toInstance(descriptors); - 
bind(String.class).annotatedWith(Names.named("service-name")).toInstance(Objects.requireNonNull(SERVICE_NAME)); - bind(String.class).annotatedWith(Names.named("service-host")).toInstance(System.getProperty("service-host", "127.0.0.1")); - bind(Integer.class).annotatedWith(Names.named("service-port")).toInstance(descriptors.forId(id).port); - } - @Provides - @Named("metrics-server-port") - public Integer provideMetricsServerPort(@Named("service-port") Integer servicePort) { - return servicePort + 1000; + int basePort = descriptors.forId(id).port; + int prometheusPort = basePort + 1000; + String host = Objects.requireNonNull(System.getProperty("service-host", "127.0.0.1")); + var configObject = new ServiceConfiguration(id, 0, host, basePort, prometheusPort, UUID.randomUUID()); + + bind(ServiceConfiguration.class).toInstance(configObject); } } diff --git a/code/common/service/src/main/java/nu/marginalia/service/module/ServiceConfiguration.java b/code/common/service/src/main/java/nu/marginalia/service/module/ServiceConfiguration.java new file mode 100644 index 00000000..df97b7b0 --- /dev/null +++ b/code/common/service/src/main/java/nu/marginalia/service/module/ServiceConfiguration.java @@ -0,0 +1,27 @@ +package nu.marginalia.service.module; + +import nu.marginalia.service.id.ServiceId; + +import java.util.UUID; + +/** + * Configuration object for a service. This is a guice-injectable object + * intended to keep down the amount of named bindings. 
+ * + * @param serviceId - service descriptor + * @param node - always 0 for now, for future service partitioning + * @param host - the bind address of the service + * @param port - main port of the service + * @param metricsPort - prometheus metrics server port + * @param instanceUuid - unique identifier for this instance of the service + */ +public record ServiceConfiguration(ServiceId serviceId, + int node, + String host, + int port, + int metricsPort, + UUID instanceUuid) { + public String serviceName() { + return serviceId.name; + } +} diff --git a/code/common/service/src/main/java/nu/marginalia/service/server/BaseServiceParams.java b/code/common/service/src/main/java/nu/marginalia/service/server/BaseServiceParams.java new file mode 100644 index 00000000..1cd94b6c --- /dev/null +++ b/code/common/service/src/main/java/nu/marginalia/service/server/BaseServiceParams.java @@ -0,0 +1,30 @@ +package nu.marginalia.service.server; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.service.control.ServiceEventLog; +import nu.marginalia.service.control.ServiceHeartbeat; +import nu.marginalia.service.module.ServiceConfiguration; + +/** This class exists to reduce Service boilerplate */ +@Singleton +public class BaseServiceParams { + public final ServiceConfiguration configuration; + public final Initialization initialization; + public final MetricsServer metricsServer; + public final ServiceHeartbeat heartbeat; + public final ServiceEventLog eventLog; + + @Inject + public BaseServiceParams(ServiceConfiguration configuration, + Initialization initialization, + MetricsServer metricsServer, + ServiceHeartbeat heartbeat, + ServiceEventLog eventLog) { + this.configuration = configuration; + this.initialization = initialization; + this.metricsServer = metricsServer; + this.heartbeat = heartbeat; + this.eventLog = eventLog; + } +} diff --git a/code/common/service/src/main/java/nu/marginalia/service/server/Initialization.java 
b/code/common/service/src/main/java/nu/marginalia/service/server/Initialization.java index c7a857ea..e75db6fe 100644 --- a/code/common/service/src/main/java/nu/marginalia/service/server/Initialization.java +++ b/code/common/service/src/main/java/nu/marginalia/service/server/Initialization.java @@ -5,10 +5,14 @@ import lombok.SneakyThrows; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.util.ArrayList; +import java.util.List; + @Singleton public class Initialization { boolean initialized; private final Logger logger = LoggerFactory.getLogger(getClass()); + private final List callbacks = new ArrayList<>(); public static Initialization already() { Initialization init = new Initialization(); @@ -21,6 +25,27 @@ public class Initialization { logger.info("Initialized"); initialized = true; notifyAll(); + + } + + callbacks.forEach(Runnable::run); + callbacks.clear(); + } + + public void addCallback(Runnable callback) { + boolean runNow; + + synchronized (this) { + if (!initialized) { + callbacks.add(callback); + runNow = false; + } else { + runNow = true; + } + } + + if (runNow) { + callback.run(); } } diff --git a/code/common/service/src/main/java/nu/marginalia/service/server/MetricsServer.java b/code/common/service/src/main/java/nu/marginalia/service/server/MetricsServer.java index 1822b465..7dc52d9e 100644 --- a/code/common/service/src/main/java/nu/marginalia/service/server/MetricsServer.java +++ b/code/common/service/src/main/java/nu/marginalia/service/server/MetricsServer.java @@ -1,9 +1,9 @@ package nu.marginalia.service.server; import com.google.inject.Inject; -import com.google.inject.name.Named; import io.prometheus.client.exporter.MetricsServlet; import lombok.SneakyThrows; +import nu.marginalia.service.module.ServiceConfiguration; import org.eclipse.jetty.server.Server; import org.eclipse.jetty.servlet.ServletContextHandler; import org.eclipse.jetty.servlet.ServletHolder; @@ -12,8 +12,8 @@ public class MetricsServer { @SneakyThrows @Inject 
- public MetricsServer(@Named("metrics-server-port") int port) { - Server server = new Server(port); + public MetricsServer(ServiceConfiguration configuration) { + Server server = new Server(configuration.metricsPort()); ServletContextHandler context = new ServletContextHandler(); context.setContextPath("/"); server.setHandler(context); diff --git a/code/common/service/src/main/java/nu/marginalia/service/server/Service.java b/code/common/service/src/main/java/nu/marginalia/service/server/Service.java index e5c04877..5a287c99 100644 --- a/code/common/service/src/main/java/nu/marginalia/service/server/Service.java +++ b/code/common/service/src/main/java/nu/marginalia/service/server/Service.java @@ -1,6 +1,5 @@ package nu.marginalia.service.server; -import com.google.common.base.Strings; import io.prometheus.client.Counter; import nu.marginalia.client.Context; import nu.marginalia.client.exception.MessagingException; @@ -35,22 +34,28 @@ public class Service { .labelNames("service") .register(); private final String serviceName; - private static volatile boolean initialized = false; - public Service(String ip, int port, Initialization initialization, MetricsServer metricsServer, Runnable configureStaticFiles) { - this.initialization = initialization; + public Service(BaseServiceParams params, + Runnable configureStaticFiles + ) { + this.initialization = params.initialization; serviceName = System.getProperty("service-name"); + initialization.addCallback(params.heartbeat::start); + initialization.addCallback(() -> params.eventLog.logEvent("SVC-INIT", "")); + if (!initialization.isReady() && ! initialized ) { initialized = true; Spark.threadPool(32, 4, 60_000); - Spark.ipAddress(ip); - Spark.port(port); + Spark.ipAddress(params.configuration.host()); + Spark.port(params.configuration.port()); - logger.info("{} Listening to {}:{}", getClass().getSimpleName(), ip == null ? 
"" : ip, port); + logger.info("{} Listening to {}:{}", getClass().getSimpleName(), + params.configuration.host(), + params.configuration.port()); configureStaticFiles.run(); @@ -66,8 +71,8 @@ public class Service { } } - public Service(String ip, int port, Initialization initialization, MetricsServer metricsServer) { - this(ip, port, initialization, metricsServer, () -> { + public Service(BaseServiceParams params) { + this(params, () -> { // configureStaticFiles can't be an overridable method in Service because it may // need to depend on parameters to the constructor, and super-constructors // must run first diff --git a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/AssistantService.java b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/AssistantService.java index c0d908fd..3992986b 100644 --- a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/AssistantService.java +++ b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/AssistantService.java @@ -2,7 +2,6 @@ package nu.marginalia.assistant; import com.google.gson.Gson; import com.google.inject.Inject; -import com.google.inject.name.Named; import lombok.SneakyThrows; import nu.marginalia.assistant.eval.Units; import nu.marginalia.assistant.suggest.Suggestions; @@ -10,9 +9,7 @@ import nu.marginalia.assistant.eval.MathParser; import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.screenshot.ScreenshotService; import nu.marginalia.assistant.dict.DictionaryService; -import nu.marginalia.service.server.Initialization; -import nu.marginalia.service.server.MetricsServer; -import nu.marginalia.service.server.Service; +import nu.marginalia.service.server.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import spark.Request; @@ -28,18 +25,15 @@ public class AssistantService extends Service { @SneakyThrows @Inject - public AssistantService(@Named("service-host") String ip, - @Named("service-port") 
Integer port, - Initialization initialization, - MetricsServer metricsServer, + public AssistantService(BaseServiceParams params, DictionaryService dictionaryService, MathParser mathParser, Units units, ScreenshotService screenshotService, - Suggestions suggestions - ) + Suggestions suggestions) { - super(ip, port, initialization, metricsServer); + super(params); + this.mathParser = mathParser; this.units = units; this.suggestions = suggestions; diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexModule.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexModule.java index 1e674d01..80f8187a 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexModule.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexModule.java @@ -8,6 +8,7 @@ import nu.marginalia.WmsaHome; import nu.marginalia.lexicon.KeywordLexicon; import nu.marginalia.lexicon.KeywordLexiconReadOnlyView; import nu.marginalia.lexicon.journal.KeywordLexiconJournal; +import nu.marginalia.service.control.ServiceEventLog; import java.nio.file.Path; @@ -20,13 +21,20 @@ public class IndexModule extends AbstractModule { @Provides @SneakyThrows - private KeywordLexiconReadOnlyView createLexicon() { - return new KeywordLexiconReadOnlyView( - new KeywordLexicon( - new KeywordLexiconJournal(WmsaHome.getDisk("index-write").resolve("dictionary.dat").toFile() + private KeywordLexiconReadOnlyView createLexicon(ServiceEventLog eventLog) { + try { + eventLog.logEvent("INDEX-LEXICON-LOAD-BEGIN", ""); + + return new KeywordLexiconReadOnlyView( + new KeywordLexicon( + new KeywordLexiconJournal(WmsaHome.getDisk("index-write").resolve("dictionary.dat").toFile() + ) ) - ) - ); + ); + } + finally { + eventLog.logEvent("INDEX-LEXICON-LOAD-OK", ""); + } } @Provides diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexService.java 
b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexService.java index 8d4a7984..369e8309 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexService.java @@ -2,16 +2,14 @@ package nu.marginalia.index; import com.google.gson.Gson; import com.google.inject.Inject; -import com.google.inject.name.Named; import io.reactivex.rxjava3.schedulers.Schedulers; import nu.marginalia.index.index.SearchIndex; import nu.marginalia.index.svc.IndexOpsService; import nu.marginalia.index.svc.IndexQueryService; import nu.marginalia.index.svc.IndexSearchSetsService; import nu.marginalia.model.gson.GsonFactory; -import nu.marginalia.service.server.Initialization; -import nu.marginalia.service.server.MetricsServer; -import nu.marginalia.service.server.Service; +import nu.marginalia.service.control.ServiceEventLog; +import nu.marginalia.service.server.*; import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -34,28 +32,29 @@ public class IndexService extends Service { private final IndexServicesFactory servicesFactory; private final IndexSearchSetsService searchSetsService; + private final ServiceEventLog eventLog; @Inject - public IndexService(@Named("service-host") String ip, - @Named("service-port") Integer port, - Initialization init, - MetricsServer metricsServer, + public IndexService(BaseServiceParams params, IndexOpsService opsService, IndexQueryService indexQueryService, SearchIndex searchIndex, IndexServicesFactory servicesFactory, - IndexSearchSetsService searchSetsService) + IndexSearchSetsService searchSetsService, + ServiceEventLog eventLog) { - super(ip, port, init, metricsServer); + super(params); + this.opsService = opsService; this.searchIndex = searchIndex; this.servicesFactory = servicesFactory; this.searchSetsService = searchSetsService; + this.eventLog = eventLog; final Gson 
gson = GsonFactory.get(); - this.init = init; + this.init = params.initialization; Spark.post("/search/", indexQueryService::search, gson::toJson); @@ -94,9 +93,11 @@ public class IndexService extends Service { } try { + eventLog.logEvent("INDEX-AUTO-CONVERT-BEGIN", ""); logger.info("Auto-converting"); searchSetsService.recalculateAll(); searchIndex.switchIndex(); + eventLog.logEvent("INDEX-AUTO-CONVERT-END", ""); logger.info("Auto-conversion finished!"); } catch (IOException ex) { diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java index c218caab..397c291c 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java @@ -6,6 +6,7 @@ import nu.marginalia.index.IndexServicesFactory; import nu.marginalia.index.query.*; import nu.marginalia.index.query.filter.QueryFilterStepFromPredicate; import nu.marginalia.index.svc.IndexSearchSetsService; +import nu.marginalia.service.control.ServiceEventLog; import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -36,10 +37,15 @@ public class SearchIndex { private final IndexServicesFactory servicesFactory; private final IndexSearchSetsService searchSetsService; + private final ServiceEventLog eventLog; + @Inject - public SearchIndex(@NotNull IndexServicesFactory servicesFactory, IndexSearchSetsService searchSetsService) { + public SearchIndex(@NotNull IndexServicesFactory servicesFactory, + IndexSearchSetsService searchSetsService, + ServiceEventLog eventLog) { this.servicesFactory = servicesFactory; this.searchSetsService = searchSetsService; + this.eventLog = eventLog; } public void init() { @@ -51,7 +57,13 @@ public class SearchIndex { if (indexReader == null) { indexReader = 
servicesFactory.getSearchIndexReader(); + eventLog.logEvent("INDEX-INIT", "Index loaded"); } + else { + eventLog.logEvent("INDEX-INIT", "No index loaded"); + } + + } catch (Exception ex) { logger.error("Uncaught exception", ex); @@ -63,9 +75,12 @@ public class SearchIndex { public boolean switchIndex() throws IOException { + eventLog.logEvent("CONVERT-INDEX-BEGIN", ""); servicesFactory.convertIndex(searchSetsService.getDomainRankings()); + eventLog.logEvent("CONVERT-INDEX-END", ""); System.gc(); + eventLog.logEvent("INDEX-SWITCH-BEGIN", ""); Lock lock = indexReplacementLock.writeLock(); try { lock.lock(); @@ -73,11 +88,15 @@ public class SearchIndex { servicesFactory.switchFilesJob().call(); indexReader = servicesFactory.getSearchIndexReader(); + + eventLog.logEvent("INDEX-SWITCH-OK", ""); } catch (Exception ex) { + eventLog.logEvent("INDEX-SWITCH-ERR", ""); logger.error("Uncaught exception", ex); } finally { + lock.unlock(); } diff --git a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTestModule.java b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTestModule.java index a2962027..2b573c92 100644 --- a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTestModule.java +++ b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTestModule.java @@ -1,11 +1,8 @@ package nu.marginalia.index.svc; import com.google.inject.AbstractModule; -import com.google.inject.name.Names; -import nu.marginalia.WmsaHome; import nu.marginalia.index.IndexServicesFactory; import nu.marginalia.index.journal.writer.IndexJournalWriter; -import nu.marginalia.index.journal.writer.IndexJournalWriterImpl; import nu.marginalia.lexicon.KeywordLexicon; import nu.marginalia.lexicon.KeywordLexiconReadOnlyView; import nu.marginalia.lexicon.journal.KeywordLexiconJournal; @@ -13,12 +10,17 @@ import 
nu.marginalia.ranking.DomainRankings; import nu.marginalia.index.svc.searchset.SearchSetAny; import nu.marginalia.index.util.TestUtil; import nu.marginalia.index.client.model.query.SearchSetIdentifier; +import nu.marginalia.service.control.ServiceEventLog; +import nu.marginalia.service.control.ServiceHeartbeat; +import nu.marginalia.service.id.ServiceId; +import nu.marginalia.service.module.ServiceConfiguration; import org.mockito.Mockito; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.util.Random; +import java.util.UUID; import static org.mockito.Mockito.when; @@ -62,8 +64,18 @@ public class IndexQueryServiceIntegrationTestModule extends AbstractModule { bind(IndexJournalWriter.class).toInstance(servicesFactory.createIndexJournalWriter(keywordLexicon)); - bind(String.class).annotatedWith(Names.named("service-host")).toInstance("127.0.0.1"); - bind(Integer.class).annotatedWith(Names.named("service-port")).toProvider(this::randomPort); + bind(ServiceEventLog.class).toInstance(Mockito.mock(ServiceEventLog.class)); + bind(ServiceHeartbeat.class).toInstance(Mockito.mock(ServiceHeartbeat.class)); + + bind(ServiceConfiguration.class).toInstance(new ServiceConfiguration( + ServiceId.Index, + 0, + "127.0.0.1", + randomPort(), + randomPort(), + UUID.randomUUID() + )); + } catch (IOException e) { throw new RuntimeException(e); } diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/SearchService.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/SearchService.java index 66953dde..61ff69c3 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/SearchService.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/SearchService.java @@ -2,17 +2,13 @@ package nu.marginalia.search; import com.google.gson.Gson; import com.google.inject.Inject; -import com.google.inject.name.Named; import lombok.SneakyThrows; import 
nu.marginalia.WebsiteUrl; import nu.marginalia.client.Context; import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.search.svc.SearchFrontPageService; import nu.marginalia.search.svc.*; -import nu.marginalia.service.server.Initialization; -import nu.marginalia.service.server.MetricsServer; -import nu.marginalia.service.server.Service; -import nu.marginalia.service.server.StaticResources; +import nu.marginalia.service.server.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import spark.Request; @@ -31,10 +27,7 @@ public class SearchService extends Service { @SneakyThrows @Inject - public SearchService(@Named("service-host") String ip, - @Named("service-port") Integer port, - Initialization initialization, - MetricsServer metricsServer, + public SearchService(BaseServiceParams params, WebsiteUrl websiteUrl, StaticResources staticResources, SearchFrontPageService frontPageService, @@ -44,7 +37,7 @@ public class SearchService extends Service { SearchQueryService searchQueryService, SearchApiQueryService apiQueryService ) { - super(ip, port, initialization, metricsServer); + super(params); this.websiteUrl = websiteUrl; this.staticResources = staticResources; diff --git a/code/services-satellite/api-service/src/main/java/nu/marginalia/api/ApiService.java b/code/services-satellite/api-service/src/main/java/nu/marginalia/api/ApiService.java index cd658831..4da8c0f6 100644 --- a/code/services-satellite/api-service/src/main/java/nu/marginalia/api/ApiService.java +++ b/code/services-satellite/api-service/src/main/java/nu/marginalia/api/ApiService.java @@ -2,7 +2,6 @@ package nu.marginalia.api; import com.google.gson.Gson; import com.google.inject.Inject; -import com.google.inject.name.Named; import nu.marginalia.api.model.ApiLicense; import nu.marginalia.api.svc.LicenseService; import nu.marginalia.api.svc.RateLimiterService; @@ -11,9 +10,7 @@ import nu.marginalia.client.Context; import nu.marginalia.model.gson.GsonFactory; import 
nu.marginalia.search.client.SearchClient; import nu.marginalia.search.client.model.ApiSearchResults; -import nu.marginalia.service.server.Initialization; -import nu.marginalia.service.server.MetricsServer; -import nu.marginalia.service.server.Service; +import nu.marginalia.service.server.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.slf4j.Marker; @@ -36,16 +33,14 @@ public class ApiService extends Service { private final Marker queryMarker = MarkerFactory.getMarker("QUERY"); @Inject - public ApiService(@Named("service-host") String ip, - @Named("service-port") Integer port, - Initialization initialization, - MetricsServer metricsServer, + public ApiService(BaseServiceParams params, SearchClient searchClient, ResponseCache responseCache, LicenseService licenseService, - RateLimiterService rateLimiterService) { + RateLimiterService rateLimiterService + ) { - super(ip, port, initialization, metricsServer); + super(params); this.searchClient = searchClient; this.responseCache = responseCache; diff --git a/code/services-satellite/control-service/build.gradle b/code/services-satellite/control-service/build.gradle new file mode 100644 index 00000000..1bb9bfdc --- /dev/null +++ b/code/services-satellite/control-service/build.gradle @@ -0,0 +1,64 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "5.3.3.3" + id 'application' + id 'com.palantir.docker' version '0.34.0' + id 'jvm-test-suite' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +application { + mainClass = 'nu.marginalia.control.ControlMain' + applicationName = 'control-service' +} + +tasks.distZip.enabled = false + +apply from: "$rootProject.projectDir/docker-service.gradle" + +dependencies { + implementation project(':code:common:db') + implementation project(':code:common:model') + implementation project(':code:common:service') + implementation project(':code:common:config') + implementation project(':code:common:service-discovery') + 
implementation project(':code:common:service-client') + implementation project(':code:api:search-api') + + + implementation libs.lombok + annotationProcessor libs.lombok + implementation libs.bundles.slf4j + + implementation libs.prometheus + implementation libs.notnull + implementation libs.guice + implementation libs.trove + implementation libs.spark + implementation libs.fastutil + implementation libs.bundles.gson + implementation libs.bundles.mariadb + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito + testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4') + testImplementation 'org.testcontainers:mariadb:1.17.4' + testImplementation 'org.testcontainers:junit-jupiter:1.17.4' +} + +test { + useJUnitPlatform() +} + +task fastTests(type: Test) { + useJUnitPlatform { + excludeTags "slow" + } +} + diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlMain.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlMain.java new file mode 100644 index 00000000..e3d12163 --- /dev/null +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlMain.java @@ -0,0 +1,29 @@ +package nu.marginalia.control; + +import com.google.inject.Guice; +import com.google.inject.Inject; +import com.google.inject.Injector; +import nu.marginalia.service.MainClass; +import nu.marginalia.service.SearchServiceDescriptors; +import nu.marginalia.service.id.ServiceId; +import nu.marginalia.service.module.ConfigurationModule; +import nu.marginalia.service.module.DatabaseModule; +import nu.marginalia.service.server.Initialization; + +public class ControlMain extends MainClass { + + @Inject + public ControlMain(ControlService service) { + } + + public static void main(String... 
args) { + init(ServiceId.Control, args); + + Injector injector = Guice.createInjector( + new DatabaseModule(), + new ConfigurationModule(SearchServiceDescriptors.descriptors, ServiceId.Control)); + + injector.getInstance(ControlMain.class); + injector.getInstance(Initialization.class).setReady(); + } +} diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java new file mode 100644 index 00000000..952559e0 --- /dev/null +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java @@ -0,0 +1,42 @@ +package nu.marginalia.control; + +import com.google.gson.Gson; +import com.google.inject.Inject; +import nu.marginalia.client.ServiceMonitors; +import nu.marginalia.model.gson.GsonFactory; +import nu.marginalia.service.server.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import spark.Spark; + +public class ControlService extends Service { + + private final Logger logger = LoggerFactory.getLogger(getClass()); + private final Gson gson = GsonFactory.get(); + + private final ServiceMonitors monitors; + + + @Inject + public ControlService(BaseServiceParams params, + ServiceMonitors monitors, + HeartbeatService heartbeatService + ) { + + super(params); + this.monitors = monitors; + + Spark.get("/public/heartbeats", (req, res) -> { + res.type("application/json"); + return heartbeatService.getHeartbeats(); + }, gson::toJson); + + monitors.subscribe(this::logMonitorStateChange); + + } + + private void logMonitorStateChange() { + logger.info("Service state change: {}", monitors.getRunningServices()); + } + +} diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/HeartbeatService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/HeartbeatService.java new file mode 100644 index 00000000..d0fd67cb --- /dev/null +++ 
b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/HeartbeatService.java @@ -0,0 +1,48 @@ +package nu.marginalia.control; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.control.model.ServiceHeartbeat; + +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.List; +/** Reads the rows of the PROC_SERVICE_HEARTBEAT table and exposes them as ServiceHeartbeat records. */ +@Singleton +public class HeartbeatService { + private final HikariDataSource dataSource; + + @Inject + public HeartbeatService(HikariDataSource dataSource) { + this.dataSource = dataSource; + } + /** Returns one record per heartbeat row; lastSeenMillis is the time since HEARTBEAT_TIME in milliseconds. An SQLException is rethrown wrapped in a RuntimeException. */ + public List<ServiceHeartbeat> getHeartbeats() { + List<ServiceHeartbeat> heartbeats = new ArrayList<>(); + + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT SERVICE_NAME, SERVICE_BASE, INSTANCE, ALIVE, + TIMESTAMPDIFF(MICROSECOND, HEARTBEAT_TIME, CURRENT_TIMESTAMP(6)) AS TSDIFF + FROM PROC_SERVICE_HEARTBEAT + """)) { + /* TSDIFF is in microseconds and leaves the int range after about 35 minutes, so it is read as a long before being scaled to milliseconds. */ + var rs = stmt.executeQuery(); + while (rs.next()) { + heartbeats.add(new ServiceHeartbeat( + rs.getString("SERVICE_NAME"), + rs.getString("SERVICE_BASE"), + rs.getString("INSTANCE"), + rs.getLong("TSDIFF") / 1000., + rs.getBoolean("ALIVE") + )); + } + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + + return heartbeats; + } +} diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ServiceHeartbeat.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ServiceHeartbeat.java new file mode 100644 index 00000000..cc0dcef4 --- /dev/null +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ServiceHeartbeat.java @@ -0,0 +1,11 @@ +package nu.marginalia.control.model; + +public record ServiceHeartbeat( + String serviceId, + String serviceBase, + String uuid, + double lastSeenMillis, + boolean alive +) { + +} diff --git a/code/services-satellite/dating-service/src/main/java/nu/marginalia/dating/DatingService.java
b/code/services-satellite/dating-service/src/main/java/nu/marginalia/dating/DatingService.java index d39f5a0c..c5c8a3cd 100644 --- a/code/services-satellite/dating-service/src/main/java/nu/marginalia/dating/DatingService.java +++ b/code/services-satellite/dating-service/src/main/java/nu/marginalia/dating/DatingService.java @@ -1,7 +1,6 @@ package nu.marginalia.dating; import com.google.inject.Inject; -import com.google.inject.name.Named; import lombok.SneakyThrows; import nu.marginalia.browse.DbBrowseDomainsRandom; import nu.marginalia.browse.DbBrowseDomainsSimilarCosine; @@ -11,9 +10,7 @@ import nu.marginalia.renderer.MustacheRenderer; import nu.marginalia.renderer.RendererFactory; import nu.marginalia.screenshot.ScreenshotService; import nu.marginalia.model.id.EdgeId; -import nu.marginalia.service.server.Initialization; -import nu.marginalia.service.server.MetricsServer; -import nu.marginalia.service.server.Service; +import nu.marginalia.service.server.*; import org.jetbrains.annotations.NotNull; import spark.Request; import spark.Response; @@ -33,17 +30,15 @@ public class DatingService extends Service { private final String SESSION_OBJECT_NAME = "so"; @SneakyThrows @Inject - public DatingService(@Named("service-host") String ip, - @Named("service-port") Integer port, + public DatingService(BaseServiceParams params, RendererFactory rendererFactory, - Initialization initialization, - MetricsServer metricsServer, DomainBlacklist blacklist, DbBrowseDomainsSimilarCosine browseSimilarCosine, DbBrowseDomainsRandom browseRandom, - ScreenshotService screenshotService) { + ScreenshotService screenshotService) + { - super(ip, port, initialization, metricsServer); + super(params); this.blacklist = blacklist; diff --git a/code/services-satellite/explorer-service/src/main/java/nu/marginalia/explorer/ExplorerService.java b/code/services-satellite/explorer-service/src/main/java/nu/marginalia/explorer/ExplorerService.java index 8f967bb1..a753eed9 100644 --- 
a/code/services-satellite/explorer-service/src/main/java/nu/marginalia/explorer/ExplorerService.java +++ b/code/services-satellite/explorer-service/src/main/java/nu/marginalia/explorer/ExplorerService.java @@ -1,15 +1,11 @@ package nu.marginalia.explorer; import com.google.inject.Inject; -import com.google.inject.name.Named; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; import nu.marginalia.renderer.MustacheRenderer; import nu.marginalia.renderer.RendererFactory; -import nu.marginalia.service.server.Initialization; -import nu.marginalia.service.server.MetricsServer; -import nu.marginalia.service.server.Service; -import nu.marginalia.service.server.StaticResources; +import nu.marginalia.service.server.*; import org.jetbrains.annotations.NotNull; import spark.Request; import spark.Response; @@ -42,16 +38,13 @@ public class ExplorerService extends Service { @SneakyThrows @Inject - public ExplorerService(@Named("service-host") String ip, - @Named("service-port") Integer port, - Initialization initialization, - MetricsServer metricsServer, - RendererFactory rendererFactory, - HikariDataSource dataSource, + public ExplorerService(BaseServiceParams params, + RendererFactory rendererFactory, + HikariDataSource dataSource, StaticResources staticResources ) { - super(ip, port, initialization, metricsServer); + super(params); renderer = rendererFactory.renderer("explorer/explorer"); diff --git a/docker-compose.yml b/docker-compose.yml index e8247deb..fc88dcc3 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -71,6 +71,16 @@ services: - "127.0.0.1:7071:4000" depends_on: - mariadb + control-service: + <<: *service + image: "marginalia.nu/control-service" + container_name: "control-service" + ports: + - "127.0.0.1:5090:5090" + - "127.0.0.1:4090:5000" + - "127.0.0.1:7090:4000" + depends_on: + - mariadb mariadb: image: "mariadb/server:10.3" container_name: "mariadb" diff --git a/run/nginx-site.conf b/run/nginx-site.conf index 327287b1..90f93ff9 
100644 --- a/run/nginx-site.conf +++ b/run/nginx-site.conf @@ -33,6 +33,9 @@ server { proxy_pass http://assistant-service:5025/public$request_uri; access_log off; } + location /control/ { + proxy_pass http://control-service:5090/public/; + } location / { proxy_pass http://search-service:5023/public/; } diff --git a/settings.gradle b/settings.gradle index 1e59fa5b..90d74f99 100644 --- a/settings.gradle +++ b/settings.gradle @@ -7,6 +7,7 @@ include 'code:services-core:search-service' include 'code:services-satellite:api-service' include 'code:services-satellite:dating-service' include 'code:services-satellite:explorer-service' +include 'code:services-satellite:control-service' include 'code:libraries:array' include 'code:libraries:btree' From 31ae71c7d663648242a975205e0151ccb96820df Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 3 Jul 2023 11:04:08 +0200 Subject: [PATCH 002/157] Message queue WIP --- .../sql/current/11-message-queue.sql | 20 ++ .../sql/migrations/03-message-queue.sql | 23 ++ code/common/message-queue/build.gradle | 48 ++++ code/common/message-queue/msgstate.svg | 4 + code/common/message-queue/readme.md | 5 + .../java/nu/marginalia/mq/MqException.java | 11 + .../main/java/nu/marginalia/mq/MqMessage.java | 10 + .../java/nu/marginalia/mq/MqMessageState.java | 9 + .../java/nu/marginalia/mq/inbox/MqInbox.java | 185 ++++++++++++++ .../marginalia/mq/inbox/MqInboxResponse.java | 22 ++ .../marginalia/mq/inbox/MqSubscription.java | 9 + .../nu/marginalia/mq/outbox/MqOutbox.java | 107 ++++++++ .../mq/persistence/MqPersistence.java | 237 ++++++++++++++++++ .../nu/marginalia/mq/outbox/MqMessageRow.java | 21 ++ .../nu/marginalia/mq/outbox/MqOutboxTest.java | 177 +++++++++++++ .../mq/outbox/MqPersistenceTest.java | 189 ++++++++++++++ .../nu/marginalia/mq/outbox/MqTestUtil.java | 52 ++++ settings.gradle | 1 + 18 files changed, 1130 insertions(+) create mode 100644 code/common/db/src/main/resources/sql/current/11-message-queue.sql create mode 100644 
code/common/db/src/main/resources/sql/migrations/03-message-queue.sql create mode 100644 code/common/message-queue/build.gradle create mode 100644 code/common/message-queue/msgstate.svg create mode 100644 code/common/message-queue/readme.md create mode 100644 code/common/message-queue/src/main/java/nu/marginalia/mq/MqException.java create mode 100644 code/common/message-queue/src/main/java/nu/marginalia/mq/MqMessage.java create mode 100644 code/common/message-queue/src/main/java/nu/marginalia/mq/MqMessageState.java create mode 100644 code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInbox.java create mode 100644 code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInboxResponse.java create mode 100644 code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSubscription.java create mode 100644 code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java create mode 100644 code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java create mode 100644 code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqMessageRow.java create mode 100644 code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqOutboxTest.java create mode 100644 code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqPersistenceTest.java create mode 100644 code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqTestUtil.java diff --git a/code/common/db/src/main/resources/sql/current/11-message-queue.sql b/code/common/db/src/main/resources/sql/current/11-message-queue.sql new file mode 100644 index 00000000..97e20d5a --- /dev/null +++ b/code/common/db/src/main/resources/sql/current/11-message-queue.sql @@ -0,0 +1,20 @@ +CREATE TABLE PROC_MESSAGE( + ID BIGINT AUTO_INCREMENT PRIMARY KEY COMMENT 'Unique id', + + RELATED_ID BIGINT NOT NULL DEFAULT -1 COMMENT 'Unique id a related message', + SENDER_INBOX VARCHAR(255) COMMENT 'Name of the sender inbox', + + RECIPIENT_INBOX VARCHAR(255) 
NOT NULL COMMENT 'Name of the recipient inbox', + FUNCTION VARCHAR(255) NOT NULL COMMENT 'Which function to run', + PAYLOAD TEXT COMMENT 'Message to recipient', + + OWNER_INSTANCE VARCHAR(255) COMMENT 'Instance UUID corresponding to the party that has claimed the message', + OWNER_TICK BIGINT DEFAULT -1 COMMENT 'Used by recipient to determine which messages it has processed', + + STATE ENUM('NEW', 'ACK', 'OK', 'ERR', 'DEAD') + NOT NULL DEFAULT 'NEW' COMMENT 'Processing state', + + CREATED_TIME TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6) COMMENT 'Time of creation', + UPDATED_TIME TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6) COMMENT 'Time of last update', + TTL INT COMMENT 'Time to live in seconds' +); diff --git a/code/common/db/src/main/resources/sql/migrations/03-message-queue.sql b/code/common/db/src/main/resources/sql/migrations/03-message-queue.sql new file mode 100644 index 00000000..d357650e --- /dev/null +++ b/code/common/db/src/main/resources/sql/migrations/03-message-queue.sql @@ -0,0 +1,23 @@ +CREATE TABLE PROC_MESSAGE( + ID BIGINT AUTO_INCREMENT PRIMARY KEY COMMENT 'Unique id', + + RELATED_ID BIGINT COMMENT 'Unique id a related message', + SENDER_INBOX VARCHAR(255) COMMENT 'Name of the sender inbox', + + RECIPIENT_INBOX VARCHAR(255) NOT NULL COMMENT 'Name of the recipient inbox', + FUNCTION VARCHAR(255) NOT NULL COMMENT 'Which function to run', + PAYLOAD TEXT COMMENT 'Message to recipient', + + -- These fields are used to avoid double processing of messages + -- instance marks the unique instance of the party, and the tick marks + -- the current polling iteration. Both are necessary. 
+ OWNER_INSTANCE VARCHAR(255) COMMENT 'Instance UUID corresponding to the party that has claimed the message', + OWNER_TICK BIGINT DEFAULT -1 COMMENT 'Used by recipient to determine which messages it has processed', + + STATE ENUM('NEW', 'ACK', 'OK', 'ERR', 'DEAD') + NOT NULL DEFAULT 'NEW' COMMENT 'Processing state', + + CREATED_TIME TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6) COMMENT 'Time of creation', + UPDATED_TIME TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6) COMMENT 'Time of last update', + TTL INT COMMENT 'Time to live in seconds' +); diff --git a/code/common/message-queue/build.gradle b/code/common/message-queue/build.gradle new file mode 100644 index 00000000..84ea9651 --- /dev/null +++ b/code/common/message-queue/build.gradle @@ -0,0 +1,48 @@ +plugins { + id 'java' +} + + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +dependencies { + implementation project(':code:common:service-client') + implementation project(':code:common:service-discovery') + implementation project(':code:common:db') + + implementation libs.lombok + annotationProcessor libs.lombok + + implementation libs.spark + implementation libs.guice + implementation libs.rxjava + + implementation libs.bundles.prometheus + implementation libs.bundles.slf4j + implementation libs.bucket4j + + testImplementation libs.bundles.slf4j.test + implementation libs.bundles.mariadb + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito + + testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4') + testImplementation 'org.testcontainers:mariadb:1.17.4' + testImplementation 'org.testcontainers:junit-jupiter:1.17.4' +} + +test { + useJUnitPlatform() +} + +task fastTests(type: Test) { + useJUnitPlatform { + excludeTags "slow" + } +} diff --git a/code/common/message-queue/msgstate.svg b/code/common/message-queue/msgstate.svg new file mode 100644 index 00000000..22691893 --- /dev/null 
+++ b/code/common/message-queue/msgstate.svg @@ -0,0 +1,4 @@ + + + +
If the message is not
acknowledged, it may
be declared dead after
TTL
If the message is not...
Inbox acknowledges the message
Inbox acknowledges the message
New
New
Message processing
failed
Message processing...
If the message doesn't
finish within TTL it will
be marked as dead
If the message doesn't...
Message processed
OK, sender may
receive a reply in their
inbox
Message processed...
Ack
Ack
Ok
Ok
Err
Err
Dead
Dead
Terminal States
Terminal S...
Intermediate States
Intermedia...
Initial State
Initial St...

Message States

Messages pass through several states during their lifecycle

Message States...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/code/common/message-queue/readme.md b/code/common/message-queue/readme.md new file mode 100644 index 00000000..68ae2825 --- /dev/null +++ b/code/common/message-queue/readme.md @@ -0,0 +1,5 @@ +# Message Queue + +Implements a message queue using mariadb. + +![Message States](msgstate.svg) \ No newline at end of file diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/MqException.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/MqException.java new file mode 100644 index 00000000..351f60d7 --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/MqException.java @@ -0,0 +1,11 @@ +package nu.marginalia.mq; + +public class MqException extends Exception { + public MqException(String message) { + super(message); + } + + public MqException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/MqMessage.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/MqMessage.java new file mode 100644 index 00000000..5f4c11aa --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/MqMessage.java @@ -0,0 +1,10 @@ +package nu.marginalia.mq; + +public record MqMessage( + long msgId, + long relatedId, + String function, + String payload, + MqMessageState state +) { +} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/MqMessageState.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/MqMessageState.java new file mode 100644 index 00000000..d1d03f15 --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/MqMessageState.java @@ -0,0 +1,9 @@ +package nu.marginalia.mq; + +public enum MqMessageState { + NEW, + ACK, + OK, + ERR, + DEAD +} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInbox.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInbox.java new file mode 100644 index 
00000000..7d94b327 --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInbox.java @@ -0,0 +1,185 @@ +package nu.marginalia.mq.inbox; + +import nu.marginalia.mq.MqMessage; +import nu.marginalia.mq.MqMessageState; +import nu.marginalia.mq.persistence.MqPersistence; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.List; +import java.sql.SQLException; +import java.util.Collection; +import java.util.UUID; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.TimeUnit; + +public class MqInbox { + private final Logger logger = LoggerFactory.getLogger(MqInbox.class); + + private final String inboxName; + private final String instanceUUID; + private final ExecutorService threadPool; + private final MqPersistence persistence; + + private volatile boolean run = true; + + private final int pollIntervalMs = Integer.getInteger("mq.inbox.poll-interval-ms", 1000); + private final List eventSubscribers = new ArrayList<>(); + private final LinkedBlockingQueue queue = new LinkedBlockingQueue<>(32); + + private Thread pollDbThread; + private Thread notifyThread; + + public MqInbox(MqPersistence persistence, + String inboxName, + UUID instanceUUID) + { + this.threadPool = Executors.newCachedThreadPool(); + this.persistence = persistence; + this.inboxName = inboxName; + this.instanceUUID = instanceUUID.toString(); + } + + public void subscribe(MqSubscription subscription) { + eventSubscribers.add(subscription); + } + + public void start() { + run = true; + + if (eventSubscribers.isEmpty()) { + logger.error("No subscribers for inbox {}, registering shredder", inboxName); + } + + // Add a final handler that fails any message that is not handled + eventSubscribers.add(new MqInboxShredder()); + + pollDbThread = new Thread(this::pollDb, "mq-inbox-update-thread:"+inboxName); + 
pollDbThread.setDaemon(true); + pollDbThread.start(); + + notifyThread = new Thread(this::notifySubscribers, "mq-inbox-notify-thread:"+inboxName); + notifyThread.setDaemon(true); + notifyThread.start(); + } + + public void stop() throws InterruptedException { + if (!run) + return; + + logger.info("Shutting down inbox {}", inboxName); + + run = false; + pollDbThread.join(); + notifyThread.join(); + + threadPool.shutdownNow(); + + while (!threadPool.awaitTermination(5, TimeUnit.SECONDS)); + } + + private void notifySubscribers() { + try { + while (run) { + + MqMessage msg = queue.poll(pollIntervalMs, TimeUnit.MILLISECONDS); + + if (msg == null) + continue; + + logger.info("Notifying subscribers of message {}", msg.msgId()); + + boolean handled = false; + + for (var eventSubscriber : eventSubscribers) { + if (eventSubscriber.filter(msg)) { + handleMessageWithSubscriber(eventSubscriber, msg); + handled = true; + break; + } + } + + if (!handled) { + logger.error("No subscriber wanted to handle message {}", msg.msgId()); + } + } + } + catch (InterruptedException ex) { + logger.error("MQ inbox notify thread interrupted", ex); + } + } + + private void handleMessageWithSubscriber(MqSubscription subscriber, MqMessage msg) { + + threadPool.execute(() -> { + try { + final var rsp = subscriber.handle(msg); + + sendResponse(msg, rsp.state(), rsp.message()); + } catch (Exception ex) { + logger.error("Message Queue subscriber threw exception", ex); + sendResponse(msg, MqMessageState.ERR); + } + }); + } + + private void sendResponse(MqMessage msg, MqMessageState mqMessageState) { + try { + persistence.updateMessageState(msg.msgId(), mqMessageState); + } + catch (SQLException ex) { + logger.error("Failed to update message state", ex); + } + } + + private void sendResponse(MqMessage msg, MqMessageState mqMessageState, String response) { + try { + persistence.sendResponse(msg.msgId(), mqMessageState, response); + } + catch (SQLException ex) { + logger.error("Failed to update message 
state", ex); + } + } + + public void pollDb() { + try { + for (long tick = 1; run; tick++) { + + queue.addAll(pollInbox(tick)); + + TimeUnit.MILLISECONDS.sleep(pollIntervalMs); + } + } + catch (InterruptedException ex) { + logger.error("MQ inbox update thread interrupted", ex); + } + } + + private Collection pollInbox(long tick) { + try { + return persistence.pollInbox(inboxName, instanceUUID, tick); + } + catch (SQLException ex) { + logger.error("Failed to poll inbox", ex); + return List.of(); + } + } + + + private class MqInboxShredder implements MqSubscription { + + @Override + public boolean filter(MqMessage rawMessage) { + return true; + } + + @Override + public MqInboxResponse handle(MqMessage msg) { + logger.warn("Unhandled message {}", msg.msgId()); + return MqInboxResponse.err(); + } + } +} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInboxResponse.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInboxResponse.java new file mode 100644 index 00000000..ba4eb6f2 --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInboxResponse.java @@ -0,0 +1,22 @@ +package nu.marginalia.mq.inbox; + +import nu.marginalia.mq.MqMessageState; + +public record MqInboxResponse(String message, MqMessageState state) { + + public static MqInboxResponse ok(String message) { + return new MqInboxResponse(message, MqMessageState.OK); + } + + public static MqInboxResponse ok() { + return new MqInboxResponse("", MqMessageState.OK); + } + + public static MqInboxResponse err(String message) { + return new MqInboxResponse(message, MqMessageState.ERR); + } + + public static MqInboxResponse err() { + return new MqInboxResponse("", MqMessageState.ERR); + } +} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSubscription.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSubscription.java new file mode 100644 index 00000000..ce52a26b --- /dev/null +++ 
b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSubscription.java @@ -0,0 +1,9 @@ +package nu.marginalia.mq.inbox; + +import nu.marginalia.mq.MqMessage; + +public interface MqSubscription { + boolean filter(MqMessage rawMessage); + + MqInboxResponse handle(MqMessage msg); +} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java new file mode 100644 index 00000000..e4fa2e23 --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java @@ -0,0 +1,107 @@ +package nu.marginalia.mq.outbox; + +import nu.marginalia.mq.MqMessage; +import nu.marginalia.mq.persistence.MqPersistence; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.sql.SQLException; +import java.time.Duration; +import java.util.UUID; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.TimeUnit; + +public class MqOutbox { + private final Logger logger = LoggerFactory.getLogger(MqOutbox.class); + private final MqPersistence persistence; + private final String inboxName; + private final String replyInboxName; + private final String instanceUUID; + + private final ConcurrentHashMap pendingRequests = new ConcurrentHashMap<>(); + private final ConcurrentHashMap pendingResponses = new ConcurrentHashMap<>(); + + private final int pollIntervalMs = Integer.getInteger("mq.outbox.poll-interval-ms", 1000); + private final Thread pollThread; + + private volatile boolean run = true; + + public MqOutbox(MqPersistence persistence, + String inboxName, + UUID instanceUUID) { + this.persistence = persistence; + + this.inboxName = inboxName; + this.replyInboxName = "reply:" + inboxName; + this.instanceUUID = instanceUUID.toString(); + + pollThread = new Thread(this::poll, "mq-outbox-poll-thread:" + inboxName); + pollThread.setDaemon(true); + pollThread.start(); + } + + public void stop() throws 
InterruptedException { + if (!run) + return; + + logger.info("Shutting down outbox {}", inboxName); + + pendingRequests.clear(); + + run = false; + pollThread.join(); + } + + private void poll() { + try { + for (long id = 1; run; id++) { + pollDb(id); + + TimeUnit.MILLISECONDS.sleep(pollIntervalMs); + } + } catch (InterruptedException ex) { + logger.error("Outbox poll thread interrupted", ex); + } + } + + private void pollDb(long tick) { + if (pendingRequests.isEmpty()) + return; + + try { + var updates = persistence.pollReplyInbox(replyInboxName, instanceUUID, tick); + + for (var message : updates) { + pendingResponses.put(message.relatedId(), message); + pendingRequests.remove(message.relatedId()); + } + + if (updates.isEmpty() || pendingResponses.isEmpty()) + return; + + logger.info("Notifying {} pending responses", pendingResponses.size()); + + synchronized (pendingResponses) { + pendingResponses.notifyAll(); + } + } + catch (SQLException ex) { + logger.error("Failed to poll inbox", ex); + } + + } + + public MqMessage send(String function, String payload) throws Exception { + var id = persistence.sendNewMessage(inboxName, replyInboxName, function, payload, null); + pendingRequests.put(id, id); + + synchronized (pendingResponses) { + while (!pendingResponses.containsKey(id)) { + pendingResponses.wait(100); + } + return pendingResponses.remove(id); + } + } + + +} \ No newline at end of file diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java new file mode 100644 index 00000000..92fffb51 --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java @@ -0,0 +1,237 @@ +package nu.marginalia.mq.persistence; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.mq.MqMessageState; +import 
nu.marginalia.mq.MqMessage; + +import javax.annotation.Nullable; +import java.sql.SQLException; +import java.time.Duration; +import java.util.*; + +@Singleton +public class MqPersistence { + private final HikariDataSource dataSource; + + @Inject + public MqPersistence(HikariDataSource dataSource) { + this.dataSource = dataSource; + } + + /** Flags messages as dead if they have not been set to a terminal state within a TTL after the last update. */ + public int reapDeadMessages() throws SQLException { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + UPDATE PROC_MESSAGE + SET STATE='DEAD', UPDATED_TIME=CURRENT_TIMESTAMP(6) + WHERE STATE IN ('NEW', 'ACK') + AND TTL IS NOT NULL + AND TIMESTAMPDIFF(SECOND, UPDATED_TIME, CURRENT_TIMESTAMP(6)) > TTL + """)) { + return stmt.executeUpdate(); + } + } + + public long sendNewMessage(String recipientInboxName, + @Nullable + String senderInboxName, + String function, + String payload, + @Nullable Duration ttl + ) throws Exception { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + INSERT INTO PROC_MESSAGE(RECIPIENT_INBOX, SENDER_INBOX, FUNCTION, PAYLOAD, TTL) + VALUES(?, ?, ?, ?, ?) 
+ """); + var lastIdQuery = conn.prepareStatement("SELECT LAST_INSERT_ID()")) { + + stmt.setString(1, recipientInboxName); + + if (senderInboxName == null) stmt.setNull(2, java.sql.Types.VARCHAR); + else stmt.setString(2, senderInboxName); + + stmt.setString(3, function); + stmt.setString(4, payload); + if (ttl == null) stmt.setNull(5, java.sql.Types.BIGINT); + else stmt.setLong(5, ttl.toSeconds()); + + stmt.executeUpdate(); + var rsp = lastIdQuery.executeQuery(); + + if (!rsp.next()) { + throw new IllegalStateException("No last insert id"); + } + + return rsp.getLong(1); + } + } + + + public void updateMessageState(long id, MqMessageState mqMessageState) throws SQLException { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + UPDATE PROC_MESSAGE + SET STATE=?, UPDATED_TIME=CURRENT_TIMESTAMP(6) + WHERE ID=? + """)) { + stmt.setString(1, mqMessageState.name()); + stmt.setLong(2, id); + + if (stmt.executeUpdate() != 1) { + throw new IllegalArgumentException("No rows updated"); + } + } + } + + public long sendResponse(long id, MqMessageState mqMessageState, String message) throws SQLException { + try (var conn = dataSource.getConnection()) { + conn.setAutoCommit(false); + + try (var updateState = conn.prepareStatement(""" + UPDATE PROC_MESSAGE + SET STATE=?, UPDATED_TIME=CURRENT_TIMESTAMP(6) + WHERE ID=? + """); + var addResponse = conn.prepareStatement(""" + INSERT INTO PROC_MESSAGE(RECIPIENT_INBOX, RELATED_ID, FUNCTION, PAYLOAD) + SELECT SENDER_INBOX, ID, ?, ? + FROM PROC_MESSAGE + WHERE ID=? 
AND SENDER_INBOX IS NOT NULL + """); + var lastIdQuery = conn.prepareStatement("SELECT LAST_INSERT_ID()") + ) { + + updateState.setString(1, mqMessageState.name()); + updateState.setLong(2, id); + if (updateState.executeUpdate() != 1) { + throw new IllegalArgumentException("No rows updated"); + } + + addResponse.setString(1, "REPLY"); + addResponse.setString(2, message); + addResponse.setLong(3, id); + if (addResponse.executeUpdate() != 1) { + throw new IllegalArgumentException("No rows updated"); + } + + var rsp = lastIdQuery.executeQuery(); + if (!rsp.next()) { + throw new IllegalStateException("No last insert id"); + } + long newId = rsp.getLong(1); + + conn.commit(); + + return newId; + } catch (SQLException|IllegalStateException|IllegalArgumentException ex) { + conn.rollback(); + throw ex; + } finally { + conn.setAutoCommit(true); + } + } + } + + + private int markInboxMessages(String inboxName, String instanceUUID, long tick) throws SQLException { + try (var conn = dataSource.getConnection(); + var updateStmt = conn.prepareStatement(""" + UPDATE PROC_MESSAGE + SET OWNER_INSTANCE=?, OWNER_TICK=?, UPDATED_TIME=CURRENT_TIMESTAMP(6), STATE='ACK' + WHERE RECIPIENT_INBOX=? + AND OWNER_INSTANCE IS NULL AND STATE='NEW' + """); + ) { + updateStmt.setString(1, instanceUUID); + updateStmt.setLong(2, tick); + updateStmt.setString(3, inboxName); + return updateStmt.executeUpdate(); + } + } + + /** Marks unclaimed messages addressed to this inbox with instanceUUID and tick, + * then returns these messages. + */ + public Collection pollInbox(String inboxName, String instanceUUID, long tick) throws SQLException { + + int expected = markInboxMessages(inboxName, instanceUUID, tick); + if (expected == 0) { + return Collections.emptyList(); + } + + try (var conn = dataSource.getConnection(); + var queryStmt = conn.prepareStatement(""" + SELECT ID, RELATED_ID, FUNCTION, PAYLOAD, STATE FROM PROC_MESSAGE + WHERE OWNER_INSTANCE=? AND OWNER_TICK=? 
+ """) + ) { + queryStmt.setString(1, instanceUUID); + queryStmt.setLong(2, tick); + var rs = queryStmt.executeQuery(); + + List messages = new ArrayList<>(expected); + + while (rs.next()) { + long msgId = rs.getLong(1); + long relatedId = rs.getLong(2); + + String function = rs.getString(3); + String payload = rs.getString(4); + + MqMessageState state = MqMessageState.valueOf(rs.getString(5)); + + var msg = new MqMessage(msgId, relatedId, function, payload, state); + + messages.add(msg); + } + + return messages; + } + + } + + + /** Marks unclaimed messages addressed to this inbox with instanceUUID and tick, + * then returns these messages. + */ + public Collection pollReplyInbox(String inboxName, String instanceUUID, long tick) throws SQLException { + + int expected = markInboxMessages(inboxName, instanceUUID, tick); + if (expected == 0) { + return Collections.emptyList(); + } + + try (var conn = dataSource.getConnection(); + var queryStmt = conn.prepareStatement(""" + SELECT SELF.ID, SELF.RELATED_ID, SELF.FUNCTION, SELF.PAYLOAD, PARENT.STATE FROM PROC_MESSAGE SELF + LEFT JOIN PROC_MESSAGE PARENT ON SELF.RELATED_ID=PARENT.ID + WHERE SELF.OWNER_INSTANCE=? AND SELF.OWNER_TICK=? 
+ """) + ) { + queryStmt.setString(1, instanceUUID); + queryStmt.setLong(2, tick); + var rs = queryStmt.executeQuery(); + + List messages = new ArrayList<>(expected); + + while (rs.next()) { + long msgId = rs.getLong(1); + long relatedId = rs.getLong(2); + + String function = rs.getString(3); + String payload = rs.getString(4); + + MqMessageState state = MqMessageState.valueOf(rs.getString(5)); + + var msg = new MqMessage(msgId, relatedId, function, payload, state); + + messages.add(msg); + } + + return messages; + } + } +} diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqMessageRow.java b/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqMessageRow.java new file mode 100644 index 00000000..933cdb62 --- /dev/null +++ b/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqMessageRow.java @@ -0,0 +1,21 @@ +package nu.marginalia.mq.outbox; + +import nu.marginalia.mq.MqMessageState; + +import javax.annotation.Nullable; + +public record MqMessageRow ( + long id, + long relatedId, + @Nullable + String senderInbox, + String recipientInbox, + String function, + String payload, + MqMessageState state, + String ownerInstance, + long ownerTick, + long createdTime, + long updatedTime, + long ttl +) {} \ No newline at end of file diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqOutboxTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqOutboxTest.java new file mode 100644 index 00000000..789aec15 --- /dev/null +++ b/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqOutboxTest.java @@ -0,0 +1,177 @@ +package nu.marginalia.mq.outbox; + +import com.zaxxer.hikari.HikariConfig; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.mq.MqMessage; +import nu.marginalia.mq.MqMessageState; +import nu.marginalia.mq.inbox.MqInboxResponse; +import nu.marginalia.mq.inbox.MqInbox; +import nu.marginalia.mq.inbox.MqSubscription; +import 
nu.marginalia.mq.persistence.MqPersistence; +import org.junit.jupiter.api.*; +import org.testcontainers.containers.MariaDBContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +import java.util.UUID; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +@Tag("slow") +@Testcontainers +public class MqOutboxTest { + @Container + static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") + .withDatabaseName("WMSA_prod") + .withUsername("wmsa") + .withPassword("wmsa") + .withInitScript("sql/current/11-message-queue.sql") + .withNetworkAliases("mariadb"); + + static HikariDataSource dataSource; + private String inboxId; + + @BeforeEach + public void setUp() { + inboxId = UUID.randomUUID().toString(); + } + @BeforeAll + public static void setUpAll() { + HikariConfig config = new HikariConfig(); + config.setJdbcUrl(mariaDBContainer.getJdbcUrl()); + config.setUsername("wmsa"); + config.setPassword("wmsa"); + + dataSource = new HikariDataSource(config); + } + + @AfterAll + public static void tearDownAll() { + dataSource.close(); + } + + @Test + public void testOpenClose() throws InterruptedException { + var outbox = new MqOutbox(new MqPersistence(dataSource), inboxId, UUID.randomUUID()); + outbox.stop(); + } + + @Test + public void testSend() throws Exception { + var outbox = new MqOutbox(new MqPersistence(dataSource), inboxId, UUID.randomUUID()); + Executors.newSingleThreadExecutor().submit(() -> outbox.send("test", "Hello World")); + + TimeUnit.MILLISECONDS.sleep(100); + + var messages = MqTestUtil.getMessages(dataSource, inboxId); + assertEquals(1, messages.size()); + System.out.println(messages.get(0)); + + outbox.stop(); + } + + @Test + public void testSendAndRespond() throws Exception { + var outbox = new MqOutbox(new MqPersistence(dataSource), inboxId, UUID.randomUUID()); + + var inbox = new 
MqInbox(new MqPersistence(dataSource), inboxId, UUID.randomUUID()); + inbox.subscribe(justRespond("Alright then")); + inbox.start(); + + var rsp = outbox.send("test", "Hello World"); + + assertEquals(MqMessageState.OK, rsp.state()); + assertEquals("Alright then", rsp.payload()); + + var messages = MqTestUtil.getMessages(dataSource, inboxId); + assertEquals(1, messages.size()); + assertEquals(MqMessageState.OK, messages.get(0).state()); + + outbox.stop(); + inbox.stop(); + } + + @Test + public void testSendMultiple() throws Exception { + var outbox = new MqOutbox(new MqPersistence(dataSource), inboxId, UUID.randomUUID()); + + var inbox = new MqInbox(new MqPersistence(dataSource), inboxId, UUID.randomUUID()); + inbox.subscribe(echo()); + inbox.start(); + + var rsp1 = outbox.send("test", "one"); + var rsp2 = outbox.send("test", "two"); + var rsp3 = outbox.send("test", "three"); + var rsp4 = outbox.send("test", "four"); + + Thread.sleep(500); + + assertEquals(MqMessageState.OK, rsp1.state()); + assertEquals("one", rsp1.payload()); + assertEquals(MqMessageState.OK, rsp2.state()); + assertEquals("two", rsp2.payload()); + assertEquals(MqMessageState.OK, rsp3.state()); + assertEquals("three", rsp3.payload()); + assertEquals(MqMessageState.OK, rsp4.state()); + assertEquals("four", rsp4.payload()); + + var messages = MqTestUtil.getMessages(dataSource, inboxId); + assertEquals(4, messages.size()); + for (var message : messages) { + assertEquals(MqMessageState.OK, message.state()); + } + + outbox.stop(); + inbox.stop(); + } + + @Test + public void testSendAndRespondWithErrorHandler() throws Exception { + var outbox = new MqOutbox(new MqPersistence(dataSource), inboxId, UUID.randomUUID()); + var inbox = new MqInbox(new MqPersistence(dataSource), inboxId, UUID.randomUUID()); + + inbox.start(); + + var rsp = outbox.send("test", "Hello World"); + + assertEquals(MqMessageState.ERR, rsp.state()); + + var messages = MqTestUtil.getMessages(dataSource, inboxId); + assertEquals(1, 
messages.size()); + assertEquals(MqMessageState.ERR, messages.get(0).state()); + + outbox.stop(); + inbox.stop(); + } + + public MqSubscription justRespond(String response) { + return new MqSubscription() { + @Override + public boolean filter(MqMessage rawMessage) { + return true; + } + + @Override + public MqInboxResponse handle(MqMessage msg) { + return MqInboxResponse.ok(response); + } + }; + } + + public MqSubscription echo() { + return new MqSubscription() { + @Override + public boolean filter(MqMessage rawMessage) { + return true; + } + + @Override + public MqInboxResponse handle(MqMessage msg) { + return MqInboxResponse.ok(msg.payload()); + } + }; + } + +} \ No newline at end of file diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqPersistenceTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqPersistenceTest.java new file mode 100644 index 00000000..590ff64b --- /dev/null +++ b/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqPersistenceTest.java @@ -0,0 +1,189 @@ +package nu.marginalia.mq.outbox; + +import com.zaxxer.hikari.HikariConfig; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.mq.MqMessageState; +import nu.marginalia.mq.persistence.MqPersistence; +import org.junit.jupiter.api.*; +import org.testcontainers.containers.MariaDBContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +import java.time.Duration; +import java.util.UUID; +import java.util.concurrent.TimeUnit; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNull; + +@Tag("slow") +@Testcontainers +public class MqPersistenceTest { + @Container + static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") + .withDatabaseName("WMSA_prod") + .withUsername("wmsa") + .withPassword("wmsa") + .withInitScript("sql/current/11-message-queue.sql") + 
.withNetworkAliases("mariadb"); + + static HikariDataSource dataSource; + static MqPersistence persistence; + String recipientId; + String senderId; + + @BeforeEach + public void setUp() { + senderId = UUID.randomUUID().toString(); + recipientId = UUID.randomUUID().toString(); + } + + @BeforeAll + public static void setUpAll() { + HikariConfig config = new HikariConfig(); + config.setJdbcUrl(mariaDBContainer.getJdbcUrl()); + config.setUsername("wmsa"); + config.setPassword("wmsa"); + + dataSource = new HikariDataSource(config); + persistence = new MqPersistence(dataSource); + } + + @AfterAll + public static void tearDownAll() { + dataSource.close(); + } + + @Test + public void testReaper() throws Exception { + + long id = persistence.sendNewMessage(recipientId, senderId, "function", "payload", Duration.ofSeconds(2)); + persistence.reapDeadMessages(); + + var messages = MqTestUtil.getMessages(dataSource, recipientId); + assertEquals(1, messages.size()); + assertEquals(MqMessageState.NEW, messages.get(0).state()); + + TimeUnit.SECONDS.sleep(5); + + persistence.reapDeadMessages(); + + messages = MqTestUtil.getMessages(dataSource, recipientId); + assertEquals(1, messages.size()); + assertEquals(MqMessageState.DEAD, messages.get(0).state()); + } + + @Test + public void sendWithReplyAddress() throws Exception { + + long id = persistence.sendNewMessage(recipientId, senderId, "function", "payload", Duration.ofSeconds(30)); + + var messages = MqTestUtil.getMessages(dataSource, recipientId); + assertEquals(1, messages.size()); + + var message = messages.get(0); + + assertEquals(id, message.id()); + assertEquals("function", message.function()); + assertEquals("payload", message.payload()); + assertEquals(MqMessageState.NEW, message.state()); + + System.out.println(message); + } + + @Test + public void sendNoReplyAddress() throws Exception { + + long id = persistence.sendNewMessage(recipientId, null, "function", "payload", Duration.ofSeconds(30)); + + var messages = 
MqTestUtil.getMessages(dataSource, recipientId); + assertEquals(1, messages.size()); + + var message = messages.get(0); + + assertEquals(id, message.id()); + assertNull(message.senderInbox()); + assertEquals("function", message.function()); + assertEquals("payload", message.payload()); + assertEquals(MqMessageState.NEW, message.state()); + + System.out.println(message); + } + + @Test + public void updateState() throws Exception { + + long id = persistence.sendNewMessage(recipientId, senderId, "function", "payload", Duration.ofSeconds(30)); + persistence.updateMessageState(id, MqMessageState.OK); + System.out.println(id); + + var messages = MqTestUtil.getMessages(dataSource, recipientId); + assertEquals(1, messages.size()); + + var message = messages.get(0); + + assertEquals(id, message.id()); + assertEquals(MqMessageState.OK, message.state()); + + System.out.println(message); + } + + @Test + public void testReply() throws Exception { + long request = persistence.sendNewMessage(recipientId, senderId, "function", "payload", Duration.ofSeconds(30)); + long response = persistence.sendResponse(request, MqMessageState.OK, "response"); + + var sentMessages = MqTestUtil.getMessages(dataSource, recipientId); + System.out.println(sentMessages); + assertEquals(1, sentMessages.size()); + + var requestMessage = sentMessages.get(0); + assertEquals(request, requestMessage.id()); + assertEquals(MqMessageState.OK, requestMessage.state()); + + + var replies = MqTestUtil.getMessages(dataSource, senderId); + System.out.println(replies); + assertEquals(1, replies.size()); + + var responseMessage = replies.get(0); + assertEquals(response, responseMessage.id()); + assertEquals(request, responseMessage.relatedId()); + assertEquals(MqMessageState.NEW, responseMessage.state()); + } + + @Test + public void testPollInbox() throws Exception { + + String instanceId = "BATMAN"; + long tick = 1234L; + + long id = persistence.sendNewMessage(recipientId, null,"function", "payload", 
Duration.ofSeconds(30)); + + var messagesPollFirstTime = persistence.pollInbox(recipientId, instanceId , tick); + + /** CHECK POLL RESULT */ + assertEquals(1, messagesPollFirstTime.size()); + var firstPollMessage = messagesPollFirstTime.iterator().next(); + assertEquals(id, firstPollMessage.msgId()); + assertEquals("function", firstPollMessage.function()); + assertEquals("payload", firstPollMessage.payload()); + + /** CHECK DB TABLE */ + var messages = MqTestUtil.getMessages(dataSource, recipientId); + assertEquals(1, messages.size()); + + var message = messages.get(0); + + assertEquals(id, message.id()); + assertEquals("function", message.function()); + assertEquals("payload", message.payload()); + assertEquals(MqMessageState.ACK, message.state()); + assertEquals(instanceId, message.ownerInstance()); + assertEquals(tick, message.ownerTick()); + + /** VERIFY SECOND POLL IS EMPTY */ + var messagePollSecondTime = persistence.pollInbox(recipientId, instanceId , 1); + assertEquals(0, messagePollSecondTime.size()); + } +} diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqTestUtil.java b/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqTestUtil.java new file mode 100644 index 00000000..3fee8b20 --- /dev/null +++ b/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqTestUtil.java @@ -0,0 +1,52 @@ +package nu.marginalia.mq.outbox; + +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.mq.MqMessageState; +import org.junit.jupiter.api.Assertions; + +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.List; + +public class MqTestUtil { + public static List getMessages(HikariDataSource dataSource, String inbox) { + List messages = new ArrayList<>(); + + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT ID, RELATED_ID, + SENDER_INBOX, RECIPIENT_INBOX, + FUNCTION, PAYLOAD, + STATE, + OWNER_INSTANCE, OWNER_TICK, + CREATED_TIME, UPDATED_TIME, 
+ TTL + FROM PROC_MESSAGE + WHERE RECIPIENT_INBOX = ? + """)) + { + stmt.setString(1, inbox); + var rsp = stmt.executeQuery(); + while (rsp.next()) { + messages.add(new MqMessageRow( + rsp.getLong("ID"), + rsp.getLong("RELATED_ID"), + rsp.getString("SENDER_INBOX"), + rsp.getString("RECIPIENT_INBOX"), + rsp.getString("FUNCTION"), + rsp.getString("PAYLOAD"), + MqMessageState.valueOf(rsp.getString("STATE")), + rsp.getString("OWNER_INSTANCE"), + rsp.getLong("OWNER_TICK"), + rsp.getTimestamp("CREATED_TIME").getTime(), + rsp.getTimestamp("UPDATED_TIME").getTime(), + rsp.getLong("TTL") + )); + } + } + catch (SQLException ex) { + Assertions.fail(ex); + } + return messages; + } +} diff --git a/settings.gradle b/settings.gradle index 90d74f99..41e0cb53 100644 --- a/settings.gradle +++ b/settings.gradle @@ -48,6 +48,7 @@ include 'code:api:assistant-api' include 'code:common:service-discovery' include 'code:common:service-client' include 'code:common:db' +include 'code:common:message-queue' include 'code:common:service' include 'code:common:config' include 'code:common:model' From 2ae0b8c159b8f32323cd60a80d49a333fd3ef061 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 4 Jul 2023 17:42:06 +0200 Subject: [PATCH 003/157] Message queue based state machine --- code/common/message-queue/build.gradle | 1 + code/common/message-queue/readme.md | 5 +- .../main/java/nu/marginalia/mq/MqMessage.java | 3 +- .../java/nu/marginalia/mq/inbox/MqInbox.java | 86 ++++++--- .../marginalia/mq/inbox/MqSubscription.java | 7 +- .../nu/marginalia/mq/outbox/MqOutbox.java | 6 +- .../mq/persistence/MqPersistence.java | 42 ++++- .../java/nu/marginalia/mqsm/StateFactory.java | 66 +++++++ .../java/nu/marginalia/mqsm/StateMachine.java | 176 ++++++++++++++++++ .../nu/marginalia/mqsm/state/ErrorState.java | 14 ++ .../nu/marginalia/mqsm/state/FinalState.java | 14 ++ .../marginalia/mqsm/state/MachineState.java | 8 + .../marginalia/mqsm/state/ResumingState.java | 14 ++ .../mqsm/state/StateTransition.java | 
11 ++ .../mq/{outbox => }/MqMessageRow.java | 2 +- .../mq/{outbox => }/MqTestUtil.java | 3 +- .../nu/marginalia/mq/outbox/MqOutboxTest.java | 11 +- .../MqPersistenceTest.java | 4 +- .../nu/marginalia/mqsm/StateMachineTest.java | 174 +++++++++++++++++ 19 files changed, 610 insertions(+), 37 deletions(-) create mode 100644 code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateFactory.java create mode 100644 code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java create mode 100644 code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/ErrorState.java create mode 100644 code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/FinalState.java create mode 100644 code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/MachineState.java create mode 100644 code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/ResumingState.java create mode 100644 code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/StateTransition.java rename code/common/message-queue/src/test/java/nu/marginalia/mq/{outbox => }/MqMessageRow.java (92%) rename code/common/message-queue/src/test/java/nu/marginalia/mq/{outbox => }/MqTestUtil.java (96%) rename code/common/message-queue/src/test/java/nu/marginalia/mq/{outbox => persistence}/MqPersistenceTest.java (98%) create mode 100644 code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineTest.java diff --git a/code/common/message-queue/build.gradle b/code/common/message-queue/build.gradle index 84ea9651..d71ca1d4 100644 --- a/code/common/message-queue/build.gradle +++ b/code/common/message-queue/build.gradle @@ -19,6 +19,7 @@ dependencies { implementation libs.spark implementation libs.guice + implementation libs.gson implementation libs.rxjava implementation libs.bundles.prometheus diff --git a/code/common/message-queue/readme.md b/code/common/message-queue/readme.md index 68ae2825..20e59642 100644 --- a/code/common/message-queue/readme.md +++ 
b/code/common/message-queue/readme.md @@ -1,5 +1,8 @@ # Message Queue -Implements a message queue using mariadb. +Implements resilient message queueing for the application, +as well as a finite state machine library backed by the +message queue that enables long-running tasks that outlive +the execution lifespan of the involved processes. ![Message States](msgstate.svg) \ No newline at end of file diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/MqMessage.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/MqMessage.java index 5f4c11aa..df0c4839 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mq/MqMessage.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/MqMessage.java @@ -5,6 +5,7 @@ public record MqMessage( long relatedId, String function, String payload, - MqMessageState state + MqMessageState state, + boolean expectsResponse ) { } diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInbox.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInbox.java index 7d94b327..00b30cad 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInbox.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInbox.java @@ -15,6 +15,7 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.TimeUnit; +import java.util.function.Consumer; public class MqInbox { private final Logger logger = LoggerFactory.getLogger(MqInbox.class); @@ -26,7 +27,7 @@ public class MqInbox { private volatile boolean run = true; - private final int pollIntervalMs = Integer.getInteger("mq.inbox.poll-interval-ms", 1000); + private final int pollIntervalMs = Integer.getInteger("mq.inbox.poll-interval-ms", 100); private final List eventSubscribers = new ArrayList<>(); private final LinkedBlockingQueue queue = new LinkedBlockingQueue<>(32); @@ 
-114,27 +115,52 @@ public class MqInbox { private void handleMessageWithSubscriber(MqSubscription subscriber, MqMessage msg) { - threadPool.execute(() -> { - try { - final var rsp = subscriber.handle(msg); - - sendResponse(msg, rsp.state(), rsp.message()); - } catch (Exception ex) { - logger.error("Message Queue subscriber threw exception", ex); - sendResponse(msg, MqMessageState.ERR); - } - }); + if (msg.expectsResponse()) { + threadPool.execute(() -> respondToMessage(subscriber, msg)); + } + else { + threadPool.execute(() -> acknowledgeNotification(subscriber, msg)); + } } - private void sendResponse(MqMessage msg, MqMessageState mqMessageState) { + private void respondToMessage(MqSubscription subscriber, MqMessage msg) { try { - persistence.updateMessageState(msg.msgId(), mqMessageState); + final var rsp = subscriber.onRequest(msg); + sendResponse(msg, rsp.state(), rsp.message()); + } catch (Exception ex) { + logger.error("Message Queue subscriber threw exception", ex); + sendResponse(msg, MqMessageState.ERR); + } + } + + private void acknowledgeNotification(MqSubscription subscriber, MqMessage msg) { + try { + subscriber.onNotification(msg); + updateMessageState(msg, MqMessageState.OK); + } catch (Exception ex) { + logger.error("Message Queue subscriber threw exception", ex); + updateMessageState(msg, MqMessageState.ERR); + } + } + + private void sendResponse(MqMessage msg, MqMessageState state) { + try { + persistence.updateMessageState(msg.msgId(), state); } catch (SQLException ex) { logger.error("Failed to update message state", ex); } } + private void updateMessageState(MqMessage msg, MqMessageState state) { + try { + persistence.updateMessageState(msg.msgId(), state); + } + catch (SQLException ex2) { + logger.error("Failed to update message state", ex2); + } + } + private void sendResponse(MqMessage msg, MqMessageState mqMessageState, String response) { try { persistence.sendResponse(msg.msgId(), mqMessageState, response); @@ -159,14 +185,25 @@ public 
class MqInbox { } private Collection pollInbox(long tick) { - try { - return persistence.pollInbox(inboxName, instanceUUID, tick); - } - catch (SQLException ex) { - logger.error("Failed to poll inbox", ex); - return List.of(); - } - } + try { + return persistence.pollInbox(inboxName, instanceUUID, tick); + } + catch (SQLException ex) { + logger.error("Failed to poll inbox", ex); + return List.of(); + } + } + + /** Retrieve the last N messages from the inbox. */ + public List replay(int lastN) { + try { + return persistence.lastNMessages(inboxName, lastN); + } + catch (SQLException ex) { + logger.error("Failed to replay inbox", ex); + return List.of(); + } + } private class MqInboxShredder implements MqSubscription { @@ -177,9 +214,14 @@ public class MqInbox { } @Override - public MqInboxResponse handle(MqMessage msg) { + public MqInboxResponse onRequest(MqMessage msg) { logger.warn("Unhandled message {}", msg.msgId()); return MqInboxResponse.err(); } + + @Override + public void onNotification(MqMessage msg) { + logger.warn("Unhandled message {}", msg.msgId()); + } } } diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSubscription.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSubscription.java index ce52a26b..417b7b35 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSubscription.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSubscription.java @@ -3,7 +3,12 @@ package nu.marginalia.mq.inbox; import nu.marginalia.mq.MqMessage; public interface MqSubscription { + /** Return true if this subscription should handle the message. */ boolean filter(MqMessage rawMessage); - MqInboxResponse handle(MqMessage msg); + /** Handle the message and return a response. 
*/ + MqInboxResponse onRequest(MqMessage msg); + + /** Handle a message with no reply address */ + void onNotification(MqMessage msg); } diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java index e4fa2e23..e8faa0ab 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java @@ -6,7 +6,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.sql.SQLException; -import java.time.Duration; import java.util.UUID; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.TimeUnit; @@ -21,7 +20,7 @@ public class MqOutbox { private final ConcurrentHashMap pendingRequests = new ConcurrentHashMap<>(); private final ConcurrentHashMap pendingResponses = new ConcurrentHashMap<>(); - private final int pollIntervalMs = Integer.getInteger("mq.outbox.poll-interval-ms", 1000); + private final int pollIntervalMs = Integer.getInteger("mq.outbox.poll-interval-ms", 100); private final Thread pollThread; private volatile boolean run = true; @@ -103,5 +102,8 @@ public class MqOutbox { } } + public long notify(String function, String payload) throws Exception { + return persistence.sendNewMessage(inboxName, null, function, payload, null); + } } \ No newline at end of file diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java index 92fffb51..d5356c55 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java @@ -1,5 +1,6 @@ package nu.marginalia.mq.persistence; +import com.google.common.collect.Lists; import com.google.inject.Inject; import 
com.google.inject.Singleton; import com.zaxxer.hikari.HikariDataSource; @@ -164,7 +165,7 @@ public class MqPersistence { try (var conn = dataSource.getConnection(); var queryStmt = conn.prepareStatement(""" - SELECT ID, RELATED_ID, FUNCTION, PAYLOAD, STATE FROM PROC_MESSAGE + SELECT ID, RELATED_ID, FUNCTION, PAYLOAD, STATE, SENDER_INBOX FROM PROC_MESSAGE WHERE OWNER_INSTANCE=? AND OWNER_TICK=? """) ) { @@ -182,8 +183,9 @@ public class MqPersistence { String payload = rs.getString(4); MqMessageState state = MqMessageState.valueOf(rs.getString(5)); + boolean expectsResponse = rs.getBoolean(6); - var msg = new MqMessage(msgId, relatedId, function, payload, state); + var msg = new MqMessage(msgId, relatedId, function, payload, state, expectsResponse); messages.add(msg); } @@ -226,7 +228,7 @@ public class MqPersistence { MqMessageState state = MqMessageState.valueOf(rs.getString(5)); - var msg = new MqMessage(msgId, relatedId, function, payload, state); + var msg = new MqMessage(msgId, relatedId, function, payload, state, false); messages.add(msg); } @@ -234,4 +236,38 @@ public class MqPersistence { return messages; } } + + public List lastNMessages(String inboxName, int lastN) throws SQLException { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT ID, RELATED_ID, FUNCTION, PAYLOAD, STATE, SENDER_INBOX FROM PROC_MESSAGE + WHERE RECIPIENT_INBOX = ? + ORDER BY ID DESC LIMIT ? 
+ """)) { + + stmt.setString(1, inboxName); + stmt.setInt(2, lastN); + List messages = new ArrayList<>(lastN); + + var rs = stmt.executeQuery(); + while (rs.next()) { + long msgId = rs.getLong(1); + long relatedId = rs.getLong(2); + + String function = rs.getString(3); + String payload = rs.getString(4); + + MqMessageState state = MqMessageState.valueOf(rs.getString(5)); + boolean expectsResponse = rs.getBoolean(6); + + var msg = new MqMessage(msgId, relatedId, function, payload, state, expectsResponse); + + messages.add(msg); + } + + Lists.reverse(messages); + return messages; + } + + } } diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateFactory.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateFactory.java new file mode 100644 index 00000000..8dccde4b --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateFactory.java @@ -0,0 +1,66 @@ +package nu.marginalia.mqsm; + +import com.google.gson.Gson; +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.mqsm.state.MachineState; +import nu.marginalia.mqsm.state.StateTransition; + +import java.util.function.Function; +import java.util.function.Supplier; + +@Singleton +public class StateFactory { + private final Gson gson; + + @Inject + public StateFactory(Gson gson) { + this.gson = gson; + } + + public MachineState create(String name, Class param, Function logic) { + return new MachineState() { + @Override + public String name() { + return name; + } + + @Override + public StateTransition next(String message) { + return logic.apply(gson.fromJson(message, param)); + } + + @Override + public boolean isFinal() { + return false; + } + }; + } + + public MachineState create(String name, Supplier logic) { + return new MachineState() { + @Override + public String name() { + return name; + } + + @Override + public StateTransition next(String message) { + return logic.get(); + } + + @Override + public boolean 
isFinal() { + return false; + } + }; + } + + public StateTransition transition(String state) { + return StateTransition.to(state); + } + + public StateTransition transition(String state, Object message) { + return StateTransition.to(state, gson.toJson(message)); + } +} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java new file mode 100644 index 00000000..cb7d1f33 --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java @@ -0,0 +1,176 @@ +package nu.marginalia.mqsm; + +import nu.marginalia.mq.MqMessage; +import nu.marginalia.mq.MqMessageState; +import nu.marginalia.mq.inbox.MqInbox; +import nu.marginalia.mq.inbox.MqInboxResponse; +import nu.marginalia.mq.inbox.MqSubscription; +import nu.marginalia.mq.outbox.MqOutbox; +import nu.marginalia.mq.persistence.MqPersistence; +import nu.marginalia.mqsm.state.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.UUID; + +/** A state machine that can be used to implement a finite state machine + * using a message queue as the persistence layer. The state machine is + * resilient to crashes and can be resumed from the last state. 
+ */ +public class StateMachine { + private final Logger logger = LoggerFactory.getLogger(StateMachine.class); + + private final MqInbox smInbox; + private final MqOutbox smOutbox; + private final String queueName; + private MachineState state; + + private final MachineState errorState = new ErrorState(); + private final MachineState finalState = new FinalState(); + private final MachineState resumingState = new ResumingState(); + + private final Map allStates = new HashMap<>(); + + public StateMachine(MqPersistence persistence, String queueName, UUID instanceUUID) { + this.queueName = queueName; + + smInbox = new MqInbox(persistence, queueName, instanceUUID); + smOutbox = new MqOutbox(persistence, queueName, instanceUUID); + + smInbox.subscribe(new StateEventSubscription()); + + registerStates(List.of(errorState, finalState, resumingState)); + } + + /** Register the state graph */ + public void registerStates(MachineState... states) { + if (state != null) { + throw new IllegalStateException("Cannot register states after state machine has been initialized"); + } + + for (var state : states) { + allStates.put(state.name(), state); + } + } + + /** Register the state graph */ + public void registerStates(List states) { + for (var state : states) { + allStates.put(state.name(), state); + } + } + + /** Wait for the state machine to reach a final state. + * (possibly forever, halting problem and so on) + */ + public void join() throws InterruptedException { + synchronized (this) { + if (null == state) + return; + + while (!state.isFinal()) { + wait(); + } + } + } + + + /** Initialize the state machine. */ + public void init() throws Exception { + var transition = StateTransition.to("INITIAL"); + + synchronized (this) { + this.state = allStates.get(transition.state()); + notifyAll(); + } + + smInbox.start(); + smOutbox.notify(transition.state(), transition.message()); + } + + /** Resume the state machine from the last known state. 
*/ + public void resume() throws Exception { + + if (state == null) { + var messages = smInbox.replay(1); + + if (messages.isEmpty()) { + init(); + } else { + var firstMessage = messages.get(0); + + smInbox.start(); + + logger.info("Resuming state machine from {}({})/{}", firstMessage.function(), firstMessage.payload(), firstMessage.state()); + + if (firstMessage.state() == MqMessageState.NEW) { + // The message is not acknowledged, so starting the inbox will trigger a state transition + // + // We still need to set a state here so that the join() method works + + state = resumingState; + } else { + // The message is already acknowledged, so we replay the last state + onStateTransition(firstMessage.function(), firstMessage.payload()); + } + } + } + } + + public void stop() throws InterruptedException { + smInbox.stop(); + smOutbox.stop(); + } + + private void onStateTransition(String nextState, String message) { + try { + logger.info("FSM State change in {}: {}->{}({})", + queueName, + state == null ? 
"[null]" : state.name(), + nextState, + message); + + synchronized (this) { + this.state = allStates.get(nextState); + notifyAll(); + } + + if (!state.isFinal()) { + var transition = state.next(message); + smOutbox.notify(transition.state(), transition.message()); + } + } + catch (Exception e) { + logger.error("Error in state machine transition", e); + setErrorState(); + } + } + + private void setErrorState() { + synchronized (this) { + state = errorState; + notifyAll(); + } + } + + private class StateEventSubscription implements MqSubscription { + + @Override + public boolean filter(MqMessage rawMessage) { + return true; + } + + @Override + public MqInboxResponse onRequest(MqMessage msg) { + return null; + } + + @Override + public void onNotification(MqMessage msg) { + onStateTransition(msg.function(), msg.payload()); + } + } +} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/ErrorState.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/ErrorState.java new file mode 100644 index 00000000..4f1fef96 --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/ErrorState.java @@ -0,0 +1,14 @@ +package nu.marginalia.mqsm.state; + +public class ErrorState implements MachineState { + @Override + public String name() { return "ERROR"; } + + @Override + public StateTransition next(String message) { + throw new UnsupportedOperationException(); + } + + @Override + public boolean isFinal() { return true; } +} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/FinalState.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/FinalState.java new file mode 100644 index 00000000..5ee7d435 --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/FinalState.java @@ -0,0 +1,14 @@ +package nu.marginalia.mqsm.state; + +public class FinalState implements MachineState { + @Override + public String name() { return "END"; } + + @Override + 
public StateTransition next(String message) { + throw new UnsupportedOperationException(); + } + + @Override + public boolean isFinal() { return true; } +} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/MachineState.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/MachineState.java new file mode 100644 index 00000000..4bba33cf --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/MachineState.java @@ -0,0 +1,8 @@ +package nu.marginalia.mqsm.state; + +public interface MachineState { + String name(); + StateTransition next(String message); + + boolean isFinal(); +} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/ResumingState.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/ResumingState.java new file mode 100644 index 00000000..36a474e2 --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/ResumingState.java @@ -0,0 +1,14 @@ +package nu.marginalia.mqsm.state; + +public class ResumingState implements MachineState { + @Override + public String name() { return "RESUMING"; } + + @Override + public StateTransition next(String message) { + throw new UnsupportedOperationException(); + } + + @Override + public boolean isFinal() { return false; } +} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/StateTransition.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/StateTransition.java new file mode 100644 index 00000000..6ca5d387 --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/StateTransition.java @@ -0,0 +1,11 @@ +package nu.marginalia.mqsm.state; + +public record StateTransition(String state, String message) { + public static StateTransition to(String state) { + return new StateTransition(state, ""); + } + + public static StateTransition to(String state, String message) { + return new StateTransition(state, message); + } 
+} diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqMessageRow.java b/code/common/message-queue/src/test/java/nu/marginalia/mq/MqMessageRow.java similarity index 92% rename from code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqMessageRow.java rename to code/common/message-queue/src/test/java/nu/marginalia/mq/MqMessageRow.java index 933cdb62..ef12105a 100644 --- a/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqMessageRow.java +++ b/code/common/message-queue/src/test/java/nu/marginalia/mq/MqMessageRow.java @@ -1,4 +1,4 @@ -package nu.marginalia.mq.outbox; +package nu.marginalia.mq; import nu.marginalia.mq.MqMessageState; diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqTestUtil.java b/code/common/message-queue/src/test/java/nu/marginalia/mq/MqTestUtil.java similarity index 96% rename from code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqTestUtil.java rename to code/common/message-queue/src/test/java/nu/marginalia/mq/MqTestUtil.java index 3fee8b20..dcefaf1a 100644 --- a/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqTestUtil.java +++ b/code/common/message-queue/src/test/java/nu/marginalia/mq/MqTestUtil.java @@ -1,7 +1,6 @@ -package nu.marginalia.mq.outbox; +package nu.marginalia.mq; import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.mq.MqMessageState; import org.junit.jupiter.api.Assertions; import java.sql.SQLException; diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqOutboxTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqOutboxTest.java index 789aec15..6dc51f2d 100644 --- a/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqOutboxTest.java +++ b/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqOutboxTest.java @@ -4,6 +4,7 @@ import com.zaxxer.hikari.HikariConfig; import com.zaxxer.hikari.HikariDataSource; import 
nu.marginalia.mq.MqMessage; import nu.marginalia.mq.MqMessageState; +import nu.marginalia.mq.MqTestUtil; import nu.marginalia.mq.inbox.MqInboxResponse; import nu.marginalia.mq.inbox.MqInbox; import nu.marginalia.mq.inbox.MqSubscription; @@ -154,9 +155,12 @@ public class MqOutboxTest { } @Override - public MqInboxResponse handle(MqMessage msg) { + public MqInboxResponse onRequest(MqMessage msg) { return MqInboxResponse.ok(response); } + + @Override + public void onNotification(MqMessage msg) { } }; } @@ -168,9 +172,12 @@ public class MqOutboxTest { } @Override - public MqInboxResponse handle(MqMessage msg) { + public MqInboxResponse onRequest(MqMessage msg) { return MqInboxResponse.ok(msg.payload()); } + + @Override + public void onNotification(MqMessage msg) {} }; } diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqPersistenceTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mq/persistence/MqPersistenceTest.java similarity index 98% rename from code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqPersistenceTest.java rename to code/common/message-queue/src/test/java/nu/marginalia/mq/persistence/MqPersistenceTest.java index 590ff64b..7166531d 100644 --- a/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqPersistenceTest.java +++ b/code/common/message-queue/src/test/java/nu/marginalia/mq/persistence/MqPersistenceTest.java @@ -1,9 +1,9 @@ -package nu.marginalia.mq.outbox; +package nu.marginalia.mq.persistence; import com.zaxxer.hikari.HikariConfig; import com.zaxxer.hikari.HikariDataSource; import nu.marginalia.mq.MqMessageState; -import nu.marginalia.mq.persistence.MqPersistence; +import nu.marginalia.mq.MqTestUtil; import org.junit.jupiter.api.*; import org.testcontainers.containers.MariaDBContainer; import org.testcontainers.junit.jupiter.Container; diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineTest.java 
b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineTest.java new file mode 100644 index 00000000..06cc658c --- /dev/null +++ b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineTest.java @@ -0,0 +1,174 @@ +package nu.marginalia.mqsm; + +import com.google.gson.GsonBuilder; +import com.zaxxer.hikari.HikariConfig; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.mq.MqMessageRow; +import nu.marginalia.mq.MqMessageState; +import nu.marginalia.mq.MqTestUtil; +import nu.marginalia.mq.persistence.MqPersistence; +import org.junit.jupiter.api.*; +import org.testcontainers.containers.MariaDBContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +import java.util.List; +import java.util.UUID; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +@Tag("slow") +@Testcontainers +public class StateMachineTest { + @Container + static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") + .withDatabaseName("WMSA_prod") + .withUsername("wmsa") + .withPassword("wmsa") + .withInitScript("sql/current/11-message-queue.sql") + .withNetworkAliases("mariadb"); + + static HikariDataSource dataSource; + static MqPersistence persistence; + private String inboxId; + + @BeforeEach + public void setUp() { + inboxId = UUID.randomUUID().toString(); + } + @BeforeAll + public static void setUpAll() { + HikariConfig config = new HikariConfig(); + config.setJdbcUrl(mariaDBContainer.getJdbcUrl()); + config.setUsername("wmsa"); + config.setPassword("wmsa"); + + dataSource = new HikariDataSource(config); + persistence = new MqPersistence(dataSource); + } + + @AfterAll + public static void tearDownAll() { + dataSource.close(); + } + + @Test + public void testStartStopStartStop() throws Exception { + var sm = new StateMachine(persistence, inboxId, UUID.randomUUID()); + var stateFactory = new StateFactory(new GsonBuilder().create()); + + var initial = 
stateFactory.create("INITIAL", () -> stateFactory.transition("GREET", "World")); + + var greet = stateFactory.create("GREET", String.class, (String message) -> { + System.out.println("Hello, " + message + "!"); + return stateFactory.transition("COUNT-TO-FIVE", 0); + }); + + var ctf = stateFactory.create("COUNT-TO-FIVE", Integer.class, (Integer count) -> { + System.out.println(count); + if (count < 5) { + return stateFactory.transition("COUNT-TO-FIVE", count + 1); + } else { + return stateFactory.transition("END"); + } + }); + + sm.registerStates(initial, greet, ctf); + + sm.init(); + + Thread.sleep(300); + sm.stop(); + + var sm2 = new StateMachine(persistence, inboxId, UUID.randomUUID()); + sm2.registerStates(initial, greet, ctf); + sm2.resume(); + sm2.join(); + sm2.stop(); + + MqTestUtil.getMessages(dataSource, inboxId).forEach(System.out::println); + } + + @Test + public void smResumeFromNew() throws Exception { + var sm = new StateMachine(persistence, inboxId, UUID.randomUUID()); + var stateFactory = new StateFactory(new GsonBuilder().create()); + + var initial = stateFactory.create("INITIAL", () -> stateFactory.transition("A")); + var stateA = stateFactory.create("A", () -> stateFactory.transition("B")); + var stateB = stateFactory.create("B", () -> stateFactory.transition("C")); + var stateC = stateFactory.create("C", () -> stateFactory.transition("END")); + + sm.registerStates(initial, stateA, stateB, stateC); + persistence.sendNewMessage(inboxId, null,"B", "", null); + + sm.resume(); + + sm.join(); + sm.stop(); + + List states = MqTestUtil.getMessages(dataSource, inboxId) + .stream() + .peek(System.out::println) + .map(MqMessageRow::function) + .toList(); + + assertEquals(List.of("B", "C", "END"), states); + } + + @Test + public void smResumeFromAck() throws Exception { + var sm = new StateMachine(persistence, inboxId, UUID.randomUUID()); + var stateFactory = new StateFactory(new GsonBuilder().create()); + + var initial = stateFactory.create("INITIAL", () -> 
stateFactory.transition("A")); + var stateA = stateFactory.create("A", () -> stateFactory.transition("B")); + var stateB = stateFactory.create("B", () -> stateFactory.transition("C")); + var stateC = stateFactory.create("C", () -> stateFactory.transition("END")); + + sm.registerStates(initial, stateA, stateB, stateC); + + long id = persistence.sendNewMessage(inboxId, null,"B", "", null); + persistence.updateMessageState(id, MqMessageState.ACK); + + sm.resume(); + + sm.join(); + sm.stop(); + + List states = MqTestUtil.getMessages(dataSource, inboxId) + .stream() + .peek(System.out::println) + .map(MqMessageRow::function) + .toList(); + + assertEquals(List.of("B", "C", "END"), states); + } + + + @Test + public void smResumeEmptyQueue() throws Exception { + var sm = new StateMachine(persistence, inboxId, UUID.randomUUID()); + var stateFactory = new StateFactory(new GsonBuilder().create()); + + var initial = stateFactory.create("INITIAL", () -> stateFactory.transition("A")); + var stateA = stateFactory.create("A", () -> stateFactory.transition("B")); + var stateB = stateFactory.create("B", () -> stateFactory.transition("C")); + var stateC = stateFactory.create("C", () -> stateFactory.transition("END")); + + sm.registerStates(initial, stateA, stateB, stateC); + + sm.resume(); + + sm.join(); + sm.stop(); + + List states = MqTestUtil.getMessages(dataSource, inboxId) + .stream() + .peek(System.out::println) + .map(MqMessageRow::function) + .toList(); + + assertEquals(List.of("INITIAL", "A", "B", "C", "END"), states); + } +} From 097a163cf580257957ab32aeec5d0de87341d91b Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 4 Jul 2023 18:25:42 +0200 Subject: [PATCH 004/157] Getting a skeleton in place for the control service. 
--- .../control-service/build.gradle | 1 + .../nu/marginalia/control/ControlService.java | 22 +++++++- .../marginalia/control/EventLogService.java | 49 ++++++++++++++++++ .../control/model/EventLogEntry.java | 10 ++++ .../control/model/ServiceHeartbeat.java | 3 ++ .../main/resources/static/control/style.css | 4 ++ .../resources/templates/control/index.hdb | 14 ++++++ .../templates/control/partials/nav.hdb | 7 +++ .../resources/templates/control/services.hdb | 50 +++++++++++++++++++ 9 files changed, 158 insertions(+), 2 deletions(-) create mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/EventLogService.java create mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/EventLogEntry.java create mode 100644 code/services-satellite/control-service/src/main/resources/static/control/style.css create mode 100644 code/services-satellite/control-service/src/main/resources/templates/control/index.hdb create mode 100644 code/services-satellite/control-service/src/main/resources/templates/control/partials/nav.hdb create mode 100644 code/services-satellite/control-service/src/main/resources/templates/control/services.hdb diff --git a/code/services-satellite/control-service/build.gradle b/code/services-satellite/control-service/build.gradle index 1bb9bfdc..f8ed32e0 100644 --- a/code/services-satellite/control-service/build.gradle +++ b/code/services-satellite/control-service/build.gradle @@ -26,6 +26,7 @@ dependencies { implementation project(':code:common:model') implementation project(':code:common:service') implementation project(':code:common:config') + implementation project(':code:common:renderer') implementation project(':code:common:service-discovery') implementation project(':code:common:service-client') implementation project(':code:api:search-api') diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java 
b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java index 952559e0..c8ded478 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java @@ -4,35 +4,53 @@ import com.google.gson.Gson; import com.google.inject.Inject; import nu.marginalia.client.ServiceMonitors; import nu.marginalia.model.gson.GsonFactory; +import nu.marginalia.renderer.MustacheRenderer; +import nu.marginalia.renderer.RendererFactory; import nu.marginalia.service.server.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import spark.Spark; +import java.io.IOException; +import java.util.Map; + public class ControlService extends Service { private final Logger logger = LoggerFactory.getLogger(getClass()); private final Gson gson = GsonFactory.get(); private final ServiceMonitors monitors; + private final MustacheRenderer indexRenderer; + private final MustacheRenderer> servicesRenderer; @Inject public ControlService(BaseServiceParams params, ServiceMonitors monitors, - HeartbeatService heartbeatService - ) { + HeartbeatService heartbeatService, + EventLogService eventLogService, + RendererFactory rendererFactory + ) throws IOException { super(params); this.monitors = monitors; + indexRenderer = rendererFactory.renderer("control/index"); + servicesRenderer = rendererFactory.renderer("control/services"); Spark.get("/public/heartbeats", (req, res) -> { res.type("application/json"); return heartbeatService.getHeartbeats(); }, gson::toJson); + Spark.get("/public/", (req, rsp) -> indexRenderer.render(Map.of())); + Spark.get("/public/services", (req, rsp) -> servicesRenderer.render( + Map.of("heartbeats", heartbeatService.getHeartbeats(), + "events", eventLogService.getLastEntries(100) + ))); + monitors.subscribe(this::logMonitorStateChange); + } private void logMonitorStateChange() { diff --git 
a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/EventLogService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/EventLogService.java new file mode 100644 index 00000000..842fe86e --- /dev/null +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/EventLogService.java @@ -0,0 +1,49 @@ +package nu.marginalia.control; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.control.model.EventLogEntry; + +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.List; + +@Singleton +public class EventLogService { + + private final HikariDataSource dataSource; + + @Inject + public EventLogService(HikariDataSource dataSource) { + this.dataSource = dataSource; + } + + public List getLastEntries(int n) { + try (var conn = dataSource.getConnection(); + var query = conn.prepareStatement(""" + SELECT SERVICE_NAME, INSTANCE, EVENT_TIME, EVENT_TYPE, EVENT_MESSAGE + FROM PROC_SERVICE_EVENTLOG ORDER BY ID DESC LIMIT ? 
+ """)) { + + query.setInt(1, n); + List entries = new ArrayList<>(n); + var rs = query.executeQuery(); + while (rs.next()) { + entries.add(new EventLogEntry( + rs.getString("SERVICE_NAME"), + rs.getString("INSTANCE"), + rs.getTimestamp("EVENT_TIME").toLocalDateTime().toLocalTime().toString(), + rs.getString("EVENT_TYPE"), + rs.getString("EVENT_MESSAGE") + )); + } + return entries; + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + } + + +} diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/EventLogEntry.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/EventLogEntry.java new file mode 100644 index 00000000..65de7699 --- /dev/null +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/EventLogEntry.java @@ -0,0 +1,10 @@ +package nu.marginalia.control.model; + +public record EventLogEntry( + String serviceName, + String instance, + String eventTime, + String eventType, + String eventMessage) +{ +} diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ServiceHeartbeat.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ServiceHeartbeat.java index cc0dcef4..dcb4d94e 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ServiceHeartbeat.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ServiceHeartbeat.java @@ -7,5 +7,8 @@ public record ServiceHeartbeat( double lastSeenMillis, boolean alive ) { + public boolean isMissing() { + return lastSeenMillis > 10000; + } } diff --git a/code/services-satellite/control-service/src/main/resources/static/control/style.css b/code/services-satellite/control-service/src/main/resources/static/control/style.css new file mode 100644 index 00000000..6bd9166e --- /dev/null +++ 
b/code/services-satellite/control-service/src/main/resources/static/control/style.css @@ -0,0 +1,4 @@ +body { + font-family: serif; + line-height: 1.6; +} \ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/index.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/index.hdb new file mode 100644 index 00000000..701ed915 --- /dev/null +++ b/code/services-satellite/control-service/src/main/resources/templates/control/index.hdb @@ -0,0 +1,14 @@ + + + + Control Service + + + + + {{> control/partials/nav}} +
+

Overview

+
+ + \ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/partials/nav.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/partials/nav.hdb new file mode 100644 index 00000000..9b68f4b2 --- /dev/null +++ b/code/services-satellite/control-service/src/main/resources/templates/control/partials/nav.hdb @@ -0,0 +1,7 @@ + \ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/services.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/services.hdb new file mode 100644 index 00000000..5b5febf2 --- /dev/null +++ b/code/services-satellite/control-service/src/main/resources/templates/control/services.hdb @@ -0,0 +1,50 @@ + + + + Control Service + + + + + {{> control/partials/nav}} + +
+

Services

+ + + + + + + {{#each heartbeats}} + + + + + + {{/each}} +
Service IDUUIDLast Seen (ms)
{{serviceId}}{{uuid}}{{lastSeenMillis}}
+ +

Events

+ + + + + + + + + + {{#each events}} + + + + + + + + {{/each}} +
Service NameInstanceEvent TimeTypeMessage
{{serviceName}}{{instance}}{{eventTime}}{{eventType}}{{eventMessage}}
+
+ + \ No newline at end of file From 7a17933c65252a59ee4a7a993c2c48c421dfca72 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 4 Jul 2023 19:52:30 +0200 Subject: [PATCH 005/157] Control service owns message queue garbage collection. --- .../control-service/build.gradle | 1 + .../nu/marginalia/control/ControlService.java | 30 ++++++++++++++++++- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/code/services-satellite/control-service/build.gradle b/code/services-satellite/control-service/build.gradle index f8ed32e0..6bfffcaa 100644 --- a/code/services-satellite/control-service/build.gradle +++ b/code/services-satellite/control-service/build.gradle @@ -27,6 +27,7 @@ dependencies { implementation project(':code:common:service') implementation project(':code:common:config') implementation project(':code:common:renderer') + implementation project(':code:common:message-queue') implementation project(':code:common:service-discovery') implementation project(':code:common:service-client') implementation project(':code:api:search-api') diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java index c8ded478..40559746 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java @@ -4,6 +4,7 @@ import com.google.gson.Gson; import com.google.inject.Inject; import nu.marginalia.client.ServiceMonitors; import nu.marginalia.model.gson.GsonFactory; +import nu.marginalia.mq.persistence.MqPersistence; import nu.marginalia.renderer.MustacheRenderer; import nu.marginalia.renderer.RendererFactory; import nu.marginalia.service.server.*; @@ -13,6 +14,7 @@ import spark.Spark; import java.io.IOException; import java.util.Map; +import java.util.concurrent.TimeUnit; public class 
ControlService extends Service { @@ -22,6 +24,7 @@ public class ControlService extends Service { private final ServiceMonitors monitors; private final MustacheRenderer indexRenderer; private final MustacheRenderer> servicesRenderer; + private final MqPersistence messageQueuePersistence; @Inject @@ -29,13 +32,15 @@ public class ControlService extends Service { ServiceMonitors monitors, HeartbeatService heartbeatService, EventLogService eventLogService, - RendererFactory rendererFactory + RendererFactory rendererFactory, + MqPersistence messageQueuePersistence ) throws IOException { super(params); this.monitors = monitors; indexRenderer = rendererFactory.renderer("control/index"); servicesRenderer = rendererFactory.renderer("control/services"); + this.messageQueuePersistence = messageQueuePersistence; Spark.get("/public/heartbeats", (req, res) -> { res.type("application/json"); @@ -50,7 +55,30 @@ public class ControlService extends Service { monitors.subscribe(this::logMonitorStateChange); + Thread reaperThread = new Thread(this::reapMessageQueue, "message-queue-reaper"); + reaperThread.setDaemon(true); + reaperThread.start(); + } + private void reapMessageQueue() { + + for (;;) { + try { + TimeUnit.MINUTES.sleep(30); + + int outcome = messageQueuePersistence.reapDeadMessages(); + if (outcome > 0) { + logger.info("Reaped {} dead messages from message queue", outcome); + } + } + catch (InterruptedException ex) { + logger.info("Message queue reaper interrupted"); + return; + } + catch (Exception ex) { + logger.error("Message queue reaper failed", ex); + } + } } private void logMonitorStateChange() { From 979a620ead05f41d8f6b2f3f54bddd88bbe96005 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 3 Jul 2023 11:06:39 +0200 Subject: [PATCH 006/157] Bugfix where DocumentGeneratorExtractor out of bounded for generators starting with 'microsoft' or 'adobe' but having no followup string. 
--- .../processor/logic/DocumentGeneratorExtractor.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java index 5ffa11df..1f9c11eb 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java @@ -56,7 +56,12 @@ public class DocumentGeneratorExtractor { return DocumentGenerator.of(parts[0]); case "adobe": case "microsoft": - return DocumentGenerator.of(parts[1]); + if (parts.length > 1) { + return DocumentGenerator.of(parts[1]); + } + else { + return DocumentGenerator.of(parts[0]); + } } if (parts.length > 1) { From 2cb209ae9c2a3295cd63b908fa0f2f5230ed213c Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 3 Jul 2023 11:29:27 +0200 Subject: [PATCH 007/157] Better wordpress fingerprinting --- .../processor/logic/DocumentGeneratorExtractor.java | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java index 1f9c11eb..0fa3de61 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java @@ -87,10 +87,19 @@ public class DocumentGeneratorExtractor { } } - for (var scriptTags : doc.head().select("script")) { - if 
(scriptTags.html().contains("window.lemmyConfig")) { + for (var tag : doc.head().getElementsByTag("script")) { + if (tag.html().contains("window.lemmyConfig")) { return DocumentGenerator.of("lemmy"); } + if (tag.attr("src").contains("wp-content")) { + return DocumentGenerator.of("wordpress", "wordpress-sneaky"); + } + } + + for (var tag : doc.head().getElementsByTag("link")) { + if (tag.attr("href").contains("wp-content")) { + return DocumentGenerator.of("wordpress", "wordpress-sneaky"); + } } if (doc.getElementById("flarum-json-payload") != null) { From 78f21dd19ac0a05347eaed4d643cc823717d408e Mon Sep 17 00:00:00 2001 From: Adrthegamedev <46429482+Adrthegamedev@users.noreply.github.com> Date: Mon, 3 Jul 2023 13:00:05 +0200 Subject: [PATCH 008/157] (an attempt to) Add wikidot to wiki generators list --- .../processor/logic/DocumentGeneratorExtractor.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java index 0fa3de61..ac1c15a2 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java @@ -91,6 +91,9 @@ public class DocumentGeneratorExtractor { if (tag.html().contains("window.lemmyConfig")) { return DocumentGenerator.of("lemmy"); } + if (tag.html().contains("URL_DOMAIN = 'wikidot.com'")) { + return DocumentGenerator.of("wikidot"); + } if (tag.attr("src").contains("wp-content")) { return DocumentGenerator.of("wordpress", "wordpress-sneaky"); } @@ -193,7 +196,7 @@ public class DocumentGeneratorExtractor { case "vbulletin", "phpbb", "mybb", "nodebb", "flarum", "tribe", "discourse", "lemmy", "xenforo", 
"invision" -> GeneratorType.FORUM; - case "mediawiki", "dokuwiki", "sharepoint" + case "mediawiki", "dokuwiki", "wikidot", "sharepoint" -> GeneratorType.WIKI; case "pandoc", "mkdocs", "doxygen", "javadoc" -> GeneratorType.DOCS; From 413dc6ced40fff0af390d7c8e03b36c774150c05 Mon Sep 17 00:00:00 2001 From: Viktor Date: Tue, 4 Jul 2023 18:46:58 +0200 Subject: [PATCH 009/157] Update FUNDING.yml --- .github/FUNDING.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml index 6729104a..45656388 100644 --- a/.github/FUNDING.yml +++ b/.github/FUNDING.yml @@ -1,6 +1,6 @@ # These are supported funding model platforms -github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] +github: MarginaliaSearch patreon: marginalia_nu open_collective: # Replace with a single Open Collective username ko_fi: # Replace with a single Ko-fi username From d89db10645138c65a07be4b3db2a137b51ef1477 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 6 Jul 2023 13:02:16 +0200 Subject: [PATCH 010/157] MQFSM Usability WIP --- .../java/nu/marginalia/mq/inbox/MqInbox.java | 10 +- .../java/nu/marginalia/mqsm/StateFactory.java | 15 +- .../java/nu/marginalia/mqsm/StateMachine.java | 49 +++-- .../mqsm/graph/ControlFlowException.java | 21 ++ .../nu/marginalia/mqsm/graph/GraphState.java | 14 ++ .../nu/marginalia/mqsm/graph/StateGraph.java | 121 +++++++++++ .../marginalia/mqsm/graph/TerminalState.java | 9 + .../nu/marginalia/mqsm/state/ErrorState.java | 3 + .../nu/marginalia/mqsm/state/FinalState.java | 3 + .../marginalia/mqsm/state/MachineState.java | 1 + .../marginalia/mqsm/state/ResumeBehavior.java | 6 + .../marginalia/mqsm/state/ResumingState.java | 3 + .../mqsm/StateMachineErrorTest.java | 100 +++++++++ .../mqsm/StateMachineResumeTest.java | 191 ++++++++++++++++++ .../nu/marginalia/mqsm/StateMachineTest.java | 135 +++++-------- 15 files changed, 574 insertions(+), 107 deletions(-) create mode 100644 
code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/ControlFlowException.java create mode 100644 code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/GraphState.java create mode 100644 code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/StateGraph.java create mode 100644 code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/TerminalState.java create mode 100644 code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/ResumeBehavior.java create mode 100644 code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineErrorTest.java create mode 100644 code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineResumeTest.java diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInbox.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInbox.java index 00b30cad..20184f32 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInbox.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInbox.java @@ -38,7 +38,15 @@ public class MqInbox { String inboxName, UUID instanceUUID) { - this.threadPool = Executors.newCachedThreadPool(); + this(persistence, inboxName, instanceUUID, Executors.newCachedThreadPool()); + } + + public MqInbox(MqPersistence persistence, + String inboxName, + UUID instanceUUID, + ExecutorService executorService) + { + this.threadPool = executorService; this.persistence = persistence; this.inboxName = inboxName; this.instanceUUID = instanceUUID.toString(); diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateFactory.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateFactory.java index 8dccde4b..09c02ea7 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateFactory.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateFactory.java @@ -4,6 +4,7 @@ import com.google.gson.Gson; import 
com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.mqsm.state.MachineState; +import nu.marginalia.mqsm.state.ResumeBehavior; import nu.marginalia.mqsm.state.StateTransition; import java.util.function.Function; @@ -18,7 +19,7 @@ public class StateFactory { this.gson = gson; } - public MachineState create(String name, Class param, Function logic) { + public MachineState create(String name, ResumeBehavior resumeBehavior, Class param, Function logic) { return new MachineState() { @Override public String name() { @@ -30,6 +31,11 @@ public class StateFactory { return logic.apply(gson.fromJson(message, param)); } + @Override + public ResumeBehavior resumeBehavior() { + return resumeBehavior; + } + @Override public boolean isFinal() { return false; @@ -37,7 +43,7 @@ public class StateFactory { }; } - public MachineState create(String name, Supplier logic) { + public MachineState create(String name, ResumeBehavior resumeBehavior, Supplier logic) { return new MachineState() { @Override public String name() { @@ -49,6 +55,11 @@ public class StateFactory { return logic.get(); } + @Override + public ResumeBehavior resumeBehavior() { + return resumeBehavior; + } + @Override public boolean isFinal() { return false; diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java index cb7d1f33..827005ed 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java @@ -7,6 +7,7 @@ import nu.marginalia.mq.inbox.MqInboxResponse; import nu.marginalia.mq.inbox.MqSubscription; import nu.marginalia.mq.outbox.MqOutbox; import nu.marginalia.mq.persistence.MqPersistence; +import nu.marginalia.mqsm.graph.StateGraph; import nu.marginalia.mqsm.state.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -15,6 +16,7 @@ import 
java.util.HashMap; import java.util.List; import java.util.Map; import java.util.UUID; +import java.util.concurrent.Executors; /** A state machine that can be used to implement a finite state machine * using a message queue as the persistence layer. The state machine is @@ -37,7 +39,7 @@ public class StateMachine { public StateMachine(MqPersistence persistence, String queueName, UUID instanceUUID) { this.queueName = queueName; - smInbox = new MqInbox(persistence, queueName, instanceUUID); + smInbox = new MqInbox(persistence, queueName, instanceUUID, Executors.newSingleThreadExecutor()); smOutbox = new MqOutbox(persistence, queueName, instanceUUID); smInbox.subscribe(new StateEventSubscription()); @@ -63,6 +65,11 @@ public class StateMachine { } } + /** Register the state graph */ + public void registerStates(StateGraph states) { + registerStates(states.asStateList()); + } + /** Wait for the state machine to reach a final state. * (possibly forever, halting problem and so on) */ @@ -94,29 +101,33 @@ public class StateMachine { /** Resume the state machine from the last known state. 
*/ public void resume() throws Exception { - if (state == null) { - var messages = smInbox.replay(1); + if (state != null) { + return; + } - if (messages.isEmpty()) { - init(); - } else { - var firstMessage = messages.get(0); + var messages = smInbox.replay(1); + if (messages.isEmpty()) { + init(); + return; + } - smInbox.start(); + var firstMessage = messages.get(0); + var resumeState = allStates.get(firstMessage.function()); - logger.info("Resuming state machine from {}({})/{}", firstMessage.function(), firstMessage.payload(), firstMessage.state()); + smInbox.start(); + logger.info("Resuming state machine from {}({})/{}", firstMessage.function(), firstMessage.payload(), firstMessage.state()); - if (firstMessage.state() == MqMessageState.NEW) { - // The message is not acknowledged, so starting the inbox will trigger a state transition - // - // We still need to set a state here so that the join() method works + if (firstMessage.state() == MqMessageState.NEW) { + // The message is not acknowledged, so starting the inbox will trigger a state transition + // We still need to set a state here so that the join() method works - state = resumingState; - } else { - // The message is already acknowledged, so we replay the last state - onStateTransition(firstMessage.function(), firstMessage.payload()); - } - } + state = resumingState; + } else if (resumeState.resumeBehavior().equals(ResumeBehavior.ERROR)) { + // The message is acknowledged, but the state does not support resuming + smOutbox.notify("ERROR", "Illegal resumption from ACK'ed state " + firstMessage.function()); + } else { + // The message is already acknowledged, so we replay the last state + onStateTransition(firstMessage.function(), firstMessage.payload()); } } diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/ControlFlowException.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/ControlFlowException.java new file mode 100644 index 00000000..aece44ea --- 
/dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/ControlFlowException.java @@ -0,0 +1,21 @@ +package nu.marginalia.mqsm.graph; + +class ControlFlowException extends RuntimeException { + private final String state; + private final Object payload; + + public ControlFlowException(String state, Object payload) { + this.state = state; + this.payload = payload; + } + + public String getState() { + return state; + } + + public Object getPayload() { + return payload; + } + + public StackTraceElement[] getStackTrace() { return new StackTraceElement[0]; } +} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/GraphState.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/GraphState.java new file mode 100644 index 00000000..b79b71aa --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/GraphState.java @@ -0,0 +1,14 @@ +package nu.marginalia.mqsm.graph; + + +import nu.marginalia.mqsm.state.ResumeBehavior; + +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; + +@Retention(RetentionPolicy.RUNTIME) +public @interface GraphState { + String name(); + String next() default "ERROR"; + ResumeBehavior resume() default ResumeBehavior.ERROR; +} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/StateGraph.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/StateGraph.java new file mode 100644 index 00000000..df8f4318 --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/StateGraph.java @@ -0,0 +1,121 @@ +package nu.marginalia.mqsm.graph; + +import nu.marginalia.mqsm.StateFactory; +import nu.marginalia.mqsm.state.MachineState; +import nu.marginalia.mqsm.state.StateTransition; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; +import java.util.ArrayList; +import 
java.util.Arrays; +import java.util.List; + +public abstract class StateGraph { + private final StateFactory stateFactory; + private static final Logger logger = LoggerFactory.getLogger(StateGraph.class); + + public StateGraph(StateFactory stateFactory) { + this.stateFactory = stateFactory; + } + + public void transition(String state) { + throw new ControlFlowException(state, ""); + } + + public void transition(String state, T payload) { + throw new ControlFlowException(state, payload); + } + + public void error() { + throw new ControlFlowException("ERROR", ""); + } + public void error(T payload) { + throw new ControlFlowException("ERROR", payload); + } + public void error(Exception ex) { + throw new ControlFlowException("ERROR", ex.getClass().getSimpleName() + ":" + ex.getMessage()); + } + + public List asStateList() { + List ret = new ArrayList<>(); + + for (var method : getClass().getMethods()) { + var gs = method.getAnnotation(GraphState.class); + if (gs != null) { + ret.add(graphState(method, gs)); + } + } + + return ret; + } + + private MachineState graphState(Method method, GraphState gs) { + + var parameters = method.getParameterTypes(); + boolean returnsVoid = method.getGenericReturnType().equals(Void.TYPE); + + if (parameters.length == 0) { + return stateFactory.create(gs.name(), gs.resume(), () -> { + try { + if (returnsVoid) { + method.invoke(this); + return StateTransition.to(gs.next()); + } else { + Object ret = method.invoke(this); + return stateFactory.transition(gs.next(), ret); + } + } + catch (Exception e) { + return invocationExceptionToStateTransition(gs.name(), e); + } + }); + } + else if (parameters.length == 1) { + return stateFactory.create(gs.name(), gs.resume(), parameters[0], (param) -> { + try { + if (returnsVoid) { + method.invoke(this, param); + return StateTransition.to(gs.next()); + } else { + Object ret = method.invoke(this, param); + return stateFactory.transition(gs.next(), ret); + } + } catch (Exception e) { + return 
invocationExceptionToStateTransition(gs.name(), e); + } + }); + } + else { + // We permit only @GraphState-annotated methods like this: + // + // void foo(); + // void foo(Object bar); + // Object foo(); + // Object foo(Object bar); + + throw new IllegalStateException("StateGraph " + + getClass().getSimpleName() + + " has invalid method signature for method " + + method.getName() + + ": Expected 0 or 1 parameter(s) but found " + + Arrays.toString(parameters)); + } + } + + private StateTransition invocationExceptionToStateTransition(String state, Throwable ex) { + while (ex instanceof InvocationTargetException e) { + if (e.getCause() != null) ex = ex.getCause(); + } + + if (ex instanceof ControlFlowException cfe) { + return stateFactory.transition(cfe.getState(), cfe.getPayload()); + } else { + logger.error("Error in state invocation " + state, ex); + return StateTransition.to("ERROR", + "Exception: " + ex.getClass().getSimpleName() + "/" + ex.getMessage()); + } + } + +} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/TerminalState.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/TerminalState.java new file mode 100644 index 00000000..5ae062b7 --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/TerminalState.java @@ -0,0 +1,9 @@ +package nu.marginalia.mqsm.graph; + +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; + +@Retention(RetentionPolicy.RUNTIME) +public @interface TerminalState { + String name(); +} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/ErrorState.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/ErrorState.java index 4f1fef96..dcb19125 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/ErrorState.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/ErrorState.java @@ -9,6 +9,9 @@ public class ErrorState implements MachineState { 
throw new UnsupportedOperationException(); } + @Override + public ResumeBehavior resumeBehavior() { return ResumeBehavior.RETRY; } + @Override public boolean isFinal() { return true; } } diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/FinalState.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/FinalState.java index 5ee7d435..dc2362fe 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/FinalState.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/FinalState.java @@ -9,6 +9,9 @@ public class FinalState implements MachineState { throw new UnsupportedOperationException(); } + @Override + public ResumeBehavior resumeBehavior() { return ResumeBehavior.RETRY; } + @Override public boolean isFinal() { return true; } } diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/MachineState.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/MachineState.java index 4bba33cf..11efc7c5 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/MachineState.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/MachineState.java @@ -4,5 +4,6 @@ public interface MachineState { String name(); StateTransition next(String message); + ResumeBehavior resumeBehavior(); boolean isFinal(); } diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/ResumeBehavior.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/ResumeBehavior.java new file mode 100644 index 00000000..a82446f8 --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/ResumeBehavior.java @@ -0,0 +1,6 @@ +package nu.marginalia.mqsm.state; + +public enum ResumeBehavior { + RETRY, + ERROR +} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/ResumingState.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/ResumingState.java index 
36a474e2..ce01bb79 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/ResumingState.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/ResumingState.java @@ -9,6 +9,9 @@ public class ResumingState implements MachineState { throw new UnsupportedOperationException(); } + @Override + public ResumeBehavior resumeBehavior() { return ResumeBehavior.RETRY; } + @Override public boolean isFinal() { return false; } } diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineErrorTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineErrorTest.java new file mode 100644 index 00000000..6c6298eb --- /dev/null +++ b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineErrorTest.java @@ -0,0 +1,100 @@ +package nu.marginalia.mqsm; + +import com.google.gson.GsonBuilder; +import com.zaxxer.hikari.HikariConfig; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.mq.MqMessageRow; +import nu.marginalia.mq.MqMessageState; +import nu.marginalia.mq.MqTestUtil; +import nu.marginalia.mq.persistence.MqPersistence; +import nu.marginalia.mqsm.graph.GraphState; +import nu.marginalia.mqsm.graph.StateGraph; +import nu.marginalia.mqsm.state.ResumeBehavior; +import org.junit.jupiter.api.*; +import org.testcontainers.containers.MariaDBContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +import java.util.List; +import java.util.UUID; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +@Tag("slow") +@Testcontainers +public class StateMachineErrorTest { + @Container + static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") + .withDatabaseName("WMSA_prod") + .withUsername("wmsa") + .withPassword("wmsa") + .withInitScript("sql/current/11-message-queue.sql") + .withNetworkAliases("mariadb"); + + static HikariDataSource dataSource; + static MqPersistence persistence; 
+ private String inboxId; + + @BeforeEach + public void setUp() { + inboxId = UUID.randomUUID().toString(); + } + @BeforeAll + public static void setUpAll() { + HikariConfig config = new HikariConfig(); + config.setJdbcUrl(mariaDBContainer.getJdbcUrl()); + config.setUsername("wmsa"); + config.setPassword("wmsa"); + + dataSource = new HikariDataSource(config); + persistence = new MqPersistence(dataSource); + } + + @AfterAll + public static void tearDownAll() { + dataSource.close(); + } + + public static class ErrorHurdles extends StateGraph { + + public ErrorHurdles(StateFactory stateFactory) { + super(stateFactory); + } + + @GraphState(name = "INITIAL", next = "FAILING") + public void initial() { + + } + @GraphState(name = "FAILING", next = "OK", resume = ResumeBehavior.RETRY) + public void resumable() { + throw new RuntimeException("Boom!"); + } + @GraphState(name = "OK", next = "END") + public void ok() { + + } + + } + + @Test + public void smResumeResumableFromNew() throws Exception { + var sm = new StateMachine(persistence, inboxId, UUID.randomUUID()); + var stateFactory = new StateFactory(new GsonBuilder().create()); + + sm.registerStates(new ErrorHurdles(stateFactory).asStateList()); + + sm.init(); + + sm.join(); + sm.stop(); + + List states = MqTestUtil.getMessages(dataSource, inboxId) + .stream() + .peek(System.out::println) + .map(MqMessageRow::function) + .toList(); + + assertEquals(List.of("INITIAL", "FAILING", "ERROR"), states); + } + +} diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineResumeTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineResumeTest.java new file mode 100644 index 00000000..6913e13a --- /dev/null +++ b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineResumeTest.java @@ -0,0 +1,191 @@ +package nu.marginalia.mqsm; + +import com.google.gson.GsonBuilder; +import com.zaxxer.hikari.HikariConfig; +import com.zaxxer.hikari.HikariDataSource; +import 
nu.marginalia.mq.MqMessageRow; +import nu.marginalia.mq.MqMessageState; +import nu.marginalia.mq.MqTestUtil; +import nu.marginalia.mq.persistence.MqPersistence; +import nu.marginalia.mqsm.graph.GraphState; +import nu.marginalia.mqsm.graph.StateGraph; +import nu.marginalia.mqsm.state.ResumeBehavior; +import org.junit.jupiter.api.*; +import org.testcontainers.containers.MariaDBContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +import java.util.List; +import java.util.UUID; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +@Tag("slow") +@Testcontainers +public class StateMachineResumeTest { + @Container + static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") + .withDatabaseName("WMSA_prod") + .withUsername("wmsa") + .withPassword("wmsa") + .withInitScript("sql/current/11-message-queue.sql") + .withNetworkAliases("mariadb"); + + static HikariDataSource dataSource; + static MqPersistence persistence; + private String inboxId; + + @BeforeEach + public void setUp() { + inboxId = UUID.randomUUID().toString(); + } + @BeforeAll + public static void setUpAll() { + HikariConfig config = new HikariConfig(); + config.setJdbcUrl(mariaDBContainer.getJdbcUrl()); + config.setUsername("wmsa"); + config.setPassword("wmsa"); + + dataSource = new HikariDataSource(config); + persistence = new MqPersistence(dataSource); + } + + @AfterAll + public static void tearDownAll() { + dataSource.close(); + } + + public static class ResumeTrialsGraph extends StateGraph { + + public ResumeTrialsGraph(StateFactory stateFactory) { + super(stateFactory); + } + + @GraphState(name = "INITIAL", next = "RESUMABLE") + public void initial() {} + @GraphState(name = "RESUMABLE", next = "NON-RESUMABLE", resume = ResumeBehavior.RETRY) + public void resumable() {} + @GraphState(name = "NON-RESUMABLE", next = "OK", resume = ResumeBehavior.ERROR) + public void nonResumable() {} + + @GraphState(name = 
"OK", next = "END") + public void ok() {} + + } + + @Test + public void smResumeResumableFromNew() throws Exception { + var sm = new StateMachine(persistence, inboxId, UUID.randomUUID()); + var stateFactory = new StateFactory(new GsonBuilder().create()); + + sm.registerStates(new ResumeTrialsGraph(stateFactory).asStateList()); + + persistence.sendNewMessage(inboxId, null,"RESUMABLE", "", null); + + sm.resume(); + + sm.join(); + sm.stop(); + + List states = MqTestUtil.getMessages(dataSource, inboxId) + .stream() + .peek(System.out::println) + .map(MqMessageRow::function) + .toList(); + + assertEquals(List.of("RESUMABLE", "NON-RESUMABLE", "OK", "END"), states); + } + + @Test + public void smResumeFromAck() throws Exception { + var sm = new StateMachine(persistence, inboxId, UUID.randomUUID()); + var stateFactory = new StateFactory(new GsonBuilder().create()); + + sm.registerStates(new ResumeTrialsGraph(stateFactory)); + + long id = persistence.sendNewMessage(inboxId, null,"RESUMABLE", "", null); + persistence.updateMessageState(id, MqMessageState.ACK); + + sm.resume(); + + sm.join(); + sm.stop(); + + List states = MqTestUtil.getMessages(dataSource, inboxId) + .stream() + .peek(System.out::println) + .map(MqMessageRow::function) + .toList(); + + assertEquals(List.of("RESUMABLE", "NON-RESUMABLE", "OK", "END"), states); + } + + + @Test + public void smResumeNonResumableFromNew() throws Exception { + var sm = new StateMachine(persistence, inboxId, UUID.randomUUID()); + var stateFactory = new StateFactory(new GsonBuilder().create()); + + sm.registerStates(new ResumeTrialsGraph(stateFactory)); + + persistence.sendNewMessage(inboxId, null,"NON-RESUMABLE", "", null); + + sm.resume(); + + sm.join(); + sm.stop(); + + List states = MqTestUtil.getMessages(dataSource, inboxId) + .stream() + .peek(System.out::println) + .map(MqMessageRow::function) + .toList(); + + assertEquals(List.of("NON-RESUMABLE", "OK", "END"), states); + } + + @Test + public void 
smResumeNonResumableFromAck() throws Exception { + var sm = new StateMachine(persistence, inboxId, UUID.randomUUID()); + var stateFactory = new StateFactory(new GsonBuilder().create()); + + sm.registerStates(new ResumeTrialsGraph(stateFactory)); + + long id = persistence.sendNewMessage(inboxId, null,"NON-RESUMABLE", "", null); + persistence.updateMessageState(id, MqMessageState.ACK); + + sm.resume(); + + sm.join(); + sm.stop(); + + List states = MqTestUtil.getMessages(dataSource, inboxId) + .stream() + .peek(System.out::println) + .map(MqMessageRow::function) + .toList(); + + assertEquals(List.of("NON-RESUMABLE", "ERROR"), states); + } + + @Test + public void smResumeEmptyQueue() throws Exception { + var sm = new StateMachine(persistence, inboxId, UUID.randomUUID()); + var stateFactory = new StateFactory(new GsonBuilder().create()); + + sm.registerStates(new ResumeTrialsGraph(stateFactory)); + + sm.resume(); + + sm.join(); + sm.stop(); + + List states = MqTestUtil.getMessages(dataSource, inboxId) + .stream() + .peek(System.out::println) + .map(MqMessageRow::function) + .toList(); + + assertEquals(List.of("INITIAL", "RESUMABLE", "NON-RESUMABLE", "OK", "END"), states); + } +} diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineTest.java index 06cc658c..789b13ad 100644 --- a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineTest.java +++ b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineTest.java @@ -7,6 +7,9 @@ import nu.marginalia.mq.MqMessageRow; import nu.marginalia.mq.MqMessageState; import nu.marginalia.mq.MqTestUtil; import nu.marginalia.mq.persistence.MqPersistence; +import nu.marginalia.mqsm.graph.GraphState; +import nu.marginalia.mqsm.graph.StateGraph; +import nu.marginalia.mqsm.state.ResumeBehavior; import org.junit.jupiter.api.*; import org.testcontainers.containers.MariaDBContainer; import 
org.testcontainers.junit.jupiter.Container; @@ -52,19 +55,63 @@ public class StateMachineTest { dataSource.close(); } + public static class TestGraph extends StateGraph { + public TestGraph(StateFactory stateFactory) { + super(stateFactory); + } + + @GraphState(name = "INITIAL", next = "GREET") + public String initial() { + return "World"; + } + + @GraphState(name = "GREET") + public void greet(String message) { + System.out.println("Hello, " + message + "!"); + + transition("COUNT-DOWN", 5); + } + + @GraphState(name = "COUNT-DOWN", next = "END") + public void countDown(Integer from) { + if (from > 0) { + System.out.println(from); + transition("COUNT-DOWN", from - 1); + } + } + } + + @Test + public void testAnnotatedStateGraph() throws Exception { + var stateFactory = new StateFactory(new GsonBuilder().create()); + var graph = new TestGraph(stateFactory); + + + var sm = new StateMachine(persistence, inboxId, UUID.randomUUID()); + sm.registerStates(graph.asStateList()); + + sm.init(); + + sm.join(); + sm.stop(); + + MqTestUtil.getMessages(dataSource, inboxId).forEach(System.out::println); + + } + @Test public void testStartStopStartStop() throws Exception { var sm = new StateMachine(persistence, inboxId, UUID.randomUUID()); var stateFactory = new StateFactory(new GsonBuilder().create()); - var initial = stateFactory.create("INITIAL", () -> stateFactory.transition("GREET", "World")); + var initial = stateFactory.create("INITIAL", ResumeBehavior.RETRY, () -> stateFactory.transition("GREET", "World")); - var greet = stateFactory.create("GREET", String.class, (String message) -> { + var greet = stateFactory.create("GREET", ResumeBehavior.RETRY, String.class, (String message) -> { System.out.println("Hello, " + message + "!"); return stateFactory.transition("COUNT-TO-FIVE", 0); }); - var ctf = stateFactory.create("COUNT-TO-FIVE", Integer.class, (Integer count) -> { + var ctf = stateFactory.create("COUNT-TO-FIVE", ResumeBehavior.RETRY, Integer.class, (Integer count) -> { 
System.out.println(count); if (count < 5) { return stateFactory.transition("COUNT-TO-FIVE", count + 1); @@ -89,86 +136,4 @@ public class StateMachineTest { MqTestUtil.getMessages(dataSource, inboxId).forEach(System.out::println); } - @Test - public void smResumeFromNew() throws Exception { - var sm = new StateMachine(persistence, inboxId, UUID.randomUUID()); - var stateFactory = new StateFactory(new GsonBuilder().create()); - - var initial = stateFactory.create("INITIAL", () -> stateFactory.transition("A")); - var stateA = stateFactory.create("A", () -> stateFactory.transition("B")); - var stateB = stateFactory.create("B", () -> stateFactory.transition("C")); - var stateC = stateFactory.create("C", () -> stateFactory.transition("END")); - - sm.registerStates(initial, stateA, stateB, stateC); - persistence.sendNewMessage(inboxId, null,"B", "", null); - - sm.resume(); - - sm.join(); - sm.stop(); - - List states = MqTestUtil.getMessages(dataSource, inboxId) - .stream() - .peek(System.out::println) - .map(MqMessageRow::function) - .toList(); - - assertEquals(List.of("B", "C", "END"), states); - } - - @Test - public void smResumeFromAck() throws Exception { - var sm = new StateMachine(persistence, inboxId, UUID.randomUUID()); - var stateFactory = new StateFactory(new GsonBuilder().create()); - - var initial = stateFactory.create("INITIAL", () -> stateFactory.transition("A")); - var stateA = stateFactory.create("A", () -> stateFactory.transition("B")); - var stateB = stateFactory.create("B", () -> stateFactory.transition("C")); - var stateC = stateFactory.create("C", () -> stateFactory.transition("END")); - - sm.registerStates(initial, stateA, stateB, stateC); - - long id = persistence.sendNewMessage(inboxId, null,"B", "", null); - persistence.updateMessageState(id, MqMessageState.ACK); - - sm.resume(); - - sm.join(); - sm.stop(); - - List states = MqTestUtil.getMessages(dataSource, inboxId) - .stream() - .peek(System.out::println) - .map(MqMessageRow::function) - 
.toList(); - - assertEquals(List.of("B", "C", "END"), states); - } - - - @Test - public void smResumeEmptyQueue() throws Exception { - var sm = new StateMachine(persistence, inboxId, UUID.randomUUID()); - var stateFactory = new StateFactory(new GsonBuilder().create()); - - var initial = stateFactory.create("INITIAL", () -> stateFactory.transition("A")); - var stateA = stateFactory.create("A", () -> stateFactory.transition("B")); - var stateB = stateFactory.create("B", () -> stateFactory.transition("C")); - var stateC = stateFactory.create("C", () -> stateFactory.transition("END")); - - sm.registerStates(initial, stateA, stateB, stateC); - - sm.resume(); - - sm.join(); - sm.stop(); - - List states = MqTestUtil.getMessages(dataSource, inboxId) - .stream() - .peek(System.out::println) - .map(MqMessageRow::function) - .toList(); - - assertEquals(List.of("INITIAL", "A", "B", "C", "END"), states); - } } From f0a8ca440fac57f60bb69e16c0809589e6421a3e Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 6 Jul 2023 13:33:11 +0200 Subject: [PATCH 011/157] MQFSM Usability WIP --- .../java/nu/marginalia/mqsm/StateFactory.java | 50 ++++++++++++++++++- .../java/nu/marginalia/mqsm/StateMachine.java | 36 ++++++------- ...tateGraph.java => AbstractStateGraph.java} | 25 +++++++--- .../nu/marginalia/mqsm/graph/GraphState.java | 2 - .../marginalia/mqsm/graph/ResumeBehavior.java | 8 +++ .../nu/marginalia/mqsm/state/ErrorState.java | 17 ------- .../nu/marginalia/mqsm/state/FinalState.java | 17 ------- .../marginalia/mqsm/state/MachineState.java | 4 ++ .../marginalia/mqsm/state/ResumeBehavior.java | 6 --- .../marginalia/mqsm/state/ResumingState.java | 17 ------- .../mqsm/StateMachineErrorTest.java | 13 ++--- .../mqsm/StateMachineResumeTest.java | 26 +++------- .../nu/marginalia/mqsm/StateMachineTest.java | 39 ++++----------- 13 files changed, 119 insertions(+), 141 deletions(-) rename code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/{StateGraph.java => 
AbstractStateGraph.java} (88%) create mode 100644 code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/ResumeBehavior.java delete mode 100644 code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/ErrorState.java delete mode 100644 code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/FinalState.java delete mode 100644 code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/ResumeBehavior.java delete mode 100644 code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/ResumingState.java diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateFactory.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateFactory.java index 09c02ea7..6a143157 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateFactory.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateFactory.java @@ -3,8 +3,8 @@ package nu.marginalia.mqsm; import com.google.gson.Gson; import com.google.inject.Inject; import com.google.inject.Singleton; +import nu.marginalia.mqsm.graph.ResumeBehavior; import nu.marginalia.mqsm.state.MachineState; -import nu.marginalia.mqsm.state.ResumeBehavior; import nu.marginalia.mqsm.state.StateTransition; import java.util.function.Function; @@ -74,4 +74,52 @@ public class StateFactory { public StateTransition transition(String state, Object message) { return StateTransition.to(state, gson.toJson(message)); } + + public static class ErrorState implements MachineState { + @Override + public String name() { return "ERROR"; } + + @Override + public StateTransition next(String message) { + throw new UnsupportedOperationException(); + } + + @Override + public ResumeBehavior resumeBehavior() { return ResumeBehavior.RETRY; } + + @Override + public boolean isFinal() { return true; } + } + + public static class FinalState implements MachineState { + @Override + public String name() { return "END"; } + + @Override + public StateTransition next(String 
message) { + throw new UnsupportedOperationException(); + } + + @Override + public ResumeBehavior resumeBehavior() { return ResumeBehavior.RETRY; } + + @Override + public boolean isFinal() { return true; } + } + + public static class ResumingState implements MachineState { + @Override + public String name() { return "RESUMING"; } + + @Override + public StateTransition next(String message) { + throw new UnsupportedOperationException(); + } + + @Override + public ResumeBehavior resumeBehavior() { return ResumeBehavior.RETRY; } + + @Override + public boolean isFinal() { return false; } + } } diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java index 827005ed..e54b48f7 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java @@ -7,7 +7,8 @@ import nu.marginalia.mq.inbox.MqInboxResponse; import nu.marginalia.mq.inbox.MqSubscription; import nu.marginalia.mq.outbox.MqOutbox; import nu.marginalia.mq.persistence.MqPersistence; -import nu.marginalia.mqsm.graph.StateGraph; +import nu.marginalia.mqsm.graph.ResumeBehavior; +import nu.marginalia.mqsm.graph.AbstractStateGraph; import nu.marginalia.mqsm.state.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -30,13 +31,16 @@ public class StateMachine { private final String queueName; private MachineState state; - private final MachineState errorState = new ErrorState(); - private final MachineState finalState = new FinalState(); - private final MachineState resumingState = new ResumingState(); + private final MachineState errorState = new StateFactory.ErrorState(); + private final MachineState finalState = new StateFactory.FinalState(); + private final MachineState resumingState = new StateFactory.ResumingState(); private final Map allStates = new HashMap<>(); - public 
StateMachine(MqPersistence persistence, String queueName, UUID instanceUUID) { + public StateMachine(MqPersistence persistence, + String queueName, + UUID instanceUUID, + AbstractStateGraph stateGraph) { this.queueName = queueName; smInbox = new MqInbox(persistence, queueName, instanceUUID, Executors.newSingleThreadExecutor()); @@ -45,28 +49,24 @@ public class StateMachine { smInbox.subscribe(new StateEventSubscription()); registerStates(List.of(errorState, finalState, resumingState)); + registerStates(stateGraph); + + for (var declaredState : stateGraph.declaredStates()) { + if (!allStates.containsKey(declaredState)) { + throw new IllegalArgumentException("State " + declaredState + " is not defined in the state graph"); + } + } } /** Register the state graph */ - public void registerStates(MachineState... states) { - if (state != null) { - throw new IllegalStateException("Cannot register states after state machine has been initialized"); - } - + void registerStates(List states) { for (var state : states) { allStates.put(state.name(), state); } } /** Register the state graph */ - public void registerStates(List states) { - for (var state : states) { - allStates.put(state.name(), state); - } - } - - /** Register the state graph */ - public void registerStates(StateGraph states) { + void registerStates(AbstractStateGraph states) { registerStates(states.asStateList()); } diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/StateGraph.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/AbstractStateGraph.java similarity index 88% rename from code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/StateGraph.java rename to code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/AbstractStateGraph.java index df8f4318..10aca984 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/StateGraph.java +++ 
b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/AbstractStateGraph.java @@ -1,22 +1,20 @@ package nu.marginalia.mqsm.graph; -import nu.marginalia.mqsm.StateFactory; import nu.marginalia.mqsm.state.MachineState; +import nu.marginalia.mqsm.StateFactory; import nu.marginalia.mqsm.state.StateTransition; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; +import java.util.*; -public abstract class StateGraph { +public abstract class AbstractStateGraph { private final StateFactory stateFactory; - private static final Logger logger = LoggerFactory.getLogger(StateGraph.class); + private static final Logger logger = LoggerFactory.getLogger(AbstractStateGraph.class); - public StateGraph(StateFactory stateFactory) { + public AbstractStateGraph(StateFactory stateFactory) { this.stateFactory = stateFactory; } @@ -38,6 +36,19 @@ public abstract class StateGraph { throw new ControlFlowException("ERROR", ex.getClass().getSimpleName() + ":" + ex.getMessage()); } + public Set declaredStates() { + Set ret = new HashSet<>(); + + for (var method : getClass().getMethods()) { + var gs = method.getAnnotation(GraphState.class); + if (gs != null) { + ret.add(gs.name()); + ret.add(gs.next()); + } + } + + return ret; + } public List asStateList() { List ret = new ArrayList<>(); diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/GraphState.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/GraphState.java index b79b71aa..62183637 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/GraphState.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/GraphState.java @@ -1,8 +1,6 @@ package nu.marginalia.mqsm.graph; -import nu.marginalia.mqsm.state.ResumeBehavior; - import java.lang.annotation.Retention; import 
java.lang.annotation.RetentionPolicy; diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/ResumeBehavior.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/ResumeBehavior.java new file mode 100644 index 00000000..2e275cb5 --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/ResumeBehavior.java @@ -0,0 +1,8 @@ +package nu.marginalia.mqsm.graph; + +public enum ResumeBehavior { + /** Retry the state on resume */ + RETRY, + /** Jump to ERROR on resume if the message has been acknowledged */ + ERROR +} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/ErrorState.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/ErrorState.java deleted file mode 100644 index dcb19125..00000000 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/ErrorState.java +++ /dev/null @@ -1,17 +0,0 @@ -package nu.marginalia.mqsm.state; - -public class ErrorState implements MachineState { - @Override - public String name() { return "ERROR"; } - - @Override - public StateTransition next(String message) { - throw new UnsupportedOperationException(); - } - - @Override - public ResumeBehavior resumeBehavior() { return ResumeBehavior.RETRY; } - - @Override - public boolean isFinal() { return true; } -} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/FinalState.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/FinalState.java deleted file mode 100644 index dc2362fe..00000000 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/FinalState.java +++ /dev/null @@ -1,17 +0,0 @@ -package nu.marginalia.mqsm.state; - -public class FinalState implements MachineState { - @Override - public String name() { return "END"; } - - @Override - public StateTransition next(String message) { - throw new UnsupportedOperationException(); - } - - @Override - public ResumeBehavior resumeBehavior() { return 
ResumeBehavior.RETRY; } - - @Override - public boolean isFinal() { return true; } -} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/MachineState.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/MachineState.java index 11efc7c5..ec3c26ff 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/MachineState.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/MachineState.java @@ -1,9 +1,13 @@ package nu.marginalia.mqsm.state; +import nu.marginalia.mqsm.graph.ResumeBehavior; + public interface MachineState { String name(); + StateTransition next(String message); ResumeBehavior resumeBehavior(); + boolean isFinal(); } diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/ResumeBehavior.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/ResumeBehavior.java deleted file mode 100644 index a82446f8..00000000 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/ResumeBehavior.java +++ /dev/null @@ -1,6 +0,0 @@ -package nu.marginalia.mqsm.state; - -public enum ResumeBehavior { - RETRY, - ERROR -} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/ResumingState.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/ResumingState.java deleted file mode 100644 index ce01bb79..00000000 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/ResumingState.java +++ /dev/null @@ -1,17 +0,0 @@ -package nu.marginalia.mqsm.state; - -public class ResumingState implements MachineState { - @Override - public String name() { return "RESUMING"; } - - @Override - public StateTransition next(String message) { - throw new UnsupportedOperationException(); - } - - @Override - public ResumeBehavior resumeBehavior() { return ResumeBehavior.RETRY; } - - @Override - public boolean isFinal() { return false; } -} diff --git 
a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineErrorTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineErrorTest.java index 6c6298eb..06279f34 100644 --- a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineErrorTest.java +++ b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineErrorTest.java @@ -4,12 +4,11 @@ import com.google.gson.GsonBuilder; import com.zaxxer.hikari.HikariConfig; import com.zaxxer.hikari.HikariDataSource; import nu.marginalia.mq.MqMessageRow; -import nu.marginalia.mq.MqMessageState; import nu.marginalia.mq.MqTestUtil; import nu.marginalia.mq.persistence.MqPersistence; import nu.marginalia.mqsm.graph.GraphState; -import nu.marginalia.mqsm.graph.StateGraph; -import nu.marginalia.mqsm.state.ResumeBehavior; +import nu.marginalia.mqsm.graph.AbstractStateGraph; +import nu.marginalia.mqsm.graph.ResumeBehavior; import org.junit.jupiter.api.*; import org.testcontainers.containers.MariaDBContainer; import org.testcontainers.junit.jupiter.Container; @@ -55,7 +54,7 @@ public class StateMachineErrorTest { dataSource.close(); } - public static class ErrorHurdles extends StateGraph { + public static class ErrorHurdles extends AbstractStateGraph { public ErrorHurdles(StateFactory stateFactory) { super(stateFactory); @@ -71,17 +70,15 @@ public class StateMachineErrorTest { } @GraphState(name = "OK", next = "END") public void ok() { - + } } @Test public void smResumeResumableFromNew() throws Exception { - var sm = new StateMachine(persistence, inboxId, UUID.randomUUID()); var stateFactory = new StateFactory(new GsonBuilder().create()); - - sm.registerStates(new ErrorHurdles(stateFactory).asStateList()); + var sm = new StateMachine(persistence, inboxId, UUID.randomUUID(), new ErrorHurdles(stateFactory)); sm.init(); diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineResumeTest.java 
b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineResumeTest.java index 6913e13a..654e3623 100644 --- a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineResumeTest.java +++ b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineResumeTest.java @@ -8,8 +8,8 @@ import nu.marginalia.mq.MqMessageState; import nu.marginalia.mq.MqTestUtil; import nu.marginalia.mq.persistence.MqPersistence; import nu.marginalia.mqsm.graph.GraphState; -import nu.marginalia.mqsm.graph.StateGraph; -import nu.marginalia.mqsm.state.ResumeBehavior; +import nu.marginalia.mqsm.graph.AbstractStateGraph; +import nu.marginalia.mqsm.graph.ResumeBehavior; import org.junit.jupiter.api.*; import org.testcontainers.containers.MariaDBContainer; import org.testcontainers.junit.jupiter.Container; @@ -55,7 +55,7 @@ public class StateMachineResumeTest { dataSource.close(); } - public static class ResumeTrialsGraph extends StateGraph { + public static class ResumeTrialsGraph extends AbstractStateGraph { public ResumeTrialsGraph(StateFactory stateFactory) { super(stateFactory); @@ -75,10 +75,8 @@ public class StateMachineResumeTest { @Test public void smResumeResumableFromNew() throws Exception { - var sm = new StateMachine(persistence, inboxId, UUID.randomUUID()); var stateFactory = new StateFactory(new GsonBuilder().create()); - - sm.registerStates(new ResumeTrialsGraph(stateFactory).asStateList()); + var sm = new StateMachine(persistence, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); persistence.sendNewMessage(inboxId, null,"RESUMABLE", "", null); @@ -98,10 +96,8 @@ public class StateMachineResumeTest { @Test public void smResumeFromAck() throws Exception { - var sm = new StateMachine(persistence, inboxId, UUID.randomUUID()); var stateFactory = new StateFactory(new GsonBuilder().create()); - - sm.registerStates(new ResumeTrialsGraph(stateFactory)); + var sm = new StateMachine(persistence, inboxId, UUID.randomUUID(), new 
ResumeTrialsGraph(stateFactory)); long id = persistence.sendNewMessage(inboxId, null,"RESUMABLE", "", null); persistence.updateMessageState(id, MqMessageState.ACK); @@ -123,10 +119,8 @@ public class StateMachineResumeTest { @Test public void smResumeNonResumableFromNew() throws Exception { - var sm = new StateMachine(persistence, inboxId, UUID.randomUUID()); var stateFactory = new StateFactory(new GsonBuilder().create()); - - sm.registerStates(new ResumeTrialsGraph(stateFactory)); + var sm = new StateMachine(persistence, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); persistence.sendNewMessage(inboxId, null,"NON-RESUMABLE", "", null); @@ -146,10 +140,8 @@ public class StateMachineResumeTest { @Test public void smResumeNonResumableFromAck() throws Exception { - var sm = new StateMachine(persistence, inboxId, UUID.randomUUID()); var stateFactory = new StateFactory(new GsonBuilder().create()); - - sm.registerStates(new ResumeTrialsGraph(stateFactory)); + var sm = new StateMachine(persistence, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); long id = persistence.sendNewMessage(inboxId, null,"NON-RESUMABLE", "", null); persistence.updateMessageState(id, MqMessageState.ACK); @@ -170,10 +162,8 @@ public class StateMachineResumeTest { @Test public void smResumeEmptyQueue() throws Exception { - var sm = new StateMachine(persistence, inboxId, UUID.randomUUID()); var stateFactory = new StateFactory(new GsonBuilder().create()); - - sm.registerStates(new ResumeTrialsGraph(stateFactory)); + var sm = new StateMachine(persistence, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); sm.resume(); diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineTest.java index 789b13ad..a6adfa4c 100644 --- a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineTest.java +++ 
b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineTest.java @@ -3,19 +3,15 @@ package nu.marginalia.mqsm; import com.google.gson.GsonBuilder; import com.zaxxer.hikari.HikariConfig; import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.mq.MqMessageRow; -import nu.marginalia.mq.MqMessageState; import nu.marginalia.mq.MqTestUtil; import nu.marginalia.mq.persistence.MqPersistence; import nu.marginalia.mqsm.graph.GraphState; -import nu.marginalia.mqsm.graph.StateGraph; -import nu.marginalia.mqsm.state.ResumeBehavior; +import nu.marginalia.mqsm.graph.AbstractStateGraph; import org.junit.jupiter.api.*; import org.testcontainers.containers.MariaDBContainer; import org.testcontainers.junit.jupiter.Container; import org.testcontainers.junit.jupiter.Testcontainers; -import java.util.List; import java.util.UUID; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -55,7 +51,7 @@ public class StateMachineTest { dataSource.close(); } - public static class TestGraph extends StateGraph { + public static class TestGraph extends AbstractStateGraph { public TestGraph(StateFactory stateFactory) { super(stateFactory); } @@ -87,8 +83,8 @@ public class StateMachineTest { var graph = new TestGraph(stateFactory); - var sm = new StateMachine(persistence, inboxId, UUID.randomUUID()); - sm.registerStates(graph.asStateList()); + var sm = new StateMachine(persistence, inboxId, UUID.randomUUID(), graph); + sm.registerStates(graph); sm.init(); @@ -101,34 +97,17 @@ public class StateMachineTest { @Test public void testStartStopStartStop() throws Exception { - var sm = new StateMachine(persistence, inboxId, UUID.randomUUID()); var stateFactory = new StateFactory(new GsonBuilder().create()); - - var initial = stateFactory.create("INITIAL", ResumeBehavior.RETRY, () -> stateFactory.transition("GREET", "World")); - - var greet = stateFactory.create("GREET", ResumeBehavior.RETRY, String.class, (String message) -> { - System.out.println("Hello, " + message + 
"!"); - return stateFactory.transition("COUNT-TO-FIVE", 0); - }); - - var ctf = stateFactory.create("COUNT-TO-FIVE", ResumeBehavior.RETRY, Integer.class, (Integer count) -> { - System.out.println(count); - if (count < 5) { - return stateFactory.transition("COUNT-TO-FIVE", count + 1); - } else { - return stateFactory.transition("END"); - } - }); - - sm.registerStates(initial, greet, ctf); + var sm = new StateMachine(persistence, inboxId, UUID.randomUUID(), new TestGraph(stateFactory)); sm.init(); - Thread.sleep(300); + Thread.sleep(150); sm.stop(); - var sm2 = new StateMachine(persistence, inboxId, UUID.randomUUID()); - sm2.registerStates(initial, greet, ctf); + System.out.println("-------------------- "); + + var sm2 = new StateMachine(persistence, inboxId, UUID.randomUUID(), new TestGraph(stateFactory)); sm2.resume(); sm2.join(); sm2.stop(); From 34653f03a262e9582bde11d863aa0f94798115ca Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 6 Jul 2023 14:13:03 +0200 Subject: [PATCH 012/157] Temporary bugfix, need to find source --- .../nu/marginalia/loading/loader/SqlLoadProcessedDocument.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDocument.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDocument.java index 2a875b58..2aec488d 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDocument.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDocument.java @@ -4,6 +4,7 @@ import com.google.inject.Inject; import com.zaxxer.hikari.HikariDataSource; import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument; import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError; +import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import 
org.slf4j.LoggerFactory; @@ -78,7 +79,7 @@ public class SqlLoadProcessedDocument { stmt.setInt(1, urlId); stmt.setString(2, doc.state().name()); stmt.setString(3, doc.title()); - stmt.setString(4, doc.description()); + stmt.setString(4, StringUtils.truncate(doc.description(), 255)); stmt.setInt(5, doc.length()); stmt.setInt(6, doc.htmlFeatures()); stmt.setString(7, doc.standard()); From d9e6c4f2667ddeae6a479fa50c594641f5d8f0f9 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 6 Jul 2023 18:04:16 +0200 Subject: [PATCH 013/157] Trial integration of MQ-FSM into index service. --- code/api/index-api/build.gradle | 2 +- .../marginalia/index/client/IndexClient.java | 20 ++++- .../index/client/IndexMqEndpoints.java | 8 ++ .../nu/marginalia/mq/outbox/MqOutbox.java | 26 ++++++- .../java/nu/marginalia/mqsm/StateMachine.java | 8 +- .../nu/marginalia/mq/outbox/MqOutboxTest.java | 10 +-- code/common/service/build.gradle | 1 + .../service/server/BaseServiceParams.java | 5 +- .../nu/marginalia/service/server/Service.java | 24 ++++++ .../service/server/mq/MqNotification.java | 9 +++ .../service/server/mq/MqRequest.java | 9 +++ .../server/mq/ServiceMqSubscription.java | 74 +++++++++++++++++++ .../nu/marginalia/index/IndexService.java | 23 ++++++ .../marginalia/index/svc/IndexOpsService.java | 7 ++ .../control-service/build.gradle | 1 + .../nu/marginalia/control/ControlService.java | 8 +- .../control/process/ControlProcesses.java | 38 ++++++++++ .../process/RepartitionReindexProcess.java | 72 ++++++++++++++++++ 18 files changed, 332 insertions(+), 13 deletions(-) create mode 100644 code/api/index-api/src/main/java/nu/marginalia/index/client/IndexMqEndpoints.java create mode 100644 code/common/service/src/main/java/nu/marginalia/service/server/mq/MqNotification.java create mode 100644 code/common/service/src/main/java/nu/marginalia/service/server/mq/MqRequest.java create mode 100644 code/common/service/src/main/java/nu/marginalia/service/server/mq/ServiceMqSubscription.java 
create mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ControlProcesses.java create mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/RepartitionReindexProcess.java diff --git a/code/api/index-api/build.gradle b/code/api/index-api/build.gradle index 6dbcd98f..edb6056d 100644 --- a/code/api/index-api/build.gradle +++ b/code/api/index-api/build.gradle @@ -16,7 +16,7 @@ dependencies { implementation project(':code:common:config') implementation project(':code:common:service-discovery') implementation project(':code:common:service-client') - + implementation project(':code:common:message-queue') implementation project(':code:features-index:index-query') implementation libs.lombok diff --git a/code/api/index-api/src/main/java/nu/marginalia/index/client/IndexClient.java b/code/api/index-api/src/main/java/nu/marginalia/index/client/IndexClient.java index 8db8772f..b8d2e683 100644 --- a/code/api/index-api/src/main/java/nu/marginalia/index/client/IndexClient.java +++ b/code/api/index-api/src/main/java/nu/marginalia/index/client/IndexClient.java @@ -8,27 +8,41 @@ import nu.marginalia.WmsaHome; import nu.marginalia.client.AbstractDynamicClient; import nu.marginalia.client.Context; import nu.marginalia.index.client.model.query.SearchSpecification; -import nu.marginalia.index.client.model.results.SearchResultItem; import nu.marginalia.index.client.model.results.SearchResultSet; import nu.marginalia.model.gson.GsonFactory; +import nu.marginalia.mq.outbox.MqOutbox; +import nu.marginalia.mq.persistence.MqPersistence; import nu.marginalia.service.descriptor.ServiceDescriptors; import nu.marginalia.service.id.ServiceId; import javax.annotation.CheckReturnValue; -import java.util.List; +import java.util.UUID; @Singleton public class IndexClient extends AbstractDynamicClient { private static final Summary wmsa_search_index_api_time = 
Summary.build().name("wmsa_search_index_api_time").help("-").register(); + private final MqOutbox outbox; + @Inject - public IndexClient(ServiceDescriptors descriptors) { + public IndexClient(ServiceDescriptors descriptors, + MqPersistence persistence) { super(descriptors.forId(ServiceId.Index), WmsaHome.getHostsFile(), GsonFactory::get); + String inboxName = ServiceId.Index.name + ":" + "0"; + String outboxName = System.getProperty("service-name", UUID.randomUUID().toString()); + + outbox = new MqOutbox(persistence, inboxName, outboxName, UUID.randomUUID()); + setTimeout(30); } + + public MqOutbox outbox() { + return outbox; + } + @CheckReturnValue public SearchResultSet query(Context ctx, SearchSpecification specs) { return wmsa_search_index_api_time.time( diff --git a/code/api/index-api/src/main/java/nu/marginalia/index/client/IndexMqEndpoints.java b/code/api/index-api/src/main/java/nu/marginalia/index/client/IndexMqEndpoints.java new file mode 100644 index 00000000..9d2476f8 --- /dev/null +++ b/code/api/index-api/src/main/java/nu/marginalia/index/client/IndexMqEndpoints.java @@ -0,0 +1,8 @@ +package nu.marginalia.index.client; + +public class IndexMqEndpoints { + public static final String INDEX_IS_BLOCKED = "INDEX-IS-BLOCKED"; + public static final String INDEX_REPARTITION = "INDEX-REPARTITION"; + public static final String INDEX_REINDEX = "INDEX-REINDEX"; + +} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java index e8faa0ab..75fb8fcd 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java @@ -6,6 +6,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.sql.SQLException; +import java.util.Optional; import java.util.UUID; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.TimeUnit; 
@@ -27,11 +28,12 @@ public class MqOutbox { public MqOutbox(MqPersistence persistence, String inboxName, + String outboxName, UUID instanceUUID) { this.persistence = persistence; this.inboxName = inboxName; - this.replyInboxName = "reply:" + inboxName; + this.replyInboxName = outboxName + "//" + inboxName; this.instanceUUID = instanceUUID.toString(); pollThread = new Thread(this::poll, "mq-outbox-poll-thread:" + inboxName); @@ -90,10 +92,26 @@ public class MqOutbox { } + /** Send a message and wait for a response. */ public MqMessage send(String function, String payload) throws Exception { + final long id = sendAsync(function, payload); + + return waitResponse(id); + } + + /** Send a message asynchronously, without waiting for a response. + *
+ * Use waitResponse(id) or pollResponse(id) to fetch the response. */ + public long sendAsync(String function, String payload) throws Exception { var id = persistence.sendNewMessage(inboxName, replyInboxName, function, payload, null); + pendingRequests.put(id, id); + return id; + } + + /** Blocks until a response arrives for the given message id. */ + public MqMessage waitResponse(long id) throws Exception { synchronized (pendingResponses) { while (!pendingResponses.containsKey(id)) { pendingResponses.wait(100); @@ -102,6 +120,12 @@ public class MqOutbox { } } + /** Polls for a response for the given message id. */ + public Optional pollResponse(long id) { + // no need to sync here if we aren't going to wait() + return Optional.ofNullable(pendingResponses.remove(id)); + } + public long notify(String function, String payload) throws Exception { return persistence.sendNewMessage(inboxName, null, function, payload, null); } diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java index e54b48f7..3518e9e5 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java @@ -44,7 +44,7 @@ public class StateMachine { this.queueName = queueName; smInbox = new MqInbox(persistence, queueName, instanceUUID, Executors.newSingleThreadExecutor()); - smOutbox = new MqOutbox(persistence, queueName, instanceUUID); + smOutbox = new MqOutbox(persistence, queueName, queueName+"//out", instanceUUID); smInbox.subscribe(new StateEventSubscription()); @@ -144,6 +144,12 @@ public class StateMachine { nextState, message); + if (!allStates.containsKey(nextState)) { + logger.error("Unknown state {}", nextState); + setErrorState(); + return; + } + synchronized (this) { this.state = allStates.get(nextState); notifyAll(); diff --git 
a/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqOutboxTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqOutboxTest.java index 6dc51f2d..3b7996f1 100644 --- a/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqOutboxTest.java +++ b/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqOutboxTest.java @@ -55,13 +55,13 @@ public class MqOutboxTest { @Test public void testOpenClose() throws InterruptedException { - var outbox = new MqOutbox(new MqPersistence(dataSource), inboxId, UUID.randomUUID()); + var outbox = new MqOutbox(new MqPersistence(dataSource), inboxId, inboxId+"/reply", UUID.randomUUID()); outbox.stop(); } @Test public void testSend() throws Exception { - var outbox = new MqOutbox(new MqPersistence(dataSource), inboxId, UUID.randomUUID()); + var outbox = new MqOutbox(new MqPersistence(dataSource), inboxId,inboxId+"/reply", UUID.randomUUID()); Executors.newSingleThreadExecutor().submit(() -> outbox.send("test", "Hello World")); TimeUnit.MILLISECONDS.sleep(100); @@ -75,7 +75,7 @@ public class MqOutboxTest { @Test public void testSendAndRespond() throws Exception { - var outbox = new MqOutbox(new MqPersistence(dataSource), inboxId, UUID.randomUUID()); + var outbox = new MqOutbox(new MqPersistence(dataSource), inboxId,inboxId+"/reply", UUID.randomUUID()); var inbox = new MqInbox(new MqPersistence(dataSource), inboxId, UUID.randomUUID()); inbox.subscribe(justRespond("Alright then")); @@ -96,7 +96,7 @@ public class MqOutboxTest { @Test public void testSendMultiple() throws Exception { - var outbox = new MqOutbox(new MqPersistence(dataSource), inboxId, UUID.randomUUID()); + var outbox = new MqOutbox(new MqPersistence(dataSource), inboxId,inboxId+"/reply", UUID.randomUUID()); var inbox = new MqInbox(new MqPersistence(dataSource), inboxId, UUID.randomUUID()); inbox.subscribe(echo()); @@ -130,7 +130,7 @@ public class MqOutboxTest { @Test public void testSendAndRespondWithErrorHandler() 
throws Exception { - var outbox = new MqOutbox(new MqPersistence(dataSource), inboxId, UUID.randomUUID()); + var outbox = new MqOutbox(new MqPersistence(dataSource), inboxId,inboxId+"/reply", UUID.randomUUID()); var inbox = new MqInbox(new MqPersistence(dataSource), inboxId, UUID.randomUUID()); inbox.start(); diff --git a/code/common/service/build.gradle b/code/common/service/build.gradle index f153500b..156b826f 100644 --- a/code/common/service/build.gradle +++ b/code/common/service/build.gradle @@ -12,6 +12,7 @@ java { dependencies { implementation project(':code:common:service-client') implementation project(':code:common:service-discovery') + implementation project(':code:common:message-queue') implementation project(':code:common:db') implementation libs.lombok diff --git a/code/common/service/src/main/java/nu/marginalia/service/server/BaseServiceParams.java b/code/common/service/src/main/java/nu/marginalia/service/server/BaseServiceParams.java index 1cd94b6c..abec5e55 100644 --- a/code/common/service/src/main/java/nu/marginalia/service/server/BaseServiceParams.java +++ b/code/common/service/src/main/java/nu/marginalia/service/server/BaseServiceParams.java @@ -2,6 +2,7 @@ package nu.marginalia.service.server; import com.google.inject.Inject; import com.google.inject.Singleton; +import nu.marginalia.mq.persistence.MqPersistence; import nu.marginalia.service.control.ServiceEventLog; import nu.marginalia.service.control.ServiceHeartbeat; import nu.marginalia.service.module.ServiceConfiguration; @@ -14,17 +15,19 @@ public class BaseServiceParams { public final MetricsServer metricsServer; public final ServiceHeartbeat heartbeat; public final ServiceEventLog eventLog; + public final MqPersistence messageQueuePersistence; @Inject public BaseServiceParams(ServiceConfiguration configuration, Initialization initialization, MetricsServer metricsServer, ServiceHeartbeat heartbeat, - ServiceEventLog eventLog) { + ServiceEventLog eventLog, MqPersistence 
messageQueuePersistence) { this.configuration = configuration; this.initialization = initialization; this.metricsServer = metricsServer; this.heartbeat = heartbeat; this.eventLog = eventLog; + this.messageQueuePersistence = messageQueuePersistence; } } diff --git a/code/common/service/src/main/java/nu/marginalia/service/server/Service.java b/code/common/service/src/main/java/nu/marginalia/service/server/Service.java index 5a287c99..e8386fb8 100644 --- a/code/common/service/src/main/java/nu/marginalia/service/server/Service.java +++ b/code/common/service/src/main/java/nu/marginalia/service/server/Service.java @@ -3,6 +3,9 @@ package nu.marginalia.service.server; import io.prometheus.client.Counter; import nu.marginalia.client.Context; import nu.marginalia.client.exception.MessagingException; +import nu.marginalia.mq.inbox.MqInbox; +import nu.marginalia.service.server.mq.MqRequest; +import nu.marginalia.service.server.mq.ServiceMqSubscription; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.slf4j.Marker; @@ -36,14 +39,25 @@ public class Service { private final String serviceName; private static volatile boolean initialized = false; + protected final MqInbox messageQueueInbox; + public Service(BaseServiceParams params, Runnable configureStaticFiles ) { this.initialization = params.initialization; + var config = params.configuration; + + String inboxName = config.serviceName() + ":" + config.node(); + logger.info("Inbox name: {}", inboxName); + messageQueueInbox = new MqInbox(params.messageQueuePersistence, + inboxName, + config.instanceUuid()); + messageQueueInbox.subscribe(new ServiceMqSubscription(this)); serviceName = System.getProperty("service-name"); initialization.addCallback(params.heartbeat::start); + initialization.addCallback(messageQueueInbox::start); initialization.addCallback(() -> params.eventLog.logEvent("SVC-INIT", "")); if (!initialization.isReady() && ! 
initialized ) { @@ -81,6 +95,16 @@ public class Service { }); } + @MqRequest(endpoint = "SVC-READY") + public boolean mqIsReady() { + return initialization.isReady(); + } + + @MqRequest(endpoint = "SVC-PING") + public String mqPing() { + return "pong"; + } + private void filterPublicRequests(Request request, Response response) { if (null == request.headers("X-Public")) { return; diff --git a/code/common/service/src/main/java/nu/marginalia/service/server/mq/MqNotification.java b/code/common/service/src/main/java/nu/marginalia/service/server/mq/MqNotification.java new file mode 100644 index 00000000..20586f3e --- /dev/null +++ b/code/common/service/src/main/java/nu/marginalia/service/server/mq/MqNotification.java @@ -0,0 +1,9 @@ +package nu.marginalia.service.server.mq; + +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; + +@Retention(RetentionPolicy.RUNTIME) +public @interface MqNotification { + String endpoint(); +} diff --git a/code/common/service/src/main/java/nu/marginalia/service/server/mq/MqRequest.java b/code/common/service/src/main/java/nu/marginalia/service/server/mq/MqRequest.java new file mode 100644 index 00000000..60b7ebd8 --- /dev/null +++ b/code/common/service/src/main/java/nu/marginalia/service/server/mq/MqRequest.java @@ -0,0 +1,9 @@ +package nu.marginalia.service.server.mq; + +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; + +@Retention(RetentionPolicy.RUNTIME) +public @interface MqRequest { + String endpoint(); +} diff --git a/code/common/service/src/main/java/nu/marginalia/service/server/mq/ServiceMqSubscription.java b/code/common/service/src/main/java/nu/marginalia/service/server/mq/ServiceMqSubscription.java new file mode 100644 index 00000000..d344d928 --- /dev/null +++ b/code/common/service/src/main/java/nu/marginalia/service/server/mq/ServiceMqSubscription.java @@ -0,0 +1,74 @@ +package nu.marginalia.service.server.mq; + +import nu.marginalia.mq.MqMessage; +import 
nu.marginalia.mq.inbox.MqInboxResponse; +import nu.marginalia.mq.inbox.MqSubscription; +import nu.marginalia.service.server.Service; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; +import java.util.HashMap; +import java.util.Map; + +public class ServiceMqSubscription implements MqSubscription { + private static final Logger logger = LoggerFactory.getLogger(ServiceMqSubscription.class); + private final Map requests = new HashMap<>(); + private final Map notifications = new HashMap<>(); + private final Service service; + + public ServiceMqSubscription(Service service) { + this.service = service; + for (var method : service.getClass().getMethods()) { + var annotation = method.getAnnotation(MqRequest.class); + if (annotation != null) { + requests.put(annotation.endpoint(), method); + } + if (method.getAnnotation(MqNotification.class) != null) { + notifications.put(method.getName(), method); + } + } + } + + @Override + public boolean filter(MqMessage rawMessage) { + boolean isInteresting = requests.containsKey(rawMessage.function()) + || notifications.containsKey(rawMessage.function()); + + if (!isInteresting) { + logger.warn("Received message for unknown function " + rawMessage.function()); + } + + return isInteresting; + } + + @Override + public MqInboxResponse onRequest(MqMessage msg) { + var method = requests.get(msg.function()); + + try { + return MqInboxResponse.ok(method.invoke(service, msg.payload()).toString()); + } + catch (InvocationTargetException ex) { + logger.error("Error invoking method " + method, ex); + return MqInboxResponse.err(ex.getCause().getMessage()); + } + catch (Exception ex) { + logger.error("Error invoking method " + method, ex); + return MqInboxResponse.err(ex.getMessage()); + } + } + + @Override + public void onNotification(MqMessage msg) { + var method = notifications.get(msg.function()); + + try { + method.invoke(service, msg.payload()); 
+ } + catch (Exception ex) { + logger.error("Error invoking method " + method, ex); + } + } +} diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexService.java index 369e8309..82ed2617 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexService.java @@ -3,6 +3,7 @@ package nu.marginalia.index; import com.google.gson.Gson; import com.google.inject.Inject; import io.reactivex.rxjava3.schedulers.Schedulers; +import nu.marginalia.index.client.IndexMqEndpoints; import nu.marginalia.index.index.SearchIndex; import nu.marginalia.index.svc.IndexOpsService; import nu.marginalia.index.svc.IndexQueryService; @@ -10,6 +11,7 @@ import nu.marginalia.index.svc.IndexSearchSetsService; import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.service.control.ServiceEventLog; import nu.marginalia.service.server.*; +import nu.marginalia.service.server.mq.MqRequest; import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -72,6 +74,27 @@ public class IndexService extends Service { volatile boolean initialized = false; + @MqRequest(endpoint = IndexMqEndpoints.INDEX_REPARTITION) + public String repartition(String message) { + if (!opsService.repartition()) { + throw new IllegalStateException("Ops lock busy"); + } + return "ok"; + } + + @MqRequest(endpoint = IndexMqEndpoints.INDEX_REINDEX) + public String reindex(String message) throws Exception { + if (!opsService.reindex()) { + throw new IllegalStateException("Ops lock busy"); + } + + return "ok"; + } + @MqRequest(endpoint = IndexMqEndpoints.INDEX_IS_BLOCKED) + public String isBlocked(String message) throws Exception { + return Boolean.valueOf(opsService.isBusy()).toString(); + } + public void initialize() { if (!initialized) { 
init.waitReady(); diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexOpsService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexOpsService.java index 34ed2927..36377c7c 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexOpsService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexOpsService.java @@ -30,6 +30,13 @@ public class IndexOpsService { return opsLock.isLocked(); } + public boolean repartition() { + return run(searchSetService::recalculateAll); + } + public boolean reindex() throws Exception { + return run(index::switchIndex).isPresent(); + } + public Object repartitionEndpoint(Request request, Response response) throws Exception { if (!run(searchSetService::recalculateAll)) { diff --git a/code/services-satellite/control-service/build.gradle b/code/services-satellite/control-service/build.gradle index 6bfffcaa..fac386e2 100644 --- a/code/services-satellite/control-service/build.gradle +++ b/code/services-satellite/control-service/build.gradle @@ -31,6 +31,7 @@ dependencies { implementation project(':code:common:service-discovery') implementation project(':code:common:service-client') implementation project(':code:api:search-api') + implementation project(':code:api:index-api') implementation libs.lombok diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java index 40559746..93873abb 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java @@ -3,6 +3,7 @@ package nu.marginalia.control; import com.google.gson.Gson; import com.google.inject.Inject; import nu.marginalia.client.ServiceMonitors; +import 
nu.marginalia.control.process.ControlProcesses; import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.mq.persistence.MqPersistence; import nu.marginalia.renderer.MustacheRenderer; @@ -33,7 +34,8 @@ public class ControlService extends Service { HeartbeatService heartbeatService, EventLogService eventLogService, RendererFactory rendererFactory, - MqPersistence messageQueuePersistence + MqPersistence messageQueuePersistence, + ControlProcesses controlProcesses ) throws IOException { super(params); @@ -52,6 +54,10 @@ public class ControlService extends Service { Map.of("heartbeats", heartbeatService.getHeartbeats(), "events", eventLogService.getLastEntries(100) ))); + Spark.get("/public/repartition", (req, rsp) -> { + controlProcesses.start("REPARTITION-REINDEX"); + return "OK"; + }); monitors.subscribe(this::logMonitorStateChange); diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ControlProcesses.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ControlProcesses.java new file mode 100644 index 00000000..7a70eb83 --- /dev/null +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ControlProcesses.java @@ -0,0 +1,38 @@ +package nu.marginalia.control.process; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.mq.persistence.MqPersistence; +import nu.marginalia.mqsm.StateMachine; +import nu.marginalia.mqsm.graph.AbstractStateGraph; + +import java.util.HashMap; +import java.util.Map; +import java.util.UUID; + +@Singleton +public class ControlProcesses { + private final MqPersistence persistence; + public Map stateMachines = new HashMap<>(); + + @Inject + public ControlProcesses(MqPersistence persistence, + RepartitionReindexProcess repartitionReindexProcess + ) { + this.persistence = persistence; + + register("REPARTITION-REINDEX", repartitionReindexProcess); + } + + private void register(String 
name, AbstractStateGraph graph) { + stateMachines.put(name, new StateMachine(persistence, name, UUID.randomUUID(), graph)); + } + + public void start(String name) throws Exception { + stateMachines.get(name).init(); + } + + public void resume(String name) throws Exception { + stateMachines.get(name).resume(); + } +} diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/RepartitionReindexProcess.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/RepartitionReindexProcess.java new file mode 100644 index 00000000..ef76e654 --- /dev/null +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/RepartitionReindexProcess.java @@ -0,0 +1,72 @@ +package nu.marginalia.control.process; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.index.client.IndexClient; +import nu.marginalia.index.client.IndexMqEndpoints; +import nu.marginalia.mq.MqMessageState; +import nu.marginalia.mq.outbox.MqOutbox; +import nu.marginalia.mqsm.StateFactory; +import nu.marginalia.mqsm.graph.AbstractStateGraph; +import nu.marginalia.mqsm.graph.GraphState; + +@Singleton +public class RepartitionReindexProcess extends AbstractStateGraph { + + private final MqOutbox indexOutbox; + + // STATES + + private static final String INITIAL = "INITIAL"; + private static final String REPARTITION = "REPARTITION"; + private static final String REPARTITION_REPLY = "REPARTITION-REPLY"; + private static final String REINDEX = "REINDEX"; + private static final String REINDEX_REPLY = "REINDEX-REPLY"; + private static final String END = "END"; + + + @Inject + public RepartitionReindexProcess(StateFactory stateFactory, IndexClient indexClient) { + super(stateFactory); + + indexOutbox = indexClient.outbox(); + } + + @GraphState(name = INITIAL, next = REPARTITION) + public void init() throws Exception { + var rsp = indexOutbox.send(IndexMqEndpoints.INDEX_IS_BLOCKED, 
""); + + if (rsp.payload().equalsIgnoreCase("true")) { + error("Index is blocked"); + } + } + + @GraphState(name = REPARTITION, next = REPARTITION_REPLY) + public Long repartition() throws Exception { + return indexOutbox.sendAsync(IndexMqEndpoints.INDEX_REPARTITION, ""); + } + + @GraphState(name = REPARTITION_REPLY, next = REINDEX) + public void repartitionReply(Long id) throws Exception { + var rsp = indexOutbox.waitResponse(id); + + if (rsp.state() != MqMessageState.OK) { + error("Repartition failed"); + } + } + + @GraphState(name = REINDEX, next = REINDEX_REPLY) + public Long reindex() throws Exception { + return indexOutbox.sendAsync(IndexMqEndpoints.INDEX_REINDEX, ""); + } + + @GraphState(name = REINDEX_REPLY, next = END) + public void reindexReply(Long id) throws Exception { + var rsp = indexOutbox.waitResponse(id); + + if (rsp.state() != MqMessageState.OK) { + error("Repartition failed"); + } + } + +} From b73fcc19fe451f600b1b2305490adc59f8e8c8be Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 6 Jul 2023 18:05:03 +0200 Subject: [PATCH 014/157] Fix so that crawler tests don't sometimes fetch real sitemaps when they're run. 
--- .../nu/marginalia/crawl/retreival/CrawlerRetreiver.java | 3 ++- .../nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java | 2 ++ .../marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java | 5 +++++ .../crawl/retreival/fetcher/SitemapRetriever.java | 1 - .../crawling/retreival/CrawlerMockFetcherTest.java | 6 ++++++ 5 files changed, 15 insertions(+), 2 deletions(-) diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index 09352765..3af0110a 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -54,7 +54,7 @@ public class CrawlerRetreiver { private static final LinkFilterSelector linkFilterSelector = new LinkFilterSelector(); private static final DomainProber domainProber = new DomainProber(); - private final SitemapRetriever sitemapRetriever = new SitemapRetriever(); + private final SitemapRetriever sitemapRetriever; private final DomainCrawlFrontier crawlFrontier; @@ -71,6 +71,7 @@ public class CrawlerRetreiver { crawledDomainWriter = writer; this.crawlFrontier = new DomainCrawlFrontier(new EdgeDomain(domain), specs.urls, specs.crawlDepth); + sitemapRetriever = fetcher.createSitemapRetriever(); var fst = crawlFrontier.peek(); if (fst != null) { diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java index 987278a0..1f630ac5 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java @@ -21,4 +21,6 @@ public interface 
HttpFetcher { CrawledDocument fetchContent(EdgeUrl url) throws RateLimitException; SimpleRobotRules fetchRobotRules(EdgeDomain domain); + + SitemapRetriever createSitemapRetriever(); } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java index 5978444d..55a6d296 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java @@ -307,6 +307,11 @@ public class HttpFetcherImpl implements HttpFetcher { .orElseGet(() -> new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_ALL)); } + @Override + public SitemapRetriever createSitemapRetriever() { + return new SitemapRetriever(); + } + private Optional fetchRobotsForProto(String proto, EdgeDomain domain) { try { var url = new EdgeUrl(proto, domain, null, "/robots.txt", null); diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/SitemapRetriever.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/SitemapRetriever.java index 99701244..bb2d2898 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/SitemapRetriever.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/SitemapRetriever.java @@ -10,7 +10,6 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.util.*; -@Singleton public class SitemapRetriever { private final Logger logger = LoggerFactory.getLogger(SitemapRetriever.class); private final ThreadLocal siteMapParserThreadLocal = ThreadLocal.withInitial(() -> new SiteMapParser(false)); diff --git 
a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java index d5f4581e..7462b62c 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java @@ -17,6 +17,7 @@ import nu.marginalia.model.EdgeUrl; import nu.marginalia.test.CommonTestData; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Test; +import org.mockito.Mockito; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -146,5 +147,10 @@ public class CrawlerMockFetcherTest { public SimpleRobotRules fetchRobotRules(EdgeDomain domain) { return new SimpleRobotRules(); } + + @Override + public SitemapRetriever createSitemapRetriever() { + return Mockito.mock(SitemapRetriever.class); + } } } From 98d18986109a9726906406d237a627689c62e89e Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 6 Jul 2023 18:11:19 +0200 Subject: [PATCH 015/157] Bugfix: Don't run the xenforo specialization on phpBB. 
--- .../plugin/specialization/HtmlProcessorSpecializations.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/specialization/HtmlProcessorSpecializations.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/specialization/HtmlProcessorSpecializations.java index c8ab644a..dab6df24 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/specialization/HtmlProcessorSpecializations.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/specialization/HtmlProcessorSpecializations.java @@ -38,7 +38,7 @@ public class HtmlProcessorSpecializations { return xenforoSpecialization; } if (generator.keywords().contains("phpbb")) { - return xenforoSpecialization; + return phpBBSpecialization; } if (generator.keywords().contains("javadoc")) { return javadocSpecialization; From 96eecc6ea58a35061b3b2b90183421fd4df00628 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 6 Jul 2023 19:50:13 +0200 Subject: [PATCH 016/157] Minor: Readability. 
--- .../index/svc/IndexQueryService.java | 23 +++++++++++++++---- .../search/svc/SearchQueryIndexService.java | 4 +++- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java index 5c0d715b..37e7bf62 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java @@ -16,6 +16,7 @@ import nu.marginalia.index.client.model.results.SearchResultSet; import nu.marginalia.index.client.model.query.SearchSpecification; import nu.marginalia.index.index.SearchIndex; import nu.marginalia.index.index.SearchIndexSearchTerms; +import nu.marginalia.index.query.IndexQueryPriority; import nu.marginalia.index.results.IndexMetadataService; import nu.marginalia.index.searchset.SearchSet; import nu.marginalia.index.results.IndexResultValuator; @@ -40,6 +41,8 @@ public class IndexQueryService { private final Logger logger = LoggerFactory.getLogger(getClass()); + // This marker is used to mark sensitive log messages that are related to queries + // so that they can be filtered out in the production logging configuration private final Marker queryMarker = MarkerFactory.getMarker("QUERY"); private static final Counter wmsa_edge_index_query_timeouts = Counter.build().name("wmsa_edge_index_query_timeouts").help("-").register(); @@ -151,6 +154,11 @@ public class IndexQueryService { prioFrequencies); } + /** Execute subqueries and return a list of document ids. The index is queried for each subquery, + * at different priority depths until timeout is reached or the results are all visited. + *
+ * Then the results are combined. + * */ private TLongList evaluateSubqueries(SearchParameters params) { final TLongArrayList results = new TLongArrayList(params.fetchSize); @@ -167,7 +175,6 @@ public class IndexQueryService { final SearchIndexSearchTerms searchTerms = searchTermsSvc.getSearchTerms(subquery); - if (searchTerms.isEmpty()) { logger.info(queryMarker, "empty"); continue; @@ -179,16 +186,16 @@ public class IndexQueryService { List queries = params.createIndexQueries(index, searchTerms); for (var query : queries) { - if (!params.hasTimeLeft()) { + if (!params.hasTimeLeft()) break; - } - if (omitQuery(params, query, results.size())) { + if (shouldOmitQuery(params, query, results.size())) { logger.info(queryMarker, "Omitting {}", query); continue; } int cnt = queryExecutor.executeQuery(query, results, params); + logger.info(queryMarker, "{} from {}", cnt, query); } } @@ -196,7 +203,9 @@ public class IndexQueryService { return results; } - private boolean omitQuery(SearchParameters params, IndexQuery query, int resultCount) { + /** @see IndexQueryPriority */ + private boolean shouldOmitQuery(SearchParameters params, IndexQuery query, int resultCount) { + var priority = query.queryPriority; return switch (priority) { @@ -208,6 +217,9 @@ public class IndexQueryService { private void logSearchTerms(SearchSubquery subquery, SearchIndexSearchTerms searchTerms) { + // This logging should only be enabled in testing, as it is very verbose + // and contains sensitive information + if (!logger.isInfoEnabled(queryMarker)) { return; } @@ -242,6 +254,7 @@ public class IndexQueryService { // Sort the ids for more favorable access patterns on disk resultIds.sort(); + // Parallel stream to calculate scores is a minor performance boost return Arrays.stream(resultIds.toArray()) .parallel() .mapToObj(evaluator::calculatePreliminaryScore) diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchQueryIndexService.java 
b/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchQueryIndexService.java index 6b2ed7a1..4355793b 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchQueryIndexService.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchQueryIndexService.java @@ -43,12 +43,14 @@ public class SearchQueryIndexService { } public List executeQuery(Context ctx, SearchQuery processedQuery) { + // Send the query final SearchResultSet results = indexClient.query(ctx, processedQuery.specs); + // Update the query count (this is what you see on the front page) searchVisitorCount.registerQuery(); + // Decorate and sort the results List urlDetails = resultDecorator.getAllUrlDetails(results); - urlDetails.sort(resultListComparator); return limitAndDeduplicateResults(processedQuery, urlDetails); From da8bcc6e246c0a7358c8a49a00a3f8893784666c Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 6 Jul 2023 20:18:09 +0200 Subject: [PATCH 017/157] Minor: Don't blow up the reader on a corrupted file --- .../crawling/io/CrawledDomainReader.java | 8 ++++---- .../src/main/java/plan/CrawlPlan.java | 15 +++++++++++---- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java index 744236c0..47dec05f 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java @@ -15,6 +15,7 @@ import java.io.InputStreamReader; import java.nio.file.Path; import java.util.ArrayList; import java.util.List; +import java.util.Optional; import java.util.concurrent.ForkJoinPool; import java.util.concurrent.TimeUnit; @@ -58,14 +59,13 @@ public class 
CrawledDomainReader { } } - public CrawledDomain readRuntimeExcept(Path path) { + public Optional readOptionally(Path path) { try { - return read(path); + return Optional.of(read(path)); } catch (Exception ex) { logger.warn("Failed to read domain", ex); - - throw new RuntimeException(ex); + return Optional.empty(); } } diff --git a/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java b/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java index c46ed854..ff299d68 100644 --- a/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java +++ b/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java @@ -21,6 +21,7 @@ import java.util.Iterator; import java.util.function.Consumer; import java.util.function.Predicate; import java.util.stream.Stream; +import java.util.Optional; @AllArgsConstructor @NoArgsConstructor @ToString public class CrawlPlan { @@ -95,7 +96,9 @@ public class CrawlPlan { entryStream .map(WorkLogEntry::path) .map(this::getCrawledFilePath) - .map(reader::readRuntimeExcept) + .map(reader::readOptionally) + .filter(Optional::isPresent) + .map(Optional::get) .forEach(consumer); } catch (IOException ex) { @@ -119,11 +122,13 @@ public class CrawlPlan { } return true; }) - .map(reader::readRuntimeExcept) + .map(reader::readOptionally) + .filter(Optional::isPresent) + .map(Optional::get) .forEach(consumer); } catch (IOException ex) { - logger.warn("Failed to read domains", ex); + logger.error("Failed to read domains", ex); throw new RuntimeException(ex); } @@ -141,7 +146,9 @@ public class CrawlPlan { stream = WorkLog.streamLog(crawl.getLogFile()) .map(WorkLogEntry::path) .map(CrawlPlan.this::getCrawledFilePath) - .map(reader::readRuntimeExcept); + .map(reader::readOptionally) + .filter(Optional::isPresent) + .map(Optional::get); } @Override From dbb758d1a8aa98cf8748f94e5f8395f466eac011 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 7 Jul 2023 19:44:57 +0200 Subject: [PATCH 018/157] Minor: Better 
error handling in crawled domain reader --- .../java/nu/marginalia/crawling/io/CrawledDomainReader.java | 2 +- .../main/java/nu/marginalia/crawl/CrawlJobExtractorMain.java | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java index 47dec05f..9c293af7 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java @@ -64,7 +64,7 @@ public class CrawledDomainReader { return Optional.of(read(path)); } catch (Exception ex) { - logger.warn("Failed to read domain", ex); + logger.warn("Failed to read domain " + path, ex); return Optional.empty(); } } diff --git a/code/tools/crawl-job-extractor/src/main/java/nu/marginalia/crawl/CrawlJobExtractorMain.java b/code/tools/crawl-job-extractor/src/main/java/nu/marginalia/crawl/CrawlJobExtractorMain.java index e8de4de6..e898293b 100644 --- a/code/tools/crawl-job-extractor/src/main/java/nu/marginalia/crawl/CrawlJobExtractorMain.java +++ b/code/tools/crawl-job-extractor/src/main/java/nu/marginalia/crawl/CrawlJobExtractorMain.java @@ -1,6 +1,5 @@ package nu.marginalia.crawl; -import com.zaxxer.hikari.HikariDataSource; import nu.marginalia.crawling.model.spec.CrawlingSpecification; import nu.marginalia.model.EdgeDomain; import nu.marginalia.db.DomainBlacklistImpl; @@ -10,7 +9,6 @@ import nu.marginalia.service.module.DatabaseModule; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import java.sql.SQLException; import java.util.*; import java.util.stream.Stream; From f03146de4b392028cb3a6773ff1615d915839487 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 7 Jul 2023 19:56:14 +0200 Subject: [PATCH 019/157] (crawler) Fix bug poor 
handling of duplicate ids * Also clean up the code a bit --- .../java/nu/marginalia/crawl/CrawlerMain.java | 49 +++++++++++++------ 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java index 224087f6..cbd9513a 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java @@ -18,6 +18,8 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.nio.file.Path; +import java.util.HashSet; +import java.util.Set; import java.util.concurrent.*; public class CrawlerMain implements AutoCloseable { @@ -38,6 +40,8 @@ public class CrawlerMain implements AutoCloseable { final int poolSize = Integer.getInteger("crawler.pool-size", 512); final int poolQueueSize = 32; + private final Set processedIds = new HashSet<>(); + AbortMonitor abortMonitor = AbortMonitor.getInstance(); Semaphore taskSem = new Semaphore(poolSize); @@ -87,26 +91,41 @@ public class CrawlerMain implements AutoCloseable { logger.info("Let's go"); + // TODO: Make this into an iterable instead so we can abort it plan.forEachCrawlingSpecification(this::startCrawlTask); } - private void startCrawlTask(CrawlingSpecification crawlingSpecification) { - if (abortMonitor.isAlive()) { - try { - taskSem.acquire(); - } catch (InterruptedException e) { - throw new RuntimeException(e); - } - pool.execute(() -> { - try { - fetchDomain(crawlingSpecification); - } - finally { - taskSem.release(); - } - }); + private void startCrawlTask(CrawlingSpecification crawlingSpecification) { + + if (!processedIds.add(crawlingSpecification.id)) { + + // This is a duplicate id, so we ignore it. 
Otherwise we'd end up crawling the same site twice, + // and if we're really unlucky, we might end up writing to the same output file from multiple + // threads with complete bit salad as a result. + + logger.error("Ignoring duplicate id: {}", crawlingSpecification.id); + return; } + + if (!abortMonitor.isAlive()) { + return; + } + + try { + taskSem.acquire(); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + + pool.execute(() -> { + try { + fetchDomain(crawlingSpecification); + } + finally { + taskSem.release(); + } + }); } private void fetchDomain(CrawlingSpecification specification) { From c125d8ab48dccc4f99822421cd63e283037d2f51 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 7 Jul 2023 20:02:05 +0200 Subject: [PATCH 020/157] (search) Fix a bug where space-like characters weren't normalized in query processing. --- .../query_parser/QueryTokenizer.java | 2 +- .../query_parser/QueryParserTest.java | 23 +++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/code/features-search/query-parser/src/main/java/nu/marginalia/query_parser/QueryTokenizer.java b/code/features-search/query-parser/src/main/java/nu/marginalia/query_parser/QueryTokenizer.java index 8ca580db..992e3ee5 100644 --- a/code/features-search/query-parser/src/main/java/nu/marginalia/query_parser/QueryTokenizer.java +++ b/code/features-search/query-parser/src/main/java/nu/marginalia/query_parser/QueryTokenizer.java @@ -9,7 +9,7 @@ import java.util.List; import java.util.regex.Pattern; public class QueryTokenizer { - private static final Pattern noisePattern = Pattern.compile("[,]"); + private static final Pattern noisePattern = Pattern.compile("[,\\s]"); public List tokenizeQuery(String rawQuery) { List tokens = new ArrayList<>(); diff --git a/code/features-search/query-parser/src/test/java/nu/marginalia/query_parser/QueryParserTest.java b/code/features-search/query-parser/src/test/java/nu/marginalia/query_parser/QueryParserTest.java index
d2a2ee6e..8bc19545 100644 --- a/code/features-search/query-parser/src/test/java/nu/marginalia/query_parser/QueryParserTest.java +++ b/code/features-search/query-parser/src/test/java/nu/marginalia/query_parser/QueryParserTest.java @@ -18,6 +18,29 @@ class QueryParserTest { parser = new QueryParser(); } + @Test + public void testTabHandling() { + var query = " lorem\tipsum\ndolor sit"; + var ret = parser.parse(query); + assertEquals(4, ret.size()); + + var lorem = ret.get(0); + assertEquals("lorem", lorem.str); + assertEquals("lorem", lorem.displayStr); + + var ipsum = ret.get(1); + assertEquals("ipsum", ipsum.str); + assertEquals("ipsum", ipsum.displayStr); + + var dolor = ret.get(2); + assertEquals("dolor", dolor.str); + assertEquals("dolor", dolor.displayStr); + + var sit = ret.get(3); + assertEquals("sit", sit.str); + assertEquals("sit", sit.displayStr); + } + @Test public void testAdviceString() { var ret = parser.parse("alcibiades (plato) \"my query\" -cars"); From cbbf60a599c84f402125ce09f51a53de7d1894c8 Mon Sep 17 00:00:00 2001 From: Viktor Date: Mon, 10 Jul 2023 17:36:12 +0200 Subject: [PATCH 021/157] Better fingerprinting (#35) * Better fingerprinting for server tech * Many more features in FeatureExtractor * Blog specialization * SiteType table --- code/common/db/build.gradle | 8 +- .../java/nu/marginalia/db/DomainTypes.java | 179 ++++++++++++++ .../resources/sql/current/10-domain-type.sql | 19 ++ .../nu/marginalia/db/DomainTypesTest.java | 63 +++++ .../marginalia/model/crawl/HtmlFeature.java | 44 +++- .../language/model/DocumentLanguageData.java | 9 + .../processes/converting-process/build.gradle | 1 + .../marginalia/converting/ConverterMain.java | 4 +- .../processor/ConverterDomainTypes.java | 53 ++++ .../logic/DocumentGeneratorExtractor.java | 123 ++++++++-- .../processor/logic/DocumentValuator.java | 5 +- .../processor/logic/FeatureExtractor.java | 227 +++++++++++++++--- .../plugin/HtmlDocumentProcessorPlugin.java | 25 +- 
.../specialization/BlogSpecialization.java | 210 ++++++++++++++++ .../HtmlProcessorSpecializations.java | 23 +- .../ConvertingIntegrationTestModule.java | 3 + .../BlogSpecializationTest.java | 17 ++ .../JavadocSpecializationTest.java | 2 +- .../LemmySpecializationTest.java | 4 +- .../XenForoSpecializationTest.java | 2 +- .../crawl/retreival/LinkFilterSelector.java | 3 +- .../index/svc/IndexQueryService.java | 2 +- .../marginalia/search/model/UrlDetails.java | 4 +- .../experiments/DebugConverterExperiment.java | 49 +--- 24 files changed, 963 insertions(+), 116 deletions(-) create mode 100644 code/common/db/src/main/java/nu/marginalia/db/DomainTypes.java create mode 100644 code/common/db/src/main/resources/sql/current/10-domain-type.sql create mode 100644 code/common/db/src/test/java/nu/marginalia/db/DomainTypesTest.java create mode 100644 code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/ConverterDomainTypes.java create mode 100644 code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/specialization/BlogSpecialization.java create mode 100644 code/processes/converting-process/src/test/java/nu/marginalia/converting/processor/plugin/specialization/BlogSpecializationTest.java diff --git a/code/common/db/build.gradle b/code/common/db/build.gradle index a06d8c3e..b7e3f0ef 100644 --- a/code/common/db/build.gradle +++ b/code/common/db/build.gradle @@ -2,6 +2,7 @@ plugins { id 'java' id "io.freefair.lombok" version "5.3.3.3" id 'jvm-test-suite' + } java { @@ -32,8 +33,14 @@ dependencies { testImplementation libs.bundles.slf4j.test testImplementation libs.bundles.junit testImplementation libs.mockito + + + testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4') + testImplementation 'org.testcontainers:mariadb:1.17.4' + testImplementation 'org.testcontainers:junit-jupiter:1.17.4' } + test { maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1 maxHeapSize = "8G" @@ -47,4 
+54,3 @@ task fastTests(type: Test) { excludeTags "slow" } } - diff --git a/code/common/db/src/main/java/nu/marginalia/db/DomainTypes.java b/code/common/db/src/main/java/nu/marginalia/db/DomainTypes.java new file mode 100644 index 00000000..60b42030 --- /dev/null +++ b/code/common/db/src/main/java/nu/marginalia/db/DomainTypes.java @@ -0,0 +1,179 @@ +package nu.marginalia.db; + +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.id.EdgeIdList; +import org.slf4j.LoggerFactory; +import org.slf4j.Logger; + +import javax.inject.Inject; +import javax.inject.Singleton; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.URL; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.List; + +/** A list of domains that are known to be of a certain type */ +@Singleton +public class DomainTypes { + + public enum Type { + BLOG, + TEST + }; + + private final Logger logger = LoggerFactory.getLogger(DomainTypes.class); + + private final HikariDataSource dataSource; + + @Inject + public DomainTypes(HikariDataSource dataSource) { + this.dataSource = dataSource; + } + + /** Get all domains of a certain type, including domains that are not in the EC_DOMAIN table */ + public List getAllDomainsByType(Type type) { + List ret = new ArrayList<>(); + + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT DOMAIN_NAME + FROM DOMAIN_SELECTION INNER JOIN DOMAIN_SELECTION_TYPE ON DOMAIN_TYPE_ID = DOMAIN_SELECTION_TYPE.ID + WHERE DOMAIN_SELECTION_TYPE.NAME = ? 
+ """)) + { + stmt.setString(1, type.name()); + var rs = stmt.executeQuery(); + while (rs.next()) { + ret.add(rs.getString(1)); + } + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + + return ret; + } + + /** Retrieve the EdgeId of all domains of a certain type, + * ignoring entries that are not in the EC_DOMAIN table */ + public EdgeIdList getKnownDomainsByType(Type type) { + EdgeIdList ret = new EdgeIdList<>(); + + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT EC_DOMAIN.ID + FROM DOMAIN_SELECTION + INNER JOIN DOMAIN_SELECTION_TYPE ON DOMAIN_TYPE_ID = DOMAIN_SELECTION_TYPE.ID + INNER JOIN EC_DOMAIN ON DOMAIN_SELECTION.DOMAIN_NAME = EC_DOMAIN.DOMAIN_NAME + WHERE DOMAIN_SELECTION_TYPE.NAME = ? + """)) + { + stmt.setString(1, type.name()); + var rs = stmt.executeQuery(); + while (rs.next()) { + ret.add(rs.getInt(1)); + } + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + + return ret; + } + + /** Reload the list of domains of a certain type from the source */ + public void reloadDomainsList(Type type) throws IOException, SQLException { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT SOURCE, ID FROM DOMAIN_SELECTION_TYPE WHERE NAME = ? + """); + var deleteStatement = conn.prepareStatement(""" + DELETE FROM DOMAIN_SELECTION WHERE DOMAIN_TYPE_ID = ? + """); + var insertStatement = conn.prepareStatement(""" + INSERT IGNORE INTO DOMAIN_SELECTION (DOMAIN_NAME, DOMAIN_TYPE_ID) VALUES (?, ?) 
+ """) + ) + { + stmt.setString(1, type.name()); + var rsp = stmt.executeQuery(); + + if (!rsp.next()) { + throw new RuntimeException("No such domain selection type: " + type); + } + + var source = rsp.getString(1); + int typeId = rsp.getInt(2); + + List downloadDomains = downloadDomainsList(source); + + try { + conn.setAutoCommit(false); + deleteStatement.setInt(1, typeId); + deleteStatement.executeUpdate(); + + for (String domain : downloadDomains) { + insertStatement.setString(1, domain); + insertStatement.setInt(2, typeId); + insertStatement.executeUpdate(); + // Could use batch insert here, but this executes infrequently, so it's not worth the hassle + } + + conn.commit(); + } + catch (SQLException ex) { + conn.rollback(); + throw ex; + } + finally { + conn.setAutoCommit(true); + } + } + } + + private List downloadDomainsList(String source) throws IOException { + List ret = new ArrayList<>(); + + logger.info("Downloading domain list from {}", source); + + try (var br = new BufferedReader(new InputStreamReader(new URL(source).openStream()))) { + String line; + + while ((line = br.readLine()) != null) { + line = cleanDomainListLine(line); + + + if (isValidDomainListEntry(line)) + ret.add(line); + } + } + + logger.info("-- found {}", ret.size()); + + + return ret; + } + + private String cleanDomainListLine(String line) { + line = line.trim(); + + int hashIdx = line.indexOf('#'); + if (hashIdx >= 0) + line = line.substring(0, hashIdx).trim(); + + return line; + } + + private boolean isValidDomainListEntry(String line) { + if (line.isBlank()) + return false; + if (!line.matches("[a-z0-9\\-.]+")) + return false; + + return true; + } +} diff --git a/code/common/db/src/main/resources/sql/current/10-domain-type.sql b/code/common/db/src/main/resources/sql/current/10-domain-type.sql new file mode 100644 index 00000000..2011d1f6 --- /dev/null +++ b/code/common/db/src/main/resources/sql/current/10-domain-type.sql @@ -0,0 +1,19 @@ +CREATE TABLE IF NOT EXISTS 
DOMAIN_SELECTION_TYPE ( + ID INT PRIMARY KEY AUTO_INCREMENT, + NAME VARCHAR(255) UNIQUE, + SOURCE VARCHAR(255) NOT NULL +) +CHARACTER SET utf8mb4 +COLLATE utf8mb4_bin; + +CREATE TABLE DOMAIN_SELECTION ( + DOMAIN_NAME VARCHAR(255) PRIMARY KEY, + DOMAIN_TYPE_ID INT, + FOREIGN KEY (DOMAIN_TYPE_ID) REFERENCES DOMAIN_SELECTION_TYPE(ID) ON DELETE CASCADE +) +CHARACTER SET utf8mb4 +COLLATE utf8mb4_unicode_ci; + +INSERT IGNORE INTO DOMAIN_SELECTION_TYPE(NAME, SOURCE) +VALUES ('BLOG', 'https://raw.githubusercontent.com/MarginaliaSearch/submit-site-to-marginalia-search/master/blogs.txt'), + ('TEST', 'https://downloads.marginalia.nu/domain-list-test.txt'); \ No newline at end of file diff --git a/code/common/db/src/test/java/nu/marginalia/db/DomainTypesTest.java b/code/common/db/src/test/java/nu/marginalia/db/DomainTypesTest.java new file mode 100644 index 00000000..0829f6f5 --- /dev/null +++ b/code/common/db/src/test/java/nu/marginalia/db/DomainTypesTest.java @@ -0,0 +1,63 @@ +package nu.marginalia.db; + +import com.google.common.collect.Sets; +import com.zaxxer.hikari.HikariConfig; +import com.zaxxer.hikari.HikariDataSource; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.testcontainers.containers.MariaDBContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +import java.io.IOException; +import java.sql.SQLException; +import java.util.HashSet; +import java.util.Set; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +@Testcontainers +public class DomainTypesTest { + @Container + static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") + .withDatabaseName("WMSA_prod") + .withUsername("wmsa") + .withPassword("wmsa") + .withInitScript("sql/current/10-domain-type.sql") + .withNetworkAliases("mariadb"); + + static HikariDataSource dataSource; + static DomainTypes domainTypes; + + @BeforeAll + public 
static void setup() { + HikariConfig config = new HikariConfig(); + config.setJdbcUrl(mariaDBContainer.getJdbcUrl()); + config.setUsername("wmsa"); + config.setPassword("wmsa"); + + dataSource = new HikariDataSource(config); + + domainTypes = new DomainTypes(dataSource); + } + + @AfterAll + public static void teardown() { + dataSource.close(); + } + + @Test + public void reloadDomainsList() throws SQLException, IOException { + domainTypes.reloadDomainsList(DomainTypes.Type.TEST); + + var downloadedDomains = new HashSet<>(domainTypes.getAllDomainsByType(DomainTypes.Type.TEST)); + + var expectedDomains = Set.of("www.marginalia.nu", "search.marginalia.nu", + "encyclopedia.marginalia.nu", "memex.marginalia.nu"); + + assertEquals(4, downloadedDomains.size()); + assertEquals(Set.of(), Sets.symmetricDifference(expectedDomains, downloadedDomains)); + } + +} \ No newline at end of file diff --git a/code/common/model/src/main/java/nu/marginalia/model/crawl/HtmlFeature.java b/code/common/model/src/main/java/nu/marginalia/model/crawl/HtmlFeature.java index 4bdb5ca1..d9adbff6 100644 --- a/code/common/model/src/main/java/nu/marginalia/model/crawl/HtmlFeature.java +++ b/code/common/model/src/main/java/nu/marginalia/model/crawl/HtmlFeature.java @@ -6,7 +6,10 @@ public enum HtmlFeature { MEDIA( "special:media"), JS("special:scripts"), AFFILIATE_LINK( "special:affiliate"), - TRACKING("special:tracking"), + TRACKING_INNOCENT("special:tracking"), + TRACKING_EVIL("special:tracking2"), + + VIEWPORT("special:viewport"), COOKIES("special:cookies"), CATEGORY_FOOD("category:food"), @@ -15,8 +18,43 @@ public enum HtmlFeature { GA_SPAM("special:gaspam"), - UNKNOWN("special:uncategorized") - ; + /** For fingerprinting and ranking */ + OPENGRAPH("special:opengraph"), + OPENGRAPH_IMAGE("special:opengraph:image"), + TWITTERCARD("special:twittercard"), + TWITTERCARD_IMAGE("special:twittercard:image"), + FONTAWSESOME("special:fontawesome"), + GOOGLEFONTS("special:googlefonts"), + 
DNS_PREFETCH("special:dnsprefetch"), + PRELOAD("special:preload"), + PRECONNECT("special:preconnect"), + PINGBACK("special:pingback"), + FEED("special:feed"), + WEBMENTION("special:webmention"), + INDIEAUTH("special:indieauth"), + ME_TAG("special:metag"), + NEXT_TAG("special:nexttag"), + AMPHTML("special:amphtml"), + JSON_LD("special:jsonld"), + ORIGIN_TRIAL("special:origintrial"), + PROFILE_GMPG("special:profile-gpmg"), + QUANTCAST("special:quantcast"), + COOKIELAW("special:cookielaw"), + DIDOMI("special:didomi"), + PARDOT("special:pardot"), + ONESIGNAL("special:onesignal"), + DATE_TAG("special:date_tag"), + NOSCRIPT_TAG("special:noscript_tag"), + + ROBOTS_INDEX("robots:index"), + ROBOTS_FOLLOW("robots:follow"), + ROBOTS_NOODP("robots:noodp"), + ROBOTS_NOYDIR("robots:noydir"), + DOFOLLOW_LINK("special:dofollow"), + APPLE_TOUCH_ICON("special:appleicon"), + + UNKNOWN("special:uncategorized"); + private final String keyword; diff --git a/code/libraries/language-processing/src/main/java/nu/marginalia/language/model/DocumentLanguageData.java b/code/libraries/language-processing/src/main/java/nu/marginalia/language/model/DocumentLanguageData.java index a40fd637..a889ab2a 100644 --- a/code/libraries/language-processing/src/main/java/nu/marginalia/language/model/DocumentLanguageData.java +++ b/code/libraries/language-processing/src/main/java/nu/marginalia/language/model/DocumentLanguageData.java @@ -17,6 +17,15 @@ public class DocumentLanguageData { public final DocumentSentence[] titleSentences; public final TObjectIntHashMap wordCount; + /** for test convenience */ + public static DocumentLanguageData empty() { + return new DocumentLanguageData( + new DocumentSentence[0], + new DocumentSentence[0], + new TObjectIntHashMap<>() + ); + } + public int totalNumWords() { int ret = 0; for (int i = 0; i < sentences.length; i++) { diff --git a/code/processes/converting-process/build.gradle b/code/processes/converting-process/build.gradle index 6ef9a25c..4cc4c63b 100644 --- 
a/code/processes/converting-process/build.gradle +++ b/code/processes/converting-process/build.gradle @@ -29,6 +29,7 @@ dependencies { implementation project(':code:api:index-api') implementation project(':code:common:model') + implementation project(':code:common:db') implementation project(':code:common:service') implementation project(':code:common:config') implementation project(':code:common:service-discovery') diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java index 31fa4bb1..3ecebb80 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java @@ -5,6 +5,7 @@ import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; import nu.marginalia.process.log.WorkLog; +import nu.marginalia.service.module.DatabaseModule; import plan.CrawlPlanLoader; import plan.CrawlPlan; import nu.marginalia.converting.compiler.InstructionsCompiler; @@ -33,7 +34,8 @@ public class ConverterMain { var plan = new CrawlPlanLoader().load(Path.of(args[0])); Injector injector = Guice.createInjector( - new ConverterModule(plan) + new ConverterModule(plan), + new DatabaseModule() ); injector.getInstance(ConverterMain.class); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/ConverterDomainTypes.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/ConverterDomainTypes.java new file mode 100644 index 00000000..95a1b5fd --- /dev/null +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/ConverterDomainTypes.java @@ -0,0 +1,53 @@ +package nu.marginalia.converting.processor; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import 
nu.marginalia.db.DomainTypes; +import nu.marginalia.model.EdgeDomain; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.sql.SQLException; +import java.util.HashMap; +import java.util.Map; + +/** Converter-side wrapper for of common:db's DomainTypes, + * which is a list of domains of a known type (e.g. blog) + */ +@Singleton +public class ConverterDomainTypes { + private final Logger logger = LoggerFactory.getLogger(ConverterDomainTypes.class); + private final Map domainTypes = new HashMap<>(); + + private enum DomainType { + BLOG + } + + @Inject + public ConverterDomainTypes(DomainTypes types) throws SQLException { + var allBlogs = types.getAllDomainsByType(DomainTypes.Type.BLOG); + + if (allBlogs.isEmpty()) { + logger.info("No domains of type BLOG found in database, downloading list"); + try { + types.reloadDomainsList(DomainTypes.Type.BLOG); + allBlogs = types.getAllDomainsByType(DomainTypes.Type.BLOG); + } + catch (IOException ex) { + logger.error("Failed to download domains list", ex); + } + } + + for (var item : allBlogs) { + domainTypes.put(new EdgeDomain(item), DomainType.BLOG); + } + + logger.info("Loaded {} domain types", domainTypes.size()); + + } + + public boolean isBlog(EdgeDomain domain) { + return domainTypes.get(domain) == DomainType.BLOG; + } +} diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java index ac1c15a2..dea7cefa 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java @@ -12,13 +12,13 @@ import java.util.List; public class DocumentGeneratorExtractor { private static final String 
defaultValue = "unset"; - public DocumentGenerator generatorCleaned(Document doc) { + public DocumentGenerator detectGenerator(Document doc, String responseHeaders) { var tags = doc.select("meta[name=generator]"); if (tags.size() == 0) { // Some sites have a comment in the head instead of a meta tag - return fingerprintByComments(doc); + return fingerprintServerTech(doc, responseHeaders); } if (tags.size() > 1) { return DocumentGenerator.multiple(); @@ -29,11 +29,14 @@ public class DocumentGeneratorExtractor { generator = removePrefixOrSuffix(generator); if (generator.isBlank()) - return DocumentGenerator.unset(); + return fingerprintServerTech(doc, responseHeaders); + + if (generator.startsWith("AMP by WP")) + return DocumentGenerator.of("wordpress", "wordpress-amp"); String[] parts = StringUtils.split(generator, " ,:!"); if (parts.length == 0) - return DocumentGenerator.unset(); + return fingerprintServerTech(doc, responseHeaders); int slashIdx = parts[0].indexOf('/'); if (slashIdx >= 0) { @@ -42,7 +45,7 @@ public class DocumentGeneratorExtractor { } if (parts.length > 3) { - return DocumentGenerator.unset(); // if it's still very long after trim(), it's probably a custom hand written message + return fingerprintServerTech(doc, responseHeaders); // if it's still very long after trim(), it's probably a custom hand written message } switch (parts[0]) { @@ -73,7 +76,7 @@ public class DocumentGeneratorExtractor { } // Fallback logic when there is no meta tag - private DocumentGenerator fingerprintByComments(Document doc) { + private DocumentGenerator fingerprintServerTech(Document doc, String responseHeaders) { for (var comment : doc.getElementsByTag("head").comments()) { String data = comment.getData(); @@ -81,22 +84,43 @@ public class DocumentGeneratorExtractor { if (data.contains("Generated by javadoc")) { return DocumentGenerator.of("javadoc"); } - + if (data.contains("Squarespace")) { + return DocumentGenerator.of("squarespace"); + } if (data.contains("phpBB")) 
{ return DocumentGenerator.of("phpbb"); } } for (var tag : doc.head().getElementsByTag("script")) { - if (tag.html().contains("window.lemmyConfig")) { - return DocumentGenerator.of("lemmy"); - } - if (tag.html().contains("URL_DOMAIN = 'wikidot.com'")) { - return DocumentGenerator.of("wikidot"); - } - if (tag.attr("src").contains("wp-content")) { + String scriptSrc = tag.attr("src"); + + if (scriptSrc.contains("wp-content") || scriptSrc.contains("wp-includes")) { return DocumentGenerator.of("wordpress", "wordpress-sneaky"); } + if (scriptSrc.contains("squarespace.com")) { + return DocumentGenerator.of("squarespace"); + } + if (scriptSrc.contains("cdn.cloversites.com")) { + return DocumentGenerator.of("cloversites"); + } + if (scriptSrc.contains("bndzgl.com")) { + return DocumentGenerator.of("bndzgl"); + } + if (scriptSrc.contains("editmysite.com")) { + return DocumentGenerator.of("editmysite"); + } + if (scriptSrc.contains("website-editor.net")) { + return DocumentGenerator.of("website-editor.net"); + } + String scriptHtml = tag.html(); + if (scriptHtml.contains("window.lemmyConfig")) { + return DocumentGenerator.of("lemmy"); + } + if (scriptHtml.contains("URL_DOMAIN = 'wikidot.com'")) { + return DocumentGenerator.of("wikidot"); + } + } for (var tag : doc.head().getElementsByTag("link")) { @@ -109,6 +133,10 @@ public class DocumentGeneratorExtractor { return DocumentGenerator.of("flarum"); } + if (doc.getElementById("tracpowered") != null) { + return DocumentGenerator.of("trac"); + } + if (doc.getElementById("_xfClientLoadTime") != null) { return DocumentGenerator.of("xenforo"); } @@ -117,6 +145,48 @@ public class DocumentGeneratorExtractor { return DocumentGenerator.of("invision"); } + if (doc.getElementById("___gatsby") != null) { + return DocumentGenerator.of("gatsby"); + } + + String[] headers = responseHeaders.toLowerCase().split("\n+"); + for (var header : headers) { + if (header.contains("x-drupal-cache")) { + return DocumentGenerator.of("drupal"); + } + if 
(header.contains("x-powered-by: asp.net")) { + return DocumentGenerator.of("asp.net"); + } + if (header.contains("x-powered-by: php")) { + return DocumentGenerator.of("php"); + } + if (header.contains("x-powered-by: wp engine")) { + return DocumentGenerator.of("wordpress", "wp-engine", "wordpress-sneaky"); + } + if (header.contains("x-powered-by: statamic")) { + return DocumentGenerator.of("laravel", "statamic"); + } + } + + // These should be all the way down as they are the most generic + for (var header : headers) { + if (header.contains("server: mastodon")) { + return DocumentGenerator.of("mastodon"); + } + if (header.contains("server: gunicorn")) { + return DocumentGenerator.of("gunicorn"); + } + if (header.contains("server: nginx")) { + return DocumentGenerator.of("nginx"); + } + if (header.contains("server: apache")) { + return DocumentGenerator.of("apache"); + } + if (header.contains("server: cowboy")) { + return DocumentGenerator.of("cowboy"); // erlang, really?! + } + } + return DocumentGenerator.unset(); } @@ -138,6 +208,11 @@ public class DocumentGeneratorExtractor { generator = generator.substring(0, dashIdx); } + int parenIdx = generator.indexOf('('); // Some strings have values like 'Drupal 9 (https://www.drupal.org)' + if (parenIdx >= 0) { + generator = generator.substring(0, parenIdx); + } + if (!StringUtils.isAsciiPrintable(generator)) return ""; @@ -170,11 +245,18 @@ public class DocumentGeneratorExtractor { final GeneratorType type = switch (parts[0]) { case "joomla", "wordpress", "drupal", "plone", "postnuke", "divi", "freeway", "unicity", "modx", "sitemagic", "agility", "edlio", "blogger", "slider", "slider_revolution", "gravcms", - "typo3", "dotnetnuke", "cms", "coremedia", "dspace" + "typo3", "dotnetnuke", "cms", "coremedia", "dspace", "laravel", "trac", "bunnypress", "astro", + "ghost", "publii" -> GeneratorType.CMS; case "wix.com", "one.com", "wpbakery", "claris", "wordpress.com", "hubspot", "visual_composer", "mobirise", "everweb", 
"rapidweaver", "shorthand", - "visual", "nitropack", + "visual", "nitropack", "squarespace", "editmysite", "websiteeditor.net", + + "svbtle.com", "write.as", "montaigne.io", // blogging platforms, maybe should be in another category? + + "cloversites", // clover is a church-oriented website builder, found that kinda neat + "bndzgl", // band websites ..? + /* these are not SAAS but close enough */ "redux", "bootply" -> GeneratorType.SAAS; @@ -185,7 +267,8 @@ public class DocumentGeneratorExtractor { "pdf2htmlex", "nvu", "mozilla", "golive", "tenfingers", "publisher", "allaire", "neooffice" -> GeneratorType.BOOMER_STATIC; - case "hugo", "jekyll", "hakyll", "gatsby", "react", "gridsome" + case "hugo", "jekyll", "hakyll", "nikola", "zola", "olivetti", "pelican", "sushy", "hexo", "eleventy", + "gridsome", "vuepress", "docusaurus", "docpad", "techou", "quarto", "soupault" -> GeneratorType.ZOOMER_STATIC; case "vi", "vim", "emacs", "orgmode", "hand", "vscode", "atom", "bbedit", "nano", "notepad.exe", "gedit", "me", @@ -198,9 +281,9 @@ public class DocumentGeneratorExtractor { -> GeneratorType.FORUM; case "mediawiki", "dokuwiki", "wikidot", "sharepoint" -> GeneratorType.WIKI; - case "pandoc", "mkdocs", "doxygen", "javadoc" + case "pandoc", "mkdocs", "doxygen", "javadoc", "asciidoc", "jsdoc" -> GeneratorType.DOCS; - case "woocommerce", "shopfactory", "prestashop", "magento", "shopify", "sitedirect", "seomatic" + case "woocommerce", "shopfactory", "prestashop", "magento", "shopify", "sitedirect", "seomatic", "osclass" -> GeneratorType.ECOMMERCE_AND_SPAM; default -> GeneratorType.UNKNOWN; @@ -216,7 +299,7 @@ public class DocumentGeneratorExtractor { public static DocumentGenerator multiple() { // It's *generally* WordPress or the like that injects multiple generator tags - return new DocumentGenerator(GeneratorType.CMS, List.of(defaultValue)); + return new DocumentGenerator(GeneratorType.CMS, List.of("wordpress", "wp-best-guess")); } } diff --git 
a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java index 9de7af57..91003172 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java @@ -19,14 +19,13 @@ public class DocumentValuator { int textLength) throws DisqualifiedException { double scriptPenalty = getScriptPenalty(parsedDocument); - int textBodyLength = textLength; int rawLength = crawledDocument.documentBody.length(); - if (textBodyLength == 0) { + if (textLength == 0) { throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LENGTH); } - return Math.log(textBodyLength / (double) (1+rawLength))*htmlStandard.scale + return Math.log(textLength / (double) (1+rawLength))*htmlStandard.scale + htmlStandard.offset - scriptPenalty; } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java index 2ea690f1..57a98879 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java @@ -21,25 +21,29 @@ import java.util.Set; @Singleton public class FeatureExtractor { - private static final List trackers = List.of("adform.net", + private static final List innocentTrackers = List.of( + "twitter.com", + "bing.com", + "msn.com"); + private static final List shittyTrackers = List.of("adform.net", "connect.facebook", + "facebook.com/tr", "googletagmanager.com", "googlesyndication.com", - "google.com", - 
"twitter.com", "smartadserver.com", "doubleclick.com", "2mdn.com", "dmtry.com", - "bing.com", - "msn.com", "amazon-adsystem.com", "alexametrics.com", "rubiconproject.com", "chango.com", "d5nxst8fruw4z.cloudfront.net", "d31qbv1cthcecs.cloudfront.net", - "linkedin.com"); + "linkedin.com", + "perfectaudience.com", + "marketingautomation.services", + "usefathom"); private final AdblockSimulator adblockSimulator; private final RecipeDetector recipeDetector; @@ -71,21 +75,119 @@ public class FeatureExtractor { } for (var scriptTag : scriptTags) { - if (isJavascriptTag(scriptTag)) { + final String type = scriptTag.attr("type"); + + if ("application/ld+json".equalsIgnoreCase(type)) { + features.add(HtmlFeature.JSON_LD); + } + else { features.add(HtmlFeature.JS); } } - // 500 IQ web developers use error or load handlers - // sneakily load JS without explicit script tags - for (var link : doc.head().getElementsByTag("link")) { - if (link.hasAttr("onerror")) { - features.add(HtmlFeature.JS); - break; + if (!doc.head().getElementsByTag("viewport").isEmpty()) { + features.add(HtmlFeature.VIEWPORT); + } + for (var atag : doc.body().getElementsByTag("a")) { + var rel = atag.attr("rel"); + if (rel.equals("dofollow")) { + features.add(HtmlFeature.DOFOLLOW_LINK); } - if (link.hasAttr("onload")) { + } + + if (!doc.getElementsByTag("date").isEmpty()) { + features.add(HtmlFeature.DATE_TAG); + } + if (!doc.getElementsByTag("noscript").isEmpty()) { + features.add(HtmlFeature.NOSCRIPT_TAG); + } + + + for (var link : doc.head().getElementsByTag("link")) { + + // 500 IQ web developers use error or load handlers + // sneakily load JS without explicit script tags + if (link.hasAttr("onerror")) features.add(HtmlFeature.JS); - break; + if (link.hasAttr("onload")) + features.add(HtmlFeature.JS); + + if (link.hasAttr("pingback")) { + features.add(HtmlFeature.PINGBACK); + } + + + var href = link.attr("href"); + + if (href.contains("indieauth")) + features.add(HtmlFeature.INDIEAUTH); + + var rel = 
link.attr("rel"); + + if (rel.equals("webmention")) + features.add(HtmlFeature.WEBMENTION); + + if (rel.equals("me")) + features.add(HtmlFeature.ME_TAG); + + if (rel.equals("next")) + features.add(HtmlFeature.NEXT_TAG); + + if (rel.equals("alternate") && link.hasAttr("type")) + features.add(HtmlFeature.FEED); + + if (rel.equals("dns-prefetch")) + features.add(HtmlFeature.DNS_PREFETCH); + + if (rel.equals("preload")) + features.add(HtmlFeature.PRELOAD); + + if (rel.equals("preconnect")) + features.add(HtmlFeature.PRECONNECT); + + if (rel.equals("amphtml")) + features.add(HtmlFeature.AMPHTML); + + if (rel.equals("apple-touch-icon")) + features.add(HtmlFeature.APPLE_TOUCH_ICON); + + } + + for (var meta : doc.head().getElementsByTag("meta")) { + // + if (meta.attr("name").equals("robots")) { + var content = meta.attr("content"); + if (!content.contains("noindex") && content.contains("index")) { + features.add(HtmlFeature.ROBOTS_INDEX); + } + if (!content.contains("nofollow") && content.contains("follow")) { + features.add(HtmlFeature.ROBOTS_FOLLOW); + } + if (content.contains("noodp")) { + features.add(HtmlFeature.ROBOTS_NOODP); + } + if (content.contains("noydir")) { + features.add(HtmlFeature.ROBOTS_NOYDIR); + } + } + + if (meta.attr("profile").contains("gmpg")) { + features.add(HtmlFeature.PROFILE_GMPG); + } + if (meta.attr("property").equals("og:description")) { + features.add(HtmlFeature.OPENGRAPH); + } + if (meta.attr("property").equals("og:image")) { + features.add(HtmlFeature.OPENGRAPH_IMAGE); + } + if (meta.attr("name").equals("twitter:description")) { + features.add(HtmlFeature.TWITTERCARD); + } + if (meta.attr("name").equals("twitter:image")) { + features.add(HtmlFeature.TWITTERCARD_IMAGE); + } + if (meta.attr("http-equiv").equals("origin-trial")) { + features.add(HtmlFeature.ORIGIN_TRIAL); } } @@ -100,14 +202,74 @@ public class FeatureExtractor { } for (var scriptTag : scriptTags) { - if (hasTrackingScript(scriptTag)) { - features.add(HtmlFeature.TRACKING); 
- break; + if (hasInvasiveTrackingScript(scriptTag)) { + features.add(HtmlFeature.TRACKING_INNOCENT); + features.add(HtmlFeature.TRACKING_EVIL); + } + else if (hasNaiveTrackingScript(scriptTag)) { + features.add(HtmlFeature.TRACKING_INNOCENT); + } + + if (scriptTag.hasAttr("didomi/javascript")) { + features.add(HtmlFeature.DIDOMI); + } + + String src = scriptTag.attr("src"); + if (src.contains("OneSignalSDK")) { + features.add(HtmlFeature.ONESIGNAL); + } + + String scriptText = scriptTag.html(); + + if (scriptText.contains("'pd.js'")) { + features.add(HtmlFeature.PARDOT); + } + if (scriptText.contains("https://cmp.quantcast.com")) { + features.add(HtmlFeature.QUANTCAST); + } + if (scriptText.contains("https://quantcast.mgr.consensu.org")) { + features.add(HtmlFeature.QUANTCAST); + } + if (scriptText.contains("https://cdn.cookielaw.org")) { + features.add(HtmlFeature.COOKIELAW); + } + if (scriptText.contains("_linkedin_data_partner_id")) { + features.add(HtmlFeature.TRACKING_EVIL); + } + if (scriptText.contains("window.OneSignal")) { + features.add(HtmlFeature.ONESIGNAL); + } + if (scriptText.contains("connect.facebook.net")) { + features.add(HtmlFeature.TRACKING_EVIL); + } + if (scriptText.contains("hotjar.com")) { + features.add(HtmlFeature.TRACKING_INNOCENT); + } + } + + for (var noscript : doc.getElementsByTag("noscript")) { + for (var iframe : noscript.getElementsByTag("iframe")) { + if (hasInvasiveTrackingScript(iframe)) { + features.add(HtmlFeature.TRACKING_INNOCENT); + features.add(HtmlFeature.TRACKING_EVIL); + } + else if (hasNaiveTrackingScript(iframe)) { + features.add(HtmlFeature.TRACKING_INNOCENT); + } + } + for (var img : noscript.getElementsByTag("img")) { + if (hasInvasiveTrackingScript(img)) { + features.add(HtmlFeature.TRACKING_INNOCENT); + features.add(HtmlFeature.TRACKING_EVIL); + } + else if (hasNaiveTrackingScript(img)) { + features.add(HtmlFeature.TRACKING_INNOCENT); + } } } if (scriptTags.html().contains("google-analytics.com")) { - 
features.add(HtmlFeature.TRACKING); + features.add(HtmlFeature.TRACKING_INNOCENT); } for (var aTag : doc.getElementsByTag("a")) { @@ -129,30 +291,33 @@ public class FeatureExtractor { return features; } - private boolean hasTrackingScript(Element scriptTag) { - return hasTrackingScript(scriptTag.attr("src")); + private boolean hasInvasiveTrackingScript(Element scriptTag) { + return hasInvasiveTrackingScript(scriptTag.attr("src")); } + private boolean hasNaiveTrackingScript(Element scriptTag) { + return hasNaiveTrackingScript(scriptTag.attr("src")); + } + private boolean hasInvasiveTrackingScript(String src) { - private boolean hasTrackingScript(String scriptText) { - - for (var tracker : trackers) { - if (scriptText.contains(tracker)) { + for (var tracker : shittyTrackers) { + if (src.contains(tracker)) { return true; } } return false; } - private boolean isJavascriptTag(Element scriptTag) { - final String type = scriptTag.attr("type"); + private boolean hasNaiveTrackingScript(String src) { - if ("application/ld+json".equalsIgnoreCase(type)) { - return false; + for (var tracker : innocentTrackers) { + if (src.contains(tracker)) { + return true; + } } - - return true; + return false; } + boolean isAmazonAffiliateLink(Element aTag) { final String href = aTag.attr("href").toLowerCase(); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java index 15163c6c..c2119688 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java @@ -111,9 +111,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin final EdgeUrl url = new 
EdgeUrl(crawledDocument.url); - final var generatorParts = documentGeneratorExtractor.generatorCleaned(doc); + final var generatorParts = documentGeneratorExtractor.detectGenerator(doc, crawledDocument.headers); - final var specialization = htmlProcessorSpecializations.select(generatorParts); + final var specialization = htmlProcessorSpecializations.select(generatorParts, url); if (!specialization.shouldIndex(url)) { throw new DisqualifiedException(DisqualificationReason.IRRELEVANT); @@ -167,7 +167,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin .addGenerator(generatorParts.keywords()) .build(); + words.addAllSyntheticTerms(tagWords); + specialization.amendWords(doc, words); getLinks(url, ret, doc, words); @@ -216,8 +218,23 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin return true; } - // Annoying wordpress crap - if (url.path.startsWith("/tag/") && url.path.endsWith("/")) { + // Annoying blog crap + if (url.path.contains("/tag/") && url.path.endsWith("/")) { + return true; + } + if (url.path.contains("/tags/") && url.path.endsWith("/")) { + return true; + } + if (url.path.contains("/category/") && url.path.endsWith("/")) { + return true; + } + if (url.path.contains("/categories/") && url.path.endsWith("/")) { + return true; + } + if (url.path.contains("/section/") && url.path.endsWith("/")) { + return true; + } + if (url.path.contains("/sections/") && url.path.endsWith("/")) { return true; } return false; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/specialization/BlogSpecialization.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/specialization/BlogSpecialization.java new file mode 100644 index 00000000..f40654bc --- /dev/null +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/specialization/BlogSpecialization.java @@ -0,0 +1,210 @@ 
+package nu.marginalia.converting.processor.plugin.specialization; + +import ca.rmen.porterstemmer.PorterStemmer; +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.keyword.model.DocumentKeywordsBuilder; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.idx.WordFlags; +import nu.marginalia.summary.SummaryExtractor; +import org.apache.logging.log4j.util.Strings; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; +import org.jsoup.select.NodeFilter; +import org.jsoup.select.NodeVisitor; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.function.Predicate; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + +/** The blog specialization is used for blogs, and makes heavy assumptions about the nature of the document + * that aren't generally true, but if the categorization is correct, will yield much better results. 
+ */ +@Singleton +public class BlogSpecialization extends DefaultSpecialization { + + @Inject + public BlogSpecialization(SummaryExtractor summaryExtractor) { + super(summaryExtractor); + } + + @Override + public Document prune(Document original) { + var doc = original.clone(); + + // Remove all nav junk, comments and other stuff + doc.filter(new BlogPruningFilter()); + + // If there is an article tag, use that as the root + var articleTags = doc.getElementsByTag("article"); + var firstArticle = articleTags.first(); + if (firstArticle != null) { + var art = firstArticle.clone(); + + doc.body().empty(); + doc.body().appendChild(art); + + return doc; + } + + // Use the default pruning as a fallback + return super.prune(doc); + } + + @Override + public String getSummary(Document original, Set importantWords) { + return super.getSummary(original, importantWords); + } + + private final static List badPathElements = + List.of("/tag/", "/tags/", "/tagged/", "/category/", "/categories/", "/section/", "/sections/", "/page/", "/author/"); + + private final static Predicate dateIndexTest1 = Pattern.compile("^/(\\d{4}/(\\d{2}/){0,2}?)$").asMatchPredicate(); + private final static Predicate dateIndexTest2 = Pattern.compile("^/(\\d{2}/){1,2}$").asMatchPredicate(); + + @Override + public boolean shouldIndex(EdgeUrl url) { + String path = url.path; + + // Don't index the root path for blogs, as it is usually an ephemeral list of all posts + if ("/".equals(path)) return false; + + // Likewise for the blog's home page + if (path.endsWith("/blog/")) return false; + if (path.endsWith("/log/")) return false; + if (path.endsWith("/weblog/")) return false; + if (path.endsWith("/posts/")) return false; + if (path.endsWith("/articles/")) return false; + + // Refuse paths that contain any of the bad path elements + for (String badPathElement : badPathElements) { + if (path.contains(badPathElement)) return false; + } + + // We don't want chronological listings + if 
(dateIndexTest1.test(path)) return false; + if (dateIndexTest2.test(path)) return false; + + return true; + } + + private static PorterStemmer ps = new PorterStemmer(); + public void amendWords(Document doc, DocumentKeywordsBuilder words) { + var tagExtractor = new BlogTagExtractor(); + doc.traverse(tagExtractor); + + var tags = tagExtractor.getTags(); + if (!tags.isEmpty()) { + var stemmed = tags.stream().map(ps::stemWord).collect(Collectors.toSet()); + words.setFlagOnMetadataForWords(WordFlags.Subjects, stemmed); + + Set specialTags = tags.stream().map(s -> "tag:" + s).collect(Collectors.toSet()); + words.addAllSyntheticTerms(specialTags); + } + + } + + /** Removes all the non-content elements from the document, + * making strong blog-specific assumptions about the nature of + * the layout */ + private static class BlogPruningFilter implements NodeFilter { + private static final List badClassElements = Arrays.asList("comment", "reply", "sidebar", "header", "footer", "nav"); + private static final List badIdElements = Arrays.asList("comments", "header", "footer", "nav"); + + @Override + public FilterResult head(Node node, int depth) { + if (node instanceof Element el) { + String classes = el.attr("class"); + String id = el.id(); + + for (String badClassElement : badClassElements) { + if (classes.contains(badClassElement)) { + return FilterResult.REMOVE; + } + } + for (String badIdElement : badIdElements) { + if (id.contains(badIdElement)) { + return FilterResult.REMOVE; + } + } + } + return FilterResult.CONTINUE; + } + } + + + // Extract tag keywords from the blog post + public static class BlogTagExtractor implements NodeVisitor { + private final Set tags = new HashSet<>(); + int lookForTags = -1; + + public Set getTags() { + Set tagsClean = tags.stream().map(String::toLowerCase).map(this::cleanTag).filter(Strings::isNotBlank).collect(Collectors.toSet()); + + // If there are more than 5 tags, it's probably a global tag listing + // and not a post-specific tag 
listing + if (tagsClean.size() > 5) + return Set.of(); + + return tagsClean; + } + + private final Pattern splitterPattern = Pattern.compile("\\s+"); + private final Pattern noisePattern = Pattern.compile("[^a-zA-Z0-9]"); + + // This is hideously expensive but blog posts are relatively few and far between + private String cleanTag(String tag) { + + String[] parts = splitterPattern.split(tag); + + if (parts.length > 3) + return ""; + + for (int i = 0; i < parts.length; i++) { + if (parts[i].startsWith("#")) + parts[i] = parts[i].substring(1); + else if (parts[i].startsWith("(") && parts[i].endsWith(")")) + parts[i] = ""; + else + parts[i] = noisePattern.matcher(parts[i]).replaceAll(""); + + if (parts[i].equals("tags")) + parts[i] = ""; + } + + + return Arrays.stream(parts).filter(Strings::isNotBlank).collect(Collectors.joining("_")); + } + + @Override + public void head(Node node, int depth) { + + if (!(node instanceof Element el)) { + return; + } + + if (lookForTags < 0) { + if (el.attr("class").contains("tags")) { + lookForTags = depth; + } + if (el.tagName().equals("a")) { + if (el.attr("class").contains("tag") + || el.attr("href").startsWith("/tag/")) + tags.add(el.text()); + } + } + else if (el.tagName().equals("a")) { + tags.add(el.text()); + } + + } + public void tail(Node node, int depth) { + if (depth <= lookForTags) { lookForTags = -1; } + } + } +} diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/specialization/HtmlProcessorSpecializations.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/specialization/HtmlProcessorSpecializations.java index dab6df24..b64c1dde 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/specialization/HtmlProcessorSpecializations.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/specialization/HtmlProcessorSpecializations.java @@ 
-2,7 +2,9 @@ package nu.marginalia.converting.processor.plugin.specialization; import com.google.inject.Inject; import com.google.inject.Singleton; +import nu.marginalia.converting.processor.ConverterDomainTypes; import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor; +import nu.marginalia.keyword.model.DocumentKeywordsBuilder; import nu.marginalia.model.EdgeUrl; import org.jsoup.nodes.Document; @@ -10,27 +12,41 @@ import java.util.Set; @Singleton public class HtmlProcessorSpecializations { + private final ConverterDomainTypes domainTypes; private final LemmySpecialization lemmySpecialization; private final XenForoSpecialization xenforoSpecialization; private final PhpBBSpecialization phpBBSpecialization; private final JavadocSpecialization javadocSpecialization; + private final BlogSpecialization blogSpecialization; private final DefaultSpecialization defaultSpecialization; @Inject - public HtmlProcessorSpecializations(LemmySpecialization lemmySpecialization, + public HtmlProcessorSpecializations(ConverterDomainTypes domainTypes, + LemmySpecialization lemmySpecialization, XenForoSpecialization xenforoSpecialization, PhpBBSpecialization phpBBSpecialization, JavadocSpecialization javadocSpecialization, + BlogSpecialization blogSpecialization, DefaultSpecialization defaultSpecialization) { + this.domainTypes = domainTypes; this.lemmySpecialization = lemmySpecialization; this.xenforoSpecialization = xenforoSpecialization; this.phpBBSpecialization = phpBBSpecialization; this.javadocSpecialization = javadocSpecialization; + this.blogSpecialization = blogSpecialization; this.defaultSpecialization = defaultSpecialization; } /** Depending on the generator tag, we may want to use specialized logic for pruning and summarizing the document */ - public HtmlProcessorSpecializationIf select(DocumentGeneratorExtractor.DocumentGenerator generator) { + public HtmlProcessorSpecializationIf select( + DocumentGeneratorExtractor.DocumentGenerator generator, + 
EdgeUrl url) + { + + if (domainTypes.isBlog(url.domain)) { + return blogSpecialization; + } + if (generator.keywords().contains("lemmy")) { return lemmySpecialization; } @@ -58,5 +74,8 @@ public class HtmlProcessorSpecializations { default boolean shouldIndex(EdgeUrl url) { return true; } default double lengthModifier() { return 1.0; } + + default void amendWords(Document doc, DocumentKeywordsBuilder words) {} + } } diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTestModule.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTestModule.java index f92b1bc9..4471e4d1 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTestModule.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTestModule.java @@ -4,6 +4,8 @@ import com.google.inject.AbstractModule; import com.google.inject.name.Names; import nu.marginalia.LanguageModels; import nu.marginalia.WmsaHome; +import nu.marginalia.converting.processor.ConverterDomainTypes; +import org.mockito.Mockito; public class ConvertingIntegrationTestModule extends AbstractModule { public void configure() { @@ -13,5 +15,6 @@ public class ConvertingIntegrationTestModule extends AbstractModule { bind(Integer.class).annotatedWith(Names.named("max-summary-length")).toInstance(255); bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels()); + bind(ConverterDomainTypes.class).toInstance(Mockito.mock(ConverterDomainTypes.class)); } } diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/processor/plugin/specialization/BlogSpecializationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/processor/plugin/specialization/BlogSpecializationTest.java new file mode 100644 index 00000000..4ec49f14 --- /dev/null +++ 
b/code/processes/converting-process/src/test/java/nu/marginalia/converting/processor/plugin/specialization/BlogSpecializationTest.java @@ -0,0 +1,17 @@ +package nu.marginalia.converting.processor.plugin.specialization; + +import nu.marginalia.model.EdgeUrl; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.*; + +class BlogSpecializationTest { + + @Test + void shouldIndex() throws Exception { + var spec = new BlogSpecialization(null); + assertFalse(spec.shouldIndex(new EdgeUrl("https://blog.marginalia.nu/2023/00/22/"))); + assertFalse(spec.shouldIndex(new EdgeUrl("https://blog.marginalia.nu/2023/00/"))); + assertFalse(spec.shouldIndex(new EdgeUrl("https://blog.marginalia.nu/00/22/"))); + } +} \ No newline at end of file diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/processor/plugin/specialization/JavadocSpecializationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/processor/plugin/specialization/JavadocSpecializationTest.java index 823b92f8..355921ea 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/processor/plugin/specialization/JavadocSpecializationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/processor/plugin/specialization/JavadocSpecializationTest.java @@ -34,7 +34,7 @@ class JavadocSpecializationTest { @Test void generatorExtraction() { - var gen = generatorExtractor.generatorCleaned(Jsoup.parse(thread)); + var gen = generatorExtractor.detectGenerator(Jsoup.parse(thread), ""); System.out.println(gen); } diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/processor/plugin/specialization/LemmySpecializationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/processor/plugin/specialization/LemmySpecializationTest.java index f89abd17..7aab1759 100644 --- 
a/code/processes/converting-process/src/test/java/nu/marginalia/converting/processor/plugin/specialization/LemmySpecializationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/processor/plugin/specialization/LemmySpecializationTest.java @@ -37,8 +37,8 @@ class LemmySpecializationTest { @Test void generatorExtraction() { - var generatorIndex = generatorExtractor.generatorCleaned(Jsoup.parse(lemmyIndexHtml)); - var generatorPost = generatorExtractor.generatorCleaned(Jsoup.parse(lemmyPost)); + var generatorIndex = generatorExtractor.detectGenerator(Jsoup.parse(lemmyIndexHtml), ""); + var generatorPost = generatorExtractor.detectGenerator(Jsoup.parse(lemmyPost), ""); System.out.println(generatorIndex); System.out.println(generatorPost); diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/processor/plugin/specialization/XenForoSpecializationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/processor/plugin/specialization/XenForoSpecializationTest.java index a10e3ca0..40914ba8 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/processor/plugin/specialization/XenForoSpecializationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/processor/plugin/specialization/XenForoSpecializationTest.java @@ -34,7 +34,7 @@ class XenForoSpecializationTest { @Test void generatorExtraction() { - var gen = generatorExtractor.generatorCleaned(Jsoup.parse(thread)); + var gen = generatorExtractor.detectGenerator(Jsoup.parse(thread), ""); System.out.println(gen); } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/LinkFilterSelector.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/LinkFilterSelector.java index fcb70166..e3b5f998 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/LinkFilterSelector.java +++ 
b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/LinkFilterSelector.java @@ -20,7 +20,8 @@ public class LinkFilterSelector { } if (isLemmy(head)) { - return url -> url.path.startsWith("/post/") || url.path.startsWith("/c/"); + return url -> url.path.startsWith("/post/") + || (url.path.startsWith("/c/") && !url.path.contains("@")); } if (isDiscourse(head)) { return url -> url.path.startsWith("/t/") || url.path.contains("/latest"); diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java index 37e7bf62..c100388e 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java @@ -211,7 +211,7 @@ public class IndexQueryService { return switch (priority) { case BEST -> false; case GOOD -> resultCount > params.fetchSize / 4; - case FALLBACK -> resultCount > params.fetchSize / 256; + case FALLBACK -> resultCount > params.fetchSize / 8; }; } diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java index 6a89483b..b44f2551 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java @@ -121,7 +121,7 @@ public class UrlDetails { for (var problem :EnumSet.of( HtmlFeature.JS, - HtmlFeature.TRACKING, + HtmlFeature.TRACKING_INNOCENT, HtmlFeature.AFFILIATE_LINK, HtmlFeature.COOKIES, HtmlFeature.ADVERTISEMENT)) { @@ -156,7 +156,7 @@ public class UrlDetails { return HtmlFeature.hasFeature(features, HtmlFeature.JS); } public boolean isTracking() { - return HtmlFeature.hasFeature(features, 
HtmlFeature.TRACKING); + return HtmlFeature.hasFeature(features, HtmlFeature.TRACKING_INNOCENT); } public boolean isAffiliate() { return HtmlFeature.hasFeature(features, HtmlFeature.AFFILIATE_LINK); diff --git a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/DebugConverterExperiment.java b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/DebugConverterExperiment.java index fb8c536d..452be709 100644 --- a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/DebugConverterExperiment.java +++ b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/DebugConverterExperiment.java @@ -1,18 +1,12 @@ package nu.marginalia.tools.experiments; import com.google.inject.Inject; -import nu.marginalia.converting.model.GeneratorType; -import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.processor.DomainProcessor; -import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor; +import nu.marginalia.converting.processor.plugin.specialization.BlogSpecialization; import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.tools.Experiment; -import org.apache.commons.lang3.StringUtils; import org.jsoup.Jsoup; -import java.util.HashSet; -import java.util.Set; - public class DebugConverterExperiment extends Experiment { @@ -24,56 +18,25 @@ public class DebugConverterExperiment extends Experiment { } - Set seenGenerators = new HashSet<>(); - @Override public boolean process(CrawledDomain domain) { if (domain.doc == null) return true; - var dge = new DocumentGeneratorExtractor(); - for (var doc : domain.doc) { if (doc.documentBody == null) continue; var parsed = Jsoup.parse(doc.documentBody.decode()); - parsed.getElementsByTag("head").comments() - .stream().filter(c -> { - String data = c.getData(); - if (data.contains("" + generators.type()); - if (generators.type() == GeneratorType.UNKNOWN) { - 
System.out.println(parsed.select("meta[name=generator]") - .attr("content")); - System.out.println(doc.url); - } - } + var tagExtractor = new BlogSpecialization.BlogTagExtractor(); + parsed.traverse(tagExtractor); + var tags = tagExtractor.getTags(); + if (!tags.isEmpty()) { + System.out.println(tags); } } -// -// var ret = domainProcessor.process(domain); -// -// -// ret.documents.stream() -// .filter(ProcessedDocument::isProcessedFully) -// .peek(d -> System.out.println(d.url)) -// .map(d -> d.details.metadata) -// .forEach(System.out::println); - return true; } From fba466d6e20e65bdc8f66de0e5b25bc420d96c2c Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 10 Jul 2023 18:04:43 +0200 Subject: [PATCH 022/157] (crawler) Update URL blocklist * Don't crawl MDN mirrors * More mailing list variants --- .../java/nu/marginalia/ip_blocklist/UrlBlocklist.java | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/UrlBlocklist.java b/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/UrlBlocklist.java index b8d6a596..f3574b87 100644 --- a/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/UrlBlocklist.java +++ b/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/UrlBlocklist.java @@ -71,6 +71,11 @@ public class UrlBlocklist { return true; } + // MDN is nice, but we don't need to crawl a bunch of MDN mirrors >.> + if (url.path.contains("developer.mozilla.org")) { + return true; + } + if ("github.com".equals(url.domain.domain)) { return url.path.chars().filter(c -> c == '/').count() > 2; } @@ -94,6 +99,12 @@ public class UrlBlocklist { if (path.contains("mailinglist")) { return true; } + if (path.contains("mail-archive")) { + return true; + } + if (path.contains("mailman")) { + return true; + } return false; } } From 2283ceb77d6dab18668582cacc6206e7c91afeb4 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: 
Mon, 10 Jul 2023 18:58:33 +0200 Subject: [PATCH 023/157] (control) WIP control service --- .../nu/marginalia/control/ControlService.java | 34 ++++++++-- .../marginalia/control/EventLogService.java | 9 ++- .../marginalia/control/HeartbeatService.java | 9 ++- .../control/MessageQueueViewService.java | 67 +++++++++++++++++++ .../control/model/MessageQueueEntry.java | 17 +++++ .../main/resources/static/control/style.css | 34 ++++++++++ .../resources/templates/control/events.hdb | 34 ++++++++++ .../resources/templates/control/index.hdb | 2 +- .../templates/control/message-queue.hdb | 47 +++++++++++++ .../templates/control/partials/nav.hdb | 2 + .../resources/templates/control/services.hdb | 23 +------ docker-compose.yml | 1 + run/nginx-site.conf | 22 +++++- 13 files changed, 267 insertions(+), 34 deletions(-) create mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/MessageQueueViewService.java create mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/MessageQueueEntry.java create mode 100644 code/services-satellite/control-service/src/main/resources/templates/control/events.hdb create mode 100644 code/services-satellite/control-service/src/main/resources/templates/control/message-queue.hdb diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java index 93873abb..2a566077 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java @@ -11,6 +11,8 @@ import nu.marginalia.renderer.RendererFactory; import nu.marginalia.service.server.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import spark.Request; +import spark.Response; import spark.Spark; import java.io.IOException; @@ -25,7 +27,10 
@@ public class ControlService extends Service { private final ServiceMonitors monitors; private final MustacheRenderer indexRenderer; private final MustacheRenderer> servicesRenderer; + private final MustacheRenderer> eventsRenderer; + private final MustacheRenderer> messageQueueRenderer; private final MqPersistence messageQueuePersistence; + private final StaticResources staticResources; @Inject @@ -35,14 +40,20 @@ public class ControlService extends Service { EventLogService eventLogService, RendererFactory rendererFactory, MqPersistence messageQueuePersistence, - ControlProcesses controlProcesses + ControlProcesses controlProcesses, + StaticResources staticResources, + MessageQueueViewService messageQueueViewService ) throws IOException { super(params); this.monitors = monitors; indexRenderer = rendererFactory.renderer("control/index"); servicesRenderer = rendererFactory.renderer("control/services"); + eventsRenderer = rendererFactory.renderer("control/events"); + messageQueueRenderer = rendererFactory.renderer("control/message-queue"); + this.messageQueuePersistence = messageQueuePersistence; + this.staticResources = staticResources; Spark.get("/public/heartbeats", (req, res) -> { res.type("application/json"); @@ -50,15 +61,18 @@ public class ControlService extends Service { }, gson::toJson); Spark.get("/public/", (req, rsp) -> indexRenderer.render(Map.of())); - Spark.get("/public/services", (req, rsp) -> servicesRenderer.render( - Map.of("heartbeats", heartbeatService.getHeartbeats(), - "events", eventLogService.getLastEntries(100) - ))); + + Spark.get("/public/services", (req, rsp) -> servicesRenderer.render(Map.of("heartbeats", heartbeatService.getHeartbeats()))); + Spark.get("/public/events", (req, rsp) -> eventsRenderer.render(Map.of("events", eventLogService.getLastEntries(20)))); + Spark.get("/public/message-queue", (req, rsp) -> messageQueueRenderer.render(Map.of("messages", messageQueueViewService.getLastEntries(20)))); + 
Spark.get("/public/repartition", (req, rsp) -> { controlProcesses.start("REPARTITION-REINDEX"); return "OK"; }); + Spark.get("/public/:resource", this::serveStatic); + monitors.subscribe(this::logMonitorStateChange); Thread reaperThread = new Thread(this::reapMessageQueue, "message-queue-reaper"); @@ -66,6 +80,16 @@ public class ControlService extends Service { reaperThread.start(); } + + private Object serveStatic(Request request, Response response) { + String resource = request.params("resource"); + + staticResources.serveStatic("control", resource, request, response); + + return ""; + } + + private void reapMessageQueue() { for (;;) { diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/EventLogService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/EventLogService.java index 842fe86e..9165204d 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/EventLogService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/EventLogService.java @@ -32,7 +32,7 @@ public class EventLogService { while (rs.next()) { entries.add(new EventLogEntry( rs.getString("SERVICE_NAME"), - rs.getString("INSTANCE"), + trimUUID(rs.getString("INSTANCE")), rs.getTimestamp("EVENT_TIME").toLocalDateTime().toLocalTime().toString(), rs.getString("EVENT_TYPE"), rs.getString("EVENT_MESSAGE") @@ -44,6 +44,11 @@ public class EventLogService { throw new RuntimeException(ex); } } - + private String trimUUID(String uuid) { + if (uuid.length() > 8) { + return uuid.substring(0, 8); + } + return uuid; + } } diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/HeartbeatService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/HeartbeatService.java index d0fd67cb..370fa15d 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/HeartbeatService.java +++ 
b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/HeartbeatService.java @@ -33,7 +33,7 @@ public class HeartbeatService { heartbeats.add(new ServiceHeartbeat( rs.getString("SERVICE_NAME"), rs.getString("SERVICE_BASE"), - rs.getString("INSTANCE"), + trimUUID(rs.getString("INSTANCE")), rs.getInt("TSDIFF") / 1000., rs.getBoolean("ALIVE") )); @@ -45,4 +45,11 @@ public class HeartbeatService { return heartbeats; } + + private String trimUUID(String uuid) { + if (uuid.length() > 8) { + return uuid.substring(0, 8); + } + return uuid; + } } diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/MessageQueueViewService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/MessageQueueViewService.java new file mode 100644 index 00000000..9d49c1ed --- /dev/null +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/MessageQueueViewService.java @@ -0,0 +1,67 @@ +package nu.marginalia.control; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.control.model.EventLogEntry; +import nu.marginalia.control.model.MessageQueueEntry; + +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.List; + +@Singleton +public class MessageQueueViewService { + + private final HikariDataSource dataSource; + + @Inject + public MessageQueueViewService(HikariDataSource dataSource) { + this.dataSource = dataSource; + } + + public List getLastEntries(int n) { + try (var conn = dataSource.getConnection(); + var query = conn.prepareStatement(""" + SELECT ID, RELATED_ID, SENDER_INBOX, RECIPIENT_INBOX, FUNCTION, OWNER_INSTANCE, OWNER_TICK, STATE, CREATED_TIME, UPDATED_TIME, TTL + FROM PROC_MESSAGE + ORDER BY ID DESC + LIMIT ? 
+ """)) { + + query.setInt(1, n); + List entries = new ArrayList<>(n); + var rs = query.executeQuery(); + while (rs.next()) { + entries.add(new MessageQueueEntry( + rs.getLong("ID"), + rs.getLong("RELATED_ID"), + rs.getString("SENDER_INBOX"), + rs.getString("RECIPIENT_INBOX"), + rs.getString("FUNCTION"), + trimUUID(rs.getString("OWNER_INSTANCE")), + rs.getLong("OWNER_TICK"), + rs.getString("STATE"), + rs.getTimestamp("CREATED_TIME").toLocalDateTime().toLocalTime().toString(), + rs.getTimestamp("UPDATED_TIME").toLocalDateTime().toLocalTime().toString(), + rs.getInt("TTL") + )); + } + return entries; + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + } + private String trimUUID(String uuid) { + if (null == uuid) { + return ""; + } + + if (uuid.length() > 8) { + return uuid.substring(0, 8); + } + return uuid; + } + +} diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/MessageQueueEntry.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/MessageQueueEntry.java new file mode 100644 index 00000000..9694ac1e --- /dev/null +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/MessageQueueEntry.java @@ -0,0 +1,17 @@ +package nu.marginalia.control.model; + +public record MessageQueueEntry ( + long id, + long relatedId, + String senderInbox, + String recipientInbox, + String function, + String ownerInstance, + long ownerTick, + String state, + String createdTime, + String updatedTime, + int ttl +) +{ +} diff --git a/code/services-satellite/control-service/src/main/resources/static/control/style.css b/code/services-satellite/control-service/src/main/resources/static/control/style.css index 6bd9166e..3df019e5 100644 --- a/code/services-satellite/control-service/src/main/resources/static/control/style.css +++ b/code/services-satellite/control-service/src/main/resources/static/control/style.css @@ -1,4 +1,38 @@ body { font-family: serif; 
line-height: 1.6; + + display: grid; + grid-template-columns: 20ch auto; + grid-gap: 1em; + grid-template-areas: + "left right"; +} +body > nav { + grid-area: left; +} +nav ul { + list-style-type: none; + padding: 0; +} +nav ul li { + line-height: 2; +} +nav ul li a { + text-decoration: none; + padding: 0.5ch; + display: block; + color: #000; + background-color: #ccc; +} +nav ul li a:focus { + text-decoration: underline; +} +nav ul li a.current { + color: #000; + background-color: #fff; +} + +body > section { + grid-area: right; } \ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/events.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/events.hdb new file mode 100644 index 00000000..941c4dea --- /dev/null +++ b/code/services-satellite/control-service/src/main/resources/templates/control/events.hdb @@ -0,0 +1,34 @@ + + + + Control Service + + + + + {{> control/partials/nav}} + +
+

Events

+ + + + + + + + + + {{#each events}} + + + + + + + + {{/each}} +
Service NameInstanceEvent TimeTypeMessage
{{serviceName}}{{instance}}{{eventTime}}{{eventType}}{{eventMessage}}
+
+ + \ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/index.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/index.hdb index 701ed915..6ca3119f 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/index.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/index.hdb @@ -2,7 +2,7 @@ Control Service - + diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/message-queue.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/message-queue.hdb new file mode 100644 index 00000000..06817679 --- /dev/null +++ b/code/services-satellite/control-service/src/main/resources/templates/control/message-queue.hdb @@ -0,0 +1,47 @@ + + + + Control Service + + + + + {{> control/partials/nav}} + +
+

Message Queue

+ + + + + + + + + + + + + + + + {{#each messages}} + + + + + + + + + + + + + + + {{/each}} +
Message IDRelated IDRecipientSenderFunctionOwner InstanceOwner TickStateCreated TimeUpdated TimeTTL
{{id}}{{relatedId}}{{recipientInbox}}{{senderInbox}}{{function}}{{ownerInstance}}{{ownerTick}}{{state}}{{createdTime}}{{updatedTime}}{{ttl}}
+
+ + \ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/partials/nav.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/partials/nav.hdb index 9b68f4b2..98283bfc 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/partials/nav.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/partials/nav.hdb @@ -2,6 +2,8 @@ \ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/services.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/services.hdb index 5b5febf2..41184eab 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/services.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/services.hdb @@ -2,7 +2,7 @@ Control Service - + @@ -24,27 +24,6 @@ {{/each}} - -

Events

- - - - - - - - - - {{#each events}} - - - - - - - - {{/each}} -
Service NameInstanceEvent TimeTypeMessage
{{serviceName}}{{instance}}{{eventTime}}{{eventType}}{{eventMessage}}
\ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index fc88dcc3..8490d5a7 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -98,6 +98,7 @@ services: container_name: "nginx-gw" ports: - "127.0.0.1:8080:80" + - "127.0.0.1:8081:81" volumes: - "./run/nginx-site.conf:/etc/nginx/conf.d/default.conf" networks: diff --git a/run/nginx-site.conf b/run/nginx-site.conf index 90f93ff9..f9887ad3 100644 --- a/run/nginx-site.conf +++ b/run/nginx-site.conf @@ -33,11 +33,27 @@ server { proxy_pass http://assistant-service:5025/public$request_uri; access_log off; } - location /control/ { - proxy_pass http://control-service:5090/public/; - } location / { proxy_pass http://search-service:5023/public/; } } + + +server { + listen 81; + listen [::]:81; + server_name control; + + proxy_set_header X-Context $remote_addr-$connection; + proxy_set_header X-Extern-Url $scheme://$host$request_uri; + proxy_set_header X-Extern-Domain $scheme://$host; + proxy_set_header X-User-Agent $http_user_agent; + + proxy_set_header X-Public "1"; + + location / { + proxy_pass http://control-service:5090/public/; + } + +} \ No newline at end of file From 98b5f2210405d563fb4bcc13b317cd47b710412b Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 10 Jul 2023 21:33:57 +0200 Subject: [PATCH 024/157] (control) WIP control service * Set messages to OK when received so they're cleaned up properly. 
--- .../nu/marginalia/mq/outbox/MqOutbox.java | 18 ++++++++-- .../mq/persistence/MqPersistence.java | 16 +++++++-- .../java/nu/marginalia/mqsm/StateMachine.java | 19 ++++++++--- .../mq/persistence/MqPersistenceTest.java | 1 + .../nu/marginalia/control/ControlService.java | 17 ++++++++-- .../control/MessageQueueViewService.java | 1 - .../control/model/ControlProcess.java | 9 +++++ .../control/process/ControlProcesses.java | 33 ++++++++++++++----- 8 files changed, 94 insertions(+), 20 deletions(-) create mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ControlProcess.java diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java index 75fb8fcd..a3cc319b 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java @@ -1,6 +1,7 @@ package nu.marginalia.mq.outbox; import nu.marginalia.mq.MqMessage; +import nu.marginalia.mq.MqMessageState; import nu.marginalia.mq.persistence.MqPersistence; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -116,14 +117,25 @@ public class MqOutbox { while (!pendingResponses.containsKey(id)) { pendingResponses.wait(100); } - return pendingResponses.remove(id); + + var msg = pendingResponses.remove(id); + // Mark the response as OK so it can be cleaned up + persistence.updateMessageState(msg.msgId(), MqMessageState.OK); + + return msg; } } /** Polls for a response for the given message id. 
*/ - public Optional pollResponse(long id) { + public Optional pollResponse(long id) throws SQLException { // no need to sync here if we aren't going to wait() - return Optional.ofNullable(pendingResponses.remove(id)); + var response = pendingResponses.get(id); + + if (response != null) { + // Mark the response as OK so it can be cleaned up + persistence.updateMessageState(response.msgId(), MqMessageState.OK); + } + return Optional.ofNullable(response); } public long notify(String function, String payload) throws Exception { diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java index d5356c55..129ad5b8 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java @@ -24,14 +24,26 @@ public class MqPersistence { /** Flags messages as dead if they have not been set to a terminal state within a TTL after the last update. 
*/ public int reapDeadMessages() throws SQLException { try (var conn = dataSource.getConnection(); - var stmt = conn.prepareStatement(""" + var setToDead = conn.prepareStatement(""" UPDATE PROC_MESSAGE SET STATE='DEAD', UPDATED_TIME=CURRENT_TIMESTAMP(6) WHERE STATE IN ('NEW', 'ACK') AND TTL IS NOT NULL AND TIMESTAMPDIFF(SECOND, UPDATED_TIME, CURRENT_TIMESTAMP(6)) > TTL """)) { - return stmt.executeUpdate(); + return setToDead.executeUpdate(); + } + } + + public int cleanOldMessages() throws SQLException { + try (var conn = dataSource.getConnection(); + var setToDead = conn.prepareStatement(""" + DELETE FROM PROC_MESSAGE + WHERE STATE = 'OK' + AND TTL IS NOT NULL + AND TIMESTAMPDIFF(SECOND, UPDATED_TIME, CURRENT_TIMESTAMP(6)) > 3600 + """)) { + return setToDead.executeUpdate(); } } diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java index 3518e9e5..7e56e6ba 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java @@ -13,11 +13,9 @@ import nu.marginalia.mqsm.state.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.UUID; +import java.util.*; import java.util.concurrent.Executors; +import java.util.function.BiConsumer; /** A state machine that can be used to implement a finite state machine * using a message queue as the persistence layer. 
The state machine is @@ -35,6 +33,8 @@ public class StateMachine { private final MachineState finalState = new StateFactory.FinalState(); private final MachineState resumingState = new StateFactory.ResumingState(); + private final List> stateChangeListeners = new ArrayList<>(); + private final Map allStates = new HashMap<>(); public StateMachine(MqPersistence persistence, @@ -58,6 +58,11 @@ public class StateMachine { } } + /** Listen to state changes */ + public void listen(BiConsumer listener) { + stateChangeListeners.add(listener); + } + /** Register the state graph */ void registerStates(List states) { for (var state : states) { @@ -188,6 +193,12 @@ public class StateMachine { @Override public void onNotification(MqMessage msg) { onStateTransition(msg.function(), msg.payload()); + try { + stateChangeListeners.forEach(l -> l.accept(msg.function(), msg.payload())); + } + catch (Exception ex) { + ex.printStackTrace(); + } } } } diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mq/persistence/MqPersistenceTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mq/persistence/MqPersistenceTest.java index 7166531d..605d2b30 100644 --- a/code/common/message-queue/src/test/java/nu/marginalia/mq/persistence/MqPersistenceTest.java +++ b/code/common/message-queue/src/test/java/nu/marginalia/mq/persistence/MqPersistenceTest.java @@ -63,6 +63,7 @@ public class MqPersistenceTest { var messages = MqTestUtil.getMessages(dataSource, recipientId); assertEquals(1, messages.size()); assertEquals(MqMessageState.NEW, messages.get(0).state()); + System.out.println(messages); TimeUnit.SECONDS.sleep(5); diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java index 2a566077..ccf167a8 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java +++ 
b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java @@ -3,11 +3,13 @@ package nu.marginalia.control; import com.google.gson.Gson; import com.google.inject.Inject; import nu.marginalia.client.ServiceMonitors; +import nu.marginalia.control.model.ControlProcess; import nu.marginalia.control.process.ControlProcesses; import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.mq.persistence.MqPersistence; import nu.marginalia.renderer.MustacheRenderer; import nu.marginalia.renderer.RendererFactory; +import nu.marginalia.service.control.ServiceEventLog; import nu.marginalia.service.server.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -31,6 +33,7 @@ public class ControlService extends Service { private final MustacheRenderer> messageQueueRenderer; private final MqPersistence messageQueuePersistence; private final StaticResources staticResources; + private final ServiceEventLog eventLog; @Inject @@ -47,6 +50,8 @@ public class ControlService extends Service { super(params); this.monitors = monitors; + this.eventLog = params.eventLog; + indexRenderer = rendererFactory.renderer("control/index"); servicesRenderer = rendererFactory.renderer("control/services"); eventsRenderer = rendererFactory.renderer("control/events"); @@ -67,7 +72,7 @@ public class ControlService extends Service { Spark.get("/public/message-queue", (req, rsp) -> messageQueueRenderer.render(Map.of("messages", messageQueueViewService.getLastEntries(20)))); Spark.get("/public/repartition", (req, rsp) -> { - controlProcesses.start("REPARTITION-REINDEX"); + controlProcesses.start(ControlProcess.REPARTITION_REINDEX); return "OK"; }); @@ -94,12 +99,20 @@ public class ControlService extends Service { for (;;) { try { - TimeUnit.MINUTES.sleep(30); + TimeUnit.MINUTES.sleep(10); int outcome = messageQueuePersistence.reapDeadMessages(); if (outcome > 0) { + eventLog.logEvent("MESSAGE-QUEUE-REAPED", Integer.toString(outcome)); logger.info("Reaped {} 
dead messages from message queue", outcome); } + + outcome = messageQueuePersistence.cleanOldMessages(); + if (outcome > 0) { + eventLog.logEvent("MESSAGE-QUEUE-CLEANED", Integer.toString(outcome)); + logger.info("Cleaned {} stale messages from message queue", outcome); + } + } catch (InterruptedException ex) { logger.info("Message queue reaper interrupted"); diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/MessageQueueViewService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/MessageQueueViewService.java index 9d49c1ed..35a27cbc 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/MessageQueueViewService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/MessageQueueViewService.java @@ -3,7 +3,6 @@ package nu.marginalia.control; import com.google.inject.Inject; import com.google.inject.Singleton; import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.control.model.EventLogEntry; import nu.marginalia.control.model.MessageQueueEntry; import java.sql.SQLException; diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ControlProcess.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ControlProcess.java new file mode 100644 index 00000000..613dd2e5 --- /dev/null +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ControlProcess.java @@ -0,0 +1,9 @@ +package nu.marginalia.control.model; + +public enum ControlProcess { + REPARTITION_REINDEX; + + public String id() { + return "fsm:" + name().toLowerCase(); + } +} diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ControlProcesses.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ControlProcesses.java index 7a70eb83..5813bdbb 100644 --- 
a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ControlProcesses.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ControlProcesses.java @@ -2,9 +2,12 @@ package nu.marginalia.control.process; import com.google.inject.Inject; import com.google.inject.Singleton; +import nu.marginalia.control.model.ControlProcess; import nu.marginalia.mq.persistence.MqPersistence; import nu.marginalia.mqsm.StateMachine; import nu.marginalia.mqsm.graph.AbstractStateGraph; +import nu.marginalia.service.control.ServiceEventLog; +import nu.marginalia.service.server.BaseServiceParams; import java.util.HashMap; import java.util.Map; @@ -13,26 +16,40 @@ import java.util.UUID; @Singleton public class ControlProcesses { private final MqPersistence persistence; - public Map stateMachines = new HashMap<>(); + private final ServiceEventLog eventLog; + public Map stateMachines = new HashMap<>(); @Inject public ControlProcesses(MqPersistence persistence, + BaseServiceParams baseServiceParams, RepartitionReindexProcess repartitionReindexProcess ) { this.persistence = persistence; + this.eventLog = baseServiceParams.eventLog; - register("REPARTITION-REINDEX", repartitionReindexProcess); + register(ControlProcess.REPARTITION_REINDEX, repartitionReindexProcess); } - private void register(String name, AbstractStateGraph graph) { - stateMachines.put(name, new StateMachine(persistence, name, UUID.randomUUID(), graph)); + private void register(ControlProcess process, AbstractStateGraph graph) { + var sm = new StateMachine(persistence, process.id(), UUID.randomUUID(), graph); + + sm.listen((function, param) -> logStateChange(process, function)); + + stateMachines.put(process, sm); } - public void start(String name) throws Exception { - stateMachines.get(name).init(); + private void logStateChange(ControlProcess process, String state) { + eventLog.logEvent("FSM-STATE-CHANGE", process.id() + " -> " + state); } - public void 
resume(String name) throws Exception { - stateMachines.get(name).resume(); + public void start(ControlProcess process) throws Exception { + eventLog.logEvent("FSM-START", process.id()); + + stateMachines.get(process).init(); + } + + public void resume(ControlProcess process) throws Exception { + eventLog.logEvent("FSM-RESUME", process.id()); + stateMachines.get(process).resume(); } } From ec7826659a77adca3fc96df31186b3c5caaa3d72 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 10 Jul 2023 21:52:25 +0200 Subject: [PATCH 025/157] (minor) Javadoc comments for MqPersistance and MqMessageState --- .../java/nu/marginalia/mq/MqMessageState.java | 5 ++++ .../mq/persistence/MqPersistence.java | 25 ++++++++++++++++++- 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/MqMessageState.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/MqMessageState.java index d1d03f15..94f7411b 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mq/MqMessageState.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/MqMessageState.java @@ -1,9 +1,14 @@ package nu.marginalia.mq; public enum MqMessageState { + /** The message is new and has not yet been acknowledged by the recipient */ NEW, + /** The message has been acknowledged by the recipient */ ACK, + /** The message has been processed successfully by the recipient */ OK, + /** The message processing has failed */ ERR, + /** The message did not reach a terminal state within the TTL */ DEAD } diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java index 129ad5b8..08c19da6 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java @@ -35,6 +35,7 @@ public 
class MqPersistence { } } + /** Removes messages that have been set to a terminal state a while after their last update timestamp */ public int cleanOldMessages() throws SQLException { try (var conn = dataSource.getConnection(); var setToDead = conn.prepareStatement(""" @@ -47,6 +48,16 @@ public class MqPersistence { } } + /** + * Adds a new message to the message queue. + * + * @param recipientInboxName The recipient's inbox name + * @param senderInboxName (nullable) The sender's inbox name. Only needed if a reply is expected. If null, the message is not expected to be replied to. + * @param function The function to call + * @param payload The payload to send, typically JSON. + * @param ttl (nullable) The time to live of the message, in seconds. If null, the message will never set to DEAD. + * @return The id of the message + */ public long sendNewMessage(String recipientInboxName, @Nullable String senderInboxName, @@ -82,7 +93,7 @@ public class MqPersistence { } } - + /** Modifies the state of a message by id */ public void updateMessageState(long id, MqMessageState mqMessageState) throws SQLException { try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(""" @@ -99,6 +110,9 @@ public class MqPersistence { } } + /** Creates a new message in the queue referencing as a reply to an existing message + * This message will have it's RELATED_ID set to the original message's ID. + */ public long sendResponse(long id, MqMessageState mqMessageState, String message) throws SQLException { try (var conn = dataSource.getConnection()) { conn.setAutoCommit(false); @@ -149,6 +163,10 @@ public class MqPersistence { } + /** Marks unclaimed messages addressed to this inbox with instanceUUID and tick, + * then returns the number of messages marked. This is an atomic operation that + * ensures that messages aren't double processed. 
+ */ private int markInboxMessages(String inboxName, String instanceUUID, long tick) throws SQLException { try (var conn = dataSource.getConnection(); var updateStmt = conn.prepareStatement(""" @@ -170,11 +188,13 @@ public class MqPersistence { */ public Collection pollInbox(String inboxName, String instanceUUID, long tick) throws SQLException { + // Mark new messages as claimed int expected = markInboxMessages(inboxName, instanceUUID, tick); if (expected == 0) { return Collections.emptyList(); } + // Then fetch the messages that were marked try (var conn = dataSource.getConnection(); var queryStmt = conn.prepareStatement(""" SELECT ID, RELATED_ID, FUNCTION, PAYLOAD, STATE, SENDER_INBOX FROM PROC_MESSAGE @@ -213,11 +233,13 @@ public class MqPersistence { */ public Collection pollReplyInbox(String inboxName, String instanceUUID, long tick) throws SQLException { + // Mark new messages as claimed int expected = markInboxMessages(inboxName, instanceUUID, tick); if (expected == 0) { return Collections.emptyList(); } + // Then fetch the messages that were marked try (var conn = dataSource.getConnection(); var queryStmt = conn.prepareStatement(""" SELECT SELF.ID, SELF.RELATED_ID, SELF.FUNCTION, SELF.PAYLOAD, PARENT.STATE FROM PROC_MESSAGE SELF @@ -249,6 +271,7 @@ public class MqPersistence { } } + /** Returns the last N messages sent to this inbox */ public List lastNMessages(String inboxName, int lastN) throws SQLException { try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(""" From f59cab300e3e4d7006cd38edd989ac8fa2643edd Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 10 Jul 2023 21:59:51 +0200 Subject: [PATCH 026/157] (minor) Javadoc comments for MqPersistance and MqMessageState --- .../src/main/resources/templates/control/message-queue.hdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/message-queue.hdb 
b/code/services-satellite/control-service/src/main/resources/templates/control/message-queue.hdb index 06817679..844e9a07 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/message-queue.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/message-queue.hdb @@ -9,7 +9,7 @@ {{> control/partials/nav}}
-

Events

+

Message Queue

From 4c016b031851370b63029e6e8e75e4f784ff47a6 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 11 Jul 2023 14:46:21 +0200 Subject: [PATCH 027/157] Process monitoring * Also refactored the SQL tables a bit --- .../sql/current/10-service-status.sql | 17 -- .../11-service-status.sql} | 16 +- .../12-message-queue.sql} | 7 +- .../sql/migrations/03-service-status.sql | 27 +++ .../04-message-queue.sql} | 12 +- .../mq/persistence/MqPersistence.java | 24 +-- .../java/nu/marginalia/mq/MqTestUtil.java | 2 +- .../nu/marginalia/mq/outbox/MqOutboxTest.java | 2 +- .../mq/persistence/MqPersistenceTest.java | 2 +- .../mqsm/StateMachineErrorTest.java | 2 +- .../mqsm/StateMachineResumeTest.java | 2 +- .../nu/marginalia/mqsm/StateMachineTest.java | 2 +- code/common/process/build.gradle | 1 + .../nu/marginalia/ProcessConfiguration.java | 7 + .../process/control/ProcessHeartbeat.java | 155 ++++++++++++++++++ .../nu/marginalia/client/ServiceMonitors.java | 2 +- .../service/control/ServiceEventLog.java | 2 +- .../service/control/ServiceHeartbeat.java | 6 +- .../src/main/java/plan/CrawlPlan.java | 12 ++ .../marginalia/converting/ConverterMain.java | 15 +- .../converting/ConverterModule.java | 5 + code/processes/crawling-process/build.gradle | 1 + .../java/nu/marginalia/crawl/CrawlerMain.java | 25 ++- .../nu/marginalia/loading/LoaderMain.java | 38 +++-- .../nu/marginalia/loading/LoaderModule.java | 3 + .../nu/marginalia/control/ControlService.java | 7 +- .../marginalia/control/EventLogService.java | 2 +- .../marginalia/control/HeartbeatService.java | 33 +++- .../control/MessageQueueViewService.java | 2 +- .../control/model/ProcessHeartbeat.java | 25 +++ .../main/resources/static/control/style.css | 13 +- .../resources/templates/control/events.hdb | 1 + .../resources/templates/control/index.hdb | 1 + .../templates/control/message-queue.hdb | 1 + .../templates/control/partials/nav.hdb | 2 +- .../resources/templates/control/processes.hdb | 34 ++++ 
.../resources/templates/control/services.hdb | 1 + 37 files changed, 431 insertions(+), 78 deletions(-) delete mode 100644 code/common/db/src/main/resources/sql/current/10-service-status.sql rename code/common/db/src/main/resources/sql/{migrations/02-service-status.sql => current/11-service-status.sql} (59%) rename code/common/db/src/main/resources/sql/{migrations/03-message-queue.sql => current/12-message-queue.sql} (96%) create mode 100644 code/common/db/src/main/resources/sql/migrations/03-service-status.sql rename code/common/db/src/main/resources/sql/{current/11-message-queue.sql => migrations/04-message-queue.sql} (75%) create mode 100644 code/common/process/src/main/java/nu/marginalia/ProcessConfiguration.java create mode 100644 code/common/process/src/main/java/nu/marginalia/process/control/ProcessHeartbeat.java create mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java create mode 100644 code/services-satellite/control-service/src/main/resources/templates/control/processes.hdb diff --git a/code/common/db/src/main/resources/sql/current/10-service-status.sql b/code/common/db/src/main/resources/sql/current/10-service-status.sql deleted file mode 100644 index ca934785..00000000 --- a/code/common/db/src/main/resources/sql/current/10-service-status.sql +++ /dev/null @@ -1,17 +0,0 @@ -CREATE TABLE PROC_SERVICE_HEARTBEAT( - SERVICE_NAME VARCHAR(255) PRIMARY KEY COMMENT 'Full name of the service, including node id if applicable, e.g. search-service:0', - SERVICE_BASE VARCHAR(255) NOT NULL COMMENT 'Base name of the service, e.g. 
search-service', - INSTANCE VARCHAR(255) NOT NULL COMMENT 'UUID of the service instance', - ALIVE BOOLEAN NOT NULL DEFAULT TRUE COMMENT 'Set to false when the service is doing an orderly shutdown', - HEARTBEAT_TIME TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6) COMMENT 'Service was last seen at this point' -); - -CREATE TABLE PROC_SERVICE_EVENTLOG( - ID BIGINT AUTO_INCREMENT PRIMARY KEY COMMENT 'Unique id', - SERVICE_NAME VARCHAR(255) NOT NULL COMMENT 'Full name of the service, including node id if applicable, e.g. search-service:0', - SERVICE_BASE VARCHAR(255) NOT NULL COMMENT 'Base name of the service, e.g. search-service', - INSTANCE VARCHAR(255) NOT NULL COMMENT 'UUID of the service instance', - EVENT_TIME TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6) COMMENT 'Event time', - EVENT_TYPE VARCHAR(255) NOT NULL COMMENT 'Event type', - EVENT_MESSAGE VARCHAR(255) NOT NULL COMMENT 'Event message' -); \ No newline at end of file diff --git a/code/common/db/src/main/resources/sql/migrations/02-service-status.sql b/code/common/db/src/main/resources/sql/current/11-service-status.sql similarity index 59% rename from code/common/db/src/main/resources/sql/migrations/02-service-status.sql rename to code/common/db/src/main/resources/sql/current/11-service-status.sql index acb9645a..a5d392c5 100644 --- a/code/common/db/src/main/resources/sql/migrations/02-service-status.sql +++ b/code/common/db/src/main/resources/sql/current/11-service-status.sql @@ -1,4 +1,4 @@ -CREATE TABLE PROC_SERVICE_HEARTBEAT( +CREATE TABLE IF NOT EXISTS SERVICE_HEARTBEAT ( SERVICE_NAME VARCHAR(255) PRIMARY KEY COMMENT "Full name of the service, including node id if applicable, e.g. search-service:0", SERVICE_BASE VARCHAR(255) NOT NULL COMMENT "Base name of the service, e.g. 
search-service", INSTANCE VARCHAR(255) NOT NULL COMMENT "UUID of the service instance", @@ -6,7 +6,16 @@ CREATE TABLE PROC_SERVICE_HEARTBEAT( HEARTBEAT_TIME TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6) COMMENT "Service was last seen at this point" ); -CREATE TABLE PROC_SERVICE_EVENTLOG( +CREATE TABLE IF NOT EXISTS PROCESS_HEARTBEAT ( + PROCESS_NAME VARCHAR(255) PRIMARY KEY COMMENT "Full name of the process, including node id if applicable, e.g. converter:0", + PROCESS_BASE VARCHAR(255) NOT NULL COMMENT "Base name of the process, e.g. converter", + INSTANCE VARCHAR(255) NOT NULL COMMENT "UUID of the process instance", + STATUS ENUM ('STARTING', 'RUNNING', 'STOPPED') NOT NULL DEFAULT 'STARTING' COMMENT "Status of the process", + PROGRESS INT NOT NULL DEFAULT 0 COMMENT "Progress of the process", + HEARTBEAT_TIME TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6) COMMENT "Process was last seen at this point" +); + +CREATE TABLE IF NOT EXISTS SERVICE_EVENTLOG( ID BIGINT AUTO_INCREMENT PRIMARY KEY COMMENT "Unique id", SERVICE_NAME VARCHAR(255) NOT NULL COMMENT "Full name of the service, including node id if applicable, e.g. search-service:0", SERVICE_BASE VARCHAR(255) NOT NULL COMMENT "Base name of the service, e.g. 
search-service", @@ -14,4 +23,5 @@ CREATE TABLE PROC_SERVICE_EVENTLOG( EVENT_TIME TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6) COMMENT "Event time", EVENT_TYPE VARCHAR(255) NOT NULL COMMENT "Event type", EVENT_MESSAGE VARCHAR(255) NOT NULL COMMENT "Event message" -); \ No newline at end of file +); + diff --git a/code/common/db/src/main/resources/sql/migrations/03-message-queue.sql b/code/common/db/src/main/resources/sql/current/12-message-queue.sql similarity index 96% rename from code/common/db/src/main/resources/sql/migrations/03-message-queue.sql rename to code/common/db/src/main/resources/sql/current/12-message-queue.sql index d357650e..fd04f666 100644 --- a/code/common/db/src/main/resources/sql/migrations/03-message-queue.sql +++ b/code/common/db/src/main/resources/sql/current/12-message-queue.sql @@ -1,22 +1,17 @@ -CREATE TABLE PROC_MESSAGE( +CREATE TABLE IF NOT EXISTS MESSAGE_QUEUE ( ID BIGINT AUTO_INCREMENT PRIMARY KEY COMMENT 'Unique id', - RELATED_ID BIGINT COMMENT 'Unique id a related message', SENDER_INBOX VARCHAR(255) COMMENT 'Name of the sender inbox', - RECIPIENT_INBOX VARCHAR(255) NOT NULL COMMENT 'Name of the recipient inbox', FUNCTION VARCHAR(255) NOT NULL COMMENT 'Which function to run', PAYLOAD TEXT COMMENT 'Message to recipient', - -- These fields are used to avoid double processing of messages -- instance marks the unique instance of the party, and the tick marks -- the current polling iteration. Both are necessary. 
OWNER_INSTANCE VARCHAR(255) COMMENT 'Instance UUID corresponding to the party that has claimed the message', OWNER_TICK BIGINT DEFAULT -1 COMMENT 'Used by recipient to determine which messages it has processed', - STATE ENUM('NEW', 'ACK', 'OK', 'ERR', 'DEAD') NOT NULL DEFAULT 'NEW' COMMENT 'Processing state', - CREATED_TIME TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6) COMMENT 'Time of creation', UPDATED_TIME TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6) COMMENT 'Time of last update', TTL INT COMMENT 'Time to live in seconds' diff --git a/code/common/db/src/main/resources/sql/migrations/03-service-status.sql b/code/common/db/src/main/resources/sql/migrations/03-service-status.sql new file mode 100644 index 00000000..a5d392c5 --- /dev/null +++ b/code/common/db/src/main/resources/sql/migrations/03-service-status.sql @@ -0,0 +1,27 @@ +CREATE TABLE IF NOT EXISTS SERVICE_HEARTBEAT ( + SERVICE_NAME VARCHAR(255) PRIMARY KEY COMMENT "Full name of the service, including node id if applicable, e.g. search-service:0", + SERVICE_BASE VARCHAR(255) NOT NULL COMMENT "Base name of the service, e.g. search-service", + INSTANCE VARCHAR(255) NOT NULL COMMENT "UUID of the service instance", + ALIVE BOOLEAN NOT NULL DEFAULT TRUE COMMENT "Set to false when the service is doing an orderly shutdown", + HEARTBEAT_TIME TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6) COMMENT "Service was last seen at this point" +); + +CREATE TABLE IF NOT EXISTS PROCESS_HEARTBEAT ( + PROCESS_NAME VARCHAR(255) PRIMARY KEY COMMENT "Full name of the process, including node id if applicable, e.g. converter:0", + PROCESS_BASE VARCHAR(255) NOT NULL COMMENT "Base name of the process, e.g. 
converter", + INSTANCE VARCHAR(255) NOT NULL COMMENT "UUID of the process instance", + STATUS ENUM ('STARTING', 'RUNNING', 'STOPPED') NOT NULL DEFAULT 'STARTING' COMMENT "Status of the process", + PROGRESS INT NOT NULL DEFAULT 0 COMMENT "Progress of the process", + HEARTBEAT_TIME TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6) COMMENT "Process was last seen at this point" +); + +CREATE TABLE IF NOT EXISTS SERVICE_EVENTLOG( + ID BIGINT AUTO_INCREMENT PRIMARY KEY COMMENT "Unique id", + SERVICE_NAME VARCHAR(255) NOT NULL COMMENT "Full name of the service, including node id if applicable, e.g. search-service:0", + SERVICE_BASE VARCHAR(255) NOT NULL COMMENT "Base name of the service, e.g. search-service", + INSTANCE VARCHAR(255) NOT NULL COMMENT "UUID of the service instance", + EVENT_TIME TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6) COMMENT "Event time", + EVENT_TYPE VARCHAR(255) NOT NULL COMMENT "Event type", + EVENT_MESSAGE VARCHAR(255) NOT NULL COMMENT "Event message" +); + diff --git a/code/common/db/src/main/resources/sql/current/11-message-queue.sql b/code/common/db/src/main/resources/sql/migrations/04-message-queue.sql similarity index 75% rename from code/common/db/src/main/resources/sql/current/11-message-queue.sql rename to code/common/db/src/main/resources/sql/migrations/04-message-queue.sql index 97e20d5a..fd04f666 100644 --- a/code/common/db/src/main/resources/sql/current/11-message-queue.sql +++ b/code/common/db/src/main/resources/sql/migrations/04-message-queue.sql @@ -1,19 +1,17 @@ -CREATE TABLE PROC_MESSAGE( +CREATE TABLE IF NOT EXISTS MESSAGE_QUEUE ( ID BIGINT AUTO_INCREMENT PRIMARY KEY COMMENT 'Unique id', - - RELATED_ID BIGINT NOT NULL DEFAULT -1 COMMENT 'Unique id a related message', + RELATED_ID BIGINT COMMENT 'Unique id a related message', SENDER_INBOX VARCHAR(255) COMMENT 'Name of the sender inbox', - RECIPIENT_INBOX VARCHAR(255) NOT NULL COMMENT 'Name of the recipient inbox', FUNCTION VARCHAR(255) NOT NULL COMMENT 'Which function to 
run', PAYLOAD TEXT COMMENT 'Message to recipient', - + -- These fields are used to avoid double processing of messages + -- instance marks the unique instance of the party, and the tick marks + -- the current polling iteration. Both are necessary. OWNER_INSTANCE VARCHAR(255) COMMENT 'Instance UUID corresponding to the party that has claimed the message', OWNER_TICK BIGINT DEFAULT -1 COMMENT 'Used by recipient to determine which messages it has processed', - STATE ENUM('NEW', 'ACK', 'OK', 'ERR', 'DEAD') NOT NULL DEFAULT 'NEW' COMMENT 'Processing state', - CREATED_TIME TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6) COMMENT 'Time of creation', UPDATED_TIME TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6) COMMENT 'Time of last update', TTL INT COMMENT 'Time to live in seconds' diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java index 08c19da6..a62a0227 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java @@ -25,7 +25,7 @@ public class MqPersistence { public int reapDeadMessages() throws SQLException { try (var conn = dataSource.getConnection(); var setToDead = conn.prepareStatement(""" - UPDATE PROC_MESSAGE + UPDATE MESSAGE_QUEUE SET STATE='DEAD', UPDATED_TIME=CURRENT_TIMESTAMP(6) WHERE STATE IN ('NEW', 'ACK') AND TTL IS NOT NULL @@ -39,7 +39,7 @@ public class MqPersistence { public int cleanOldMessages() throws SQLException { try (var conn = dataSource.getConnection(); var setToDead = conn.prepareStatement(""" - DELETE FROM PROC_MESSAGE + DELETE FROM MESSAGE_QUEUE WHERE STATE = 'OK' AND TTL IS NOT NULL AND TIMESTAMPDIFF(SECOND, UPDATED_TIME, CURRENT_TIMESTAMP(6)) > 3600 @@ -67,7 +67,7 @@ public class MqPersistence { ) throws Exception { try (var conn = dataSource.getConnection(); var 
stmt = conn.prepareStatement(""" - INSERT INTO PROC_MESSAGE(RECIPIENT_INBOX, SENDER_INBOX, FUNCTION, PAYLOAD, TTL) + INSERT INTO MESSAGE_QUEUE(RECIPIENT_INBOX, SENDER_INBOX, FUNCTION, PAYLOAD, TTL) VALUES(?, ?, ?, ?, ?) """); var lastIdQuery = conn.prepareStatement("SELECT LAST_INSERT_ID()")) { @@ -97,7 +97,7 @@ public class MqPersistence { public void updateMessageState(long id, MqMessageState mqMessageState) throws SQLException { try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(""" - UPDATE PROC_MESSAGE + UPDATE MESSAGE_QUEUE SET STATE=?, UPDATED_TIME=CURRENT_TIMESTAMP(6) WHERE ID=? """)) { @@ -118,14 +118,14 @@ public class MqPersistence { conn.setAutoCommit(false); try (var updateState = conn.prepareStatement(""" - UPDATE PROC_MESSAGE + UPDATE MESSAGE_QUEUE SET STATE=?, UPDATED_TIME=CURRENT_TIMESTAMP(6) WHERE ID=? """); var addResponse = conn.prepareStatement(""" - INSERT INTO PROC_MESSAGE(RECIPIENT_INBOX, RELATED_ID, FUNCTION, PAYLOAD) + INSERT INTO MESSAGE_QUEUE(RECIPIENT_INBOX, RELATED_ID, FUNCTION, PAYLOAD) SELECT SENDER_INBOX, ID, ?, ? - FROM PROC_MESSAGE + FROM MESSAGE_QUEUE WHERE ID=? AND SENDER_INBOX IS NOT NULL """); var lastIdQuery = conn.prepareStatement("SELECT LAST_INSERT_ID()") @@ -170,7 +170,7 @@ public class MqPersistence { private int markInboxMessages(String inboxName, String instanceUUID, long tick) throws SQLException { try (var conn = dataSource.getConnection(); var updateStmt = conn.prepareStatement(""" - UPDATE PROC_MESSAGE + UPDATE MESSAGE_QUEUE SET OWNER_INSTANCE=?, OWNER_TICK=?, UPDATED_TIME=CURRENT_TIMESTAMP(6), STATE='ACK' WHERE RECIPIENT_INBOX=? 
AND OWNER_INSTANCE IS NULL AND STATE='NEW' @@ -197,7 +197,7 @@ public class MqPersistence { // Then fetch the messages that were marked try (var conn = dataSource.getConnection(); var queryStmt = conn.prepareStatement(""" - SELECT ID, RELATED_ID, FUNCTION, PAYLOAD, STATE, SENDER_INBOX FROM PROC_MESSAGE + SELECT ID, RELATED_ID, FUNCTION, PAYLOAD, STATE, SENDER_INBOX FROM MESSAGE_QUEUE WHERE OWNER_INSTANCE=? AND OWNER_TICK=? """) ) { @@ -242,8 +242,8 @@ public class MqPersistence { // Then fetch the messages that were marked try (var conn = dataSource.getConnection(); var queryStmt = conn.prepareStatement(""" - SELECT SELF.ID, SELF.RELATED_ID, SELF.FUNCTION, SELF.PAYLOAD, PARENT.STATE FROM PROC_MESSAGE SELF - LEFT JOIN PROC_MESSAGE PARENT ON SELF.RELATED_ID=PARENT.ID + SELECT SELF.ID, SELF.RELATED_ID, SELF.FUNCTION, SELF.PAYLOAD, PARENT.STATE FROM MESSAGE_QUEUE SELF + LEFT JOIN MESSAGE_QUEUE PARENT ON SELF.RELATED_ID=PARENT.ID WHERE SELF.OWNER_INSTANCE=? AND SELF.OWNER_TICK=? """) ) { @@ -275,7 +275,7 @@ public class MqPersistence { public List lastNMessages(String inboxName, int lastN) throws SQLException { try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(""" - SELECT ID, RELATED_ID, FUNCTION, PAYLOAD, STATE, SENDER_INBOX FROM PROC_MESSAGE + SELECT ID, RELATED_ID, FUNCTION, PAYLOAD, STATE, SENDER_INBOX FROM MESSAGE_QUEUE WHERE RECIPIENT_INBOX = ? ORDER BY ID DESC LIMIT ? """)) { diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mq/MqTestUtil.java b/code/common/message-queue/src/test/java/nu/marginalia/mq/MqTestUtil.java index dcefaf1a..b3ba62cf 100644 --- a/code/common/message-queue/src/test/java/nu/marginalia/mq/MqTestUtil.java +++ b/code/common/message-queue/src/test/java/nu/marginalia/mq/MqTestUtil.java @@ -20,7 +20,7 @@ public class MqTestUtil { OWNER_INSTANCE, OWNER_TICK, CREATED_TIME, UPDATED_TIME, TTL - FROM PROC_MESSAGE + FROM MESSAGE_QUEUE WHERE RECIPIENT_INBOX = ? 
""")) { diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqOutboxTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqOutboxTest.java index 3b7996f1..849c30b0 100644 --- a/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqOutboxTest.java +++ b/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqOutboxTest.java @@ -28,7 +28,7 @@ public class MqOutboxTest { .withDatabaseName("WMSA_prod") .withUsername("wmsa") .withPassword("wmsa") - .withInitScript("sql/current/11-message-queue.sql") + .withInitScript("sql/current/12-message-queue.sql") .withNetworkAliases("mariadb"); static HikariDataSource dataSource; diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mq/persistence/MqPersistenceTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mq/persistence/MqPersistenceTest.java index 605d2b30..ead78f45 100644 --- a/code/common/message-queue/src/test/java/nu/marginalia/mq/persistence/MqPersistenceTest.java +++ b/code/common/message-queue/src/test/java/nu/marginalia/mq/persistence/MqPersistenceTest.java @@ -24,7 +24,7 @@ public class MqPersistenceTest { .withDatabaseName("WMSA_prod") .withUsername("wmsa") .withPassword("wmsa") - .withInitScript("sql/current/11-message-queue.sql") + .withInitScript("sql/current/12-message-queue.sql") .withNetworkAliases("mariadb"); static HikariDataSource dataSource; diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineErrorTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineErrorTest.java index 06279f34..9d7306c2 100644 --- a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineErrorTest.java +++ b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineErrorTest.java @@ -27,7 +27,7 @@ public class StateMachineErrorTest { .withDatabaseName("WMSA_prod") .withUsername("wmsa") .withPassword("wmsa") - 
.withInitScript("sql/current/11-message-queue.sql") + .withInitScript("sql/current/12-message-queue.sql") .withNetworkAliases("mariadb"); static HikariDataSource dataSource; diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineResumeTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineResumeTest.java index 654e3623..f3524968 100644 --- a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineResumeTest.java +++ b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineResumeTest.java @@ -28,7 +28,7 @@ public class StateMachineResumeTest { .withDatabaseName("WMSA_prod") .withUsername("wmsa") .withPassword("wmsa") - .withInitScript("sql/current/11-message-queue.sql") + .withInitScript("sql/current/12-message-queue.sql") .withNetworkAliases("mariadb"); static HikariDataSource dataSource; diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineTest.java index a6adfa4c..1130fe04 100644 --- a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineTest.java +++ b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineTest.java @@ -24,7 +24,7 @@ public class StateMachineTest { .withDatabaseName("WMSA_prod") .withUsername("wmsa") .withPassword("wmsa") - .withInitScript("sql/current/11-message-queue.sql") + .withInitScript("sql/current/12-message-queue.sql") .withNetworkAliases("mariadb"); static HikariDataSource dataSource; diff --git a/code/common/process/build.gradle b/code/common/process/build.gradle index a762887b..be27f536 100644 --- a/code/common/process/build.gradle +++ b/code/common/process/build.gradle @@ -20,6 +20,7 @@ dependencies { implementation libs.guava implementation libs.guice + implementation libs.bundles.mariadb implementation libs.commons.lang3 implementation libs.snakeyaml diff --git 
a/code/common/process/src/main/java/nu/marginalia/ProcessConfiguration.java b/code/common/process/src/main/java/nu/marginalia/ProcessConfiguration.java new file mode 100644 index 00000000..35e1433f --- /dev/null +++ b/code/common/process/src/main/java/nu/marginalia/ProcessConfiguration.java @@ -0,0 +1,7 @@ +package nu.marginalia; + +import java.util.UUID; + +public record ProcessConfiguration(String processName, int node, UUID instanceUuid) { + +} diff --git a/code/common/process/src/main/java/nu/marginalia/process/control/ProcessHeartbeat.java b/code/common/process/src/main/java/nu/marginalia/process/control/ProcessHeartbeat.java new file mode 100644 index 00000000..82b2c95e --- /dev/null +++ b/code/common/process/src/main/java/nu/marginalia/process/control/ProcessHeartbeat.java @@ -0,0 +1,155 @@ +package nu.marginalia.process.control; + + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.ProcessConfiguration; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.sql.SQLException; +import java.util.concurrent.TimeUnit; + +/** This service sends a heartbeat to the database every 5 seconds. 
+ */ +@Singleton +public class ProcessHeartbeat { + private final Logger logger = LoggerFactory.getLogger(ProcessHeartbeat.class); + private final String processName; + private final String processBase; + private final String instanceUUID; + private final HikariDataSource dataSource; + + + private final Thread runnerThread; + private final int heartbeatInterval = Integer.getInteger("mcp.heartbeat.interval", 1); + + private volatile boolean running = false; + + private volatile int progress = -1; + + @Inject + public ProcessHeartbeat(ProcessConfiguration configuration, + HikariDataSource dataSource) + { + this.processName = configuration.processName() + ":" + configuration.node(); + this.processBase = configuration.processName(); + this.dataSource = dataSource; + + this.instanceUUID = configuration.instanceUuid().toString(); + + runnerThread = new Thread(this::run); + + Runtime.getRuntime().addShutdownHook(new Thread(this::shutDown)); + } + + public void setProgress(double progress) { + this.progress = (int) (progress * 100); + } + + public void start() { + if (!running) { + runnerThread.start(); + } + } + + public void shutDown() { + if (!running) + return; + + running = false; + + try { + runnerThread.join(); + heartbeatStop(); + } + catch (InterruptedException|SQLException ex) { + logger.warn("ServiceHeartbeat shutdown failed", ex); + } + } + + private void run() { + if (!running) + running = true; + else + return; + + try { + heartbeatInit(); + + while (running) { + + try { + heartbeatUpdate(); + } + catch (SQLException ex) { + logger.warn("ServiceHeartbeat failed to update", ex); + } + + TimeUnit.SECONDS.sleep(heartbeatInterval); + } + } + catch (InterruptedException|SQLException ex) { + logger.error("ServiceHeartbeat caught irrecoverable exception, killing service", ex); + System.exit(255); + } + } + + private void heartbeatInit() throws SQLException { + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement( + """ + 
INSERT INTO PROCESS_HEARTBEAT (PROCESS_NAME, PROCESS_BASE, INSTANCE, HEARTBEAT_TIME, STATUS) + VALUES (?, ?, ?, CURRENT_TIMESTAMP(6), 'STARTING') + ON DUPLICATE KEY UPDATE + INSTANCE = ?, + HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), + STATUS = 'STARTING' + """ + )) + { + stmt.setString(1, processName); + stmt.setString(2, processBase); + stmt.setString(3, instanceUUID); + stmt.setString(4, instanceUUID); + stmt.executeUpdate(); + } + } + } + + private void heartbeatUpdate() throws SQLException { + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement( + """ + UPDATE PROCESS_HEARTBEAT + SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), STATUS = 'RUNNING', PROGRESS = ? + WHERE INSTANCE = ? + """) + ) + { + stmt.setInt(1, progress); + stmt.setString(2, instanceUUID); + stmt.executeUpdate(); + } + } + } + + private void heartbeatStop() throws SQLException { + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement( + """ + UPDATE PROCESS_HEARTBEAT + SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), STATUS='STOPPED', PROGRESS=? + WHERE INSTANCE = ? 
+ """) + ) + { + stmt.setInt(1, progress); + stmt.setString( 2, instanceUUID); + stmt.executeUpdate(); + } + } + } +} + diff --git a/code/common/service-client/src/main/java/nu/marginalia/client/ServiceMonitors.java b/code/common/service-client/src/main/java/nu/marginalia/client/ServiceMonitors.java index 1ce8ae0c..a77768be 100644 --- a/code/common/service-client/src/main/java/nu/marginalia/client/ServiceMonitors.java +++ b/code/common/service-client/src/main/java/nu/marginalia/client/ServiceMonitors.java @@ -81,7 +81,7 @@ public class ServiceMonitors { try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(""" SELECT SERVICE_BASE, TIMESTAMPDIFF(SECOND, HEARTBEAT_TIME, CURRENT_TIMESTAMP(6)) - FROM PROC_SERVICE_HEARTBEAT + FROM SERVICE_HEARTBEAT WHERE ALIVE=1 """)) { try (var rs = stmt.executeQuery()) { diff --git a/code/common/service/src/main/java/nu/marginalia/service/control/ServiceEventLog.java b/code/common/service/src/main/java/nu/marginalia/service/control/ServiceEventLog.java index 217e670e..f5f6e90b 100644 --- a/code/common/service/src/main/java/nu/marginalia/service/control/ServiceEventLog.java +++ b/code/common/service/src/main/java/nu/marginalia/service/control/ServiceEventLog.java @@ -40,7 +40,7 @@ public class ServiceEventLog { try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(""" - INSERT INTO PROC_SERVICE_EVENTLOG(SERVICE_NAME, SERVICE_BASE, INSTANCE, EVENT_TYPE, EVENT_MESSAGE) + INSERT INTO SERVICE_EVENTLOG(SERVICE_NAME, SERVICE_BASE, INSTANCE, EVENT_TYPE, EVENT_MESSAGE) VALUES (?, ?, ?, ?, ?) 
""")) { stmt.setString(1, serviceName); diff --git a/code/common/service/src/main/java/nu/marginalia/service/control/ServiceHeartbeat.java b/code/common/service/src/main/java/nu/marginalia/service/control/ServiceHeartbeat.java index 8850ae7f..ff5c8755 100644 --- a/code/common/service/src/main/java/nu/marginalia/service/control/ServiceHeartbeat.java +++ b/code/common/service/src/main/java/nu/marginalia/service/control/ServiceHeartbeat.java @@ -93,7 +93,7 @@ public class ServiceHeartbeat { try (var connection = dataSource.getConnection()) { try (var stmt = connection.prepareStatement( """ - INSERT INTO PROC_SERVICE_HEARTBEAT (SERVICE_NAME, SERVICE_BASE, INSTANCE, HEARTBEAT_TIME, ALIVE) + INSERT INTO SERVICE_HEARTBEAT (SERVICE_NAME, SERVICE_BASE, INSTANCE, HEARTBEAT_TIME, ALIVE) VALUES (?, ?, ?, CURRENT_TIMESTAMP(6), 1) ON DUPLICATE KEY UPDATE INSTANCE = ?, @@ -115,7 +115,7 @@ public class ServiceHeartbeat { try (var connection = dataSource.getConnection()) { try (var stmt = connection.prepareStatement( """ - UPDATE PROC_SERVICE_HEARTBEAT + UPDATE SERVICE_HEARTBEAT SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6) WHERE INSTANCE = ? AND ALIVE = 1 """) @@ -131,7 +131,7 @@ public class ServiceHeartbeat { try (var connection = dataSource.getConnection()) { try (var stmt = connection.prepareStatement( """ - UPDATE PROC_SERVICE_HEARTBEAT + UPDATE SERVICE_HEARTBEAT SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), ALIVE = 0 WHERE INSTANCE = ? 
""") diff --git a/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java b/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java index ff299d68..a23cdede 100644 --- a/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java +++ b/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java @@ -107,6 +107,18 @@ public class CrawlPlan { throw new RuntimeException(ex); } } + + public int countCrawledDomains() { + try (Stream entryStream = WorkLog.streamLog(crawl.getLogFile())) { + return (int) entryStream + .map(WorkLogEntry::path) + .count(); + } + catch (IOException ex) { + return 0; + } + } + public void forEachCrawledDomain(Predicate idReadPredicate, Consumer consumer) { final CrawledDomainReader reader = new CrawledDomainReader(); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java index 3ecebb80..8f49c853 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java @@ -4,6 +4,7 @@ import com.google.gson.Gson; import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; +import nu.marginalia.process.control.ProcessHeartbeat; import nu.marginalia.process.log.WorkLog; import nu.marginalia.service.module.DatabaseModule; import plan.CrawlPlanLoader; @@ -19,6 +20,7 @@ import org.slf4j.LoggerFactory; import java.io.IOException; import java.nio.file.Path; import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; public class ConverterMain { @@ -46,13 +48,21 @@ public class ConverterMain { CrawlPlan plan, DomainProcessor processor, InstructionsCompiler compiler, - Gson gson + Gson gson, + ProcessHeartbeat heartbeat ) throws Exception { + + heartbeat.start(); + logger.info("Starting 
pipe"); try (WorkLog processLog = plan.createProcessWorkLog(); ConversionLog log = new ConversionLog(plan.process.getDir())) { instructionWriter = new InstructionWriter(log, plan.process.getDir(), gson); + + int totalDomains = plan.countCrawledDomains(); + AtomicInteger processedDomains = new AtomicInteger(0); + var pipe = new ParallelPipe("Converter", 16, 4, 2) { @Override @@ -78,6 +88,8 @@ public class ConverterMain { String where = instructionWriter.accept(processedInstructions.id, instructions); processLog.setJobToFinished(processedInstructions.id, where, instructions.size()); + + heartbeat.setProgress(processedDomains.incrementAndGet() / (double) totalDomains); } finally { Thread.currentThread().setName("Converter:Receiver[IDLE]"); @@ -86,6 +98,7 @@ public class ConverterMain { }; + plan.forEachCrawledDomain(id -> !processLog.isJobFinished(id), pipe::accept); pipe.join(); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterModule.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterModule.java index e7a70aeb..814c32ec 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterModule.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterModule.java @@ -4,10 +4,13 @@ import com.google.gson.Gson; import com.google.inject.AbstractModule; import com.google.inject.name.Names; import nu.marginalia.LanguageModels; +import nu.marginalia.ProcessConfiguration; import nu.marginalia.WmsaHome; import plan.CrawlPlan; import nu.marginalia.model.gson.GsonFactory; +import java.util.UUID; + public class ConverterModule extends AbstractModule { private final CrawlPlan plan; @@ -21,6 +24,8 @@ public class ConverterModule extends AbstractModule { bind(Gson.class).toInstance(createGson()); + bind(ProcessConfiguration.class).toInstance(new ProcessConfiguration("converter", 0, UUID.randomUUID())); + 
bind(Double.class).annotatedWith(Names.named("min-document-quality")).toInstance(-15.); bind(Integer.class).annotatedWith(Names.named("min-document-length")).toInstance(250); bind(Integer.class).annotatedWith(Names.named("max-title-length")).toInstance(128); diff --git a/code/processes/crawling-process/build.gradle b/code/processes/crawling-process/build.gradle index de504915..b62b3a68 100644 --- a/code/processes/crawling-process/build.gradle +++ b/code/processes/crawling-process/build.gradle @@ -21,6 +21,7 @@ tasks.distZip.enabled = false dependencies { implementation project(':code:common:process') + implementation project(':code:common:db') implementation project(':code:common:model') implementation project(':code:common:config') implementation project(':code:common:service') diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java index cbd9513a..4c436ca3 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java @@ -1,10 +1,12 @@ package nu.marginalia.crawl; +import nu.marginalia.ProcessConfiguration; import nu.marginalia.UserAgent; import nu.marginalia.WmsaHome; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; -import nu.marginalia.crawl.retreival.fetcher.SitemapRetriever; +import nu.marginalia.process.control.ProcessHeartbeat; import nu.marginalia.process.log.WorkLog; +import nu.marginalia.service.module.DatabaseModule; import plan.CrawlPlanLoader; import plan.CrawlPlan; import nu.marginalia.crawling.io.CrawledDomainWriter; @@ -20,7 +22,9 @@ import org.slf4j.LoggerFactory; import java.nio.file.Path; import java.util.HashSet; import java.util.Set; +import java.util.UUID; import java.util.concurrent.*; +import java.util.concurrent.atomic.AtomicInteger; public class CrawlerMain implements AutoCloseable 
{ private final Logger logger = LoggerFactory.getLogger(getClass()); @@ -45,6 +49,8 @@ public class CrawlerMain implements AutoCloseable { AbortMonitor abortMonitor = AbortMonitor.getInstance(); Semaphore taskSem = new Semaphore(poolSize); + private static ProcessHeartbeat heartbeat; + public CrawlerMain(CrawlPlan plan) throws Exception { this.plan = plan; this.userAgent = WmsaHome.getUserAgent(); @@ -77,9 +83,16 @@ public class CrawlerMain implements AutoCloseable { } var plan = new CrawlPlanLoader().load(Path.of(args[0])); + heartbeat = new ProcessHeartbeat(new ProcessConfiguration("crawler", 0, UUID.randomUUID()), + new DatabaseModule().provideConnection()); + try (var crawler = new CrawlerMain(plan)) { + heartbeat.start(); crawler.run(); } + finally { + heartbeat.shutDown(); + } System.exit(0); } @@ -87,12 +100,18 @@ public class CrawlerMain implements AutoCloseable { public void run() throws InterruptedException { // First a validation run to ensure the file is all good to parse logger.info("Validating JSON"); - plan.forEachCrawlingSpecification(unused -> {}); + AtomicInteger countTotal = new AtomicInteger(); + AtomicInteger countProcessed = new AtomicInteger(); + + plan.forEachCrawlingSpecification(unused -> countTotal.incrementAndGet()); logger.info("Let's go"); // TODO: Make this into an iterable instead so we can abort it - plan.forEachCrawlingSpecification(this::startCrawlTask); + plan.forEachCrawlingSpecification((spec) -> { + heartbeat.setProgress(countProcessed.incrementAndGet() / (double) countTotal.get()); + startCrawlTask(spec); + }); } diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java index eb04a06b..c17193a7 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java @@ -5,6 +5,7 @@ import 
com.google.inject.Inject; import com.google.inject.Injector; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; +import nu.marginalia.process.control.ProcessHeartbeat; import nu.marginalia.process.log.WorkLog; import plan.CrawlPlanLoader; import plan.CrawlPlan; @@ -32,9 +33,10 @@ public class LoaderMain { private final LoaderFactory loaderFactory; private final IndexLoadKeywords indexLoadKeywords; + private final ProcessHeartbeat heartbeat; private volatile boolean running = true; - final Thread processorThread = new Thread(this::processor, "Processor Thread"); + final Thread processorThread; public static void main(String... args) throws IOException { if (args.length != 1) { @@ -59,16 +61,23 @@ public class LoaderMain { public LoaderMain(CrawlPlan plan, ConvertedDomainReader instructionsReader, HikariDataSource dataSource, - LoaderFactory loaderFactory, IndexLoadKeywords indexLoadKeywords) { + LoaderFactory loaderFactory, + IndexLoadKeywords indexLoadKeywords, + ProcessHeartbeat heartbeat + ) { this.plan = plan; this.instructionsReader = instructionsReader; this.loaderFactory = loaderFactory; this.indexLoadKeywords = indexLoadKeywords; + this.heartbeat = heartbeat; + + heartbeat.start(); nukeTables(dataSource); Runtime.getRuntime().addShutdownHook(new Thread(this::shutDownIndex)); + processorThread = new Thread(this::processor, "Processor Thread"); processorThread.start(); } @@ -97,17 +106,26 @@ public class LoaderMain { public void run() { var logFile = plan.process.getLogFile(); - AtomicInteger loadTotal = new AtomicInteger(); - WorkLog.readLog(logFile, entry -> { loadTotal.incrementAndGet(); }); - LoaderMain.loadTotal = loadTotal.get(); + try { + AtomicInteger loadTotal = new AtomicInteger(); + WorkLog.readLog(logFile, entry -> { + loadTotal.incrementAndGet(); + }); + LoaderMain.loadTotal = loadTotal.get(); - WorkLog.readLog(logFile, entry -> { - load(plan, entry.path(), entry.cnt()); - }); + AtomicInteger loaded = new AtomicInteger(); + 
WorkLog.readLog(logFile, entry -> { + heartbeat.setProgress(loaded.incrementAndGet() / (double) loadTotal.get()); - running = false; - processorThread.join(); + load(plan, entry.path(), entry.cnt()); + }); + running = false; + processorThread.join(); + } + finally { + heartbeat.shutDown(); + } System.exit(0); } diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderModule.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderModule.java index fe8c022e..09d5be2e 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderModule.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderModule.java @@ -4,6 +4,7 @@ import com.google.gson.Gson; import com.google.inject.AbstractModule; import com.google.inject.name.Names; import nu.marginalia.LanguageModels; +import nu.marginalia.ProcessConfiguration; import nu.marginalia.WmsaHome; import plan.CrawlPlan; import nu.marginalia.model.gson.GsonFactory; @@ -11,6 +12,7 @@ import nu.marginalia.service.SearchServiceDescriptors; import nu.marginalia.service.descriptor.ServiceDescriptors; import java.nio.file.Path; +import java.util.UUID; public class LoaderModule extends AbstractModule { @@ -24,6 +26,7 @@ public class LoaderModule extends AbstractModule { bind(CrawlPlan.class).toInstance(plan); bind(ServiceDescriptors.class).toInstance(SearchServiceDescriptors.descriptors); + bind(ProcessConfiguration.class).toInstance(new ProcessConfiguration("loader", 0, UUID.randomUUID())); bind(Gson.class).toInstance(createGson()); diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java index ccf167a8..fcd79ff4 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java +++ 
b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java @@ -29,6 +29,7 @@ public class ControlService extends Service { private final ServiceMonitors monitors; private final MustacheRenderer indexRenderer; private final MustacheRenderer> servicesRenderer; + private final MustacheRenderer> processesRenderer; private final MustacheRenderer> eventsRenderer; private final MustacheRenderer> messageQueueRenderer; private final MqPersistence messageQueuePersistence; @@ -54,6 +55,7 @@ public class ControlService extends Service { indexRenderer = rendererFactory.renderer("control/index"); servicesRenderer = rendererFactory.renderer("control/services"); + processesRenderer = rendererFactory.renderer("control/processes"); eventsRenderer = rendererFactory.renderer("control/events"); messageQueueRenderer = rendererFactory.renderer("control/message-queue"); @@ -62,12 +64,13 @@ public class ControlService extends Service { Spark.get("/public/heartbeats", (req, res) -> { res.type("application/json"); - return heartbeatService.getHeartbeats(); + return heartbeatService.getServiceHeartbeats(); }, gson::toJson); Spark.get("/public/", (req, rsp) -> indexRenderer.render(Map.of())); - Spark.get("/public/services", (req, rsp) -> servicesRenderer.render(Map.of("heartbeats", heartbeatService.getHeartbeats()))); + Spark.get("/public/services", (req, rsp) -> servicesRenderer.render(Map.of("heartbeats", heartbeatService.getServiceHeartbeats()))); + Spark.get("/public/processes", (req, rsp) -> processesRenderer.render(Map.of("heartbeats", heartbeatService.getProcessHeartbeats()))); Spark.get("/public/events", (req, rsp) -> eventsRenderer.render(Map.of("events", eventLogService.getLastEntries(20)))); Spark.get("/public/message-queue", (req, rsp) -> messageQueueRenderer.render(Map.of("messages", messageQueueViewService.getLastEntries(20)))); diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/EventLogService.java 
b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/EventLogService.java index 9165204d..41a325ec 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/EventLogService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/EventLogService.java @@ -23,7 +23,7 @@ public class EventLogService { try (var conn = dataSource.getConnection(); var query = conn.prepareStatement(""" SELECT SERVICE_NAME, INSTANCE, EVENT_TIME, EVENT_TYPE, EVENT_MESSAGE - FROM PROC_SERVICE_EVENTLOG ORDER BY ID DESC LIMIT ? + FROM SERVICE_EVENTLOG ORDER BY ID DESC LIMIT ? """)) { query.setInt(1, n); diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/HeartbeatService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/HeartbeatService.java index 370fa15d..823ca045 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/HeartbeatService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/HeartbeatService.java @@ -3,6 +3,7 @@ package nu.marginalia.control; import com.google.inject.Inject; import com.google.inject.Singleton; import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.control.model.ProcessHeartbeat; import nu.marginalia.control.model.ServiceHeartbeat; import java.sql.SQLException; @@ -18,14 +19,14 @@ public class HeartbeatService { this.dataSource = dataSource; } - public List getHeartbeats() { + public List getServiceHeartbeats() { List heartbeats = new ArrayList<>(); try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(""" SELECT SERVICE_NAME, SERVICE_BASE, INSTANCE, ALIVE, TIMESTAMPDIFF(MICROSECOND, HEARTBEAT_TIME, CURRENT_TIMESTAMP(6)) AS TSDIFF - FROM PROC_SERVICE_HEARTBEAT + FROM SERVICE_HEARTBEAT """)) { var rs = stmt.executeQuery(); @@ -46,6 +47,34 @@ public class HeartbeatService { return heartbeats; } + public List 
getProcessHeartbeats() { + List heartbeats = new ArrayList<>(); + + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT PROCESS_NAME, PROCESS_BASE, INSTANCE, STATUS, PROGRESS, + TIMESTAMPDIFF(MICROSECOND, HEARTBEAT_TIME, CURRENT_TIMESTAMP(6)) AS TSDIFF + FROM PROCESS_HEARTBEAT + """)) { + + var rs = stmt.executeQuery(); + while (rs.next()) { + heartbeats.add(new ProcessHeartbeat( + rs.getString("PROCESS_NAME"), + rs.getString("PROCESS_BASE"), + trimUUID(rs.getString("INSTANCE")), + rs.getInt("TSDIFF") / 1000., + rs.getInt("PROGRESS"), + rs.getString("STATUS") + )); + } + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + + return heartbeats; + } private String trimUUID(String uuid) { if (uuid.length() > 8) { return uuid.substring(0, 8); diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/MessageQueueViewService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/MessageQueueViewService.java index 35a27cbc..6cec667c 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/MessageQueueViewService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/MessageQueueViewService.java @@ -23,7 +23,7 @@ public class MessageQueueViewService { try (var conn = dataSource.getConnection(); var query = conn.prepareStatement(""" SELECT ID, RELATED_ID, SENDER_INBOX, RECIPIENT_INBOX, FUNCTION, OWNER_INSTANCE, OWNER_TICK, STATE, CREATED_TIME, UPDATED_TIME, TTL - FROM PROC_MESSAGE + FROM MESSAGE_QUEUE ORDER BY ID DESC LIMIT ? 
""")) { diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java new file mode 100644 index 00000000..ddbe0d35 --- /dev/null +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java @@ -0,0 +1,25 @@ +package nu.marginalia.control.model; + +public record ProcessHeartbeat( + String processId, + String processBase, + String uuid, + double lastSeenMillis, + int progress, + String status +) { + public boolean isMissing() { + return lastSeenMillis > 10000; + } + public boolean isStopped() { + return "STOPPED".equals(status); + } + public String progressStyle() { + if ("RUNNING".equals(status) && progress > 0) { + return """ + background: linear-gradient(90deg, #ccc %d%%, #ccc %d%%, #fff %d%%) + """.formatted(progress, progress, progress); + } + return ""; + } +} diff --git a/code/services-satellite/control-service/src/main/resources/static/control/style.css b/code/services-satellite/control-service/src/main/resources/static/control/style.css index 3df019e5..ada93e58 100644 --- a/code/services-satellite/control-service/src/main/resources/static/control/style.css +++ b/code/services-satellite/control-service/src/main/resources/static/control/style.css @@ -1,5 +1,5 @@ body { - font-family: serif; + font-family: sans-serif; line-height: 1.6; display: grid; @@ -8,6 +8,17 @@ body { grid-template-areas: "left right"; } +h1 { + font-family: serif; +} +table { + font-family: monospace; +} +th { text-align: left; } +td,th { padding-right: 1ch; border: 1px solid #ccc; } +tr:nth-last-of-type(2n) { + background-color: #eee; +} body > nav { grid-area: left; } diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/events.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/events.hdb index 
941c4dea..2c0b20b8 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/events.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/events.hdb @@ -4,6 +4,7 @@ Control Service + {{> control/partials/nav}} diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/index.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/index.hdb index 6ca3119f..a1331540 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/index.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/index.hdb @@ -4,6 +4,7 @@ Control Service + {{> control/partials/nav}} diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/message-queue.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/message-queue.hdb index 844e9a07..41bb73f8 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/message-queue.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/message-queue.hdb @@ -4,6 +4,7 @@ Control Service + {{> control/partials/nav}} diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/partials/nav.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/partials/nav.hdb index 98283bfc..771266f2 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/partials/nav.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/partials/nav.hdb @@ -2,8 +2,8 @@ \ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/processes.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/processes.hdb new file mode 100644 index 00000000..1ab85c66 --- /dev/null +++ 
b/code/services-satellite/control-service/src/main/resources/templates/control/processes.hdb @@ -0,0 +1,34 @@ + + + + Control Service + + + + + + {{> control/partials/nav}} + +
+

Processes

+
+ + + + + + + + {{#each heartbeats}} + + + + + + + + {{/each}} +
Process IDUUIDStatusProgressLast Seen (ms)
{{processId}}{{uuid}}{{status}}{{progress}}{{#unless isStopped}}{{lastSeenMillis}}{{/unless}}
+
+ + \ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/services.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/services.hdb index 41184eab..fc7b0c5b 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/services.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/services.hdb @@ -4,6 +4,7 @@ Control Service + {{> control/partials/nav}} From 4ee3f6ba3fc96627e4a7f36d65044f644301d874 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 11 Jul 2023 14:51:51 +0200 Subject: [PATCH 028/157] (minor) Refactor ControlService --- .../nu/marginalia/control/ControlService.java | 45 +++---------- .../control/{ => svc}/EventLogService.java | 2 +- .../control/{ => svc}/HeartbeatService.java | 2 +- .../svc/MessageQueueMonitorService.java | 63 +++++++++++++++++++ .../{ => svc}/MessageQueueViewService.java | 2 +- 5 files changed, 75 insertions(+), 39 deletions(-) rename code/services-satellite/control-service/src/main/java/nu/marginalia/control/{ => svc}/EventLogService.java (97%) rename code/services-satellite/control-service/src/main/java/nu/marginalia/control/{ => svc}/HeartbeatService.java (98%) create mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueMonitorService.java rename code/services-satellite/control-service/src/main/java/nu/marginalia/control/{ => svc}/MessageQueueViewService.java (98%) diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java index fcd79ff4..39969083 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java @@ -5,6 +5,10 @@ 
import com.google.inject.Inject; import nu.marginalia.client.ServiceMonitors; import nu.marginalia.control.model.ControlProcess; import nu.marginalia.control.process.ControlProcesses; +import nu.marginalia.control.svc.EventLogService; +import nu.marginalia.control.svc.HeartbeatService; +import nu.marginalia.control.svc.MessageQueueMonitorService; +import nu.marginalia.control.svc.MessageQueueViewService; import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.mq.persistence.MqPersistence; import nu.marginalia.renderer.MustacheRenderer; @@ -34,7 +38,7 @@ public class ControlService extends Service { private final MustacheRenderer> messageQueueRenderer; private final MqPersistence messageQueuePersistence; private final StaticResources staticResources; - private final ServiceEventLog eventLog; + private final MessageQueueMonitorService messageQueueMonitorService; @Inject @@ -46,12 +50,12 @@ public class ControlService extends Service { MqPersistence messageQueuePersistence, ControlProcesses controlProcesses, StaticResources staticResources, - MessageQueueViewService messageQueueViewService + MessageQueueViewService messageQueueViewService, + MessageQueueMonitorService messageQueueMonitorService ) throws IOException { super(params); this.monitors = monitors; - this.eventLog = params.eventLog; indexRenderer = rendererFactory.renderer("control/index"); servicesRenderer = rendererFactory.renderer("control/services"); @@ -61,6 +65,7 @@ public class ControlService extends Service { this.messageQueuePersistence = messageQueuePersistence; this.staticResources = staticResources; + this.messageQueueMonitorService = messageQueueMonitorService; Spark.get("/public/heartbeats", (req, res) -> { res.type("application/json"); @@ -74,6 +79,7 @@ public class ControlService extends Service { Spark.get("/public/events", (req, rsp) -> eventsRenderer.render(Map.of("events", eventLogService.getLastEntries(20)))); Spark.get("/public/message-queue", (req, rsp) -> 
messageQueueRenderer.render(Map.of("messages", messageQueueViewService.getLastEntries(20)))); + // TODO: This should be a POST Spark.get("/public/repartition", (req, rsp) -> { controlProcesses.start(ControlProcess.REPARTITION_REINDEX); return "OK"; @@ -82,10 +88,6 @@ public class ControlService extends Service { Spark.get("/public/:resource", this::serveStatic); monitors.subscribe(this::logMonitorStateChange); - - Thread reaperThread = new Thread(this::reapMessageQueue, "message-queue-reaper"); - reaperThread.setDaemon(true); - reaperThread.start(); } @@ -98,35 +100,6 @@ public class ControlService extends Service { } - private void reapMessageQueue() { - - for (;;) { - try { - TimeUnit.MINUTES.sleep(10); - - int outcome = messageQueuePersistence.reapDeadMessages(); - if (outcome > 0) { - eventLog.logEvent("MESSAGE-QUEUE-REAPED", Integer.toString(outcome)); - logger.info("Reaped {} dead messages from message queue", outcome); - } - - outcome = messageQueuePersistence.cleanOldMessages(); - if (outcome > 0) { - eventLog.logEvent("MESSAGE-QUEUE-CLEANED", Integer.toString(outcome)); - logger.info("Cleaned {} stale messages from message queue", outcome); - } - - } - catch (InterruptedException ex) { - logger.info("Message queue reaper interrupted"); - return; - } - catch (Exception ex) { - logger.error("Message queue reaper failed", ex); - } - } - } - private void logMonitorStateChange() { logger.info("Service state change: {}", monitors.getRunningServices()); } diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/EventLogService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/EventLogService.java similarity index 97% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/EventLogService.java rename to code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/EventLogService.java index 41a325ec..f54e6996 100644 --- 
a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/EventLogService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/EventLogService.java @@ -1,4 +1,4 @@ -package nu.marginalia.control; +package nu.marginalia.control.svc; import com.google.inject.Inject; import com.google.inject.Singleton; diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/HeartbeatService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/HeartbeatService.java similarity index 98% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/HeartbeatService.java rename to code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/HeartbeatService.java index 823ca045..def90b42 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/HeartbeatService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/HeartbeatService.java @@ -1,4 +1,4 @@ -package nu.marginalia.control; +package nu.marginalia.control.svc; import com.google.inject.Inject; import com.google.inject.Singleton; diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueMonitorService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueMonitorService.java new file mode 100644 index 00000000..a5200275 --- /dev/null +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueMonitorService.java @@ -0,0 +1,63 @@ +package nu.marginalia.control.svc; + +import nu.marginalia.mq.persistence.MqPersistence; +import nu.marginalia.service.control.ServiceEventLog; +import nu.marginalia.service.server.BaseServiceParams; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.inject.Inject; +import javax.inject.Singleton; +import 
java.sql.SQLException; +import java.util.concurrent.TimeUnit; + +@Singleton +public class MessageQueueMonitorService { + private final Logger logger = LoggerFactory.getLogger(MessageQueueMonitorService.class); + private final MqPersistence persistence; + private final ServiceEventLog eventLog; + + @Inject + public MessageQueueMonitorService(BaseServiceParams params) { + this.persistence = params.messageQueuePersistence; + this.eventLog = params.eventLog; + + Thread reaperThread = new Thread(this::run, "message-queue-reaper"); + reaperThread.setDaemon(true); + reaperThread.start(); + } + + + private void run() { + + for (;;) { + try { + TimeUnit.MINUTES.sleep(10); + + reapMessages(); + } + catch (InterruptedException ex) { + logger.info("Message queue reaper interrupted"); + break; + } + catch (Exception ex) { + logger.error("Message queue reaper failed", ex); + } + } + } + + private void reapMessages() throws SQLException { + int outcome = persistence.reapDeadMessages(); + if (outcome > 0) { + eventLog.logEvent("MESSAGE-QUEUE-REAPED", Integer.toString(outcome)); + logger.info("Reaped {} dead messages from message queue", outcome); + } + + outcome = persistence.cleanOldMessages(); + if (outcome > 0) { + eventLog.logEvent("MESSAGE-QUEUE-CLEANED", Integer.toString(outcome)); + logger.info("Cleaned {} stale messages from message queue", outcome); + } + } + +} diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/MessageQueueViewService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueViewService.java similarity index 98% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/MessageQueueViewService.java rename to code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueViewService.java index 6cec667c..c8016c78 100644 --- 
a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/MessageQueueViewService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueViewService.java @@ -1,4 +1,4 @@ -package nu.marginalia.control; +package nu.marginalia.control.svc; import com.google.inject.Inject; import com.google.inject.Singleton; From 3c7c77fe216dfdcfc94cb2e158b0a1fa6384cc66 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 11 Jul 2023 17:06:52 +0200 Subject: [PATCH 029/157] (minor) Bugfix in Path handling --- .../converting/processor/logic/links/FileLinks.java | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/links/FileLinks.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/links/FileLinks.java index cbfbeaea..e8809b67 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/links/FileLinks.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/links/FileLinks.java @@ -31,11 +31,13 @@ public class FileLinks { private static void synthesizeFilenameKeyword(Set fileKeywords, EdgeUrl link) { - Path pFilename = Path.of(link.path.toLowerCase()).getFileName(); + int lastSlash = link.path.lastIndexOf('/'); + if (lastSlash < 0) return; - if (pFilename == null) return; + String filename = link.path + .substring(lastSlash + 1) + .toLowerCase(); - String filename = pFilename.toString(); if (filename.length() > 32 || filename.endsWith(".xml") || filename.endsWith(".jpg") From 77261a38cddccecf6fe60fd903205cc66c5abb0a Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 11 Jul 2023 17:08:43 +0200 Subject: [PATCH 030/157] (control, WIP) MQFSM and ProcessService are sitting in a tree We're spawning processes from the MSFSM in control service now! 
--- build.gradle | 19 ++- .../java/nu/marginalia/mqsm/StateMachine.java | 13 ++ .../nu/marginalia/loading/LoaderMain.java | 4 +- .../nu/marginalia/loading/LoaderModule.java | 2 +- .../nu/marginalia/control/ControlMain.java | 1 + .../control/ControlProcessModule.java | 15 +++ .../nu/marginalia/control/ControlService.java | 14 +- .../control/model/ControlProcess.java | 3 +- .../control/process/ControlProcesses.java | 16 ++- .../process/ReconvertAndLoadProcess.java | 77 +++++++++++ .../control/svc/HeartbeatService.java | 4 +- .../control/svc/ProcessService.java | 127 ++++++++++++++++++ docker-compose.yml | 16 ++- run/dist/.gitignore | 0 run/env/service.env | 3 +- 15 files changed, 296 insertions(+), 18 deletions(-) create mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlProcessModule.java create mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ReconvertAndLoadProcess.java create mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java create mode 100644 run/dist/.gitignore diff --git a/build.gradle b/build.gradle index 155d5b89..49bf7b98 100644 --- a/build.gradle +++ b/build.gradle @@ -9,14 +9,29 @@ version 'SNAPSHOT' compileJava.options.encoding = "UTF-8" compileTestJava.options.encoding = "UTF-8" -task dist(type: Copy) { +tasks.register('dist', Copy) { from subprojects.collect { it.tasks.withType(Tar) } into "$buildDir/dist" -} + doLast { + copy { + from tarTree("$buildDir/dist/converter-process.tar") + into "$projectDir/run/dist/" + } + copy { + from tarTree("$buildDir/dist/crawler-process.tar") + into "$projectDir/run/dist/" + } + copy { + from tarTree("$buildDir/dist/loader-process.tar") + into "$projectDir/run/dist/" + } + } +} idea { module { excludeDirs.add(file("$projectDir/run/model")) + excludeDirs.add(file("$projectDir/run/dist")) excludeDirs.add(file("$projectDir/run/samples")) 
excludeDirs.add(file("$projectDir/run/db")) excludeDirs.add(file("$projectDir/run/logs")) diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java index 7e56e6ba..b8ffc739 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java @@ -103,6 +103,19 @@ public class StateMachine { smOutbox.notify(transition.state(), transition.message()); } + /** Initialize the state machine. */ + public void init(String jsonEncodedArgument) throws Exception { + var transition = StateTransition.to("INITIAL", jsonEncodedArgument); + + synchronized (this) { + this.state = allStates.get(transition.state()); + notifyAll(); + } + + smInbox.start(); + smOutbox.notify(transition.state(), transition.message()); + } + /** Resume the state machine from the last known state. */ public void resume() throws Exception { diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java index c17193a7..c70573a6 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java @@ -108,9 +108,7 @@ public class LoaderMain { try { AtomicInteger loadTotal = new AtomicInteger(); - WorkLog.readLog(logFile, entry -> { - loadTotal.incrementAndGet(); - }); + WorkLog.readLog(logFile, entry -> loadTotal.incrementAndGet()); LoaderMain.loadTotal = loadTotal.get(); AtomicInteger loaded = new AtomicInteger(); diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderModule.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderModule.java index 09d5be2e..338e722f 100644 --- 
a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderModule.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderModule.java @@ -28,7 +28,7 @@ public class LoaderModule extends AbstractModule { bind(ServiceDescriptors.class).toInstance(SearchServiceDescriptors.descriptors); bind(ProcessConfiguration.class).toInstance(new ProcessConfiguration("loader", 0, UUID.randomUUID())); - bind(Gson.class).toInstance(createGson()); + bind(Gson.class).toProvider(this::createGson); bind(Path.class).annotatedWith(Names.named("local-index-path")).toInstance(Path.of(System.getProperty("local-index-path", "/vol"))); bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels()); diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlMain.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlMain.java index e3d12163..52307353 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlMain.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlMain.java @@ -21,6 +21,7 @@ public class ControlMain extends MainClass { Injector injector = Guice.createInjector( new DatabaseModule(), + new ControlProcessModule(), new ConfigurationModule(SearchServiceDescriptors.descriptors, ServiceId.Control)); injector.getInstance(ControlMain.class); diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlProcessModule.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlProcessModule.java new file mode 100644 index 00000000..3530a89b --- /dev/null +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlProcessModule.java @@ -0,0 +1,15 @@ +package nu.marginalia.control; + +import com.google.inject.AbstractModule; +import com.google.inject.Module; +import com.google.inject.name.Names; + 
+import java.nio.file.Path; + +public class ControlProcessModule extends AbstractModule { + @Override + protected void configure() { + String dist = System.getProperty("distPath", System.getProperty("WMSA_HOME") + "/dist/current"); + bind(Path.class).annotatedWith(Names.named("distPath")).toInstance(Path.of(dist)); + } +} diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java index 39969083..9d660a1e 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java @@ -5,10 +5,7 @@ import com.google.inject.Inject; import nu.marginalia.client.ServiceMonitors; import nu.marginalia.control.model.ControlProcess; import nu.marginalia.control.process.ControlProcesses; -import nu.marginalia.control.svc.EventLogService; -import nu.marginalia.control.svc.HeartbeatService; -import nu.marginalia.control.svc.MessageQueueMonitorService; -import nu.marginalia.control.svc.MessageQueueViewService; +import nu.marginalia.control.svc.*; import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.mq.persistence.MqPersistence; import nu.marginalia.renderer.MustacheRenderer; @@ -22,6 +19,7 @@ import spark.Response; import spark.Spark; import java.io.IOException; +import java.nio.file.Path; import java.util.Map; import java.util.concurrent.TimeUnit; @@ -51,7 +49,8 @@ public class ControlService extends Service { ControlProcesses controlProcesses, StaticResources staticResources, MessageQueueViewService messageQueueViewService, - MessageQueueMonitorService messageQueueMonitorService + MessageQueueMonitorService messageQueueMonitorService, + ProcessService processService ) throws IOException { super(params); @@ -84,6 +83,11 @@ public class ControlService extends Service { 
controlProcesses.start(ControlProcess.REPARTITION_REINDEX); return "OK"; }); + // TODO: This should be a POST + Spark.get("/public/reconvert", (req, rsp) -> { + controlProcesses.start(ControlProcess.RECONVERT_LOAD, "/samples/crawl-blogs/plan.yaml"); + return "OK"; + }); Spark.get("/public/:resource", this::serveStatic); diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ControlProcess.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ControlProcess.java index 613dd2e5..b7db26db 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ControlProcess.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ControlProcess.java @@ -1,7 +1,8 @@ package nu.marginalia.control.model; public enum ControlProcess { - REPARTITION_REINDEX; + REPARTITION_REINDEX, + RECONVERT_LOAD; public String id() { return "fsm:" + name().toLowerCase(); diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ControlProcesses.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ControlProcesses.java index 5813bdbb..6b8a64eb 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ControlProcesses.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ControlProcesses.java @@ -1,8 +1,10 @@ package nu.marginalia.control.process; +import com.google.gson.Gson; import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.control.model.ControlProcess; +import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.mq.persistence.MqPersistence; import nu.marginalia.mqsm.StateMachine; import nu.marginalia.mqsm.graph.AbstractStateGraph; @@ -17,17 +19,21 @@ import java.util.UUID; public class ControlProcesses { private final MqPersistence persistence; private 
final ServiceEventLog eventLog; + private final Gson gson; public Map stateMachines = new HashMap<>(); @Inject public ControlProcesses(MqPersistence persistence, + GsonFactory gsonFactory, BaseServiceParams baseServiceParams, - RepartitionReindexProcess repartitionReindexProcess + RepartitionReindexProcess repartitionReindexProcess, + ReconvertAndLoadProcess reconvertAndLoadProcess ) { this.persistence = persistence; this.eventLog = baseServiceParams.eventLog; - + this.gson = gsonFactory.get(); register(ControlProcess.REPARTITION_REINDEX, repartitionReindexProcess); + register(ControlProcess.RECONVERT_LOAD, reconvertAndLoadProcess); } private void register(ControlProcess process, AbstractStateGraph graph) { @@ -48,6 +54,12 @@ public class ControlProcesses { stateMachines.get(process).init(); } + public void start(ControlProcess process, Object arg) throws Exception { + eventLog.logEvent("FSM-START", process.id()); + + stateMachines.get(process).init(gson.toJson(arg)); + } + public void resume(ControlProcess process) throws Exception { eventLog.logEvent("FSM-RESUME", process.id()); stateMachines.get(process).resume(); diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ReconvertAndLoadProcess.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ReconvertAndLoadProcess.java new file mode 100644 index 00000000..1b329b97 --- /dev/null +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ReconvertAndLoadProcess.java @@ -0,0 +1,77 @@ +package nu.marginalia.control.process; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.control.svc.ProcessService; +import nu.marginalia.index.client.IndexClient; +import nu.marginalia.index.client.IndexMqEndpoints; +import nu.marginalia.mq.MqMessageState; +import nu.marginalia.mq.outbox.MqOutbox; +import nu.marginalia.mqsm.StateFactory; +import 
nu.marginalia.mqsm.graph.AbstractStateGraph; +import nu.marginalia.mqsm.graph.GraphState; +import nu.marginalia.mqsm.graph.ResumeBehavior; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardCopyOption; + +@Singleton +public class ReconvertAndLoadProcess extends AbstractStateGraph { + + // STATES + + private static final String INITIAL = "INITIAL"; + private static final String RECONVERT = "RECONVERT"; + private static final String LOAD = "LOAD"; + private static final String MOVE_INDEX_FILES = "MOVE_INDEX_FILES"; + private static final String END = "END"; + private final ProcessService processService; + + + @Inject + public ReconvertAndLoadProcess(StateFactory stateFactory, ProcessService processService) { + super(stateFactory); + this.processService = processService; + } + + @GraphState(name = INITIAL, next = RECONVERT) + public String init(String crawlJob) throws Exception { + Path path = Path.of(crawlJob); + + if (!Files.exists(path)) { + error("Bad crawl job path"); + } + + Files.deleteIfExists(path.getParent().resolve("process/process.log")); + + return path.toString(); + } + + @GraphState(name = RECONVERT, next = LOAD, resume = ResumeBehavior.RETRY) + public String reconvert(String crawlJob) throws Exception { + if (!processService.trigger(ProcessService.ProcessId.CONVERTER, Path.of(crawlJob))) + error(); + + return crawlJob; + } + + @GraphState(name = LOAD, next = MOVE_INDEX_FILES, resume = ResumeBehavior.RETRY) + public void load(String crawlJob) throws Exception { + if (!processService.trigger(ProcessService.ProcessId.LOADER, Path.of(crawlJob))) + error(); + } + + @GraphState(name = MOVE_INDEX_FILES, next = END, resume = ResumeBehavior.ERROR) + public String moveIndexFiles(String crawlJob) throws Exception { + Path indexData = Path.of("/vol/index.dat"); + Path indexDest = Path.of("/vol/iw/0/page-index.dat"); + + if (!Files.exists(indexData)) + error("Index data not found"); + + Files.move(indexData, indexDest, 
StandardCopyOption.REPLACE_EXISTING); + + return crawlJob; + } +} diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/HeartbeatService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/HeartbeatService.java index def90b42..bd7f56c7 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/HeartbeatService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/HeartbeatService.java @@ -35,7 +35,7 @@ public class HeartbeatService { rs.getString("SERVICE_NAME"), rs.getString("SERVICE_BASE"), trimUUID(rs.getString("INSTANCE")), - rs.getInt("TSDIFF") / 1000., + rs.getLong("TSDIFF") / 1000., rs.getBoolean("ALIVE") )); } @@ -63,7 +63,7 @@ public class HeartbeatService { rs.getString("PROCESS_NAME"), rs.getString("PROCESS_BASE"), trimUUID(rs.getString("INSTANCE")), - rs.getInt("TSDIFF") / 1000., + rs.getLong("TSDIFF") / 1000., rs.getInt("PROGRESS"), rs.getString("STATUS") )); diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java new file mode 100644 index 00000000..b5198a9e --- /dev/null +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java @@ -0,0 +1,127 @@ +package nu.marginalia.control.svc; + +import com.google.inject.name.Named; +import nu.marginalia.service.control.ServiceEventLog; +import nu.marginalia.service.server.BaseServiceParams; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import spark.utils.IOUtils; + +import javax.inject.Inject; +import javax.inject.Singleton; +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; +import 
java.util.concurrent.ConcurrentHashMap; + +@Singleton +public class ProcessService { + private final Logger logger = LoggerFactory.getLogger(getClass()); + private final ServiceEventLog eventLog; + private final Path distPath; + + private final ConcurrentHashMap processes = new ConcurrentHashMap<>(); + + public enum ProcessId { + CRAWLER("crawler-process/bin/crawler-process"), + CONVERTER("converter-process/bin/converter-process"), + LOADER("loader-process/bin/loader-process"); + + public final String path; + ProcessId(String path) { + this.path = path; + } + }; + + @Inject + public ProcessService(BaseServiceParams params, + @Named("distPath") Path distPath) { + this.eventLog = params.eventLog; + this.distPath = distPath; + } + + public boolean trigger(ProcessId processId, Path plan) throws Exception { + String processPath = processPath(processId); + String[] args = new String[] { + processPath, + plan.toString() + }; + String[] env = env(plan); + + Process process; + + if (!Files.exists(Path.of(processPath))) { + logger.error("Process not found: {}", processPath); + return false; + } + if (!Files.exists(plan)) { + logger.error("Plan not found: {}", processPath); + return false; + } + + logger.info("Starting process: {}", processId + ": " + Arrays.toString(args) + " // " + Arrays.toString(env)); + synchronized (processes) { + if (processes.containsKey(processId)) return false; + process = Runtime.getRuntime().exec(args, env); + processes.put(processId, process); + } + + try (var es = new BufferedReader(new InputStreamReader(process.getErrorStream())); + var os = new BufferedReader(new InputStreamReader(process.getInputStream())) + ) { + eventLog.logEvent("PROCESS-STARTED", processId.toString()); + process.onExit().whenComplete((p,t) -> eventLog.logEvent("PROCESS-EXIT", processId.toString())); + + while (process.isAlive()) { + if (es.ready()) + logger.warn("{}:{}", processId, es.readLine()); + if (os.ready()) + logger.debug("{}:{}", processId, os.readLine()); + } + 
+ return 0 == process.waitFor(); + } + finally { + processes.remove(processId); + } + + + } + + public boolean isRunning(ProcessId processId) { + return processes.containsKey(processId); + } + + public boolean kill(ProcessId processId) { + Process process = processes.get(processId); + if (process == null) return false; + + eventLog.logEvent("PROCESS-KILL", processId.toString()); + process.destroy(); + processes.remove(processId); + + return true; + } + + private String processPath(ProcessId id) { + return distPath.resolve(id.path).toString(); + } + + private String[] env(Path plan) { + + Map opts = new HashMap<>(); + String WMSA_HOME = System.getenv("WMSA_HOME"); + if (WMSA_HOME == null || WMSA_HOME.isBlank()) { + WMSA_HOME = "/var/lib/wmsa"; + } + opts.put("WMSA_HOME", WMSA_HOME); + opts.put("JAVA_HOME", System.getenv("JAVA_HOME")); + opts.put("JAVA_OPTS", "-Dcrawl.rootDirRewrite=/crawl:" + plan.getParent().toString()); + + return opts.entrySet().stream().map(e -> e.getKey() + "=" + e.getValue()).toArray(String[]::new); + } +} diff --git a/docker-compose.yml b/docker-compose.yml index 8490d5a7..4aff54db 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -6,6 +6,8 @@ x-svc: &service - conf:/wmsa/conf:ro - model:/wmsa/model - data:/wmsa/data + - dist:/dist + - samples:/samples - logs:/var/log/wmsa networks: - wmsa @@ -143,4 +145,16 @@ volumes: driver_opts: type: none o: bind - device: run/data \ No newline at end of file + device: run/data + dist: + driver: local + driver_opts: + type: none + o: bind + device: run/dist + samples: + driver: local + driver_opts: + type: none + o: bind + device: run/samples \ No newline at end of file diff --git a/run/dist/.gitignore b/run/dist/.gitignore new file mode 100644 index 00000000..e69de29b diff --git a/run/env/service.env b/run/env/service.env index 2fb7f09e..db871699 100644 --- a/run/env/service.env +++ b/run/env/service.env @@ -1 +1,2 @@ -WMSA_HOME=run/ \ No newline at end of file +WMSA_HOME=run/ 
+CONTROL_SERVICE_OPTS="-DdistPath=/dist" \ No newline at end of file From 88b9ec70c693ab438342961b80a754608186f273 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 11 Jul 2023 18:05:37 +0200 Subject: [PATCH 031/157] (control, WIP) Run reconvert-load from converter :D --- .../index/client/IndexMqEndpoints.java | 2 + code/api/search-api/build.gradle | 1 + .../search/client/SearchClient.java | 20 ++++++++- .../search/client/SearchMqEndpoints.java | 6 +++ .../nu/marginalia/dict/DictionaryData.java | 5 ++- .../nu/marginalia/dict/DictionaryMap.java | 2 + .../dict/OffHeapDictionaryHashMap.java | 7 ++++ .../marginalia/dict/OnHeapDictionaryMap.java | 5 +++ .../nu/marginalia/lexicon/KeywordLexicon.java | 7 ++++ .../lexicon/KeywordLexiconReadOnlyView.java | 5 +++ .../journal/KeywordLexiconJournal.java | 1 + .../journal/KeywordLexiconJournalFile.java | 4 ++ .../nu/marginalia/index/IndexService.java | 11 +++++ .../marginalia/index/svc/IndexOpsService.java | 11 ++++- .../nu/marginalia/search/SearchService.java | 13 ++++++ .../process/ReconvertAndLoadProcess.java | 42 +++++++++++++++++-- 16 files changed, 135 insertions(+), 7 deletions(-) create mode 100644 code/api/search-api/src/main/java/nu/marginalia/search/client/SearchMqEndpoints.java diff --git a/code/api/index-api/src/main/java/nu/marginalia/index/client/IndexMqEndpoints.java b/code/api/index-api/src/main/java/nu/marginalia/index/client/IndexMqEndpoints.java index 9d2476f8..f8349eb7 100644 --- a/code/api/index-api/src/main/java/nu/marginalia/index/client/IndexMqEndpoints.java +++ b/code/api/index-api/src/main/java/nu/marginalia/index/client/IndexMqEndpoints.java @@ -3,6 +3,8 @@ package nu.marginalia.index.client; public class IndexMqEndpoints { public static final String INDEX_IS_BLOCKED = "INDEX-IS-BLOCKED"; public static final String INDEX_REPARTITION = "INDEX-REPARTITION"; + + public static final String INDEX_RELOAD_LEXICON = "INDEX-RELOAD-LEXICON"; public static final String INDEX_REINDEX = "INDEX-REINDEX"; } 
diff --git a/code/api/search-api/build.gradle b/code/api/search-api/build.gradle index 8c38b5f3..ba00a702 100644 --- a/code/api/search-api/build.gradle +++ b/code/api/search-api/build.gradle @@ -14,6 +14,7 @@ java { dependencies { implementation project(':code:common:model') implementation project(':code:common:config') + implementation project(':code:common:message-queue') implementation project(':code:common:service-discovery') implementation project(':code:common:service-client') diff --git a/code/api/search-api/src/main/java/nu/marginalia/search/client/SearchClient.java b/code/api/search-api/src/main/java/nu/marginalia/search/client/SearchClient.java index 393fa285..69e011bd 100644 --- a/code/api/search-api/src/main/java/nu/marginalia/search/client/SearchClient.java +++ b/code/api/search-api/src/main/java/nu/marginalia/search/client/SearchClient.java @@ -5,6 +5,8 @@ import com.google.inject.Singleton; import io.reactivex.rxjava3.core.Observable; import nu.marginalia.client.AbstractDynamicClient; import nu.marginalia.model.gson.GsonFactory; +import nu.marginalia.mq.outbox.MqOutbox; +import nu.marginalia.mq.persistence.MqPersistence; import nu.marginalia.search.client.model.ApiSearchResults; import nu.marginalia.service.descriptor.ServiceDescriptors; import nu.marginalia.service.id.ServiceId; @@ -16,14 +18,30 @@ import org.slf4j.LoggerFactory; import javax.annotation.CheckReturnValue; import java.net.URLEncoder; import java.nio.charset.StandardCharsets; +import java.util.UUID; @Singleton public class SearchClient extends AbstractDynamicClient { private final Logger logger = LoggerFactory.getLogger(getClass()); + private final MqOutbox outbox; + @Inject - public SearchClient(ServiceDescriptors descriptors) { + public SearchClient(ServiceDescriptors descriptors, + MqPersistence persistence) { + super(descriptors.forId(ServiceId.Search), WmsaHome.getHostsFile(), GsonFactory::get); + + String inboxName = ServiceId.Search.name + ":" + "0"; + String outboxName = 
System.getProperty("service-name", UUID.randomUUID().toString()); + + outbox = new MqOutbox(persistence, inboxName, outboxName, UUID.randomUUID()); + + } + + + public MqOutbox outbox() { + return outbox; } @CheckReturnValue diff --git a/code/api/search-api/src/main/java/nu/marginalia/search/client/SearchMqEndpoints.java b/code/api/search-api/src/main/java/nu/marginalia/search/client/SearchMqEndpoints.java new file mode 100644 index 00000000..1c546b3e --- /dev/null +++ b/code/api/search-api/src/main/java/nu/marginalia/search/client/SearchMqEndpoints.java @@ -0,0 +1,6 @@ +package nu.marginalia.search.client; + +public class SearchMqEndpoints { + /** Flushes the URL caches, run if significant changes have occurred in the URLs database */ + public static final String FLUSH_CACHES = "FLUSH_CACHES"; +} diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/dict/DictionaryData.java b/code/features-index/lexicon/src/main/java/nu/marginalia/dict/DictionaryData.java index 830ed4a7..ea291052 100644 --- a/code/features-index/lexicon/src/main/java/nu/marginalia/dict/DictionaryData.java +++ b/code/features-index/lexicon/src/main/java/nu/marginalia/dict/DictionaryData.java @@ -9,7 +9,6 @@ public class DictionaryData { public DictionaryData(int bankSize) { this.bankSize = bankSize; - banks.add(new DictionaryDataBank(0, bankSize)); } @@ -36,4 +35,8 @@ public class DictionaryData { return banks.get(offset/ bankSize).keyEquals(offset, otherKey); } + public void clear() { + banks.clear(); + banks.add(new DictionaryDataBank(0, bankSize)); + } } diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/dict/DictionaryMap.java b/code/features-index/lexicon/src/main/java/nu/marginalia/dict/DictionaryMap.java index dc904441..1f9525a2 100644 --- a/code/features-index/lexicon/src/main/java/nu/marginalia/dict/DictionaryMap.java +++ b/code/features-index/lexicon/src/main/java/nu/marginalia/dict/DictionaryMap.java @@ -7,6 +7,8 @@ public interface DictionaryMap { return 
new OnHeapDictionaryMap(); } + void clear(); + int size(); int put(long key); diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/dict/OffHeapDictionaryHashMap.java b/code/features-index/lexicon/src/main/java/nu/marginalia/dict/OffHeapDictionaryHashMap.java index e17c9c19..6a7aa07f 100644 --- a/code/features-index/lexicon/src/main/java/nu/marginalia/dict/OffHeapDictionaryHashMap.java +++ b/code/features-index/lexicon/src/main/java/nu/marginalia/dict/OffHeapDictionaryHashMap.java @@ -58,6 +58,13 @@ public class OffHeapDictionaryHashMap implements DictionaryMap { } } + @Override + public void clear() { + dictionaryData.clear(); + initializeBuffers(); + sz.set(0); + } + @Override public int size() { return sz.get(); diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/dict/OnHeapDictionaryMap.java b/code/features-index/lexicon/src/main/java/nu/marginalia/dict/OnHeapDictionaryMap.java index 3b70e7e4..96dd5d13 100644 --- a/code/features-index/lexicon/src/main/java/nu/marginalia/dict/OnHeapDictionaryMap.java +++ b/code/features-index/lexicon/src/main/java/nu/marginalia/dict/OnHeapDictionaryMap.java @@ -6,6 +6,11 @@ public class OnHeapDictionaryMap implements DictionaryMap { private static final int DEFAULT_SIZE = Integer.getInteger("lexiconSizeHint", 100_000); private final Long2IntOpenHashMap entries = new Long2IntOpenHashMap(DEFAULT_SIZE, 0.75f); + @Override + public void clear() { + entries.clear(); + } + @Override public int size() { return entries.size(); diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexicon.java b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexicon.java index 40f9d73b..bd88efc8 100644 --- a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexicon.java +++ b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexicon.java @@ -9,6 +9,7 @@ import nu.marginalia.lexicon.journal.KeywordLexiconJournal; import 
org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; import java.nio.charset.StandardCharsets; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.locks.Lock; @@ -46,6 +47,12 @@ public class KeywordLexicon implements AutoCloseable { logger.info("Done creating dictionary writer"); } + public void reload() throws IOException { + logger.info("Reloading dictionary writer"); + journal.loadFile(bytes -> reverseIndex.put(hashFunction.hashBytes(bytes).padToLong())); + logger.info("Done reloading dictionary writer"); + } + public int getOrInsert(String macroWord) { return getOrInsert(macroWord.getBytes(StandardCharsets.UTF_8)); } diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexiconReadOnlyView.java b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexiconReadOnlyView.java index 9cdef151..ba7983a5 100644 --- a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexiconReadOnlyView.java +++ b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexiconReadOnlyView.java @@ -4,6 +4,7 @@ import com.google.common.cache.Cache; import com.google.common.cache.CacheBuilder; import lombok.SneakyThrows; +import java.io.IOException; import java.util.concurrent.TimeUnit; public class KeywordLexiconReadOnlyView { @@ -21,4 +22,8 @@ public class KeywordLexiconReadOnlyView { return cache.get(word, () -> writer.getReadOnly(word)); } + public boolean reload() throws IOException { + writer.reload(); + return true; + } } diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournal.java b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournal.java index 84a23247..013f2c49 100644 --- a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournal.java +++ 
b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournal.java @@ -64,6 +64,7 @@ public class KeywordLexiconJournal { } public void loadFile(Consumer loadJournalEntry) throws IOException { + journalFile.rewind(); journalFile.loadFile(loadJournalEntry); } } diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalFile.java b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalFile.java index 7473e4df..f7404296 100644 --- a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalFile.java +++ b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalFile.java @@ -27,6 +27,10 @@ public class KeywordLexiconJournalFile implements AutoCloseable { this.journalFile = journalFile; } + public void rewind() throws IOException { + journalFileRAF.seek(0); + } + public void loadFile(Consumer acceptEntry) throws IOException { if (!journalFile.exists()) { logger.info("File {} does not exist, can't load", journalFile); diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexService.java index 82ed2617..a0ff5582 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexService.java @@ -74,6 +74,17 @@ public class IndexService extends Service { volatile boolean initialized = false; + @MqRequest(endpoint = IndexMqEndpoints.INDEX_RELOAD_LEXICON) + public String reloadLexicon(String message) throws Exception { + + if (!opsService.reloadLexicon()) { + throw new IllegalStateException("Ops lock busy"); + } + + return "ok"; + } + + @MqRequest(endpoint = IndexMqEndpoints.INDEX_REPARTITION) public String repartition(String message) { if (!opsService.repartition()) 
{ diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexOpsService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexOpsService.java index 36377c7c..31192d37 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexOpsService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexOpsService.java @@ -3,11 +3,13 @@ package nu.marginalia.index.svc; import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.index.index.SearchIndex; +import nu.marginalia.lexicon.KeywordLexiconReadOnlyView; import spark.Request; import spark.Response; import spark.Spark; import javax.annotation.CheckReturnValue; +import java.io.IOException; import java.util.Optional; import java.util.concurrent.Callable; import java.util.concurrent.locks.ReentrantLock; @@ -18,12 +20,15 @@ public class IndexOpsService { private final SearchIndex index; private final IndexSearchSetsService searchSetService; + private final KeywordLexiconReadOnlyView lexicon; @Inject public IndexOpsService(SearchIndex index, - IndexSearchSetsService searchSetService) { + IndexSearchSetsService searchSetService, + KeywordLexiconReadOnlyView lexicon) { this.index = index; this.searchSetService = searchSetService; + this.lexicon = lexicon; } public boolean isBusy() { @@ -36,6 +41,9 @@ public class IndexOpsService { public boolean reindex() throws Exception { return run(index::switchIndex).isPresent(); } + public boolean reloadLexicon() throws Exception { + return run(lexicon::reload).isPresent(); + } public Object repartitionEndpoint(Request request, Response response) throws Exception { @@ -80,5 +88,6 @@ public class IndexOpsService { } } + } diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/SearchService.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/SearchService.java index 61ff69c3..b6e7a5d2 100644 --- 
a/code/services-core/search-service/src/main/java/nu/marginalia/search/SearchService.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/SearchService.java @@ -6,9 +6,12 @@ import lombok.SneakyThrows; import nu.marginalia.WebsiteUrl; import nu.marginalia.client.Context; import nu.marginalia.model.gson.GsonFactory; +import nu.marginalia.search.client.SearchMqEndpoints; +import nu.marginalia.search.db.DbUrlDetailsQuery; import nu.marginalia.search.svc.SearchFrontPageService; import nu.marginalia.search.svc.*; import nu.marginalia.service.server.*; +import nu.marginalia.service.server.mq.MqRequest; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import spark.Request; @@ -21,6 +24,7 @@ import java.nio.charset.StandardCharsets; public class SearchService extends Service { private final WebsiteUrl websiteUrl; + private final DbUrlDetailsQuery dbUrlDetailsQuery; private final StaticResources staticResources; private static final Logger logger = LoggerFactory.getLogger(SearchService.class); @@ -29,6 +33,7 @@ public class SearchService extends Service { @Inject public SearchService(BaseServiceParams params, WebsiteUrl websiteUrl, + DbUrlDetailsQuery dbUrlDetailsQuery, StaticResources staticResources, SearchFrontPageService frontPageService, SearchErrorPageService errorPageService, @@ -40,6 +45,7 @@ public class SearchService extends Service { super(params); this.websiteUrl = websiteUrl; + this.dbUrlDetailsQuery = dbUrlDetailsQuery; this.staticResources = staticResources; Spark.staticFiles.expireTime(600); @@ -70,6 +76,13 @@ public class SearchService extends Service { Spark.awaitInitialization(); } + @MqRequest(endpoint = SearchMqEndpoints.FLUSH_CACHES) + public String flushCaches(String unusedArg) { + logger.info("Flushing caches"); + dbUrlDetailsQuery.clearCaches(); + return "OK"; + } + private Object serveStatic(Request request, Response response) { String resource = request.params("resource"); staticResources.serveStatic("search", 
resource, request, response); diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ReconvertAndLoadProcess.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ReconvertAndLoadProcess.java index 1b329b97..be4b22ca 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ReconvertAndLoadProcess.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ReconvertAndLoadProcess.java @@ -11,6 +11,8 @@ import nu.marginalia.mqsm.StateFactory; import nu.marginalia.mqsm.graph.AbstractStateGraph; import nu.marginalia.mqsm.graph.GraphState; import nu.marginalia.mqsm.graph.ResumeBehavior; +import nu.marginalia.search.client.SearchClient; +import nu.marginalia.search.client.SearchMqEndpoints; import java.nio.file.Files; import java.nio.file.Path; @@ -25,14 +27,25 @@ public class ReconvertAndLoadProcess extends AbstractStateGraph { private static final String RECONVERT = "RECONVERT"; private static final String LOAD = "LOAD"; private static final String MOVE_INDEX_FILES = "MOVE_INDEX_FILES"; + private static final String RELOAD_LEXICON = "RELOAD_LEXICON"; + private static final String RELOAD_LEXICON_WAIT = "RELOAD_LEXICON_WAIT"; + private static final String FLUSH_CACHES = "FLUSH_CACHES"; private static final String END = "END"; private final ProcessService processService; + private final MqOutbox mqIndexOutbox; + private final MqOutbox mqSearchOutbox; @Inject - public ReconvertAndLoadProcess(StateFactory stateFactory, ProcessService processService) { + public ReconvertAndLoadProcess(StateFactory stateFactory, + ProcessService processService, + IndexClient indexClient, + SearchClient searchClient + ) { super(stateFactory); this.processService = processService; + this.mqIndexOutbox = indexClient.outbox(); + this.mqSearchOutbox = searchClient.outbox(); } @GraphState(name = INITIAL, next = RECONVERT) @@ -62,8 +75,8 @@ public 
class ReconvertAndLoadProcess extends AbstractStateGraph { error(); } - @GraphState(name = MOVE_INDEX_FILES, next = END, resume = ResumeBehavior.ERROR) - public String moveIndexFiles(String crawlJob) throws Exception { + @GraphState(name = MOVE_INDEX_FILES, next = RELOAD_LEXICON, resume = ResumeBehavior.ERROR) + public void moveIndexFiles(String crawlJob) throws Exception { Path indexData = Path.of("/vol/index.dat"); Path indexDest = Path.of("/vol/iw/0/page-index.dat"); @@ -71,7 +84,28 @@ public class ReconvertAndLoadProcess extends AbstractStateGraph { error("Index data not found"); Files.move(indexData, indexDest, StandardCopyOption.REPLACE_EXISTING); + } - return crawlJob; + @GraphState(name = RELOAD_LEXICON, next = RELOAD_LEXICON_WAIT, resume = ResumeBehavior.ERROR) + public long reloadLexicon() throws Exception { + return mqIndexOutbox.sendAsync(IndexMqEndpoints.INDEX_RELOAD_LEXICON, ""); + } + + @GraphState(name = RELOAD_LEXICON_WAIT, next = FLUSH_CACHES, resume = ResumeBehavior.RETRY) + public void reloadLexiconWait(long id) throws Exception { + var rsp = mqIndexOutbox.waitResponse(id); + + if (rsp.state() != MqMessageState.OK) { + error("RELOAD_LEXICON failed"); + } + } + + @GraphState(name = FLUSH_CACHES, next = END, resume = ResumeBehavior.RETRY) + public void flushCaches() throws Exception { + var rsp = mqSearchOutbox.send(SearchMqEndpoints.FLUSH_CACHES, ""); + + if (rsp.state() != MqMessageState.OK) { + error("FLUSH_CACHES failed"); + } } } From 00d9773b44fec5d31baac1d4c688cb6a7e7d7f6c Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 11 Jul 2023 21:37:32 +0200 Subject: [PATCH 032/157] (control) Better looking progress bar --- .../java/nu/marginalia/control/model/ProcessHeartbeat.java | 4 ++-- .../src/main/resources/templates/control/processes.hdb | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java 
b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java index ddbe0d35..e072a2e2 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java @@ -5,7 +5,7 @@ public record ProcessHeartbeat( String processBase, String uuid, double lastSeenMillis, - int progress, + Integer progress, String status ) { public boolean isMissing() { @@ -17,7 +17,7 @@ public record ProcessHeartbeat( public String progressStyle() { if ("RUNNING".equals(status) && progress > 0) { return """ - background: linear-gradient(90deg, #ccc %d%%, #ccc %d%%, #fff %d%%) + background: linear-gradient(90deg, #fff 0%%, #ccc %d%%, #fff %d%%) """.formatted(progress, progress, progress); } return ""; diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/processes.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/processes.hdb index 1ab85c66..3f6e08eb 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/processes.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/processes.hdb @@ -24,7 +24,7 @@ {{processId}} {{uuid}} {{status}} - {{progress}} + {{#if progress}}{{progress}}%{{/if}} {{#unless isStopped}}{{lastSeenMillis}}{{/unless}} {{/each}} From 0b0cf48849c663e6accaa1415b9ba06c74f4fa8d Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 11 Jul 2023 23:11:02 +0200 Subject: [PATCH 033/157] (control) Better looking UUIDs --- .../nu/marginalia/control/model/EventLogEntry.java | 11 ++++++++++- .../control/model/MessageQueueEntry.java | 11 ++++++++++- .../marginalia/control/model/ProcessHeartbeat.java | 11 ++++++++++- .../marginalia/control/model/ServiceHeartbeat.java | 12 ++++++++++-- .../nu/marginalia/control/svc/EventLogService.java | 8 +------- 
/**
 * One row of the service event log.
 *
 * <p>{@code instanceFull} carries the complete instance id; the derived accessors expose an
 * abbreviated id and two colours taken from fixed character ranges of that id.
 */
public record EventLogEntry(
        String serviceName,
        String instanceFull,
        String eventTime,
        String eventType,
        String eventMessage)
{
    /** Colour returned when the instance id is missing or too short to derive one from. */
    private static final String FALLBACK_COLOR = "#ccc";

    /** @return the first 8 characters of the instance id, the whole id if shorter, or "" if null */
    public String instance() {
        if (instanceFull == null) {
            return "";
        }
        return instanceFull.length() > 8 ? instanceFull.substring(0, 8) : instanceFull;
    }

    /** @return '#' followed by characters 0-5 of the instance id */
    public String instanceColor() {
        return colorAt(0);
    }

    /** @return '#' followed by characters 25-30 of the instance id */
    public String instanceColor2() {
        return colorAt(25);
    }

    /** Six characters starting at {@code start}, prefixed with '#'; the fallback if unavailable. */
    private String colorAt(int start) {
        int end = start + 6;
        if (instanceFull == null || instanceFull.length() < end) {
            return FALLBACK_COLOR;
        }
        return '#' + instanceFull.substring(start, end);
    }
}
ownerInstance, + String ownerInstanceFull, long ownerTick, String state, String createdTime, @@ -14,4 +14,13 @@ public record MessageQueueEntry ( int ttl ) { + public String ownerInstance() { + return ownerInstanceFull.substring(0, 8); + } + public String ownerInstanceColor() { + return '#' + ownerInstanceFull.substring(0, 6); + } + public String ownerInstanceColor2() { + return '#' + ownerInstanceFull.substring(25, 31); + } } diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java index e072a2e2..703635d0 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java @@ -3,11 +3,20 @@ package nu.marginalia.control.model; public record ProcessHeartbeat( String processId, String processBase, - String uuid, + String uuidFull, double lastSeenMillis, Integer progress, String status ) { + public String uuid() { + return uuidFull.substring(0, 8); + } + public String uuidColor() { + return '#' + uuidFull.substring(0, 6); + } + public String uuidColor2() { + return '#' + uuidFull.substring(25, 31); + } public boolean isMissing() { return lastSeenMillis > 10000; } diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ServiceHeartbeat.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ServiceHeartbeat.java index dcb4d94e..f43d9058 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ServiceHeartbeat.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ServiceHeartbeat.java @@ -3,12 +3,20 @@ package nu.marginalia.control.model; public record ServiceHeartbeat( String serviceId, String 
serviceBase, - String uuid, + String uuidFull, double lastSeenMillis, boolean alive ) { public boolean isMissing() { return lastSeenMillis > 10000; } - + public String uuid() { + return uuidFull.substring(0, 8); + } + public String uuidColor() { + return '#' + uuidFull.substring(0, 6); + } + public String uuidColor2() { + return '#' + uuidFull.substring(25, 31); + } } diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/EventLogService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/EventLogService.java index f54e6996..8167c71c 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/EventLogService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/EventLogService.java @@ -32,7 +32,7 @@ public class EventLogService { while (rs.next()) { entries.add(new EventLogEntry( rs.getString("SERVICE_NAME"), - trimUUID(rs.getString("INSTANCE")), + rs.getString("INSTANCE"), rs.getTimestamp("EVENT_TIME").toLocalDateTime().toLocalTime().toString(), rs.getString("EVENT_TYPE"), rs.getString("EVENT_MESSAGE") @@ -44,11 +44,5 @@ public class EventLogService { throw new RuntimeException(ex); } } - private String trimUUID(String uuid) { - if (uuid.length() > 8) { - return uuid.substring(0, 8); - } - return uuid; - } } diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/HeartbeatService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/HeartbeatService.java index bd7f56c7..5f8b28f3 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/HeartbeatService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/HeartbeatService.java @@ -34,7 +34,7 @@ public class HeartbeatService { heartbeats.add(new ServiceHeartbeat( rs.getString("SERVICE_NAME"), rs.getString("SERVICE_BASE"), - 
trimUUID(rs.getString("INSTANCE")), + rs.getString("INSTANCE"), rs.getLong("TSDIFF") / 1000., rs.getBoolean("ALIVE") )); @@ -59,12 +59,13 @@ public class HeartbeatService { var rs = stmt.executeQuery(); while (rs.next()) { + int progress = rs.getInt("PROGRESS"); heartbeats.add(new ProcessHeartbeat( rs.getString("PROCESS_NAME"), rs.getString("PROCESS_BASE"), - trimUUID(rs.getString("INSTANCE")), + rs.getString("INSTANCE"), rs.getLong("TSDIFF") / 1000., - rs.getInt("PROGRESS"), + progress < 0 ? null : progress, rs.getString("STATUS") )); } @@ -75,10 +76,5 @@ public class HeartbeatService { return heartbeats; } - private String trimUUID(String uuid) { - if (uuid.length() > 8) { - return uuid.substring(0, 8); - } - return uuid; - } + } diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueViewService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueViewService.java index c8016c78..9531c0b4 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueViewService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueViewService.java @@ -38,7 +38,7 @@ public class MessageQueueViewService { rs.getString("SENDER_INBOX"), rs.getString("RECIPIENT_INBOX"), rs.getString("FUNCTION"), - trimUUID(rs.getString("OWNER_INSTANCE")), + rs.getString("OWNER_INSTANCE"), rs.getLong("OWNER_TICK"), rs.getString("STATE"), rs.getTimestamp("CREATED_TIME").toLocalDateTime().toLocalTime().toString(), @@ -52,15 +52,5 @@ public class MessageQueueViewService { throw new RuntimeException(ex); } } - private String trimUUID(String uuid) { - if (null == uuid) { - return ""; - } - - if (uuid.length() > 8) { - return uuid.substring(0, 8); - } - return uuid; - } } diff --git a/code/services-satellite/control-service/src/main/resources/static/control/style.css 
b/code/services-satellite/control-service/src/main/resources/static/control/style.css index ada93e58..26b96fbf 100644 --- a/code/services-satellite/control-service/src/main/resources/static/control/style.css +++ b/code/services-satellite/control-service/src/main/resources/static/control/style.css @@ -8,6 +8,11 @@ body { grid-template-areas: "left right"; } +.uuidPip { + margin-left: 0.25ch; + border-radius: 2ch; + border: 1px solid #ccc; +} h1 { font-family: serif; } diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/events.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/events.hdb index 2c0b20b8..b1cf526d 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/events.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/events.hdb @@ -23,7 +23,10 @@ {{#each events}} {{serviceName}} - {{instance}} + +    + {{instance}} + {{eventTime}} {{eventType}} {{eventMessage}} diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/message-queue.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/message-queue.hdb index 41bb73f8..d4f071e3 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/message-queue.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/message-queue.hdb @@ -33,7 +33,10 @@ {{recipientInbox}} {{senderInbox}} {{function}} - {{ownerInstance}} + +    + {{ownerInstance}} + {{ownerTick}} {{state}} {{createdTime}} diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/processes.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/processes.hdb index 3f6e08eb..1c0f6ebe 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/processes.hdb +++ 
b/code/services-satellite/control-service/src/main/resources/templates/control/processes.hdb @@ -22,7 +22,10 @@ {{#each heartbeats}} {{processId}} - {{uuid}} + +    + {{uuid}} + {{status}} {{#if progress}}{{progress}}%{{/if}} {{#unless isStopped}}{{lastSeenMillis}}{{/unless}} diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/services.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/services.hdb index fc7b0c5b..6a506194 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/services.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/services.hdb @@ -20,7 +20,10 @@ {{#each heartbeats}} {{serviceId}} - {{uuid}} + +    + {{uuid}} + {{lastSeenMillis}} {{/each}} From 7087ab5f078c563c69128b89f31256d273227c22 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 11 Jul 2023 23:11:34 +0200 Subject: [PATCH 034/157] (run) Reduce nginx access log noise for local setup --- run/nginx-site.conf | 1 + 1 file changed, 1 insertion(+) diff --git a/run/nginx-site.conf b/run/nginx-site.conf index f9887ad3..5a32bb1a 100644 --- a/run/nginx-site.conf +++ b/run/nginx-site.conf @@ -54,6 +54,7 @@ server { location / { proxy_pass http://control-service:5090/public/; + access_log off; } } \ No newline at end of file From 74caf9e38aa4e0e276f158d1e9f44f5ea565963d Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 12 Jul 2023 17:47:36 +0200 Subject: [PATCH 035/157] (processes) Remove forEach-constructs in favor of iterators. 
--- .../process/log/WorkLoadIterable.java | 52 ++++++++ .../nu/marginalia/process/log/WorkLog.java | 48 +++----- .../marginalia/process/log/WorkLogEntry.java | 11 ++ .../spec/CrawlerSpecificationLoader.java | 32 +++-- .../src/main/java/plan/CrawlPlan.java | 112 +++++------------- .../marginalia/converting/ConverterMain.java | 5 +- .../java/nu/marginalia/crawl/CrawlerMain.java | 15 +-- .../nu/marginalia/loading/LoaderMain.java | 18 +-- .../tools/ExperimentRunnerMain.java | 11 +- 9 files changed, 161 insertions(+), 143 deletions(-) create mode 100644 code/common/process/src/main/java/nu/marginalia/process/log/WorkLoadIterable.java diff --git a/code/common/process/src/main/java/nu/marginalia/process/log/WorkLoadIterable.java b/code/common/process/src/main/java/nu/marginalia/process/log/WorkLoadIterable.java new file mode 100644 index 00000000..992c1991 --- /dev/null +++ b/code/common/process/src/main/java/nu/marginalia/process/log/WorkLoadIterable.java @@ -0,0 +1,52 @@ +package nu.marginalia.process.log; + +import lombok.SneakyThrows; +import org.jetbrains.annotations.NotNull; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Iterator; +import java.util.Optional; +import java.util.function.Function; + +class WorkLoadIterable implements Iterable { + + private final Path logFile; + private final Function> mapper; + + WorkLoadIterable(Path logFile, Function> mapper) { + this.logFile = logFile; + this.mapper = mapper; + } + + @NotNull + @Override + @SneakyThrows + public Iterator iterator() { + var stream = Files.lines(logFile); + return new Iterator<>() { + final Iterator iter = stream + .filter(WorkLogEntry::isJobId) + .map(WorkLogEntry::parse) + .map(mapper) + .filter(Optional::isPresent) + .map(Optional::get) + .iterator(); + + @Override + public boolean hasNext() { + if (iter.hasNext()) { + return true; + } else { + stream.close(); + return false; + } + } + + @Override + public T next() { + return iter.next(); + } + }; + } +} diff --git 
a/code/common/process/src/main/java/nu/marginalia/process/log/WorkLog.java b/code/common/process/src/main/java/nu/marginalia/process/log/WorkLog.java index db5b22a8..c552d8f6 100644 --- a/code/common/process/src/main/java/nu/marginalia/process/log/WorkLog.java +++ b/code/common/process/src/main/java/nu/marginalia/process/log/WorkLog.java @@ -1,20 +1,14 @@ package nu.marginalia.process.log; -import com.google.errorprone.annotations.MustBeClosed; -import org.apache.logging.log4j.util.Strings; - -import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.time.LocalDateTime; -import java.util.HashSet; -import java.util.Set; -import java.util.function.Consumer; +import java.util.*; +import java.util.function.Function; import java.util.regex.Pattern; -import java.util.stream.Stream; public class WorkLog implements AutoCloseable { private final Set finishedJobs = new HashSet<>(); @@ -27,24 +21,22 @@ public class WorkLog implements AutoCloseable { writeLogEntry("# Starting WorkLog @ " + LocalDateTime.now()); } - public static void readLog(Path logFile, Consumer entryConsumer) throws FileNotFoundException { - if (!Files.exists(logFile)) { - throw new FileNotFoundException("Log file not found " + logFile); - } - - try (var entries = streamLog(logFile)) { - entries.forEach(entryConsumer); - } catch (IOException e) { - e.printStackTrace(); - } + /** Create an iterable over the work log + *
+ * Caveat: If the iterator is not iterated to the end, + * it will leak a file descriptor. + */ + public static Iterable iterable(Path logFile) { + return new WorkLoadIterable<>(logFile, Optional::of); } - @MustBeClosed - public static Stream streamLog(Path logFile) throws IOException { - return Files.lines(logFile).filter(WorkLog::isJobId).map(line -> { - String[] parts = line.split("\\s+"); - return new WorkLogEntry(parts[0], parts[1], parts[2], Integer.parseInt(parts[3])); - }); + /** Create an iterable over the work log, applying a mapping function to each item + *
+ * Caveat: If the iterator is not iterated to the end, + * it will leak a file descriptor. + */ + public static Iterable iterableMap(Path logFile, Function> mapper) { + return new WorkLoadIterable<>(logFile, mapper); } private void loadLog(Path logFile) throws IOException { @@ -53,14 +45,12 @@ public class WorkLog implements AutoCloseable { } try (var lines = Files.lines(logFile)) { - lines.filter(WorkLog::isJobId).map(this::getJobIdFromWrittenString).forEach(finishedJobs::add); + lines.filter(WorkLogEntry::isJobId) + .map(this::getJobIdFromWrittenString) + .forEach(finishedJobs::add); } } - private static boolean isJobId(String s) { - return Strings.isNotBlank(s) && !s.startsWith("#"); - } - private static final Pattern splitPattern = Pattern.compile("\\s+"); private String getJobIdFromWrittenString(String s) { diff --git a/code/common/process/src/main/java/nu/marginalia/process/log/WorkLogEntry.java b/code/common/process/src/main/java/nu/marginalia/process/log/WorkLogEntry.java index 9f9579f3..31b93610 100644 --- a/code/common/process/src/main/java/nu/marginalia/process/log/WorkLogEntry.java +++ b/code/common/process/src/main/java/nu/marginalia/process/log/WorkLogEntry.java @@ -1,4 +1,15 @@ package nu.marginalia.process.log; +import org.apache.logging.log4j.util.Strings; + public record WorkLogEntry(String id, String ts, String path, int cnt) { + + static WorkLogEntry parse(String line) { + String[] parts = line.split("\\s+"); + return new WorkLogEntry(parts[0], parts[1], parts[2], Integer.parseInt(parts[3])); + } + + static boolean isJobId(String line) { + return Strings.isNotBlank(line) && !line.startsWith("#"); + } } diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/spec/CrawlerSpecificationLoader.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/spec/CrawlerSpecificationLoader.java index cf6fb1fb..2ea956d5 100644 --- 
a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/spec/CrawlerSpecificationLoader.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/spec/CrawlerSpecificationLoader.java @@ -3,26 +3,38 @@ package nu.marginalia.crawling.model.spec; import com.github.luben.zstd.ZstdInputStream; import com.google.gson.Gson; import com.google.gson.JsonStreamParser; +import lombok.SneakyThrows; import nu.marginalia.model.gson.GsonFactory; import java.io.BufferedReader; import java.io.FileInputStream; -import java.io.IOException; import java.io.InputStreamReader; import java.nio.file.Path; -import java.util.function.Consumer; +import java.util.Iterator; public class CrawlerSpecificationLoader { private final static Gson gson = GsonFactory.get(); - public static void readInputSpec(Path inputSpec, Consumer consumer) { - try (var inputStream = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(inputSpec.toFile()))))) { - var parser = new JsonStreamParser(inputStream); - while (parser.hasNext()) { - consumer.accept(gson.fromJson(parser.next(), CrawlingSpecification.class)); + @SneakyThrows + public static Iterable asIterable(Path inputSpec) { + var inputStream = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(inputSpec.toFile())))); + var parser = new JsonStreamParser(inputStream); + + return () -> new Iterator<>() { + @Override + @SneakyThrows + public boolean hasNext() { + if (!parser.hasNext()) { + inputStream.close(); + return false; + } + return true; } - } catch (IOException e) { - e.printStackTrace(); - } + + @Override + public CrawlingSpecification next() { + return gson.fromJson(parser.next(), CrawlingSpecification.class); + } + }; } } diff --git a/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java b/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java index a23cdede..b425e29b 100644 --- 
a/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java +++ b/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java @@ -78,100 +78,48 @@ public class CrawlPlan { return new WorkLog(process.getLogFile()); } - public void forEachCrawlingSpecification(Consumer consumer) { - CrawlerSpecificationLoader.readInputSpec(getJobSpec(), consumer); - } - - public void forEachCrawlingLogEntry(Consumer consumer) throws FileNotFoundException { - WorkLog.readLog(this.crawl.getLogFile(), consumer); - } - public void forEachProcessingLogEntry(Consumer consumer) throws FileNotFoundException { - WorkLog.readLog(this.process.getLogFile(), consumer); - } - - public void forEachCrawledDomain(Consumer consumer) { - final CrawledDomainReader reader = new CrawledDomainReader(); - - try (Stream entryStream = WorkLog.streamLog(crawl.getLogFile())) { - entryStream - .map(WorkLogEntry::path) - .map(this::getCrawledFilePath) - .map(reader::readOptionally) - .filter(Optional::isPresent) - .map(Optional::get) - .forEach(consumer); - } - catch (IOException ex) { - logger.warn("Failed to read domains", ex); - - throw new RuntimeException(ex); - } + public Iterable crawlingSpecificationIterable() { + return CrawlerSpecificationLoader.asIterable(getJobSpec()); } public int countCrawledDomains() { - try (Stream entryStream = WorkLog.streamLog(crawl.getLogFile())) { - return (int) entryStream - .map(WorkLogEntry::path) - .count(); - } - catch (IOException ex) { - return 0; + int count = 0; + for (var ignored : WorkLog.iterable(crawl.getLogFile())) { + count++; } + return count; } - public void forEachCrawledDomain(Predicate idReadPredicate, Consumer consumer) { + public Iterable domainsIterable() { final CrawledDomainReader reader = new CrawledDomainReader(); - try (Stream entryStream = WorkLog.streamLog(crawl.getLogFile())) { - entryStream - .filter(entry -> idReadPredicate.test(entry.id())) - .map(WorkLogEntry::path) - .map(this::getCrawledFilePath) - .filter(path -> { - 
if (!Files.exists(path)) { - logger.warn("File not found: {}", path); - return false; - } - return true; - }) - .map(reader::readOptionally) - .filter(Optional::isPresent) - .map(Optional::get) - .forEach(consumer); - } - catch (IOException ex) { - logger.error("Failed to read domains", ex); - - throw new RuntimeException(ex); - } - } - public DomainsIterable domainsIterable() throws IOException { - return new DomainsIterable(); + return WorkLog.iterableMap(crawl.getLogFile(), + entry -> { + var path = getCrawledFilePath(entry.path()); + if (!Files.exists(path)) { + logger.warn("File not found: {}", path); + return Optional.empty(); + } + return reader.readOptionally(path); + }); } - public class DomainsIterable implements Iterable, AutoCloseable { - private final Stream stream; + public Iterable domainsIterable(Predicate idPredicate) { + final CrawledDomainReader reader = new CrawledDomainReader(); - DomainsIterable() throws IOException { - final CrawledDomainReader reader = new CrawledDomainReader(); + return WorkLog.iterableMap(crawl.getLogFile(), + entry -> { + if (!idPredicate.test(entry.path())) { + return Optional.empty(); + } - stream = WorkLog.streamLog(crawl.getLogFile()) - .map(WorkLogEntry::path) - .map(CrawlPlan.this::getCrawledFilePath) - .map(reader::readOptionally) - .filter(Optional::isPresent) - .map(Optional::get); - } + var path = getCrawledFilePath(entry.path()); - @Override - public void close() { - stream.close(); - } - - @NotNull - @Override - public Iterator iterator() { - return stream.iterator(); - } + if (!Files.exists(path)) { + logger.warn("File not found: {}", path); + return Optional.empty(); + } + return reader.readOptionally(path); + }); } } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java index 8f49c853..16381cc2 100644 --- 
a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java @@ -98,8 +98,9 @@ public class ConverterMain { }; - - plan.forEachCrawledDomain(id -> !processLog.isJobFinished(id), pipe::accept); + for (var domain : plan.domainsIterable(id -> !processLog.isJobFinished(id))) { + pipe.accept(domain); + } pipe.join(); } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java index 4c436ca3..a0a3f8b7 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java @@ -100,18 +100,19 @@ public class CrawlerMain implements AutoCloseable { public void run() throws InterruptedException { // First a validation run to ensure the file is all good to parse logger.info("Validating JSON"); - AtomicInteger countTotal = new AtomicInteger(); - AtomicInteger countProcessed = new AtomicInteger(); + int countTotal = 0; + int countProcessed = 0; - plan.forEachCrawlingSpecification(unused -> countTotal.incrementAndGet()); + for (var unused : plan.crawlingSpecificationIterable()) { + countTotal++; + } logger.info("Let's go"); - // TODO: Make this into an iterable instead so we can abort it - plan.forEachCrawlingSpecification((spec) -> { - heartbeat.setProgress(countProcessed.incrementAndGet() / (double) countTotal.get()); + for (var spec : plan.crawlingSpecificationIterable()) { + heartbeat.setProgress(countProcessed / (double) countTotal); startCrawlTask(spec); - }); + } } diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java index c70573a6..30b84527 100644 --- 
a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java @@ -107,16 +107,20 @@ public class LoaderMain { var logFile = plan.process.getLogFile(); try { - AtomicInteger loadTotal = new AtomicInteger(); - WorkLog.readLog(logFile, entry -> loadTotal.incrementAndGet()); - LoaderMain.loadTotal = loadTotal.get(); + int loadTotal = 0; + int loaded = 0; - AtomicInteger loaded = new AtomicInteger(); - WorkLog.readLog(logFile, entry -> { - heartbeat.setProgress(loaded.incrementAndGet() / (double) loadTotal.get()); + for (var unused : WorkLog.iterable(logFile)) { + loadTotal++; + } + + LoaderMain.loadTotal = loadTotal; + + for (var entry : WorkLog.iterable(logFile)) { + heartbeat.setProgress(loaded++ / (double) loadTotal); load(plan, entry.path(), entry.cnt()); - }); + } running = false; processorThread.join(); diff --git a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java index 09a3cc71..4febc294 100644 --- a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java +++ b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java @@ -47,14 +47,13 @@ public class ExperimentRunnerMain { experiment.args(Arrays.copyOfRange(args, 2, args.length)); Map idToDomain = new HashMap<>(); - plan.forEachCrawlingSpecification(spec -> { + for (var spec : plan.crawlingSpecificationIterable()) { idToDomain.put(spec.id, spec.domain); - }); + } - plan.forEachCrawledDomain( - id -> experiment.isInterested(idToDomain.get(id)), - experiment::process - ); + for (var domain : plan.domainsIterable(id -> experiment.isInterested(idToDomain.get(id)))) { + experiment.process(domain); + } experiment.onFinish(); From 363368b15010184b80bde65c0d9e3dec5a48bd3d Mon Sep 17 00:00:00 2001 From: Viktor Lofgren 
Date: Wed, 12 Jul 2023 17:48:37 +0200 Subject: [PATCH 036/157] (converter) Remove auto-refresh. --- .../src/main/resources/templates/control/events.hdb | 1 - .../src/main/resources/templates/control/index.hdb | 1 - .../src/main/resources/templates/control/message-queue.hdb | 1 - .../src/main/resources/templates/control/processes.hdb | 1 - .../src/main/resources/templates/control/services.hdb | 1 - 5 files changed, 5 deletions(-) diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/events.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/events.hdb index b1cf526d..9791fad3 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/events.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/events.hdb @@ -4,7 +4,6 @@ Control Service - {{> control/partials/nav}} diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/index.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/index.hdb index a1331540..6ca3119f 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/index.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/index.hdb @@ -4,7 +4,6 @@ Control Service - {{> control/partials/nav}} diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/message-queue.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/message-queue.hdb index d4f071e3..16597fb6 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/message-queue.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/message-queue.hdb @@ -4,7 +4,6 @@ Control Service - {{> control/partials/nav}} diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/processes.hdb 
b/code/services-satellite/control-service/src/main/resources/templates/control/processes.hdb index 1c0f6ebe..c1225dd6 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/processes.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/processes.hdb @@ -4,7 +4,6 @@ Control Service - {{> control/partials/nav}} diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/services.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/services.hdb index 6a506194..57800f7c 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/services.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/services.hdb @@ -4,7 +4,6 @@ Control Service - {{> control/partials/nav}} From 5deec636678b768448e9e2717a34b0cd0d444d9c Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 12 Jul 2023 18:04:06 +0200 Subject: [PATCH 037/157] (work-log) Better tests --- code/common/process/build.gradle | 12 +++ .../nu/marginalia/process/log/WorkLog.java | 38 ++++++-- .../marginalia/process/log/WorkLogTest.java | 96 +++++++++++++++++++ .../nu/marginalia/crawling/WorkLogTest.java | 56 ----------- 4 files changed, 136 insertions(+), 66 deletions(-) create mode 100644 code/common/process/src/test/java/nu/marginalia/process/log/WorkLogTest.java delete mode 100644 code/processes/crawling-process/src/test/java/nu/marginalia/crawling/WorkLogTest.java diff --git a/code/common/process/build.gradle b/code/common/process/build.gradle index be27f536..d1b9ae5b 100644 --- a/code/common/process/build.gradle +++ b/code/common/process/build.gradle @@ -30,4 +30,16 @@ dependencies { testImplementation libs.mockito } +test { + maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1 + maxHeapSize = "8G" + useJUnitPlatform() +} +task fastTests(type: Test) { + maxParallelForks = 
Runtime.runtime.availableProcessors().intdiv(2) ?: 1 + maxHeapSize = "8G" + useJUnitPlatform { + excludeTags "slow" + } +} \ No newline at end of file diff --git a/code/common/process/src/main/java/nu/marginalia/process/log/WorkLog.java b/code/common/process/src/main/java/nu/marginalia/process/log/WorkLog.java index c552d8f6..a66b62e7 100644 --- a/code/common/process/src/main/java/nu/marginalia/process/log/WorkLog.java +++ b/code/common/process/src/main/java/nu/marginalia/process/log/WorkLog.java @@ -1,5 +1,8 @@ package nu.marginalia.process.log; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import java.io.FileOutputStream; import java.io.IOException; import java.nio.charset.StandardCharsets; @@ -10,15 +13,28 @@ import java.util.*; import java.util.function.Function; import java.util.regex.Pattern; +/** WorkLog is a journal of work done by a process, + * so that it can be resumed after a crash or termination. + *

+ * The log file itself is a tab-separated file with the following columns: + *

    + *
  • Job ID
  • + *
  • Timestamp
  • + *
  • Location (e.g. path on disk)
  • + *
  • Size
  • + *

    + * + */ public class WorkLog implements AutoCloseable { private final Set finishedJobs = new HashSet<>(); private final FileOutputStream logWriter; + private final Logger logger = LoggerFactory.getLogger(getClass()); public WorkLog(Path logFile) throws IOException { loadLog(logFile); logWriter = new FileOutputStream(logFile.toFile(), true); - writeLogEntry("# Starting WorkLog @ " + LocalDateTime.now()); + writeLogEntry("# Starting WorkLog @ " + LocalDateTime.now() + "\n"); } /** Create an iterable over the work log @@ -39,6 +55,17 @@ public class WorkLog implements AutoCloseable { return new WorkLoadIterable<>(logFile, mapper); } + // Use synchro over concurrent set to avoid competing writes + // - correct is better than fast here, it's sketchy enough to use + // a PrintWriter + public synchronized void setJobToFinished(String id, String where, int size) throws IOException { + if (!finishedJobs.add(id)) { + logger.warn("Setting job {} to finished, but it was already finished", id); + } + + writeLogEntry(String.format("%s\t%s\t%s\t%d\n",id, LocalDateTime.now(), where, size)); + } + private void loadLog(Path logFile) throws IOException { if (!Files.exists(logFile)) { return; @@ -61,19 +88,10 @@ public class WorkLog implements AutoCloseable { return finishedJobs.contains(id); } - // Use synchro over concurrent set to avoid competing writes - // - correct is better than fast here, it's sketchy enough to use - // a PrintWriter - public synchronized void setJobToFinished(String id, String where, int size) throws IOException { - finishedJobs.add(id); - - writeLogEntry(String.format("%s\t%s\t%s\t%d",id, LocalDateTime.now(), where, size)); - } private void writeLogEntry(String entry) throws IOException { logWriter.write(entry.getBytes(StandardCharsets.UTF_8)); - logWriter.write("\n".getBytes(StandardCharsets.UTF_8)); logWriter.flush(); } diff --git a/code/common/process/src/test/java/nu/marginalia/process/log/WorkLogTest.java 
b/code/common/process/src/test/java/nu/marginalia/process/log/WorkLogTest.java new file mode 100644 index 00000000..8ed7e68f --- /dev/null +++ b/code/common/process/src/test/java/nu/marginalia/process/log/WorkLogTest.java @@ -0,0 +1,96 @@ +package nu.marginalia.process.log; + +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.HashMap; +import java.util.Map; + +import static org.junit.jupiter.api.Assertions.*; + +class WorkLogTest { + + private Path logFile; + @BeforeEach + public void setUp() throws IOException { + logFile = Files.createTempFile("worklog", ".log"); + } + + @AfterEach + public void tearDown() throws IOException { + Files.deleteIfExists(logFile); + } + + + @Test + public void testLog() throws IOException { + var log = new WorkLog(logFile); + log.setJobToFinished("A", "a.txt",1); + log.setJobToFinished("B", "b.txt",2); + log.setJobToFinished("C", "c.txt",3); + assertTrue(log.isJobFinished("A")); + assertTrue(log.isJobFinished("B")); + assertTrue(log.isJobFinished("C")); + assertFalse(log.isJobFinished("E")); + } + + @Test + public void testLogResume() throws Exception { + WorkLog log = new WorkLog(logFile); + log.setJobToFinished("A", "a.txt",1); + log.setJobToFinished("B", "b.txt",2); + log.setJobToFinished("C", "c.txt",3); + log.close(); + log = new WorkLog(logFile); + log.setJobToFinished("E", "e.txt",4); + assertTrue(log.isJobFinished("A")); + assertTrue(log.isJobFinished("B")); + assertTrue(log.isJobFinished("C")); + assertTrue(log.isJobFinished("E")); + log.close(); + + Files.readAllLines(logFile).forEach(System.out::println); + } + + @Test + public void test() { + try (var workLog = new WorkLog(logFile)) { + workLog.setJobToFinished("test", "loc1", 4); + workLog.setJobToFinished("test2", "loc2", 5); + workLog.setJobToFinished("test3", "loc3", 1); + } catch (Exception e) { + 
e.printStackTrace(); + fail(); + } + + try (var workLog = new WorkLog(logFile)) { + workLog.setJobToFinished("test4", "loc4", 0); + + assertTrue(workLog.isJobFinished("test")); + assertTrue(workLog.isJobFinished("test2")); + assertTrue(workLog.isJobFinished("test3")); + assertTrue(workLog.isJobFinished("test4")); + assertFalse(workLog.isJobFinished("test5")); + } + catch (Exception e) { + e.printStackTrace(); + fail(); + } + + + Map entriesById = new HashMap<>(); + WorkLog.iterable(logFile).forEach(e -> entriesById.put(e.id(), e)); + + assertEquals(4, entriesById.size()); + + assertEquals("loc1", entriesById.get("test").path()); + assertEquals("loc2", entriesById.get("test2").path()); + assertEquals("loc3", entriesById.get("test3").path()); + assertEquals("loc4", entriesById.get("test4").path()); + + } +} \ No newline at end of file diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/WorkLogTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/WorkLogTest.java deleted file mode 100644 index 34046445..00000000 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/WorkLogTest.java +++ /dev/null @@ -1,56 +0,0 @@ -package nu.marginalia.crawling; - -import nu.marginalia.process.log.WorkLog; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; - -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertTrue; - -class WorkLogTest { - Path outFile; - @BeforeEach - public void setUp() throws IOException { - outFile = Files.createTempFile(getClass().getSimpleName(), ".log"); - } - @AfterEach - public void tearDown() throws IOException { - Files.delete(outFile); - } - - @Test - public void testLog() throws IOException { - var log = new WorkLog(outFile); - log.setJobToFinished("A", "a.txt",1); 
- log.setJobToFinished("B", "b.txt",2); - log.setJobToFinished("C", "c.txt",3); - assertTrue(log.isJobFinished("A")); - assertTrue(log.isJobFinished("B")); - assertTrue(log.isJobFinished("C")); - assertFalse(log.isJobFinished("E")); - } - - @Test - public void testLogResume() throws Exception { - WorkLog log = new WorkLog(outFile); - log.setJobToFinished("A", "a.txt",1); - log.setJobToFinished("B", "b.txt",2); - log.setJobToFinished("C", "c.txt",3); - log.close(); - log = new WorkLog(outFile); - log.setJobToFinished("E", "e.txt",4); - assertTrue(log.isJobFinished("A")); - assertTrue(log.isJobFinished("B")); - assertTrue(log.isJobFinished("C")); - assertTrue(log.isJobFinished("E")); - log.close(); - - Files.readAllLines(outFile).forEach(System.out::println); - } - -} From 8c16a2aedea1787e39a344b83e0890dc61781166 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 12 Jul 2023 18:10:05 +0200 Subject: [PATCH 038/157] (work-log, minor) Clean up code --- .../nu/marginalia/process/log/WorkLog.java | 35 ++++++++----------- .../marginalia/process/log/WorkLogEntry.java | 10 +++++- 2 files changed, 23 insertions(+), 22 deletions(-) diff --git a/code/common/process/src/main/java/nu/marginalia/process/log/WorkLog.java b/code/common/process/src/main/java/nu/marginalia/process/log/WorkLog.java index a66b62e7..86dd100c 100644 --- a/code/common/process/src/main/java/nu/marginalia/process/log/WorkLog.java +++ b/code/common/process/src/main/java/nu/marginalia/process/log/WorkLog.java @@ -31,7 +31,13 @@ public class WorkLog implements AutoCloseable { private final Logger logger = LoggerFactory.getLogger(getClass()); public WorkLog(Path logFile) throws IOException { - loadLog(logFile); + if (Files.exists(logFile)) { + try (var lines = Files.lines(logFile)) { + lines.filter(WorkLogEntry::isJobId) + .map(WorkLogEntry::parseJobIdFromLogLine) + .forEach(finishedJobs::add); + } + } logWriter = new FileOutputStream(logFile.toFile(), true); writeLogEntry("# Starting WorkLog @ " + 
LocalDateTime.now() + "\n"); @@ -58,6 +64,13 @@ public class WorkLog implements AutoCloseable { // Use synchro over concurrent set to avoid competing writes // - correct is better than fast here, it's sketchy enough to use // a PrintWriter + + /** Mark the job as finished in the work log + * + * @param id job identifier + * @param where free form field, e.g. location on disk + * @param size free form field, e.g. how many items were processed + */ public synchronized void setJobToFinished(String id, String where, int size) throws IOException { if (!finishedJobs.add(id)) { logger.warn("Setting job {} to finished, but it was already finished", id); @@ -66,30 +79,10 @@ public class WorkLog implements AutoCloseable { writeLogEntry(String.format("%s\t%s\t%s\t%d\n",id, LocalDateTime.now(), where, size)); } - private void loadLog(Path logFile) throws IOException { - if (!Files.exists(logFile)) { - return; - } - - try (var lines = Files.lines(logFile)) { - lines.filter(WorkLogEntry::isJobId) - .map(this::getJobIdFromWrittenString) - .forEach(finishedJobs::add); - } - } - - private static final Pattern splitPattern = Pattern.compile("\\s+"); - - private String getJobIdFromWrittenString(String s) { - return splitPattern.split(s, 2)[0]; - } - public synchronized boolean isJobFinished(String id) { return finishedJobs.contains(id); } - - private void writeLogEntry(String entry) throws IOException { logWriter.write(entry.getBytes(StandardCharsets.UTF_8)); logWriter.flush(); diff --git a/code/common/process/src/main/java/nu/marginalia/process/log/WorkLogEntry.java b/code/common/process/src/main/java/nu/marginalia/process/log/WorkLogEntry.java index 31b93610..d0cf0ef8 100644 --- a/code/common/process/src/main/java/nu/marginalia/process/log/WorkLogEntry.java +++ b/code/common/process/src/main/java/nu/marginalia/process/log/WorkLogEntry.java @@ -2,14 +2,22 @@ package nu.marginalia.process.log; import org.apache.logging.log4j.util.Strings; +import java.util.regex.Pattern; + public 
record WorkLogEntry(String id, String ts, String path, int cnt) { + private static final Pattern splitPattern = Pattern.compile("\\s+"); static WorkLogEntry parse(String line) { - String[] parts = line.split("\\s+"); + String[] parts = splitPattern.split(line); return new WorkLogEntry(parts[0], parts[1], parts[2], Integer.parseInt(parts[3])); } static boolean isJobId(String line) { return Strings.isNotBlank(line) && !line.startsWith("#"); } + + static String parseJobIdFromLogLine(String s) { + return splitPattern.split(s, 2)[0]; + } + } From 89e4343fdbb77b5213218e6b67a3dfc3b8700248 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 12 Jul 2023 18:15:50 +0200 Subject: [PATCH 039/157] (minor) Fix test --- .../java/nu/marginalia/mq/persistence/MqPersistenceTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mq/persistence/MqPersistenceTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mq/persistence/MqPersistenceTest.java index ead78f45..74f69682 100644 --- a/code/common/message-queue/src/test/java/nu/marginalia/mq/persistence/MqPersistenceTest.java +++ b/code/common/message-queue/src/test/java/nu/marginalia/mq/persistence/MqPersistenceTest.java @@ -161,7 +161,7 @@ public class MqPersistenceTest { long id = persistence.sendNewMessage(recipientId, null,"function", "payload", Duration.ofSeconds(30)); - var messagesPollFirstTime = persistence.pollInbox(recipientId, instanceId , tick); + var messagesPollFirstTime = persistence.pollInbox(recipientId, instanceId , tick, 10); /** CHECK POLL RESULT */ assertEquals(1, messagesPollFirstTime.size()); @@ -184,7 +184,7 @@ public class MqPersistenceTest { assertEquals(tick, message.ownerTick()); /** VERIFY SECOND POLL IS EMPTY */ - var messagePollSecondTime = persistence.pollInbox(recipientId, instanceId , 1); + var messagePollSecondTime = persistence.pollInbox(recipientId, instanceId , 1, 10); assertEquals(0, 
messagePollSecondTime.size()); } } From 480abfe966868553855adeb20270d0a959d42ffd Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 12 Jul 2023 18:16:23 +0200 Subject: [PATCH 040/157] (minor) Add limit to pol count in MqPersistence, fix test --- .../main/java/nu/marginalia/mq/inbox/MqInbox.java | 3 ++- .../main/java/nu/marginalia/mq/outbox/MqOutbox.java | 3 ++- .../nu/marginalia/mq/persistence/MqPersistence.java | 12 +++++++----- .../nu/marginalia/crawl/CrawlJobSpecWriterTest.java | 4 +++- 4 files changed, 14 insertions(+), 8 deletions(-) diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInbox.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInbox.java index 20184f32..6f48f481 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInbox.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInbox.java @@ -28,6 +28,7 @@ public class MqInbox { private volatile boolean run = true; private final int pollIntervalMs = Integer.getInteger("mq.inbox.poll-interval-ms", 100); + private final int maxPollCount = Integer.getInteger("mq.inbox.max-poll-count", 10); private final List eventSubscribers = new ArrayList<>(); private final LinkedBlockingQueue queue = new LinkedBlockingQueue<>(32); @@ -194,7 +195,7 @@ public class MqInbox { private Collection pollInbox(long tick) { try { - return persistence.pollInbox(inboxName, instanceUUID, tick); + return persistence.pollInbox(inboxName, instanceUUID, tick, maxPollCount); } catch (SQLException ex) { logger.error("Failed to poll inbox", ex); diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java index a3cc319b..5bdeabd3 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java @@ -23,6 +23,7 @@ public class 
MqOutbox { private final ConcurrentHashMap pendingResponses = new ConcurrentHashMap<>(); private final int pollIntervalMs = Integer.getInteger("mq.outbox.poll-interval-ms", 100); + private final int maxPollCount = Integer.getInteger("mq.outbox.max-poll-count", 10); private final Thread pollThread; private volatile boolean run = true; @@ -71,7 +72,7 @@ public class MqOutbox { return; try { - var updates = persistence.pollReplyInbox(replyInboxName, instanceUUID, tick); + var updates = persistence.pollReplyInbox(replyInboxName, instanceUUID, tick, maxPollCount); for (var message : updates) { pendingResponses.put(message.relatedId(), message); diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java index a62a0227..4e1f3843 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java @@ -167,18 +167,20 @@ public class MqPersistence { * then returns the number of messages marked. This is an atomic operation that * ensures that messages aren't double processed. */ - private int markInboxMessages(String inboxName, String instanceUUID, long tick) throws SQLException { + private int markInboxMessages(String inboxName, String instanceUUID, long tick, int n) throws SQLException { try (var conn = dataSource.getConnection(); var updateStmt = conn.prepareStatement(""" UPDATE MESSAGE_QUEUE SET OWNER_INSTANCE=?, OWNER_TICK=?, UPDATED_TIME=CURRENT_TIMESTAMP(6), STATE='ACK' WHERE RECIPIENT_INBOX=? AND OWNER_INSTANCE IS NULL AND STATE='NEW' + LIMIT ? 
"""); ) { updateStmt.setString(1, instanceUUID); updateStmt.setLong(2, tick); updateStmt.setString(3, inboxName); + updateStmt.setInt(4, n); return updateStmt.executeUpdate(); } } @@ -186,10 +188,10 @@ public class MqPersistence { /** Marks unclaimed messages addressed to this inbox with instanceUUID and tick, * then returns these messages. */ - public Collection pollInbox(String inboxName, String instanceUUID, long tick) throws SQLException { + public Collection pollInbox(String inboxName, String instanceUUID, long tick, int n) throws SQLException { // Mark new messages as claimed - int expected = markInboxMessages(inboxName, instanceUUID, tick); + int expected = markInboxMessages(inboxName, instanceUUID, tick, n); if (expected == 0) { return Collections.emptyList(); } @@ -231,10 +233,10 @@ public class MqPersistence { /** Marks unclaimed messages addressed to this inbox with instanceUUID and tick, * then returns these messages. */ - public Collection pollReplyInbox(String inboxName, String instanceUUID, long tick) throws SQLException { + public Collection pollReplyInbox(String inboxName, String instanceUUID, long tick, int n) throws SQLException { // Mark new messages as claimed - int expected = markInboxMessages(inboxName, instanceUUID, tick); + int expected = markInboxMessages(inboxName, instanceUUID, tick, n); if (expected == 0) { return Collections.emptyList(); } diff --git a/code/tools/crawl-job-extractor/src/test/java/nu/marginalia/crawl/CrawlJobSpecWriterTest.java b/code/tools/crawl-job-extractor/src/test/java/nu/marginalia/crawl/CrawlJobSpecWriterTest.java index ad9700da..38cfc4fb 100644 --- a/code/tools/crawl-job-extractor/src/test/java/nu/marginalia/crawl/CrawlJobSpecWriterTest.java +++ b/code/tools/crawl-job-extractor/src/test/java/nu/marginalia/crawl/CrawlJobSpecWriterTest.java @@ -37,7 +37,9 @@ public class CrawlJobSpecWriterTest { } List outputs = new ArrayList<>(); - CrawlerSpecificationLoader.readInputSpec(tempFile, outputs::add); + for (var item 
: CrawlerSpecificationLoader.asIterable(tempFile)) { + outputs.add(item); + } assertEquals(outputs.size(), 3); } From 0ed938545b5a268ed2dbbc0cebd9a783049de18c Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 12 Jul 2023 18:41:27 +0200 Subject: [PATCH 041/157] (mq) Add single-shot inbox --- .../java/nu/marginalia/mq/inbox/MqInbox.java | 6 +++ .../mq/inbox/MqSingleShotInbox.java | 52 ++++++++++++++++++ .../nu/marginalia/mq/outbox/MqOutbox.java | 25 ++++++++- .../nu/marginalia/mq/outbox/MqOutboxTest.java | 54 ++++++++++++++++++- 4 files changed, 135 insertions(+), 2 deletions(-) create mode 100644 code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSingleShotInbox.java diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInbox.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInbox.java index 6f48f481..49f34feb 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInbox.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInbox.java @@ -17,6 +17,7 @@ import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.TimeUnit; import java.util.function.Consumer; +/** Message queue inbox */ public class MqInbox { private final Logger logger = LoggerFactory.getLogger(MqInbox.class); @@ -53,10 +54,14 @@ public class MqInbox { this.instanceUUID = instanceUUID.toString(); } + /** Subscribe to messages on this inbox. Must be run before start()! */ public void subscribe(MqSubscription subscription) { eventSubscribers.add(subscription); } + /** Start receiving messages.

    + * Note: Subscribe to messages before calling this method. + *

    */ public void start() { run = true; @@ -76,6 +81,7 @@ public class MqInbox { notifyThread.start(); } + /** Stop receiving messages and shut down all threads */ public void stop() throws InterruptedException { if (!run) return; diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSingleShotInbox.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSingleShotInbox.java new file mode 100644 index 00000000..68fca86e --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSingleShotInbox.java @@ -0,0 +1,52 @@ +package nu.marginalia.mq.inbox; + +import nu.marginalia.mq.MqMessage; +import nu.marginalia.mq.persistence.MqPersistence; + +import java.sql.SQLException; +import java.util.Optional; +import java.util.concurrent.TimeUnit; + +/** A single-shot inbox that can be used to wait for a single message + * to arrive in an inbox, and then reply to that message + */ +public class MqSingleShotInbox { + + private final String inboxName; + private final String instanceUUID; + private final MqPersistence persistence; + + public MqSingleShotInbox(String inboxName, + String instanceUUID, + MqPersistence persistence) { + this.inboxName = inboxName; + this.instanceUUID = instanceUUID; + this.persistence = persistence; + } + + public Optional waitForMessage(long timeout, TimeUnit unit) throws InterruptedException, SQLException { + final long deadline = System.currentTimeMillis() + unit.toMillis(timeout); + + for (int i = 0;; i++) { + if (System.currentTimeMillis() >= deadline) { + return Optional.empty(); + } + + var messages = persistence.pollInbox(inboxName, instanceUUID, i, 1); + + if (messages.size() > 0) { + return Optional.of(messages.iterator().next()); + } + + TimeUnit.SECONDS.sleep(1); + } + } + + public void sendResponse(MqMessage originalMessage, MqInboxResponse response) { + try { + persistence.sendResponse(originalMessage.msgId(), response.state(), response.message()); + } catch (SQLException 
e) { + throw new RuntimeException(e); + } + } +} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java index 5bdeabd3..88b9601f 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java @@ -7,10 +7,12 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.sql.SQLException; +import java.sql.Time; import java.util.Optional; import java.util.UUID; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; public class MqOutbox { private final Logger logger = LoggerFactory.getLogger(MqOutbox.class); @@ -112,7 +114,7 @@ public class MqOutbox { return id; } - /** Blocks until a response arrives for the given message id. */ + /** Blocks until a response arrives for the given message id (possibly forever) */ public MqMessage waitResponse(long id) throws Exception { synchronized (pendingResponses) { while (!pendingResponses.containsKey(id)) { @@ -127,6 +129,27 @@ public class MqOutbox { } } + + /** Blocks until a response arrives for the given message id or the timeout passes */ + public MqMessage waitResponse(long id, int timeout, TimeUnit unit) throws TimeoutException, SQLException, InterruptedException { + long deadline = System.currentTimeMillis() + unit.toMillis(timeout); + + synchronized (pendingResponses) { + while (!pendingResponses.containsKey(id)) { + if (System.currentTimeMillis() > deadline) + throw new TimeoutException("Timeout waiting for response"); + + pendingResponses.wait(100); + } + + var msg = pendingResponses.remove(id); + // Mark the response as OK so it can be cleaned up + persistence.updateMessageState(msg.msgId(), MqMessageState.OK); + + return msg; + } + } + /** Polls for a response for the given message id. 
*/ public Optional pollResponse(long id) throws SQLException { // no need to sync here if we aren't going to wait() diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqOutboxTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqOutboxTest.java index 849c30b0..cb866b52 100644 --- a/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqOutboxTest.java +++ b/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqOutboxTest.java @@ -7,6 +7,7 @@ import nu.marginalia.mq.MqMessageState; import nu.marginalia.mq.MqTestUtil; import nu.marginalia.mq.inbox.MqInboxResponse; import nu.marginalia.mq.inbox.MqInbox; +import nu.marginalia.mq.inbox.MqSingleShotInbox; import nu.marginalia.mq.inbox.MqSubscription; import nu.marginalia.mq.persistence.MqPersistence; import org.junit.jupiter.api.*; @@ -17,8 +18,9 @@ import org.testcontainers.junit.jupiter.Testcontainers; import java.util.UUID; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; -import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.*; @Tag("slow") @Testcontainers @@ -59,6 +61,56 @@ public class MqOutboxTest { outbox.stop(); } + @Test + public void testSingleShotInboxTimeout() throws Exception { + var inbox = new MqSingleShotInbox(inboxId, UUID.randomUUID().toString(), new MqPersistence(dataSource)); + var message = inbox.waitForMessage(100, TimeUnit.MILLISECONDS); + assertTrue(message.isEmpty()); + } + + @Test + public void testOutboxTimeout() throws Exception { + var outbox = new MqOutbox(new MqPersistence(dataSource), inboxId, inboxId+"/reply", UUID.randomUUID()); + long id = outbox.sendAsync("test", "Hello World"); + try { + outbox.waitResponse(id, 100, TimeUnit.MILLISECONDS); + } + catch (TimeoutException ex) { + return; // ok + } + catch (Exception ex) { + ex.printStackTrace(); + } + fail(); + } + + @Test + public void 
testSingleShotInbox() throws Exception { + // Send a message to the inbox + var outbox = new MqOutbox(new MqPersistence(dataSource), inboxId,inboxId+"/reply", UUID.randomUUID()); + long id = outbox.sendAsync("test", "Hello World"); + + // Create a single-shot inbox + var inbox = new MqSingleShotInbox(inboxId, UUID.randomUUID().toString(), new MqPersistence(dataSource)); + + // Wait for the message to arrive + var message = inbox.waitForMessage(1, TimeUnit.SECONDS); + + // Check that the message arrived + assertTrue(message.isPresent()); + assertEquals("Hello World", message.get().payload()); + + // Send a response + inbox.sendResponse(message.get(), new MqInboxResponse("Alright then", MqMessageState.OK)); + + // Wait for the response to arrive + var response = outbox.waitResponse(id, 1, TimeUnit.SECONDS); + + // Check that the response arrived + assertEquals(MqMessageState.OK, response.state()); + assertEquals("Alright then", response.payload()); + } + @Test public void testSend() throws Exception { var outbox = new MqOutbox(new MqPersistence(dataSource), inboxId,inboxId+"/reply", UUID.randomUUID()); From 8a53e107faba44ac5198f878d419c7f69b2bf45c Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 12 Jul 2023 20:12:52 +0200 Subject: [PATCH 042/157] (mq) Synchronous and Asynchronous inboxes. 
--- .../marginalia/index/client/IndexClient.java | 6 +- .../search/client/SearchClient.java | 6 +- .../main/java/nu/marginalia/mq/MqFactory.java | 42 ++++ ...{MqInbox.java => MqAsynchronousInbox.java} | 46 ++-- .../nu/marginalia/mq/inbox/MqInboxIf.java | 15 ++ .../marginalia/mq/inbox/MqInboxShredder.java | 29 +++ .../mq/inbox/MqSingleShotInbox.java | 10 +- .../mq/inbox/MqSynchronousInbox.java | 197 ++++++++++++++++++ .../mq/persistence/MqPersistence.java | 1 + .../java/nu/marginalia/mqsm/StateMachine.java | 13 +- .../nu/marginalia/mq/outbox/MqOutboxTest.java | 96 +++++++-- .../mqsm/StateMachineErrorTest.java | 5 +- .../mqsm/StateMachineResumeTest.java | 13 +- .../nu/marginalia/mqsm/StateMachineTest.java | 9 +- .../service/server/BaseServiceParams.java | 10 +- .../nu/marginalia/service/server/Service.java | 10 +- .../control/process/ControlProcesses.java | 10 +- .../svc/MessageQueueMonitorService.java | 7 +- 18 files changed, 437 insertions(+), 88 deletions(-) create mode 100644 code/common/message-queue/src/main/java/nu/marginalia/mq/MqFactory.java rename code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/{MqInbox.java => MqAsynchronousInbox.java} (86%) create mode 100644 code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInboxIf.java create mode 100644 code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInboxShredder.java create mode 100644 code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSynchronousInbox.java diff --git a/code/api/index-api/src/main/java/nu/marginalia/index/client/IndexClient.java b/code/api/index-api/src/main/java/nu/marginalia/index/client/IndexClient.java index b8d2e683..7ea9d6c9 100644 --- a/code/api/index-api/src/main/java/nu/marginalia/index/client/IndexClient.java +++ b/code/api/index-api/src/main/java/nu/marginalia/index/client/IndexClient.java @@ -10,8 +10,8 @@ import nu.marginalia.client.Context; import nu.marginalia.index.client.model.query.SearchSpecification; import 
nu.marginalia.index.client.model.results.SearchResultSet; import nu.marginalia.model.gson.GsonFactory; +import nu.marginalia.mq.MqFactory; import nu.marginalia.mq.outbox.MqOutbox; -import nu.marginalia.mq.persistence.MqPersistence; import nu.marginalia.service.descriptor.ServiceDescriptors; import nu.marginalia.service.id.ServiceId; @@ -27,13 +27,13 @@ public class IndexClient extends AbstractDynamicClient { @Inject public IndexClient(ServiceDescriptors descriptors, - MqPersistence persistence) { + MqFactory messageQueueFactory) { super(descriptors.forId(ServiceId.Index), WmsaHome.getHostsFile(), GsonFactory::get); String inboxName = ServiceId.Index.name + ":" + "0"; String outboxName = System.getProperty("service-name", UUID.randomUUID().toString()); - outbox = new MqOutbox(persistence, inboxName, outboxName, UUID.randomUUID()); + outbox = messageQueueFactory.createOutbox(inboxName, outboxName, UUID.randomUUID()); setTimeout(30); } diff --git a/code/api/search-api/src/main/java/nu/marginalia/search/client/SearchClient.java b/code/api/search-api/src/main/java/nu/marginalia/search/client/SearchClient.java index 69e011bd..6a4f2c4d 100644 --- a/code/api/search-api/src/main/java/nu/marginalia/search/client/SearchClient.java +++ b/code/api/search-api/src/main/java/nu/marginalia/search/client/SearchClient.java @@ -5,8 +5,8 @@ import com.google.inject.Singleton; import io.reactivex.rxjava3.core.Observable; import nu.marginalia.client.AbstractDynamicClient; import nu.marginalia.model.gson.GsonFactory; +import nu.marginalia.mq.MqFactory; import nu.marginalia.mq.outbox.MqOutbox; -import nu.marginalia.mq.persistence.MqPersistence; import nu.marginalia.search.client.model.ApiSearchResults; import nu.marginalia.service.descriptor.ServiceDescriptors; import nu.marginalia.service.id.ServiceId; @@ -28,14 +28,14 @@ public class SearchClient extends AbstractDynamicClient { @Inject public SearchClient(ServiceDescriptors descriptors, - MqPersistence persistence) { + MqFactory 
messageQueueFactory) { super(descriptors.forId(ServiceId.Search), WmsaHome.getHostsFile(), GsonFactory::get); String inboxName = ServiceId.Search.name + ":" + "0"; String outboxName = System.getProperty("service-name", UUID.randomUUID().toString()); - outbox = new MqOutbox(persistence, inboxName, outboxName, UUID.randomUUID()); + outbox = messageQueueFactory.createOutbox(inboxName, outboxName, UUID.randomUUID()); } diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/MqFactory.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/MqFactory.java new file mode 100644 index 00000000..792d0bd8 --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/MqFactory.java @@ -0,0 +1,42 @@ +package nu.marginalia.mq; + +import nu.marginalia.mq.inbox.MqAsynchronousInbox; +import nu.marginalia.mq.inbox.MqInboxIf; +import nu.marginalia.mq.inbox.MqSingleShotInbox; +import nu.marginalia.mq.inbox.MqSynchronousInbox; +import nu.marginalia.mq.outbox.MqOutbox; +import nu.marginalia.mq.persistence.MqPersistence; + +import javax.inject.Inject; +import javax.inject.Singleton; +import java.util.UUID; + +@Singleton +public class MqFactory { + private final MqPersistence persistence; + + @Inject + public MqFactory(MqPersistence persistence) { + this.persistence = persistence; + } + + public MqInboxIf createAsynchronousInbox(String inboxName, UUID instanceUUID) + { + return new MqAsynchronousInbox(persistence, inboxName, instanceUUID); + } + + public MqInboxIf createSynchronousInbox(String inboxName, UUID instanceUUID) + { + return new MqSynchronousInbox(persistence, inboxName, instanceUUID); + } + + public MqSingleShotInbox createSingleShotInbox(String inboxName, UUID instanceUUID) + { + return new MqSingleShotInbox(persistence, inboxName, instanceUUID); + } + + public MqOutbox createOutbox(String inboxName, String outboxName, UUID instanceUUID) + { + return new MqOutbox(persistence, inboxName, outboxName, instanceUUID); + } +} diff --git 
a/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInbox.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqAsynchronousInbox.java similarity index 86% rename from code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInbox.java rename to code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqAsynchronousInbox.java index 49f34feb..94fa82f6 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInbox.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqAsynchronousInbox.java @@ -15,11 +15,10 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.TimeUnit; -import java.util.function.Consumer; -/** Message queue inbox */ -public class MqInbox { - private final Logger logger = LoggerFactory.getLogger(MqInbox.class); +/** Message queue inbox that spawns news threads for each message */ +public class MqAsynchronousInbox implements MqInboxIf { + private final Logger logger = LoggerFactory.getLogger(MqAsynchronousInbox.class); private final String inboxName; private final String instanceUUID; @@ -36,17 +35,17 @@ public class MqInbox { private Thread pollDbThread; private Thread notifyThread; - public MqInbox(MqPersistence persistence, - String inboxName, - UUID instanceUUID) + public MqAsynchronousInbox(MqPersistence persistence, + String inboxName, + UUID instanceUUID) { this(persistence, inboxName, instanceUUID, Executors.newCachedThreadPool()); } - public MqInbox(MqPersistence persistence, - String inboxName, - UUID instanceUUID, - ExecutorService executorService) + public MqAsynchronousInbox(MqPersistence persistence, + String inboxName, + UUID instanceUUID, + ExecutorService executorService) { this.threadPool = executorService; this.persistence = persistence; @@ -55,6 +54,7 @@ public class MqInbox { } /** Subscribe to messages on this inbox. 
Must be run before start()! */ + @Override public void subscribe(MqSubscription subscription) { eventSubscribers.add(subscription); } @@ -62,6 +62,7 @@ public class MqInbox { /** Start receiving messages.

    * Note: Subscribe to messages before calling this method. *

    */ + @Override public void start() { run = true; @@ -82,6 +83,7 @@ public class MqInbox { } /** Stop receiving messages and shut down all threads */ + @Override public void stop() throws InterruptedException { if (!run) return; @@ -185,7 +187,7 @@ public class MqInbox { } } - public void pollDb() { + private void pollDb() { try { for (long tick = 1; run; tick++) { @@ -210,6 +212,7 @@ public class MqInbox { } /** Retrieve the last N messages from the inbox. */ + @Override public List replay(int lastN) { try { return persistence.lastNMessages(inboxName, lastN); @@ -220,23 +223,4 @@ public class MqInbox { } } - - private class MqInboxShredder implements MqSubscription { - - @Override - public boolean filter(MqMessage rawMessage) { - return true; - } - - @Override - public MqInboxResponse onRequest(MqMessage msg) { - logger.warn("Unhandled message {}", msg.msgId()); - return MqInboxResponse.err(); - } - - @Override - public void onNotification(MqMessage msg) { - logger.warn("Unhandled message {}", msg.msgId()); - } - } } diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInboxIf.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInboxIf.java new file mode 100644 index 00000000..b317a1c5 --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInboxIf.java @@ -0,0 +1,15 @@ +package nu.marginalia.mq.inbox; + +import nu.marginalia.mq.MqMessage; + +import java.util.List; + +public interface MqInboxIf { + void subscribe(MqSubscription subscription); + + void start(); + + void stop() throws InterruptedException; + + List replay(int lastN); +} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInboxShredder.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInboxShredder.java new file mode 100644 index 00000000..18c346f2 --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInboxShredder.java @@ -0,0 +1,29 @@ +package 
nu.marginalia.mq.inbox; + +import nu.marginalia.mq.MqMessage; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +class MqInboxShredder implements MqSubscription { + + private final Logger logger = LoggerFactory.getLogger(getClass()); + + public MqInboxShredder() { + } + + @Override + public boolean filter(MqMessage rawMessage) { + return true; + } + + @Override + public MqInboxResponse onRequest(MqMessage msg) { + logger.warn("Unhandled message {}", msg.msgId()); + return MqInboxResponse.err(); + } + + @Override + public void onNotification(MqMessage msg) { + logger.warn("Unhandled message {}", msg.msgId()); + } +} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSingleShotInbox.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSingleShotInbox.java index 68fca86e..791a195c 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSingleShotInbox.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSingleShotInbox.java @@ -5,6 +5,7 @@ import nu.marginalia.mq.persistence.MqPersistence; import java.sql.SQLException; import java.util.Optional; +import java.util.UUID; import java.util.concurrent.TimeUnit; /** A single-shot inbox that can be used to wait for a single message @@ -16,11 +17,12 @@ public class MqSingleShotInbox { private final String instanceUUID; private final MqPersistence persistence; - public MqSingleShotInbox(String inboxName, - String instanceUUID, - MqPersistence persistence) { + public MqSingleShotInbox(MqPersistence persistence, + String inboxName, + UUID instanceUUID + ) { this.inboxName = inboxName; - this.instanceUUID = instanceUUID; + this.instanceUUID = instanceUUID.toString(); this.persistence = persistence; } diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSynchronousInbox.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSynchronousInbox.java new file mode 100644 index 
00000000..a150a239 --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSynchronousInbox.java @@ -0,0 +1,197 @@ +package nu.marginalia.mq.inbox; + +import nu.marginalia.mq.MqMessage; +import nu.marginalia.mq.MqMessageState; +import nu.marginalia.mq.persistence.MqPersistence; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.UUID; +import java.util.concurrent.TimeUnit; + +/** Message queue inbox that responds to a single message at a time + * within the polling thread + */ +public class MqSynchronousInbox implements MqInboxIf { + private final Logger logger = LoggerFactory.getLogger(MqSynchronousInbox.class); + + private final String inboxName; + private final String instanceUUID; + private final MqPersistence persistence; + + private volatile boolean run = true; + + private final int pollIntervalMs = Integer.getInteger("mq.inbox.poll-interval-ms", 100); + private final List eventSubscribers = new ArrayList<>(); + + private Thread pollDbThread; + + public MqSynchronousInbox(MqPersistence persistence, + String inboxName, + UUID instanceUUID) + { + this.persistence = persistence; + this.inboxName = inboxName; + this.instanceUUID = instanceUUID.toString(); + } + + /** Subscribe to messages on this inbox. Must be run before start()! */ + @Override + public void subscribe(MqSubscription subscription) { + eventSubscribers.add(subscription); + } + + /** Start receiving messages.

    + * Note: Subscribe to messages before calling this method. + *

    */ + @Override + public void start() { + run = true; + + if (eventSubscribers.isEmpty()) { + logger.error("No subscribers for inbox {}, registering shredder", inboxName); + } + + // Add a final handler that fails any message that is not handled + eventSubscribers.add(new MqInboxShredder()); + + pollDbThread = new Thread(this::pollDb, "mq-inbox-update-thread:"+inboxName); + pollDbThread.setDaemon(true); + pollDbThread.start(); + } + + /** Stop receiving messages and shut down all threads */ + @Override + public void stop() throws InterruptedException { + if (!run) + return; + + logger.info("Shutting down inbox {}", inboxName); + + run = false; + pollDbThread.join(); + + } + + private void handleMessageWithSubscriber(MqSubscription subscriber, MqMessage msg) { + + if (msg.expectsResponse()) { + respondToMessage(subscriber, msg); + } + else { + acknowledgeNotification(subscriber, msg); + } + } + + private void respondToMessage(MqSubscription subscriber, MqMessage msg) { + try { + final var rsp = subscriber.onRequest(msg); + sendResponse(msg, rsp.state(), rsp.message()); + } catch (Exception ex) { + logger.error("Message Queue subscriber threw exception", ex); + sendResponse(msg, MqMessageState.ERR); + } + } + + private void acknowledgeNotification(MqSubscription subscriber, MqMessage msg) { + try { + subscriber.onNotification(msg); + updateMessageState(msg, MqMessageState.OK); + } catch (Exception ex) { + logger.error("Message Queue subscriber threw exception", ex); + updateMessageState(msg, MqMessageState.ERR); + } + } + + private void sendResponse(MqMessage msg, MqMessageState state) { + try { + persistence.updateMessageState(msg.msgId(), state); + } + catch (SQLException ex) { + logger.error("Failed to update message state", ex); + } + } + + private void updateMessageState(MqMessage msg, MqMessageState state) { + try { + persistence.updateMessageState(msg.msgId(), state); + } + catch (SQLException ex2) { + logger.error("Failed to update message state", ex2); + 
} + } + + private void sendResponse(MqMessage msg, MqMessageState mqMessageState, String response) { + try { + persistence.sendResponse(msg.msgId(), mqMessageState, response); + } + catch (SQLException ex) { + logger.error("Failed to update message state", ex); + } + } + + public void pollDb() { + try { + for (long tick = 1; run; tick++) { + + var messages = pollInbox(tick); + + for (var msg : messages) { + handleMessage(msg); + } + + if (messages.isEmpty()) { + TimeUnit.MILLISECONDS.sleep(pollIntervalMs); + } + } + } + catch (InterruptedException ex) { + logger.error("MQ inbox update thread interrupted", ex); + } + } + + private void handleMessage(MqMessage msg) { + logger.info("Notifying subscribers of msg {}", msg.msgId()); + + boolean handled = false; + + for (var eventSubscriber : eventSubscribers) { + if (eventSubscriber.filter(msg)) { + handleMessageWithSubscriber(eventSubscriber, msg); + handled = true; + break; + } + } + + if (!handled) { + logger.error("No subscriber wanted to handle msg {}", msg.msgId()); + } + } + + private Collection pollInbox(long tick) { + try { + return persistence.pollInbox(inboxName, instanceUUID, tick, 1); + } + catch (SQLException ex) { + logger.error("Failed to poll inbox", ex); + return List.of(); + } + } + + /** Retrieve the last N messages from the inbox. 
*/ + @Override + public List replay(int lastN) { + try { + return persistence.lastNMessages(inboxName, lastN); + } + catch (SQLException ex) { + logger.error("Failed to replay inbox", ex); + return List.of(); + } + } + +} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java index 4e1f3843..198914b3 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java @@ -174,6 +174,7 @@ public class MqPersistence { SET OWNER_INSTANCE=?, OWNER_TICK=?, UPDATED_TIME=CURRENT_TIMESTAMP(6), STATE='ACK' WHERE RECIPIENT_INBOX=? AND OWNER_INSTANCE IS NULL AND STATE='NEW' + ORDER BY ID ASC LIMIT ? """); ) { diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java index b8ffc739..9b7d2cfa 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java @@ -1,12 +1,12 @@ package nu.marginalia.mqsm; +import nu.marginalia.mq.MqFactory; import nu.marginalia.mq.MqMessage; import nu.marginalia.mq.MqMessageState; -import nu.marginalia.mq.inbox.MqInbox; +import nu.marginalia.mq.inbox.MqInboxIf; import nu.marginalia.mq.inbox.MqInboxResponse; import nu.marginalia.mq.inbox.MqSubscription; import nu.marginalia.mq.outbox.MqOutbox; -import nu.marginalia.mq.persistence.MqPersistence; import nu.marginalia.mqsm.graph.ResumeBehavior; import nu.marginalia.mqsm.graph.AbstractStateGraph; import nu.marginalia.mqsm.state.*; @@ -14,7 +14,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.*; -import java.util.concurrent.Executors; import java.util.function.BiConsumer; /** A state machine that can be 
used to implement a finite state machine @@ -24,7 +23,7 @@ import java.util.function.BiConsumer; public class StateMachine { private final Logger logger = LoggerFactory.getLogger(StateMachine.class); - private final MqInbox smInbox; + private final MqInboxIf smInbox; private final MqOutbox smOutbox; private final String queueName; private MachineState state; @@ -37,14 +36,14 @@ public class StateMachine { private final Map allStates = new HashMap<>(); - public StateMachine(MqPersistence persistence, + public StateMachine(MqFactory messageQueueFactory, String queueName, UUID instanceUUID, AbstractStateGraph stateGraph) { this.queueName = queueName; - smInbox = new MqInbox(persistence, queueName, instanceUUID, Executors.newSingleThreadExecutor()); - smOutbox = new MqOutbox(persistence, queueName, queueName+"//out", instanceUUID); + smInbox = messageQueueFactory.createSynchronousInbox(queueName, instanceUUID); + smOutbox = messageQueueFactory.createOutbox(queueName, queueName+"//out", instanceUUID); smInbox.subscribe(new StateEventSubscription()); diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqOutboxTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqOutboxTest.java index cb866b52..4411df25 100644 --- a/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqOutboxTest.java +++ b/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqOutboxTest.java @@ -5,10 +5,7 @@ import com.zaxxer.hikari.HikariDataSource; import nu.marginalia.mq.MqMessage; import nu.marginalia.mq.MqMessageState; import nu.marginalia.mq.MqTestUtil; -import nu.marginalia.mq.inbox.MqInboxResponse; -import nu.marginalia.mq.inbox.MqInbox; -import nu.marginalia.mq.inbox.MqSingleShotInbox; -import nu.marginalia.mq.inbox.MqSubscription; +import nu.marginalia.mq.inbox.*; import nu.marginalia.mq.persistence.MqPersistence; import org.junit.jupiter.api.*; import org.testcontainers.containers.MariaDBContainer; @@ -63,7 +60,7 @@ 
public class MqOutboxTest { @Test public void testSingleShotInboxTimeout() throws Exception { - var inbox = new MqSingleShotInbox(inboxId, UUID.randomUUID().toString(), new MqPersistence(dataSource)); + var inbox = new MqSingleShotInbox(new MqPersistence(dataSource), inboxId, UUID.randomUUID()); var message = inbox.waitForMessage(100, TimeUnit.MILLISECONDS); assertTrue(message.isEmpty()); } @@ -91,7 +88,7 @@ public class MqOutboxTest { long id = outbox.sendAsync("test", "Hello World"); // Create a single-shot inbox - var inbox = new MqSingleShotInbox(inboxId, UUID.randomUUID().toString(), new MqPersistence(dataSource)); + var inbox = new MqSingleShotInbox(new MqPersistence(dataSource), inboxId, UUID.randomUUID()); // Wait for the message to arrive var message = inbox.waitForMessage(1, TimeUnit.SECONDS); @@ -125,11 +122,12 @@ public class MqOutboxTest { outbox.stop(); } + @Test - public void testSendAndRespond() throws Exception { + public void testSendAndRespondAsyncInbox() throws Exception { var outbox = new MqOutbox(new MqPersistence(dataSource), inboxId,inboxId+"/reply", UUID.randomUUID()); - var inbox = new MqInbox(new MqPersistence(dataSource), inboxId, UUID.randomUUID()); + var inbox = new MqAsynchronousInbox(new MqPersistence(dataSource), inboxId, UUID.randomUUID()); inbox.subscribe(justRespond("Alright then")); inbox.start(); @@ -147,10 +145,31 @@ public class MqOutboxTest { } @Test - public void testSendMultiple() throws Exception { + public void testSendAndRespondSyncInbox() throws Exception { var outbox = new MqOutbox(new MqPersistence(dataSource), inboxId,inboxId+"/reply", UUID.randomUUID()); - var inbox = new MqInbox(new MqPersistence(dataSource), inboxId, UUID.randomUUID()); + var inbox = new MqSynchronousInbox(new MqPersistence(dataSource), inboxId, UUID.randomUUID()); + inbox.subscribe(justRespond("Alright then")); + inbox.start(); + + var rsp = outbox.send("test", "Hello World"); + + assertEquals(MqMessageState.OK, rsp.state()); + 
assertEquals("Alright then", rsp.payload()); + + var messages = MqTestUtil.getMessages(dataSource, inboxId); + assertEquals(1, messages.size()); + assertEquals(MqMessageState.OK, messages.get(0).state()); + + outbox.stop(); + inbox.stop(); + } + + @Test + public void testSendMultipleAsyncInbox() throws Exception { + var outbox = new MqOutbox(new MqPersistence(dataSource), inboxId,inboxId+"/reply", UUID.randomUUID()); + + var inbox = new MqAsynchronousInbox(new MqPersistence(dataSource), inboxId, UUID.randomUUID()); inbox.subscribe(echo()); inbox.start(); @@ -181,9 +200,62 @@ public class MqOutboxTest { } @Test - public void testSendAndRespondWithErrorHandler() throws Exception { + public void testSendMultipleSyncInbox() throws Exception { var outbox = new MqOutbox(new MqPersistence(dataSource), inboxId,inboxId+"/reply", UUID.randomUUID()); - var inbox = new MqInbox(new MqPersistence(dataSource), inboxId, UUID.randomUUID()); + + var inbox = new MqSynchronousInbox(new MqPersistence(dataSource), inboxId, UUID.randomUUID()); + inbox.subscribe(echo()); + inbox.start(); + + var rsp1 = outbox.send("test", "one"); + var rsp2 = outbox.send("test", "two"); + var rsp3 = outbox.send("test", "three"); + var rsp4 = outbox.send("test", "four"); + + Thread.sleep(500); + + assertEquals(MqMessageState.OK, rsp1.state()); + assertEquals("one", rsp1.payload()); + assertEquals(MqMessageState.OK, rsp2.state()); + assertEquals("two", rsp2.payload()); + assertEquals(MqMessageState.OK, rsp3.state()); + assertEquals("three", rsp3.payload()); + assertEquals(MqMessageState.OK, rsp4.state()); + assertEquals("four", rsp4.payload()); + + var messages = MqTestUtil.getMessages(dataSource, inboxId); + assertEquals(4, messages.size()); + for (var message : messages) { + assertEquals(MqMessageState.OK, message.state()); + } + + outbox.stop(); + inbox.stop(); + } + + @Test + public void testSendAndRespondWithErrorHandlerAsyncInbox() throws Exception { + var outbox = new MqOutbox(new 
MqPersistence(dataSource), inboxId,inboxId+"/reply", UUID.randomUUID()); + var inbox = new MqAsynchronousInbox(new MqPersistence(dataSource), inboxId, UUID.randomUUID()); + + inbox.start(); + + var rsp = outbox.send("test", "Hello World"); + + assertEquals(MqMessageState.ERR, rsp.state()); + + var messages = MqTestUtil.getMessages(dataSource, inboxId); + assertEquals(1, messages.size()); + assertEquals(MqMessageState.ERR, messages.get(0).state()); + + outbox.stop(); + inbox.stop(); + } + + @Test + public void testSendAndRespondWithErrorHandlerSyncInbox() throws Exception { + var outbox = new MqOutbox(new MqPersistence(dataSource), inboxId,inboxId+"/reply", UUID.randomUUID()); + var inbox = new MqSynchronousInbox(new MqPersistence(dataSource), inboxId, UUID.randomUUID()); inbox.start(); diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineErrorTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineErrorTest.java index 9d7306c2..f41a7dbd 100644 --- a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineErrorTest.java +++ b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineErrorTest.java @@ -3,6 +3,7 @@ package nu.marginalia.mqsm; import com.google.gson.GsonBuilder; import com.zaxxer.hikari.HikariConfig; import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.mq.MqFactory; import nu.marginalia.mq.MqMessageRow; import nu.marginalia.mq.MqTestUtil; import nu.marginalia.mq.persistence.MqPersistence; @@ -32,6 +33,7 @@ public class StateMachineErrorTest { static HikariDataSource dataSource; static MqPersistence persistence; + static MqFactory messageQueueFactory; private String inboxId; @BeforeEach @@ -47,6 +49,7 @@ public class StateMachineErrorTest { dataSource = new HikariDataSource(config); persistence = new MqPersistence(dataSource); + messageQueueFactory = new MqFactory(persistence); } @AfterAll @@ -78,7 +81,7 @@ public class StateMachineErrorTest { @Test public 
void smResumeResumableFromNew() throws Exception { var stateFactory = new StateFactory(new GsonBuilder().create()); - var sm = new StateMachine(persistence, inboxId, UUID.randomUUID(), new ErrorHurdles(stateFactory)); + var sm = new StateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new ErrorHurdles(stateFactory)); sm.init(); diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineResumeTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineResumeTest.java index f3524968..79af8d07 100644 --- a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineResumeTest.java +++ b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineResumeTest.java @@ -3,6 +3,7 @@ package nu.marginalia.mqsm; import com.google.gson.GsonBuilder; import com.zaxxer.hikari.HikariConfig; import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.mq.MqFactory; import nu.marginalia.mq.MqMessageRow; import nu.marginalia.mq.MqMessageState; import nu.marginalia.mq.MqTestUtil; @@ -33,6 +34,7 @@ public class StateMachineResumeTest { static HikariDataSource dataSource; static MqPersistence persistence; + static MqFactory messageQueueFactory; private String inboxId; @BeforeEach @@ -48,6 +50,7 @@ public class StateMachineResumeTest { dataSource = new HikariDataSource(config); persistence = new MqPersistence(dataSource); + messageQueueFactory = new MqFactory(persistence); } @AfterAll @@ -76,7 +79,7 @@ public class StateMachineResumeTest { @Test public void smResumeResumableFromNew() throws Exception { var stateFactory = new StateFactory(new GsonBuilder().create()); - var sm = new StateMachine(persistence, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); + var sm = new StateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); persistence.sendNewMessage(inboxId, null,"RESUMABLE", "", null); @@ -97,7 +100,7 @@ public class StateMachineResumeTest { 
@Test public void smResumeFromAck() throws Exception { var stateFactory = new StateFactory(new GsonBuilder().create()); - var sm = new StateMachine(persistence, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); + var sm = new StateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); long id = persistence.sendNewMessage(inboxId, null,"RESUMABLE", "", null); persistence.updateMessageState(id, MqMessageState.ACK); @@ -120,7 +123,7 @@ public class StateMachineResumeTest { @Test public void smResumeNonResumableFromNew() throws Exception { var stateFactory = new StateFactory(new GsonBuilder().create()); - var sm = new StateMachine(persistence, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); + var sm = new StateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); persistence.sendNewMessage(inboxId, null,"NON-RESUMABLE", "", null); @@ -141,7 +144,7 @@ public class StateMachineResumeTest { @Test public void smResumeNonResumableFromAck() throws Exception { var stateFactory = new StateFactory(new GsonBuilder().create()); - var sm = new StateMachine(persistence, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); + var sm = new StateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); long id = persistence.sendNewMessage(inboxId, null,"NON-RESUMABLE", "", null); persistence.updateMessageState(id, MqMessageState.ACK); @@ -163,7 +166,7 @@ public class StateMachineResumeTest { @Test public void smResumeEmptyQueue() throws Exception { var stateFactory = new StateFactory(new GsonBuilder().create()); - var sm = new StateMachine(persistence, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); + var sm = new StateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); sm.resume(); diff --git 
a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineTest.java index 1130fe04..27ae869e 100644 --- a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineTest.java +++ b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineTest.java @@ -3,6 +3,7 @@ package nu.marginalia.mqsm; import com.google.gson.GsonBuilder; import com.zaxxer.hikari.HikariConfig; import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.mq.MqFactory; import nu.marginalia.mq.MqTestUtil; import nu.marginalia.mq.persistence.MqPersistence; import nu.marginalia.mqsm.graph.GraphState; @@ -29,6 +30,7 @@ public class StateMachineTest { static HikariDataSource dataSource; static MqPersistence persistence; + static MqFactory messageQueueFactory; private String inboxId; @BeforeEach @@ -44,6 +46,7 @@ public class StateMachineTest { dataSource = new HikariDataSource(config); persistence = new MqPersistence(dataSource); + messageQueueFactory = new MqFactory(persistence); } @AfterAll @@ -83,7 +86,7 @@ public class StateMachineTest { var graph = new TestGraph(stateFactory); - var sm = new StateMachine(persistence, inboxId, UUID.randomUUID(), graph); + var sm = new StateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), graph); sm.registerStates(graph); sm.init(); @@ -98,7 +101,7 @@ public class StateMachineTest { @Test public void testStartStopStartStop() throws Exception { var stateFactory = new StateFactory(new GsonBuilder().create()); - var sm = new StateMachine(persistence, inboxId, UUID.randomUUID(), new TestGraph(stateFactory)); + var sm = new StateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new TestGraph(stateFactory)); sm.init(); @@ -107,7 +110,7 @@ public class StateMachineTest { System.out.println("-------------------- "); - var sm2 = new StateMachine(persistence, inboxId, UUID.randomUUID(), new TestGraph(stateFactory)); + var sm2 = new 
StateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new TestGraph(stateFactory)); sm2.resume(); sm2.join(); sm2.stop(); diff --git a/code/common/service/src/main/java/nu/marginalia/service/server/BaseServiceParams.java b/code/common/service/src/main/java/nu/marginalia/service/server/BaseServiceParams.java index abec5e55..2ff07b55 100644 --- a/code/common/service/src/main/java/nu/marginalia/service/server/BaseServiceParams.java +++ b/code/common/service/src/main/java/nu/marginalia/service/server/BaseServiceParams.java @@ -2,7 +2,7 @@ package nu.marginalia.service.server; import com.google.inject.Inject; import com.google.inject.Singleton; -import nu.marginalia.mq.persistence.MqPersistence; +import nu.marginalia.mq.MqFactory; import nu.marginalia.service.control.ServiceEventLog; import nu.marginalia.service.control.ServiceHeartbeat; import nu.marginalia.service.module.ServiceConfiguration; @@ -15,19 +15,19 @@ public class BaseServiceParams { public final MetricsServer metricsServer; public final ServiceHeartbeat heartbeat; public final ServiceEventLog eventLog; - public final MqPersistence messageQueuePersistence; - + public final MqFactory messageQueueInboxFactory; @Inject public BaseServiceParams(ServiceConfiguration configuration, Initialization initialization, MetricsServer metricsServer, ServiceHeartbeat heartbeat, - ServiceEventLog eventLog, MqPersistence messageQueuePersistence) { + ServiceEventLog eventLog, + MqFactory messageQueueInboxFactory) { this.configuration = configuration; this.initialization = initialization; this.metricsServer = metricsServer; this.heartbeat = heartbeat; this.eventLog = eventLog; - this.messageQueuePersistence = messageQueuePersistence; + this.messageQueueInboxFactory = messageQueueInboxFactory; } } diff --git a/code/common/service/src/main/java/nu/marginalia/service/server/Service.java b/code/common/service/src/main/java/nu/marginalia/service/server/Service.java index e8386fb8..ebd75753 100644 --- 
a/code/common/service/src/main/java/nu/marginalia/service/server/Service.java +++ b/code/common/service/src/main/java/nu/marginalia/service/server/Service.java @@ -3,7 +3,7 @@ package nu.marginalia.service.server; import io.prometheus.client.Counter; import nu.marginalia.client.Context; import nu.marginalia.client.exception.MessagingException; -import nu.marginalia.mq.inbox.MqInbox; +import nu.marginalia.mq.inbox.*; import nu.marginalia.service.server.mq.MqRequest; import nu.marginalia.service.server.mq.ServiceMqSubscription; import org.slf4j.Logger; @@ -39,7 +39,7 @@ public class Service { private final String serviceName; private static volatile boolean initialized = false; - protected final MqInbox messageQueueInbox; + protected final MqInboxIf messageQueueInbox; public Service(BaseServiceParams params, Runnable configureStaticFiles @@ -49,9 +49,9 @@ public class Service { String inboxName = config.serviceName() + ":" + config.node(); logger.info("Inbox name: {}", inboxName); - messageQueueInbox = new MqInbox(params.messageQueuePersistence, - inboxName, - config.instanceUuid()); + + var mqInboxFactory = params.messageQueueInboxFactory; + messageQueueInbox = mqInboxFactory.createAsynchronousInbox(inboxName, config.instanceUuid()); messageQueueInbox.subscribe(new ServiceMqSubscription(this)); serviceName = System.getProperty("service-name"); diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ControlProcesses.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ControlProcesses.java index 6b8a64eb..e553d7df 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ControlProcesses.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ControlProcesses.java @@ -5,7 +5,7 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.control.model.ControlProcess; import 
nu.marginalia.model.gson.GsonFactory; -import nu.marginalia.mq.persistence.MqPersistence; +import nu.marginalia.mq.MqFactory; import nu.marginalia.mqsm.StateMachine; import nu.marginalia.mqsm.graph.AbstractStateGraph; import nu.marginalia.service.control.ServiceEventLog; @@ -17,19 +17,19 @@ import java.util.UUID; @Singleton public class ControlProcesses { - private final MqPersistence persistence; private final ServiceEventLog eventLog; private final Gson gson; + private final MqFactory messageQueueFactory; public Map stateMachines = new HashMap<>(); @Inject - public ControlProcesses(MqPersistence persistence, + public ControlProcesses(MqFactory messageQueueFactory, GsonFactory gsonFactory, BaseServiceParams baseServiceParams, RepartitionReindexProcess repartitionReindexProcess, ReconvertAndLoadProcess reconvertAndLoadProcess ) { - this.persistence = persistence; + this.messageQueueFactory = messageQueueFactory; this.eventLog = baseServiceParams.eventLog; this.gson = gsonFactory.get(); register(ControlProcess.REPARTITION_REINDEX, repartitionReindexProcess); @@ -37,7 +37,7 @@ public class ControlProcesses { } private void register(ControlProcess process, AbstractStateGraph graph) { - var sm = new StateMachine(persistence, process.id(), UUID.randomUUID(), graph); + var sm = new StateMachine(messageQueueFactory, process.id(), UUID.randomUUID(), graph); sm.listen((function, param) -> logStateChange(process, function)); diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueMonitorService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueMonitorService.java index a5200275..4ba2585c 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueMonitorService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueMonitorService.java @@ -2,7 +2,6 @@ package nu.marginalia.control.svc; 
import nu.marginalia.mq.persistence.MqPersistence; import nu.marginalia.service.control.ServiceEventLog; -import nu.marginalia.service.server.BaseServiceParams; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -18,9 +17,9 @@ public class MessageQueueMonitorService { private final ServiceEventLog eventLog; @Inject - public MessageQueueMonitorService(BaseServiceParams params) { - this.persistence = params.messageQueuePersistence; - this.eventLog = params.eventLog; + public MessageQueueMonitorService(ServiceEventLog eventLog, MqPersistence persistence) { + this.eventLog = eventLog; + this.persistence = persistence; Thread reaperThread = new Thread(this::run, "message-queue-reaper"); reaperThread.setDaemon(true); From bf783dad7af7ee14ebb186d011835d3c81b439e0 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 12 Jul 2023 20:13:01 +0200 Subject: [PATCH 043/157] (converter) NPE fix --- .../main/java/nu/marginalia/control/model/ProcessHeartbeat.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java index 703635d0..4fbdcde9 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java @@ -24,7 +24,7 @@ public record ProcessHeartbeat( return "STOPPED".equals(status); } public String progressStyle() { - if ("RUNNING".equals(status) && progress > 0) { + if ("RUNNING".equals(status) && progress != null) { return """ background: linear-gradient(90deg, #fff 0%%, #ccc %d%%, #fff %d%%) """.formatted(progress, progress, progress); From 6c88f00a9d859e25fe247fea799b79a970243e16 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 12 Jul 2023 22:44:05 +0200 Subject: [PATCH 044/157] 
(mqsm) guard against spurious transitions from unexpected messages --- .../sql/current/12-message-queue.sql | 2 +- .../sql/migrations/04-message-queue.sql | 2 +- .../nu/marginalia/mq/outbox/MqOutbox.java | 11 +++-- .../mq/persistence/MqPersistence.java | 25 ++++++----- .../java/nu/marginalia/mqsm/StateMachine.java | 43 +++++++++++++++---- .../mq/persistence/MqPersistenceTest.java | 12 +++--- .../mqsm/StateMachineResumeTest.java | 8 ++-- .../nu/marginalia/mqsm/StateMachineTest.java | 21 +++++++++ 8 files changed, 90 insertions(+), 34 deletions(-) diff --git a/code/common/db/src/main/resources/sql/current/12-message-queue.sql b/code/common/db/src/main/resources/sql/current/12-message-queue.sql index fd04f666..25bdc636 100644 --- a/code/common/db/src/main/resources/sql/current/12-message-queue.sql +++ b/code/common/db/src/main/resources/sql/current/12-message-queue.sql @@ -1,6 +1,6 @@ CREATE TABLE IF NOT EXISTS MESSAGE_QUEUE ( ID BIGINT AUTO_INCREMENT PRIMARY KEY COMMENT 'Unique id', - RELATED_ID BIGINT COMMENT 'Unique id a related message', + RELATED_ID BIGINT NOT NULL DEFAULT -1 COMMENT 'Unique id a related message', SENDER_INBOX VARCHAR(255) COMMENT 'Name of the sender inbox', RECIPIENT_INBOX VARCHAR(255) NOT NULL COMMENT 'Name of the recipient inbox', FUNCTION VARCHAR(255) NOT NULL COMMENT 'Which function to run', diff --git a/code/common/db/src/main/resources/sql/migrations/04-message-queue.sql b/code/common/db/src/main/resources/sql/migrations/04-message-queue.sql index fd04f666..25bdc636 100644 --- a/code/common/db/src/main/resources/sql/migrations/04-message-queue.sql +++ b/code/common/db/src/main/resources/sql/migrations/04-message-queue.sql @@ -1,6 +1,6 @@ CREATE TABLE IF NOT EXISTS MESSAGE_QUEUE ( ID BIGINT AUTO_INCREMENT PRIMARY KEY COMMENT 'Unique id', - RELATED_ID BIGINT COMMENT 'Unique id a related message', + RELATED_ID BIGINT NOT NULL DEFAULT -1 COMMENT 'Unique id a related message', SENDER_INBOX VARCHAR(255) COMMENT 'Name of the sender inbox', 
RECIPIENT_INBOX VARCHAR(255) NOT NULL COMMENT 'Name of the recipient inbox', FUNCTION VARCHAR(255) NOT NULL COMMENT 'Which function to run', diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java index 88b9601f..22b4bc85 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java @@ -7,7 +7,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.sql.SQLException; -import java.sql.Time; import java.util.Optional; import java.util.UUID; import java.util.concurrent.ConcurrentHashMap; @@ -107,7 +106,7 @@ public class MqOutbox { *
    * Use waitResponse(id) or pollResponse(id) to fetch the response. */ public long sendAsync(String function, String payload) throws Exception { - var id = persistence.sendNewMessage(inboxName, replyInboxName, function, payload, null); + var id = persistence.sendNewMessage(inboxName, replyInboxName, null, function, payload, null); pendingRequests.put(id, id); @@ -163,7 +162,13 @@ public class MqOutbox { } public long notify(String function, String payload) throws Exception { - return persistence.sendNewMessage(inboxName, null, function, payload, null); + return persistence.sendNewMessage(inboxName, null, null, function, payload, null); + } + public long notify(long relatedId, String function, String payload) throws Exception { + return persistence.sendNewMessage(inboxName, null, relatedId, function, payload, null); } + public void flagAsBad(long id) throws SQLException { + persistence.updateMessageState(id, MqMessageState.ERR); + } } \ No newline at end of file diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java index 198914b3..d075d445 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java @@ -52,23 +52,25 @@ public class MqPersistence { * Adds a new message to the message queue. * * @param recipientInboxName The recipient's inbox name - * @param senderInboxName (nullable) The sender's inbox name. Only needed if a reply is expected. If null, the message is not expected to be replied to. - * @param function The function to call - * @param payload The payload to send, typically JSON. - * @param ttl (nullable) The time to live of the message, in seconds. If null, the message will never set to DEAD. + * @param senderInboxName (nullable) The sender's inbox name. 
Only needed if a reply is expected. If null, the message is not expected to be replied to. + * @param relatedMessageId (nullable) The id of the message this message is related to. If null, the message is not related to any other message. + * @param function The function to call + * @param payload The payload to send, typically JSON. + * @param ttl (nullable) The time to live of the message, in seconds. If null, the message will never set to DEAD. * @return The id of the message */ public long sendNewMessage(String recipientInboxName, @Nullable String senderInboxName, + Long relatedMessageId, String function, String payload, @Nullable Duration ttl ) throws Exception { try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(""" - INSERT INTO MESSAGE_QUEUE(RECIPIENT_INBOX, SENDER_INBOX, FUNCTION, PAYLOAD, TTL) - VALUES(?, ?, ?, ?, ?) + INSERT INTO MESSAGE_QUEUE(RECIPIENT_INBOX, SENDER_INBOX, RELATED_ID, FUNCTION, PAYLOAD, TTL) + VALUES(?, ?, ?, ?, ?, ?) """); var lastIdQuery = conn.prepareStatement("SELECT LAST_INSERT_ID()")) { @@ -77,10 +79,13 @@ public class MqPersistence { if (senderInboxName == null) stmt.setNull(2, java.sql.Types.VARCHAR); else stmt.setString(2, senderInboxName); - stmt.setString(3, function); - stmt.setString(4, payload); - if (ttl == null) stmt.setNull(5, java.sql.Types.BIGINT); - else stmt.setLong(5, ttl.toSeconds()); + if (relatedMessageId == null) stmt.setLong(3, -1); + else stmt.setLong(3, relatedMessageId); + + stmt.setString(4, function); + stmt.setString(5, payload); + if (ttl == null) stmt.setNull(6, java.sql.Types.BIGINT); + else stmt.setLong(6, ttl.toSeconds()); stmt.executeUpdate(); var rsp = lastIdQuery.executeQuery(); diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java index 9b7d2cfa..d039f363 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java +++ 
b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java @@ -36,6 +36,14 @@ public class StateMachine { private final Map allStates = new HashMap<>(); + /* The expectedMessageId guards against spurious state changes being triggered by old messages in the queue + * + * It contains the message id of the last message that was processed, and the messages sent by the state machine to + * itself via the message queue all have relatedId set to expectedMessageId. If the state machine is unitialized or + * in a terminal state, it will accept messages with relatedIds that are equal to -1. + * */ + private long expectedMessageId = -1; + public StateMachine(MqFactory messageQueueFactory, String queueName, UUID instanceUUID, @@ -99,7 +107,7 @@ public class StateMachine { } smInbox.start(); - smOutbox.notify(transition.state(), transition.message()); + smOutbox.notify(expectedMessageId, transition.state(), transition.message()); } /** Initialize the state machine. */ @@ -112,7 +120,7 @@ public class StateMachine { } smInbox.start(); - smOutbox.notify(transition.state(), transition.message()); + smOutbox.notify(expectedMessageId, transition.state(), transition.message()); } /** Resume the state machine from the last known state. 
*/ @@ -133,6 +141,7 @@ public class StateMachine { smInbox.start(); logger.info("Resuming state machine from {}({})/{}", firstMessage.function(), firstMessage.payload(), firstMessage.state()); + expectedMessageId = firstMessage.relatedId(); if (firstMessage.state() == MqMessageState.NEW) { // The message is not acknowledged, so starting the inbox will trigger a state transition @@ -141,10 +150,10 @@ public class StateMachine { state = resumingState; } else if (resumeState.resumeBehavior().equals(ResumeBehavior.ERROR)) { // The message is acknowledged, but the state does not support resuming - smOutbox.notify("ERROR", "Illegal resumption from ACK'ed state " + firstMessage.function()); + smOutbox.notify(expectedMessageId, "ERROR", "Illegal resumption from ACK'ed state " + firstMessage.function()); } else { // The message is already acknowledged, so we replay the last state - onStateTransition(firstMessage.function(), firstMessage.payload()); + onStateTransition(firstMessage); } } @@ -153,13 +162,24 @@ public class StateMachine { smOutbox.stop(); } - private void onStateTransition(String nextState, String message) { + private void onStateTransition(MqMessage msg) { + final String nextState = msg.function(); + final String data = msg.payload(); + final long messageId = msg.msgId(); + final long relatedId = msg.relatedId(); + + if (expectedMessageId != relatedId) { + // We've received a message that we didn't expect, throwing an exception will cause it to be flagged + // as an error in the message queue; the message queue will proceed + throw new IllegalStateException("Unexpected message id " + relatedId + ", expected " + expectedMessageId); + } + try { logger.info("FSM State change in {}: {}->{}({})", queueName, state == null ? 
"[null]" : state.name(), nextState, - message); + data); if (!allStates.containsKey(nextState)) { logger.error("Unknown state {}", nextState); @@ -173,8 +193,13 @@ public class StateMachine { } if (!state.isFinal()) { - var transition = state.next(message); - smOutbox.notify(transition.state(), transition.message()); + var transition = state.next(msg.payload()); + + expectedMessageId = messageId; + smOutbox.notify(expectedMessageId, transition.state(), transition.message()); + } + else { + expectedMessageId = -1; } } catch (Exception e) { @@ -204,7 +229,7 @@ public class StateMachine { @Override public void onNotification(MqMessage msg) { - onStateTransition(msg.function(), msg.payload()); + onStateTransition(msg); try { stateChangeListeners.forEach(l -> l.accept(msg.function(), msg.payload())); } diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mq/persistence/MqPersistenceTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mq/persistence/MqPersistenceTest.java index 74f69682..4b93fa5e 100644 --- a/code/common/message-queue/src/test/java/nu/marginalia/mq/persistence/MqPersistenceTest.java +++ b/code/common/message-queue/src/test/java/nu/marginalia/mq/persistence/MqPersistenceTest.java @@ -57,7 +57,7 @@ public class MqPersistenceTest { @Test public void testReaper() throws Exception { - long id = persistence.sendNewMessage(recipientId, senderId, "function", "payload", Duration.ofSeconds(2)); + long id = persistence.sendNewMessage(recipientId, senderId, null, "function", "payload", Duration.ofSeconds(2)); persistence.reapDeadMessages(); var messages = MqTestUtil.getMessages(dataSource, recipientId); @@ -77,7 +77,7 @@ public class MqPersistenceTest { @Test public void sendWithReplyAddress() throws Exception { - long id = persistence.sendNewMessage(recipientId, senderId, "function", "payload", Duration.ofSeconds(30)); + long id = persistence.sendNewMessage(recipientId, senderId, null, "function", "payload", Duration.ofSeconds(30)); var 
messages = MqTestUtil.getMessages(dataSource, recipientId); assertEquals(1, messages.size()); @@ -95,7 +95,7 @@ public class MqPersistenceTest { @Test public void sendNoReplyAddress() throws Exception { - long id = persistence.sendNewMessage(recipientId, null, "function", "payload", Duration.ofSeconds(30)); + long id = persistence.sendNewMessage(recipientId, null, null, "function", "payload", Duration.ofSeconds(30)); var messages = MqTestUtil.getMessages(dataSource, recipientId); assertEquals(1, messages.size()); @@ -114,7 +114,7 @@ public class MqPersistenceTest { @Test public void updateState() throws Exception { - long id = persistence.sendNewMessage(recipientId, senderId, "function", "payload", Duration.ofSeconds(30)); + long id = persistence.sendNewMessage(recipientId, senderId, null, "function", "payload", Duration.ofSeconds(30)); persistence.updateMessageState(id, MqMessageState.OK); System.out.println(id); @@ -131,7 +131,7 @@ public class MqPersistenceTest { @Test public void testReply() throws Exception { - long request = persistence.sendNewMessage(recipientId, senderId, "function", "payload", Duration.ofSeconds(30)); + long request = persistence.sendNewMessage(recipientId, senderId, null, "function", "payload", Duration.ofSeconds(30)); long response = persistence.sendResponse(request, MqMessageState.OK, "response"); var sentMessages = MqTestUtil.getMessages(dataSource, recipientId); @@ -159,7 +159,7 @@ public class MqPersistenceTest { String instanceId = "BATMAN"; long tick = 1234L; - long id = persistence.sendNewMessage(recipientId, null,"function", "payload", Duration.ofSeconds(30)); + long id = persistence.sendNewMessage(recipientId, null, null, "function", "payload", Duration.ofSeconds(30)); var messagesPollFirstTime = persistence.pollInbox(recipientId, instanceId , tick, 10); diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineResumeTest.java 
b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineResumeTest.java index 79af8d07..bf4e9990 100644 --- a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineResumeTest.java +++ b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineResumeTest.java @@ -81,7 +81,7 @@ public class StateMachineResumeTest { var stateFactory = new StateFactory(new GsonBuilder().create()); var sm = new StateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); - persistence.sendNewMessage(inboxId, null,"RESUMABLE", "", null); + persistence.sendNewMessage(inboxId, null, -1L, "RESUMABLE", "", null); sm.resume(); @@ -102,7 +102,7 @@ public class StateMachineResumeTest { var stateFactory = new StateFactory(new GsonBuilder().create()); var sm = new StateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); - long id = persistence.sendNewMessage(inboxId, null,"RESUMABLE", "", null); + long id = persistence.sendNewMessage(inboxId, null, -1L, "RESUMABLE", "", null); persistence.updateMessageState(id, MqMessageState.ACK); sm.resume(); @@ -125,7 +125,7 @@ public class StateMachineResumeTest { var stateFactory = new StateFactory(new GsonBuilder().create()); var sm = new StateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); - persistence.sendNewMessage(inboxId, null,"NON-RESUMABLE", "", null); + persistence.sendNewMessage(inboxId, null, -1L, "NON-RESUMABLE", "", null); sm.resume(); @@ -146,7 +146,7 @@ public class StateMachineResumeTest { var stateFactory = new StateFactory(new GsonBuilder().create()); var sm = new StateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); - long id = persistence.sendNewMessage(inboxId, null,"NON-RESUMABLE", "", null); + long id = persistence.sendNewMessage(inboxId, null, null, "NON-RESUMABLE", "", null); persistence.updateMessageState(id, 
MqMessageState.ACK); sm.resume(); diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineTest.java index 27ae869e..e8dcaa83 100644 --- a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineTest.java +++ b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineTest.java @@ -118,4 +118,25 @@ public class StateMachineTest { MqTestUtil.getMessages(dataSource, inboxId).forEach(System.out::println); } + @Test + public void testFalseTransition() throws Exception { + var stateFactory = new StateFactory(new GsonBuilder().create()); + var sm = new StateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new TestGraph(stateFactory)); + + // Prep the queue with a message to set the state to initial, + // and an additional message to trigger the false transition back to initial + + persistence.sendNewMessage(inboxId, null, null, "INITIAL", "", null); + persistence.sendNewMessage(inboxId, null, null, "INITIAL", "", null); + + sm.resume(); + + Thread.sleep(50); + + sm.join(); + sm.stop(); + + MqTestUtil.getMessages(dataSource, inboxId).forEach(System.out::println); + } + } From a5118fe8f113199082127a4e0875a2094f4669e2 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 12 Jul 2023 22:46:14 +0200 Subject: [PATCH 045/157] (minor) clean-up --- .../main/java/nu/marginalia/mq/persistence/MqPersistence.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java index d075d445..402d03f1 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java @@ -79,8 +79,8 @@ public class MqPersistence { if (senderInboxName 
== null) stmt.setNull(2, java.sql.Types.VARCHAR); else stmt.setString(2, senderInboxName); - if (relatedMessageId == null) stmt.setLong(3, -1); - else stmt.setLong(3, relatedMessageId); + // Translate null to -1, as 0 is a valid id + stmt.setLong(3, Objects.requireNonNullElse(relatedMessageId, -1L)); stmt.setString(4, function); stmt.setString(5, payload); From 1ec6f9cde28277926a303037bdd063e015e39306 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 13 Jul 2023 14:55:45 +0200 Subject: [PATCH 046/157] (mq) More robust resume and recovery logic, protection against spurious state changes, minor bugfixes --- .../marginalia/index/client/IndexClient.java | 4 +- .../search/client/SearchClient.java | 4 +- ...qFactory.java => MessageQueueFactory.java} | 14 +- .../nu/marginalia/mq/outbox/MqOutbox.java | 18 +-- .../java/nu/marginalia/mqsm/StateMachine.java | 146 +++++++++++++----- .../marginalia/mqsm/state/MachineState.java | 1 + .../mqsm/StateMachineErrorTest.java | 12 +- .../mqsm/StateMachineResumeTest.java | 44 +++--- .../nu/marginalia/mqsm/StateMachineTest.java | 20 +-- .../service/server/BaseServiceParams.java | 6 +- .../control/model/MessageQueueEntry.java | 25 +++ .../control/process/ControlProcesses.java | 10 +- .../process/ReconvertAndLoadProcess.java | 3 +- .../process/RepartitionReindexProcess.java | 8 +- .../templates/control/message-queue.hdb | 5 +- 15 files changed, 211 insertions(+), 109 deletions(-) rename code/common/message-queue/src/main/java/nu/marginalia/mq/{MqFactory.java => MessageQueueFactory.java} (92%) diff --git a/code/api/index-api/src/main/java/nu/marginalia/index/client/IndexClient.java b/code/api/index-api/src/main/java/nu/marginalia/index/client/IndexClient.java index 7ea9d6c9..db60948d 100644 --- a/code/api/index-api/src/main/java/nu/marginalia/index/client/IndexClient.java +++ b/code/api/index-api/src/main/java/nu/marginalia/index/client/IndexClient.java @@ -10,7 +10,7 @@ import nu.marginalia.client.Context; import 
nu.marginalia.index.client.model.query.SearchSpecification; import nu.marginalia.index.client.model.results.SearchResultSet; import nu.marginalia.model.gson.GsonFactory; -import nu.marginalia.mq.MqFactory; +import nu.marginalia.mq.MessageQueueFactory; import nu.marginalia.mq.outbox.MqOutbox; import nu.marginalia.service.descriptor.ServiceDescriptors; import nu.marginalia.service.id.ServiceId; @@ -27,7 +27,7 @@ public class IndexClient extends AbstractDynamicClient { @Inject public IndexClient(ServiceDescriptors descriptors, - MqFactory messageQueueFactory) { + MessageQueueFactory messageQueueFactory) { super(descriptors.forId(ServiceId.Index), WmsaHome.getHostsFile(), GsonFactory::get); String inboxName = ServiceId.Index.name + ":" + "0"; diff --git a/code/api/search-api/src/main/java/nu/marginalia/search/client/SearchClient.java b/code/api/search-api/src/main/java/nu/marginalia/search/client/SearchClient.java index 6a4f2c4d..8faef5be 100644 --- a/code/api/search-api/src/main/java/nu/marginalia/search/client/SearchClient.java +++ b/code/api/search-api/src/main/java/nu/marginalia/search/client/SearchClient.java @@ -5,7 +5,7 @@ import com.google.inject.Singleton; import io.reactivex.rxjava3.core.Observable; import nu.marginalia.client.AbstractDynamicClient; import nu.marginalia.model.gson.GsonFactory; -import nu.marginalia.mq.MqFactory; +import nu.marginalia.mq.MessageQueueFactory; import nu.marginalia.mq.outbox.MqOutbox; import nu.marginalia.search.client.model.ApiSearchResults; import nu.marginalia.service.descriptor.ServiceDescriptors; @@ -28,7 +28,7 @@ public class SearchClient extends AbstractDynamicClient { @Inject public SearchClient(ServiceDescriptors descriptors, - MqFactory messageQueueFactory) { + MessageQueueFactory messageQueueFactory) { super(descriptors.forId(ServiceId.Search), WmsaHome.getHostsFile(), GsonFactory::get); diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/MqFactory.java 
b/code/common/message-queue/src/main/java/nu/marginalia/mq/MessageQueueFactory.java similarity index 92% rename from code/common/message-queue/src/main/java/nu/marginalia/mq/MqFactory.java rename to code/common/message-queue/src/main/java/nu/marginalia/mq/MessageQueueFactory.java index 792d0bd8..5791793e 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mq/MqFactory.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/MessageQueueFactory.java @@ -12,14 +12,20 @@ import javax.inject.Singleton; import java.util.UUID; @Singleton -public class MqFactory { +public class MessageQueueFactory { private final MqPersistence persistence; @Inject - public MqFactory(MqPersistence persistence) { + public MessageQueueFactory(MqPersistence persistence) { this.persistence = persistence; } + public MqSingleShotInbox createSingleShotInbox(String inboxName, UUID instanceUUID) + { + return new MqSingleShotInbox(persistence, inboxName, instanceUUID); + } + + public MqInboxIf createAsynchronousInbox(String inboxName, UUID instanceUUID) { return new MqAsynchronousInbox(persistence, inboxName, instanceUUID); @@ -30,10 +36,6 @@ public class MqFactory { return new MqSynchronousInbox(persistence, inboxName, instanceUUID); } - public MqSingleShotInbox createSingleShotInbox(String inboxName, UUID instanceUUID) - { - return new MqSingleShotInbox(persistence, inboxName, instanceUUID); - } public MqOutbox createOutbox(String inboxName, String outboxName, UUID instanceUUID) { diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java index 22b4bc85..d604a585 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java @@ -20,7 +20,6 @@ public class MqOutbox { private final String replyInboxName; private final String instanceUUID; - private final 
ConcurrentHashMap pendingRequests = new ConcurrentHashMap<>(); private final ConcurrentHashMap pendingResponses = new ConcurrentHashMap<>(); private final int pollIntervalMs = Integer.getInteger("mq.outbox.poll-interval-ms", 100); @@ -50,8 +49,6 @@ public class MqOutbox { logger.info("Shutting down outbox {}", inboxName); - pendingRequests.clear(); - run = false; pollThread.join(); } @@ -69,18 +66,14 @@ public class MqOutbox { } private void pollDb(long tick) { - if (pendingRequests.isEmpty()) - return; - try { var updates = persistence.pollReplyInbox(replyInboxName, instanceUUID, tick, maxPollCount); for (var message : updates) { pendingResponses.put(message.relatedId(), message); - pendingRequests.remove(message.relatedId()); } - if (updates.isEmpty() || pendingResponses.isEmpty()) + if (updates.isEmpty()) return; logger.info("Notifying {} pending responses", pendingResponses.size()); @@ -106,11 +99,7 @@ public class MqOutbox { *
    * Use waitResponse(id) or pollResponse(id) to fetch the response. */ public long sendAsync(String function, String payload) throws Exception { - var id = persistence.sendNewMessage(inboxName, replyInboxName, null, function, payload, null); - - pendingRequests.put(id, id); - - return id; + return persistence.sendNewMessage(inboxName, replyInboxName, null, function, payload, null); } /** Blocks until a response arrives for the given message id (possibly forever) */ @@ -171,4 +160,7 @@ public class MqOutbox { public void flagAsBad(long id) throws SQLException { persistence.updateMessageState(id, MqMessageState.ERR); } + public void flagAsDead(long id) throws SQLException { + persistence.updateMessageState(id, MqMessageState.DEAD); + } } \ No newline at end of file diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java index d039f363..94118113 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java @@ -1,6 +1,6 @@ package nu.marginalia.mqsm; -import nu.marginalia.mq.MqFactory; +import nu.marginalia.mq.MessageQueueFactory; import nu.marginalia.mq.MqMessage; import nu.marginalia.mq.MqMessageState; import nu.marginalia.mq.inbox.MqInboxIf; @@ -14,6 +14,8 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.*; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; import java.util.function.BiConsumer; /** A state machine that can be used to implement a finite state machine @@ -33,21 +35,15 @@ public class StateMachine { private final MachineState resumingState = new StateFactory.ResumingState(); private final List> stateChangeListeners = new ArrayList<>(); - private final Map allStates = new HashMap<>(); - /* The expectedMessageId guards against spurious state changes being triggered by 
old messages in the queue - * - * It contains the message id of the last message that was processed, and the messages sent by the state machine to - * itself via the message queue all have relatedId set to expectedMessageId. If the state machine is unitialized or - * in a terminal state, it will accept messages with relatedIds that are equal to -1. - * */ - private long expectedMessageId = -1; + private ExpectedMessage expectedMessage = ExpectedMessage.anyUnrelated(); - public StateMachine(MqFactory messageQueueFactory, + public StateMachine(MessageQueueFactory messageQueueFactory, String queueName, UUID instanceUUID, - AbstractStateGraph stateGraph) { + AbstractStateGraph stateGraph) + { this.queueName = queueName; smInbox = messageQueueFactory.createSynchronousInbox(queueName, instanceUUID); @@ -63,6 +59,10 @@ public class StateMachine { throw new IllegalArgumentException("State " + declaredState + " is not defined in the state graph"); } } + + resume(); + + smInbox.start(); } /** Listen to state changes */ @@ -96,6 +96,22 @@ public class StateMachine { } } + /** Wait for the state machine to reach a final state up to a given timeout. + */ + public void join(long timeout, TimeUnit timeUnit) throws InterruptedException, TimeoutException { + long deadline = System.currentTimeMillis() + timeUnit.toMillis(timeout); + + synchronized (this) { + if (null == state) + return; + + while (!state.isFinal()) { + if (deadline <= System.currentTimeMillis()) + throw new TimeoutException("Timeout waiting for state machine to reach final state"); + wait(100); + } + } + } /** Initialize the state machine. */ public void init() throws Exception { @@ -106,8 +122,7 @@ public class StateMachine { notifyAll(); } - smInbox.start(); - smOutbox.notify(expectedMessageId, transition.state(), transition.message()); + smOutbox.notify(transition.state(), transition.message()); } /** Initialize the state machine. 
*/ @@ -119,41 +134,66 @@ public class StateMachine { notifyAll(); } - smInbox.start(); - smOutbox.notify(expectedMessageId, transition.state(), transition.message()); + smOutbox.notify(transition.state(), transition.message()); } /** Resume the state machine from the last known state. */ - public void resume() throws Exception { + private void resume() { + // We only permit resuming from the unitialized state if (state != null) { return; } - var messages = smInbox.replay(1); - if (messages.isEmpty()) { - init(); + // Fetch the last messages from the inbox + var message = smInbox.replay(5) + .stream() + .filter(m -> (m.state() == MqMessageState.NEW) || (m.state() == MqMessageState.ACK)) + .findFirst(); + + if (message.isEmpty()) { + // No messages in the inbox, so start in a terminal state + expectedMessage = ExpectedMessage.anyUnrelated(); + state = finalState; return; } - var firstMessage = messages.get(0); + var firstMessage = message.get(); var resumeState = allStates.get(firstMessage.function()); - smInbox.start(); logger.info("Resuming state machine from {}({})/{}", firstMessage.function(), firstMessage.payload(), firstMessage.state()); - expectedMessageId = firstMessage.relatedId(); + expectedMessage = ExpectedMessage.expectThis(firstMessage); if (firstMessage.state() == MqMessageState.NEW) { // The message is not acknowledged, so starting the inbox will trigger a state transition // We still need to set a state here so that the join() method works state = resumingState; - } else if (resumeState.resumeBehavior().equals(ResumeBehavior.ERROR)) { - // The message is acknowledged, but the state does not support resuming - smOutbox.notify(expectedMessageId, "ERROR", "Illegal resumption from ACK'ed state " + firstMessage.function()); - } else { - // The message is already acknowledged, so we replay the last state - onStateTransition(firstMessage); + } + else if (firstMessage.state() == MqMessageState.ACK) { + resumeFromAck(resumeState, firstMessage); + } + } + + 
private void resumeFromAck(MachineState resumeState, + MqMessage message) + { + try { + if (resumeState.resumeBehavior().equals(ResumeBehavior.ERROR)) { + // The message is acknowledged, but the state does not support resuming + smOutbox.notify(expectedMessage.id, "ERROR", "Illegal resumption from ACK'ed state " + message.function()); + } else { + + this.state = resumeState; + + // The message is already acknowledged, we flag it as dead and then send an identical message + smOutbox.flagAsDead(message.msgId()); + expectedMessage = ExpectedMessage.responseTo(message); + smOutbox.notify(message.msgId(), message.function(), message.payload()); + } + } + catch (Exception e) { + logger.error("Failed to replay state", e); } } @@ -165,13 +205,14 @@ public class StateMachine { private void onStateTransition(MqMessage msg) { final String nextState = msg.function(); final String data = msg.payload(); - final long messageId = msg.msgId(); + final long relatedId = msg.relatedId(); - if (expectedMessageId != relatedId) { + if (!expectedMessage.isExpected(msg)) { // We've received a message that we didn't expect, throwing an exception will cause it to be flagged // as an error in the message queue; the message queue will proceed - throw new IllegalStateException("Unexpected message id " + relatedId + ", expected " + expectedMessageId); + + throw new IllegalStateException("Unexpected message id " + relatedId + ", expected " + expectedMessage.id); } try { @@ -193,13 +234,15 @@ public class StateMachine { } if (!state.isFinal()) { + logger.info("Transitining from state {}", state.name()); var transition = state.next(msg.payload()); - expectedMessageId = messageId; - smOutbox.notify(expectedMessageId, transition.state(), transition.message()); + expectedMessage = ExpectedMessage.responseTo(msg); + smOutbox.notify(expectedMessage.id, transition.state(), transition.message()); } else { - expectedMessageId = -1; + // On terminal transition, we expect any message + expectedMessage = 
ExpectedMessage.anyUnrelated(); } } catch (Exception e) { @@ -234,8 +277,41 @@ public class StateMachine { stateChangeListeners.forEach(l -> l.accept(msg.function(), msg.payload())); } catch (Exception ex) { - ex.printStackTrace(); + // Rethrowing this will flag the message as an error in the message queue + throw new RuntimeException("Error in state change listener", ex); } } } } + +/** ExpectedMessage guards against spurious state changes being triggered by old messages in the queue + * + * It contains the message id of the last message that was processed, and the messages sent by the state machine to + * itself via the message queue all have relatedId set to expectedMessageId. If the state machine is unitialized or + * in a terminal state, it will accept messages with relatedIds that are equal to -1. + * */ +class ExpectedMessage { + public final long id; + public ExpectedMessage(long id) { + this.id = id; + } + + public static ExpectedMessage expectThis(MqMessage message) { + return new ExpectedMessage(message.relatedId()); + } + + public static ExpectedMessage responseTo(MqMessage message) { + return new ExpectedMessage(message.msgId()); + } + + public static ExpectedMessage anyUnrelated() { + return new ExpectedMessage(-1); + } + + public boolean isExpected(MqMessage message) { + if (id < 0) + return true; + + return id == message.relatedId(); + } +} \ No newline at end of file diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/MachineState.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/MachineState.java index ec3c26ff..84a0b11c 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/MachineState.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/MachineState.java @@ -10,4 +10,5 @@ public interface MachineState { ResumeBehavior resumeBehavior(); boolean isFinal(); + } diff --git 
a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineErrorTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineErrorTest.java index f41a7dbd..863e1ce0 100644 --- a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineErrorTest.java +++ b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineErrorTest.java @@ -3,7 +3,7 @@ package nu.marginalia.mqsm; import com.google.gson.GsonBuilder; import com.zaxxer.hikari.HikariConfig; import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.mq.MqFactory; +import nu.marginalia.mq.MessageQueueFactory; import nu.marginalia.mq.MqMessageRow; import nu.marginalia.mq.MqTestUtil; import nu.marginalia.mq.persistence.MqPersistence; @@ -11,17 +11,21 @@ import nu.marginalia.mqsm.graph.GraphState; import nu.marginalia.mqsm.graph.AbstractStateGraph; import nu.marginalia.mqsm.graph.ResumeBehavior; import org.junit.jupiter.api.*; +import org.junit.jupiter.api.parallel.Execution; import org.testcontainers.containers.MariaDBContainer; import org.testcontainers.junit.jupiter.Container; import org.testcontainers.junit.jupiter.Testcontainers; import java.util.List; import java.util.UUID; +import java.util.concurrent.TimeUnit; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD; @Tag("slow") @Testcontainers +@Execution(SAME_THREAD) public class StateMachineErrorTest { @Container static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") @@ -33,7 +37,7 @@ public class StateMachineErrorTest { static HikariDataSource dataSource; static MqPersistence persistence; - static MqFactory messageQueueFactory; + static MessageQueueFactory messageQueueFactory; private String inboxId; @BeforeEach @@ -49,7 +53,7 @@ public class StateMachineErrorTest { dataSource = new HikariDataSource(config); persistence = new MqPersistence(dataSource); - messageQueueFactory = new 
MqFactory(persistence); + messageQueueFactory = new MessageQueueFactory(persistence); } @AfterAll @@ -85,7 +89,7 @@ public class StateMachineErrorTest { sm.init(); - sm.join(); + sm.join(2, TimeUnit.SECONDS); sm.stop(); List states = MqTestUtil.getMessages(dataSource, inboxId) diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineResumeTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineResumeTest.java index bf4e9990..dadc8b87 100644 --- a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineResumeTest.java +++ b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineResumeTest.java @@ -3,7 +3,7 @@ package nu.marginalia.mqsm; import com.google.gson.GsonBuilder; import com.zaxxer.hikari.HikariConfig; import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.mq.MqFactory; +import nu.marginalia.mq.MessageQueueFactory; import nu.marginalia.mq.MqMessageRow; import nu.marginalia.mq.MqMessageState; import nu.marginalia.mq.MqTestUtil; @@ -12,17 +12,21 @@ import nu.marginalia.mqsm.graph.GraphState; import nu.marginalia.mqsm.graph.AbstractStateGraph; import nu.marginalia.mqsm.graph.ResumeBehavior; import org.junit.jupiter.api.*; +import org.junit.jupiter.api.parallel.Execution; import org.testcontainers.containers.MariaDBContainer; import org.testcontainers.junit.jupiter.Container; import org.testcontainers.junit.jupiter.Testcontainers; import java.util.List; import java.util.UUID; +import java.util.concurrent.TimeUnit; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD; @Tag("slow") @Testcontainers +@Execution(SAME_THREAD) public class StateMachineResumeTest { @Container static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") @@ -34,7 +38,7 @@ public class StateMachineResumeTest { static HikariDataSource dataSource; static MqPersistence persistence; - static MqFactory 
messageQueueFactory; + static MessageQueueFactory messageQueueFactory; private String inboxId; @BeforeEach @@ -50,7 +54,7 @@ public class StateMachineResumeTest { dataSource = new HikariDataSource(config); persistence = new MqPersistence(dataSource); - messageQueueFactory = new MqFactory(persistence); + messageQueueFactory = new MessageQueueFactory(persistence); } @AfterAll @@ -79,13 +83,12 @@ public class StateMachineResumeTest { @Test public void smResumeResumableFromNew() throws Exception { var stateFactory = new StateFactory(new GsonBuilder().create()); - var sm = new StateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); + persistence.sendNewMessage(inboxId, null, -1L, "RESUMABLE", "", null); + var sm = new StateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); - sm.resume(); - - sm.join(); + sm.join(2, TimeUnit.SECONDS); sm.stop(); List states = MqTestUtil.getMessages(dataSource, inboxId) @@ -100,14 +103,13 @@ public class StateMachineResumeTest { @Test public void smResumeFromAck() throws Exception { var stateFactory = new StateFactory(new GsonBuilder().create()); - var sm = new StateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); long id = persistence.sendNewMessage(inboxId, null, -1L, "RESUMABLE", "", null); persistence.updateMessageState(id, MqMessageState.ACK); - sm.resume(); + var sm = new StateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); - sm.join(); + sm.join(4, TimeUnit.SECONDS); sm.stop(); List states = MqTestUtil.getMessages(dataSource, inboxId) @@ -116,20 +118,20 @@ public class StateMachineResumeTest { .map(MqMessageRow::function) .toList(); - assertEquals(List.of("RESUMABLE", "NON-RESUMABLE", "OK", "END"), states); + assertEquals(List.of("RESUMABLE", "RESUMABLE", "NON-RESUMABLE", "OK", "END"), states); } @Test public void smResumeNonResumableFromNew() throws 
Exception { var stateFactory = new StateFactory(new GsonBuilder().create()); - var sm = new StateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); + persistence.sendNewMessage(inboxId, null, -1L, "NON-RESUMABLE", "", null); - sm.resume(); + var sm = new StateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); - sm.join(); + sm.join(2, TimeUnit.SECONDS); sm.stop(); List states = MqTestUtil.getMessages(dataSource, inboxId) @@ -144,14 +146,14 @@ public class StateMachineResumeTest { @Test public void smResumeNonResumableFromAck() throws Exception { var stateFactory = new StateFactory(new GsonBuilder().create()); - var sm = new StateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); + long id = persistence.sendNewMessage(inboxId, null, null, "NON-RESUMABLE", "", null); persistence.updateMessageState(id, MqMessageState.ACK); - sm.resume(); + var sm = new StateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); - sm.join(); + sm.join(2, TimeUnit.SECONDS); sm.stop(); List states = MqTestUtil.getMessages(dataSource, inboxId) @@ -166,11 +168,11 @@ public class StateMachineResumeTest { @Test public void smResumeEmptyQueue() throws Exception { var stateFactory = new StateFactory(new GsonBuilder().create()); + + var sm = new StateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); - sm.resume(); - - sm.join(); + sm.join(2, TimeUnit.SECONDS); sm.stop(); List states = MqTestUtil.getMessages(dataSource, inboxId) @@ -179,6 +181,6 @@ public class StateMachineResumeTest { .map(MqMessageRow::function) .toList(); - assertEquals(List.of("INITIAL", "RESUMABLE", "NON-RESUMABLE", "OK", "END"), states); + assertEquals(List.of(), states); } } diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineTest.java 
b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineTest.java index e8dcaa83..360df468 100644 --- a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineTest.java +++ b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineTest.java @@ -3,22 +3,26 @@ package nu.marginalia.mqsm; import com.google.gson.GsonBuilder; import com.zaxxer.hikari.HikariConfig; import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.mq.MqFactory; +import nu.marginalia.mq.MessageQueueFactory; import nu.marginalia.mq.MqTestUtil; import nu.marginalia.mq.persistence.MqPersistence; import nu.marginalia.mqsm.graph.GraphState; import nu.marginalia.mqsm.graph.AbstractStateGraph; import org.junit.jupiter.api.*; +import org.junit.jupiter.api.parallel.Execution; import org.testcontainers.containers.MariaDBContainer; import org.testcontainers.junit.jupiter.Container; import org.testcontainers.junit.jupiter.Testcontainers; import java.util.UUID; +import java.util.concurrent.TimeUnit; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD; @Tag("slow") @Testcontainers +@Execution(SAME_THREAD) public class StateMachineTest { @Container static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") @@ -30,7 +34,7 @@ public class StateMachineTest { static HikariDataSource dataSource; static MqPersistence persistence; - static MqFactory messageQueueFactory; + static MessageQueueFactory messageQueueFactory; private String inboxId; @BeforeEach @@ -46,7 +50,7 @@ public class StateMachineTest { dataSource = new HikariDataSource(config); persistence = new MqPersistence(dataSource); - messageQueueFactory = new MqFactory(persistence); + messageQueueFactory = new MessageQueueFactory(persistence); } @AfterAll @@ -91,7 +95,7 @@ public class StateMachineTest { sm.init(); - sm.join(); + sm.join(2, TimeUnit.SECONDS); sm.stop(); MqTestUtil.getMessages(dataSource, 
inboxId).forEach(System.out::println); @@ -111,8 +115,7 @@ public class StateMachineTest { System.out.println("-------------------- "); var sm2 = new StateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new TestGraph(stateFactory)); - sm2.resume(); - sm2.join(); + sm2.join(2, TimeUnit.SECONDS); sm2.stop(); MqTestUtil.getMessages(dataSource, inboxId).forEach(System.out::println); @@ -121,7 +124,6 @@ public class StateMachineTest { @Test public void testFalseTransition() throws Exception { var stateFactory = new StateFactory(new GsonBuilder().create()); - var sm = new StateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new TestGraph(stateFactory)); // Prep the queue with a message to set the state to initial, // and an additional message to trigger the false transition back to initial @@ -129,11 +131,11 @@ public class StateMachineTest { persistence.sendNewMessage(inboxId, null, null, "INITIAL", "", null); persistence.sendNewMessage(inboxId, null, null, "INITIAL", "", null); - sm.resume(); + var sm = new StateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new TestGraph(stateFactory)); Thread.sleep(50); - sm.join(); + sm.join(2, TimeUnit.SECONDS); sm.stop(); MqTestUtil.getMessages(dataSource, inboxId).forEach(System.out::println); diff --git a/code/common/service/src/main/java/nu/marginalia/service/server/BaseServiceParams.java b/code/common/service/src/main/java/nu/marginalia/service/server/BaseServiceParams.java index 2ff07b55..73706dc8 100644 --- a/code/common/service/src/main/java/nu/marginalia/service/server/BaseServiceParams.java +++ b/code/common/service/src/main/java/nu/marginalia/service/server/BaseServiceParams.java @@ -2,7 +2,7 @@ package nu.marginalia.service.server; import com.google.inject.Inject; import com.google.inject.Singleton; -import nu.marginalia.mq.MqFactory; +import nu.marginalia.mq.MessageQueueFactory; import nu.marginalia.service.control.ServiceEventLog; import nu.marginalia.service.control.ServiceHeartbeat; 
import nu.marginalia.service.module.ServiceConfiguration; @@ -15,14 +15,14 @@ public class BaseServiceParams { public final MetricsServer metricsServer; public final ServiceHeartbeat heartbeat; public final ServiceEventLog eventLog; - public final MqFactory messageQueueInboxFactory; + public final MessageQueueFactory messageQueueInboxFactory; @Inject public BaseServiceParams(ServiceConfiguration configuration, Initialization initialization, MetricsServer metricsServer, ServiceHeartbeat heartbeat, ServiceEventLog eventLog, - MqFactory messageQueueInboxFactory) { + MessageQueueFactory messageQueueInboxFactory) { this.configuration = configuration; this.initialization = initialization; this.metricsServer = metricsServer; diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/MessageQueueEntry.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/MessageQueueEntry.java index c3de8cca..f11591ac 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/MessageQueueEntry.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/MessageQueueEntry.java @@ -15,12 +15,37 @@ public record MessageQueueEntry ( ) { public String ownerInstance() { + if (ownerInstanceFull == null) { + return ""; + } + return ownerInstanceFull.substring(0, 8); } public String ownerInstanceColor() { + if (ownerInstanceFull == null) { + return "#000000"; + } return '#' + ownerInstanceFull.substring(0, 6); } public String ownerInstanceColor2() { + if (ownerInstanceFull == null) { + return "#000000"; + } + return '#' + ownerInstanceFull.substring(25, 31); } + + public String stateCode() { + if (state == null) { + return ""; + } + return switch (state) { + case "NEW" -> "\uD83D\uDC23"; + case "ACK" -> "\uD83D\uDD27"; + case "ERR" -> "\u274C"; + case "OK" -> "\u2705"; + case "DEAD" -> "\uD83D\uDC80"; + default -> ""; + }; + } } diff --git 
a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ControlProcesses.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ControlProcesses.java index e553d7df..35987f14 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ControlProcesses.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ControlProcesses.java @@ -5,7 +5,7 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.control.model.ControlProcess; import nu.marginalia.model.gson.GsonFactory; -import nu.marginalia.mq.MqFactory; +import nu.marginalia.mq.MessageQueueFactory; import nu.marginalia.mqsm.StateMachine; import nu.marginalia.mqsm.graph.AbstractStateGraph; import nu.marginalia.service.control.ServiceEventLog; @@ -19,11 +19,11 @@ import java.util.UUID; public class ControlProcesses { private final ServiceEventLog eventLog; private final Gson gson; - private final MqFactory messageQueueFactory; + private final MessageQueueFactory messageQueueFactory; public Map stateMachines = new HashMap<>(); @Inject - public ControlProcesses(MqFactory messageQueueFactory, + public ControlProcesses(MessageQueueFactory messageQueueFactory, GsonFactory gsonFactory, BaseServiceParams baseServiceParams, RepartitionReindexProcess repartitionReindexProcess, @@ -60,8 +60,4 @@ public class ControlProcesses { stateMachines.get(process).init(gson.toJson(arg)); } - public void resume(ControlProcess process) throws Exception { - eventLog.logEvent("FSM-RESUME", process.id()); - stateMachines.get(process).resume(); - } } diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ReconvertAndLoadProcess.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ReconvertAndLoadProcess.java index be4b22ca..f0abc5a4 100644 --- 
a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ReconvertAndLoadProcess.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ReconvertAndLoadProcess.java @@ -41,7 +41,8 @@ public class ReconvertAndLoadProcess extends AbstractStateGraph { ProcessService processService, IndexClient indexClient, SearchClient searchClient - ) { + ) + { super(stateFactory); this.processService = processService; this.mqIndexOutbox = indexClient.outbox(); diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/RepartitionReindexProcess.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/RepartitionReindexProcess.java index ef76e654..c668d230 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/RepartitionReindexProcess.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/RepartitionReindexProcess.java @@ -9,6 +9,7 @@ import nu.marginalia.mq.outbox.MqOutbox; import nu.marginalia.mqsm.StateFactory; import nu.marginalia.mqsm.graph.AbstractStateGraph; import nu.marginalia.mqsm.graph.GraphState; +import nu.marginalia.mqsm.graph.ResumeBehavior; @Singleton public class RepartitionReindexProcess extends AbstractStateGraph { @@ -26,7 +27,8 @@ public class RepartitionReindexProcess extends AbstractStateGraph { @Inject - public RepartitionReindexProcess(StateFactory stateFactory, IndexClient indexClient) { + public RepartitionReindexProcess(StateFactory stateFactory, + IndexClient indexClient) { super(stateFactory); indexOutbox = indexClient.outbox(); @@ -46,7 +48,7 @@ public class RepartitionReindexProcess extends AbstractStateGraph { return indexOutbox.sendAsync(IndexMqEndpoints.INDEX_REPARTITION, ""); } - @GraphState(name = REPARTITION_REPLY, next = REINDEX) + @GraphState(name = REPARTITION_REPLY, next = REINDEX, resume = ResumeBehavior.RETRY) public void 
repartitionReply(Long id) throws Exception { var rsp = indexOutbox.waitResponse(id); @@ -60,7 +62,7 @@ public class RepartitionReindexProcess extends AbstractStateGraph { return indexOutbox.sendAsync(IndexMqEndpoints.INDEX_REINDEX, ""); } - @GraphState(name = REINDEX_REPLY, next = END) + @GraphState(name = REINDEX_REPLY, next = END, resume = ResumeBehavior.RETRY) public void reindexReply(Long id) throws Exception { var rsp = indexOutbox.waitResponse(id); diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/message-queue.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/message-queue.hdb index 16597fb6..b6633eb8 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/message-queue.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/message-queue.hdb @@ -33,11 +33,10 @@ {{senderInbox}} {{function}} -    - {{ownerInstance}} +    {{ownerInstance}} {{ownerTick}} - {{state}} + {{stateCode}} {{state}} {{createdTime}} {{updatedTime}} {{ttl}} From 825fd10efa510493529b9243e28c0274be917339 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 13 Jul 2023 15:14:04 +0200 Subject: [PATCH 047/157] (control) Clean up the MQ ui a bit --- .../templates/control/message-queue.hdb | 31 +++++++++---------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/message-queue.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/message-queue.hdb index b6633eb8..d0dee31c 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/message-queue.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/message-queue.hdb @@ -13,35 +13,32 @@ - - - - + + + - - - - - - + + {{#each messages}} + - - - - - - - + + + + + + + + {{/each}}
    Message IDRelated IDRecipientSenderState
    TTL
    Msg ID
    Related ID
    Recipient
    Sender
    FunctionOwner InstanceOwner TickStateCreated TimeUpdated TimeTTLOwner Instance
    Owner Tick
    Created
    Updated
    {{stateCode}} {{state}} {{id}}{{relatedId}} {{recipientInbox}}{{senderInbox}} {{function}}    {{ownerInstance}} {{ownerTick}}{{stateCode}} {{state}} {{createdTime}}{{updatedTime}}{{ttl}}
    {{ttl}}{{relatedId}}{{senderInbox}}{{ownerTick}}{{updatedTime}}
    From 0960e18f8ef460c8418e1020f34afe866973ee63 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 13 Jul 2023 15:44:36 +0200 Subject: [PATCH 048/157] (control) Auto-refreshing tables --- .../main/resources/static/control/refresh.js | 17 +++++++++++++++++ .../main/resources/templates/control/events.hdb | 10 ++++++++-- .../main/resources/templates/control/index.hdb | 3 ++- .../templates/control/message-queue.hdb | 8 +++++++- .../resources/templates/control/processes.hdb | 6 ++++++ .../resources/templates/control/services.hdb | 6 ++++++ 6 files changed, 46 insertions(+), 4 deletions(-) create mode 100644 code/services-satellite/control-service/src/main/resources/static/control/refresh.js diff --git a/code/services-satellite/control-service/src/main/resources/static/control/refresh.js b/code/services-satellite/control-service/src/main/resources/static/control/refresh.js new file mode 100644 index 00000000..457bb0e6 --- /dev/null +++ b/code/services-satellite/control-service/src/main/resources/static/control/refresh.js @@ -0,0 +1,17 @@ +function refresh(ids) { + fetch(window.location.href) + .then(response => response.text()) + .then(html => { + const parser = new DOMParser(); + const newDocument = parser.parseFromString(html, "text/html"); + console.log(newDocument); + + ids.forEach(id => { + const newElement = newDocument.getElementById(id); + document.getElementById(id).innerHTML = newDocument.getElementById(id).innerHTML; + }); + }) + .catch(error => { + console.error("Error fetching webpage:", error); + }); +} \ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/events.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/events.hdb index 9791fad3..83d5f449 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/events.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/events.hdb @@ -11,7 +11,7 
@@

    Events

    - +
    @@ -34,4 +34,10 @@
    Service Name Instance
    - \ No newline at end of file + + + diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/index.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/index.hdb index 6ca3119f..71647683 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/index.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/index.hdb @@ -11,4 +11,5 @@

    Overview

    - \ No newline at end of file + + diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/message-queue.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/message-queue.hdb index d0dee31c..c46193b4 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/message-queue.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/message-queue.hdb @@ -11,7 +11,7 @@

    Message Queue

    - +
    @@ -43,4 +43,10 @@
    State
    TTL
    Msg ID
    Related ID
    + + \ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/processes.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/processes.hdb index c1225dd6..53902f39 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/processes.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/processes.hdb @@ -33,4 +33,10 @@ + + \ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/services.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/services.hdb index 57800f7c..a09d5c27 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/services.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/services.hdb @@ -29,4 +29,10 @@ + + \ No newline at end of file From 948d4d5f08c63453091586c4318b4c62abcf81d4 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 13 Jul 2023 17:24:21 +0200 Subject: [PATCH 049/157] (control) Clean up the number of GUI views, abortable FSM tasks --- .../java/nu/marginalia/mqsm/StateMachine.java | 47 +++++++++++++++-- .../nu/marginalia/control/ControlService.java | 28 ++++++++-- .../control/model/ControlProcessState.java | 12 +++++ .../control/model/MessageQueueEntry.java | 1 + .../control/process/ControlProcesses.java | 21 ++++++++ .../control/svc/MessageQueueViewService.java | 3 +- .../resources/templates/control/events.hdb | 43 --------------- .../templates/control/message-queue.hdb | 52 ------------------- .../control/partials/events-table.hdb | 23 ++++++++ .../templates/control/partials/fsm-table.hdb | 23 ++++++++ .../control/partials/message-queue-table.hdb | 32 ++++++++++++ .../templates/control/partials/nav.hdb | 2 - .../control/partials/processes-table.hdb | 23 ++++++++ .../control/partials/services-table.hdb | 18 +++++++ 
.../resources/templates/control/processes.hdb | 28 ++-------- .../resources/templates/control/services.hdb | 23 ++------ 16 files changed, 229 insertions(+), 150 deletions(-) create mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ControlProcessState.java delete mode 100644 code/services-satellite/control-service/src/main/resources/templates/control/events.hdb delete mode 100644 code/services-satellite/control-service/src/main/resources/templates/control/message-queue.hdb create mode 100644 code/services-satellite/control-service/src/main/resources/templates/control/partials/events-table.hdb create mode 100644 code/services-satellite/control-service/src/main/resources/templates/control/partials/fsm-table.hdb create mode 100644 code/services-satellite/control-service/src/main/resources/templates/control/partials/message-queue-table.hdb create mode 100644 code/services-satellite/control-service/src/main/resources/templates/control/partials/processes-table.hdb create mode 100644 code/services-satellite/control-service/src/main/resources/templates/control/partials/services-table.hdb diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java index 94118113..94e969a9 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java @@ -28,7 +28,11 @@ public class StateMachine { private final MqInboxIf smInbox; private final MqOutbox smOutbox; private final String queueName; - private MachineState state; + + + private volatile MachineState state; + private volatile ExpectedMessage expectedMessage = ExpectedMessage.anyUnrelated(); + private final MachineState errorState = new StateFactory.ErrorState(); private final MachineState finalState = new StateFactory.FinalState(); @@ -37,7 +41,6 @@ public class StateMachine 
{ private final List> stateChangeListeners = new ArrayList<>(); private final Map allStates = new HashMap<>(); - private ExpectedMessage expectedMessage = ExpectedMessage.anyUnrelated(); public StateMachine(MessageQueueFactory messageQueueFactory, String queueName, @@ -237,8 +240,13 @@ public class StateMachine { logger.info("Transitining from state {}", state.name()); var transition = state.next(msg.payload()); - expectedMessage = ExpectedMessage.responseTo(msg); - smOutbox.notify(expectedMessage.id, transition.state(), transition.message()); + if (!expectedMessage.isExpected(msg)) { + logger.warn("Expected message changed during execution, skipping state transition to {}", transition.state()); + } + else { + expectedMessage = ExpectedMessage.responseTo(msg); + smOutbox.notify(expectedMessage.id, transition.state(), transition.message()); + } } else { // On terminal transition, we expect any message @@ -258,6 +266,33 @@ public class StateMachine { } } + public MachineState getState() { + return state; + } + + public void abortExecution() throws Exception { + // Create a fake message to abort the execution + // This helps make sense of the queue when debugging + // and also permits the real termination message to have an + // unique expected ID + + long abortMsgId = smOutbox.notify(expectedMessage.id, "ABORT", "Aborting execution"); + + // Set it as dead to clean up the queue from mystery ACK messages + smOutbox.flagAsDead(abortMsgId); + + // Set the expected message to the abort message, + // technically there's a slight chance of a race condition here, + // which will cause this message to be ERR'd and the process to + // continue, but it's very unlikely and the worst that can happen + // is you have to abort twice. 
+ + expectedMessage = ExpectedMessage.expectId(abortMsgId); + + // Add a state transition to the final state + smOutbox.notify(abortMsgId, finalState.name(), ""); + } + private class StateEventSubscription implements MqSubscription { @Override @@ -308,6 +343,10 @@ class ExpectedMessage { return new ExpectedMessage(-1); } + public static ExpectedMessage expectId(long id) { + return new ExpectedMessage(id); + } + public boolean isExpected(MqMessage message) { if (id < 0) return true; diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java index 9d660a1e..88f51186 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java @@ -34,6 +34,7 @@ public class ControlService extends Service { private final MustacheRenderer> processesRenderer; private final MustacheRenderer> eventsRenderer; private final MustacheRenderer> messageQueueRenderer; + private final MustacheRenderer> fsmStateRenderer; private final MqPersistence messageQueuePersistence; private final StaticResources staticResources; private final MessageQueueMonitorService messageQueueMonitorService; @@ -61,6 +62,7 @@ public class ControlService extends Service { processesRenderer = rendererFactory.renderer("control/processes"); eventsRenderer = rendererFactory.renderer("control/events"); messageQueueRenderer = rendererFactory.renderer("control/message-queue"); + fsmStateRenderer = rendererFactory.renderer("control/fsm-states"); this.messageQueuePersistence = messageQueuePersistence; this.staticResources = staticResources; @@ -73,16 +75,34 @@ public class ControlService extends Service { Spark.get("/public/", (req, rsp) -> indexRenderer.render(Map.of())); - Spark.get("/public/services", (req, rsp) -> 
servicesRenderer.render(Map.of("heartbeats", heartbeatService.getServiceHeartbeats()))); - Spark.get("/public/processes", (req, rsp) -> processesRenderer.render(Map.of("heartbeats", heartbeatService.getProcessHeartbeats()))); - Spark.get("/public/events", (req, rsp) -> eventsRenderer.render(Map.of("events", eventLogService.getLastEntries(20)))); - Spark.get("/public/message-queue", (req, rsp) -> messageQueueRenderer.render(Map.of("messages", messageQueueViewService.getLastEntries(20)))); + Spark.get("/public/services", + (req, rsp) -> Map.of("services", heartbeatService.getServiceHeartbeats(), + "events", eventLogService.getLastEntries(20)), + (map) -> servicesRenderer.render((Map) map)); + + Spark.get("/public/processes", + (req, rsp) -> Map.of("processes", heartbeatService.getProcessHeartbeats(), + "fsms", controlProcesses.getFsmStates(), + "messages", messageQueueViewService.getLastEntries(20)), + (map) -> processesRenderer.render((Map) map)); + + Spark.post("/public/fsms/:fsm/start", (req, rsp) -> { + controlProcesses.start(ControlProcess.valueOf(req.params("fsm").toUpperCase())); + rsp.redirect("/processes"); + return ""; + }); + Spark.post("/public/fsms/:fsm/stop", (req, rsp) -> { + controlProcesses.stop(ControlProcess.valueOf(req.params("fsm").toUpperCase())); + rsp.redirect("/processes"); + return ""; + }); // TODO: This should be a POST Spark.get("/public/repartition", (req, rsp) -> { controlProcesses.start(ControlProcess.REPARTITION_REINDEX); return "OK"; }); + // TODO: This should be a POST Spark.get("/public/reconvert", (req, rsp) -> { controlProcesses.start(ControlProcess.RECONVERT_LOAD, "/samples/crawl-blogs/plan.yaml"); diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ControlProcessState.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ControlProcessState.java new file mode 100644 index 00000000..39d69ebd --- /dev/null +++ 
b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ControlProcessState.java @@ -0,0 +1,12 @@ +package nu.marginalia.control.model; + +public record ControlProcessState(String name, String state, boolean terminal) { + public String stateIcon() { + if (terminal) { + return "\uD83D\uDE34"; + } + else { + return "\uD83C\uDFC3"; + } + } +} diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/MessageQueueEntry.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/MessageQueueEntry.java index f11591ac..43c5bf07 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/MessageQueueEntry.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/MessageQueueEntry.java @@ -6,6 +6,7 @@ public record MessageQueueEntry ( String senderInbox, String recipientInbox, String function, + String payload, String ownerInstanceFull, long ownerTick, String state, diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ControlProcesses.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ControlProcesses.java index 35987f14..404bd273 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ControlProcesses.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ControlProcesses.java @@ -3,15 +3,19 @@ package nu.marginalia.control.process; import com.google.gson.Gson; import com.google.inject.Inject; import com.google.inject.Singleton; +import lombok.SneakyThrows; import nu.marginalia.control.model.ControlProcess; +import nu.marginalia.control.model.ControlProcessState; import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.mq.MessageQueueFactory; import nu.marginalia.mqsm.StateMachine; import nu.marginalia.mqsm.graph.AbstractStateGraph; +import 
nu.marginalia.mqsm.state.MachineState; import nu.marginalia.service.control.ServiceEventLog; import nu.marginalia.service.server.BaseServiceParams; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.UUID; @@ -60,4 +64,21 @@ public class ControlProcesses { stateMachines.get(process).init(gson.toJson(arg)); } + public List getFsmStates() { + return stateMachines.entrySet().stream().sorted(Map.Entry.comparingByKey()).map(e -> { + + final MachineState state = e.getValue().getState(); + + final String machineName = e.getKey().name(); + final String stateName = state.name(); + final boolean terminal = state.isFinal(); + + return new ControlProcessState(machineName, stateName, terminal); + }).toList(); + } + + @SneakyThrows + public void stop(ControlProcess fsm) { + stateMachines.get(fsm).abortExecution(); + } } diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueViewService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueViewService.java index 9531c0b4..439b1c2f 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueViewService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueViewService.java @@ -22,7 +22,7 @@ public class MessageQueueViewService { public List getLastEntries(int n) { try (var conn = dataSource.getConnection(); var query = conn.prepareStatement(""" - SELECT ID, RELATED_ID, SENDER_INBOX, RECIPIENT_INBOX, FUNCTION, OWNER_INSTANCE, OWNER_TICK, STATE, CREATED_TIME, UPDATED_TIME, TTL + SELECT ID, RELATED_ID, SENDER_INBOX, RECIPIENT_INBOX, FUNCTION, PAYLOAD, OWNER_INSTANCE, OWNER_TICK, STATE, CREATED_TIME, UPDATED_TIME, TTL FROM MESSAGE_QUEUE ORDER BY ID DESC LIMIT ? 
@@ -38,6 +38,7 @@ public class MessageQueueViewService { rs.getString("SENDER_INBOX"), rs.getString("RECIPIENT_INBOX"), rs.getString("FUNCTION"), + rs.getString("PAYLOAD"), rs.getString("OWNER_INSTANCE"), rs.getLong("OWNER_TICK"), rs.getString("STATE"), diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/events.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/events.hdb deleted file mode 100644 index 83d5f449..00000000 --- a/code/services-satellite/control-service/src/main/resources/templates/control/events.hdb +++ /dev/null @@ -1,43 +0,0 @@ - - - - Control Service - - - - - {{> control/partials/nav}} - -
    -

    Events

    - - - - - - - - - - {{#each events}} - - - - - - - - {{/each}} -
    Service NameInstanceEvent TimeTypeMessage
    {{serviceName}} -    - {{instance}} - {{eventTime}}{{eventType}}{{eventMessage}}
    -
    - - - - diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/message-queue.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/message-queue.hdb deleted file mode 100644 index c46193b4..00000000 --- a/code/services-satellite/control-service/src/main/resources/templates/control/message-queue.hdb +++ /dev/null @@ -1,52 +0,0 @@ - - - - Control Service - - - - - {{> control/partials/nav}} - -
    -

    Message Queue

    - - - - - - - - - - - {{#each messages}} - - - - - - - - - - - - - - - - - {{/each}} -
    State
    TTL
    Msg ID
    Related ID
    Recipient
    Sender
    FunctionOwner Instance
    Owner Tick
    Created
    Updated
    {{stateCode}} {{state}}{{id}}{{recipientInbox}}{{function}} -    {{ownerInstance}} - {{createdTime}}
    {{ttl}}{{relatedId}}{{senderInbox}}{{ownerTick}}{{updatedTime}}
    -
    - - - - \ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/partials/events-table.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/partials/events-table.hdb new file mode 100644 index 00000000..23324a13 --- /dev/null +++ b/code/services-satellite/control-service/src/main/resources/templates/control/partials/events-table.hdb @@ -0,0 +1,23 @@ +

    Events

    + + + + + + + + + + {{#each events}} + + + + + + + + {{/each}} +
    Service NameInstanceEvent TimeTypeMessage
    {{serviceName}} +    + {{instance}} + {{eventTime}}{{eventType}}{{eventMessage}}
    \ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/partials/fsm-table.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/partials/fsm-table.hdb new file mode 100644 index 00000000..c7b66e9a --- /dev/null +++ b/code/services-satellite/control-service/src/main/resources/templates/control/partials/fsm-table.hdb @@ -0,0 +1,23 @@ +

    FSMs

    + + + + + + + {{#each fsms}} + + + + + + {{/each}} +
    FSMStateAction
    {{name}}{{stateIcon}} {{state}} + {{#unless terminal}} +
    + {{/unless}} + {{#if terminal}} +
    + {{/if}} + +
    \ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/partials/message-queue-table.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/partials/message-queue-table.hdb new file mode 100644 index 00000000..cf584ab2 --- /dev/null +++ b/code/services-satellite/control-service/src/main/resources/templates/control/partials/message-queue-table.hdb @@ -0,0 +1,32 @@ +

    Message Queue

    + + + + + + + + + + + {{#each messages}} + + + + + + + + + + + + + + + + + {{/each}} +
    State
    TTL
    Msg ID
    Related ID
    Recipient
    Sender
    Function
    Payload
    Owner Instance
    Owner Tick
    Created
    Updated
    {{stateCode}} {{state}}{{id}}{{recipientInbox}}{{function}} +    {{ownerInstance}} + {{createdTime}}
    {{ttl}}{{relatedId}}{{senderInbox}}{{payload}}{{ownerTick}}{{updatedTime}}
    \ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/partials/nav.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/partials/nav.hdb index 771266f2..9b68f4b2 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/partials/nav.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/partials/nav.hdb @@ -3,7 +3,5 @@
  • Overview
  • Services
  • Processes
  • -
  • Events
  • -
  • Message Queue
\ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/partials/processes-table.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/partials/processes-table.hdb new file mode 100644 index 00000000..47d7dc64 --- /dev/null +++ b/code/services-satellite/control-service/src/main/resources/templates/control/partials/processes-table.hdb @@ -0,0 +1,23 @@ + +

Processes

+ + + + + + + + + {{#each processes}} + + + + + + + + {{/each}} +
Process IDUUIDStatusProgressLast Seen (ms)
{{processId}} +    + {{uuid}} + {{status}}{{#if progress}}{{progress}}%{{/if}}{{#unless isStopped}}{{lastSeenMillis}}{{/unless}}
\ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/partials/services-table.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/partials/services-table.hdb new file mode 100644 index 00000000..2137f1fe --- /dev/null +++ b/code/services-satellite/control-service/src/main/resources/templates/control/partials/services-table.hdb @@ -0,0 +1,18 @@ +

Services

+ + + + + + + {{#each services}} + + + + + + {{/each}} +
Service IDUUIDLast Seen (ms)
{{serviceId}} +    + {{uuid}} + {{lastSeenMillis}}
\ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/processes.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/processes.hdb index 53902f39..7d348be1 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/processes.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/processes.hdb @@ -7,36 +7,16 @@ {{> control/partials/nav}} -
-

Processes

- - - - - - - - - {{#each heartbeats}} - - - - - - - - {{/each}} -
Process IDUUIDStatusProgressLast Seen (ms)
{{processId}} -    - {{uuid}} - {{status}}{{#if progress}}{{progress}}%{{/if}}{{#unless isStopped}}{{lastSeenMillis}}{{/unless}}
+ {{> control/partials/processes-table}} + {{> control/partials/fsm-table}} + {{> control/partials/message-queue-table}}
\ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/services.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/services.hdb index a09d5c27..2c0542b9 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/services.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/services.hdb @@ -7,32 +7,15 @@ {{> control/partials/nav}} -
-

Services

- - - - - - - {{#each heartbeats}} - - - - - - {{/each}} -
Service IDUUIDLast Seen (ms)
{{serviceId}} -    - {{uuid}} - {{lastSeenMillis}}
+ {{> control/partials/services-table }} + {{> control/partials/events-table }}
\ No newline at end of file From d36e36c8fd9a794f2509c8aefdd70c3f0eac8fcc Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 14 Jul 2023 11:39:15 +0200 Subject: [PATCH 050/157] (mq) Bugfix lastNMessages; use Lists.reverse properly --- .../main/java/nu/marginalia/mq/persistence/MqPersistence.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java index 402d03f1..4f2cc564 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java @@ -308,8 +308,8 @@ public class MqPersistence { messages.add(msg); } - Lists.reverse(messages); - return messages; + // We want the last N messages in ascending order + return Lists.reverse(messages); } } From 23169ad818939435a45fb8825ca52b0b785f19c4 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 14 Jul 2023 11:40:05 +0200 Subject: [PATCH 051/157] (db) Model for file storage areas --- .../db/storage/FileStorageService.java | 240 ++++++++++++++++++ .../db/storage/model/FileStorage.java | 24 ++ .../db/storage/model/FileStorageBase.java | 25 ++ .../db/storage/model/FileStorageBaseId.java | 3 + .../db/storage/model/FileStorageBaseType.java | 7 + .../db/storage/model/FileStorageId.java | 3 + .../db/storage/model/FileStorageType.java | 11 + .../resources/sql/current/13-file-storage.sql | 35 +++ .../db/storage/FileStorageServiceTest.java | 155 +++++++++++ 9 files changed, 503 insertions(+) create mode 100644 code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageService.java create mode 100644 code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorage.java create mode 100644 code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageBase.java create mode 100644 
code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageBaseId.java create mode 100644 code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageBaseType.java create mode 100644 code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageId.java create mode 100644 code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageType.java create mode 100644 code/common/db/src/main/resources/sql/current/13-file-storage.sql create mode 100644 code/common/db/src/test/java/nu/marginalia/db/storage/FileStorageServiceTest.java diff --git a/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageService.java b/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageService.java new file mode 100644 index 00000000..75fa5ccf --- /dev/null +++ b/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageService.java @@ -0,0 +1,240 @@ +package nu.marginalia.db.storage; + +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.db.storage.model.*; + +import javax.inject.Inject; +import javax.inject.Singleton; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.attribute.PosixFilePermissions; +import java.sql.SQLException; + +/** Manages file storage for processes and services + */ +@Singleton +public class FileStorageService { + private final HikariDataSource dataSource; + + @Inject + public FileStorageService(HikariDataSource dataSource) { + this.dataSource = dataSource; + } + + /** @return the storage base with the given id, or null if it does not exist */ + public FileStorageBase getStorageBase(FileStorageBaseId type) throws SQLException { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT ID, NAME, PATH, TYPE, MUST_CLEAN, PERMIT_TEMP + FROM FILE_STORAGE_BASE WHERE ID = ? 
+ """)) { + stmt.setLong(1, type.id()); + try (var rs = stmt.executeQuery()) { + if (rs.next()) { + return new FileStorageBase( + new FileStorageBaseId(rs.getLong(1)), + FileStorageBaseType.valueOf(rs.getString(4)), + rs.getString(2), + rs.getString(3), + rs.getBoolean(5), + rs.getBoolean(6) + ); + } + } + } + return null; + } + + /** @return the storage base with the given type, or null if it does not exist */ + public FileStorageBase getStorageBase(FileStorageBaseType type) throws SQLException { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT ID, NAME, PATH, TYPE, MUST_CLEAN, PERMIT_TEMP + FROM FILE_STORAGE_BASE WHERE TYPE = ? + """)) { + stmt.setString(1, type.name()); + try (var rs = stmt.executeQuery()) { + if (rs.next()) { + return new FileStorageBase( + new FileStorageBaseId(rs.getLong(1)), + FileStorageBaseType.valueOf(rs.getString(4)), + rs.getString(2), + rs.getString(3), + rs.getBoolean(5), + rs.getBoolean(6) + ); + } + } + } + return null; + } + + public FileStorageBase createStorageBase(String name, Path path, FileStorageBaseType type, boolean mustClean, boolean permitTemp) throws SQLException, FileNotFoundException { + + if (!Files.exists(path)) { + throw new FileNotFoundException("Storage base path does not exist: " + path); + } + + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + INSERT INTO FILE_STORAGE_BASE(NAME, PATH, TYPE, MUST_CLEAN, PERMIT_TEMP) + VALUES (?, ?, ?, ?, ?) 
+ """)) { + stmt.setString(1, name); + stmt.setString(2, path.toString()); + stmt.setString(3, type.name()); + stmt.setBoolean(4, mustClean); + stmt.setBoolean(5, permitTemp); + + int update = stmt.executeUpdate(); + if (update < 0) { + throw new SQLException("Failed to create storage base"); + } + } + + return getStorageBase(type); + } + + /** Allocate a temporary storage of the given type if temporary allocation is permitted */ + public FileStorage allocateTemporaryStorage(FileStorageBase base, + FileStorageType type, + String prefix, + String description) throws IOException, SQLException + { + if (!base.permitTemp()) { + throw new IllegalArgumentException("Temporary storage not permitted in base " + base.name()); + } + + Path tempDir = Files.createTempDirectory(base.asPath(), prefix, + PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rwxr-xr-x")) + ); + + try (var conn = dataSource.getConnection(); + var update = conn.prepareStatement(""" + INSERT INTO FILE_STORAGE(PATH, TYPE, DESCRIPTION, BASE_ID) + VALUES (?, ?, ?, ?) + """); + var query = conn.prepareStatement(""" + SELECT ID FROM FILE_STORAGE WHERE PATH = ? AND BASE_ID = ? 
+ """) + ) { + update.setString(1, tempDir.toString()); + update.setString(2, type.name()); + update.setString(3, description); + update.setLong(4, base.id().id()); + + if (update.executeUpdate() < 1) + throw new SQLException("Failed to insert storage"); + + query.setString(1, tempDir.toString()); + query.setLong(2, base.id().id()); + var rs = query.executeQuery(); + + if (rs.next()) { + return new FileStorage( + new FileStorageId(rs.getLong("ID")), + base, + type, + tempDir.toString(), + description + ); + } + + } + + throw new SQLException("Failed to insert storage"); + } + + + /** Allocate permanent storage in base */ + public FileStorage allocatePermanentStorage(FileStorageBase base, String relativePath, FileStorageType type, String description) throws IOException, SQLException { + + Path newDir = base.asPath().resolve(relativePath); + + if (Files.exists(newDir)) { + throw new IllegalArgumentException("Storage already exists: " + newDir); + } + + Files.createDirectory(newDir, PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rwxr-xr-x"))); + + try (var conn = dataSource.getConnection(); + var update = conn.prepareStatement(""" + INSERT INTO FILE_STORAGE(PATH, TYPE, DESCRIPTION, BASE_ID) + VALUES (?, ?, ?, ?) + """); + var query = conn.prepareStatement(""" + SELECT ID + FROM FILE_STORAGE WHERE PATH = ? AND BASE_ID = ? 
+ """) + ) { + update.setString(1, relativePath); + update.setString(2, type.name()); + update.setString(3, description); + update.setLong(4, base.id().id()); + + if (update.executeUpdate() < 1) + throw new SQLException("Failed to insert storage"); + + query.setString(1, relativePath); + query.setLong(2, base.id().id()); + var rs = query.executeQuery(); + + if (rs.next()) { + return new FileStorage( + new FileStorageId(rs.getLong("ID")), + base, + type, + newDir.toString(), + description + ); + } + + } + + throw new SQLException("Failed to insert storage"); + } + + /** @return the storage with the given id, or null if it does not exist */ + public FileStorage getStorage(FileStorageId id) throws SQLException { + + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT PATH, TYPE, DESCRIPTION, ID, BASE_ID + FROM FILE_STORAGE_VIEW WHERE ID = ? + """)) { + stmt.setLong(1, id.id()); + + long storageId; + long baseId; + String path; + String description; + FileStorageType type; + + try (var rs = stmt.executeQuery()) { + if (rs.next()) { + baseId = rs.getLong("BASE_ID"); + storageId = rs.getLong("ID"); + type = FileStorageType.valueOf(rs.getString("TYPE")); + path = rs.getString("PATH"); + description = rs.getString("DESCRIPTION"); + } + else { + return null; + } + + var base = getStorageBase(new FileStorageBaseId(baseId)); + + return new FileStorage( + new FileStorageId(storageId), + base, + type, + path, + description + ); + } + } + } + +} diff --git a/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorage.java b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorage.java new file mode 100644 index 00000000..3a619809 --- /dev/null +++ b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorage.java @@ -0,0 +1,24 @@ +package nu.marginalia.db.storage.model; + +import java.nio.file.Path; + +/** + * Represents a file storage area + * + * @param id the id of the storage in the database + * 
@param base the base of the storage + * @param type the type of data expected + * @param path the full path of the storage on disk + * @param description a description of the storage + */ +public record FileStorage( + FileStorageId id, + FileStorageBase base, + FileStorageType type, + String path, + String description) +{ + public Path asPath() { + return Path.of(path); + } +} diff --git a/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageBase.java b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageBase.java new file mode 100644 index 00000000..96f09698 --- /dev/null +++ b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageBase.java @@ -0,0 +1,25 @@ +package nu.marginalia.db.storage.model; + +import java.nio.file.Path; + +/** + * Represents a file storage base directory + * + * @param id the id of the storage base in the database + * @param type the type of the storage base + * @param name the name of the storage base + * @param path the path of the storage base + * @param mustClean if true, the storage is small and *must* be cleaned after use + * @param permitTemp if true, the storage may be used for temporary files + */ +public record FileStorageBase(FileStorageBaseId id, + FileStorageBaseType type, + String name, + String path, + boolean mustClean, + boolean permitTemp + ) { + public Path asPath() { + return Path.of(path); + } +} diff --git a/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageBaseId.java b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageBaseId.java new file mode 100644 index 00000000..e4dbaf68 --- /dev/null +++ b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageBaseId.java @@ -0,0 +1,3 @@ +package nu.marginalia.db.storage.model; + +public record FileStorageBaseId(long id) {} diff --git a/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageBaseType.java 
b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageBaseType.java new file mode 100644 index 00000000..df9f497f --- /dev/null +++ b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageBaseType.java @@ -0,0 +1,7 @@ +package nu.marginalia.db.storage.model; + +public enum FileStorageBaseType { + SSD_INDEX, + SSD_WORK, + SLOW +} diff --git a/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageId.java b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageId.java new file mode 100644 index 00000000..da8849ff --- /dev/null +++ b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageId.java @@ -0,0 +1,3 @@ +package nu.marginalia.db.storage.model; + +public record FileStorageId(long id) {} diff --git a/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageType.java b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageType.java new file mode 100644 index 00000000..04d5cc81 --- /dev/null +++ b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageType.java @@ -0,0 +1,11 @@ +package nu.marginalia.db.storage.model; + +public enum FileStorageType { + CRAWL_SPEC, + CRAWL_DATA, + PROCESSED_DATA, + INDEX_STAGING, + LEXICON_STAGING, + INDEX_LIVE, + LEXICON_LIVE +} diff --git a/code/common/db/src/main/resources/sql/current/13-file-storage.sql b/code/common/db/src/main/resources/sql/current/13-file-storage.sql new file mode 100644 index 00000000..c09b140b --- /dev/null +++ b/code/common/db/src/main/resources/sql/current/13-file-storage.sql @@ -0,0 +1,35 @@ +CREATE TABLE IF NOT EXISTS FILE_STORAGE_BASE ( + ID BIGINT PRIMARY KEY AUTO_INCREMENT, + NAME VARCHAR(255) NOT NULL UNIQUE, + PATH VARCHAR(255) NOT NULL UNIQUE COMMENT 'The path to the storage base', + TYPE ENUM ('SSD_INDEX', 'SSD_WORK', 'SLOW') NOT NULL, + MUST_CLEAN BOOLEAN NOT NULL DEFAULT FALSE COMMENT 'If true, the storage must be cleaned after use', + PERMIT_TEMP BOOLEAN NOT NULL 
DEFAULT FALSE COMMENT 'If true, the storage can be used for temporary files' +) +CHARACTER SET utf8mb4 +COLLATE utf8mb4_bin; + +CREATE TABLE IF NOT EXISTS FILE_STORAGE ( + ID BIGINT PRIMARY KEY AUTO_INCREMENT, + BASE_ID BIGINT NOT NULL, + PATH VARCHAR(255) NOT NULL COMMENT 'The path to the storage relative to the base', + DESCRIPTION VARCHAR(255) NOT NULL, + TYPE ENUM ('CRAWL_SPEC', 'CRAWL_DATA', 'PROCESSED_DATA', 'INDEX_STAGING', 'LEXICON_STAGING', 'INDEX_LIVE', 'LEXICON_LIVE') NOT NULL, + DO_PURGE BOOLEAN NOT NULL DEFAULT FALSE COMMENT 'If true, the storage may be cleaned', + CREATE_DATE TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6), + CONSTRAINT CONS UNIQUE (BASE_ID, PATH), + FOREIGN KEY (BASE_ID) REFERENCES FILE_STORAGE_BASE(ID) ON DELETE CASCADE +) +CHARACTER SET utf8mb4 +COLLATE utf8mb4_bin; + +CREATE VIEW FILE_STORAGE_VIEW +AS SELECT + CONCAT(BASE.PATH, '/', STORAGE.PATH) AS PATH, + STORAGE.TYPE AS TYPE, + DESCRIPTION AS DESCRIPTION, + CREATE_DATE AS CREATE_DATE, + STORAGE.ID AS ID, + BASE.ID AS BASE_ID +FROM FILE_STORAGE STORAGE +INNER JOIN FILE_STORAGE_BASE BASE ON STORAGE.BASE_ID=BASE.ID; diff --git a/code/common/db/src/test/java/nu/marginalia/db/storage/FileStorageServiceTest.java b/code/common/db/src/test/java/nu/marginalia/db/storage/FileStorageServiceTest.java new file mode 100644 index 00000000..cfd1df26 --- /dev/null +++ b/code/common/db/src/test/java/nu/marginalia/db/storage/FileStorageServiceTest.java @@ -0,0 +1,155 @@ +package nu.marginalia.db.storage; + +import com.google.common.collect.Lists; +import com.zaxxer.hikari.HikariConfig; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.db.storage.model.FileStorageBaseType; +import nu.marginalia.db.storage.model.FileStorageType; +import org.junit.jupiter.api.*; +import org.junit.jupiter.api.parallel.Execution; +import org.testcontainers.containers.MariaDBContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + 
+import java.io.FileNotFoundException; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.List; +import java.util.UUID; + +import static org.junit.Assert.*; +import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD; + +@Testcontainers +@Execution(SAME_THREAD) +@Tag("slow") +public class FileStorageServiceTest { + @Container + static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") + .withDatabaseName("WMSA_prod") + .withUsername("wmsa") + .withPassword("wmsa") + .withInitScript("sql/current/13-file-storage.sql") + .withNetworkAliases("mariadb"); + + static HikariDataSource dataSource; + static FileStorageService fileStorageService; + + static List tempDirs = new ArrayList<>(); + + @BeforeAll + public static void setup() { + HikariConfig config = new HikariConfig(); + config.setJdbcUrl(mariaDBContainer.getJdbcUrl()); + config.setUsername("wmsa"); + config.setPassword("wmsa"); + + dataSource = new HikariDataSource(config); + } + + + @BeforeEach + public void setupEach() { + fileStorageService = new FileStorageService(dataSource); + } + + @AfterEach + public void tearDownEach() { + try (var conn = dataSource.getConnection(); + var stmt = conn.createStatement()) { + stmt.execute("DELETE FROM FILE_STORAGE"); + stmt.execute("DELETE FROM FILE_STORAGE_BASE"); + } catch (SQLException e) { + throw new RuntimeException(e); + } + } + + @AfterAll + public static void teardown() { + dataSource.close(); + + Lists.reverse(tempDirs).forEach(path -> { + try { + System.out.println("Deleting " + path); + Files.delete(path); + } catch (IOException e) { + e.printStackTrace(); + } + }); + } + + private Path createTempDir() { + try { + Path dir = Files.createTempDirectory("file-storage-test"); + tempDirs.add(dir); + return dir; + } catch (IOException e) { + throw new RuntimeException(e); + } + + } + + @Test + public void testCreateBase() 
throws SQLException, FileNotFoundException { + String name = "test-" + UUID.randomUUID(); + + var storage = new FileStorageService(dataSource); + var base = storage.createStorageBase(name, createTempDir(), FileStorageBaseType.SLOW, false, false); + + Assertions.assertEquals(name, base.name()); + Assertions.assertEquals(FileStorageBaseType.SLOW, base.type()); + Assertions.assertFalse(base.mustClean()); + Assertions.assertFalse(base.permitTemp()); + } + @Test + public void testAllocateTempInNonPermitted() throws SQLException, FileNotFoundException { + String name = "test-" + UUID.randomUUID(); + + var storage = new FileStorageService(dataSource); + + var base = storage.createStorageBase(name, createTempDir(), FileStorageBaseType.SLOW, false, false); + + try { + storage.allocateTemporaryStorage(base, FileStorageType.CRAWL_DATA, "xyz", "thisShouldFail"); + fail(); + } + catch (IllegalArgumentException ex) {} // ok + catch (Exception ex) { + ex.printStackTrace(); + fail(); + } + } + + @Test + public void testAllocatePermanentInNonPermitted() throws SQLException, IOException { + String name = "test-" + UUID.randomUUID(); + + var storage = new FileStorageService(dataSource); + + var base = storage.createStorageBase(name, createTempDir(), FileStorageBaseType.SLOW, false, false); + + var created = storage.allocatePermanentStorage(base, "xyz", FileStorageType.CRAWL_DATA, "thisShouldFail"); + tempDirs.add(created.asPath()); + + var actual = storage.getStorage(created.id()); + Assertions.assertEquals(created, actual); + } + + @Test + public void testAllocateTempInPermitted() throws IOException, SQLException { + String name = "test-" + UUID.randomUUID(); + + var storage = new FileStorageService(dataSource); + + var base = storage.createStorageBase(name, createTempDir(), FileStorageBaseType.SLOW, false, true); + var fileStorage = storage.allocateTemporaryStorage(base, FileStorageType.CRAWL_DATA, "xyz", "thisShouldSucceed"); + + 
Assertions.assertTrue(Files.exists(fileStorage.asPath())); + tempDirs.add(fileStorage.asPath()); + } + + +} \ No newline at end of file From 8b74e3aa0df7ac5bcfb71df596aa0b499d43b205 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 14 Jul 2023 17:08:10 +0200 Subject: [PATCH 052/157] (*) File Storage WIP --- .../src/main/java/nu/marginalia/WmsaHome.java | 30 --- .../db/storage/FileStorageService.java | 55 ++++- .../db/storage/model/FileStorageId.java | 6 +- .../db/storage/model/FileStorageType.java | 3 +- .../resources/sql/current/13-file-storage.sql | 2 +- .../mqsm/graph/AbstractStateGraph.java | 2 + .../converting-model/build.gradle | 1 + .../converting/mqapi/ConvertRequest.java | 10 + .../converting/mqapi/ConverterInboxNames.java | 6 + .../converting/mqapi/LoadRequest.java | 10 + .../crawling-model/build.gradle | 1 + .../crawling/mqapi/CrawlRequest.java | 9 + .../processes/converting-process/build.gradle | 1 + .../marginalia/converting/ConverterMain.java | 124 +++++++++-- .../converting/ConverterModule.java | 19 +- code/processes/loading-process/build.gradle | 2 + .../nu/marginalia/loading/LoaderMain.java | 108 ++++++--- .../nu/marginalia/loading/LoaderModule.java | 6 +- .../loader/LoaderIndexJournalWriter.java | 17 +- code/services-core/index-service/build.gradle | 1 + .../java/nu/marginalia/index/IndexModule.java | 16 +- .../index/IndexServicesFactory.java | 43 ++-- .../marginalia/index/IndexTablesModule.java | 10 - ...ndexQueryServiceIntegrationTestModule.java | 18 +- .../control-service/build.gradle | 2 + .../nu/marginalia/control/ControlService.java | 23 +- .../model/FileStorageBaseWithStorage.java | 9 + .../process/ReconvertAndLoadProcess.java | 208 +++++++++++++----- .../svc/ControlFileStorageService.java | 63 ++++++ .../control/svc/ProcessOutboxFactory.java | 27 +++ .../control/svc/ProcessService.java | 15 +- .../templates/control/partials/nav.hdb | 1 + .../resources/templates/control/storage.hdb | 52 +++++ .../tools/ExperimentRunnerMain.java | 2 
+- .../SentenceStatisticsExperiment.java | 4 - run/setup.sh | 2 +- 36 files changed, 678 insertions(+), 230 deletions(-) create mode 100644 code/process-models/converting-model/src/main/java/nu/marginalia/converting/mqapi/ConvertRequest.java create mode 100644 code/process-models/converting-model/src/main/java/nu/marginalia/converting/mqapi/ConverterInboxNames.java create mode 100644 code/process-models/converting-model/src/main/java/nu/marginalia/converting/mqapi/LoadRequest.java create mode 100644 code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/mqapi/CrawlRequest.java create mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageBaseWithStorage.java create mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlFileStorageService.java create mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessOutboxFactory.java create mode 100644 code/services-satellite/control-service/src/main/resources/templates/control/storage.hdb diff --git a/code/common/config/src/main/java/nu/marginalia/WmsaHome.java b/code/common/config/src/main/java/nu/marginalia/WmsaHome.java index ae09940e..d63be333 100644 --- a/code/common/config/src/main/java/nu/marginalia/WmsaHome.java +++ b/code/common/config/src/main/java/nu/marginalia/WmsaHome.java @@ -10,7 +10,6 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.Optional; -import java.util.Properties; import java.util.stream.Stream; public class WmsaHome { @@ -79,35 +78,6 @@ public class WmsaHome { return getHomePath().resolve("data").resolve("IP2LOCATION-LITE-DB1.CSV"); } - public static Path getDisk(String name) { - var pathStr = getDiskProperties().getProperty(name); - if (null == pathStr) { - throw new RuntimeException("Disk " + name + " was not configured"); - } - Path p = Path.of(pathStr); - if (!Files.isDirectory(p)) { - throw 
new RuntimeException("Disk " + name + " does not exist or is not a directory!"); - } - return p; - } - - public static Properties getDiskProperties() { - Path settingsFile = getHomePath().resolve("conf/disks.properties"); - - if (!Files.isRegularFile(settingsFile)) { - throw new RuntimeException("Could not find disk settings " + settingsFile); - } - - try (var is = Files.newInputStream(settingsFile)) { - var props = new Properties(); - props.load(is); - return props; - } - catch (IOException ex) { - throw new RuntimeException(ex); - } - } - public static LanguageModels getLanguageModels() { final Path home = getHomePath(); diff --git a/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageService.java b/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageService.java index 75fa5ccf..a954b6bb 100644 --- a/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageService.java +++ b/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageService.java @@ -111,8 +111,10 @@ public class FileStorageService { PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rwxr-xr-x")) ); + String relDir = base.asPath().relativize(tempDir).normalize().toString(); + try (var conn = dataSource.getConnection(); - var update = conn.prepareStatement(""" + var insert = conn.prepareStatement(""" INSERT INTO FILE_STORAGE(PATH, TYPE, DESCRIPTION, BASE_ID) VALUES (?, ?, ?, ?) """); @@ -120,15 +122,17 @@ public class FileStorageService { SELECT ID FROM FILE_STORAGE WHERE PATH = ? AND BASE_ID = ? 
""") ) { - update.setString(1, tempDir.toString()); - update.setString(2, type.name()); - update.setString(3, description); - update.setLong(4, base.id().id()); + insert.setString(1, relDir); + insert.setString(2, type.name()); + insert.setString(3, description); + insert.setLong(4, base.id().id()); - if (update.executeUpdate() < 1) + if (insert.executeUpdate() < 1) { throw new SQLException("Failed to insert storage"); + } - query.setString(1, tempDir.toString()); + + query.setString(1, relDir); query.setLong(2, base.id().id()); var rs = query.executeQuery(); @@ -196,6 +200,43 @@ public class FileStorageService { throw new SQLException("Failed to insert storage"); } + public FileStorage getStorageByType(FileStorageType type) throws SQLException { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT PATH, DESCRIPTION, ID, BASE_ID + FROM FILE_STORAGE_VIEW WHERE TYPE = ? + """)) { + stmt.setString(1, type.name()); + + long storageId; + long baseId; + String path; + String description; + + try (var rs = stmt.executeQuery()) { + if (rs.next()) { + baseId = rs.getLong("BASE_ID"); + storageId = rs.getLong("ID"); + path = rs.getString("PATH"); + description = rs.getString("DESCRIPTION"); + } + else { + return null; + } + + var base = getStorageBase(new FileStorageBaseId(baseId)); + + return new FileStorage( + new FileStorageId(storageId), + base, + type, + path, + description + ); + } + } + } + /** @return the storage with the given id, or null if it does not exist */ public FileStorage getStorage(FileStorageId id) throws SQLException { diff --git a/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageId.java b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageId.java index da8849ff..43e5503f 100644 --- a/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageId.java +++ b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageId.java @@ -1,3 +1,7 @@ package 
nu.marginalia.db.storage.model; -public record FileStorageId(long id) {} +public record FileStorageId(long id) { + public static FileStorageId of(int storageId) { + return new FileStorageId(storageId); + } +} diff --git a/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageType.java b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageType.java index 04d5cc81..390262ec 100644 --- a/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageType.java +++ b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageType.java @@ -7,5 +7,6 @@ public enum FileStorageType { INDEX_STAGING, LEXICON_STAGING, INDEX_LIVE, - LEXICON_LIVE + LEXICON_LIVE, + SEARCH_SETS } diff --git a/code/common/db/src/main/resources/sql/current/13-file-storage.sql b/code/common/db/src/main/resources/sql/current/13-file-storage.sql index c09b140b..af111186 100644 --- a/code/common/db/src/main/resources/sql/current/13-file-storage.sql +++ b/code/common/db/src/main/resources/sql/current/13-file-storage.sql @@ -14,7 +14,7 @@ CREATE TABLE IF NOT EXISTS FILE_STORAGE ( BASE_ID BIGINT NOT NULL, PATH VARCHAR(255) NOT NULL COMMENT 'The path to the storage relative to the base', DESCRIPTION VARCHAR(255) NOT NULL, - TYPE ENUM ('CRAWL_SPEC', 'CRAWL_DATA', 'PROCESSED_DATA', 'INDEX_STAGING', 'LEXICON_STAGING', 'INDEX_LIVE', 'LEXICON_LIVE') NOT NULL, + TYPE ENUM ('CRAWL_SPEC', 'CRAWL_DATA', 'PROCESSED_DATA', 'INDEX_STAGING', 'LEXICON_STAGING', 'INDEX_LIVE', 'LEXICON_LIVE', 'SEARCH_SETS') NOT NULL, DO_PURGE BOOLEAN NOT NULL DEFAULT FALSE COMMENT 'If true, the storage may be cleaned', CREATE_DATE TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6), CONSTRAINT CONS UNIQUE (BASE_ID, PATH), diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/AbstractStateGraph.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/AbstractStateGraph.java index 10aca984..2890910f 100644 --- 
a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/AbstractStateGraph.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/AbstractStateGraph.java @@ -29,9 +29,11 @@ public abstract class AbstractStateGraph { public void error() { throw new ControlFlowException("ERROR", ""); } + public void error(T payload) { throw new ControlFlowException("ERROR", payload); } + public void error(Exception ex) { throw new ControlFlowException("ERROR", ex.getClass().getSimpleName() + ":" + ex.getMessage()); } diff --git a/code/process-models/converting-model/build.gradle b/code/process-models/converting-model/build.gradle index 1c2ef076..ae48aa32 100644 --- a/code/process-models/converting-model/build.gradle +++ b/code/process-models/converting-model/build.gradle @@ -14,6 +14,7 @@ dependencies { implementation project(':third-party:monkey-patch-gson') + implementation project(':code:common:db') implementation project(':code:common:model') implementation project(':code:api:index-api') implementation project(':code:common:service-discovery') diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/mqapi/ConvertRequest.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/mqapi/ConvertRequest.java new file mode 100644 index 00000000..881d75a2 --- /dev/null +++ b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/mqapi/ConvertRequest.java @@ -0,0 +1,10 @@ +package nu.marginalia.converting.mqapi; + +import lombok.AllArgsConstructor; +import nu.marginalia.db.storage.model.FileStorageId; + +@AllArgsConstructor +public class ConvertRequest { + public final FileStorageId crawlStorage; + public final FileStorageId processedDataStorage; +} diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/mqapi/ConverterInboxNames.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/mqapi/ConverterInboxNames.java new 
file mode 100644 index 00000000..5ce3ebff --- /dev/null +++ b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/mqapi/ConverterInboxNames.java @@ -0,0 +1,6 @@ +package nu.marginalia.converting.mqapi; + +public class ConverterInboxNames { + public static final String CONVERTER_INBOX = "converter"; + public static final String LOADER_INBOX = "loader"; +} diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/mqapi/LoadRequest.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/mqapi/LoadRequest.java new file mode 100644 index 00000000..186f0f7e --- /dev/null +++ b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/mqapi/LoadRequest.java @@ -0,0 +1,10 @@ +package nu.marginalia.converting.mqapi; + +import lombok.AllArgsConstructor; +import nu.marginalia.db.storage.model.FileStorageId; + +@AllArgsConstructor +public class LoadRequest { + public FileStorageId processedDataStorage; + +} diff --git a/code/process-models/crawling-model/build.gradle b/code/process-models/crawling-model/build.gradle index 7a5e5fab..6f8d26e5 100644 --- a/code/process-models/crawling-model/build.gradle +++ b/code/process-models/crawling-model/build.gradle @@ -14,6 +14,7 @@ java { dependencies { implementation project(':code:common:model') + implementation project(':code:common:db') implementation project(':code:common:process') implementation project(':code:libraries:big-string') implementation project(':code:api:index-api') diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/mqapi/CrawlRequest.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/mqapi/CrawlRequest.java new file mode 100644 index 00000000..53f387d5 --- /dev/null +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/mqapi/CrawlRequest.java @@ -0,0 +1,9 @@ +package nu.marginalia.crawling.mqapi; + +import 
nu.marginalia.db.storage.model.FileStorageId; + +/** A request to start a crawl */ +public class CrawlRequest { + FileStorageId specStorage; + FileStorageId crawlStorage; +} diff --git a/code/processes/converting-process/build.gradle b/code/processes/converting-process/build.gradle index 4cc4c63b..7a9121d8 100644 --- a/code/processes/converting-process/build.gradle +++ b/code/processes/converting-process/build.gradle @@ -32,6 +32,7 @@ dependencies { implementation project(':code:common:db') implementation project(':code:common:service') implementation project(':code:common:config') + implementation project(':code:common:message-queue') implementation project(':code:common:service-discovery') implementation project(':code:common:service-client') diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java index 16381cc2..b07060cb 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java @@ -4,10 +4,14 @@ import com.google.gson.Gson; import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; +import nu.marginalia.db.storage.FileStorageService; +import nu.marginalia.mq.MessageQueueFactory; +import nu.marginalia.mq.MqMessage; +import nu.marginalia.mq.inbox.MqInboxResponse; +import nu.marginalia.mq.inbox.MqSingleShotInbox; import nu.marginalia.process.control.ProcessHeartbeat; import nu.marginalia.process.log.WorkLog; import nu.marginalia.service.module.DatabaseModule; -import plan.CrawlPlanLoader; import plan.CrawlPlan; import nu.marginalia.converting.compiler.InstructionsCompiler; import nu.marginalia.converting.instruction.Instruction; @@ -18,47 +22,78 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; -import 
java.nio.file.Path; import java.util.List; +import java.util.UUID; +import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; +import static nu.marginalia.converting.mqapi.ConverterInboxNames.CONVERTER_INBOX; + public class ConverterMain { - private final Logger logger = LoggerFactory.getLogger(getClass()); - private final InstructionWriter instructionWriter; + private static final Logger logger = LoggerFactory.getLogger(ConverterMain.class); + private final DomainProcessor processor; + private final InstructionsCompiler compiler; + private final Gson gson; + private final ProcessHeartbeat heartbeat; + private final MessageQueueFactory messageQueueFactory; + private final FileStorageService fileStorageService; - public static void main(String... args) throws IOException { + public static void main(String... args) throws Exception { - if (args.length != 1) { - System.err.println("Arguments: crawl-plan.yaml"); - System.exit(0); - } - var plan = new CrawlPlanLoader().load(Path.of(args[0])); Injector injector = Guice.createInjector( - new ConverterModule(plan), + new ConverterModule(), new DatabaseModule() ); - injector.getInstance(ConverterMain.class); + var converter = injector.getInstance(ConverterMain.class); + + logger.info("Starting pipe"); + + var request = converter.fetchInstructions(); + try { + converter.load(request); + request.ok(); + } + catch (Exception ex) { + logger.error("Conversion failed", ex); + request.err(); + } + + logger.info("Finished"); + + System.exit(0); } @Inject public ConverterMain( - CrawlPlan plan, DomainProcessor processor, InstructionsCompiler compiler, Gson gson, - ProcessHeartbeat heartbeat - ) throws Exception { + ProcessHeartbeat heartbeat, + MessageQueueFactory messageQueueFactory, + FileStorageService fileStorageService + ) { + this.processor = processor; + this.compiler = compiler; + this.gson = gson; + this.heartbeat = heartbeat; + this.messageQueueFactory = messageQueueFactory; + 
this.fileStorageService = fileStorageService; heartbeat.start(); + } - logger.info("Starting pipe"); + + + public void load(ConvertRequest request) throws Exception { + + var plan = request.getPlan(); try (WorkLog processLog = plan.createProcessWorkLog(); ConversionLog log = new ConversionLog(plan.process.getDir())) { - instructionWriter = new InstructionWriter(log, plan.process.getDir(), gson); + var instructionWriter = new InstructionWriter(log, plan.process.getDir(), gson); int totalDomains = plan.countCrawledDomains(); AtomicInteger processedDomains = new AtomicInteger(0); @@ -103,13 +138,64 @@ public class ConverterMain { } pipe.join(); + request.ok(); + } + catch (Exception e) { + request.err(); + throw e; + } + } + + private static class ConvertRequest { + private final CrawlPlan plan; + private final MqMessage message; + private final MqSingleShotInbox inbox; + + ConvertRequest(CrawlPlan plan, MqMessage message, MqSingleShotInbox inbox) { + this.plan = plan; + this.message = message; + this.inbox = inbox; } - logger.info("Finished"); + public CrawlPlan getPlan() { + return plan; + } + + public void ok() { + inbox.sendResponse(message, MqInboxResponse.ok()); + } + public void err() { + inbox.sendResponse(message, MqInboxResponse.err()); + } - System.exit(0); } + private ConvertRequest fetchInstructions() throws Exception { + + var inbox = messageQueueFactory.createSingleShotInbox(CONVERTER_INBOX, UUID.randomUUID()); + + var msgOpt = inbox.waitForMessage(30, TimeUnit.SECONDS); + if (msgOpt.isEmpty()) + throw new RuntimeException("No instruction received in inbox"); + var msg = msgOpt.get(); + + if (!nu.marginalia.converting.mqapi.ConvertRequest.class.getSimpleName().equals(msg.function())) { + throw new RuntimeException("Unexpected message in inbox: " + msg); + } + + var request = gson.fromJson(msg.payload(), nu.marginalia.converting.mqapi.ConvertRequest.class); + + var crawlData = fileStorageService.getStorage(request.crawlStorage); + var processData = 
fileStorageService.getStorage(request.processedDataStorage); + + var plan = new CrawlPlan(null, + new CrawlPlan.WorkDir(crawlData.path(), "crawler.log"), + new CrawlPlan.WorkDir(processData.path(), "processor.log")); + + return new ConvertRequest(plan, msg, inbox); + } + + record ProcessingInstructions(String id, List instructions) {} } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterModule.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterModule.java index 814c32ec..121159ed 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterModule.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterModule.java @@ -2,26 +2,33 @@ package nu.marginalia.converting; import com.google.gson.Gson; import com.google.inject.AbstractModule; +import com.google.inject.Provides; +import com.google.inject.Singleton; import com.google.inject.name.Names; +import lombok.SneakyThrows; import nu.marginalia.LanguageModels; import nu.marginalia.ProcessConfiguration; import nu.marginalia.WmsaHome; +import nu.marginalia.converting.mqapi.ConvertRequest; +import nu.marginalia.db.storage.FileStorageService; +import nu.marginalia.mq.MessageQueueFactory; +import nu.marginalia.mq.persistence.MqPersistence; import plan.CrawlPlan; import nu.marginalia.model.gson.GsonFactory; +import plan.CrawlPlanLoader; +import java.io.IOException; +import java.nio.file.Path; +import java.sql.SQLException; import java.util.UUID; +import java.util.concurrent.TimeUnit; public class ConverterModule extends AbstractModule { - private final CrawlPlan plan; - - public ConverterModule(CrawlPlan plan) { - this.plan = plan; + public ConverterModule() { } public void configure() { - bind(CrawlPlan.class).toInstance(plan); - bind(Gson.class).toInstance(createGson()); bind(ProcessConfiguration.class).toInstance(new ProcessConfiguration("converter", 0, 
UUID.randomUUID())); diff --git a/code/processes/loading-process/build.gradle b/code/processes/loading-process/build.gradle index 47ec6f59..caba9812 100644 --- a/code/processes/loading-process/build.gradle +++ b/code/processes/loading-process/build.gradle @@ -22,6 +22,8 @@ dependencies { implementation project(':code:api:index-api') implementation project(':code:common:model') + implementation project(':code:common:db') + implementation project(':code:common:message-queue') implementation project(':code:common:config') implementation project(':code:common:service') implementation project(':code:common:service-discovery') diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java index 30b84527..7250889d 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java @@ -1,15 +1,20 @@ package nu.marginalia.loading; +import com.google.gson.Gson; import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; +import nu.marginalia.db.storage.FileStorageService; +import nu.marginalia.db.storage.model.FileStorage; +import nu.marginalia.mq.MessageQueueFactory; +import nu.marginalia.mq.MqMessage; +import nu.marginalia.mq.inbox.MqInboxResponse; +import nu.marginalia.mq.inbox.MqSingleShotInbox; import nu.marginalia.process.control.ProcessHeartbeat; import nu.marginalia.process.log.WorkLog; -import plan.CrawlPlanLoader; import plan.CrawlPlan; -import nu.marginalia.loading.loader.IndexLoadKeywords; import nu.marginalia.loading.loader.Loader; import nu.marginalia.loading.loader.LoaderFactory; import nu.marginalia.converting.instruction.Instruction; @@ -17,66 +22,63 @@ import nu.marginalia.service.module.DatabaseModule; import 
org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; import java.nio.file.Path; import java.sql.SQLException; import java.util.List; +import java.util.UUID; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicInteger; + +import static nu.marginalia.converting.mqapi.ConverterInboxNames.LOADER_INBOX; public class LoaderMain { private static final Logger logger = LoggerFactory.getLogger(LoaderMain.class); - private final CrawlPlan plan; private final ConvertedDomainReader instructionsReader; private final LoaderFactory loaderFactory; - private final IndexLoadKeywords indexLoadKeywords; private final ProcessHeartbeat heartbeat; + private final MessageQueueFactory messageQueueFactory; + private final FileStorageService fileStorageService; + private final Gson gson; private volatile boolean running = true; final Thread processorThread; - public static void main(String... args) throws IOException { - if (args.length != 1) { - System.err.println("Arguments: crawl-plan.yaml"); - System.exit(0); - } - + public static void main(String... 
args) throws Exception { new org.mariadb.jdbc.Driver(); - var plan = new CrawlPlanLoader().load(Path.of(args[0])); - Injector injector = Guice.createInjector( - new LoaderModule(plan), + new LoaderModule(), new DatabaseModule() ); var instance = injector.getInstance(LoaderMain.class); - instance.run(); + var instructions = instance.fetchInstructions(); + instance.run(instructions); } @Inject - public LoaderMain(CrawlPlan plan, - ConvertedDomainReader instructionsReader, + public LoaderMain(ConvertedDomainReader instructionsReader, HikariDataSource dataSource, LoaderFactory loaderFactory, - IndexLoadKeywords indexLoadKeywords, - ProcessHeartbeat heartbeat + ProcessHeartbeat heartbeat, + MessageQueueFactory messageQueueFactory, + FileStorageService fileStorageService, + Gson gson ) { - this.plan = plan; this.instructionsReader = instructionsReader; this.loaderFactory = loaderFactory; - this.indexLoadKeywords = indexLoadKeywords; this.heartbeat = heartbeat; + this.messageQueueFactory = messageQueueFactory; + this.fileStorageService = fileStorageService; + this.gson = gson; heartbeat.start(); nukeTables(dataSource); - Runtime.getRuntime().addShutdownHook(new Thread(this::shutDownIndex)); processorThread = new Thread(this::processor, "Processor Thread"); processorThread.start(); } @@ -97,13 +99,8 @@ public class LoaderMain { } @SneakyThrows - private void shutDownIndex() { - // This must run otherwise the journal doesn't get a proper header - indexLoadKeywords.close(); - } - - @SneakyThrows - public void run() { + public void run(LoadRequest instructions) { + var plan = instructions.getPlan(); var logFile = plan.process.getLogFile(); try { @@ -124,6 +121,12 @@ public class LoaderMain { running = false; processorThread.join(); + instructions.ok(); + } + catch (Exception ex) { + logger.error("Failed to load", ex); + instructions.err(); + throw ex; } finally { heartbeat.shutDown(); @@ -183,5 +186,50 @@ public class LoaderMain { } } + private static class LoadRequest { + 
private final CrawlPlan plan; + private final MqMessage message; + private final MqSingleShotInbox inbox; + + LoadRequest(CrawlPlan plan, MqMessage message, MqSingleShotInbox inbox) { + this.plan = plan; + this.message = message; + this.inbox = inbox; + } + + public CrawlPlan getPlan() { + return plan; + } + + public void ok() { + inbox.sendResponse(message, MqInboxResponse.ok()); + } + public void err() { + inbox.sendResponse(message, MqInboxResponse.err()); + } + + } + + private LoadRequest fetchInstructions() throws Exception { + + var inbox = messageQueueFactory.createSingleShotInbox(LOADER_INBOX, UUID.randomUUID()); + + var msgOpt = inbox.waitForMessage(30, TimeUnit.SECONDS); + if (msgOpt.isEmpty()) + throw new RuntimeException("No instruction received in inbox"); + var msg = msgOpt.get(); + + if (!nu.marginalia.converting.mqapi.LoadRequest.class.getSimpleName().equals(msg.function())) { + throw new RuntimeException("Unexpected message in inbox: " + msg); + } + + var request = gson.fromJson(msg.payload(), nu.marginalia.converting.mqapi.LoadRequest.class); + + var processData = fileStorageService.getStorage(request.processedDataStorage); + + var plan = new CrawlPlan(null, null, new CrawlPlan.WorkDir(processData.path(), "processor.log")); + + return new LoadRequest(plan, msg, inbox); + } } diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderModule.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderModule.java index 338e722f..a2df0ea9 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderModule.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderModule.java @@ -16,15 +16,11 @@ import java.util.UUID; public class LoaderModule extends AbstractModule { - private final CrawlPlan plan; - public LoaderModule(CrawlPlan plan) { - this.plan = plan; + public LoaderModule() { } public void configure() { - bind(CrawlPlan.class).toInstance(plan); - 
bind(ServiceDescriptors.class).toInstance(SearchServiceDescriptors.descriptors); bind(ProcessConfiguration.class).toInstance(new ProcessConfiguration("loader", 0, UUID.randomUUID())); diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java index 49cbd402..35f8e79f 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java @@ -2,7 +2,8 @@ package nu.marginalia.loading.loader; import com.google.inject.Inject; import com.google.inject.Singleton; -import com.google.inject.name.Named; +import nu.marginalia.db.storage.FileStorageService; +import nu.marginalia.db.storage.model.FileStorageType; import nu.marginalia.dict.OffHeapDictionaryHashMap; import nu.marginalia.index.journal.model.IndexJournalEntryData; import nu.marginalia.index.journal.model.IndexJournalEntryHeader; @@ -19,7 +20,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; -import java.nio.file.Path; +import java.sql.SQLException; import java.util.Arrays; @Singleton @@ -30,11 +31,15 @@ public class LoaderIndexJournalWriter { private static final Logger logger = LoggerFactory.getLogger(LoaderIndexJournalWriter.class); @Inject - public LoaderIndexJournalWriter(@Named("local-index-path") Path path) throws IOException { + public LoaderIndexJournalWriter(FileStorageService fileStorageService) throws IOException, SQLException { + var lexiconArea = fileStorageService.getStorageByType(FileStorageType.LEXICON_STAGING); + var indexArea = fileStorageService.getStorageByType(FileStorageType.INDEX_STAGING); - var lexiconJournal = new KeywordLexiconJournal(path.resolve("dictionary.dat").toFile()); - lexicon = new KeywordLexicon(lexiconJournal); - 
indexWriter = new IndexJournalWriterImpl(lexicon, path.resolve("index.dat")); + var lexiconPath = lexiconArea.asPath().resolve("dictionary.dat"); + var indexPath = indexArea.asPath().resolve("page-index.dat"); + + lexicon = new KeywordLexicon(new KeywordLexiconJournal(lexiconPath.toFile())); + indexWriter = new IndexJournalWriterImpl(lexicon, indexPath); } public void putWords(EdgeId domain, EdgeId url, diff --git a/code/services-core/index-service/build.gradle b/code/services-core/index-service/build.gradle index 103d736f..4801e722 100644 --- a/code/services-core/index-service/build.gradle +++ b/code/services-core/index-service/build.gradle @@ -23,6 +23,7 @@ java { dependencies { implementation project(':code:common:config') implementation project(':code:common:model') + implementation project(':code:common:db') implementation project(':code:common:service') implementation project(':code:api:index-api') implementation project(':code:common:service-discovery') diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexModule.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexModule.java index 80f8187a..a0bad25d 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexModule.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexModule.java @@ -2,7 +2,10 @@ package nu.marginalia.index; import com.google.inject.AbstractModule; import com.google.inject.Provides; +import com.google.inject.Singleton; import lombok.SneakyThrows; +import nu.marginalia.db.storage.FileStorageService; +import nu.marginalia.db.storage.model.FileStorageType; import nu.marginalia.index.config.RankingSettings; import nu.marginalia.WmsaHome; import nu.marginalia.lexicon.KeywordLexicon; @@ -21,16 +24,15 @@ public class IndexModule extends AbstractModule { @Provides @SneakyThrows - private KeywordLexiconReadOnlyView createLexicon(ServiceEventLog eventLog) { + @Singleton + private 
KeywordLexiconReadOnlyView createLexicon(ServiceEventLog eventLog, FileStorageService fileStorageService) { try { eventLog.logEvent("INDEX-LEXICON-LOAD-BEGIN", ""); - return new KeywordLexiconReadOnlyView( - new KeywordLexicon( - new KeywordLexiconJournal(WmsaHome.getDisk("index-write").resolve("dictionary.dat").toFile() - ) - ) - ); + var area = fileStorageService.getStorageByType(FileStorageType.LEXICON_LIVE); + var path = area.asPath().resolve("dictionary.dat"); + + return new KeywordLexiconReadOnlyView(new KeywordLexicon(new KeywordLexiconJournal(path.toFile()))); } finally { eventLog.logEvent("INDEX-LEXICON-LOAD-OK", ""); diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexServicesFactory.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexServicesFactory.java index ec43819a..11008677 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexServicesFactory.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexServicesFactory.java @@ -3,6 +3,8 @@ package nu.marginalia.index; import com.google.inject.Inject; import com.google.inject.Singleton; import com.google.inject.name.Named; +import nu.marginalia.db.storage.FileStorageService; +import nu.marginalia.db.storage.model.FileStorageType; import nu.marginalia.index.forward.ForwardIndexConverter; import nu.marginalia.index.forward.ForwardIndexReader; import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile; @@ -24,12 +26,15 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.StandardCopyOption; +import java.sql.SQLException; import java.util.concurrent.Callable; import java.util.stream.Stream; @Singleton public class IndexServicesFactory { private final Path tmpFileDir; + private final Path liveStorage; + private final Path stagingStorage; private final Logger logger = LoggerFactory.getLogger(getClass()); @@ -50,28 +55,28 
@@ public class IndexServicesFactory { @Inject public IndexServicesFactory( - @Named("tmp-file-dir") Path tmpFileDir, - @Named("partition-root-slow") Path partitionRootSlow, - @Named("partition-root-fast") Path partitionRootFast - ) throws IOException { + FileStorageService fileStorageService + ) throws IOException, SQLException { - this.tmpFileDir = tmpFileDir; + liveStorage = fileStorageService.getStorageByType(FileStorageType.INDEX_LIVE).asPath(); + stagingStorage = fileStorageService.getStorageByType(FileStorageType.INDEX_STAGING).asPath(); + tmpFileDir = fileStorageService.getStorageByType(FileStorageType.INDEX_STAGING).asPath().resolve("tmp"); + searchSetsBase = fileStorageService.getStorageByType(FileStorageType.SEARCH_SETS).asPath(); - this.writerIndexFile = new PartitionedDataFile(partitionRootSlow, "page-index.dat"); - - fwdIndexDocId = new PartitionedDataFile(partitionRootFast, "fwd-doc-id.dat"); - fwdIndexDocData = new PartitionedDataFile(partitionRootFast, "fwd-doc-data.dat"); - - revIndexDoc = new PartitionedDataFile(partitionRootFast, "rev-doc.dat"); - revIndexWords = new PartitionedDataFile(partitionRootFast, "rev-words.dat"); - - revPrioIndexDoc = new PartitionedDataFile(partitionRootFast, "rev-prio-doc.dat"); - revPrioIndexWords = new PartitionedDataFile(partitionRootFast, "rev-prio-words.dat"); - - searchSetsBase = partitionRootSlow.resolve("search-sets"); - if (!Files.isDirectory(searchSetsBase)) { - Files.createDirectory(searchSetsBase); + if (!Files.exists(tmpFileDir)) { + Files.createDirectories(tmpFileDir); } + + writerIndexFile = new PartitionedDataFile(stagingStorage, "page-index.dat"); + + fwdIndexDocId = new PartitionedDataFile(liveStorage, "fwd-doc-id.dat"); + fwdIndexDocData = new PartitionedDataFile(liveStorage, "fwd-doc-data.dat"); + + revIndexDoc = new PartitionedDataFile(liveStorage, "rev-doc.dat"); + revIndexWords = new PartitionedDataFile(liveStorage, "rev-words.dat"); + + revPrioIndexDoc = new PartitionedDataFile(liveStorage, 
"rev-prio-doc.dat"); + revPrioIndexWords = new PartitionedDataFile(liveStorage, "rev-prio-words.dat"); } public Path getSearchSetsBase() { diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexTablesModule.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexTablesModule.java index c26ca5e3..d4bf43c9 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexTablesModule.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexTablesModule.java @@ -1,20 +1,10 @@ package nu.marginalia.index; import com.google.inject.AbstractModule; -import com.google.inject.name.Names; -import nu.marginalia.WmsaHome; - -import java.nio.file.Path; public class IndexTablesModule extends AbstractModule { public void configure() { - bind(Path.class).annotatedWith(Names.named("partition-root-slow")).toInstance(WmsaHome.getDisk("index-write")); - bind(Path.class).annotatedWith(Names.named("partition-root-fast")).toInstance(WmsaHome.getDisk("index-read")); - - bind(Path.class).annotatedWith(Names.named("partition-root-slow-tmp")).toInstance(WmsaHome.getDisk("tmp-slow")); - bind(Path.class).annotatedWith(Names.named("tmp-file-dir")).toInstance(WmsaHome.getDisk("tmp-fast")); - } } diff --git a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTestModule.java b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTestModule.java index 2b573c92..77ea0a2e 100644 --- a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTestModule.java +++ b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTestModule.java @@ -1,6 +1,9 @@ package nu.marginalia.index.svc; import com.google.inject.AbstractModule; +import nu.marginalia.db.storage.FileStorageService; +import nu.marginalia.db.storage.model.FileStorage; 
+import nu.marginalia.db.storage.model.FileStorageType; import nu.marginalia.index.IndexServicesFactory; import nu.marginalia.index.journal.writer.IndexJournalWriter; import nu.marginalia.lexicon.KeywordLexicon; @@ -19,6 +22,7 @@ import org.mockito.Mockito; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; +import java.sql.SQLException; import java.util.Random; import java.util.UUID; @@ -48,8 +52,16 @@ public class IndexQueryServiceIntegrationTestModule extends AbstractModule { protected void configure() { try { - var servicesFactory = new IndexServicesFactory(Path.of("/tmp"), - slowDir, fastDir + var fileStorageServiceMock = Mockito.mock(FileStorageService.class); + + when(fileStorageServiceMock.getStorageByType(FileStorageType.SEARCH_SETS)).thenReturn(new FileStorage(null, null, null, fastDir.toString(), null)); + when(fileStorageServiceMock.getStorageByType(FileStorageType.LEXICON_LIVE)).thenReturn(new FileStorage(null, null, null, fastDir.toString(), null)); + when(fileStorageServiceMock.getStorageByType(FileStorageType.LEXICON_STAGING)).thenReturn(new FileStorage(null, null, null, fastDir.toString(), null)); + when(fileStorageServiceMock.getStorageByType(FileStorageType.INDEX_LIVE)).thenReturn(new FileStorage(null, null, null, fastDir.toString(), null)); + when(fileStorageServiceMock.getStorageByType(FileStorageType.INDEX_STAGING)).thenReturn(new FileStorage(null, null, null, slowDir.toString(), null)); + + var servicesFactory = new IndexServicesFactory( + fileStorageServiceMock ); bind(IndexServicesFactory.class).toInstance(servicesFactory); @@ -76,7 +88,7 @@ public class IndexQueryServiceIntegrationTestModule extends AbstractModule { UUID.randomUUID() )); - } catch (IOException e) { + } catch (IOException | SQLException e) { throw new RuntimeException(e); } diff --git a/code/services-satellite/control-service/build.gradle b/code/services-satellite/control-service/build.gradle index fac386e2..d90d926a 100644 --- 
a/code/services-satellite/control-service/build.gradle +++ b/code/services-satellite/control-service/build.gradle @@ -30,6 +30,8 @@ dependencies { implementation project(':code:common:message-queue') implementation project(':code:common:service-discovery') implementation project(':code:common:service-client') + implementation project(':code:process-models:converting-model') + implementation project(':code:process-models:crawling-model') implementation project(':code:api:search-api') implementation project(':code:api:index-api') diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java index 88f51186..d224159c 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java @@ -6,6 +6,7 @@ import nu.marginalia.client.ServiceMonitors; import nu.marginalia.control.model.ControlProcess; import nu.marginalia.control.process.ControlProcesses; import nu.marginalia.control.svc.*; +import nu.marginalia.db.storage.model.FileStorageId; import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.mq.persistence.MqPersistence; import nu.marginalia.renderer.MustacheRenderer; @@ -32,12 +33,8 @@ public class ControlService extends Service { private final MustacheRenderer indexRenderer; private final MustacheRenderer> servicesRenderer; private final MustacheRenderer> processesRenderer; - private final MustacheRenderer> eventsRenderer; - private final MustacheRenderer> messageQueueRenderer; - private final MustacheRenderer> fsmStateRenderer; - private final MqPersistence messageQueuePersistence; + private final MustacheRenderer> storageRenderer; private final StaticResources staticResources; - private final MessageQueueMonitorService messageQueueMonitorService; @Inject @@ -46,12 +43,10 @@ 
public class ControlService extends Service { HeartbeatService heartbeatService, EventLogService eventLogService, RendererFactory rendererFactory, - MqPersistence messageQueuePersistence, ControlProcesses controlProcesses, StaticResources staticResources, MessageQueueViewService messageQueueViewService, - MessageQueueMonitorService messageQueueMonitorService, - ProcessService processService + ControlFileStorageService controlFileStorageService ) throws IOException { super(params); @@ -60,13 +55,9 @@ public class ControlService extends Service { indexRenderer = rendererFactory.renderer("control/index"); servicesRenderer = rendererFactory.renderer("control/services"); processesRenderer = rendererFactory.renderer("control/processes"); - eventsRenderer = rendererFactory.renderer("control/events"); - messageQueueRenderer = rendererFactory.renderer("control/message-queue"); - fsmStateRenderer = rendererFactory.renderer("control/fsm-states"); + storageRenderer = rendererFactory.renderer("control/storage"); - this.messageQueuePersistence = messageQueuePersistence; this.staticResources = staticResources; - this.messageQueueMonitorService = messageQueueMonitorService; Spark.get("/public/heartbeats", (req, res) -> { res.type("application/json"); @@ -86,6 +77,10 @@ public class ControlService extends Service { "messages", messageQueueViewService.getLastEntries(20)), (map) -> processesRenderer.render((Map) map)); + Spark.get("/public/storage", + (req, rsp) -> Map.of("storage", controlFileStorageService.getStorageList()), + (map) -> storageRenderer.render((Map) map)); + Spark.post("/public/fsms/:fsm/start", (req, rsp) -> { controlProcesses.start(ControlProcess.valueOf(req.params("fsm").toUpperCase())); rsp.redirect("/processes"); @@ -105,7 +100,7 @@ public class ControlService extends Service { // TODO: This should be a POST Spark.get("/public/reconvert", (req, rsp) -> { - controlProcesses.start(ControlProcess.RECONVERT_LOAD, "/samples/crawl-blogs/plan.yaml"); + 
controlProcesses.start(ControlProcess.RECONVERT_LOAD, FileStorageId.of(11)); return "OK"; }); diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageBaseWithStorage.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageBaseWithStorage.java new file mode 100644 index 00000000..94a39e2b --- /dev/null +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageBaseWithStorage.java @@ -0,0 +1,9 @@ +package nu.marginalia.control.model; + +import nu.marginalia.db.storage.model.FileStorage; +import nu.marginalia.db.storage.model.FileStorageBase; + +import java.util.List; + +public record FileStorageBaseWithStorage(FileStorageBase base, List storage) { +} diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ReconvertAndLoadProcess.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ReconvertAndLoadProcess.java index f0abc5a4..b72876a9 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ReconvertAndLoadProcess.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ReconvertAndLoadProcess.java @@ -1,10 +1,22 @@ package nu.marginalia.control.process; +import com.google.gson.Gson; import com.google.inject.Inject; import com.google.inject.Singleton; +import lombok.AllArgsConstructor; +import lombok.NoArgsConstructor; +import lombok.With; +import nu.marginalia.control.svc.ProcessOutboxFactory; import nu.marginalia.control.svc.ProcessService; +import nu.marginalia.converting.mqapi.ConvertRequest; +import nu.marginalia.converting.mqapi.LoadRequest; +import nu.marginalia.db.storage.FileStorageService; +import nu.marginalia.db.storage.model.FileStorageBaseType; +import nu.marginalia.db.storage.model.FileStorageId; +import nu.marginalia.db.storage.model.FileStorageType; import 
nu.marginalia.index.client.IndexClient; import nu.marginalia.index.client.IndexMqEndpoints; +import nu.marginalia.mq.MqMessage; import nu.marginalia.mq.MqMessageState; import nu.marginalia.mq.outbox.MqOutbox; import nu.marginalia.mqsm.StateFactory; @@ -17,6 +29,9 @@ import nu.marginalia.search.client.SearchMqEndpoints; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.StandardCopyOption; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; @Singleton public class ReconvertAndLoadProcess extends AbstractStateGraph { @@ -25,7 +40,9 @@ public class ReconvertAndLoadProcess extends AbstractStateGraph { private static final String INITIAL = "INITIAL"; private static final String RECONVERT = "RECONVERT"; + private static final String RECONVERT_WAIT = "RECONVERT_WAIT"; private static final String LOAD = "LOAD"; + private static final String LOAD_WAIT = "LOAD_WAIT"; private static final String MOVE_INDEX_FILES = "MOVE_INDEX_FILES"; private static final String RELOAD_LEXICON = "RELOAD_LEXICON"; private static final String RELOAD_LEXICON_WAIT = "RELOAD_LEXICON_WAIT"; @@ -34,79 +51,160 @@ public class ReconvertAndLoadProcess extends AbstractStateGraph { private final ProcessService processService; private final MqOutbox mqIndexOutbox; private final MqOutbox mqSearchOutbox; + private final MqOutbox mqConverterOutbox; + private final MqOutbox mqLoaderOutbox; + private final FileStorageService storageService; + private final Gson gson; + @AllArgsConstructor @With @NoArgsConstructor + public static class Message { + public FileStorageId crawlStorageId = null; + public FileStorageId processedStorageId = null; + public long converterMsgId = 0L; + public long loaderMsgId = 0L; + }; + @Inject public ReconvertAndLoadProcess(StateFactory stateFactory, ProcessService processService, IndexClient indexClient, - SearchClient searchClient + ProcessOutboxFactory processOutboxFactory, + 
SearchClient searchClient, + FileStorageService storageService, + Gson gson ) { super(stateFactory); this.processService = processService; this.mqIndexOutbox = indexClient.outbox(); this.mqSearchOutbox = searchClient.outbox(); + this.mqConverterOutbox = processOutboxFactory.createConverterOutbox(); + this.mqLoaderOutbox = processOutboxFactory.createLoaderOutbox(); + this.storageService = storageService; + this.gson = gson; } @GraphState(name = INITIAL, next = RECONVERT) - public String init(String crawlJob) throws Exception { - Path path = Path.of(crawlJob); + public Message init(FileStorageId crawlStorageId) throws Exception { + var storage = storageService.getStorage(crawlStorageId); - if (!Files.exists(path)) { - error("Bad crawl job path"); + if (storage == null) error("Bad storage id"); + if (storage.type() != FileStorageType.CRAWL_DATA) error("Bad storage type " + storage.type()); + + return new Message().withCrawlStorageId(crawlStorageId); + } + + @GraphState(name = RECONVERT, next = RECONVERT_WAIT, resume = ResumeBehavior.ERROR) + public Message reconvert(Message message) throws Exception { + // Create processed data area + var base = storageService.getStorageBase(FileStorageBaseType.SLOW); + var processedArea = storageService.allocateTemporaryStorage(base, FileStorageType.PROCESSED_DATA, "processed-data", "Processed Data"); + + // Pre-send convert request + var request = new ConvertRequest(message.crawlStorageId, processedArea.id()); + long id = mqConverterOutbox.sendAsync(ConvertRequest.class.getSimpleName(), gson.toJson(request)); + + Executors.defaultThreadFactory().newThread(() -> { + try { + processService.trigger(ProcessService.ProcessId.CONVERTER); + } catch (Exception e) { + throw new RuntimeException(e); + } + }).start(); + + return message + .withProcessedStorageId(processedArea.id()) + .withConverterMsgId(id); + } + @GraphState(name = RECONVERT_WAIT, next = LOAD, resume = ResumeBehavior.RETRY) + public Message reconvertWait(Message message) 
throws Exception { + var rsp = waitResponse(mqConverterOutbox, ProcessService.ProcessId.CONVERTER, message.converterMsgId); + + if (rsp.state() != MqMessageState.OK) + error("Converter failed"); + + return message; + } + + + @GraphState(name = LOAD, next = LOAD_WAIT, resume = ResumeBehavior.ERROR) + public Message load(Message message) throws Exception { + + var request = new LoadRequest(message.processedStorageId); + long id = mqLoaderOutbox.sendAsync(LoadRequest.class.getSimpleName(), gson.toJson(request)); + + Executors.defaultThreadFactory().newThread(() -> { + try { + processService.trigger(ProcessService.ProcessId.LOADER); + } catch (Exception e) { + throw new RuntimeException(e); + } + }).start(); + + return message.withLoaderMsgId(id); + + } + + @GraphState(name = LOAD_WAIT, next = END, resume = ResumeBehavior.RETRY) + public void loadWait(Message message) throws Exception { + var rsp = waitResponse(mqLoaderOutbox, ProcessService.ProcessId.LOADER, message.loaderMsgId); + + if (rsp.state() != MqMessageState.OK) + error("Loader failed"); + } + + public MqMessage waitResponse(MqOutbox outbox, ProcessService.ProcessId processId, long id) throws Exception { + + for (;;) { + try { + return outbox.waitResponse(id, 1, TimeUnit.SECONDS); + } + catch (TimeoutException ex) { + if (!processService.isRunning(processId)) { + try { + return outbox.waitResponse(id, 10, TimeUnit.SECONDS); + } + catch (TimeoutException ex2) { + error("Process " + processId + " is not running"); + } + } + } } - Files.deleteIfExists(path.getParent().resolve("process/process.log")); - - return path.toString(); } - @GraphState(name = RECONVERT, next = LOAD, resume = ResumeBehavior.RETRY) - public String reconvert(String crawlJob) throws Exception { - if (!processService.trigger(ProcessService.ProcessId.CONVERTER, Path.of(crawlJob))) - error(); - - return crawlJob; - } - - @GraphState(name = LOAD, next = MOVE_INDEX_FILES, resume = ResumeBehavior.RETRY) - public void load(String crawlJob) throws 
Exception { - if (!processService.trigger(ProcessService.ProcessId.LOADER, Path.of(crawlJob))) - error(); - } - - @GraphState(name = MOVE_INDEX_FILES, next = RELOAD_LEXICON, resume = ResumeBehavior.ERROR) - public void moveIndexFiles(String crawlJob) throws Exception { - Path indexData = Path.of("/vol/index.dat"); - Path indexDest = Path.of("/vol/iw/0/page-index.dat"); - - if (!Files.exists(indexData)) - error("Index data not found"); - - Files.move(indexData, indexDest, StandardCopyOption.REPLACE_EXISTING); - } - - @GraphState(name = RELOAD_LEXICON, next = RELOAD_LEXICON_WAIT, resume = ResumeBehavior.ERROR) - public long reloadLexicon() throws Exception { - return mqIndexOutbox.sendAsync(IndexMqEndpoints.INDEX_RELOAD_LEXICON, ""); - } - - @GraphState(name = RELOAD_LEXICON_WAIT, next = FLUSH_CACHES, resume = ResumeBehavior.RETRY) - public void reloadLexiconWait(long id) throws Exception { - var rsp = mqIndexOutbox.waitResponse(id); - - if (rsp.state() != MqMessageState.OK) { - error("RELOAD_LEXICON failed"); - } - } - - @GraphState(name = FLUSH_CACHES, next = END, resume = ResumeBehavior.RETRY) - public void flushCaches() throws Exception { - var rsp = mqSearchOutbox.send(SearchMqEndpoints.FLUSH_CACHES, ""); - - if (rsp.state() != MqMessageState.OK) { - error("FLUSH_CACHES failed"); - } - } +// @GraphState(name = MOVE_INDEX_FILES, next = RELOAD_LEXICON, resume = ResumeBehavior.ERROR) +// public void moveIndexFiles(String crawlJob) throws Exception { +// Path indexData = Path.of("/vol/index.dat"); +// Path indexDest = Path.of("/vol/iw/0/page-index.dat"); +// +// if (!Files.exists(indexData)) +// error("Index data not found"); +// +// Files.move(indexData, indexDest, StandardCopyOption.REPLACE_EXISTING); +// } +// +// @GraphState(name = RELOAD_LEXICON, next = RELOAD_LEXICON_WAIT, resume = ResumeBehavior.ERROR) +// public long reloadLexicon() throws Exception { +// return mqIndexOutbox.sendAsync(IndexMqEndpoints.INDEX_RELOAD_LEXICON, ""); +// } +// +// 
@GraphState(name = RELOAD_LEXICON_WAIT, next = FLUSH_CACHES, resume = ResumeBehavior.RETRY) +// public void reloadLexiconWait(long id) throws Exception { +// var rsp = mqIndexOutbox.waitResponse(id); +// +// if (rsp.state() != MqMessageState.OK) { +// error("RELOAD_LEXICON failed"); +// } +// } +// +// @GraphState(name = FLUSH_CACHES, next = END, resume = ResumeBehavior.RETRY) +// public void flushCaches() throws Exception { +// var rsp = mqSearchOutbox.send(SearchMqEndpoints.FLUSH_CACHES, ""); +// +// if (rsp.state() != MqMessageState.OK) { +// error("FLUSH_CACHES failed"); +// } +// } } diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlFileStorageService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlFileStorageService.java new file mode 100644 index 00000000..04dc34ae --- /dev/null +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlFileStorageService.java @@ -0,0 +1,63 @@ +package nu.marginalia.control.svc; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import com.zaxxer.hikari.HikariDataSource; +import lombok.SneakyThrows; +import nu.marginalia.control.model.FileStorageBaseWithStorage; +import nu.marginalia.control.model.ProcessHeartbeat; +import nu.marginalia.control.model.ServiceHeartbeat; +import nu.marginalia.db.storage.FileStorageService; +import nu.marginalia.db.storage.model.FileStorage; +import nu.marginalia.db.storage.model.FileStorageBase; +import nu.marginalia.db.storage.model.FileStorageBaseId; +import nu.marginalia.db.storage.model.FileStorageId; + +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +@Singleton +public class ControlFileStorageService { + private final HikariDataSource dataSource; + private final FileStorageService fileStorageService; + + @Inject + public 
ControlFileStorageService(HikariDataSource dataSource, FileStorageService fileStorageService) { + this.dataSource = dataSource; + this.fileStorageService = fileStorageService; + } + + @SneakyThrows + public List getStorageList() { + Map fileStorageBaseByBaseId = new HashMap<>(); + Map> fileStoragByBaseId = new HashMap<>(); + + List storageIds = new ArrayList<>(); + + try (var conn = dataSource.getConnection(); + var storageByIdStmt = conn.prepareStatement("SELECT ID FROM FILE_STORAGE")) { + var rs = storageByIdStmt.executeQuery(); + while (rs.next()) { + storageIds.add(new FileStorageId(rs.getLong("ID"))); + } + } + + for (var id : storageIds) { + var storage = fileStorageService.getStorage(id); + fileStorageBaseByBaseId.computeIfAbsent(storage.base().id(), k -> storage.base()); + fileStoragByBaseId.computeIfAbsent(storage.base().id(), k -> new ArrayList<>()).add(storage); + } + + List result = new ArrayList<>(); + for (var baseId : fileStorageBaseByBaseId.keySet()) { + result.add(new FileStorageBaseWithStorage(fileStorageBaseByBaseId.get(baseId), fileStoragByBaseId.get(baseId))); + } + + return result; + } + + +} diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessOutboxFactory.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessOutboxFactory.java new file mode 100644 index 00000000..e1b5a3b1 --- /dev/null +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessOutboxFactory.java @@ -0,0 +1,27 @@ +package nu.marginalia.control.svc; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.converting.mqapi.ConverterInboxNames; +import nu.marginalia.mq.outbox.MqOutbox; +import nu.marginalia.mq.persistence.MqPersistence; +import nu.marginalia.service.server.BaseServiceParams; + +@Singleton +public class ProcessOutboxFactory { + private final BaseServiceParams params; + private final MqPersistence 
persistence; + + @Inject + public ProcessOutboxFactory(BaseServiceParams params, MqPersistence persistence) { + this.params = params; + this.persistence = persistence; + } + + public MqOutbox createConverterOutbox() { + return new MqOutbox(persistence, ConverterInboxNames.CONVERTER_INBOX, params.configuration.serviceName(), params.configuration.instanceUuid()); + } + public MqOutbox createLoaderOutbox() { + return new MqOutbox(persistence, ConverterInboxNames.LOADER_INBOX, params.configuration.serviceName(), params.configuration.instanceUuid()); + } +} diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java index b5198a9e..e1034921 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java @@ -44,13 +44,12 @@ public class ProcessService { this.distPath = distPath; } - public boolean trigger(ProcessId processId, Path plan) throws Exception { + public boolean trigger(ProcessId processId) throws Exception { String processPath = processPath(processId); String[] args = new String[] { - processPath, - plan.toString() + processPath }; - String[] env = env(plan); + String[] env = env(); Process process; @@ -58,10 +57,7 @@ public class ProcessService { logger.error("Process not found: {}", processPath); return false; } - if (!Files.exists(plan)) { - logger.error("Plan not found: {}", processPath); - return false; - } + logger.info("Starting process: {}", processId + ": " + Arrays.toString(args) + " // " + Arrays.toString(env)); synchronized (processes) { @@ -111,7 +107,7 @@ public class ProcessService { return distPath.resolve(id.path).toString(); } - private String[] env(Path plan) { + private String[] env() { Map opts = new HashMap<>(); String WMSA_HOME = 
System.getenv("WMSA_HOME"); @@ -120,7 +116,6 @@ public class ProcessService { } opts.put("WMSA_HOME", WMSA_HOME); opts.put("JAVA_HOME", System.getenv("JAVA_HOME")); - opts.put("JAVA_OPTS", "-Dcrawl.rootDirRewrite=/crawl:" + plan.getParent().toString()); return opts.entrySet().stream().map(e -> e.getKey() + "=" + e.getValue()).toArray(String[]::new); } diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/partials/nav.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/partials/nav.hdb index 9b68f4b2..e3f38897 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/partials/nav.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/partials/nav.hdb @@ -3,5 +3,6 @@
  • Overview
  • Services
  • Processes
  • +
  • Storage
  • \ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/storage.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/storage.hdb new file mode 100644 index 00000000..72f55e29 --- /dev/null +++ b/code/services-satellite/control-service/src/main/resources/templates/control/storage.hdb @@ -0,0 +1,52 @@ + + + + Control Service + + + + + {{> control/partials/nav}} +
    + + + {{#each storage}} + + + + + + + + + + + + + + + + + + + + + {{#each storage}} + + + + + + + {{/each}} + {{/each}} +
    TypeNamePathMust CleanPermit Temp
    {{base.type}}{{base.name}}{{base.path}}{{base.mustClean}}{{base.permitTemp}}
    TypePathDescription
    {{type}}{{path}}{{description}}
    +
    + + + + \ No newline at end of file diff --git a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java index 4febc294..16c06e45 100644 --- a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java +++ b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java @@ -39,7 +39,7 @@ public class ExperimentRunnerMain { Injector injector = Guice.createInjector( new DatabaseModule(), - new ConverterModule(plan) + new ConverterModule() ); Experiment experiment = injector.getInstance(experiments.get(args[1])); diff --git a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java index 4e55deec..7bf2f784 100644 --- a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java +++ b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java @@ -2,14 +2,10 @@ package nu.marginalia.tools.experiments; import com.google.inject.Inject; import nu.marginalia.WmsaHome; -import nu.marginalia.adblock.GoogleAnwersSpamDetector; import nu.marginalia.converting.processor.logic.dom.DomPruningFilter; import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.language.sentence.SentenceExtractor; import nu.marginalia.tools.Experiment; -import nu.marginalia.topic.RecipeDetector; -import nu.marginalia.topic.TextileCraftDetector; -import nu.marginalia.topic.WoodworkingDetector; import org.jsoup.Jsoup; import java.io.BufferedOutputStream; diff --git a/run/setup.sh b/run/setup.sh index f4287bd5..ba4ac355 100755 --- a/run/setup.sh +++ b/run/setup.sh @@ -18,7 +18,7 @@ function download_model { pushd $(dirname $0) -mkdir -p model logs db 
samples install vol/ir/{0,1}/ vol/iw/{0,1}/search-sets vol/{tmpf,tmps} data +mkdir -p model logs db samples install vol/ir/{0,1}/ vol/{lr,lw} vol/iw/{0,1}/search-sets vol/{tmpf,tmps} data download_model model/English.DICT https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/master/Models/POS/English.DICT download_model model/English.RDR https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/master/Models/POS/English.RDR From cdae74d3952333f440c30c4f04582427be5c051c Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 15 Jul 2023 14:11:35 +0200 Subject: [PATCH 053/157] (control) Working redirects --- .../nu/marginalia/control/ControlService.java | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java index d224159c..e2e53017 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java @@ -83,25 +83,35 @@ public class ControlService extends Service { Spark.post("/public/fsms/:fsm/start", (req, rsp) -> { controlProcesses.start(ControlProcess.valueOf(req.params("fsm").toUpperCase())); - rsp.redirect("/processes"); - return ""; + return """ + + + """; }); Spark.post("/public/fsms/:fsm/stop", (req, rsp) -> { controlProcesses.stop(ControlProcess.valueOf(req.params("fsm").toUpperCase())); - rsp.redirect("/processes"); - return ""; + return """ + + + """; }); // TODO: This should be a POST Spark.get("/public/repartition", (req, rsp) -> { controlProcesses.start(ControlProcess.REPARTITION_REINDEX); - return "OK"; + return """ + + + """; }); // TODO: This should be a POST Spark.get("/public/reconvert", (req, rsp) -> { controlProcesses.start(ControlProcess.RECONVERT_LOAD, 
FileStorageId.of(11)); - return "OK"; + return """ + + + """; }); Spark.get("/public/:resource", this::serveStatic); From 5ec10634d8c621895748a1a1e2110bd1ebb1ab3c Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 15 Jul 2023 14:11:48 +0200 Subject: [PATCH 054/157] (mqfsm) Abortable state machine --- .../nu/marginalia/mq/MessageQueueFactory.java | 4 +- .../mq/inbox/MqSynchronousInbox.java | 37 ++++++++++++++++--- .../java/nu/marginalia/mqsm/StateMachine.java | 8 +++- .../mqsm/graph/AbstractStateGraph.java | 7 +++- 4 files changed, 46 insertions(+), 10 deletions(-) diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/MessageQueueFactory.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/MessageQueueFactory.java index 5791793e..bc664d38 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mq/MessageQueueFactory.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/MessageQueueFactory.java @@ -26,12 +26,12 @@ public class MessageQueueFactory { } - public MqInboxIf createAsynchronousInbox(String inboxName, UUID instanceUUID) + public MqAsynchronousInbox createAsynchronousInbox(String inboxName, UUID instanceUUID) { return new MqAsynchronousInbox(persistence, inboxName, instanceUUID); } - public MqInboxIf createSynchronousInbox(String inboxName, UUID instanceUUID) + public MqSynchronousInbox createSynchronousInbox(String inboxName, UUID instanceUUID) { return new MqSynchronousInbox(persistence, inboxName, instanceUUID); } diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSynchronousInbox.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSynchronousInbox.java index a150a239..af0b5197 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSynchronousInbox.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSynchronousInbox.java @@ -7,10 +7,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import 
java.sql.SQLException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; -import java.util.UUID; +import java.util.*; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; /** Message queue inbox that responds to a single message at a time @@ -29,6 +28,7 @@ public class MqSynchronousInbox implements MqInboxIf { private final List eventSubscribers = new ArrayList<>(); private Thread pollDbThread; + private ExecutorService executorService = Executors.newSingleThreadExecutor(); public MqSynchronousInbox(MqPersistence persistence, String inboxName, @@ -74,6 +74,8 @@ public class MqSynchronousInbox implements MqInboxIf { run = false; pollDbThread.join(); + executorService.shutdown(); + executorService.awaitTermination(10, TimeUnit.SECONDS); } @@ -101,7 +103,8 @@ public class MqSynchronousInbox implements MqInboxIf { try { subscriber.onNotification(msg); updateMessageState(msg, MqMessageState.OK); - } catch (Exception ex) { + } + catch (Exception ex) { logger.error("Message Queue subscriber threw exception", ex); updateMessageState(msg, MqMessageState.ERR); } @@ -134,6 +137,7 @@ public class MqSynchronousInbox implements MqInboxIf { } } + private volatile java.util.concurrent.Future currentTask = null; public void pollDb() { try { for (long tick = 1; run; tick++) { @@ -141,7 +145,18 @@ public class MqSynchronousInbox implements MqInboxIf { var messages = pollInbox(tick); for (var msg : messages) { - handleMessage(msg); + // Handle message in a separate thread but wait for that thread, so we can interrupt that thread + // without interrupting the polling thread and shutting down the inbox completely + try { + currentTask = executorService.submit(() -> handleMessage(msg)); + currentTask.get(); + } + catch (Exception ex) { + logger.error("Inbox task was aborted", ex); + } + finally { + currentTask = null; + } } if (messages.isEmpty()) { @@ -154,6 +169,16 @@ public class 
MqSynchronousInbox implements MqInboxIf { } } + /** Attempt to abort the current task using an interrupt */ + public void abortCurrentTask() { + var task = currentTask; // capture the value to avoid race conditions with the + // polling thread between the check and the interrupt + if (task != null) { + task.cancel(true); + } + } + + private void handleMessage(MqMessage msg) { logger.info("Notifying subscribers of msg {}", msg.msgId()); diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java index 94e969a9..e3937894 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java @@ -6,6 +6,7 @@ import nu.marginalia.mq.MqMessageState; import nu.marginalia.mq.inbox.MqInboxIf; import nu.marginalia.mq.inbox.MqInboxResponse; import nu.marginalia.mq.inbox.MqSubscription; +import nu.marginalia.mq.inbox.MqSynchronousInbox; import nu.marginalia.mq.outbox.MqOutbox; import nu.marginalia.mqsm.graph.ResumeBehavior; import nu.marginalia.mqsm.graph.AbstractStateGraph; @@ -25,7 +26,7 @@ import java.util.function.BiConsumer; public class StateMachine { private final Logger logger = LoggerFactory.getLogger(StateMachine.class); - private final MqInboxIf smInbox; + private final MqSynchronousInbox smInbox; private final MqOutbox smOutbox; private final String queueName; @@ -291,6 +292,11 @@ public class StateMachine { // Add a state transition to the final state smOutbox.notify(abortMsgId, finalState.name(), ""); + + // Dislodge the current task with an interrupt. + // It's actually fine if we accidentally interrupt the wrong thread + // (i.e. 
the abort task), since it shouldn't be doing anything interruptable + smInbox.abortCurrentTask(); } private class StateEventSubscription implements MqSubscription { diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/AbstractStateGraph.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/AbstractStateGraph.java index 2890910f..3b19f764 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/AbstractStateGraph.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/AbstractStateGraph.java @@ -124,7 +124,12 @@ public abstract class AbstractStateGraph { if (ex instanceof ControlFlowException cfe) { return stateFactory.transition(cfe.getState(), cfe.getPayload()); - } else { + } + else if (ex instanceof InterruptedException intE) { + logger.error("State execution was interrupted " + state); + return StateTransition.to("ERR", "Execution interrupted"); + } + else { logger.error("Error in state invocation " + state, ex); return StateTransition.to("ERROR", "Exception: " + ex.getClass().getSimpleName() + "/" + ex.getMessage()); From c4dd9a0547b5c443b20b6a46c5050ce1bd12ea6f Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 16 Jul 2023 11:58:47 +0200 Subject: [PATCH 055/157] (control) Use MQFSMs to monitor and spawn processes when messages are sent to them --- .../mq/persistence/MqPersistence.java | 63 ++++++++++++++-- .../control/model/ControlProcess.java | 5 +- .../control/process/ControlProcesses.java | 7 +- .../process/ConverterMonitorProcess.java | 73 +++++++++++++++++++ .../control/process/LoaderMonitorProcess.java | 73 +++++++++++++++++++ .../process/ReconvertAndLoadProcess.java | 42 +++++------ 6 files changed, 230 insertions(+), 33 deletions(-) create mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ConverterMonitorProcess.java create mode 100644 
code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/LoaderMonitorProcess.java diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java index 4f2cc564..5d6511f4 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java @@ -191,6 +191,48 @@ public class MqPersistence { } } + /** Return up to n unprocessed messages from the specified inbox that are in states 'NEW' or 'ACK' */ + public Collection eavesdrop(String inboxName, int n) throws SQLException { + try (var conn = dataSource.getConnection(); + var queryStmt = conn.prepareStatement(""" + SELECT + ID, + RELATED_ID, + FUNCTION, + PAYLOAD, + STATE, + SENDER_INBOX IS NOT NULL AS EXPECTS_RESPONSE + FROM MESSAGE_QUEUE + WHERE STATE IN ('NEW', 'ACK') + AND RECIPIENT_INBOX=? + LIMIT ? + """) + ) { + queryStmt.setString(1, inboxName); + queryStmt.setInt(2, n); + var rs = queryStmt.executeQuery(); + + List messages = new ArrayList<>(n); + + while (rs.next()) { + long msgId = rs.getLong("ID"); + long relatedId = rs.getLong("RELATED_ID"); + + String function = rs.getString("FUNCTION"); + String payload = rs.getString("PAYLOAD"); + + MqMessageState state = MqMessageState.valueOf(rs.getString("STATE")); + boolean expectsResponse = rs.getBoolean("EXPECTS_RESPONSE"); + + var msg = new MqMessage(msgId, relatedId, function, payload, state, expectsResponse); + + messages.add(msg); + } + + return messages; + } + +} /** Marks unclaimed messages addressed to this inbox with instanceUUID and tick, * then returns these messages. 
*/ @@ -205,7 +247,14 @@ public class MqPersistence { // Then fetch the messages that were marked try (var conn = dataSource.getConnection(); var queryStmt = conn.prepareStatement(""" - SELECT ID, RELATED_ID, FUNCTION, PAYLOAD, STATE, SENDER_INBOX FROM MESSAGE_QUEUE + SELECT + ID, + RELATED_ID, + FUNCTION, + PAYLOAD, + STATE, + SENDER_INBOX IS NOT NULL AS EXPECTS_RESPONSE + FROM MESSAGE_QUEUE WHERE OWNER_INSTANCE=? AND OWNER_TICK=? """) ) { @@ -216,14 +265,14 @@ public class MqPersistence { List messages = new ArrayList<>(expected); while (rs.next()) { - long msgId = rs.getLong(1); - long relatedId = rs.getLong(2); + long msgId = rs.getLong("ID"); + long relatedId = rs.getLong("RELATED_ID"); - String function = rs.getString(3); - String payload = rs.getString(4); + String function = rs.getString("FUNCTION"); + String payload = rs.getString("PAYLOAD"); - MqMessageState state = MqMessageState.valueOf(rs.getString(5)); - boolean expectsResponse = rs.getBoolean(6); + MqMessageState state = MqMessageState.valueOf(rs.getString("STATE")); + boolean expectsResponse = rs.getBoolean("EXPECTS_RESPONSE"); var msg = new MqMessage(msgId, relatedId, function, payload, state, expectsResponse); diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ControlProcess.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ControlProcess.java index b7db26db..6cdc219a 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ControlProcess.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ControlProcess.java @@ -2,7 +2,10 @@ package nu.marginalia.control.model; public enum ControlProcess { REPARTITION_REINDEX, - RECONVERT_LOAD; + RECONVERT_LOAD, + CONVERTER_MONITOR, + LOADER_MONITOR + ; public String id() { return "fsm:" + name().toLowerCase(); diff --git 
a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ControlProcesses.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ControlProcesses.java index 404bd273..eb5eaef7 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ControlProcesses.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ControlProcesses.java @@ -31,13 +31,18 @@ public class ControlProcesses { GsonFactory gsonFactory, BaseServiceParams baseServiceParams, RepartitionReindexProcess repartitionReindexProcess, - ReconvertAndLoadProcess reconvertAndLoadProcess + ReconvertAndLoadProcess reconvertAndLoadProcess, + ConverterMonitorProcess converterMonitorProcess, + LoaderMonitorProcess loaderMonitorProcess ) { this.messageQueueFactory = messageQueueFactory; this.eventLog = baseServiceParams.eventLog; this.gson = gsonFactory.get(); + register(ControlProcess.REPARTITION_REINDEX, repartitionReindexProcess); register(ControlProcess.RECONVERT_LOAD, reconvertAndLoadProcess); + register(ControlProcess.CONVERTER_MONITOR, converterMonitorProcess); + register(ControlProcess.LOADER_MONITOR, loaderMonitorProcess); } private void register(ControlProcess process, AbstractStateGraph graph) { diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ConverterMonitorProcess.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ConverterMonitorProcess.java new file mode 100644 index 00000000..a1c0258f --- /dev/null +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ConverterMonitorProcess.java @@ -0,0 +1,73 @@ +package nu.marginalia.control.process; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.control.svc.ProcessService; +import nu.marginalia.converting.mqapi.ConverterInboxNames; +import 
nu.marginalia.mq.persistence.MqPersistence; +import nu.marginalia.mqsm.StateFactory; +import nu.marginalia.mqsm.graph.AbstractStateGraph; +import nu.marginalia.mqsm.graph.GraphState; +import nu.marginalia.mqsm.graph.ResumeBehavior; + +import java.sql.SQLException; +import java.util.concurrent.TimeUnit; + +@Singleton +public class ConverterMonitorProcess extends AbstractStateGraph { + + private final MqPersistence persistence; + private final ProcessService processService; + public static final String INITIAL = "INITIAL"; + public static final String CHECK = "CHECK"; + public static final String RUN = "RUN"; + public static final String END = "END"; + + public static final int MAX_ATTEMPTS = 3; + public static final String inboxName = ConverterInboxNames.CONVERTER_INBOX; + public static final ProcessService.ProcessId processId = ProcessService.ProcessId.CONVERTER; + + @Inject + public ConverterMonitorProcess(StateFactory stateFactory, + MqPersistence persistence, + ProcessService processService) { + super(stateFactory); + this.persistence = persistence; + this.processService = processService; + } + + @GraphState(name = INITIAL, next = CHECK) + public void init() { + + } + + @GraphState(name = CHECK, resume = ResumeBehavior.RETRY) + public void check() throws SQLException, InterruptedException { + + for (;;) { + var messages = persistence.eavesdrop(inboxName, 1); + + if (messages.isEmpty() && !processService.isRunning(processId)) { + TimeUnit.SECONDS.sleep(5); + } else { + transition(RUN, 0); + } + } + } + + @GraphState(name = RUN) + public void run(Integer attempts) throws Exception { + try { + processService.trigger(processId); + } + catch (Exception e) { + if (attempts < MAX_ATTEMPTS) { + transition(RUN, attempts + 1); + } + else throw e; + } + + transition(CHECK); + } + +} diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/LoaderMonitorProcess.java 
b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/LoaderMonitorProcess.java new file mode 100644 index 00000000..813c7da7 --- /dev/null +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/LoaderMonitorProcess.java @@ -0,0 +1,73 @@ +package nu.marginalia.control.process; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.control.svc.ProcessService; +import nu.marginalia.converting.mqapi.ConverterInboxNames; +import nu.marginalia.mq.persistence.MqPersistence; +import nu.marginalia.mqsm.StateFactory; +import nu.marginalia.mqsm.graph.AbstractStateGraph; +import nu.marginalia.mqsm.graph.GraphState; +import nu.marginalia.mqsm.graph.ResumeBehavior; + +import java.sql.SQLException; +import java.util.concurrent.TimeUnit; + +@Singleton +public class LoaderMonitorProcess extends AbstractStateGraph { + + private final MqPersistence persistence; + private final ProcessService processService; + public static final String INITIAL = "INITIAL"; + public static final String CHECK = "CHECK"; + public static final String RUN = "RUN"; + public static final String END = "END"; + + public static final int MAX_ATTEMPTS = 1; + public static final String inboxName = ConverterInboxNames.LOADER_INBOX; + public static final ProcessService.ProcessId processId = ProcessService.ProcessId.LOADER; + + @Inject + public LoaderMonitorProcess(StateFactory stateFactory, + MqPersistence persistence, + ProcessService processService) { + super(stateFactory); + this.persistence = persistence; + this.processService = processService; + } + + @GraphState(name = INITIAL, next = CHECK) + public void init() { + + } + + @GraphState(name = CHECK, resume = ResumeBehavior.RETRY) + public void check() throws SQLException, InterruptedException { + + for (;;) { + var messages = persistence.eavesdrop(inboxName, 1); + + if (messages.isEmpty() && !processService.isRunning(processId)) { + 
TimeUnit.SECONDS.sleep(5); + } else { + transition(RUN, 0); + } + } + } + + @GraphState(name = RUN) + public void run(Integer attempts) throws Exception { + try { + processService.trigger(processId); + } + catch (Exception e) { + if (attempts < MAX_ATTEMPTS) { + transition(RUN, attempts + 1); + } + else throw e; + } + + transition(CHECK); + } + +} diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ReconvertAndLoadProcess.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ReconvertAndLoadProcess.java index b72876a9..1c1439f9 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ReconvertAndLoadProcess.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ReconvertAndLoadProcess.java @@ -105,14 +105,6 @@ public class ReconvertAndLoadProcess extends AbstractStateGraph { var request = new ConvertRequest(message.crawlStorageId, processedArea.id()); long id = mqConverterOutbox.sendAsync(ConvertRequest.class.getSimpleName(), gson.toJson(request)); - Executors.defaultThreadFactory().newThread(() -> { - try { - processService.trigger(ProcessService.ProcessId.CONVERTER); - } catch (Exception e) { - throw new RuntimeException(e); - } - }).start(); - return message .withProcessedStorageId(processedArea.id()) .withConverterMsgId(id); @@ -134,14 +126,6 @@ public class ReconvertAndLoadProcess extends AbstractStateGraph { var request = new LoadRequest(message.processedStorageId); long id = mqLoaderOutbox.sendAsync(LoadRequest.class.getSimpleName(), gson.toJson(request)); - Executors.defaultThreadFactory().newThread(() -> { - try { - processService.trigger(ProcessService.ProcessId.LOADER); - } catch (Exception e) { - throw new RuntimeException(e); - } - }).start(); - return message.withLoaderMsgId(id); } @@ -155,23 +139,33 @@ public class ReconvertAndLoadProcess extends AbstractStateGraph { } public MqMessage 
waitResponse(MqOutbox outbox, ProcessService.ProcessId processId, long id) throws Exception { - + if (!waitForProcess(processId, TimeUnit.SECONDS, 30)) { + error("Process " + processId + " did not launch"); + } for (;;) { try { return outbox.waitResponse(id, 1, TimeUnit.SECONDS); } catch (TimeoutException ex) { - if (!processService.isRunning(processId)) { - try { - return outbox.waitResponse(id, 10, TimeUnit.SECONDS); - } - catch (TimeoutException ex2) { - error("Process " + processId + " is not running"); - } + if (!waitForProcess(processId, TimeUnit.SECONDS, 30)) { + error("Process " + processId + " died and did not re-launch"); } } } + } + public boolean waitForProcess(ProcessService.ProcessId processId, TimeUnit unit, int duration) throws InterruptedException { + + // Wait for process to start + long deadline = System.currentTimeMillis() + TimeUnit.SECONDS.toMillis(30); + while (System.currentTimeMillis() < deadline) { + if (processService.isRunning(processId)) + return true; + + TimeUnit.SECONDS.sleep(1); + } + + return false; } // @GraphState(name = MOVE_INDEX_FILES, next = RELOAD_LEXICON, resume = ResumeBehavior.ERROR) From 6e41e78f364b92d431ec598c610d550dd14cf442 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 16 Jul 2023 12:03:32 +0200 Subject: [PATCH 056/157] (control) Highlight missing processes --- .../src/main/resources/static/control/style.css | 3 +++ 1 file changed, 3 insertions(+) diff --git a/code/services-satellite/control-service/src/main/resources/static/control/style.css b/code/services-satellite/control-service/src/main/resources/static/control/style.css index 26b96fbf..d3a2aacd 100644 --- a/code/services-satellite/control-service/src/main/resources/static/control/style.css +++ b/code/services-satellite/control-service/src/main/resources/static/control/style.css @@ -8,6 +8,9 @@ body { grid-template-areas: "left right"; } +#services .missing { + color: #800; +} .uuidPip { margin-left: 0.25ch; border-radius: 2ch; From
e618aa34e98d261722c1fb3ba5ed51e5ee36bfcd Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 17 Jul 2023 12:27:27 +0200 Subject: [PATCH 057/157] (control) Name change process->fsm, new fsm:s * FSM for spawning processes when messages appear for them * FSM for removing data flagged for purging --- .../db/storage/FileStorageService.java | 25 +++++++ .../mq/inbox/MqSingleShotInbox.java | 28 +++++++ .../mq/inbox/MqSynchronousInbox.java | 2 +- .../mq/persistence/MqPersistence.java | 16 ++++ .../marginalia/converting/ConverterMain.java | 27 +++++-- .../nu/marginalia/loading/LoaderMain.java | 18 ++++- .../control-service/build.gradle | 1 + .../nu/marginalia/control/ControlService.java | 20 ++--- .../ControlFSMs.java} | 37 ++++++---- .../monitor/AbstractProcessSpawnerFSM.java} | 29 ++++---- .../fsm/monitor/ConverterMonitorFSM.java | 22 ++++++ .../fsm/monitor/FileStorageMonitorFSM.java | 72 ++++++++++++++++++ .../control/fsm/monitor/LoaderMonitorFSM.java | 24 ++++++ .../fsm/monitor/MessageQueueMonitorFSM.java | 45 ++++++++++++ .../monitor/ProcessLivenessMonitorFSM.java | 55 ++++++++++++++ .../task/ReconvertAndLoadFSM.java} | 24 +++--- .../task/RepartitionReindexFSM.java} | 8 +- .../control/model/ControlProcess.java | 6 +- .../control/model/ProcessHeartbeat.java | 14 ++++ .../control/process/LoaderMonitorProcess.java | 73 ------------------- .../control/svc/HeartbeatService.java | 25 ++++++- .../svc/MessageQueueMonitorService.java | 62 ---------------- 22 files changed, 429 insertions(+), 204 deletions(-) rename code/services-satellite/control-service/src/main/java/nu/marginalia/control/{process/ControlProcesses.java => fsm/ControlFSMs.java} (67%) rename code/services-satellite/control-service/src/main/java/nu/marginalia/control/{process/ConverterMonitorProcess.java => fsm/monitor/AbstractProcessSpawnerFSM.java} (63%) create mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/ConverterMonitorFSM.java create mode 100644 
code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/FileStorageMonitorFSM.java create mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/LoaderMonitorFSM.java create mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/MessageQueueMonitorFSM.java create mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/ProcessLivenessMonitorFSM.java rename code/services-satellite/control-service/src/main/java/nu/marginalia/control/{process/ReconvertAndLoadProcess.java => fsm/task/ReconvertAndLoadFSM.java} (90%) rename code/services-satellite/control-service/src/main/java/nu/marginalia/control/{process/RepartitionReindexProcess.java => fsm/task/RepartitionReindexFSM.java} (90%) delete mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/LoaderMonitorProcess.java delete mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueMonitorService.java diff --git a/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageService.java b/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageService.java index a954b6bb..7ed94a46 100644 --- a/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageService.java +++ b/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageService.java @@ -11,6 +11,7 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.attribute.PosixFilePermissions; import java.sql.SQLException; +import java.util.Optional; /** Manages file storage for processes and services */ @@ -23,6 +24,21 @@ public class FileStorageService { this.dataSource = dataSource; } + public Optional findFileStorageToDelete() { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT ID FROM FILE_STORAGE WHERE DO_PURGE LIMIT 1 + """)) { 
+ var rs = stmt.executeQuery(); + if (rs.next()) { + return Optional.of(getStorage(new FileStorageId(rs.getLong(1)))); + } + } catch (SQLException e) { + return Optional.empty(); + } + return Optional.empty(); + } + /** @return the storage base with the given id, or null if it does not exist */ public FileStorageBase getStorageBase(FileStorageBaseId type) throws SQLException { try (var conn = dataSource.getConnection(); @@ -278,4 +294,13 @@ public class FileStorageService { } } + public void removeFileStorage(FileStorageId id) throws SQLException { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + DELETE FROM FILE_STORAGE WHERE ID = ? + """)) { + stmt.setLong(1, id.id()); + stmt.executeUpdate(); + } + } } diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSingleShotInbox.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSingleShotInbox.java index 791a195c..85f7e2f5 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSingleShotInbox.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSingleShotInbox.java @@ -1,5 +1,6 @@ package nu.marginalia.mq.inbox; +import lombok.SneakyThrows; import nu.marginalia.mq.MqMessage; import nu.marginalia.mq.persistence.MqPersistence; @@ -7,6 +8,7 @@ import java.sql.SQLException; import java.util.Optional; import java.util.UUID; import java.util.concurrent.TimeUnit; +import java.util.function.Predicate; /** A single-shot inbox that can be used to wait for a single message * to arrive in an inbox, and then reply to that message @@ -26,6 +28,12 @@ public class MqSingleShotInbox { this.persistence = persistence; } + /** Wait for a message to arrive in the specified inbox, up to the specified timeout. 
+ * + * @param timeout The timeout + * @param unit The time unit + * @return The message, or empty if no message arrived before the timeout + */ public Optional waitForMessage(long timeout, TimeUnit unit) throws InterruptedException, SQLException { final long deadline = System.currentTimeMillis() + unit.toMillis(timeout); @@ -44,6 +52,25 @@ public class MqSingleShotInbox { } } + + /** Steal a message from the inbox, and change the owner to this instance. This is useful + * for resuming an aborted process. + * + * @param predicate A predicate that must be true for the message to be stolen + * @return The stolen message, or empty if no message was stolen + */ + @SneakyThrows + public Optional stealMessage(Predicate predicate) { + for (var message : persistence.eavesdrop(inboxName, 5)) { + if (predicate.test(message)) { + persistence.changeOwner(message.msgId(), instanceUUID, -1); + return Optional.of(message); + } + } + + return Optional.empty(); + } + public void sendResponse(MqMessage originalMessage, MqInboxResponse response) { try { persistence.sendResponse(originalMessage.msgId(), response.state(), response.message()); @@ -51,4 +78,5 @@ public class MqSingleShotInbox { throw new RuntimeException(e); } } + } diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSynchronousInbox.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSynchronousInbox.java index af0b5197..09749209 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSynchronousInbox.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSynchronousInbox.java @@ -152,7 +152,7 @@ public class MqSynchronousInbox implements MqInboxIf { currentTask.get(); } catch (Exception ex) { - logger.error("Inbox task was aborted", ex); + logger.error("Inbox task was aborted"); } finally { currentTask = null; diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java 
b/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java index 5d6511f4..dce9d402 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java @@ -362,4 +362,20 @@ public class MqPersistence { } } + + public void changeOwner(long id, String instanceUUID, int tick) { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + UPDATE MESSAGE_QUEUE SET OWNER_INSTANCE=?, OWNER_TICK=? + WHERE ID=? + """)) { + stmt.setString(1, instanceUUID); + stmt.setInt(2, tick); + stmt.setLong(3, id); + stmt.executeUpdate(); + } catch (SQLException e) { + throw new RuntimeException(e); + } + } + } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java index b07060cb..a42f5b67 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java @@ -22,7 +22,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; +import java.sql.SQLException; import java.util.List; +import java.util.Optional; import java.util.UUID; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; @@ -174,14 +176,8 @@ public class ConverterMain { var inbox = messageQueueFactory.createSingleShotInbox(CONVERTER_INBOX, UUID.randomUUID()); - var msgOpt = inbox.waitForMessage(30, TimeUnit.SECONDS); - if (msgOpt.isEmpty()) - throw new RuntimeException("No instruction received in inbox"); - var msg = msgOpt.get(); - - if (!nu.marginalia.converting.mqapi.ConvertRequest.class.getSimpleName().equals(msg.function())) { - throw new RuntimeException("Unexpected message in inbox: " + msg); - } + 
var msgOpt = getMessage(inbox, nu.marginalia.converting.mqapi.ConvertRequest.class.getSimpleName()); + var msg = msgOpt.orElseThrow(() -> new RuntimeException("No message received")); var request = gson.fromJson(msg.payload(), nu.marginalia.converting.mqapi.ConvertRequest.class); @@ -195,6 +191,21 @@ public class ConverterMain { return new ConvertRequest(plan, msg, inbox); } + private Optional getMessage(MqSingleShotInbox inbox, String expectedFunction) throws SQLException, InterruptedException { + var opt = inbox.waitForMessage(30, TimeUnit.SECONDS); + if (opt.isPresent()) { + if (!opt.get().function().equals(expectedFunction)) { + throw new RuntimeException("Unexpected function: " + opt.get().function()); + } + return opt; + } + else { + var stolenMessage = inbox.stealMessage(msg -> msg.function().equals(expectedFunction)); + stolenMessage.ifPresent(mqMessage -> logger.info("Stole message {}", mqMessage)); + return stolenMessage; + } + } + record ProcessingInstructions(String id, List instructions) {} diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java index 7250889d..08649808 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java @@ -25,6 +25,7 @@ import org.slf4j.LoggerFactory; import java.nio.file.Path; import java.sql.SQLException; import java.util.List; +import java.util.Optional; import java.util.UUID; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.TimeUnit; @@ -214,7 +215,7 @@ public class LoaderMain { var inbox = messageQueueFactory.createSingleShotInbox(LOADER_INBOX, UUID.randomUUID()); - var msgOpt = inbox.waitForMessage(30, TimeUnit.SECONDS); + var msgOpt = getMessage(inbox, nu.marginalia.converting.mqapi.LoadRequest.class.getSimpleName()); if (msgOpt.isEmpty()) throw 
new RuntimeException("No instruction received in inbox"); var msg = msgOpt.get(); @@ -232,4 +233,19 @@ public class LoaderMain { return new LoadRequest(plan, msg, inbox); } + private Optional getMessage(MqSingleShotInbox inbox, String expectedFunction) throws SQLException, InterruptedException { + var opt = inbox.waitForMessage(30, TimeUnit.SECONDS); + if (opt.isPresent()) { + if (!opt.get().function().equals(expectedFunction)) { + throw new RuntimeException("Unexpected function: " + opt.get().function()); + } + return opt; + } + else { + var stolenMessage = inbox.stealMessage(msg -> msg.function().equals(expectedFunction)); + stolenMessage.ifPresent(mqMessage -> logger.info("Stole message {}", mqMessage)); + return stolenMessage; + } + } + } diff --git a/code/services-satellite/control-service/build.gradle b/code/services-satellite/control-service/build.gradle index d90d926a..72c0552e 100644 --- a/code/services-satellite/control-service/build.gradle +++ b/code/services-satellite/control-service/build.gradle @@ -46,6 +46,7 @@ dependencies { implementation libs.trove implementation libs.spark implementation libs.fastutil + implementation libs.commons.io implementation libs.bundles.gson implementation libs.bundles.mariadb diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java index e2e53017..34e600f5 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java @@ -4,14 +4,12 @@ import com.google.gson.Gson; import com.google.inject.Inject; import nu.marginalia.client.ServiceMonitors; import nu.marginalia.control.model.ControlProcess; -import nu.marginalia.control.process.ControlProcesses; +import nu.marginalia.control.fsm.ControlFSMs; import nu.marginalia.control.svc.*; 
import nu.marginalia.db.storage.model.FileStorageId; import nu.marginalia.model.gson.GsonFactory; -import nu.marginalia.mq.persistence.MqPersistence; import nu.marginalia.renderer.MustacheRenderer; import nu.marginalia.renderer.RendererFactory; -import nu.marginalia.service.control.ServiceEventLog; import nu.marginalia.service.server.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -20,9 +18,7 @@ import spark.Response; import spark.Spark; import java.io.IOException; -import java.nio.file.Path; import java.util.Map; -import java.util.concurrent.TimeUnit; public class ControlService extends Service { @@ -43,7 +39,7 @@ public class ControlService extends Service { HeartbeatService heartbeatService, EventLogService eventLogService, RendererFactory rendererFactory, - ControlProcesses controlProcesses, + ControlFSMs controlFSMs, StaticResources staticResources, MessageQueueViewService messageQueueViewService, ControlFileStorageService controlFileStorageService @@ -73,7 +69,7 @@ public class ControlService extends Service { Spark.get("/public/processes", (req, rsp) -> Map.of("processes", heartbeatService.getProcessHeartbeats(), - "fsms", controlProcesses.getFsmStates(), + "fsms", controlFSMs.getFsmStates(), "messages", messageQueueViewService.getLastEntries(20)), (map) -> processesRenderer.render((Map) map)); @@ -82,14 +78,14 @@ public class ControlService extends Service { (map) -> storageRenderer.render((Map) map)); Spark.post("/public/fsms/:fsm/start", (req, rsp) -> { - controlProcesses.start(ControlProcess.valueOf(req.params("fsm").toUpperCase())); + controlFSMs.start(ControlProcess.valueOf(req.params("fsm").toUpperCase())); return """ """; }); Spark.post("/public/fsms/:fsm/stop", (req, rsp) -> { - controlProcesses.stop(ControlProcess.valueOf(req.params("fsm").toUpperCase())); + controlFSMs.stop(ControlProcess.valueOf(req.params("fsm").toUpperCase())); return """ @@ -98,7 +94,7 @@ public class ControlService extends Service { // TODO: This should be a 
POST Spark.get("/public/repartition", (req, rsp) -> { - controlProcesses.start(ControlProcess.REPARTITION_REINDEX); + controlFSMs.start(ControlProcess.REPARTITION_REINDEX); return """ @@ -106,8 +102,8 @@ public class ControlService extends Service { }); // TODO: This should be a POST - Spark.get("/public/reconvert", (req, rsp) -> { - controlProcesses.start(ControlProcess.RECONVERT_LOAD, FileStorageId.of(11)); + Spark.get("/public/reconvert/:fid", (req, rsp) -> { + controlFSMs.start(ControlProcess.RECONVERT_LOAD, FileStorageId.of(Integer.parseInt(req.params("fid")))); return """ diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ControlProcesses.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/ControlFSMs.java similarity index 67% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ControlProcesses.java rename to code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/ControlFSMs.java index eb5eaef7..0c756114 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ControlProcesses.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/ControlFSMs.java @@ -1,4 +1,4 @@ -package nu.marginalia.control.process; +package nu.marginalia.control.fsm; import com.google.gson.Gson; import com.google.inject.Inject; @@ -6,6 +6,11 @@ import com.google.inject.Singleton; import lombok.SneakyThrows; import nu.marginalia.control.model.ControlProcess; import nu.marginalia.control.model.ControlProcessState; +import nu.marginalia.control.fsm.monitor.*; +import nu.marginalia.control.fsm.monitor.ConverterMonitorFSM; +import nu.marginalia.control.fsm.monitor.LoaderMonitorFSM; +import nu.marginalia.control.fsm.task.ReconvertAndLoadFSM; +import nu.marginalia.control.fsm.task.RepartitionReindexFSM; import nu.marginalia.model.gson.GsonFactory; import 
nu.marginalia.mq.MessageQueueFactory; import nu.marginalia.mqsm.StateMachine; @@ -20,29 +25,35 @@ import java.util.Map; import java.util.UUID; @Singleton -public class ControlProcesses { +public class ControlFSMs { private final ServiceEventLog eventLog; private final Gson gson; private final MessageQueueFactory messageQueueFactory; public Map stateMachines = new HashMap<>(); @Inject - public ControlProcesses(MessageQueueFactory messageQueueFactory, - GsonFactory gsonFactory, - BaseServiceParams baseServiceParams, - RepartitionReindexProcess repartitionReindexProcess, - ReconvertAndLoadProcess reconvertAndLoadProcess, - ConverterMonitorProcess converterMonitorProcess, - LoaderMonitorProcess loaderMonitorProcess + public ControlFSMs(MessageQueueFactory messageQueueFactory, + GsonFactory gsonFactory, + BaseServiceParams baseServiceParams, + RepartitionReindexFSM repartitionReindexFSM, + ReconvertAndLoadFSM reconvertAndLoadFSM, + ConverterMonitorFSM converterMonitorFSM, + LoaderMonitorFSM loaderMonitor, + MessageQueueMonitorFSM messageQueueMonitor, + ProcessLivenessMonitorFSM processMonitorFSM, + FileStorageMonitorFSM fileStorageMonitorFSM ) { this.messageQueueFactory = messageQueueFactory; this.eventLog = baseServiceParams.eventLog; this.gson = gsonFactory.get(); - register(ControlProcess.REPARTITION_REINDEX, repartitionReindexProcess); - register(ControlProcess.RECONVERT_LOAD, reconvertAndLoadProcess); - register(ControlProcess.CONVERTER_MONITOR, converterMonitorProcess); - register(ControlProcess.LOADER_MONITOR, loaderMonitorProcess); + register(ControlProcess.REPARTITION_REINDEX, repartitionReindexFSM); + register(ControlProcess.RECONVERT_LOAD, reconvertAndLoadFSM); + register(ControlProcess.CONVERTER_MONITOR, converterMonitorFSM); + register(ControlProcess.LOADER_MONITOR, loaderMonitor); + register(ControlProcess.MESSAGE_QUEUE_MONITOR, messageQueueMonitor); + register(ControlProcess.PROCESS_LIVENESS_MONITOR, processMonitorFSM); + 
register(ControlProcess.FILE_STORAGE_MONITOR, fileStorageMonitorFSM); } private void register(ControlProcess process, AbstractStateGraph graph) { diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ConverterMonitorProcess.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/AbstractProcessSpawnerFSM.java similarity index 63% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ConverterMonitorProcess.java rename to code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/AbstractProcessSpawnerFSM.java index a1c0258f..75944553 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ConverterMonitorProcess.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/AbstractProcessSpawnerFSM.java @@ -1,9 +1,8 @@ -package nu.marginalia.control.process; +package nu.marginalia.control.fsm.monitor; import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.control.svc.ProcessService; -import nu.marginalia.converting.mqapi.ConverterInboxNames; import nu.marginalia.mq.persistence.MqPersistence; import nu.marginalia.mqsm.StateFactory; import nu.marginalia.mqsm.graph.AbstractStateGraph; @@ -14,35 +13,39 @@ import java.sql.SQLException; import java.util.concurrent.TimeUnit; @Singleton -public class ConverterMonitorProcess extends AbstractStateGraph { +public class AbstractProcessSpawnerFSM extends AbstractStateGraph { private final MqPersistence persistence; private final ProcessService processService; public static final String INITIAL = "INITIAL"; - public static final String CHECK = "CHECK"; + public static final String MONITOR = "MONITOR"; public static final String RUN = "RUN"; public static final String END = "END"; public static final int MAX_ATTEMPTS = 3; - public static final String inboxName = 
ConverterInboxNames.CONVERTER_INBOX; - public static final ProcessService.ProcessId processId = ProcessService.ProcessId.CONVERTER; + private final String inboxName; + private final ProcessService.ProcessId processId; @Inject - public ConverterMonitorProcess(StateFactory stateFactory, - MqPersistence persistence, - ProcessService processService) { + public AbstractProcessSpawnerFSM(StateFactory stateFactory, + MqPersistence persistence, + ProcessService processService, + String inboxName, + ProcessService.ProcessId processId) { super(stateFactory); this.persistence = persistence; this.processService = processService; + this.inboxName = inboxName; + this.processId = processId; } - @GraphState(name = INITIAL, next = CHECK) + @GraphState(name = INITIAL, next = MONITOR) public void init() { } - @GraphState(name = CHECK, resume = ResumeBehavior.RETRY) - public void check() throws SQLException, InterruptedException { + @GraphState(name = MONITOR, resume = ResumeBehavior.RETRY) + public void monitor() throws SQLException, InterruptedException { for (;;) { var messages = persistence.eavesdrop(inboxName, 1); @@ -67,7 +70,7 @@ public class ConverterMonitorProcess extends AbstractStateGraph { else throw e; } - transition(CHECK); + transition(MONITOR); } } diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/ConverterMonitorFSM.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/ConverterMonitorFSM.java new file mode 100644 index 00000000..d5dd3908 --- /dev/null +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/ConverterMonitorFSM.java @@ -0,0 +1,22 @@ +package nu.marginalia.control.fsm.monitor; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.control.svc.ProcessService; +import nu.marginalia.converting.mqapi.ConverterInboxNames; +import nu.marginalia.mq.persistence.MqPersistence; +import 
nu.marginalia.mqsm.StateFactory; + +@Singleton +public class ConverterMonitorFSM extends AbstractProcessSpawnerFSM { + + + @Inject + public ConverterMonitorFSM(StateFactory stateFactory, + MqPersistence persistence, + ProcessService processService) { + super(stateFactory, persistence, processService, ConverterInboxNames.CONVERTER_INBOX, ProcessService.ProcessId.CONVERTER); + } + + +} diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/FileStorageMonitorFSM.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/FileStorageMonitorFSM.java new file mode 100644 index 00000000..5d760dfc --- /dev/null +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/FileStorageMonitorFSM.java @@ -0,0 +1,72 @@ +package nu.marginalia.control.fsm.monitor; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.db.storage.FileStorageService; +import nu.marginalia.db.storage.model.FileStorage; +import nu.marginalia.db.storage.model.FileStorageId; +import nu.marginalia.mqsm.StateFactory; +import nu.marginalia.mqsm.graph.AbstractStateGraph; +import nu.marginalia.mqsm.graph.GraphState; +import nu.marginalia.mqsm.graph.ResumeBehavior; +import org.apache.commons.io.FileUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Optional; +import java.util.concurrent.TimeUnit; + +@Singleton +public class FileStorageMonitorFSM extends AbstractStateGraph { + + private final Logger logger = LoggerFactory.getLogger(getClass()); + // STATES + + private static final String INITIAL = "INITIAL"; + private static final String MONITOR = "MONITOR"; + private static final String PURGE = "PURGE"; + private static final String END = "END"; + private final FileStorageService fileStorageService; + + + @Inject + public FileStorageMonitorFSM(StateFactory 
stateFactory, + FileStorageService fileStorageService) { + super(stateFactory); + this.fileStorageService = fileStorageService; + } + + @GraphState(name = INITIAL, next = MONITOR) + public void init() { + } + + @GraphState(name = MONITOR, resume = ResumeBehavior.RETRY) + public void monitor() throws Exception { + + for (;;) { + Optional toDeleteOpt = fileStorageService.findFileStorageToDelete(); + + if (toDeleteOpt.isEmpty()) { + TimeUnit.SECONDS.sleep(10); + } + else { + transition(PURGE, toDeleteOpt.get().id()); + } + } + } + + @GraphState(name = PURGE, next = MONITOR, resume = ResumeBehavior.RETRY) + public void purge(FileStorageId id) throws Exception { + var storage = fileStorageService.getStorage(id); + logger.info("Deleting {} ", storage.path()); + Path path = storage.asPath(); + + if (Files.exists(path)) { + FileUtils.deleteDirectory(path.toFile()); + } + + fileStorageService.removeFileStorage(storage.id()); + } +} diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/LoaderMonitorFSM.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/LoaderMonitorFSM.java new file mode 100644 index 00000000..ff81433e --- /dev/null +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/LoaderMonitorFSM.java @@ -0,0 +1,24 @@ +package nu.marginalia.control.fsm.monitor; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.control.svc.ProcessService; +import nu.marginalia.converting.mqapi.ConverterInboxNames; +import nu.marginalia.mq.persistence.MqPersistence; +import nu.marginalia.mqsm.StateFactory; + +@Singleton +public class LoaderMonitorFSM extends AbstractProcessSpawnerFSM { + + + @Inject + public LoaderMonitorFSM(StateFactory stateFactory, + MqPersistence persistence, + ProcessService processService) { + + super(stateFactory, persistence, processService, + ConverterInboxNames.LOADER_INBOX, + 
ProcessService.ProcessId.LOADER); + } + +} diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/MessageQueueMonitorFSM.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/MessageQueueMonitorFSM.java new file mode 100644 index 00000000..d6c5ff82 --- /dev/null +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/MessageQueueMonitorFSM.java @@ -0,0 +1,45 @@ +package nu.marginalia.control.fsm.monitor; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.mq.persistence.MqPersistence; +import nu.marginalia.mqsm.StateFactory; +import nu.marginalia.mqsm.graph.AbstractStateGraph; +import nu.marginalia.mqsm.graph.GraphState; +import nu.marginalia.mqsm.graph.ResumeBehavior; + +import java.util.concurrent.TimeUnit; + +@Singleton +public class MessageQueueMonitorFSM extends AbstractStateGraph { + + // STATES + + private static final String INITIAL = "INITIAL"; + private static final String MONITOR = "MONITOR"; + private static final String END = "END"; + private final MqPersistence persistence; + + + @Inject + public MessageQueueMonitorFSM(StateFactory stateFactory, + MqPersistence persistence) { + super(stateFactory); + this.persistence = persistence; + } + + @GraphState(name = INITIAL, next = MONITOR) + public void init() { + } + + @GraphState(name = MONITOR, resume = ResumeBehavior.RETRY) + public void monitor() throws Exception { + + for (;;) { + persistence.reapDeadMessages(); + persistence.cleanOldMessages(); + TimeUnit.SECONDS.sleep(60); + } + } + +} diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/ProcessLivenessMonitorFSM.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/ProcessLivenessMonitorFSM.java new file mode 100644 index 00000000..f6afa68f --- /dev/null +++ 
b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/ProcessLivenessMonitorFSM.java @@ -0,0 +1,55 @@ +package nu.marginalia.control.fsm.monitor; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.control.model.ProcessHeartbeat; +import nu.marginalia.control.svc.HeartbeatService; +import nu.marginalia.control.svc.ProcessService; +import nu.marginalia.mqsm.StateFactory; +import nu.marginalia.mqsm.graph.AbstractStateGraph; +import nu.marginalia.mqsm.graph.GraphState; +import nu.marginalia.mqsm.graph.ResumeBehavior; + +import java.util.concurrent.TimeUnit; + +@Singleton +public class ProcessLivenessMonitorFSM extends AbstractStateGraph { + + // STATES + + private static final String INITIAL = "INITIAL"; + private static final String MONITOR = "MONITOR"; + private static final String END = "END"; + private final ProcessService processService; + private final HeartbeatService heartbeatService; + + + @Inject + public ProcessLivenessMonitorFSM(StateFactory stateFactory, + ProcessService processService, + HeartbeatService heartbeatService) { + super(stateFactory); + this.processService = processService; + this.heartbeatService = heartbeatService; + } + + @GraphState(name = INITIAL, next = MONITOR) + public void init() { + } + + @GraphState(name = MONITOR, resume = ResumeBehavior.RETRY) + public void monitor() throws Exception { + + for (;;) { + var processHeartbeats = heartbeatService.getProcessHeartbeats(); + + processHeartbeats.stream() + .filter(ProcessHeartbeat::isRunning) + .filter(p -> !processService.isRunning(p.getProcessId())) + .forEach(heartbeatService::flagProcessAsStopped); + + TimeUnit.SECONDS.sleep(60); + } + } + +} diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ReconvertAndLoadProcess.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/task/ReconvertAndLoadFSM.java similarity index 90% rename 
from code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ReconvertAndLoadProcess.java rename to code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/task/ReconvertAndLoadFSM.java index 1c1439f9..19881851 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/ReconvertAndLoadProcess.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/task/ReconvertAndLoadFSM.java @@ -1,4 +1,4 @@ -package nu.marginalia.control.process; +package nu.marginalia.control.fsm.task; import com.google.gson.Gson; import com.google.inject.Inject; @@ -15,7 +15,6 @@ import nu.marginalia.db.storage.model.FileStorageBaseType; import nu.marginalia.db.storage.model.FileStorageId; import nu.marginalia.db.storage.model.FileStorageType; import nu.marginalia.index.client.IndexClient; -import nu.marginalia.index.client.IndexMqEndpoints; import nu.marginalia.mq.MqMessage; import nu.marginalia.mq.MqMessageState; import nu.marginalia.mq.outbox.MqOutbox; @@ -24,17 +23,12 @@ import nu.marginalia.mqsm.graph.AbstractStateGraph; import nu.marginalia.mqsm.graph.GraphState; import nu.marginalia.mqsm.graph.ResumeBehavior; import nu.marginalia.search.client.SearchClient; -import nu.marginalia.search.client.SearchMqEndpoints; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.StandardCopyOption; -import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; @Singleton -public class ReconvertAndLoadProcess extends AbstractStateGraph { +public class ReconvertAndLoadFSM extends AbstractStateGraph { // STATES @@ -66,13 +60,13 @@ public class ReconvertAndLoadProcess extends AbstractStateGraph { }; @Inject - public ReconvertAndLoadProcess(StateFactory stateFactory, - ProcessService processService, - IndexClient indexClient, - ProcessOutboxFactory processOutboxFactory, - SearchClient searchClient, - 
FileStorageService storageService, - Gson gson + public ReconvertAndLoadFSM(StateFactory stateFactory, + ProcessService processService, + IndexClient indexClient, + ProcessOutboxFactory processOutboxFactory, + SearchClient searchClient, + FileStorageService storageService, + Gson gson ) { super(stateFactory); diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/RepartitionReindexProcess.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/task/RepartitionReindexFSM.java similarity index 90% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/RepartitionReindexProcess.java rename to code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/task/RepartitionReindexFSM.java index c668d230..ed3aad0a 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/RepartitionReindexProcess.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/task/RepartitionReindexFSM.java @@ -1,4 +1,4 @@ -package nu.marginalia.control.process; +package nu.marginalia.control.fsm.task; import com.google.inject.Inject; import com.google.inject.Singleton; @@ -12,7 +12,7 @@ import nu.marginalia.mqsm.graph.GraphState; import nu.marginalia.mqsm.graph.ResumeBehavior; @Singleton -public class RepartitionReindexProcess extends AbstractStateGraph { +public class RepartitionReindexFSM extends AbstractStateGraph { private final MqOutbox indexOutbox; @@ -27,8 +27,8 @@ public class RepartitionReindexProcess extends AbstractStateGraph { @Inject - public RepartitionReindexProcess(StateFactory stateFactory, - IndexClient indexClient) { + public RepartitionReindexFSM(StateFactory stateFactory, + IndexClient indexClient) { super(stateFactory); indexOutbox = indexClient.outbox(); diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ControlProcess.java 
b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ControlProcess.java index 6cdc219a..a09ee9e9 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ControlProcess.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ControlProcess.java @@ -4,9 +4,13 @@ public enum ControlProcess { REPARTITION_REINDEX, RECONVERT_LOAD, CONVERTER_MONITOR, - LOADER_MONITOR + LOADER_MONITOR, + MESSAGE_QUEUE_MONITOR, + PROCESS_LIVENESS_MONITOR, + FILE_STORAGE_MONITOR ; + public String id() { return "fsm:" + name().toLowerCase(); } diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java index 4fbdcde9..e92a2a1a 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java @@ -1,5 +1,7 @@ package nu.marginalia.control.model; +import nu.marginalia.control.svc.ProcessService; + public record ProcessHeartbeat( String processId, String processBase, @@ -23,6 +25,9 @@ public record ProcessHeartbeat( public boolean isStopped() { return "STOPPED".equals(status); } + public boolean isRunning() { + return "RUNNING".equals(status); + } public String progressStyle() { if ("RUNNING".equals(status) && progress != null) { return """ @@ -31,4 +36,13 @@ public record ProcessHeartbeat( } return ""; } + + public ProcessService.ProcessId getProcessId() { + return switch (processBase) { + case "converter" -> ProcessService.ProcessId.CONVERTER; + case "crawler" -> ProcessService.ProcessId.CRAWLER; + case "loader" -> ProcessService.ProcessId.LOADER; + default -> throw new RuntimeException("Unknown process base: " + processBase); + }; + } } diff --git 
a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/LoaderMonitorProcess.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/LoaderMonitorProcess.java deleted file mode 100644 index 813c7da7..00000000 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/process/LoaderMonitorProcess.java +++ /dev/null @@ -1,73 +0,0 @@ -package nu.marginalia.control.process; - -import com.google.inject.Inject; -import com.google.inject.Singleton; -import nu.marginalia.control.svc.ProcessService; -import nu.marginalia.converting.mqapi.ConverterInboxNames; -import nu.marginalia.mq.persistence.MqPersistence; -import nu.marginalia.mqsm.StateFactory; -import nu.marginalia.mqsm.graph.AbstractStateGraph; -import nu.marginalia.mqsm.graph.GraphState; -import nu.marginalia.mqsm.graph.ResumeBehavior; - -import java.sql.SQLException; -import java.util.concurrent.TimeUnit; - -@Singleton -public class LoaderMonitorProcess extends AbstractStateGraph { - - private final MqPersistence persistence; - private final ProcessService processService; - public static final String INITIAL = "INITIAL"; - public static final String CHECK = "CHECK"; - public static final String RUN = "RUN"; - public static final String END = "END"; - - public static final int MAX_ATTEMPTS = 1; - public static final String inboxName = ConverterInboxNames.LOADER_INBOX; - public static final ProcessService.ProcessId processId = ProcessService.ProcessId.LOADER; - - @Inject - public LoaderMonitorProcess(StateFactory stateFactory, - MqPersistence persistence, - ProcessService processService) { - super(stateFactory); - this.persistence = persistence; - this.processService = processService; - } - - @GraphState(name = INITIAL, next = CHECK) - public void init() { - - } - - @GraphState(name = CHECK, resume = ResumeBehavior.RETRY) - public void check() throws SQLException, InterruptedException { - - for (;;) { - var messages = 
persistence.eavesdrop(inboxName, 1); - - if (messages.isEmpty() && !processService.isRunning(processId)) { - TimeUnit.SECONDS.sleep(5); - } else { - transition(RUN, 0); - } - } - } - - @GraphState(name = RUN) - public void run(Integer attempts) throws Exception { - try { - processService.trigger(processId); - } - catch (Exception e) { - if (attempts < MAX_ATTEMPTS) { - transition(RUN, attempts + 1); - } - else throw e; - } - - transition(CHECK); - } - -} diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/HeartbeatService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/HeartbeatService.java index 5f8b28f3..8a8a693e 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/HeartbeatService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/HeartbeatService.java @@ -5,6 +5,7 @@ import com.google.inject.Singleton; import com.zaxxer.hikari.HikariDataSource; import nu.marginalia.control.model.ProcessHeartbeat; import nu.marginalia.control.model.ServiceHeartbeat; +import nu.marginalia.service.control.ServiceEventLog; import java.sql.SQLException; import java.util.ArrayList; @@ -13,10 +14,13 @@ import java.util.List; @Singleton public class HeartbeatService { private final HikariDataSource dataSource; + private final ServiceEventLog eventLogService; @Inject - public HeartbeatService(HikariDataSource dataSource) { + public HeartbeatService(HikariDataSource dataSource, + ServiceEventLog eventLogService) { this.dataSource = dataSource; + this.eventLogService = eventLogService; } public List getServiceHeartbeats() { @@ -77,4 +81,23 @@ public class HeartbeatService { return heartbeats; } + public void flagProcessAsStopped(ProcessHeartbeat processHeartbeat) { + eventLogService.logEvent("PROCESS-MISSING", "Marking stale process heartbeat " + + processHeartbeat.processId() + " / " + processHeartbeat.uuidFull() + " as stopped"); 
+ + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + UPDATE PROCESS_HEARTBEAT + SET STATUS = 'STOPPED' + WHERE INSTANCE = ? + """)) { + + stmt.setString(1, processHeartbeat.uuidFull()); + stmt.executeUpdate(); + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + } + } diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueMonitorService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueMonitorService.java deleted file mode 100644 index 4ba2585c..00000000 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueMonitorService.java +++ /dev/null @@ -1,62 +0,0 @@ -package nu.marginalia.control.svc; - -import nu.marginalia.mq.persistence.MqPersistence; -import nu.marginalia.service.control.ServiceEventLog; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import javax.inject.Inject; -import javax.inject.Singleton; -import java.sql.SQLException; -import java.util.concurrent.TimeUnit; - -@Singleton -public class MessageQueueMonitorService { - private final Logger logger = LoggerFactory.getLogger(MessageQueueMonitorService.class); - private final MqPersistence persistence; - private final ServiceEventLog eventLog; - - @Inject - public MessageQueueMonitorService(ServiceEventLog eventLog, MqPersistence persistence) { - this.eventLog = eventLog; - this.persistence = persistence; - - Thread reaperThread = new Thread(this::run, "message-queue-reaper"); - reaperThread.setDaemon(true); - reaperThread.start(); - } - - - private void run() { - - for (;;) { - try { - TimeUnit.MINUTES.sleep(10); - - reapMessages(); - } - catch (InterruptedException ex) { - logger.info("Message queue reaper interrupted"); - break; - } - catch (Exception ex) { - logger.error("Message queue reaper failed", ex); - } - } - } - - private void reapMessages() throws SQLException { - int outcome = 
persistence.reapDeadMessages(); - if (outcome > 0) { - eventLog.logEvent("MESSAGE-QUEUE-REAPED", Integer.toString(outcome)); - logger.info("Reaped {} dead messages from message queue", outcome); - } - - outcome = persistence.cleanOldMessages(); - if (outcome > 0) { - eventLog.logEvent("MESSAGE-QUEUE-CLEANED", Integer.toString(outcome)); - logger.info("Cleaned {} stale messages from message queue", outcome); - } - } - -} From bca4bbb6c8fb3a84f633c9cc96b8a725f865bf49 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 17 Jul 2023 13:57:32 +0200 Subject: [PATCH 058/157] (*) Refactor MQ and MQSM --- code/api/process-mqapi/build.gradle | 30 ++++++ .../marginalia/mqapi/ProcessInboxNames.java} | 5 +- .../mqapi/converting}/ConvertRequest.java | 2 +- .../mqapi/crawling}/CrawlRequest.java | 4 +- .../mqapi/loading}/LoadRequest.java | 3 +- code/api/readme.md | 15 ++- .../db/storage/model/FileStorageBaseId.java | 7 +- .../db/storage/model/FileStorageId.java | 4 + code/common/message-queue/readme.md | 77 ++++++++++++++- .../java/nu/marginalia/mqsm/StateFactory.java | 10 ++ .../java/nu/marginalia/mqsm/StateMachine.java | 10 +- .../marginalia/mqsm/graph/ResumeBehavior.java | 4 +- .../marginalia/mqsm/StateMachineNullTest.java | 98 +++++++++++++++++++ .../processes/converting-process/build.gradle | 2 + .../marginalia/converting/ConverterMain.java | 6 +- .../converting/ConverterModule.java | 13 --- code/processes/loading-process/build.gradle | 2 +- .../nu/marginalia/loading/LoaderMain.java | 9 +- .../control-service/build.gradle | 7 +- .../nu/marginalia/control/ControlService.java | 37 +++---- .../nu/marginalia/control/HtmlRedirect.java | 19 ++++ .../fsm/monitor/ConverterMonitorFSM.java | 4 +- .../control/fsm/monitor/LoaderMonitorFSM.java | 4 +- .../control/fsm/task/ReconvertAndLoadFSM.java | 4 +- .../model/FileStorageBaseWithStorage.java | 5 +- .../control/model/FileStorageWithActions.java | 16 +++ .../svc/ControlFileStorageService.java | 26 ++++- 
.../control/svc/ProcessOutboxFactory.java | 6 +- .../resources/templates/control/storage.hdb | 24 ++++- settings.gradle | 1 + 30 files changed, 378 insertions(+), 76 deletions(-) create mode 100644 code/api/process-mqapi/build.gradle rename code/{process-models/converting-model/src/main/java/nu/marginalia/converting/mqapi/ConverterInboxNames.java => api/process-mqapi/src/main/java/nu/marginalia/mqapi/ProcessInboxNames.java} (50%) rename code/{process-models/converting-model/src/main/java/nu/marginalia/converting/mqapi => api/process-mqapi/src/main/java/nu/marginalia/mqapi/converting}/ConvertRequest.java (85%) rename code/{process-models/crawling-model/src/main/java/nu/marginalia/crawling/mqapi => api/process-mqapi/src/main/java/nu/marginalia/mqapi/crawling}/CrawlRequest.java (66%) rename code/{process-models/converting-model/src/main/java/nu/marginalia/converting/mqapi => api/process-mqapi/src/main/java/nu/marginalia/mqapi/loading}/LoadRequest.java (81%) create mode 100644 code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineNullTest.java create mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/HtmlRedirect.java create mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithActions.java diff --git a/code/api/process-mqapi/build.gradle b/code/api/process-mqapi/build.gradle new file mode 100644 index 00000000..0b360576 --- /dev/null +++ b/code/api/process-mqapi/build.gradle @@ -0,0 +1,30 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "5.3.3.3" + + id 'jvm-test-suite' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +dependencies { + implementation project(':code:common:db') + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} + +test { + useJUnitPlatform() +} + +task fastTests(type: Test) { + useJUnitPlatform { + excludeTags "slow" + } +} \ 
No newline at end of file diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/mqapi/ConverterInboxNames.java b/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/ProcessInboxNames.java similarity index 50% rename from code/process-models/converting-model/src/main/java/nu/marginalia/converting/mqapi/ConverterInboxNames.java rename to code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/ProcessInboxNames.java index 5ce3ebff..9ca91fe6 100644 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/mqapi/ConverterInboxNames.java +++ b/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/ProcessInboxNames.java @@ -1,6 +1,7 @@ -package nu.marginalia.converting.mqapi; +package nu.marginalia.mqapi; -public class ConverterInboxNames { +public class ProcessInboxNames { public static final String CONVERTER_INBOX = "converter"; public static final String LOADER_INBOX = "loader"; + public static final String CRAWLER_INBOX = "crawler"; } diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/mqapi/ConvertRequest.java b/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/converting/ConvertRequest.java similarity index 85% rename from code/process-models/converting-model/src/main/java/nu/marginalia/converting/mqapi/ConvertRequest.java rename to code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/converting/ConvertRequest.java index 881d75a2..64091146 100644 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/mqapi/ConvertRequest.java +++ b/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/converting/ConvertRequest.java @@ -1,4 +1,4 @@ -package nu.marginalia.converting.mqapi; +package nu.marginalia.mqapi.converting; import lombok.AllArgsConstructor; import nu.marginalia.db.storage.model.FileStorageId; diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/mqapi/CrawlRequest.java 
b/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/crawling/CrawlRequest.java similarity index 66% rename from code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/mqapi/CrawlRequest.java rename to code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/crawling/CrawlRequest.java index 53f387d5..5aaecc5d 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/mqapi/CrawlRequest.java +++ b/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/crawling/CrawlRequest.java @@ -1,8 +1,10 @@ -package nu.marginalia.crawling.mqapi; +package nu.marginalia.mqapi.crawling; +import lombok.AllArgsConstructor; import nu.marginalia.db.storage.model.FileStorageId; /** A request to start a crawl */ +@AllArgsConstructor public class CrawlRequest { FileStorageId specStorage; FileStorageId crawlStorage; diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/mqapi/LoadRequest.java b/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/loading/LoadRequest.java similarity index 81% rename from code/process-models/converting-model/src/main/java/nu/marginalia/converting/mqapi/LoadRequest.java rename to code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/loading/LoadRequest.java index 186f0f7e..eff92c9c 100644 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/mqapi/LoadRequest.java +++ b/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/loading/LoadRequest.java @@ -1,4 +1,4 @@ -package nu.marginalia.converting.mqapi; +package nu.marginalia.mqapi.loading; import lombok.AllArgsConstructor; import nu.marginalia.db.storage.model.FileStorageId; @@ -6,5 +6,4 @@ import nu.marginalia.db.storage.model.FileStorageId; @AllArgsConstructor public class LoadRequest { public FileStorageId processedDataStorage; - } diff --git a/code/api/readme.md b/code/api/readme.md index 4b19381f..f98f326a 100644 --- a/code/api/readme.md +++ b/code/api/readme.md @@ -1,4 +1,10 @@ -# 
Core Service Clients +# Clients + +## Core Services + +* [assistant-api](assistant-api/) +* [search-api](search-api/) +* [index-api](index-api/) These are clients for the [core services](../services-core/), along with what models are necessary for speaking to them. They each implement the abstract client classes from @@ -8,3 +14,10 @@ All that is necessary is to `@Inject` them into the constructor and then requests can be sent. **Note:** If you are looking for the public API, it's handled by the api service in [services-satellite/api-service](../services-satellite/api-service). + +## MQ-API Process API + +[process-mqapi](process-mqapi/) defines requests and inboxes for the message queue based API used +for interacting with processes. + +See [common/message-queue](../common/message-queue) and [services-satellite/control-service](../services-satellite/control-service). \ No newline at end of file diff --git a/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageBaseId.java b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageBaseId.java index e4dbaf68..1c7ededd 100644 --- a/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageBaseId.java +++ b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageBaseId.java @@ -1,3 +1,8 @@ package nu.marginalia.db.storage.model; -public record FileStorageBaseId(long id) {} +public record FileStorageBaseId(long id) { + + public String toString() { + return Long.toString(id); + } +} diff --git a/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageId.java b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageId.java index 43e5503f..3d6331e3 100644 --- a/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageId.java +++ b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageId.java @@ -4,4 +4,8 @@ public record FileStorageId(long id) { public static FileStorageId of(int storageId) { return new 
FileStorageId(storageId); } + + public String toString() { + return Long.toString(id); + } } diff --git a/code/common/message-queue/readme.md b/code/common/message-queue/readme.md index 20e59642..cbb5082c 100644 --- a/code/common/message-queue/readme.md +++ b/code/common/message-queue/readme.md @@ -5,4 +5,79 @@ as well as a finite state machine library backed by the message queue that enables long-running tasks that outlive the execution lifespan of the involved processes. -![Message States](msgstate.svg) \ No newline at end of file +![Message States](msgstate.svg) + +The message queue is interacted with via the Inbox and Outbox classes. + +There are three types of inboxes: + +Name|Description +---|--- +MqSingleShotInbox|A single message is received and then the inbox is closed. +MqAsynchronousInbox|Messages are received asynchronously and can be processed in parallel. +MqSynchronousInbox|Messages are received synchronously and will be processed in order; message processing can be aborted. + +A single outbox implementation exists, the `MqOutbox`, which implements multiple message sending strategies, +including blocking and asynchronous paradigms. Lower level access to the message queue itself is provided by the `MqPersistence` class. + +The inbox implementations as well as the outbox can be constructed via the `MessageQueueFactory` class. + +## Message Queue State Machine (MQSM) + +The MQSM is a finite state machine that is backed by the message queue. The machine itself +is defined through a class that extends `AbstractStateGraph`, with the states and their +transitions declared by methods annotated with `@GraphState`.
+ +Example: + +```java +class ExampleStateMachine extends AbstractStateGraph { + + @GraphState(name = "INITIAL", next="GREET") + public String initial() { + return "World"; // passed to the next state + } + + @GraphState(name = "GREET", next="COUNT-TO-FIVE") + public void greet(String name) { + System.out.println("Hello " + name); + } + + @GraphState(name = "COUNT-TO-FIVE", next="END") + public void countToFive(Integer value) { + // value is passed from the previous state, since greet didn't pass a value, + // null will be the default. + + if (null == value) { + // jumps to the current state with a value of 0 + transition("COUNT-TO-FIVE", 0); + } + + + System.out.println(++value); + if (value < 5) { + // Loops the current state until value = 5 + transition("COUNT-TO-FIVE", value); + } + + if (value > 5) { + // demonstrates an error condition + error("Illegal value"); + } + + // Default transition is to END + } + + @GraphState(name="END") + public void end() { + System.out.println("Done"); + } +} +``` + +Each method should ideally be idempotent, or at least be able to handle being called multiple times. +It cannot be assumed that the states are invoked within the same process, or even on the same machine, +on the same day, etc. + +The usual considerations for writing deterministic Java code should be followed unless deviating is unavoidable; +all state must be local, don't iterate over hash maps, etc.
\ No newline at end of file diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateFactory.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateFactory.java index 6a143157..cd7824a7 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateFactory.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateFactory.java @@ -28,6 +28,11 @@ public class StateFactory { @Override public StateTransition next(String message) { + + if (message.equals("")) { + return logic.apply(null); + } + return logic.apply(gson.fromJson(message, param)); } @@ -72,6 +77,11 @@ public class StateFactory { } public StateTransition transition(String state, Object message) { + + if (null == message) { + return StateTransition.to(state); + } + return StateTransition.to(state, gson.toJson(message)); } diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java index e3937894..36ed81cf 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java @@ -186,8 +186,16 @@ public class StateMachine { if (resumeState.resumeBehavior().equals(ResumeBehavior.ERROR)) { // The message is acknowledged, but the state does not support resuming smOutbox.notify(expectedMessage.id, "ERROR", "Illegal resumption from ACK'ed state " + message.function()); - } else { + } + else if (resumeState.resumeBehavior().equals(ResumeBehavior.RESTART)) { + this.state = resumeState; + // The message is already acknowledged, we flag it as dead and then restart by sending an INITIAL message with an empty payload + smOutbox.flagAsDead(message.msgId()); + expectedMessage = ExpectedMessage.responseTo(message); + smOutbox.notify(message.msgId(), "INITIAL", ""); + } + else { this.state = resumeState; // The message is already acknowledged, we flag it as dead and then send an
identical message diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/ResumeBehavior.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/ResumeBehavior.java index 2e275cb5..33dacb5d 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/ResumeBehavior.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/ResumeBehavior.java @@ -4,5 +4,7 @@ public enum ResumeBehavior { /** Retry the state on resume */ RETRY, /** Jump to ERROR on resume if the message has been acknowledged */ - ERROR + ERROR, + /** Jump to INITIAL on resume */ + RESTART } diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineNullTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineNullTest.java new file mode 100644 index 00000000..301b75a1 --- /dev/null +++ b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineNullTest.java @@ -0,0 +1,98 @@ +package nu.marginalia.mqsm; + +import com.google.gson.GsonBuilder; +import com.zaxxer.hikari.HikariConfig; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.mq.MessageQueueFactory; +import nu.marginalia.mq.MqTestUtil; +import nu.marginalia.mq.persistence.MqPersistence; +import nu.marginalia.mqsm.graph.AbstractStateGraph; +import nu.marginalia.mqsm.graph.GraphState; +import org.junit.jupiter.api.*; +import org.junit.jupiter.api.parallel.Execution; +import org.testcontainers.containers.MariaDBContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +import java.util.UUID; +import java.util.concurrent.TimeUnit; + +import static org.junit.Assert.fail; +import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD; + +@Tag("slow") +@Testcontainers +@Execution(SAME_THREAD) +public class StateMachineNullTest { + @Container + static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") + 
.withDatabaseName("WMSA_prod") + .withUsername("wmsa") + .withPassword("wmsa") + .withInitScript("sql/current/12-message-queue.sql") + .withNetworkAliases("mariadb"); + + static HikariDataSource dataSource; + static MqPersistence persistence; + static MessageQueueFactory messageQueueFactory; + private String inboxId; + + @BeforeEach + public void setUp() { + inboxId = UUID.randomUUID().toString(); + } + @BeforeAll + public static void setUpAll() { + HikariConfig config = new HikariConfig(); + config.setJdbcUrl(mariaDBContainer.getJdbcUrl()); + config.setUsername("wmsa"); + config.setPassword("wmsa"); + + dataSource = new HikariDataSource(config); + persistence = new MqPersistence(dataSource); + messageQueueFactory = new MessageQueueFactory(persistence); + } + + @AfterAll + public static void tearDownAll() { + dataSource.close(); + } + + public static class TestGraph extends AbstractStateGraph { + public TestGraph(StateFactory stateFactory) { + super(stateFactory); + } + + @GraphState(name = "INITIAL", next = "GREET") + public void initial() {} + + @GraphState(name = "GREET", next = "END") + public void greet(String message) { + if (null == message) { + System.out.println("Hello, null!"); + return; + } + Assertions.fail("Should not be called"); + } + + } + + @Test + public void testStateGraphNullSerialization() throws Exception { + var stateFactory = new StateFactory(new GsonBuilder().create()); + var graph = new TestGraph(stateFactory); + + + var sm = new StateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), graph); + sm.registerStates(graph); + + sm.init(); + + sm.join(2, TimeUnit.SECONDS); + sm.stop(); + + MqTestUtil.getMessages(dataSource, inboxId).forEach(System.out::println); + + } + +} diff --git a/code/processes/converting-process/build.gradle b/code/processes/converting-process/build.gradle index 7a9121d8..b85a829b 100644 --- a/code/processes/converting-process/build.gradle +++ b/code/processes/converting-process/build.gradle @@ -26,7 +26,9 @@ 
dependencies { implementation project(':third-party:porterstemmer') implementation project(':third-party:count-min-sketch') + implementation project(':code:api:index-api') + implementation project(':code:api:process-mqapi') implementation project(':code:common:model') implementation project(':code:common:db') diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java index a42f5b67..36e5b558 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java @@ -29,7 +29,7 @@ import java.util.UUID; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; -import static nu.marginalia.converting.mqapi.ConverterInboxNames.CONVERTER_INBOX; +import static nu.marginalia.mqapi.ProcessInboxNames.CONVERTER_INBOX; public class ConverterMain { @@ -176,10 +176,10 @@ public class ConverterMain { var inbox = messageQueueFactory.createSingleShotInbox(CONVERTER_INBOX, UUID.randomUUID()); - var msgOpt = getMessage(inbox, nu.marginalia.converting.mqapi.ConvertRequest.class.getSimpleName()); + var msgOpt = getMessage(inbox, nu.marginalia.mqapi.converting.ConvertRequest.class.getSimpleName()); var msg = msgOpt.orElseThrow(() -> new RuntimeException("No message received")); - var request = gson.fromJson(msg.payload(), nu.marginalia.converting.mqapi.ConvertRequest.class); + var request = gson.fromJson(msg.payload(), nu.marginalia.mqapi.converting.ConvertRequest.class); var crawlData = fileStorageService.getStorage(request.crawlStorage); var processData = fileStorageService.getStorage(request.processedDataStorage); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterModule.java 
b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterModule.java index 121159ed..90d4e3ad 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterModule.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterModule.java @@ -2,26 +2,13 @@ package nu.marginalia.converting; import com.google.gson.Gson; import com.google.inject.AbstractModule; -import com.google.inject.Provides; -import com.google.inject.Singleton; import com.google.inject.name.Names; -import lombok.SneakyThrows; import nu.marginalia.LanguageModels; import nu.marginalia.ProcessConfiguration; import nu.marginalia.WmsaHome; -import nu.marginalia.converting.mqapi.ConvertRequest; -import nu.marginalia.db.storage.FileStorageService; -import nu.marginalia.mq.MessageQueueFactory; -import nu.marginalia.mq.persistence.MqPersistence; -import plan.CrawlPlan; import nu.marginalia.model.gson.GsonFactory; -import plan.CrawlPlanLoader; -import java.io.IOException; -import java.nio.file.Path; -import java.sql.SQLException; import java.util.UUID; -import java.util.concurrent.TimeUnit; public class ConverterModule extends AbstractModule { diff --git a/code/processes/loading-process/build.gradle b/code/processes/loading-process/build.gradle index caba9812..d204247d 100644 --- a/code/processes/loading-process/build.gradle +++ b/code/processes/loading-process/build.gradle @@ -19,7 +19,7 @@ tasks.distZip.enabled = false dependencies { implementation project(':code:common:process') - + implementation project(':code:api:process-mqapi') implementation project(':code:api:index-api') implementation project(':code:common:model') implementation project(':code:common:db') diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java index 08649808..f6ccc79d 100644 --- 
a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java @@ -7,7 +7,6 @@ import com.google.inject.Injector; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; import nu.marginalia.db.storage.FileStorageService; -import nu.marginalia.db.storage.model.FileStorage; import nu.marginalia.mq.MessageQueueFactory; import nu.marginalia.mq.MqMessage; import nu.marginalia.mq.inbox.MqInboxResponse; @@ -30,7 +29,7 @@ import java.util.UUID; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.TimeUnit; -import static nu.marginalia.converting.mqapi.ConverterInboxNames.LOADER_INBOX; +import static nu.marginalia.mqapi.ProcessInboxNames.LOADER_INBOX; public class LoaderMain { private static final Logger logger = LoggerFactory.getLogger(LoaderMain.class); @@ -215,16 +214,16 @@ public class LoaderMain { var inbox = messageQueueFactory.createSingleShotInbox(LOADER_INBOX, UUID.randomUUID()); - var msgOpt = getMessage(inbox, nu.marginalia.converting.mqapi.LoadRequest.class.getSimpleName()); + var msgOpt = getMessage(inbox, nu.marginalia.mqapi.loading.LoadRequest.class.getSimpleName()); if (msgOpt.isEmpty()) throw new RuntimeException("No instruction received in inbox"); var msg = msgOpt.get(); - if (!nu.marginalia.converting.mqapi.LoadRequest.class.getSimpleName().equals(msg.function())) { + if (!nu.marginalia.mqapi.loading.LoadRequest.class.getSimpleName().equals(msg.function())) { throw new RuntimeException("Unexpected message in inbox: " + msg); } - var request = gson.fromJson(msg.payload(), nu.marginalia.converting.mqapi.LoadRequest.class); + var request = gson.fromJson(msg.payload(), nu.marginalia.mqapi.loading.LoadRequest.class); var processData = fileStorageService.getStorage(request.processedDataStorage); diff --git a/code/services-satellite/control-service/build.gradle 
b/code/services-satellite/control-service/build.gradle index 72c0552e..90b832da 100644 --- a/code/services-satellite/control-service/build.gradle +++ b/code/services-satellite/control-service/build.gradle @@ -22,6 +22,8 @@ tasks.distZip.enabled = false apply from: "$rootProject.projectDir/docker-service.gradle" dependencies { + implementation libs.bundles.gson + implementation project(':code:common:db') implementation project(':code:common:model') implementation project(':code:common:service') @@ -30,10 +32,9 @@ dependencies { implementation project(':code:common:message-queue') implementation project(':code:common:service-discovery') implementation project(':code:common:service-client') - implementation project(':code:process-models:converting-model') - implementation project(':code:process-models:crawling-model') implementation project(':code:api:search-api') implementation project(':code:api:index-api') + implementation project(':code:api:process-mqapi') implementation libs.lombok @@ -43,11 +44,11 @@ dependencies { implementation libs.prometheus implementation libs.notnull implementation libs.guice + implementation libs.trove implementation libs.spark implementation libs.fastutil implementation libs.commons.io - implementation libs.bundles.gson implementation libs.bundles.mariadb testImplementation libs.bundles.slf4j.test diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java index 34e600f5..a3db382b 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java @@ -77,38 +77,31 @@ public class ControlService extends Service { (req, rsp) -> Map.of("storage", controlFileStorageService.getStorageList()), (map) -> storageRenderer.render((Map) map)); + final HtmlRedirect 
redirectToServices = new HtmlRedirect("/services"); + final HtmlRedirect redirectToProcesses = new HtmlRedirect("/processes"); + final HtmlRedirect redirectToStorage = new HtmlRedirect("/storage"); + Spark.post("/public/fsms/:fsm/start", (req, rsp) -> { controlFSMs.start(ControlProcess.valueOf(req.params("fsm").toUpperCase())); - return """ - - - """; - }); + return ""; + }, redirectToProcesses); + Spark.post("/public/fsms/:fsm/stop", (req, rsp) -> { controlFSMs.stop(ControlProcess.valueOf(req.params("fsm").toUpperCase())); - return """ - - - """; - }); + return ""; + }, redirectToProcesses); // TODO: This should be a POST Spark.get("/public/repartition", (req, rsp) -> { controlFSMs.start(ControlProcess.REPARTITION_REINDEX); - return """ - - - """; - }); + return ""; + } , redirectToProcesses); - // TODO: This should be a POST - Spark.get("/public/reconvert/:fid", (req, rsp) -> { + Spark.post("/public/storage/:fid/process", (req, rsp) -> { controlFSMs.start(ControlProcess.RECONVERT_LOAD, FileStorageId.of(Integer.parseInt(req.params("fid")))); - return """ - - - """; - }); + return ""; + }, redirectToProcesses); + Spark.post("/public/storage/:fid/delete", controlFileStorageService::flagFileForDeletionRequest, redirectToStorage); Spark.get("/public/:resource", this::serveStatic); diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/HtmlRedirect.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/HtmlRedirect.java new file mode 100644 index 00000000..fd49bd6d --- /dev/null +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/HtmlRedirect.java @@ -0,0 +1,19 @@ +package nu.marginalia.control; + +import spark.ResponseTransformer; + +public class HtmlRedirect implements ResponseTransformer { + private final String html; + + public HtmlRedirect(String destination) { + this.html = """ + + + """.formatted(destination); + } + + @Override + public String render(Object any) throws 
Exception { + return html; + } +} diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/ConverterMonitorFSM.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/ConverterMonitorFSM.java index d5dd3908..674d064a 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/ConverterMonitorFSM.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/ConverterMonitorFSM.java @@ -3,7 +3,7 @@ package nu.marginalia.control.fsm.monitor; import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.control.svc.ProcessService; -import nu.marginalia.converting.mqapi.ConverterInboxNames; +import nu.marginalia.mqapi.ProcessInboxNames; import nu.marginalia.mq.persistence.MqPersistence; import nu.marginalia.mqsm.StateFactory; @@ -15,7 +15,7 @@ public class ConverterMonitorFSM extends AbstractProcessSpawnerFSM { public ConverterMonitorFSM(StateFactory stateFactory, MqPersistence persistence, ProcessService processService) { - super(stateFactory, persistence, processService, ConverterInboxNames.CONVERTER_INBOX, ProcessService.ProcessId.CONVERTER); + super(stateFactory, persistence, processService, ProcessInboxNames.CONVERTER_INBOX, ProcessService.ProcessId.CONVERTER); } diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/LoaderMonitorFSM.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/LoaderMonitorFSM.java index ff81433e..69015f65 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/LoaderMonitorFSM.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/LoaderMonitorFSM.java @@ -3,7 +3,7 @@ package nu.marginalia.control.fsm.monitor; import com.google.inject.Inject; import com.google.inject.Singleton; 
import nu.marginalia.control.svc.ProcessService; -import nu.marginalia.converting.mqapi.ConverterInboxNames; +import nu.marginalia.mqapi.ProcessInboxNames; import nu.marginalia.mq.persistence.MqPersistence; import nu.marginalia.mqsm.StateFactory; @@ -17,7 +17,7 @@ public class LoaderMonitorFSM extends AbstractProcessSpawnerFSM { ProcessService processService) { super(stateFactory, persistence, processService, - ConverterInboxNames.LOADER_INBOX, + ProcessInboxNames.LOADER_INBOX, ProcessService.ProcessId.LOADER); } diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/task/ReconvertAndLoadFSM.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/task/ReconvertAndLoadFSM.java index 19881851..781882a5 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/task/ReconvertAndLoadFSM.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/task/ReconvertAndLoadFSM.java @@ -8,8 +8,8 @@ import lombok.NoArgsConstructor; import lombok.With; import nu.marginalia.control.svc.ProcessOutboxFactory; import nu.marginalia.control.svc.ProcessService; -import nu.marginalia.converting.mqapi.ConvertRequest; -import nu.marginalia.converting.mqapi.LoadRequest; +import nu.marginalia.mqapi.converting.ConvertRequest; +import nu.marginalia.mqapi.loading.LoadRequest; import nu.marginalia.db.storage.FileStorageService; import nu.marginalia.db.storage.model.FileStorageBaseType; import nu.marginalia.db.storage.model.FileStorageId; diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageBaseWithStorage.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageBaseWithStorage.java index 94a39e2b..7411e3c7 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageBaseWithStorage.java +++ 
b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageBaseWithStorage.java @@ -1,9 +1,10 @@ package nu.marginalia.control.model; -import nu.marginalia.db.storage.model.FileStorage; import nu.marginalia.db.storage.model.FileStorageBase; import java.util.List; -public record FileStorageBaseWithStorage(FileStorageBase base, List storage) { +public record FileStorageBaseWithStorage(FileStorageBase base, + List storage) +{ } diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithActions.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithActions.java new file mode 100644 index 00000000..927262d2 --- /dev/null +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithActions.java @@ -0,0 +1,16 @@ +package nu.marginalia.control.model; + +import nu.marginalia.db.storage.model.FileStorage; +import nu.marginalia.db.storage.model.FileStorageType; + +public record FileStorageWithActions(FileStorage storage) { + public boolean isLoadable() { + return storage.type() == FileStorageType.PROCESSED_DATA; + } + public boolean isConvertible() { + return storage.type() == FileStorageType.CRAWL_DATA; + } + public boolean isDeletable() { + return storage.type() == FileStorageType.PROCESSED_DATA; + } +} diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlFileStorageService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlFileStorageService.java index 04dc34ae..982c42e0 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlFileStorageService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlFileStorageService.java @@ -5,6 +5,7 @@ import com.google.inject.Singleton; import com.zaxxer.hikari.HikariDataSource; import 
lombok.SneakyThrows; import nu.marginalia.control.model.FileStorageBaseWithStorage; +import nu.marginalia.control.model.FileStorageWithActions; import nu.marginalia.control.model.ProcessHeartbeat; import nu.marginalia.control.model.ServiceHeartbeat; import nu.marginalia.db.storage.FileStorageService; @@ -12,6 +13,8 @@ import nu.marginalia.db.storage.model.FileStorage; import nu.marginalia.db.storage.model.FileStorageBase; import nu.marginalia.db.storage.model.FileStorageBaseId; import nu.marginalia.db.storage.model.FileStorageId; +import spark.Request; +import spark.Response; import java.sql.SQLException; import java.util.ArrayList; @@ -30,10 +33,24 @@ public class ControlFileStorageService { this.fileStorageService = fileStorageService; } + public Object flagFileForDeletionRequest(Request request, Response response) throws SQLException { + FileStorageId fid = new FileStorageId(Long.parseLong(request.params(":fid"))); + flagFileForDeletion(fid); + return ""; + } + + public void flagFileForDeletion(FileStorageId id) throws SQLException { + try (var conn = dataSource.getConnection(); + var flagStmt = conn.prepareStatement("UPDATE FILE_STORAGE SET DO_PURGE = TRUE WHERE ID = ?")) { + flagStmt.setLong(1, id.id()); + flagStmt.executeUpdate(); + } + } + @SneakyThrows public List getStorageList() { Map fileStorageBaseByBaseId = new HashMap<>(); - Map> fileStoragByBaseId = new HashMap<>(); + Map> fileStoragByBaseId = new HashMap<>(); List storageIds = new ArrayList<>(); @@ -48,12 +65,15 @@ public class ControlFileStorageService { for (var id : storageIds) { var storage = fileStorageService.getStorage(id); fileStorageBaseByBaseId.computeIfAbsent(storage.base().id(), k -> storage.base()); - fileStoragByBaseId.computeIfAbsent(storage.base().id(), k -> new ArrayList<>()).add(storage); + fileStoragByBaseId.computeIfAbsent(storage.base().id(), k -> new ArrayList<>()).add(new FileStorageWithActions(storage)); } List result = new ArrayList<>(); for (var baseId : 
fileStorageBaseByBaseId.keySet()) { - result.add(new FileStorageBaseWithStorage(fileStorageBaseByBaseId.get(baseId), fileStoragByBaseId.get(baseId))); + result.add(new FileStorageBaseWithStorage(fileStorageBaseByBaseId.get(baseId), + fileStoragByBaseId.get(baseId) + + )); } return result; diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessOutboxFactory.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessOutboxFactory.java index e1b5a3b1..4c296069 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessOutboxFactory.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessOutboxFactory.java @@ -2,7 +2,7 @@ package nu.marginalia.control.svc; import com.google.inject.Inject; import com.google.inject.Singleton; -import nu.marginalia.converting.mqapi.ConverterInboxNames; +import nu.marginalia.mqapi.ProcessInboxNames; import nu.marginalia.mq.outbox.MqOutbox; import nu.marginalia.mq.persistence.MqPersistence; import nu.marginalia.service.server.BaseServiceParams; @@ -19,9 +19,9 @@ public class ProcessOutboxFactory { } public MqOutbox createConverterOutbox() { - return new MqOutbox(persistence, ConverterInboxNames.CONVERTER_INBOX, params.configuration.serviceName(), params.configuration.instanceUuid()); + return new MqOutbox(persistence, ProcessInboxNames.CONVERTER_INBOX, params.configuration.serviceName(), params.configuration.instanceUuid()); } public MqOutbox createLoaderOutbox() { - return new MqOutbox(persistence, ConverterInboxNames.LOADER_INBOX, params.configuration.serviceName(), params.configuration.instanceUuid()); + return new MqOutbox(persistence, ProcessInboxNames.LOADER_INBOX, params.configuration.serviceName(), params.configuration.instanceUuid()); } } diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/storage.hdb 
b/code/services-satellite/control-service/src/main/resources/templates/control/storage.hdb index 72f55e29..68410646 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/storage.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/storage.hdb @@ -33,10 +33,26 @@ {{#each storage}} - - {{type}} - {{path}} - {{description}} + + {{#if isLoadable}} +
    + +
    + {{/if}} + {{#if isConvertible}} +
    + +
    + {{/if}} + {{#if isDeletable}} +
    + +
    + {{/if}} + + {{storage.type}} + {{storage.path}} + {{storage.description}} {{/each}} {{/each}} diff --git a/settings.gradle b/settings.gradle index 41e0cb53..7e6d02a0 100644 --- a/settings.gradle +++ b/settings.gradle @@ -44,6 +44,7 @@ include 'code:features-index:domain-ranking' include 'code:api:search-api' include 'code:api:index-api' include 'code:api:assistant-api' +include 'code:api:process-mqapi' include 'code:common:service-discovery' include 'code:common:service-client' From d7ab21fe3406b7d8172cbe483112c09bf4219d5e Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 17 Jul 2023 21:20:31 +0200 Subject: [PATCH 059/157] (*) Refactor Control Service and processes --- .../db/storage/model/FileStorageBaseType.java | 3 +- .../db/storage/model/FileStorageType.java | 1 + .../resources/sql/current/13-file-storage.sql | 4 +- .../java/nu/marginalia/mqsm/StateMachine.java | 24 +++++ .../nu/marginalia/process/log/WorkLog.java | 4 + .../service/module/DatabaseModule.java | 4 +- .../src/main/resources/log4j2.properties | 39 +++++++- .../forward/ForwardIndexConverterTest.java | 3 +- .../writer/IndexJournalWriterImpl.java | 3 +- .../ReverseIndexFullConverterTest.java | 3 +- .../ReverseIndexFullConverterTest2.java | 3 +- .../ReverseIndexPriorityConverterTest2.java | 3 +- .../nu/marginalia/dict/DictionaryMap.java | 10 ++ .../nu/marginalia/lexicon/KeywordLexicon.java | 43 +++++++- .../lexicon/KeywordLexiconReadOnlyView.java | 19 +++- .../journal/KeywordLexiconJournal.java | 66 ++++++++++--- .../KeywordLexiconJournalCommitQueue.java | 3 + .../journal/KeywordLexiconJournalFile.java | 1 - .../KeywordLexiconJournalFingerprint.java | 10 ++ .../journal/KeywordLexiconJournalMode.java | 6 ++ .../lexicon/KeywordLexiconTest.java | 5 +- .../src/main/java/plan/CrawlPlan.java | 2 +- .../marginalia/converting/ConverterMain.java | 8 +- .../nu/marginalia/loading/LoaderMain.java | 7 ++ .../nu/marginalia/loading/loader/Loader.java | 8 -- .../loader/LoaderIndexJournalWriter.java | 11 
++- .../java/nu/marginalia/index/IndexModule.java | 3 +- .../index/IndexServicesFactory.java | 27 ++--- .../marginalia/index/svc/IndexOpsService.java | 8 +- ...ndexQueryServiceIntegrationTestModule.java | 12 ++- .../nu/marginalia/control/ControlService.java | 91 ++++++++--------- .../marginalia/control/fsm/ControlFSMs.java | 33 ++++--- .../monitor/AbstractProcessSpawnerFSM.java | 2 +- .../control/fsm/task/ReconvertAndLoadFSM.java | 98 +++++++++---------- .../fsm/task/RepartitionReindexFSM.java | 20 ++-- .../control/model/ControlProcessState.java | 6 ++ .../control/model/FileStorageWithActions.java | 3 +- .../control/model/ProcessHeartbeat.java | 2 +- .../control/svc/ControlFsmService.java | 72 ++++++++++++++ .../control/svc/EventLogService.java | 62 ++++++++++++ .../control/svc/ProcessService.java | 11 ++- .../resources/templates/control/index.hdb | 2 +- .../templates/control/partials/nav.hdb | 6 +- .../control/partials/processes-table.hdb | 4 +- .../control/partials/services-table.hdb | 2 +- .../resources/templates/control/processes.hdb | 2 +- .../templates/control/service-by-id.hdb | 21 ++++ .../resources/templates/control/services.hdb | 2 +- .../resources/templates/control/storage.hdb | 4 +- run/env/service.env | 3 +- 50 files changed, 585 insertions(+), 204 deletions(-) create mode 100644 code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalFingerprint.java create mode 100644 code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalMode.java create mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlFsmService.java create mode 100644 code/services-satellite/control-service/src/main/resources/templates/control/service-by-id.hdb diff --git a/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageBaseType.java b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageBaseType.java index df9f497f..08d67069 100644 
--- a/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageBaseType.java +++ b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageBaseType.java @@ -3,5 +3,6 @@ package nu.marginalia.db.storage.model; public enum FileStorageBaseType { SSD_INDEX, SSD_WORK, - SLOW + SLOW, + BACKUP } diff --git a/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageType.java b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageType.java index 390262ec..97eef136 100644 --- a/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageType.java +++ b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageType.java @@ -8,5 +8,6 @@ public enum FileStorageType { LEXICON_STAGING, INDEX_LIVE, LEXICON_LIVE, + BACKUP, SEARCH_SETS } diff --git a/code/common/db/src/main/resources/sql/current/13-file-storage.sql b/code/common/db/src/main/resources/sql/current/13-file-storage.sql index af111186..763f39a0 100644 --- a/code/common/db/src/main/resources/sql/current/13-file-storage.sql +++ b/code/common/db/src/main/resources/sql/current/13-file-storage.sql @@ -2,7 +2,7 @@ CREATE TABLE IF NOT EXISTS FILE_STORAGE_BASE ( ID BIGINT PRIMARY KEY AUTO_INCREMENT, NAME VARCHAR(255) NOT NULL UNIQUE, PATH VARCHAR(255) NOT NULL UNIQUE COMMENT 'The path to the storage base', - TYPE ENUM ('SSD_INDEX', 'SSD_WORK', 'SLOW') NOT NULL, + TYPE ENUM ('SSD_INDEX', 'SSD_WORK', 'SLOW', 'BACKUP') NOT NULL, MUST_CLEAN BOOLEAN NOT NULL DEFAULT FALSE COMMENT 'If true, the storage must be cleaned after use', PERMIT_TEMP BOOLEAN NOT NULL DEFAULT FALSE COMMENT 'If true, the storage can be used for temporary files' ) @@ -14,7 +14,7 @@ CREATE TABLE IF NOT EXISTS FILE_STORAGE ( BASE_ID BIGINT NOT NULL, PATH VARCHAR(255) NOT NULL COMMENT 'The path to the storage relative to the base', DESCRIPTION VARCHAR(255) NOT NULL, - TYPE ENUM ('CRAWL_SPEC', 'CRAWL_DATA', 'PROCESSED_DATA', 'INDEX_STAGING', 'LEXICON_STAGING', 'INDEX_LIVE', 
'LEXICON_LIVE', 'SEARCH_SETS') NOT NULL, + TYPE ENUM ('CRAWL_SPEC', 'CRAWL_DATA', 'PROCESSED_DATA', 'INDEX_STAGING', 'LEXICON_STAGING', 'INDEX_LIVE', 'LEXICON_LIVE', 'SEARCH_SETS', 'BACKUP') NOT NULL, DO_PURGE BOOLEAN NOT NULL DEFAULT FALSE COMMENT 'If true, the storage may be cleaned', CREATE_DATE TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6), CONSTRAINT CONS UNIQUE (BASE_ID, PATH), diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java index 36ed81cf..c3b32cd6 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java @@ -129,6 +129,18 @@ public class StateMachine { smOutbox.notify(transition.state(), transition.message()); } + /** Initialize the state machine. */ + public void initFrom(String firstState) throws Exception { + var transition = StateTransition.to(firstState); + + synchronized (this) { + this.state = allStates.get(transition.state()); + notifyAll(); + } + + smOutbox.notify(transition.state(), transition.message()); + } + /** Initialize the state machine. */ public void init(String jsonEncodedArgument) throws Exception { var transition = StateTransition.to("INITIAL", jsonEncodedArgument); @@ -141,6 +153,18 @@ public class StateMachine { smOutbox.notify(transition.state(), transition.message()); } + /** Initialize the state machine. */ + public void initFrom(String state, String jsonEncodedArgument) throws Exception { + var transition = StateTransition.to(state, jsonEncodedArgument); + + synchronized (this) { + this.state = allStates.get(transition.state()); + notifyAll(); + } + + smOutbox.notify(transition.state(), transition.message()); + } + /** Resume the state machine from the last known state. 
*/ private void resume() { diff --git a/code/common/process/src/main/java/nu/marginalia/process/log/WorkLog.java b/code/common/process/src/main/java/nu/marginalia/process/log/WorkLog.java index 86dd100c..b74ab5b4 100644 --- a/code/common/process/src/main/java/nu/marginalia/process/log/WorkLog.java +++ b/code/common/process/src/main/java/nu/marginalia/process/log/WorkLog.java @@ -93,4 +93,8 @@ public class WorkLog implements AutoCloseable { logWriter.flush(); logWriter.close(); } + + public int countFinishedJobs() { + return finishedJobs.size(); + } } diff --git a/code/common/service/src/main/java/nu/marginalia/service/module/DatabaseModule.java b/code/common/service/src/main/java/nu/marginalia/service/module/DatabaseModule.java index e3d660ad..70af3ed4 100644 --- a/code/common/service/src/main/java/nu/marginalia/service/module/DatabaseModule.java +++ b/code/common/service/src/main/java/nu/marginalia/service/module/DatabaseModule.java @@ -90,8 +90,8 @@ public class DatabaseModule extends AbstractModule { config.addDataSourceProperty("prepStmtCacheSize", "250"); config.addDataSourceProperty("prepStmtCacheSqlLimit", "2048"); - config.setMaximumPoolSize(100); - config.setMinimumIdle(10); + config.setMaximumPoolSize(20); + config.setMinimumIdle(2); config.setMaxLifetime(Duration.ofMinutes(9).toMillis()); diff --git a/code/common/service/src/main/resources/log4j2.properties b/code/common/service/src/main/resources/log4j2.properties index 66d688b0..96c73ea0 100644 --- a/code/common/service/src/main/resources/log4j2.properties +++ b/code/common/service/src/main/resources/log4j2.properties @@ -4,6 +4,22 @@ appender.console.type = Console appender.console.name = LogToConsole appender.console.layout.type = PatternLayout appender.console.layout.pattern = %d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1} -- %msg{nolookups}%n +appender.console.filter.process.type = MarkerFilter 
+appender.console.filter.process.onMismatch=ACCEPT +appender.console.filter.process.onMatch=DENY +appender.console.filter.process.marker=PROCESS +appender.console.filter.http.type = MarkerFilter +appender.console.filter.http.onMismatch=ACCEPT +appender.console.filter.http.onMatch=DENY +appender.console.filter.http.marker=HTTP +appender.processconsole.type = Console +appender.processconsole.name = ProcessLogToConsole +appender.processconsole.layout.type = PatternLayout +appender.processconsole.layout.pattern = %msg{nolookups}%n +appender.processconsole.filter.process.type = MarkerFilter +appender.processconsole.filter.process.onMismatch=DENY +appender.processconsole.filter.process.onMatch=ACCEPT +appender.processconsole.filter.process.marker=PROCESS appender.rolling.type = RollingFile appender.rolling.name = RollingFile appender.rolling.fileName = /var/log/wmsa/wmsa-${sys:service-name}.log @@ -23,6 +39,27 @@ appender.rolling.filter.http.type = MarkerFilter appender.rolling.filter.http.onMismatch=ACCEPT appender.rolling.filter.http.onMatch=DENY appender.rolling.filter.http.marker=HTTP +appender.rolling.filter.process.type = MarkerFilter +appender.rolling.filter.process.onMismatch=ACCEPT +appender.rolling.filter.process.onMatch=DENY +appender.rolling.filter.process.marker=PROCESS +appender.process.type = RollingFile +appender.process.name = ProcessFile +appender.process.fileName = /var/log/wmsa/process.log +appender.process.filePattern = /var/log/wmsa/process-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz +appender.process.layout.pattern = %msg{nolookups}%n +appender.process.layout.type = PatternLayout +appender.process.policies.type = Policies +appender.process.policies.size.type = SizeBasedTriggeringPolicy +appender.process.policies.size.size=10MB +appender.process.strategy.type = DefaultRolloverStrategy +appender.process.strategy.max = 10 +appender.process.filter.process.type = MarkerFilter +appender.process.filter.process.onMismatch=DENY 
+appender.process.filter.process.onMatch=ACCEPT +appender.process.filter.process.marker=PROCESS rootLogger.level = info rootLogger.appenderRef.console.ref = LogToConsole -rootLogger.appenderRef.rolling.ref = RollingFile \ No newline at end of file +rootLogger.appenderRef.processconsole.ref = ProcessLogToConsole +rootLogger.appenderRef.rolling.ref = RollingFile +rootLogger.appenderRef.process.ref = ProcessFile \ No newline at end of file diff --git a/code/features-index/index-forward/src/test/java/nu/marginalia/index/forward/ForwardIndexConverterTest.java b/code/features-index/index-forward/src/test/java/nu/marginalia/index/forward/ForwardIndexConverterTest.java index 8e8bc252..c2411575 100644 --- a/code/features-index/index-forward/src/test/java/nu/marginalia/index/forward/ForwardIndexConverterTest.java +++ b/code/features-index/index-forward/src/test/java/nu/marginalia/index/forward/ForwardIndexConverterTest.java @@ -5,6 +5,7 @@ import nu.marginalia.dict.OffHeapDictionaryHashMap; import nu.marginalia.index.journal.model.IndexJournalEntry; import nu.marginalia.index.journal.writer.IndexJournalWriterImpl; import nu.marginalia.index.journal.writer.IndexJournalWriter; +import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode; import nu.marginalia.ranking.DomainRankings; import nu.marginalia.lexicon.KeywordLexicon; import nu.marginalia.lexicon.journal.KeywordLexiconJournal; @@ -45,7 +46,7 @@ class ForwardIndexConverterTest { dictionaryFile = Files.createTempFile("tmp", ".dict"); dictionaryFile.toFile().deleteOnExit(); - keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile())); + keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile(), KeywordLexiconJournalMode.READ_WRITE)); keywordLexicon.getOrInsert("0"); indexFile = Files.createTempFile("tmp", ".idx"); diff --git a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/writer/IndexJournalWriterImpl.java 
b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/writer/IndexJournalWriterImpl.java index c9bf44cd..4406350f 100644 --- a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/writer/IndexJournalWriterImpl.java +++ b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/writer/IndexJournalWriterImpl.java @@ -27,7 +27,8 @@ public class IndexJournalWriterImpl implements IndexJournalWriter{ this.lexicon = lexicon; this.outputFile = outputFile; - var fileStream = Files.newOutputStream(outputFile, StandardOpenOption.CREATE); + var fileStream = Files.newOutputStream(outputFile, StandardOpenOption.CREATE, + StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING); writeHeaderPlaceholder(fileStream); diff --git a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest.java b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest.java index a61f2a91..01df3e2f 100644 --- a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest.java +++ b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest.java @@ -8,6 +8,7 @@ import nu.marginalia.index.journal.model.IndexJournalEntry; import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile; import nu.marginalia.index.journal.writer.IndexJournalWriterImpl; import nu.marginalia.index.journal.writer.IndexJournalWriter; +import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode; import nu.marginalia.ranking.DomainRankings; import nu.marginalia.lexicon.KeywordLexicon; import nu.marginalia.lexicon.journal.KeywordLexiconJournal; @@ -42,7 +43,7 @@ class ReverseIndexFullConverterTest { dictionaryFile = Files.createTempFile("tmp", ".dict"); dictionaryFile.toFile().deleteOnExit(); - keywordLexicon = new KeywordLexicon(new 
KeywordLexiconJournal(dictionaryFile.toFile())); + keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile(), KeywordLexiconJournalMode.READ_WRITE)); keywordLexicon.getOrInsert("0"); indexFile = Files.createTempFile("tmp", ".idx"); diff --git a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest2.java b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest2.java index 5ce603c1..4488912b 100644 --- a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest2.java +++ b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest2.java @@ -10,6 +10,7 @@ import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile import nu.marginalia.index.journal.writer.IndexJournalWriterImpl; import nu.marginalia.index.journal.writer.IndexJournalWriter; import nu.marginalia.index.priority.ReverseIndexPriorityParameters; +import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode; import nu.marginalia.ranking.DomainRankings; import nu.marginalia.lexicon.KeywordLexicon; import nu.marginalia.lexicon.journal.KeywordLexiconJournal; @@ -52,7 +53,7 @@ class ReverseIndexFullConverterTest2 { dictionaryFile = Files.createTempFile("tmp", ".dict"); dictionaryFile.toFile().deleteOnExit(); - keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile())); + keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile(), KeywordLexiconJournalMode.READ_WRITE)); keywordLexicon.getOrInsert("0"); indexFile = Files.createTempFile("tmp", ".idx"); diff --git a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexPriorityConverterTest2.java b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexPriorityConverterTest2.java index 
21d6198b..d634c175 100644 --- a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexPriorityConverterTest2.java +++ b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexPriorityConverterTest2.java @@ -12,6 +12,7 @@ import nu.marginalia.index.priority.ReverseIndexPriorityConverter; import nu.marginalia.index.priority.ReverseIndexPriorityParameters; import nu.marginalia.lexicon.KeywordLexicon; import nu.marginalia.lexicon.journal.KeywordLexiconJournal; +import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode; import nu.marginalia.ranking.DomainRankings; import nu.marginalia.test.TestUtil; import org.junit.jupiter.api.AfterEach; @@ -52,7 +53,7 @@ class ReverseIndexPriorityConverterTest2 { dictionaryFile = Files.createTempFile("tmp", ".dict"); dictionaryFile.toFile().deleteOnExit(); - keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile())); + keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile(), KeywordLexiconJournalMode.READ_WRITE)); keywordLexicon.getOrInsert("0"); indexFile = Files.createTempFile("tmp", ".idx"); diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/dict/DictionaryMap.java b/code/features-index/lexicon/src/main/java/nu/marginalia/dict/DictionaryMap.java index 1f9525a2..260015be 100644 --- a/code/features-index/lexicon/src/main/java/nu/marginalia/dict/DictionaryMap.java +++ b/code/features-index/lexicon/src/main/java/nu/marginalia/dict/DictionaryMap.java @@ -1,9 +1,19 @@ package nu.marginalia.dict; +/** Backing store for the KeywordLexicon, available in on and off-heap versions. + *

    + * The off-heap version is necessary when loading a lexicon that is too large for the on-heap version, due + * to Java's 2GB limit on the size of a single array. It is slower and less optimized than the on-heap version. + *

    + * The off-heap version is on the precipice of being deprecated and its use is discouraged. + */ public interface DictionaryMap { int NO_VALUE = Integer.MIN_VALUE; static DictionaryMap create() { + // Default to on-heap version + // TODO: Make this configurable + return new OnHeapDictionaryMap(); } diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexicon.java b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexicon.java index bd88efc8..4929b9c1 100644 --- a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexicon.java +++ b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexicon.java @@ -6,6 +6,7 @@ import io.prometheus.client.Gauge; import lombok.SneakyThrows; import nu.marginalia.dict.DictionaryMap; import nu.marginalia.lexicon.journal.KeywordLexiconJournal; +import nu.marginalia.lexicon.journal.KeywordLexiconJournalFingerprint; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -16,6 +17,19 @@ import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; +/** The keyword lexicon is used to map keywords to unique numeric IDs. + * This class is used to both construct the lexicon, and to read from it. + *

    + * Readers will want to use the KeywordLexiconReadOnlyView wrapper, as it + * only exposes read-only methods and hides the mutating methods. + *

    + * Between instances, the lexicon is stored in a journal file, exactly in the + * order they were received by the writer. The journal file is then replayed + * on startup to reconstruct the lexicon, giving each term an ID according to + * the order they are loaded. It is therefore important that the journal file + * is not tampered with, as this will cause the lexicon to be corrupted. + * */ + public class KeywordLexicon implements AutoCloseable { private final DictionaryMap reverseIndex; @@ -30,6 +44,8 @@ public class KeywordLexicon implements AutoCloseable { .register(); private final KeywordLexiconJournal journal; + private volatile KeywordLexiconJournalFingerprint fingerprint = null; + @SneakyThrows public KeywordLexicon(KeywordLexiconJournal keywordLexiconJournal) { @@ -42,21 +58,36 @@ public class KeywordLexicon implements AutoCloseable { logger.error("MULTIPLE LEXICON INSTANCES!"); } - journal.loadFile(bytes -> reverseIndex.put(hashFunction.hashBytes(bytes).padToLong())); + reload(); logger.info("Done creating dictionary writer"); } - public void reload() throws IOException { - logger.info("Reloading dictionary writer"); - journal.loadFile(bytes -> reverseIndex.put(hashFunction.hashBytes(bytes).padToLong())); - logger.info("Done reloading dictionary writer"); + public boolean needsReload() throws IOException { + var newFingerprint = journal.journalFingerprint(); + return !newFingerprint.equals(fingerprint); } + /** Reload the lexicon from the journal */ + public void reload() throws IOException { + var lock = memoryLock.writeLock(); + lock.lock(); + try { + reverseIndex.clear(); + journal.loadFile(bytes -> reverseIndex.put(hashFunction.hashBytes(bytes).padToLong())); + fingerprint = journal.journalFingerprint(); + } + finally { + lock.unlock(); + } + } + + /** Get method that inserts the word into the lexicon if it is not present */ public int getOrInsert(String macroWord) { return getOrInsert(macroWord.getBytes(StandardCharsets.UTF_8)); } + /** Get 
method that inserts the word into the lexicon if it is not present */ @SneakyThrows private int getOrInsert(byte[] bytes) { if (bytes.length >= Byte.MAX_VALUE) { @@ -96,11 +127,13 @@ public class KeywordLexicon implements AutoCloseable { } } + /** Get method that does not modify the lexicon if the word is not present */ public int getReadOnly(String word) { final byte[] bytes = word.getBytes(StandardCharsets.UTF_8); return getReadOnly(hashFunction.hashBytes(bytes).padToLong()); } + /** Get method that does not modify the lexicon if the word is not present */ public int getReadOnly(long hashedKey) { Lock lock = memoryLock.readLock(); try { diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexiconReadOnlyView.java b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexiconReadOnlyView.java index ba7983a5..076cc84d 100644 --- a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexiconReadOnlyView.java +++ b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexiconReadOnlyView.java @@ -3,13 +3,19 @@ package nu.marginalia.lexicon; import com.google.common.cache.Cache; import com.google.common.cache.CacheBuilder; import lombok.SneakyThrows; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.IOException; import java.util.concurrent.TimeUnit; +/** A read-only view of a keyword lexicon. 
+ * + * @see KeywordLexicon + * */ public class KeywordLexiconReadOnlyView { private final KeywordLexicon writer; - + private final Logger logger = LoggerFactory.getLogger(getClass()); private final Cache cache = CacheBuilder.newBuilder().maximumSize(10_000).expireAfterAccess(60, TimeUnit.SECONDS).build(); @SneakyThrows @@ -22,8 +28,15 @@ public class KeywordLexiconReadOnlyView { return cache.get(word, () -> writer.getReadOnly(word)); } - public boolean reload() throws IOException { - writer.reload(); + public boolean suggestReload() throws IOException { + if (writer.needsReload()) { + logger.info("Reloading lexicon"); + writer.reload(); + cache.invalidateAll(); + } + else { + logger.info("Foregoing lexicon reload"); + } return true; } } diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournal.java b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournal.java index 013f2c49..01ba412b 100644 --- a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournal.java +++ b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournal.java @@ -5,35 +5,70 @@ import org.slf4j.LoggerFactory; import java.io.File; import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.attribute.BasicFileAttributes; import java.util.List; import java.util.function.Consumer; +/** The journal for the keyword lexicon. + * It's used both for writing the lexicon, but also for reconstructing it for reading later. 
+ */ public class KeywordLexiconJournal { private static final boolean noCommit = Boolean.getBoolean("DictionaryJournal.noCommit"); private final KeywordLexiconJournalCommitQueue commitQueue; - private final KeywordLexiconJournalFile journalFile; + private KeywordLexiconJournalFile journalFile; private final Logger logger = LoggerFactory.getLogger(getClass()); private final Thread commitToDiskThread; private volatile boolean running = true; + private final Path journalFilePath; - public KeywordLexiconJournal(File file) throws IOException { - commitQueue = new KeywordLexiconJournalCommitQueue(); - journalFile = new KeywordLexiconJournalFile(file); + /** Create a new journal. + * + * @param file The file to use for the journal. + * @param mode The mode to use for the journal. If READ_ONLY, the journal will be read-only and refuse + * to accept new entries. + */ + public KeywordLexiconJournal(File file, KeywordLexiconJournalMode mode) throws IOException { + journalFilePath = file.toPath(); - commitToDiskThread = new Thread(this::commitToDiskRunner, "CommitToDiskThread"); - commitToDiskThread.start(); + if (mode == KeywordLexiconJournalMode.READ_WRITE) { + commitQueue = new KeywordLexiconJournalCommitQueue(); + journalFile = new KeywordLexiconJournalFile(file); - Runtime.getRuntime().addShutdownHook(new Thread(this::commitToDisk)); + commitToDiskThread = new Thread(this::commitToDiskRunner, "CommitToDiskThread"); + commitToDiskThread.start(); + + Runtime.getRuntime().addShutdownHook(new Thread(this::commitToDisk)); + } + else { + journalFile = new KeywordLexiconJournalFile(file); + + commitQueue = null; + commitToDiskThread = null; + } } public void enqueue(byte[] word) throws InterruptedException { + if (null == commitQueue) + throw new UnsupportedOperationException("Lexicon journal is read-only"); + commitQueue.enqueue(word); } + public KeywordLexiconJournalFingerprint journalFingerprint() throws IOException { + var attributes = Files.readAttributes(journalFilePath, 
BasicFileAttributes.class); + + long cTime = attributes.creationTime().toMillis(); + long mTime = attributes.lastModifiedTime().toMillis(); + long size = attributes.size(); + + return new KeywordLexiconJournalFingerprint(cTime, mTime, size); + } public void commitToDiskRunner() { if (noCommit) return; @@ -57,14 +92,23 @@ public class KeywordLexiconJournal { public void close() throws Exception { logger.info("Closing Journal"); running = false; - commitToDiskThread.join(); - commitToDisk(); - journalFile.close(); + if (commitToDiskThread != null) { + commitToDiskThread.join(); + commitToDisk(); + } + + if (journalFile != null) { + journalFile.close(); + } } public void loadFile(Consumer loadJournalEntry) throws IOException { - journalFile.rewind(); + if (journalFile != null) { + journalFile.close(); + } + + journalFile = new KeywordLexiconJournalFile(journalFilePath.toFile()); journalFile.loadFile(loadJournalEntry); } } diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalCommitQueue.java b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalCommitQueue.java index 7c6a460f..8ff12d6d 100644 --- a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalCommitQueue.java +++ b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalCommitQueue.java @@ -7,6 +7,9 @@ import java.util.ArrayList; import java.util.Collections; import java.util.List; +/** An in-memory queue for lexicon journal entries used to improve the performance of + * large bursts of insert-operations. 
+ */ class KeywordLexiconJournalCommitQueue { private final ArrayList commitQueue = new ArrayList<>(10_000); private final Logger logger = LoggerFactory.getLogger(getClass()); diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalFile.java b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalFile.java index f7404296..81789891 100644 --- a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalFile.java +++ b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalFile.java @@ -1,6 +1,5 @@ package nu.marginalia.lexicon.journal; -import lombok.SneakyThrows; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalFingerprint.java b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalFingerprint.java new file mode 100644 index 00000000..a08d7124 --- /dev/null +++ b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalFingerprint.java @@ -0,0 +1,10 @@ +package nu.marginalia.lexicon.journal; + +/** Contains values used to assess whether the lexicon is in sync with the journal + * or if it has been replaced with a newer version and should be reloaded + * */ +public record KeywordLexiconJournalFingerprint(long createdTime, + long mTime, + long sizeBytes) +{ +} diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalMode.java b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalMode.java new file mode 100644 index 00000000..6208fc47 --- /dev/null +++ b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalMode.java @@ -0,0 +1,6 @@ +package nu.marginalia.lexicon.journal; + +public enum 
KeywordLexiconJournalMode { + READ_ONLY, + READ_WRITE +} diff --git a/code/features-index/lexicon/src/test/java/nu/marginalia/lexicon/KeywordLexiconTest.java b/code/features-index/lexicon/src/test/java/nu/marginalia/lexicon/KeywordLexiconTest.java index ca044e5e..98249c27 100644 --- a/code/features-index/lexicon/src/test/java/nu/marginalia/lexicon/KeywordLexiconTest.java +++ b/code/features-index/lexicon/src/test/java/nu/marginalia/lexicon/KeywordLexiconTest.java @@ -2,6 +2,7 @@ package nu.marginalia.lexicon; import nu.marginalia.dict.OnHeapDictionaryMap; import nu.marginalia.lexicon.journal.KeywordLexiconJournal; +import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeEach; @@ -23,7 +24,7 @@ public class KeywordLexiconTest { public void setUp() throws IOException { journalFile = Files.createTempFile(getClass().getSimpleName(), ".dat"); - var lexiconJournal = new KeywordLexiconJournal(journalFile.toFile()); + var lexiconJournal = new KeywordLexiconJournal(journalFile.toFile(), KeywordLexiconJournalMode.READ_WRITE); lexicon = new KeywordLexicon(lexiconJournal); } @@ -64,7 +65,7 @@ public class KeywordLexiconTest { int c = lexicon.getOrInsert("ccc"); lexicon.commitToDisk(); - var lexiconJournal = new KeywordLexiconJournal(journalFile.toFile()); + var lexiconJournal = new KeywordLexiconJournal(journalFile.toFile(), KeywordLexiconJournalMode.READ_WRITE); try (var anotherLexicon = new KeywordLexicon(lexiconJournal)) { assertEquals(a, anotherLexicon.getReadOnly("aaa")); assertEquals(b, anotherLexicon.getReadOnly("bbb")); diff --git a/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java b/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java index b425e29b..655525d6 100644 --- a/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java +++ 
b/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java @@ -109,7 +109,7 @@ public class CrawlPlan { return WorkLog.iterableMap(crawl.getLogFile(), entry -> { - if (!idPredicate.test(entry.path())) { + if (!idPredicate.test(entry.id())) { return Optional.empty(); } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java index 36e5b558..5488a6c2 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java @@ -28,6 +28,7 @@ import java.util.Optional; import java.util.UUID; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.Predicate; import static nu.marginalia.mqapi.ProcessInboxNames.CONVERTER_INBOX; @@ -135,7 +136,12 @@ public class ConverterMain { }; - for (var domain : plan.domainsIterable(id -> !processLog.isJobFinished(id))) { + // Advance the progress bar to the current position if this is a resumption + processedDomains.set(processLog.countFinishedJobs()); + heartbeat.setProgress(processedDomains.incrementAndGet() / (double) totalDomains); + + for (var domain : plan.domainsIterable(id -> !processLog.isJobFinished(id))) + { pipe.accept(domain); } diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java index f6ccc79d..5dff9388 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java @@ -7,6 +7,7 @@ import com.google.inject.Injector; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; import nu.marginalia.db.storage.FileStorageService; 
+import nu.marginalia.loading.loader.IndexLoadKeywords; import nu.marginalia.mq.MessageQueueFactory; import nu.marginalia.mq.MqMessage; import nu.marginalia.mq.inbox.MqInboxResponse; @@ -40,6 +41,7 @@ public class LoaderMain { private final ProcessHeartbeat heartbeat; private final MessageQueueFactory messageQueueFactory; private final FileStorageService fileStorageService; + private final IndexLoadKeywords indexLoadKeywords; private final Gson gson; private volatile boolean running = true; @@ -65,6 +67,7 @@ public class LoaderMain { ProcessHeartbeat heartbeat, MessageQueueFactory messageQueueFactory, FileStorageService fileStorageService, + IndexLoadKeywords indexLoadKeywords, Gson gson ) { @@ -73,6 +76,7 @@ public class LoaderMain { this.heartbeat = heartbeat; this.messageQueueFactory = messageQueueFactory; this.fileStorageService = fileStorageService; + this.indexLoadKeywords = indexLoadKeywords; this.gson = gson; heartbeat.start(); @@ -122,6 +126,9 @@ public class LoaderMain { running = false; processorThread.join(); instructions.ok(); + + // This needs to be done in order to have a readable index journal + indexLoadKeywords.close(); } catch (Exception ex) { logger.error("Failed to load", ex); diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java index 66eea626..21216b35 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java @@ -144,12 +144,4 @@ public class Loader implements Interpreter { sqlLoadProcessedDocument.loadWithError(data, processedDocumentWithErrorList); } - public void close() { - try { - indexLoadKeywords.close(); - } - catch (Exception ex) { - logger.error("Error when closing the index loader", ex); - } - } } diff --git 
a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java index 35f8e79f..14962f9b 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java @@ -12,6 +12,7 @@ import nu.marginalia.index.journal.writer.IndexJournalWriter; import nu.marginalia.keyword.model.DocumentKeywords; import nu.marginalia.lexicon.KeywordLexicon; import nu.marginalia.lexicon.journal.KeywordLexiconJournal; +import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; @@ -20,6 +21,8 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.attribute.PosixFilePermissions; import java.sql.SQLException; import java.util.Arrays; @@ -38,7 +41,13 @@ public class LoaderIndexJournalWriter { var lexiconPath = lexiconArea.asPath().resolve("dictionary.dat"); var indexPath = indexArea.asPath().resolve("page-index.dat"); - lexicon = new KeywordLexicon(new KeywordLexiconJournal(lexiconPath.toFile())); + Files.deleteIfExists(lexiconPath); + Files.deleteIfExists(indexPath); + + Files.createFile(indexPath, PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--"))); + Files.createFile(lexiconPath, PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--"))); + + lexicon = new KeywordLexicon(new KeywordLexiconJournal(lexiconPath.toFile(), KeywordLexiconJournalMode.READ_WRITE)); indexWriter = new IndexJournalWriterImpl(lexicon, indexPath); } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexModule.java 
b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexModule.java index a0bad25d..e0a3b2de 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexModule.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexModule.java @@ -11,6 +11,7 @@ import nu.marginalia.WmsaHome; import nu.marginalia.lexicon.KeywordLexicon; import nu.marginalia.lexicon.KeywordLexiconReadOnlyView; import nu.marginalia.lexicon.journal.KeywordLexiconJournal; +import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode; import nu.marginalia.service.control.ServiceEventLog; import java.nio.file.Path; @@ -32,7 +33,7 @@ public class IndexModule extends AbstractModule { var area = fileStorageService.getStorageByType(FileStorageType.LEXICON_LIVE); var path = area.asPath().resolve("dictionary.dat"); - return new KeywordLexiconReadOnlyView(new KeywordLexicon(new KeywordLexiconJournal(path.toFile()))); + return new KeywordLexiconReadOnlyView(new KeywordLexicon(new KeywordLexiconJournal(path.toFile(), KeywordLexiconJournalMode.READ_ONLY))); } finally { eventLog.logEvent("INDEX-LEXICON-LOAD-OK", ""); diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexServicesFactory.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexServicesFactory.java index 11008677..eafd3d57 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexServicesFactory.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexServicesFactory.java @@ -38,7 +38,7 @@ public class IndexServicesFactory { private final Logger logger = LoggerFactory.getLogger(getClass()); - private final PartitionedDataFile writerIndexFile; + private final Path writerIndexFile; private final PartitionedDataFile fwdIndexDocId; private final PartitionedDataFile fwdIndexDocData; @@ -67,7 +67,7 @@ public class IndexServicesFactory { Files.createDirectories(tmpFileDir); } - 
writerIndexFile = new PartitionedDataFile(stagingStorage, "page-index.dat"); + writerIndexFile = stagingStorage.resolve("page-index.dat"); fwdIndexDocId = new PartitionedDataFile(liveStorage, "fwd-doc-id.dat"); fwdIndexDocData = new PartitionedDataFile(liveStorage, "fwd-doc-data.dat"); @@ -85,7 +85,7 @@ public class IndexServicesFactory { public boolean isPreconvertedIndexPresent() { return Stream.of( - writerIndexFile.get(LIVE_PART).toPath() + writerIndexFile ).allMatch(Files::exists); } @@ -100,10 +100,6 @@ public class IndexServicesFactory { ).noneMatch(Files::exists); } - public IndexJournalWriter createIndexJournalWriter(KeywordLexicon lexicon) throws IOException { - return new IndexJournalWriterImpl(lexicon, writerIndexFile.get(LIVE_PART).toPath()); - } - public void convertIndex(DomainRankings domainRankings) throws IOException { convertForwardIndex(domainRankings); convertFullReverseIndex(domainRankings); @@ -111,11 +107,9 @@ public class IndexServicesFactory { } private void convertFullReverseIndex(DomainRankings domainRankings) throws IOException { - var source = writerIndexFile.get(0).toPath(); + logger.info("Converting full reverse index {}", writerIndexFile); - logger.info("Converting full reverse index {}", source); - - var journalReader = new IndexJournalReaderSingleCompressedFile(source); + var journalReader = new IndexJournalReaderSingleCompressedFile(writerIndexFile); var converter = new ReverseIndexFullConverter(tmpFileDir, journalReader, domainRankings, @@ -129,11 +123,9 @@ public class IndexServicesFactory { private void convertPriorityReverseIndex(DomainRankings domainRankings) throws IOException { - var source = writerIndexFile.get(0).toPath(); + logger.info("Converting priority reverse index {}", writerIndexFile); - logger.info("Converting priority reverse index {}", source); - - var journalReader = new IndexJournalReaderSingleCompressedFile(source, null, + var journalReader = new IndexJournalReaderSingleCompressedFile(writerIndexFile, null, 
ReverseIndexPriorityParameters::filterPriorityRecord); var converter = new ReverseIndexPriorityConverter(tmpFileDir, @@ -149,11 +141,10 @@ public class IndexServicesFactory { private void convertForwardIndex(DomainRankings domainRankings) throws IOException { - var source = writerIndexFile.get(0); - logger.info("Converting forward index data {}", source); + logger.info("Converting forward index data {}", writerIndexFile); - new ForwardIndexConverter(source, + new ForwardIndexConverter(writerIndexFile.toFile(), fwdIndexDocId.get(NEXT_PART).toPath(), fwdIndexDocData.get(NEXT_PART).toPath(), domainRankings) diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexOpsService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexOpsService.java index 31192d37..22e514d8 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexOpsService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexOpsService.java @@ -9,7 +9,6 @@ import spark.Response; import spark.Spark; import javax.annotation.CheckReturnValue; -import java.io.IOException; import java.util.Optional; import java.util.concurrent.Callable; import java.util.concurrent.locks.ReentrantLock; @@ -39,10 +38,13 @@ public class IndexOpsService { return run(searchSetService::recalculateAll); } public boolean reindex() throws Exception { - return run(index::switchIndex).isPresent(); + return run(() -> { + return index.switchIndex() && lexicon.suggestReload(); + }).isPresent(); } + public boolean reloadLexicon() throws Exception { - return run(lexicon::reload).isPresent(); + return run(lexicon::suggestReload).isPresent(); } public Object repartitionEndpoint(Request request, Response response) throws Exception { diff --git a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTestModule.java 
b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTestModule.java index 77ea0a2e..997e2a74 100644 --- a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTestModule.java +++ b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTestModule.java @@ -6,9 +6,11 @@ import nu.marginalia.db.storage.model.FileStorage; import nu.marginalia.db.storage.model.FileStorageType; import nu.marginalia.index.IndexServicesFactory; import nu.marginalia.index.journal.writer.IndexJournalWriter; +import nu.marginalia.index.journal.writer.IndexJournalWriterImpl; import nu.marginalia.lexicon.KeywordLexicon; import nu.marginalia.lexicon.KeywordLexiconReadOnlyView; import nu.marginalia.lexicon.journal.KeywordLexiconJournal; +import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode; import nu.marginalia.ranking.DomainRankings; import nu.marginalia.index.svc.searchset.SearchSetAny; import nu.marginalia.index.util.TestUtil; @@ -70,15 +72,19 @@ public class IndexQueryServiceIntegrationTestModule extends AbstractModule { when(setsServiceMock.getDomainRankings()).thenReturn(new DomainRankings()); bind(IndexSearchSetsService.class).toInstance(setsServiceMock); - var keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(slowDir.resolve("dictionary.dat").toFile())); + var keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal( + slowDir.resolve("dictionary.dat").toFile(), + KeywordLexiconJournalMode.READ_WRITE) + ); bind(KeywordLexicon.class).toInstance(keywordLexicon); bind(KeywordLexiconReadOnlyView.class).toInstance(new KeywordLexiconReadOnlyView(keywordLexicon)); - bind(IndexJournalWriter.class).toInstance(servicesFactory.createIndexJournalWriter(keywordLexicon)); - bind(ServiceEventLog.class).toInstance(Mockito.mock(ServiceEventLog.class)); bind(ServiceHeartbeat.class).toInstance(Mockito.mock(ServiceHeartbeat.class)); + 
bind(IndexJournalWriter.class).toInstance(new IndexJournalWriterImpl(keywordLexicon, + slowDir.resolve("page-index.dat"))); + bind(ServiceConfiguration.class).toInstance(new ServiceConfiguration( ServiceId.Index, 0, diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java index a3db382b..2c04fe6f 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java @@ -3,10 +3,7 @@ package nu.marginalia.control; import com.google.gson.Gson; import com.google.inject.Inject; import nu.marginalia.client.ServiceMonitors; -import nu.marginalia.control.model.ControlProcess; -import nu.marginalia.control.fsm.ControlFSMs; import nu.marginalia.control.svc.*; -import nu.marginalia.db.storage.model.FileStorageId; import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.renderer.MustacheRenderer; import nu.marginalia.renderer.RendererFactory; @@ -26,11 +23,12 @@ public class ControlService extends Service { private final Gson gson = GsonFactory.get(); private final ServiceMonitors monitors; - private final MustacheRenderer indexRenderer; - private final MustacheRenderer> servicesRenderer; - private final MustacheRenderer> processesRenderer; - private final MustacheRenderer> storageRenderer; + private final HeartbeatService heartbeatService; + private final EventLogService eventLogService; + private final ControlFsmService controlFsmService; private final StaticResources staticResources; + private final MessageQueueViewService messageQueueViewService; + private final ControlFileStorageService controlFileStorageService; @Inject @@ -39,7 +37,7 @@ public class ControlService extends Service { HeartbeatService heartbeatService, EventLogService eventLogService, RendererFactory 
rendererFactory, - ControlFSMs controlFSMs, + ControlFsmService controlFsmService, StaticResources staticResources, MessageQueueViewService messageQueueViewService, ControlFileStorageService controlFileStorageService @@ -47,13 +45,20 @@ public class ControlService extends Service { super(params); this.monitors = monitors; + this.heartbeatService = heartbeatService; + this.eventLogService = eventLogService; - indexRenderer = rendererFactory.renderer("control/index"); - servicesRenderer = rendererFactory.renderer("control/services"); - processesRenderer = rendererFactory.renderer("control/processes"); - storageRenderer = rendererFactory.renderer("control/storage"); + var indexRenderer = rendererFactory.renderer("control/index"); + var servicesRenderer = rendererFactory.renderer("control/services"); + var serviceByIdRenderer = rendererFactory.renderer("control/service-by-id"); + var processesRenderer = rendererFactory.renderer("control/processes"); + var storageRenderer = rendererFactory.renderer("control/storage"); + + this.controlFsmService = controlFsmService; this.staticResources = staticResources; + this.messageQueueViewService = messageQueueViewService; + this.controlFileStorageService = controlFileStorageService; Spark.get("/public/heartbeats", (req, res) -> { res.type("application/json"); @@ -62,45 +67,21 @@ public class ControlService extends Service { Spark.get("/public/", (req, rsp) -> indexRenderer.render(Map.of())); - Spark.get("/public/services", - (req, rsp) -> Map.of("services", heartbeatService.getServiceHeartbeats(), - "events", eventLogService.getLastEntries(20)), - (map) -> servicesRenderer.render((Map) map)); - - Spark.get("/public/processes", - (req, rsp) -> Map.of("processes", heartbeatService.getProcessHeartbeats(), - "fsms", controlFSMs.getFsmStates(), - "messages", messageQueueViewService.getLastEntries(20)), - (map) -> processesRenderer.render((Map) map)); - - Spark.get("/public/storage", - (req, rsp) -> Map.of("storage", 
controlFileStorageService.getStorageList()), - (map) -> storageRenderer.render((Map) map)); + Spark.get("/public/services", this::servicesModel, servicesRenderer::render); + Spark.get("/public/services/:id", this::serviceModel, serviceByIdRenderer::render); + Spark.get("/public/processes", this::processesModel, processesRenderer::render); + Spark.get("/public/storage", this::storageModel, storageRenderer::render); final HtmlRedirect redirectToServices = new HtmlRedirect("/services"); final HtmlRedirect redirectToProcesses = new HtmlRedirect("/processes"); final HtmlRedirect redirectToStorage = new HtmlRedirect("/storage"); - Spark.post("/public/fsms/:fsm/start", (req, rsp) -> { - controlFSMs.start(ControlProcess.valueOf(req.params("fsm").toUpperCase())); - return ""; - }, redirectToProcesses); + Spark.post("/public/fsms/:fsm/start", controlFsmService::startFsm, redirectToProcesses); + Spark.post("/public/fsms/:fsm/stop", controlFsmService::stopFsm, redirectToProcesses); - Spark.post("/public/fsms/:fsm/stop", (req, rsp) -> { - controlFSMs.stop(ControlProcess.valueOf(req.params("fsm").toUpperCase())); - return ""; - }, redirectToProcesses); + Spark.post("/public/storage/:fid/process", controlFsmService::triggerProcessing, redirectToProcesses); + Spark.post("/public/storage/:fid/load", controlFsmService::loadProcessedData, redirectToProcesses); - // TODO: This should be a POST - Spark.get("/public/repartition", (req, rsp) -> { - controlFSMs.start(ControlProcess.REPARTITION_REINDEX); - return ""; - } , redirectToProcesses); - - Spark.post("/public/storage/:fid/process", (req, rsp) -> { - controlFSMs.start(ControlProcess.RECONVERT_LOAD, FileStorageId.of(Integer.parseInt(req.params("fid")))); - return ""; - }, redirectToProcesses); Spark.post("/public/storage/:fid/delete", controlFileStorageService::flagFileForDeletionRequest, redirectToStorage); Spark.get("/public/:resource", this::serveStatic); @@ -108,6 +89,28 @@ public class ControlService extends Service { 
monitors.subscribe(this::logMonitorStateChange); } + private Object serviceModel(Request request, Response response) { + String serviceName = request.params("id"); + + return Map.of( + "id", serviceName, + "events", eventLogService.getLastEntriesForService(serviceName, 20)); + } + + private Object storageModel(Request request, Response response) { + return Map.of("storage", controlFileStorageService.getStorageList()); + } + + private Object servicesModel(Request request, Response response) { + return Map.of("services", heartbeatService.getServiceHeartbeats(), + "events", eventLogService.getLastEntries(20)); + } + + private Object processesModel(Request request, Response response) { + return Map.of("processes", heartbeatService.getProcessHeartbeats(), + "fsms", controlFsmService.getFsmStates(), + "messages", messageQueueViewService.getLastEntries(20)); + } private Object serveStatic(Request request, Response response) { String resource = request.params("resource"); diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/ControlFSMs.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/ControlFSMs.java index 0c756114..4945c6d5 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/ControlFSMs.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/ControlFSMs.java @@ -23,6 +23,7 @@ import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.UUID; +import java.util.stream.Collectors; @Singleton public class ControlFSMs { @@ -68,33 +69,39 @@ public class ControlFSMs { eventLog.logEvent("FSM-STATE-CHANGE", process.id() + " -> " + state); } + public void startFrom(ControlProcess process, String state) throws Exception { + eventLog.logEvent("FSM-START", process.id()); + + stateMachines.get(process).initFrom(state); + } + public void start(ControlProcess process) throws Exception { 
eventLog.logEvent("FSM-START", process.id()); stateMachines.get(process).init(); } + public void startFrom(ControlProcess process, String state, Object arg) throws Exception { + eventLog.logEvent("FSM-START", process.id()); + + stateMachines.get(process).initFrom(state, gson.toJson(arg)); + } + public void start(ControlProcess process, Object arg) throws Exception { eventLog.logEvent("FSM-START", process.id()); stateMachines.get(process).init(gson.toJson(arg)); } - public List getFsmStates() { - return stateMachines.entrySet().stream().sorted(Map.Entry.comparingByKey()).map(e -> { - - final MachineState state = e.getValue().getState(); - - final String machineName = e.getKey().name(); - final String stateName = state.name(); - final boolean terminal = state.isFinal(); - - return new ControlProcessState(machineName, stateName, terminal); - }).toList(); - } - @SneakyThrows public void stop(ControlProcess fsm) { stateMachines.get(fsm).abortExecution(); } + + public Map getMachineStates() { + return stateMachines.entrySet().stream().collect( + Collectors.toMap( + Map.Entry::getKey, e -> e.getValue().getState()) + ); + } } diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/AbstractProcessSpawnerFSM.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/AbstractProcessSpawnerFSM.java index 75944553..90a704c9 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/AbstractProcessSpawnerFSM.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/AbstractProcessSpawnerFSM.java @@ -58,7 +58,7 @@ public class AbstractProcessSpawnerFSM extends AbstractStateGraph { } } - @GraphState(name = RUN) + @GraphState(name = RUN, resume = ResumeBehavior.RESTART) public void run(Integer attempts) throws Exception { try { processService.trigger(processId); diff --git 
a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/task/ReconvertAndLoadFSM.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/task/ReconvertAndLoadFSM.java index 781882a5..f3e625a6 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/task/ReconvertAndLoadFSM.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/task/ReconvertAndLoadFSM.java @@ -23,7 +23,13 @@ import nu.marginalia.mqsm.graph.AbstractStateGraph; import nu.marginalia.mqsm.graph.GraphState; import nu.marginalia.mqsm.graph.ResumeBehavior; import nu.marginalia.search.client.SearchClient; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardCopyOption; +import java.time.LocalDateTime; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; @@ -32,23 +38,19 @@ public class ReconvertAndLoadFSM extends AbstractStateGraph { // STATES - private static final String INITIAL = "INITIAL"; - private static final String RECONVERT = "RECONVERT"; - private static final String RECONVERT_WAIT = "RECONVERT_WAIT"; - private static final String LOAD = "LOAD"; - private static final String LOAD_WAIT = "LOAD_WAIT"; - private static final String MOVE_INDEX_FILES = "MOVE_INDEX_FILES"; - private static final String RELOAD_LEXICON = "RELOAD_LEXICON"; - private static final String RELOAD_LEXICON_WAIT = "RELOAD_LEXICON_WAIT"; - private static final String FLUSH_CACHES = "FLUSH_CACHES"; - private static final String END = "END"; + public static final String INITIAL = "INITIAL"; + public static final String RECONVERT = "RECONVERT"; + public static final String RECONVERT_WAIT = "RECONVERT-WAIT"; + public static final String LOAD = "LOAD"; + public static final String LOAD_WAIT = "LOAD-WAIT"; + public static final String SWAP_LEXICON = "SWAP-LEXICON"; + public static final 
String END = "END"; private final ProcessService processService; - private final MqOutbox mqIndexOutbox; - private final MqOutbox mqSearchOutbox; private final MqOutbox mqConverterOutbox; private final MqOutbox mqLoaderOutbox; private final FileStorageService storageService; private final Gson gson; + private final Logger logger = LoggerFactory.getLogger(getClass()); @AllArgsConstructor @With @NoArgsConstructor @@ -62,17 +64,13 @@ public class ReconvertAndLoadFSM extends AbstractStateGraph { @Inject public ReconvertAndLoadFSM(StateFactory stateFactory, ProcessService processService, - IndexClient indexClient, ProcessOutboxFactory processOutboxFactory, - SearchClient searchClient, FileStorageService storageService, Gson gson ) { super(stateFactory); this.processService = processService; - this.mqIndexOutbox = indexClient.outbox(); - this.mqSearchOutbox = searchClient.outbox(); this.mqConverterOutbox = processOutboxFactory.createConverterOutbox(); this.mqLoaderOutbox = processOutboxFactory.createLoaderOutbox(); this.storageService = storageService; @@ -92,8 +90,12 @@ public class ReconvertAndLoadFSM extends AbstractStateGraph { @GraphState(name = RECONVERT, next = RECONVERT_WAIT, resume = ResumeBehavior.ERROR) public Message reconvert(Message message) throws Exception { // Create processed data area + + var toProcess = storageService.getStorage(message.crawlStorageId); + var base = storageService.getStorageBase(FileStorageBaseType.SLOW); - var processedArea = storageService.allocateTemporaryStorage(base, FileStorageType.PROCESSED_DATA, "processed-data", "Processed Data"); + var processedArea = storageService.allocateTemporaryStorage(base, FileStorageType.PROCESSED_DATA, "processed-data", + "Processed Data; " + toProcess.description()); // Pre-send convert request var request = new ConvertRequest(message.crawlStorageId, processedArea.id()); @@ -124,7 +126,7 @@ public class ReconvertAndLoadFSM extends AbstractStateGraph { } - @GraphState(name = LOAD_WAIT, next = END, 
resume = ResumeBehavior.RETRY) + @GraphState(name = LOAD_WAIT, next = SWAP_LEXICON, resume = ResumeBehavior.RETRY) public void loadWait(Message message) throws Exception { var rsp = waitResponse(mqLoaderOutbox, ProcessService.ProcessId.LOADER, message.loaderMsgId); @@ -132,6 +134,33 @@ public class ReconvertAndLoadFSM extends AbstractStateGraph { error("Loader failed"); } + + + @GraphState(name = SWAP_LEXICON, next = END, resume = ResumeBehavior.RETRY) + public void swapLexicon(Message message) throws Exception { + var live = storageService.getStorageByType(FileStorageType.LEXICON_LIVE); + + var staging = storageService.getStorageByType(FileStorageType.LEXICON_STAGING); + var fromSource = staging.asPath().resolve("dictionary.dat"); + var liveDest = live.asPath().resolve("dictionary.dat"); + + // Backup live lexicon + var backupBase = storageService.getStorageBase(FileStorageBaseType.BACKUP); + var backup = storageService.allocateTemporaryStorage(backupBase, FileStorageType.BACKUP, + "lexicon", "Lexicon Backup; " + LocalDateTime.now()); + + Path backupDest = backup.asPath().resolve("dictionary.dat"); + + logger.info("Moving " + liveDest + " to " + backupDest); + Files.move(liveDest, backupDest); + + // Swap in new lexicon + logger.info("Moving " + fromSource + " to " + liveDest); + Files.move(fromSource, liveDest, StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.ATOMIC_MOVE); + } + + + public MqMessage waitResponse(MqOutbox outbox, ProcessService.ProcessId processId, long id) throws Exception { if (!waitForProcess(processId, TimeUnit.SECONDS, 30)) { error("Process " + processId + " did not launch"); @@ -162,37 +191,4 @@ public class ReconvertAndLoadFSM extends AbstractStateGraph { return false; } -// @GraphState(name = MOVE_INDEX_FILES, next = RELOAD_LEXICON, resume = ResumeBehavior.ERROR) -// public void moveIndexFiles(String crawlJob) throws Exception { -// Path indexData = Path.of("/vol/index.dat"); -// Path indexDest = Path.of("/vol/iw/0/page-index.dat"); 
-// -// if (!Files.exists(indexData)) -// error("Index data not found"); -// -// Files.move(indexData, indexDest, StandardCopyOption.REPLACE_EXISTING); -// } -// -// @GraphState(name = RELOAD_LEXICON, next = RELOAD_LEXICON_WAIT, resume = ResumeBehavior.ERROR) -// public long reloadLexicon() throws Exception { -// return mqIndexOutbox.sendAsync(IndexMqEndpoints.INDEX_RELOAD_LEXICON, ""); -// } -// -// @GraphState(name = RELOAD_LEXICON_WAIT, next = FLUSH_CACHES, resume = ResumeBehavior.RETRY) -// public void reloadLexiconWait(long id) throws Exception { -// var rsp = mqIndexOutbox.waitResponse(id); -// -// if (rsp.state() != MqMessageState.OK) { -// error("RELOAD_LEXICON failed"); -// } -// } -// -// @GraphState(name = FLUSH_CACHES, next = END, resume = ResumeBehavior.RETRY) -// public void flushCaches() throws Exception { -// var rsp = mqSearchOutbox.send(SearchMqEndpoints.FLUSH_CACHES, ""); -// -// if (rsp.state() != MqMessageState.OK) { -// error("FLUSH_CACHES failed"); -// } -// } } diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/task/RepartitionReindexFSM.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/task/RepartitionReindexFSM.java index ed3aad0a..deb72004 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/task/RepartitionReindexFSM.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/task/RepartitionReindexFSM.java @@ -18,12 +18,12 @@ public class RepartitionReindexFSM extends AbstractStateGraph { // STATES - private static final String INITIAL = "INITIAL"; - private static final String REPARTITION = "REPARTITION"; - private static final String REPARTITION_REPLY = "REPARTITION-REPLY"; - private static final String REINDEX = "REINDEX"; - private static final String REINDEX_REPLY = "REINDEX-REPLY"; - private static final String END = "END"; + public static final String INITIAL = "INITIAL"; + public 
static final String REPARTITION = "REPARTITION"; + public static final String REPARTITION_WAIT = "REPARTITION-WAIT"; + public static final String REINDEX = "REINDEX"; + public static final String REINDEX_WAIT = "REINDEX-WAIT"; + public static final String END = "END"; @Inject @@ -43,12 +43,12 @@ public class RepartitionReindexFSM extends AbstractStateGraph { } } - @GraphState(name = REPARTITION, next = REPARTITION_REPLY) + @GraphState(name = REPARTITION, next = REPARTITION_WAIT) public Long repartition() throws Exception { return indexOutbox.sendAsync(IndexMqEndpoints.INDEX_REPARTITION, ""); } - @GraphState(name = REPARTITION_REPLY, next = REINDEX, resume = ResumeBehavior.RETRY) + @GraphState(name = REPARTITION_WAIT, next = REINDEX, resume = ResumeBehavior.RETRY) public void repartitionReply(Long id) throws Exception { var rsp = indexOutbox.waitResponse(id); @@ -57,12 +57,12 @@ public class RepartitionReindexFSM extends AbstractStateGraph { } } - @GraphState(name = REINDEX, next = REINDEX_REPLY) + @GraphState(name = REINDEX, next = REINDEX_WAIT) public Long reindex() throws Exception { return indexOutbox.sendAsync(IndexMqEndpoints.INDEX_REINDEX, ""); } - @GraphState(name = REINDEX_REPLY, next = END, resume = ResumeBehavior.RETRY) + @GraphState(name = REINDEX_WAIT, next = END, resume = ResumeBehavior.RETRY) public void reindexReply(Long id) throws Exception { var rsp = indexOutbox.waitResponse(id); diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ControlProcessState.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ControlProcessState.java index 39d69ebd..a7324164 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ControlProcessState.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ControlProcessState.java @@ -5,6 +5,12 @@ public record ControlProcessState(String name, String state, boolean 
terminal) { if (terminal) { return "\uD83D\uDE34"; } + else if (state.equals("MONITOR")) { + return "\uD83D\uDD26"; + } + else if (state.endsWith("WAIT") || state.endsWith("REPLY")) { + return "\uD83D\uDD59"; + } else { return "\uD83C\uDFC3"; } diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithActions.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithActions.java index 927262d2..674e92bc 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithActions.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithActions.java @@ -11,6 +11,7 @@ public record FileStorageWithActions(FileStorage storage) { return storage.type() == FileStorageType.CRAWL_DATA; } public boolean isDeletable() { - return storage.type() == FileStorageType.PROCESSED_DATA; + return storage.type() == FileStorageType.PROCESSED_DATA + || storage.type() == FileStorageType.BACKUP; } } diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java index e92a2a1a..47640dde 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java @@ -31,7 +31,7 @@ public record ProcessHeartbeat( public String progressStyle() { if ("RUNNING".equals(status) && progress != null) { return """ - background: linear-gradient(90deg, #fff 0%%, #ccc %d%%, #fff %d%%) + background: linear-gradient(90deg, #ccc 0%%, #ccc %d%%, #fff %d%%) """.formatted(progress, progress, progress); } return ""; diff --git 
a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlFsmService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlFsmService.java new file mode 100644 index 00000000..24e5fd51 --- /dev/null +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlFsmService.java @@ -0,0 +1,72 @@ +package nu.marginalia.control.svc; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.control.fsm.ControlFSMs; +import nu.marginalia.control.fsm.task.ReconvertAndLoadFSM; +import nu.marginalia.control.model.ControlProcess; +import nu.marginalia.control.model.ControlProcessState; +import nu.marginalia.db.storage.model.FileStorageId; +import nu.marginalia.mqsm.state.MachineState; +import spark.Request; +import spark.Response; + +import java.util.List; +import java.util.Map; + +@Singleton +public class ControlFsmService { + private final ControlFSMs controlFSMs; + + @Inject + public ControlFsmService(ControlFSMs controlFSMs) { + this.controlFSMs = controlFSMs; + } + + public Object startFsm(Request req, Response rsp) throws Exception { + controlFSMs.start( + ControlProcess.valueOf(req.params("fsm").toUpperCase()) + ); + return ""; + } + + public Object stopFsm(Request req, Response rsp) throws Exception { + controlFSMs.stop( + ControlProcess.valueOf(req.params("fsm").toUpperCase()) + ); + return ""; + } + + public Object triggerProcessing(Request request, Response response) throws Exception { + controlFSMs.start( + ControlProcess.RECONVERT_LOAD, + FileStorageId.of(Integer.parseInt(request.params("fid"))) + ); + return ""; + } + + public Object loadProcessedData(Request request, Response response) throws Exception { + var fid = FileStorageId.of(Integer.parseInt(request.params("fid"))); + + // Start the FSM from the intermediate state that triggers the load + controlFSMs.startFrom( + ControlProcess.RECONVERT_LOAD, + 
ReconvertAndLoadFSM.LOAD, + new ReconvertAndLoadFSM.Message(null, fid, 0L, 0L) + ); + + return ""; + } + + public Object getFsmStates() { + return controlFSMs.getMachineStates().entrySet().stream().sorted(Map.Entry.comparingByKey()).map(e -> { + + final MachineState state = e.getValue(); + final String machineName = e.getKey().name(); + final String stateName = state.name(); + final boolean terminal = state.isFinal(); + + return new ControlProcessState(machineName, stateName, terminal); + }).toList(); + } +} diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/EventLogService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/EventLogService.java index 8167c71c..d2cd6bcb 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/EventLogService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/EventLogService.java @@ -45,4 +45,66 @@ public class EventLogService { } } + public List getLastEntriesForService(String serviceName, int n) { + try (var conn = dataSource.getConnection(); + var query = conn.prepareStatement(""" + SELECT SERVICE_NAME, INSTANCE, EVENT_TIME, EVENT_TYPE, EVENT_MESSAGE + FROM SERVICE_EVENTLOG + WHERE SERVICE_NAME = ? + ORDER BY ID DESC + LIMIT ? 
+ """)) { + + query.setString(1, serviceName); + query.setInt(2, n); + + List entries = new ArrayList<>(n); + var rs = query.executeQuery(); + while (rs.next()) { + entries.add(new EventLogEntry( + rs.getString("SERVICE_NAME"), + rs.getString("INSTANCE"), + rs.getTimestamp("EVENT_TIME").toLocalDateTime().toLocalTime().toString(), + rs.getString("EVENT_TYPE"), + rs.getString("EVENT_MESSAGE") + )); + } + return entries; + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + } + + + public List getLastEntriesForInstance(String instance, int n) { + try (var conn = dataSource.getConnection(); + var query = conn.prepareStatement(""" + SELECT SERVICE_NAME, INSTANCE, EVENT_TIME, EVENT_TYPE, EVENT_MESSAGE + FROM SERVICE_EVENTLOG + WHERE INSTANCE = ? + ORDER BY ID DESC + LIMIT ? + """)) { + + query.setString(1, instance); + query.setInt(2, n); + + List entries = new ArrayList<>(n); + var rs = query.executeQuery(); + while (rs.next()) { + entries.add(new EventLogEntry( + rs.getString("SERVICE_NAME"), + rs.getString("INSTANCE"), + rs.getTimestamp("EVENT_TIME").toLocalDateTime().toLocalTime().toString(), + rs.getString("EVENT_TYPE"), + rs.getString("EVENT_MESSAGE") + )); + } + return entries; + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + } } diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java index e1034921..009ccdc8 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java @@ -5,6 +5,8 @@ import nu.marginalia.service.control.ServiceEventLog; import nu.marginalia.service.server.BaseServiceParams; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.slf4j.Marker; +import org.slf4j.MarkerFactory; import 
spark.utils.IOUtils; import javax.inject.Inject; @@ -21,6 +23,8 @@ import java.util.concurrent.ConcurrentHashMap; @Singleton public class ProcessService { private final Logger logger = LoggerFactory.getLogger(getClass()); + private final Marker processMarker = MarkerFactory.getMarker("PROCESS"); + private final ServiceEventLog eventLog; private final Path distPath; @@ -74,9 +78,9 @@ public class ProcessService { while (process.isAlive()) { if (es.ready()) - logger.warn("{}:{}", processId, es.readLine()); + logger.warn(processMarker, es.readLine()); if (os.ready()) - logger.debug("{}:{}", processId, os.readLine()); + logger.info(processMarker, os.readLine()); } return 0 == process.waitFor(); @@ -116,6 +120,9 @@ public class ProcessService { } opts.put("WMSA_HOME", WMSA_HOME); opts.put("JAVA_HOME", System.getenv("JAVA_HOME")); + opts.put("CONVERTER_OPTS", System.getenv("CONVERTER_OPTS")); + opts.put("LOADER_OPTS", System.getenv("LOADER_OPTS")); + opts.put("CRAWLER_OPTS", System.getenv("CRAWLER_OPTS")); return opts.entrySet().stream().map(e -> e.getKey() + "=" + e.getValue()).toArray(String[]::new); } diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/index.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/index.hdb index 71647683..b1034529 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/index.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/index.hdb @@ -3,7 +3,7 @@ Control Service - + {{> control/partials/nav}} diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/partials/nav.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/partials/nav.hdb index e3f38897..974502d5 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/partials/nav.hdb +++ 
b/code/services-satellite/control-service/src/main/resources/templates/control/partials/nav.hdb @@ -1,8 +1,8 @@ \ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/partials/processes-table.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/partials/processes-table.hdb index 47d7dc64..4547e76b 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/partials/processes-table.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/partials/processes-table.hdb @@ -9,14 +9,14 @@ Last Seen (ms) {{#each processes}} - + {{processId}}    {{uuid}} {{status}} - {{#if progress}}{{progress}}%{{/if}} + {{#if progress}}{{progress}}%{{/if}} {{#unless isStopped}}{{lastSeenMillis}}{{/unless}} {{/each}} diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/partials/services-table.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/partials/services-table.hdb index 2137f1fe..5da46a83 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/partials/services-table.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/partials/services-table.hdb @@ -7,7 +7,7 @@ {{#each services}} - {{serviceId}} + {{serviceId}}    {{uuid}} diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/processes.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/processes.hdb index 7d348be1..114b340d 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/processes.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/processes.hdb @@ -3,7 +3,7 @@ Control Service - + {{> control/partials/nav}} diff --git 
a/code/services-satellite/control-service/src/main/resources/templates/control/service-by-id.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/service-by-id.hdb new file mode 100644 index 00000000..5b1fe6b4 --- /dev/null +++ b/code/services-satellite/control-service/src/main/resources/templates/control/service-by-id.hdb @@ -0,0 +1,21 @@ + + + + Control Service + + + + + {{> control/partials/nav}} +
    +

    Services/{{id}}

    + {{> control/partials/events-table }} +
    + + + + \ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/services.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/services.hdb index 2c0542b9..2e73dd92 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/services.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/services.hdb @@ -3,7 +3,7 @@ Control Service - + {{> control/partials/nav}} diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/storage.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/storage.hdb index 68410646..1674d6f5 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/storage.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/storage.hdb @@ -3,12 +3,12 @@ Control Service - + {{> control/partials/nav}}
    - +

    Storage

    {{#each storage}} diff --git a/run/env/service.env b/run/env/service.env index db871699..dfa012b3 100644 --- a/run/env/service.env +++ b/run/env/service.env @@ -1,2 +1,3 @@ WMSA_HOME=run/ -CONTROL_SERVICE_OPTS="-DdistPath=/dist" \ No newline at end of file +CONTROL_SERVICE_OPTS="-DdistPath=/dist" +CONVERTER_OPTS="-ea -Xmx16G -XX:-CompactStrings -XX:+UseParallelGC -XX:GCTimeRatio=14 -XX:ParallelGCThreads=15" \ No newline at end of file From 92ed513e4f59ff454dd3f271c854e4ff10977cbe Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 17 Jul 2023 21:41:56 +0200 Subject: [PATCH 060/157] Less spammy default log settings --- .../service/src/main/resources/log4j2.properties | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/code/common/service/src/main/resources/log4j2.properties b/code/common/service/src/main/resources/log4j2.properties index 96c73ea0..badab181 100644 --- a/code/common/service/src/main/resources/log4j2.properties +++ b/code/common/service/src/main/resources/log4j2.properties @@ -5,11 +5,9 @@ appender.console.name = LogToConsole appender.console.layout.type = PatternLayout appender.console.layout.pattern = %d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1} -- %msg{nolookups}%n appender.console.filter.process.type = MarkerFilter -appender.console.filter.process.onMismatch=ACCEPT appender.console.filter.process.onMatch=DENY appender.console.filter.process.marker=PROCESS appender.console.filter.http.type = MarkerFilter -appender.console.filter.http.onMismatch=ACCEPT appender.console.filter.http.onMatch=DENY appender.console.filter.http.marker=HTTP appender.processconsole.type = Console @@ -18,7 +16,6 @@ appender.processconsole.layout.type = PatternLayout appender.processconsole.layout.pattern = %msg{nolookups}%n appender.processconsole.filter.process.type = MarkerFilter appender.processconsole.filter.process.onMismatch=DENY 
-appender.processconsole.filter.process.onMatch=ACCEPT appender.processconsole.filter.process.marker=PROCESS appender.rolling.type = RollingFile appender.rolling.name = RollingFile @@ -32,15 +29,15 @@ appender.rolling.policies.size.size=10MB appender.rolling.strategy.type = DefaultRolloverStrategy appender.rolling.strategy.max = 10 appender.rolling.filter.query.type = MarkerFilter -appender.rolling.filter.query.onMismatch=ACCEPT +appender.rolling.filter.query.onMismatch=NEUTRAL appender.rolling.filter.query.onMatch=DENY appender.rolling.filter.query.marker=QUERY appender.rolling.filter.http.type = MarkerFilter -appender.rolling.filter.http.onMismatch=ACCEPT +appender.rolling.filter.http.onMismatch=NEUTRAL appender.rolling.filter.http.onMatch=DENY appender.rolling.filter.http.marker=HTTP appender.rolling.filter.process.type = MarkerFilter -appender.rolling.filter.process.onMismatch=ACCEPT +appender.rolling.filter.process.onMismatch=NEUTRAL appender.rolling.filter.process.onMatch=DENY appender.rolling.filter.process.marker=PROCESS appender.process.type = RollingFile @@ -56,7 +53,6 @@ appender.process.strategy.type = DefaultRolloverStrategy appender.process.strategy.max = 10 appender.process.filter.process.type = MarkerFilter appender.process.filter.process.onMismatch=DENY -appender.process.filter.process.onMatch=ACCEPT appender.process.filter.process.marker=PROCESS rootLogger.level = info rootLogger.appenderRef.console.ref = LogToConsole From f6e2216b879de0465276d042c8662cd5b8cd8bf7 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 17 Jul 2023 21:42:13 +0200 Subject: [PATCH 061/157] Less spammy default log settings --- code/common/service/src/main/resources/log4j2.properties | 3 --- 1 file changed, 3 deletions(-) diff --git a/code/common/service/src/main/resources/log4j2.properties b/code/common/service/src/main/resources/log4j2.properties index badab181..05b50d06 100644 --- a/code/common/service/src/main/resources/log4j2.properties +++ 
b/code/common/service/src/main/resources/log4j2.properties @@ -29,15 +29,12 @@ appender.rolling.policies.size.size=10MB appender.rolling.strategy.type = DefaultRolloverStrategy appender.rolling.strategy.max = 10 appender.rolling.filter.query.type = MarkerFilter -appender.rolling.filter.query.onMismatch=NEUTRAL appender.rolling.filter.query.onMatch=DENY appender.rolling.filter.query.marker=QUERY appender.rolling.filter.http.type = MarkerFilter -appender.rolling.filter.http.onMismatch=NEUTRAL appender.rolling.filter.http.onMatch=DENY appender.rolling.filter.http.marker=HTTP appender.rolling.filter.process.type = MarkerFilter -appender.rolling.filter.process.onMismatch=NEUTRAL appender.rolling.filter.process.onMatch=DENY appender.rolling.filter.process.marker=PROCESS appender.process.type = RollingFile From f21a3983aabdf4b64ee0d1bd8223ea64af24f482 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 18 Jul 2023 18:39:57 +0200 Subject: [PATCH 062/157] Abortable processes --- .../java/nu/marginalia/mqsm/StateFactory.java | 12 +++- .../mqsm/graph/AbstractStateGraph.java | 2 +- .../monitor/AbstractProcessSpawnerFSM.java | 57 ++++++++++++++++--- .../control/svc/ProcessService.java | 1 - 4 files changed, 61 insertions(+), 11 deletions(-) diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateFactory.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateFactory.java index cd7824a7..6df583b3 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateFactory.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateFactory.java @@ -1,6 +1,7 @@ package nu.marginalia.mqsm; import com.google.gson.Gson; +import com.google.gson.JsonSyntaxException; import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.mqsm.graph.ResumeBehavior; @@ -29,11 +30,18 @@ public class StateFactory { @Override public StateTransition next(String message) { - if (message.equals("")) { + if 
(message.isEmpty()) { return logic.apply(null); } - return logic.apply(gson.fromJson(message, param)); + try { + var paramObj = gson.fromJson(message, param); + return logic.apply(paramObj); + } + catch (JsonSyntaxException ex) { + throw new IllegalArgumentException("Failed to parse '" + message + + "' into a '" + param.getSimpleName() + "'", ex); + } } @Override diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/AbstractStateGraph.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/AbstractStateGraph.java index 3b19f764..b89ca7c4 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/AbstractStateGraph.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/AbstractStateGraph.java @@ -19,7 +19,7 @@ public abstract class AbstractStateGraph { } public void transition(String state) { - throw new ControlFlowException(state, ""); + throw new ControlFlowException(state, null); } public void transition(String state, T payload) { diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/AbstractProcessSpawnerFSM.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/AbstractProcessSpawnerFSM.java index 90a704c9..4a853bf4 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/AbstractProcessSpawnerFSM.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/AbstractProcessSpawnerFSM.java @@ -8,23 +8,37 @@ import nu.marginalia.mqsm.StateFactory; import nu.marginalia.mqsm.graph.AbstractStateGraph; import nu.marginalia.mqsm.graph.GraphState; import nu.marginalia.mqsm.graph.ResumeBehavior; +import nu.marginalia.mqsm.graph.TerminalState; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.sql.SQLException; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; 
+import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicReference; @Singleton public class AbstractProcessSpawnerFSM extends AbstractStateGraph { private final MqPersistence persistence; private final ProcessService processService; + + private final Logger logger = LoggerFactory.getLogger(getClass()); + public static final String INITIAL = "INITIAL"; public static final String MONITOR = "MONITOR"; + public static final String ABORTED= "ABORTED"; public static final String RUN = "RUN"; + public static final String ERROR = "ERROR"; public static final String END = "END"; public static final int MAX_ATTEMPTS = 3; private final String inboxName; private final ProcessService.ProcessId processId; + private final ExecutorService executorService = Executors.newSingleThreadExecutor(); @Inject public AbstractProcessSpawnerFSM(StateFactory stateFactory, @@ -53,24 +67,53 @@ public class AbstractProcessSpawnerFSM extends AbstractStateGraph { if (messages.isEmpty() && !processService.isRunning(processId)) { TimeUnit.SECONDS.sleep(5); } else { - transition(RUN, 0); + transition(RUN); } } } @GraphState(name = RUN, resume = ResumeBehavior.RESTART) public void run(Integer attempts) throws Exception { + if (attempts == null) + attempts = 0; + try { - processService.trigger(processId); - } - catch (Exception e) { - if (attempts < MAX_ATTEMPTS) { - transition(RUN, attempts + 1); + var exec = new TaskExecution(); + if (exec.isError()) { + if (attempts < MAX_ATTEMPTS) + transition(RUN, attempts + 1); + else + transition(ERROR); } - else throw e; + } + catch (InterruptedException ex) { + processService.kill(processId); + transition(ABORTED); } transition(MONITOR); } + @TerminalState(name = ABORTED) + public void aborted() throws Exception {} + + + private class TaskExecution { + private final AtomicBoolean error = new AtomicBoolean(false); + public TaskExecution() throws 
ExecutionException, InterruptedException { + // Run this call in a separate thread so that this thread can be interrupted waiting for it + executorService.submit(() -> { + try { + processService.trigger(processId); + } catch (Exception e) { + logger.warn("Error in triggering process", e); + error.set(true); + } + }).get(); // Wait for the process to start + } + + public boolean isError() { + return error.get(); + } + } } diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java index 009ccdc8..124a2a49 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java @@ -102,7 +102,6 @@ public class ProcessService { eventLog.logEvent("PROCESS-KILL", processId.toString()); process.destroy(); - processes.remove(processId); return true; } From c0b5ea0e7defd3b2af3378132372636256fe4330 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 18 Jul 2023 19:28:42 +0200 Subject: [PATCH 063/157] Revert "Less spammy default log settings" This reverts commit f6e2216b879de0465276d042c8662cd5b8cd8bf7. 
--- code/common/service/src/main/resources/log4j2.properties | 3 +++ 1 file changed, 3 insertions(+) diff --git a/code/common/service/src/main/resources/log4j2.properties b/code/common/service/src/main/resources/log4j2.properties index 05b50d06..badab181 100644 --- a/code/common/service/src/main/resources/log4j2.properties +++ b/code/common/service/src/main/resources/log4j2.properties @@ -29,12 +29,15 @@ appender.rolling.policies.size.size=10MB appender.rolling.strategy.type = DefaultRolloverStrategy appender.rolling.strategy.max = 10 appender.rolling.filter.query.type = MarkerFilter +appender.rolling.filter.query.onMismatch=NEUTRAL appender.rolling.filter.query.onMatch=DENY appender.rolling.filter.query.marker=QUERY appender.rolling.filter.http.type = MarkerFilter +appender.rolling.filter.http.onMismatch=NEUTRAL appender.rolling.filter.http.onMatch=DENY appender.rolling.filter.http.marker=HTTP appender.rolling.filter.process.type = MarkerFilter +appender.rolling.filter.process.onMismatch=NEUTRAL appender.rolling.filter.process.onMatch=DENY appender.rolling.filter.process.marker=PROCESS appender.process.type = RollingFile From 08ca6399eccb8d8042fd85f85b77400fc1fd088c Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 19 Jul 2023 17:14:45 +0200 Subject: [PATCH 064/157] (converter) WIP --- .../java/nu/marginalia/mqsm/StateMachine.java | 12 +- .../mqsm/graph/AbstractStateGraph.java | 28 +++- .../nu/marginalia/mqsm/graph/GraphState.java | 2 + .../marginalia/mqsm/graph/TerminalState.java | 1 + .../src/main/resources/log4j2.properties | 10 +- .../nu/marginalia/control/ControlService.java | 47 ++++-- .../control/actor/ControlActors.java | 111 +++++++++++++ .../monitor/AbstractProcessSpawnerActor.java} | 41 +++-- .../monitor/ConverterMonitorActor.java} | 10 +- .../monitor/FileStorageMonitorActor.java} | 22 ++- .../monitor/LoaderMonitorActor.java} | 10 +- .../monitor/MessageQueueMonitorActor.java} | 13 +- .../monitor/ProcessLivenessMonitorActor.java} | 17 +- 
.../task/ReconvertAndLoadActor.java} | 150 ++++++++++++++---- .../marginalia/control/fsm/ControlFSMs.java | 107 ------------- .../fsm/task/RepartitionReindexFSM.java | 74 --------- .../model/{ControlProcess.java => Actor.java} | 3 +- ...olProcessState.java => ActorRunState.java} | 2 +- .../marginalia/control/model/ActorState.java | 19 +++ .../control/model/ActorStateGraph.java | 51 ++++++ ...mService.java => ControlActorService.java} | 53 ++++--- .../control/svc/MessageQueueViewService.java | 72 +++++++++ .../main/resources/static/control/style.css | 19 +++ .../templates/control/actor-details.hdb | 22 +++ .../control/{processes.hdb => actors.hdb} | 4 +- .../control/partials/actor-state-graph.hdb | 16 ++ .../control/partials/actors-table.hdb | 50 ++++++ .../templates/control/partials/fsm-table.hdb | 23 --- .../control/partials/message-queue-table.hdb | 62 +++++++- .../templates/control/partials/nav.hdb | 2 +- 30 files changed, 730 insertions(+), 323 deletions(-) create mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java rename code/services-satellite/control-service/src/main/java/nu/marginalia/control/{fsm/monitor/AbstractProcessSpawnerFSM.java => actor/monitor/AbstractProcessSpawnerActor.java} (68%) rename code/services-satellite/control-service/src/main/java/nu/marginalia/control/{fsm/monitor/ConverterMonitorFSM.java => actor/monitor/ConverterMonitorActor.java} (58%) rename code/services-satellite/control-service/src/main/java/nu/marginalia/control/{fsm/monitor/FileStorageMonitorFSM.java => actor/monitor/FileStorageMonitorActor.java} (69%) rename code/services-satellite/control-service/src/main/java/nu/marginalia/control/{fsm/monitor/LoaderMonitorFSM.java => actor/monitor/LoaderMonitorActor.java} (60%) rename code/services-satellite/control-service/src/main/java/nu/marginalia/control/{fsm/monitor/MessageQueueMonitorFSM.java => actor/monitor/MessageQueueMonitorActor.java} (68%) rename 
code/services-satellite/control-service/src/main/java/nu/marginalia/control/{fsm/monitor/ProcessLivenessMonitorFSM.java => actor/monitor/ProcessLivenessMonitorActor.java} (66%) rename code/services-satellite/control-service/src/main/java/nu/marginalia/control/{fsm/task/ReconvertAndLoadFSM.java => actor/task/ReconvertAndLoadActor.java} (57%) delete mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/ControlFSMs.java delete mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/task/RepartitionReindexFSM.java rename code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/{ControlProcess.java => Actor.java} (83%) rename code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/{ControlProcessState.java => ActorRunState.java} (83%) create mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ActorState.java create mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ActorStateGraph.java rename code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/{ControlFsmService.java => ControlActorService.java} (50%) create mode 100644 code/services-satellite/control-service/src/main/resources/templates/control/actor-details.hdb rename code/services-satellite/control-service/src/main/resources/templates/control/{processes.hdb => actors.hdb} (83%) create mode 100644 code/services-satellite/control-service/src/main/resources/templates/control/partials/actor-state-graph.hdb create mode 100644 code/services-satellite/control-service/src/main/resources/templates/control/partials/actors-table.hdb delete mode 100644 code/services-satellite/control-service/src/main/resources/templates/control/partials/fsm-table.hdb diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java 
b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java index c3b32cd6..1ef80abb 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java @@ -59,8 +59,16 @@ public class StateMachine { registerStates(stateGraph); for (var declaredState : stateGraph.declaredStates()) { - if (!allStates.containsKey(declaredState)) { - throw new IllegalArgumentException("State " + declaredState + " is not defined in the state graph"); + if (!allStates.containsKey(declaredState.name())) { + throw new IllegalArgumentException("State " + declaredState.name() + " is not defined in the state graph"); + } + if (!allStates.containsKey(declaredState.next())) { + throw new IllegalArgumentException("State " + declaredState.next() + " is not defined in the state graph"); + } + for (var state : declaredState.transitions()) { + if (!allStates.containsKey(state)) { + throw new IllegalArgumentException("State " + state + " is not defined in the state graph"); + } } } diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/AbstractStateGraph.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/AbstractStateGraph.java index b89ca7c4..098c4333 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/AbstractStateGraph.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/AbstractStateGraph.java @@ -30,6 +30,7 @@ public abstract class AbstractStateGraph { throw new ControlFlowException("ERROR", ""); } + public void error(T payload) { throw new ControlFlowException("ERROR", payload); } @@ -38,19 +39,31 @@ public abstract class AbstractStateGraph { throw new ControlFlowException("ERROR", ex.getClass().getSimpleName() + ":" + ex.getMessage()); } - public Set declaredStates() { - Set ret = new HashSet<>(); + public Set declaredStates() { + Set ret = new HashSet<>(); for (var method : 
getClass().getMethods()) { var gs = method.getAnnotation(GraphState.class); if (gs != null) { - ret.add(gs.name()); - ret.add(gs.next()); + ret.add(gs); } } return ret; } + public Set terminalStates() { + Set ret = new HashSet<>(); + + for (var method : getClass().getMethods()) { + var gs = method.getAnnotation(TerminalState.class); + if (gs != null) { + ret.add(gs); + } + } + + return ret; + } + public List asStateList() { List ret = new ArrayList<>(); @@ -59,6 +72,13 @@ public abstract class AbstractStateGraph { if (gs != null) { ret.add(graphState(method, gs)); } + + var ts = method.getAnnotation(TerminalState.class); + if (ts != null) { + ret.add(stateFactory.create(ts.name(), ResumeBehavior.ERROR, () -> { + throw new ControlFlowException(ts.name(), null); + })); + } } return ret; diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/GraphState.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/GraphState.java index 62183637..bf7be4a6 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/GraphState.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/GraphState.java @@ -8,5 +8,7 @@ import java.lang.annotation.RetentionPolicy; public @interface GraphState { String name(); String next() default "ERROR"; + String[] transitions() default {}; + String description() default ""; ResumeBehavior resume() default ResumeBehavior.ERROR; } diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/TerminalState.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/TerminalState.java index 5ae062b7..46a2be0c 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/TerminalState.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/TerminalState.java @@ -6,4 +6,5 @@ import java.lang.annotation.RetentionPolicy; @Retention(RetentionPolicy.RUNTIME) public @interface TerminalState { String name(); + String 
description() default ""; } diff --git a/code/common/service/src/main/resources/log4j2.properties b/code/common/service/src/main/resources/log4j2.properties index badab181..96c73ea0 100644 --- a/code/common/service/src/main/resources/log4j2.properties +++ b/code/common/service/src/main/resources/log4j2.properties @@ -5,9 +5,11 @@ appender.console.name = LogToConsole appender.console.layout.type = PatternLayout appender.console.layout.pattern = %d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1} -- %msg{nolookups}%n appender.console.filter.process.type = MarkerFilter +appender.console.filter.process.onMismatch=ACCEPT appender.console.filter.process.onMatch=DENY appender.console.filter.process.marker=PROCESS appender.console.filter.http.type = MarkerFilter +appender.console.filter.http.onMismatch=ACCEPT appender.console.filter.http.onMatch=DENY appender.console.filter.http.marker=HTTP appender.processconsole.type = Console @@ -16,6 +18,7 @@ appender.processconsole.layout.type = PatternLayout appender.processconsole.layout.pattern = %msg{nolookups}%n appender.processconsole.filter.process.type = MarkerFilter appender.processconsole.filter.process.onMismatch=DENY +appender.processconsole.filter.process.onMatch=ACCEPT appender.processconsole.filter.process.marker=PROCESS appender.rolling.type = RollingFile appender.rolling.name = RollingFile @@ -29,15 +32,15 @@ appender.rolling.policies.size.size=10MB appender.rolling.strategy.type = DefaultRolloverStrategy appender.rolling.strategy.max = 10 appender.rolling.filter.query.type = MarkerFilter -appender.rolling.filter.query.onMismatch=NEUTRAL +appender.rolling.filter.query.onMismatch=ACCEPT appender.rolling.filter.query.onMatch=DENY appender.rolling.filter.query.marker=QUERY appender.rolling.filter.http.type = MarkerFilter -appender.rolling.filter.http.onMismatch=NEUTRAL +appender.rolling.filter.http.onMismatch=ACCEPT 
appender.rolling.filter.http.onMatch=DENY appender.rolling.filter.http.marker=HTTP appender.rolling.filter.process.type = MarkerFilter -appender.rolling.filter.process.onMismatch=NEUTRAL +appender.rolling.filter.process.onMismatch=ACCEPT appender.rolling.filter.process.onMatch=DENY appender.rolling.filter.process.marker=PROCESS appender.process.type = RollingFile @@ -53,6 +56,7 @@ appender.process.strategy.type = DefaultRolloverStrategy appender.process.strategy.max = 10 appender.process.filter.process.type = MarkerFilter appender.process.filter.process.onMismatch=DENY +appender.process.filter.process.onMatch=ACCEPT appender.process.filter.process.marker=PROCESS rootLogger.level = info rootLogger.appenderRef.console.ref = LogToConsole diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java index 2c04fe6f..49ed3dff 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java @@ -3,9 +3,9 @@ package nu.marginalia.control; import com.google.gson.Gson; import com.google.inject.Inject; import nu.marginalia.client.ServiceMonitors; +import nu.marginalia.control.model.Actor; import nu.marginalia.control.svc.*; import nu.marginalia.model.gson.GsonFactory; -import nu.marginalia.renderer.MustacheRenderer; import nu.marginalia.renderer.RendererFactory; import nu.marginalia.service.server.*; import org.slf4j.Logger; @@ -25,7 +25,7 @@ public class ControlService extends Service { private final ServiceMonitors monitors; private final HeartbeatService heartbeatService; private final EventLogService eventLogService; - private final ControlFsmService controlFsmService; + private final ControlActorService controlActorService; private final StaticResources staticResources; private final 
MessageQueueViewService messageQueueViewService; private final ControlFileStorageService controlFileStorageService; @@ -37,7 +37,7 @@ public class ControlService extends Service { HeartbeatService heartbeatService, EventLogService eventLogService, RendererFactory rendererFactory, - ControlFsmService controlFsmService, + ControlActorService controlActorService, StaticResources staticResources, MessageQueueViewService messageQueueViewService, ControlFileStorageService controlFileStorageService @@ -51,10 +51,11 @@ public class ControlService extends Service { var indexRenderer = rendererFactory.renderer("control/index"); var servicesRenderer = rendererFactory.renderer("control/services"); var serviceByIdRenderer = rendererFactory.renderer("control/service-by-id"); - var processesRenderer = rendererFactory.renderer("control/processes"); + var actorsRenderer = rendererFactory.renderer("control/actors"); + var actorDetailsRenderer = rendererFactory.renderer("control/actor-details"); var storageRenderer = rendererFactory.renderer("control/storage"); - this.controlFsmService = controlFsmService; + this.controlActorService = controlActorService; this.staticResources = staticResources; this.messageQueueViewService = messageQueueViewService; @@ -69,18 +70,20 @@ public class ControlService extends Service { Spark.get("/public/services", this::servicesModel, servicesRenderer::render); Spark.get("/public/services/:id", this::serviceModel, serviceByIdRenderer::render); - Spark.get("/public/processes", this::processesModel, processesRenderer::render); + Spark.get("/public/messages/:id", this::messageModel, gson::toJson); + Spark.get("/public/actors", this::processesModel, actorsRenderer::render); + Spark.get("/public/actors/:fsm", this::actorDetailsModel, actorDetailsRenderer::render); Spark.get("/public/storage", this::storageModel, storageRenderer::render); final HtmlRedirect redirectToServices = new HtmlRedirect("/services"); - final HtmlRedirect redirectToProcesses = new 
HtmlRedirect("/processes"); + final HtmlRedirect redirectToProcesses = new HtmlRedirect("/actors"); final HtmlRedirect redirectToStorage = new HtmlRedirect("/storage"); - Spark.post("/public/fsms/:fsm/start", controlFsmService::startFsm, redirectToProcesses); - Spark.post("/public/fsms/:fsm/stop", controlFsmService::stopFsm, redirectToProcesses); + Spark.post("/public/fsms/:fsm/start", controlActorService::startFsm, redirectToProcesses); + Spark.post("/public/fsms/:fsm/stop", controlActorService::stopFsm, redirectToProcesses); - Spark.post("/public/storage/:fid/process", controlFsmService::triggerProcessing, redirectToProcesses); - Spark.post("/public/storage/:fid/load", controlFsmService::loadProcessedData, redirectToProcesses); + Spark.post("/public/storage/:fid/process", controlActorService::triggerProcessing, redirectToProcesses); + Spark.post("/public/storage/:fid/load", controlActorService::loadProcessedData, redirectToProcesses); Spark.post("/public/storage/:fid/delete", controlFileStorageService::flagFileForDeletionRequest, redirectToStorage); @@ -89,6 +92,18 @@ public class ControlService extends Service { monitors.subscribe(this::logMonitorStateChange); } + private Object messageModel(Request request, Response response) { + var message = messageQueueViewService.getMessage(Long.parseLong(request.params("id"))); + if (message != null) { + response.type("application/json"); + return message; + } + else { + response.status(404); + return ""; + } + } + private Object serviceModel(Request request, Response response) { String serviceName = request.params("id"); @@ -108,10 +123,18 @@ public class ControlService extends Service { private Object processesModel(Request request, Response response) { return Map.of("processes", heartbeatService.getProcessHeartbeats(), - "fsms", controlFsmService.getFsmStates(), + "actors", controlActorService.getActorStates(), "messages", messageQueueViewService.getLastEntries(20)); } + private Object actorDetailsModel(Request request, 
Response response) { + final Actor actor = Actor.valueOf(request.params("fsm").toUpperCase()); + final String inbox = actor.id(); + return Map.of( + "actor", actor, + "state-graph", controlActorService.getActorStateGraph(actor), + "messages", messageQueueViewService.getLastEntriesForInbox(inbox, 20)); + } private Object serveStatic(Request request, Response response) { String resource = request.params("resource"); diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java new file mode 100644 index 00000000..c470341c --- /dev/null +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java @@ -0,0 +1,111 @@ +package nu.marginalia.control.actor; + +import com.google.gson.Gson; +import com.google.inject.Inject; +import com.google.inject.Singleton; +import lombok.SneakyThrows; +import nu.marginalia.control.model.Actor; +import nu.marginalia.control.actor.monitor.*; +import nu.marginalia.control.actor.monitor.ConverterMonitorActor; +import nu.marginalia.control.actor.monitor.LoaderMonitorActor; +import nu.marginalia.control.actor.task.ReconvertAndLoadActor; +import nu.marginalia.model.gson.GsonFactory; +import nu.marginalia.mq.MessageQueueFactory; +import nu.marginalia.mqsm.StateMachine; +import nu.marginalia.mqsm.graph.AbstractStateGraph; +import nu.marginalia.mqsm.state.MachineState; +import nu.marginalia.service.control.ServiceEventLog; +import nu.marginalia.service.server.BaseServiceParams; + +import java.util.HashMap; +import java.util.Map; +import java.util.UUID; +import java.util.stream.Collectors; + +@Singleton +public class ControlActors { + private final ServiceEventLog eventLog; + private final Gson gson; + private final MessageQueueFactory messageQueueFactory; + public Map stateMachines = new HashMap<>(); + public Map actorDefinitions = new HashMap<>(); + 
+ @Inject + public ControlActors(MessageQueueFactory messageQueueFactory, + GsonFactory gsonFactory, + BaseServiceParams baseServiceParams, + ReconvertAndLoadActor reconvertAndLoadActor, + ConverterMonitorActor converterMonitorFSM, + LoaderMonitorActor loaderMonitor, + MessageQueueMonitorActor messageQueueMonitor, + ProcessLivenessMonitorActor processMonitorFSM, + FileStorageMonitorActor fileStorageMonitorActor + ) { + this.messageQueueFactory = messageQueueFactory; + this.eventLog = baseServiceParams.eventLog; + this.gson = gsonFactory.get(); + + register(Actor.RECONVERT_LOAD, reconvertAndLoadActor); + register(Actor.CONVERTER_MONITOR, converterMonitorFSM); + register(Actor.LOADER_MONITOR, loaderMonitor); + register(Actor.MESSAGE_QUEUE_MONITOR, messageQueueMonitor); + register(Actor.PROCESS_LIVENESS_MONITOR, processMonitorFSM); + register(Actor.FILE_STORAGE_MONITOR, fileStorageMonitorActor); + } + + private void register(Actor process, AbstractStateGraph graph) { + var sm = new StateMachine(messageQueueFactory, process.id(), UUID.randomUUID(), graph); + sm.listen((function, param) -> logStateChange(process, function)); + + stateMachines.put(process, sm); + actorDefinitions.put(process, graph); + } + + private void logStateChange(Actor process, String state) { + eventLog.logEvent("FSM-STATE-CHANGE", process.id() + " -> " + state); + } + + public void startFrom(Actor process, String state) throws Exception { + eventLog.logEvent("FSM-START", process.id()); + + stateMachines.get(process).initFrom(state); + } + + public void start(Actor process) throws Exception { + eventLog.logEvent("FSM-START", process.id()); + + stateMachines.get(process).init(); + } + + public void startFrom(Actor process, String state, Object arg) throws Exception { + eventLog.logEvent("FSM-START", process.id()); + + stateMachines.get(process).initFrom(state, gson.toJson(arg)); + } + + public void start(Actor process, Object arg) throws Exception { + eventLog.logEvent("FSM-START", process.id()); + 
+ stateMachines.get(process).init(gson.toJson(arg)); + } + + @SneakyThrows + public void stop(Actor fsm) { + stateMachines.get(fsm).abortExecution(); + } + + public Map getActorStates() { + return stateMachines.entrySet().stream().collect( + Collectors.toMap( + Map.Entry::getKey, e -> e.getValue().getState()) + ); + } + public MachineState getActorStates(Actor actor) { + return stateMachines.get(actor).getState(); + } + + public AbstractStateGraph getActorDefinition(Actor actor) { + return actorDefinitions.get(actor); + } + +} diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/AbstractProcessSpawnerFSM.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/AbstractProcessSpawnerActor.java similarity index 68% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/AbstractProcessSpawnerFSM.java rename to code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/AbstractProcessSpawnerActor.java index 4a853bf4..7b5b1e11 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/AbstractProcessSpawnerFSM.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/AbstractProcessSpawnerActor.java @@ -1,4 +1,4 @@ -package nu.marginalia.control.fsm.monitor; +package nu.marginalia.control.actor.monitor; import com.google.inject.Inject; import com.google.inject.Singleton; @@ -18,10 +18,9 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; -import java.util.concurrent.atomic.AtomicReference; @Singleton -public class AbstractProcessSpawnerFSM extends AbstractStateGraph { +public class AbstractProcessSpawnerActor extends AbstractStateGraph { private final MqPersistence persistence; private final 
ProcessService processService; @@ -30,9 +29,9 @@ public class AbstractProcessSpawnerFSM extends AbstractStateGraph { public static final String INITIAL = "INITIAL"; public static final String MONITOR = "MONITOR"; - public static final String ABORTED= "ABORTED"; public static final String RUN = "RUN"; public static final String ERROR = "ERROR"; + public static final String ABORTED = "ABORTED"; public static final String END = "END"; public static final int MAX_ATTEMPTS = 3; @@ -41,11 +40,11 @@ public class AbstractProcessSpawnerFSM extends AbstractStateGraph { private final ExecutorService executorService = Executors.newSingleThreadExecutor(); @Inject - public AbstractProcessSpawnerFSM(StateFactory stateFactory, - MqPersistence persistence, - ProcessService processService, - String inboxName, - ProcessService.ProcessId processId) { + public AbstractProcessSpawnerActor(StateFactory stateFactory, + MqPersistence persistence, + ProcessService processService, + String inboxName, + ProcessService.ProcessId processId) { super(stateFactory); this.persistence = persistence; this.processService = processService; @@ -58,7 +57,15 @@ public class AbstractProcessSpawnerFSM extends AbstractStateGraph { } - @GraphState(name = MONITOR, resume = ResumeBehavior.RETRY) + @GraphState(name = MONITOR, + next = MONITOR, + resume = ResumeBehavior.RETRY, + transitions = {MONITOR, RUN}, + description = """ + Monitors the inbox of the process for messages. + If a message is found, transition to RUN. + """ + ) public void monitor() throws SQLException, InterruptedException { for (;;) { @@ -72,7 +79,17 @@ public class AbstractProcessSpawnerFSM extends AbstractStateGraph { } } - @GraphState(name = RUN, resume = ResumeBehavior.RESTART) + @GraphState(name = RUN, + resume = ResumeBehavior.RESTART, + transitions = {MONITOR, ERROR, RUN, ABORTED}, + description = """ + Runs the process. + If the process fails, retransition to RUN up to MAX_ATTEMPTS times. 
+ After MAX_ATTEMPTS at restarting the process, transition to ERROR. + If the process is cancelled, transition to ABORTED. + If the process is successful, transition to MONITOR. + """ + ) public void run(Integer attempts) throws Exception { if (attempts == null) attempts = 0; @@ -94,7 +111,7 @@ public class AbstractProcessSpawnerFSM extends AbstractStateGraph { transition(MONITOR); } - @TerminalState(name = ABORTED) + @TerminalState(name = ABORTED, description = "The process was manually aborted") public void aborted() throws Exception {} diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/ConverterMonitorFSM.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/ConverterMonitorActor.java similarity index 58% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/ConverterMonitorFSM.java rename to code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/ConverterMonitorActor.java index 674d064a..b1f37067 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/ConverterMonitorFSM.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/ConverterMonitorActor.java @@ -1,4 +1,4 @@ -package nu.marginalia.control.fsm.monitor; +package nu.marginalia.control.actor.monitor; import com.google.inject.Inject; import com.google.inject.Singleton; @@ -8,13 +8,13 @@ import nu.marginalia.mq.persistence.MqPersistence; import nu.marginalia.mqsm.StateFactory; @Singleton -public class ConverterMonitorFSM extends AbstractProcessSpawnerFSM { +public class ConverterMonitorActor extends AbstractProcessSpawnerActor { @Inject - public ConverterMonitorFSM(StateFactory stateFactory, - MqPersistence persistence, - ProcessService processService) { + public ConverterMonitorActor(StateFactory stateFactory, + MqPersistence persistence, + 
ProcessService processService) { super(stateFactory, persistence, processService, ProcessInboxNames.CONVERTER_INBOX, ProcessService.ProcessId.CONVERTER); } diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/FileStorageMonitorFSM.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/FileStorageMonitorActor.java similarity index 69% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/FileStorageMonitorFSM.java rename to code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/FileStorageMonitorActor.java index 5d760dfc..dc6dd69d 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/FileStorageMonitorFSM.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/FileStorageMonitorActor.java @@ -1,4 +1,4 @@ -package nu.marginalia.control.fsm.monitor; +package nu.marginalia.control.actor.monitor; import com.google.inject.Inject; import com.google.inject.Singleton; @@ -19,7 +19,7 @@ import java.util.Optional; import java.util.concurrent.TimeUnit; @Singleton -public class FileStorageMonitorFSM extends AbstractStateGraph { +public class FileStorageMonitorActor extends AbstractStateGraph { private final Logger logger = LoggerFactory.getLogger(getClass()); // STATES @@ -32,8 +32,8 @@ public class FileStorageMonitorFSM extends AbstractStateGraph { @Inject - public FileStorageMonitorFSM(StateFactory stateFactory, - FileStorageService fileStorageService) { + public FileStorageMonitorActor(StateFactory stateFactory, + FileStorageService fileStorageService) { super(stateFactory); this.fileStorageService = fileStorageService; } @@ -42,7 +42,11 @@ public class FileStorageMonitorFSM extends AbstractStateGraph { public void init() { } - @GraphState(name = MONITOR, resume = ResumeBehavior.RETRY) + @GraphState(name = MONITOR, 
next = PURGE, resume = ResumeBehavior.RETRY, transitions = { PURGE }, + description = """ + Monitor the file storage and trigger a transition to PURGE if any file storage area + has been marked for deletion. + """) public void monitor() throws Exception { for (;;) { @@ -57,7 +61,13 @@ public class FileStorageMonitorFSM extends AbstractStateGraph { } } - @GraphState(name = PURGE, next = MONITOR, resume = ResumeBehavior.RETRY) + @GraphState(name = PURGE, + next = MONITOR, + resume = ResumeBehavior.RETRY, + description = """ + Purge the file storage area and transition back to MONITOR. + """ + ) public void purge(FileStorageId id) throws Exception { var storage = fileStorageService.getStorage(id); logger.info("Deleting {} ", storage.path()); diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/LoaderMonitorFSM.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/LoaderMonitorActor.java similarity index 60% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/LoaderMonitorFSM.java rename to code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/LoaderMonitorActor.java index 69015f65..3b959356 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/LoaderMonitorFSM.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/LoaderMonitorActor.java @@ -1,4 +1,4 @@ -package nu.marginalia.control.fsm.monitor; +package nu.marginalia.control.actor.monitor; import com.google.inject.Inject; import com.google.inject.Singleton; @@ -8,13 +8,13 @@ import nu.marginalia.mq.persistence.MqPersistence; import nu.marginalia.mqsm.StateFactory; @Singleton -public class LoaderMonitorFSM extends AbstractProcessSpawnerFSM { +public class LoaderMonitorActor extends AbstractProcessSpawnerActor { @Inject - public 
LoaderMonitorFSM(StateFactory stateFactory, - MqPersistence persistence, - ProcessService processService) { + public LoaderMonitorActor(StateFactory stateFactory, + MqPersistence persistence, + ProcessService processService) { super(stateFactory, persistence, processService, ProcessInboxNames.LOADER_INBOX, diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/MessageQueueMonitorFSM.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/MessageQueueMonitorActor.java similarity index 68% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/MessageQueueMonitorFSM.java rename to code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/MessageQueueMonitorActor.java index d6c5ff82..77384b06 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/MessageQueueMonitorFSM.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/MessageQueueMonitorActor.java @@ -1,4 +1,4 @@ -package nu.marginalia.control.fsm.monitor; +package nu.marginalia.control.actor.monitor; import com.google.inject.Inject; import com.google.inject.Singleton; @@ -11,7 +11,7 @@ import nu.marginalia.mqsm.graph.ResumeBehavior; import java.util.concurrent.TimeUnit; @Singleton -public class MessageQueueMonitorFSM extends AbstractStateGraph { +public class MessageQueueMonitorActor extends AbstractStateGraph { // STATES @@ -22,8 +22,8 @@ public class MessageQueueMonitorFSM extends AbstractStateGraph { @Inject - public MessageQueueMonitorFSM(StateFactory stateFactory, - MqPersistence persistence) { + public MessageQueueMonitorActor(StateFactory stateFactory, + MqPersistence persistence) { super(stateFactory); this.persistence = persistence; } @@ -32,7 +32,10 @@ public class MessageQueueMonitorFSM extends AbstractStateGraph { public void init() { } - 
@GraphState(name = MONITOR, resume = ResumeBehavior.RETRY) + @GraphState(name = MONITOR, next = MONITOR, resume = ResumeBehavior.RETRY, + description = """ + Periodically clean up the message queue. + """) public void monitor() throws Exception { for (;;) { diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/ProcessLivenessMonitorFSM.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/ProcessLivenessMonitorActor.java similarity index 66% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/ProcessLivenessMonitorFSM.java rename to code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/ProcessLivenessMonitorActor.java index f6afa68f..1623fd49 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/monitor/ProcessLivenessMonitorFSM.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/ProcessLivenessMonitorActor.java @@ -1,4 +1,4 @@ -package nu.marginalia.control.fsm.monitor; +package nu.marginalia.control.actor.monitor; import com.google.inject.Inject; import com.google.inject.Singleton; @@ -13,7 +13,7 @@ import nu.marginalia.mqsm.graph.ResumeBehavior; import java.util.concurrent.TimeUnit; @Singleton -public class ProcessLivenessMonitorFSM extends AbstractStateGraph { +public class ProcessLivenessMonitorActor extends AbstractStateGraph { // STATES @@ -25,9 +25,9 @@ public class ProcessLivenessMonitorFSM extends AbstractStateGraph { @Inject - public ProcessLivenessMonitorFSM(StateFactory stateFactory, - ProcessService processService, - HeartbeatService heartbeatService) { + public ProcessLivenessMonitorActor(StateFactory stateFactory, + ProcessService processService, + HeartbeatService heartbeatService) { super(stateFactory); this.processService = processService; this.heartbeatService = heartbeatService; @@ -37,7 
+37,12 @@ public class ProcessLivenessMonitorFSM extends AbstractStateGraph { public void init() { } - @GraphState(name = MONITOR, resume = ResumeBehavior.RETRY) + @GraphState(name = MONITOR, next = MONITOR, resume = ResumeBehavior.RETRY, description = """ + Periodically check to ensure that the control service's view of + running processes is in agreement with the process heartbeats table. + + If the process is not running, mark the process as stopped in the table. + """) public void monitor() throws Exception { for (;;) { diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/task/ReconvertAndLoadFSM.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/ReconvertAndLoadActor.java similarity index 57% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/task/ReconvertAndLoadFSM.java rename to code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/ReconvertAndLoadActor.java index f3e625a6..2ffde9b2 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/task/ReconvertAndLoadFSM.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/ReconvertAndLoadActor.java @@ -1,4 +1,4 @@ -package nu.marginalia.control.fsm.task; +package nu.marginalia.control.actor.task; import com.google.gson.Gson; import com.google.inject.Inject; @@ -8,13 +8,14 @@ import lombok.NoArgsConstructor; import lombok.With; import nu.marginalia.control.svc.ProcessOutboxFactory; import nu.marginalia.control.svc.ProcessService; +import nu.marginalia.index.client.IndexClient; +import nu.marginalia.index.client.IndexMqEndpoints; import nu.marginalia.mqapi.converting.ConvertRequest; import nu.marginalia.mqapi.loading.LoadRequest; import nu.marginalia.db.storage.FileStorageService; import nu.marginalia.db.storage.model.FileStorageBaseType; import 
nu.marginalia.db.storage.model.FileStorageId; import nu.marginalia.db.storage.model.FileStorageType; -import nu.marginalia.index.client.IndexClient; import nu.marginalia.mq.MqMessage; import nu.marginalia.mq.MqMessageState; import nu.marginalia.mq.outbox.MqOutbox; @@ -22,19 +23,16 @@ import nu.marginalia.mqsm.StateFactory; import nu.marginalia.mqsm.graph.AbstractStateGraph; import nu.marginalia.mqsm.graph.GraphState; import nu.marginalia.mqsm.graph.ResumeBehavior; -import nu.marginalia.search.client.SearchClient; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.nio.file.Files; -import java.nio.file.Path; import java.nio.file.StandardCopyOption; -import java.time.LocalDateTime; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; @Singleton -public class ReconvertAndLoadFSM extends AbstractStateGraph { +public class ReconvertAndLoadActor extends AbstractStateGraph { // STATES @@ -44,10 +42,16 @@ public class ReconvertAndLoadFSM extends AbstractStateGraph { public static final String LOAD = "LOAD"; public static final String LOAD_WAIT = "LOAD-WAIT"; public static final String SWAP_LEXICON = "SWAP-LEXICON"; + + public static final String REPARTITION = "REPARTITION"; + public static final String REPARTITION_WAIT = "REPARTITION-WAIT"; + public static final String REINDEX = "REINDEX"; + public static final String REINDEX_WAIT = "REINDEX-WAIT"; public static final String END = "END"; private final ProcessService processService; private final MqOutbox mqConverterOutbox; private final MqOutbox mqLoaderOutbox; + private final MqOutbox indexOutbox; private final FileStorageService storageService; private final Gson gson; private final Logger logger = LoggerFactory.getLogger(getClass()); @@ -62,14 +66,16 @@ public class ReconvertAndLoadFSM extends AbstractStateGraph { }; @Inject - public ReconvertAndLoadFSM(StateFactory stateFactory, - ProcessService processService, - ProcessOutboxFactory processOutboxFactory, - 
FileStorageService storageService, - Gson gson + public ReconvertAndLoadActor(StateFactory stateFactory, + ProcessService processService, + ProcessOutboxFactory processOutboxFactory, + FileStorageService storageService, + IndexClient indexClient, + Gson gson ) { super(stateFactory); + this.indexOutbox = indexClient.outbox(); this.processService = processService; this.mqConverterOutbox = processOutboxFactory.createConverterOutbox(); this.mqLoaderOutbox = processOutboxFactory.createLoaderOutbox(); @@ -77,8 +83,16 @@ public class ReconvertAndLoadFSM extends AbstractStateGraph { this.gson = gson; } - @GraphState(name = INITIAL, next = RECONVERT) + @GraphState(name = INITIAL, + next = RECONVERT, + description = """ + Validate the input and transition to RECONVERT + """) public Message init(FileStorageId crawlStorageId) throws Exception { + if (null == crawlStorageId) { + error("This Actor requires a FileStorageId to be passed in as a parameter to INITIAL"); + } + var storage = storageService.getStorage(crawlStorageId); if (storage == null) error("Bad storage id"); @@ -87,7 +101,14 @@ public class ReconvertAndLoadFSM extends AbstractStateGraph { return new Message().withCrawlStorageId(crawlStorageId); } - @GraphState(name = RECONVERT, next = RECONVERT_WAIT, resume = ResumeBehavior.ERROR) + @GraphState(name = RECONVERT, + next = RECONVERT_WAIT, + resume = ResumeBehavior.ERROR, + description = """ + Allocate a storage area for the processed data, + then send a convert request to the converter and transition to RECONVERT_WAIT. 
+ """ + ) public Message reconvert(Message message) throws Exception { // Create processed data area @@ -105,7 +126,15 @@ public class ReconvertAndLoadFSM extends AbstractStateGraph { .withProcessedStorageId(processedArea.id()) .withConverterMsgId(id); } - @GraphState(name = RECONVERT_WAIT, next = LOAD, resume = ResumeBehavior.RETRY) + + @GraphState( + name = RECONVERT_WAIT, + next = LOAD, + resume = ResumeBehavior.RETRY, + description = """ + Wait for the converter to finish processing the data. + """ + ) public Message reconvertWait(Message message) throws Exception { var rsp = waitResponse(mqConverterOutbox, ProcessService.ProcessId.CONVERTER, message.converterMsgId); @@ -116,7 +145,13 @@ public class ReconvertAndLoadFSM extends AbstractStateGraph { } - @GraphState(name = LOAD, next = LOAD_WAIT, resume = ResumeBehavior.ERROR) + @GraphState( + name = LOAD, + next = LOAD_WAIT, + resume = ResumeBehavior.ERROR, + description = """ + Send a load request to the loader and transition to LOAD_WAIT. + """) public Message load(Message message) throws Exception { var request = new LoadRequest(message.processedStorageId); @@ -126,7 +161,14 @@ public class ReconvertAndLoadFSM extends AbstractStateGraph { } - @GraphState(name = LOAD_WAIT, next = SWAP_LEXICON, resume = ResumeBehavior.RETRY) + @GraphState( + name = LOAD_WAIT, + next = SWAP_LEXICON, + resume = ResumeBehavior.RETRY, + description = """ + Wait for the loader to finish loading the data. 
+ """ + ) public void loadWait(Message message) throws Exception { var rsp = waitResponse(mqLoaderOutbox, ProcessService.ProcessId.LOADER, message.loaderMsgId); @@ -136,7 +178,15 @@ public class ReconvertAndLoadFSM extends AbstractStateGraph { - @GraphState(name = SWAP_LEXICON, next = END, resume = ResumeBehavior.RETRY) + @GraphState( + name = SWAP_LEXICON, + next = REPARTITION, + resume = ResumeBehavior.RETRY, + description = """ + Move the lexicon from the LEXICON_STAGING area to the LEXICON_LIVE area, + then instruct the index-service to reload the lexicon. + """ + ) public void swapLexicon(Message message) throws Exception { var live = storageService.getStorageByType(FileStorageType.LEXICON_LIVE); @@ -144,22 +194,65 @@ public class ReconvertAndLoadFSM extends AbstractStateGraph { var fromSource = staging.asPath().resolve("dictionary.dat"); var liveDest = live.asPath().resolve("dictionary.dat"); - // Backup live lexicon - var backupBase = storageService.getStorageBase(FileStorageBaseType.BACKUP); - var backup = storageService.allocateTemporaryStorage(backupBase, FileStorageType.BACKUP, - "lexicon", "Lexicon Backup; " + LocalDateTime.now()); - - Path backupDest = backup.asPath().resolve("dictionary.dat"); - - logger.info("Moving " + liveDest + " to " + backupDest); - Files.move(liveDest, backupDest); - // Swap in new lexicon logger.info("Moving " + fromSource + " to " + liveDest); Files.move(fromSource, liveDest, StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.ATOMIC_MOVE); } + @GraphState( + name = REPARTITION, + next = REPARTITION_WAIT, + description = """ + Instruct the index-service to repartition the index then transition to REPARTITION_WAIT. + """ + ) + public Long repartition() throws Exception { + return indexOutbox.sendAsync(IndexMqEndpoints.INDEX_REPARTITION, ""); + } + + @GraphState( + name = REPARTITION_WAIT, + next = REINDEX, + resume = ResumeBehavior.RETRY, + description = """ + Wait for the index-service to finish repartitioning the index. 
+ """ + ) + public void repartitionReply(Long id) throws Exception { + var rsp = indexOutbox.waitResponse(id); + + if (rsp.state() != MqMessageState.OK) { + error("Repartition failed"); + } + } + + @GraphState( + name = REINDEX, + next = REINDEX_WAIT, + description = """ + Instruct the index-service to reindex the data then transition to REINDEX_WAIT. + """ + ) + public Long reindex() throws Exception { + return indexOutbox.sendAsync(IndexMqEndpoints.INDEX_REINDEX, ""); + } + + @GraphState( + name = REINDEX_WAIT, + next = END, + resume = ResumeBehavior.RETRY, + description = """ + Wait for the index-service to finish reindexing the data. + """ + ) + public void reindexReply(Long id) throws Exception { + var rsp = indexOutbox.waitResponse(id); + + if (rsp.state() != MqMessageState.OK) { + error("Reindex failed"); + } + } public MqMessage waitResponse(MqOutbox outbox, ProcessService.ProcessId processId, long id) throws Exception { if (!waitForProcess(processId, TimeUnit.SECONDS, 30)) { @@ -170,6 +263,7 @@ public class ReconvertAndLoadFSM extends AbstractStateGraph { return outbox.waitResponse(id, 1, TimeUnit.SECONDS); } catch (TimeoutException ex) { + // Maybe the process died, wait a moment for it to restart if (!waitForProcess(processId, TimeUnit.SECONDS, 30)) { error("Process " + processId + " died and did not re-launch"); } @@ -180,7 +274,7 @@ public class ReconvertAndLoadFSM extends AbstractStateGraph { public boolean waitForProcess(ProcessService.ProcessId processId, TimeUnit unit, int duration) throws InterruptedException { // Wait for process to start - long deadline = System.currentTimeMillis() + TimeUnit.SECONDS.toMillis(30); + long deadline = System.currentTimeMillis() + unit.toMillis(duration); while (System.currentTimeMillis() < deadline) { if (processService.isRunning(processId)) return true; diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/ControlFSMs.java 
b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/ControlFSMs.java deleted file mode 100644 index 4945c6d5..00000000 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/ControlFSMs.java +++ /dev/null @@ -1,107 +0,0 @@ -package nu.marginalia.control.fsm; - -import com.google.gson.Gson; -import com.google.inject.Inject; -import com.google.inject.Singleton; -import lombok.SneakyThrows; -import nu.marginalia.control.model.ControlProcess; -import nu.marginalia.control.model.ControlProcessState; -import nu.marginalia.control.fsm.monitor.*; -import nu.marginalia.control.fsm.monitor.ConverterMonitorFSM; -import nu.marginalia.control.fsm.monitor.LoaderMonitorFSM; -import nu.marginalia.control.fsm.task.ReconvertAndLoadFSM; -import nu.marginalia.control.fsm.task.RepartitionReindexFSM; -import nu.marginalia.model.gson.GsonFactory; -import nu.marginalia.mq.MessageQueueFactory; -import nu.marginalia.mqsm.StateMachine; -import nu.marginalia.mqsm.graph.AbstractStateGraph; -import nu.marginalia.mqsm.state.MachineState; -import nu.marginalia.service.control.ServiceEventLog; -import nu.marginalia.service.server.BaseServiceParams; - -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.UUID; -import java.util.stream.Collectors; - -@Singleton -public class ControlFSMs { - private final ServiceEventLog eventLog; - private final Gson gson; - private final MessageQueueFactory messageQueueFactory; - public Map stateMachines = new HashMap<>(); - - @Inject - public ControlFSMs(MessageQueueFactory messageQueueFactory, - GsonFactory gsonFactory, - BaseServiceParams baseServiceParams, - RepartitionReindexFSM repartitionReindexFSM, - ReconvertAndLoadFSM reconvertAndLoadFSM, - ConverterMonitorFSM converterMonitorFSM, - LoaderMonitorFSM loaderMonitor, - MessageQueueMonitorFSM messageQueueMonitor, - ProcessLivenessMonitorFSM processMonitorFSM, - FileStorageMonitorFSM fileStorageMonitorFSM - ) { - 
this.messageQueueFactory = messageQueueFactory; - this.eventLog = baseServiceParams.eventLog; - this.gson = gsonFactory.get(); - - register(ControlProcess.REPARTITION_REINDEX, repartitionReindexFSM); - register(ControlProcess.RECONVERT_LOAD, reconvertAndLoadFSM); - register(ControlProcess.CONVERTER_MONITOR, converterMonitorFSM); - register(ControlProcess.LOADER_MONITOR, loaderMonitor); - register(ControlProcess.MESSAGE_QUEUE_MONITOR, messageQueueMonitor); - register(ControlProcess.PROCESS_LIVENESS_MONITOR, processMonitorFSM); - register(ControlProcess.FILE_STORAGE_MONITOR, fileStorageMonitorFSM); - } - - private void register(ControlProcess process, AbstractStateGraph graph) { - var sm = new StateMachine(messageQueueFactory, process.id(), UUID.randomUUID(), graph); - - sm.listen((function, param) -> logStateChange(process, function)); - - stateMachines.put(process, sm); - } - - private void logStateChange(ControlProcess process, String state) { - eventLog.logEvent("FSM-STATE-CHANGE", process.id() + " -> " + state); - } - - public void startFrom(ControlProcess process, String state) throws Exception { - eventLog.logEvent("FSM-START", process.id()); - - stateMachines.get(process).initFrom(state); - } - - public void start(ControlProcess process) throws Exception { - eventLog.logEvent("FSM-START", process.id()); - - stateMachines.get(process).init(); - } - - public void startFrom(ControlProcess process, String state, Object arg) throws Exception { - eventLog.logEvent("FSM-START", process.id()); - - stateMachines.get(process).initFrom(state, gson.toJson(arg)); - } - - public void start(ControlProcess process, Object arg) throws Exception { - eventLog.logEvent("FSM-START", process.id()); - - stateMachines.get(process).init(gson.toJson(arg)); - } - - @SneakyThrows - public void stop(ControlProcess fsm) { - stateMachines.get(fsm).abortExecution(); - } - - public Map getMachineStates() { - return stateMachines.entrySet().stream().collect( - Collectors.toMap( - 
Map.Entry::getKey, e -> e.getValue().getState()) - ); - } -} diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/task/RepartitionReindexFSM.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/task/RepartitionReindexFSM.java deleted file mode 100644 index deb72004..00000000 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/fsm/task/RepartitionReindexFSM.java +++ /dev/null @@ -1,74 +0,0 @@ -package nu.marginalia.control.fsm.task; - -import com.google.inject.Inject; -import com.google.inject.Singleton; -import nu.marginalia.index.client.IndexClient; -import nu.marginalia.index.client.IndexMqEndpoints; -import nu.marginalia.mq.MqMessageState; -import nu.marginalia.mq.outbox.MqOutbox; -import nu.marginalia.mqsm.StateFactory; -import nu.marginalia.mqsm.graph.AbstractStateGraph; -import nu.marginalia.mqsm.graph.GraphState; -import nu.marginalia.mqsm.graph.ResumeBehavior; - -@Singleton -public class RepartitionReindexFSM extends AbstractStateGraph { - - private final MqOutbox indexOutbox; - - // STATES - - public static final String INITIAL = "INITIAL"; - public static final String REPARTITION = "REPARTITION"; - public static final String REPARTITION_WAIT = "REPARTITION-WAIT"; - public static final String REINDEX = "REINDEX"; - public static final String REINDEX_WAIT = "REINDEX-WAIT"; - public static final String END = "END"; - - - @Inject - public RepartitionReindexFSM(StateFactory stateFactory, - IndexClient indexClient) { - super(stateFactory); - - indexOutbox = indexClient.outbox(); - } - - @GraphState(name = INITIAL, next = REPARTITION) - public void init() throws Exception { - var rsp = indexOutbox.send(IndexMqEndpoints.INDEX_IS_BLOCKED, ""); - - if (rsp.payload().equalsIgnoreCase("true")) { - error("Index is blocked"); - } - } - - @GraphState(name = REPARTITION, next = REPARTITION_WAIT) - public Long repartition() throws Exception { - return 
indexOutbox.sendAsync(IndexMqEndpoints.INDEX_REPARTITION, ""); - } - - @GraphState(name = REPARTITION_WAIT, next = REINDEX, resume = ResumeBehavior.RETRY) - public void repartitionReply(Long id) throws Exception { - var rsp = indexOutbox.waitResponse(id); - - if (rsp.state() != MqMessageState.OK) { - error("Repartition failed"); - } - } - - @GraphState(name = REINDEX, next = REINDEX_WAIT) - public Long reindex() throws Exception { - return indexOutbox.sendAsync(IndexMqEndpoints.INDEX_REINDEX, ""); - } - - @GraphState(name = REINDEX_WAIT, next = END, resume = ResumeBehavior.RETRY) - public void reindexReply(Long id) throws Exception { - var rsp = indexOutbox.waitResponse(id); - - if (rsp.state() != MqMessageState.OK) { - error("Repartition failed"); - } - } - -} diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ControlProcess.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/Actor.java similarity index 83% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ControlProcess.java rename to code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/Actor.java index a09ee9e9..dcced17e 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ControlProcess.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/Actor.java @@ -1,7 +1,6 @@ package nu.marginalia.control.model; -public enum ControlProcess { - REPARTITION_REINDEX, +public enum Actor { RECONVERT_LOAD, CONVERTER_MONITOR, LOADER_MONITOR, diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ControlProcessState.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ActorRunState.java similarity index 83% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ControlProcessState.java 
rename to code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ActorRunState.java index a7324164..69903ef0 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ControlProcessState.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ActorRunState.java @@ -1,6 +1,6 @@ package nu.marginalia.control.model; -public record ControlProcessState(String name, String state, boolean terminal) { +public record ActorRunState(String name, String state, boolean terminal) { public String stateIcon() { if (terminal) { return "\uD83D\uDE34"; diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ActorState.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ActorState.java new file mode 100644 index 00000000..676f3ed2 --- /dev/null +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ActorState.java @@ -0,0 +1,19 @@ +package nu.marginalia.control.model; + +import nu.marginalia.mqsm.graph.GraphState; + +import java.util.Arrays; +import java.util.List; +import java.util.stream.Stream; + +public record ActorState(String name, + boolean current, + List transitions, + String description) { + public ActorState(GraphState gs, boolean current) { + this(gs.name(), current, toTransitions(gs.next(), gs.transitions()), gs.description()); + } + private static List toTransitions(String next, String[] transitions) { + return Stream.concat(Stream.of(next), Arrays.stream(transitions)).distinct().toList(); + } +} diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ActorStateGraph.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ActorStateGraph.java new file mode 100644 index 00000000..a9d7b783 --- /dev/null +++ 
b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ActorStateGraph.java @@ -0,0 +1,51 @@ +package nu.marginalia.control.model; + +import nu.marginalia.mqsm.graph.AbstractStateGraph; +import nu.marginalia.mqsm.graph.GraphState; +import nu.marginalia.mqsm.state.MachineState; + +import java.util.*; +import java.util.stream.Collectors; + +public record ActorStateGraph(List states) { + + public ActorStateGraph(AbstractStateGraph graph, MachineState currentState) { + this(getStateList(graph, currentState)); + } + + private static List getStateList( + AbstractStateGraph graph, + MachineState currentState) + { + Map declaredStates = graph.declaredStates().stream().collect(Collectors.toMap(GraphState::name, gs -> gs)); + Set seenStates = new HashSet<>(declaredStates.size()); + LinkedList edge = new LinkedList<>(); + + List statesList = new ArrayList<>(declaredStates.size()); + + edge.add(declaredStates.get("INITIAL")); + + while (!edge.isEmpty()) { + var first = edge.removeFirst(); + if (first == null || !seenStates.add(first)) { + continue; + } + statesList.add(new ActorState(first, currentState.name().equals(first.name()))); + + edge.add(declaredStates.get(first.next())); + + for (var transition : first.transitions()) { + edge.add(declaredStates.get(transition)); + } + } + + if (!declaredStates.containsKey("ERROR")) { + statesList.add(new ActorState("ERROR", currentState.name().equals("ERROR"), List.of(), "Terminal error state")); + } + if (!declaredStates.containsKey("END")) { + statesList.add(new ActorState("END", currentState.name().equals("END"), List.of(), "The machine terminated successfully")); + } + + return statesList; + } +} diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlFsmService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlActorService.java similarity index 50% rename from 
code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlFsmService.java rename to code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlActorService.java index 24e5fd51..d39f9d4f 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlFsmService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlActorService.java @@ -2,44 +2,50 @@ package nu.marginalia.control.svc; import com.google.inject.Inject; import com.google.inject.Singleton; -import nu.marginalia.control.fsm.ControlFSMs; -import nu.marginalia.control.fsm.task.ReconvertAndLoadFSM; -import nu.marginalia.control.model.ControlProcess; -import nu.marginalia.control.model.ControlProcessState; +import nu.marginalia.control.actor.ControlActors; +import nu.marginalia.control.actor.task.ReconvertAndLoadActor; +import nu.marginalia.control.model.Actor; +import nu.marginalia.control.model.ActorRunState; +import nu.marginalia.control.model.ActorStateGraph; import nu.marginalia.db.storage.model.FileStorageId; import nu.marginalia.mqsm.state.MachineState; import spark.Request; import spark.Response; -import java.util.List; import java.util.Map; @Singleton -public class ControlFsmService { - private final ControlFSMs controlFSMs; +public class ControlActorService { + private final ControlActors controlActors; @Inject - public ControlFsmService(ControlFSMs controlFSMs) { - this.controlFSMs = controlFSMs; + public ControlActorService(ControlActors controlActors) { + this.controlActors = controlActors; + } + + public Object getActorStateGraph(Actor actor) { + var currentState = controlActors.getActorStates().get(actor); + + return new ActorStateGraph(controlActors.getActorDefinition(actor), currentState); } public Object startFsm(Request req, Response rsp) throws Exception { - controlFSMs.start( - ControlProcess.valueOf(req.params("fsm").toUpperCase()) + 
controlActors.start( + Actor.valueOf(req.params("fsm").toUpperCase()) ); return ""; } public Object stopFsm(Request req, Response rsp) throws Exception { - controlFSMs.stop( - ControlProcess.valueOf(req.params("fsm").toUpperCase()) + controlActors.stop( + Actor.valueOf(req.params("fsm").toUpperCase()) ); return ""; } public Object triggerProcessing(Request request, Response response) throws Exception { - controlFSMs.start( - ControlProcess.RECONVERT_LOAD, + controlActors.start( + Actor.RECONVERT_LOAD, FileStorageId.of(Integer.parseInt(request.params("fid"))) ); return ""; @@ -49,24 +55,25 @@ public class ControlFsmService { var fid = FileStorageId.of(Integer.parseInt(request.params("fid"))); // Start the FSM from the intermediate state that triggers the load - controlFSMs.startFrom( - ControlProcess.RECONVERT_LOAD, - ReconvertAndLoadFSM.LOAD, - new ReconvertAndLoadFSM.Message(null, fid, 0L, 0L) + controlActors.startFrom( + Actor.RECONVERT_LOAD, + ReconvertAndLoadActor.LOAD, + new ReconvertAndLoadActor.Message(null, fid, 0L, 0L) ); return ""; } - public Object getFsmStates() { - return controlFSMs.getMachineStates().entrySet().stream().sorted(Map.Entry.comparingByKey()).map(e -> { + public Object getActorStates() { + return controlActors.getActorStates().entrySet().stream().sorted(Map.Entry.comparingByKey()).map(e -> { final MachineState state = e.getValue(); final String machineName = e.getKey().name(); final String stateName = state.name(); final boolean terminal = state.isFinal(); - return new ControlProcessState(machineName, stateName, terminal); + return new ActorRunState(machineName, stateName, terminal); }).toList(); } -} + +} \ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueViewService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueViewService.java index 439b1c2f..f52ba3a1 100644 --- 
a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueViewService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueViewService.java @@ -3,7 +3,9 @@ package nu.marginalia.control.svc; import com.google.inject.Inject; import com.google.inject.Singleton; import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.control.model.Actor; import nu.marginalia.control.model.MessageQueueEntry; +import nu.marginalia.mqsm.graph.AbstractStateGraph; import java.sql.SQLException; import java.util.ArrayList; @@ -54,4 +56,74 @@ public class MessageQueueViewService { } } + public MessageQueueEntry getMessage(long id) { + try (var conn = dataSource.getConnection(); + var query = conn.prepareStatement(""" + SELECT ID, RELATED_ID, SENDER_INBOX, RECIPIENT_INBOX, FUNCTION, PAYLOAD, OWNER_INSTANCE, OWNER_TICK, STATE, CREATED_TIME, UPDATED_TIME, TTL + FROM MESSAGE_QUEUE + WHERE ID=? + """)) { + + query.setLong(1, id); + + var rs = query.executeQuery(); + if (rs.next()) { + return new MessageQueueEntry( + rs.getLong("ID"), + rs.getLong("RELATED_ID"), + rs.getString("SENDER_INBOX"), + rs.getString("RECIPIENT_INBOX"), + rs.getString("FUNCTION"), + rs.getString("PAYLOAD"), + rs.getString("OWNER_INSTANCE"), + rs.getLong("OWNER_TICK"), + rs.getString("STATE"), + rs.getTimestamp("CREATED_TIME").toLocalDateTime().toLocalTime().toString(), + rs.getTimestamp("UPDATED_TIME").toLocalDateTime().toLocalTime().toString(), + rs.getInt("TTL") + ); + } + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + return null; + } + + public Object getLastEntriesForInbox(String inbox, int n) { + try (var conn = dataSource.getConnection(); + var query = conn.prepareStatement(""" + SELECT ID, RELATED_ID, SENDER_INBOX, RECIPIENT_INBOX, FUNCTION, PAYLOAD, OWNER_INSTANCE, OWNER_TICK, STATE, CREATED_TIME, UPDATED_TIME, TTL + FROM MESSAGE_QUEUE + WHERE RECIPIENT_INBOX=? + ORDER BY ID DESC + LIMIT ? 
+ """)) { + + query.setString(1, inbox); + query.setInt(2, n); + List entries = new ArrayList<>(n); + var rs = query.executeQuery(); + while (rs.next()) { + entries.add(new MessageQueueEntry( + rs.getLong("ID"), + rs.getLong("RELATED_ID"), + rs.getString("SENDER_INBOX"), + rs.getString("RECIPIENT_INBOX"), + rs.getString("FUNCTION"), + rs.getString("PAYLOAD"), + rs.getString("OWNER_INSTANCE"), + rs.getLong("OWNER_TICK"), + rs.getString("STATE"), + rs.getTimestamp("CREATED_TIME").toLocalDateTime().toLocalTime().toString(), + rs.getTimestamp("UPDATED_TIME").toLocalDateTime().toLocalTime().toString(), + rs.getInt("TTL") + )); + } + return entries; + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + } } diff --git a/code/services-satellite/control-service/src/main/resources/static/control/style.css b/code/services-satellite/control-service/src/main/resources/static/control/style.css index d3a2aacd..e4be767f 100644 --- a/code/services-satellite/control-service/src/main/resources/static/control/style.css +++ b/code/services-satellite/control-service/src/main/resources/static/control/style.css @@ -8,6 +8,19 @@ body { grid-template-areas: "left right"; } +.toggle-switch-off { + border-left: 5px solid #f00; + width: 8ch; +} +.toggle-switch-on { + border-right: 5px solid #080; + width: 8ch; +} +.toggle-switch-active { + border-left: 5px solid #00f; + border-right: 5px solid #00f; + width: 8ch; +} #services .missing { color: #800; } @@ -54,4 +67,10 @@ nav ul li a.current { body > section { grid-area: right; +} + +#state-graph .current-state td:first-of-type { + border-right: 1em solid #000; + font-weight: bold; + border-color: #000; } \ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/actor-details.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/actor-details.hdb new file mode 100644 index 00000000..d3d807e6 --- /dev/null +++ 
b/code/services-satellite/control-service/src/main/resources/templates/control/actor-details.hdb @@ -0,0 +1,22 @@ + + + + Control Service + + + + + {{> control/partials/nav}} +
    +

    {{actor}}

    + {{> control/partials/actor-state-graph}} + {{> control/partials/message-queue-table}} +
    + + + + \ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/processes.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/actors.hdb similarity index 83% rename from code/services-satellite/control-service/src/main/resources/templates/control/processes.hdb rename to code/services-satellite/control-service/src/main/resources/templates/control/actors.hdb index 114b340d..9bb0bfd0 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/processes.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/actors.hdb @@ -9,14 +9,14 @@ {{> control/partials/nav}}
    {{> control/partials/processes-table}} - {{> control/partials/fsm-table}} + {{> control/partials/actors-table}} {{> control/partials/message-queue-table}}
    \ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/partials/actor-state-graph.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/partials/actor-state-graph.hdb new file mode 100644 index 00000000..dece5f62 --- /dev/null +++ b/code/services-satellite/control-service/src/main/resources/templates/control/partials/actor-state-graph.hdb @@ -0,0 +1,16 @@ +

    Actor State Graph

    +
    + + + + + + + {{#each state-graph.states}} + + + + + + {{/each}} +
    StateTransitionsDescription
    {{name}}{{#each transitions}} {{.}} {{/each}}{{description}}
    diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/partials/actors-table.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/partials/actors-table.hdb new file mode 100644 index 00000000..9fe27aa6 --- /dev/null +++ b/code/services-satellite/control-service/src/main/resources/templates/control/partials/actors-table.hdb @@ -0,0 +1,50 @@ +

    Actors

    + + + + + + + {{#each actors}} + + + + + + {{/each}} +
    ActorStateAction
    {{name}}{{stateIcon}} {{state}} + {{#unless terminal}} +
    + +
    + {{/unless}} + {{#if terminal}} +
    + +
    + {{/if}} + +
    + + \ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/partials/fsm-table.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/partials/fsm-table.hdb deleted file mode 100644 index c7b66e9a..00000000 --- a/code/services-satellite/control-service/src/main/resources/templates/control/partials/fsm-table.hdb +++ /dev/null @@ -1,23 +0,0 @@ -

    FSMs

    - - - - - - - {{#each fsms}} - - - - - - {{/each}} -
    FSMStateAction
    {{name}}{{stateIcon}} {{state}} - {{#unless terminal}} -
    - {{/unless}} - {{#if terminal}} -
    - {{/if}} - -
    \ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/partials/message-queue-table.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/partials/message-queue-table.hdb index cf584ab2..5c3397f4 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/partials/message-queue-table.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/partials/message-queue-table.hdb @@ -12,7 +12,7 @@ {{#each messages}} {{stateCode}} {{state}} - {{id}} + {{id}} {{recipientInbox}} {{function}} @@ -29,4 +29,62 @@ {{updatedTime}} {{/each}} - \ No newline at end of file + + + +

    Edit Message

    +
    +
    + + + + + + + + + + + + +
    + +
    +
    + +
    +
    +
    +
    + + \ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/partials/nav.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/partials/nav.hdb index 974502d5..40610960 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/partials/nav.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/partials/nav.hdb @@ -2,7 +2,7 @@ \ No newline at end of file From f91d92cccbf3e3a5c0065fdd715c87cb59b2ae15 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 20 Jul 2023 21:05:16 +0200 Subject: [PATCH 065/157] (crawler) WIP --- .../mqapi/crawling/CrawlRequest.java | 4 +- .../db/storage/FileStorageService.java | 53 ++++- .../db/storage/model/FileStorageId.java | 3 + .../resources/sql/current/13-file-storage.sql | 8 + .../crawling/io/CrawledDomainReader.java | 1 - .../crawling/io/CrawledDomainWriter.java | 31 +-- .../crawling/io/CrawlerOutputFile.java | 53 +++++ .../crawling/model/CrawledDocument.java | 2 + .../model/spec/CrawlingSpecification.java | 6 +- .../marginalia/converting/ConverterMain.java | 2 +- .../converting/ConvertingIntegrationTest.java | 3 +- code/processes/crawling-process/build.gradle | 3 + .../nu/marginalia/crawl/CrawlLimiter.java | 72 ++++++ .../java/nu/marginalia/crawl/CrawlerMain.java | 223 +++++++++++++----- .../nu/marginalia/crawl/CrawlerModule.java | 24 ++ .../crawl/retreival/CrawlDataReference.java | 123 ++++++++++ .../crawl/retreival/CrawlerRetreiver.java | 149 +++++++++++- .../crawl/retreival/DomainCrawlFrontier.java | 8 +- .../crawl/retreival/fetcher/HttpFetcher.java | 2 +- .../retreival/fetcher/HttpFetcherImpl.java | 35 ++- .../marginalia/crawling/HttpFetcherTest.java | 4 +- .../retreival/CrawlerMockFetcherTest.java | 9 +- .../retreival/CrawlerRetreiverTest.java | 35 +++ .../nu/marginalia/control/ControlService.java | 2 + .../control/actor/ControlActors.java | 12 +- 
.../monitor/AbstractProcessSpawnerActor.java | 19 +- .../actor/monitor/CrawlerMonitorActor.java | 25 ++ .../control/actor/task/CrawlActor.java | 171 ++++++++++++++ .../actor/task/ReconvertAndLoadActor.java | 2 + .../control/actor/task/RecrawlActor.java | 185 +++++++++++++++ .../nu/marginalia/control/model/Actor.java | 3 + .../control/model/FileStorageWithActions.java | 7 + .../control/svc/ControlActorService.java | 22 +- .../control/svc/ProcessOutboxFactory.java | 4 + .../resources/templates/control/storage.hdb | 10 + .../crawl/CrawlJobSpecWriterTest.java | 6 +- run/env/service.env | 3 +- 37 files changed, 1186 insertions(+), 138 deletions(-) create mode 100644 code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java create mode 100644 code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java create mode 100644 code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerModule.java create mode 100644 code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java create mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/CrawlerMonitorActor.java create mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlActor.java create mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/RecrawlActor.java diff --git a/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/crawling/CrawlRequest.java b/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/crawling/CrawlRequest.java index 5aaecc5d..16cdc6f3 100644 --- a/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/crawling/CrawlRequest.java +++ b/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/crawling/CrawlRequest.java @@ -6,6 +6,6 @@ import nu.marginalia.db.storage.model.FileStorageId; /** A request to start a crawl */ @AllArgsConstructor public 
class CrawlRequest { - FileStorageId specStorage; - FileStorageId crawlStorage; + public FileStorageId specStorage; + public FileStorageId crawlStorage; } diff --git a/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageService.java b/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageService.java index 7ed94a46..334643b1 100644 --- a/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageService.java +++ b/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageService.java @@ -11,6 +11,8 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.attribute.PosixFilePermissions; import java.sql.SQLException; +import java.util.ArrayList; +import java.util.List; import java.util.Optional; /** Manages file storage for processes and services @@ -63,6 +65,49 @@ public class FileStorageService { return null; } + public void relateFileStorages(FileStorageId source, FileStorageId target) { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + INSERT INTO FILE_STORAGE_RELATION(SOURCE_ID, TARGET_ID) VALUES (?, ?) + """)) { + stmt.setLong(1, source.id()); + stmt.setLong(2, target.id()); + stmt.executeUpdate(); + } catch (SQLException e) { + throw new RuntimeException(e); + } + } + + public List getSourceFromStorage(FileStorage storage) throws SQLException { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT SOURCE_ID FROM FILE_STORAGE_RELATION WHERE TARGET_ID = ? + """)) { + stmt.setLong(1, storage.id().id()); + var rs = stmt.executeQuery(); + List ret = new ArrayList<>(); + while (rs.next()) { + ret.add(getStorage(new FileStorageId(rs.getLong(1)))); + } + return ret; + } + } + + public List getTargetFromStorage(FileStorage storage) throws SQLException { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT TARGET_ID FROM FILE_STORAGE_RELATION WHERE SOURCE_ID = ? 
+ """)) { + stmt.setLong(1, storage.id().id()); + var rs = stmt.executeQuery(); + List ret = new ArrayList<>(); + while (rs.next()) { + ret.add(getStorage(new FileStorageId(rs.getLong(1)))); + } + return ret; + } + } + /** @return the storage base with the given type, or null if it does not exist */ public FileStorageBase getStorageBase(FileStorageBaseType type) throws SQLException { try (var conn = dataSource.getConnection(); @@ -153,13 +198,7 @@ public class FileStorageService { var rs = query.executeQuery(); if (rs.next()) { - return new FileStorage( - new FileStorageId(rs.getLong("ID")), - base, - type, - tempDir.toString(), - description - ); + return getStorage(new FileStorageId(rs.getLong("ID"))); } } diff --git a/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageId.java b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageId.java index 3d6331e3..a89ad9f8 100644 --- a/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageId.java +++ b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageId.java @@ -1,6 +1,9 @@ package nu.marginalia.db.storage.model; public record FileStorageId(long id) { + public static FileStorageId parse(String str) { + return new FileStorageId(Long.parseLong(str)); + } public static FileStorageId of(int storageId) { return new FileStorageId(storageId); } diff --git a/code/common/db/src/main/resources/sql/current/13-file-storage.sql b/code/common/db/src/main/resources/sql/current/13-file-storage.sql index 763f39a0..b2063fc8 100644 --- a/code/common/db/src/main/resources/sql/current/13-file-storage.sql +++ b/code/common/db/src/main/resources/sql/current/13-file-storage.sql @@ -23,6 +23,14 @@ CREATE TABLE IF NOT EXISTS FILE_STORAGE ( CHARACTER SET utf8mb4 COLLATE utf8mb4_bin; +CREATE TABLE IF NOT EXISTS FILE_STORAGE_RELATION ( + SOURCE_ID BIGINT NOT NULL, + TARGET_ID BIGINT NOT NULL, + CONSTRAINT CONS UNIQUE (SOURCE_ID, TARGET_ID), + FOREIGN KEY (SOURCE_ID) 
REFERENCES FILE_STORAGE(ID) ON DELETE CASCADE, + FOREIGN KEY (TARGET_ID) REFERENCES FILE_STORAGE(ID) ON DELETE CASCADE +); + CREATE VIEW FILE_STORAGE_VIEW AS SELECT CONCAT(BASE.PATH, '/', STORAGE.PATH) AS PATH, diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java index 9c293af7..67b95484 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java @@ -64,7 +64,6 @@ public class CrawledDomainReader { return Optional.of(read(path)); } catch (Exception ex) { - logger.warn("Failed to read domain " + path, ex); return Optional.empty(); } } diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainWriter.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainWriter.java index 51ffab18..83582212 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainWriter.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainWriter.java @@ -14,12 +14,15 @@ import java.io.OutputStreamWriter; import java.io.Writer; import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.StandardCopyOption; +import java.nio.file.StandardOpenOption; public class CrawledDomainWriter implements AutoCloseable { private final Path outputDir; private final Gson gson = GsonFactory.get(); private static final Logger logger = LoggerFactory.getLogger(CrawledDomainWriter.class); private final Writer writer; + private final Path tmpFile; private final Path outputFile; public CrawledDomainWriter(Path outputDir, String name, String id) throws IOException { @@ -29,8 +32,10 @@ public class CrawledDomainWriter 
implements AutoCloseable { throw new IllegalArgumentException("Output dir " + outputDir + " does not exist"); } + tmpFile = getOutputFile(id, name + "_tmp"); outputFile = getOutputFile(id, name); - writer = new OutputStreamWriter(new ZstdOutputStream(new BufferedOutputStream(Files.newOutputStream(outputFile)))); + writer = new OutputStreamWriter(new ZstdOutputStream(new BufferedOutputStream(Files.newOutputStream(tmpFile, + StandardOpenOption.CREATE, StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING)))); } public Path getOutputFile() { @@ -46,32 +51,12 @@ public class CrawledDomainWriter implements AutoCloseable { } private Path getOutputFile(String id, String name) throws IOException { - String first = id.substring(0, 2); - String second = id.substring(2, 4); - - Path destDir = outputDir.resolve(first).resolve(second); - if (!Files.exists(destDir)) { - Files.createDirectories(destDir); - } - return destDir.resolve(id + "-" + filesystemSafeName(name) + ".zstd"); - } - - private String filesystemSafeName(String name) { - StringBuilder nameSaneBuilder = new StringBuilder(); - - name.chars() - .map(Character::toLowerCase) - .map(c -> (c & ~0x7F) == 0 ? c : 'X') - .map(c -> (Character.isDigit(c) || Character.isAlphabetic(c) || c == '.') ? 
c : 'X') - .limit(128) - .forEach(c -> nameSaneBuilder.append((char) c)); - - return nameSaneBuilder.toString(); - + return CrawlerOutputFile.createOutputPath(outputDir, id, name); } @Override public void close() throws IOException { + Files.move(tmpFile, outputFile, StandardCopyOption.REPLACE_EXISTING); writer.close(); } } diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java new file mode 100644 index 00000000..6cf5857f --- /dev/null +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java @@ -0,0 +1,53 @@ +package nu.marginalia.crawling.io; + +import nu.marginalia.crawling.model.spec.CrawlingSpecification; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +public class CrawlerOutputFile { + + public static Path getOutputFile(Path base, CrawlingSpecification spec) { + return getOutputFile(base, spec.id, spec.domain); + } + + + /** Return the Path to a file for the given id and name */ + public static Path getOutputFile(Path base, String id, String name) { + String first = id.substring(0, 2); + String second = id.substring(2, 4); + + Path destDir = base.resolve(first).resolve(second); + return destDir.resolve(id + "-" + filesystemSafeName(name) + ".zstd"); + } + + /** Return the Path to a file for the given id and name, creating the prerequisite + * directory structure as necessary. 
*/ + public static Path createOutputPath(Path base, String id, String name) throws IOException { + String first = id.substring(0, 2); + String second = id.substring(2, 4); + + Path destDir = base.resolve(first).resolve(second); + if (!Files.exists(destDir)) { + Files.createDirectories(destDir); + } + return destDir.resolve(id + "-" + filesystemSafeName(name) + ".zstd"); + } + + + private static String filesystemSafeName(String name) { + StringBuilder nameSaneBuilder = new StringBuilder(); + + name.chars() + .map(Character::toLowerCase) + .map(c -> (c & ~0x7F) == 0 ? c : 'X') + .map(c -> (Character.isDigit(c) || Character.isAlphabetic(c) || c == '.') ? c : 'X') + .limit(128) + .forEach(c -> nameSaneBuilder.append((char) c)); + + return nameSaneBuilder.toString(); + + } + +} diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java index 004408eb..0066ddf2 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java @@ -27,6 +27,8 @@ public class CrawledDocument implements SerializableCrawlData { public String canonicalUrl; public String redirectUrl; + public String recrawlState; + public static final String SERIAL_IDENTIFIER = "// DOCUMENT"; @Override public String getSerialIdentifier() { diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/spec/CrawlingSpecification.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/spec/CrawlingSpecification.java index 47ecf921..f6001166 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/spec/CrawlingSpecification.java +++ 
b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/spec/CrawlingSpecification.java @@ -3,10 +3,12 @@ package nu.marginalia.crawling.model.spec; import lombok.AllArgsConstructor; import lombok.Builder; import lombok.NoArgsConstructor; +import lombok.With; +import nu.marginalia.crawling.model.CrawledDomain; import java.util.List; -@AllArgsConstructor @NoArgsConstructor @Builder +@AllArgsConstructor @NoArgsConstructor @Builder @With public class CrawlingSpecification { public String id; @@ -16,6 +18,8 @@ public class CrawlingSpecification { public String domain; public List urls; + public CrawledDomain oldData; + @Override public String toString() { return String.format(getClass().getSimpleName() + "[" + id + "/" + domain + ": " + crawlDepth + "[ " + urls.size() + "]"); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java index 5488a6c2..55c022ba 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java @@ -138,7 +138,7 @@ public class ConverterMain { // Advance the progress bar to the current position if this is a resumption processedDomains.set(processLog.countFinishedJobs()); - heartbeat.setProgress(processedDomains.incrementAndGet() / (double) totalDomains); + heartbeat.setProgress(processedDomains.get() / (double) totalDomains); for (var domain : plan.domainsIterable(id -> !processLog.isJobFinished(id))) { diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java index 5b78ac9e..67aa5299 100644 --- 
a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java @@ -113,7 +113,8 @@ public class ConvertingIntegrationTest { BigString.encode(readClassPathFile(p.toString())), Double.toString(Math.random()), "https://memex.marginalia.nu/" + file, - null + null, + "" ); docs.add(doc); } diff --git a/code/processes/crawling-process/build.gradle b/code/processes/crawling-process/build.gradle index b62b3a68..48068620 100644 --- a/code/processes/crawling-process/build.gradle +++ b/code/processes/crawling-process/build.gradle @@ -27,9 +27,12 @@ dependencies { implementation project(':code:common:service') implementation project(':code:libraries:big-string') implementation project(':code:api:index-api') + implementation project(':code:api:process-mqapi') implementation project(':code:common:service-discovery') implementation project(':code:common:service-client') + implementation project(':code:common:message-queue') implementation project(':code:libraries:language-processing') + implementation project(':code:libraries:easy-lsh') implementation project(':code:process-models:crawling-model') implementation project(':code:process-models:converting-model') diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java new file mode 100644 index 00000000..29f02e4f --- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java @@ -0,0 +1,72 @@ +package nu.marginalia.crawl; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.concurrent.Semaphore; + +public class CrawlLimiter { + public static final int maxPoolSize = Integer.getInteger("crawler.pool-size", 512); + + // We'll round up to this size when we're crawling a 
new domain to prevent + // too many concurrent connections + public static final int minCrawlDataSizeKb = 128; // 128 Kb + + // The largest size on disk where we'll permit a refresh crawl + // (these files easily grow into the gigabytes, we don't want that in RAM) + public static final int maxRefreshableCrawlDataSizeKBytes = 1024*128; // 128 Mb + + // This limits how many concurrent crawl tasks we can have running at once + // based on their size on disk. The on-disk size is compressed, and the + // in-ram size is partially compressed (i.e. only the document body); so + // maybe a fair estimate is something like 2-4x this figure for RAM usage + // + public static final int maxConcurrentCrawlTaskSizeKb = 512*1024; // 512 Mb + + static { + // Sanity check; if this is false we'll get a deadlock on taskSemRAM + assert maxConcurrentCrawlTaskSizeKb >= maxRefreshableCrawlDataSizeKBytes + : "maxConcurrentCrawlTaskSizeKb must be larger than maxRefreshableCrawlDataSizeKBytes"; + } + + public record CrawlTaskLimits(Path refreshPath, boolean isRefreshable, int taskSize) {} + + // We use two semaphores to keep track of the number of concurrent crawls; + // first a RAM semaphore to limit the amount of RAM used by refresh crawls. 
+ // then a count semaphore to limit the number of concurrent threads (this keeps the connection count manageable) + private final Semaphore taskSemRAM = new Semaphore(maxConcurrentCrawlTaskSizeKb); + private final Semaphore taskSemCount = new Semaphore(maxPoolSize); + + + public CrawlTaskLimits getTaskLimits(Path fileName) { + long size; + + try { + size = Math.max(minCrawlDataSizeKb, Files.size(fileName) / 1024); + } catch (IOException ex) { + // If we can't read the file, we'll assume it's small since we won't be able to read it later for the refresh either + return new CrawlTaskLimits(null,false, minCrawlDataSizeKb); + } + + // We'll only permit refresh crawls if the file is small enough + boolean isRefreshable = size < maxRefreshableCrawlDataSizeKBytes; + + // We'll truncate this down to maxRefreshableCrawlDataSizeKBytes to ensure + // it's possible to acquire the RAM semaphore + int effectiveSize = (int) Math.min(maxRefreshableCrawlDataSizeKBytes, size); + + return new CrawlTaskLimits(fileName, isRefreshable, effectiveSize); + } + + + public void acquire(CrawlTaskLimits properties) throws InterruptedException { + // It's very important that we acquire the RAM semaphore first to avoid a deadlock + taskSemRAM.acquire(properties.taskSize); + taskSemCount.acquire(1); + } + + public void release(CrawlTaskLimits properties) { + taskSemCount.release(1); + taskSemRAM.release(properties.taskSize); + } +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java index a0a3f8b7..3dd096cb 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java @@ -1,13 +1,23 @@ package nu.marginalia.crawl; -import nu.marginalia.ProcessConfiguration; +import com.google.gson.Gson; +import com.google.inject.Guice; +import 
com.google.inject.Inject; +import com.google.inject.Injector; import nu.marginalia.UserAgent; import nu.marginalia.WmsaHome; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; +import nu.marginalia.crawling.io.CrawledDomainReader; +import nu.marginalia.crawling.io.CrawlerOutputFile; +import nu.marginalia.crawling.model.CrawledDomain; +import nu.marginalia.db.storage.FileStorageService; +import nu.marginalia.mq.MessageQueueFactory; +import nu.marginalia.mq.MqMessage; +import nu.marginalia.mq.inbox.MqInboxResponse; +import nu.marginalia.mq.inbox.MqSingleShotInbox; import nu.marginalia.process.control.ProcessHeartbeat; import nu.marginalia.process.log.WorkLog; import nu.marginalia.service.module.DatabaseModule; -import plan.CrawlPlanLoader; import plan.CrawlPlan; import nu.marginalia.crawling.io.CrawledDomainWriter; import nu.marginalia.crawling.model.spec.CrawlingSpecification; @@ -19,49 +29,63 @@ import okhttp3.internal.Util; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; import java.nio.file.Path; +import java.sql.SQLException; import java.util.HashSet; +import java.util.Optional; import java.util.Set; import java.util.UUID; import java.util.concurrent.*; import java.util.concurrent.atomic.AtomicInteger; +import static nu.marginalia.mqapi.ProcessInboxNames.CRAWLER_INBOX; + public class CrawlerMain implements AutoCloseable { private final Logger logger = LoggerFactory.getLogger(getClass()); - private final CrawlPlan plan; - private final Path crawlDataDir; - - private final WorkLog workLog; + private Path crawlDataDir; + private WorkLog workLog; + private final ProcessHeartbeat heartbeat; private final ConnectionPool connectionPool = new ConnectionPool(5, 10, TimeUnit.SECONDS); private final Dispatcher dispatcher = new Dispatcher(new ThreadPoolExecutor(0, Integer.MAX_VALUE, 5, TimeUnit.SECONDS, new SynchronousQueue<>(), Util.threadFactory("OkHttp Dispatcher", true))); private final UserAgent userAgent; + private 
final MessageQueueFactory messageQueueFactory; + private final FileStorageService fileStorageService; + private final Gson gson; private final ThreadPoolExecutor pool; - final int poolSize = Integer.getInteger("crawler.pool-size", 512); - final int poolQueueSize = 32; + public final CrawlLimiter crawlLimiter = new CrawlLimiter(); private final Set processedIds = new HashSet<>(); - AbortMonitor abortMonitor = AbortMonitor.getInstance(); - Semaphore taskSem = new Semaphore(poolSize); + final AbortMonitor abortMonitor = AbortMonitor.getInstance(); - private static ProcessHeartbeat heartbeat; + volatile int totalTasks; + final AtomicInteger tasksDone = new AtomicInteger(0); - public CrawlerMain(CrawlPlan plan) throws Exception { - this.plan = plan; - this.userAgent = WmsaHome.getUserAgent(); + @Inject + public CrawlerMain(UserAgent userAgent, + ProcessHeartbeat heartbeat, + MessageQueueFactory messageQueueFactory, + FileStorageService fileStorageService, + Gson gson) { + this.heartbeat = heartbeat; + this.userAgent = userAgent; + this.messageQueueFactory = messageQueueFactory; + this.fileStorageService = fileStorageService; + this.gson = gson; - // Ensure that the user agent is set for Java's HTTP requests - - BlockingQueue queue = new LinkedBlockingQueue<>(poolQueueSize); - pool = new ThreadPoolExecutor(poolSize/128, poolSize, 5, TimeUnit.MINUTES, queue); // maybe need to set -Xss for JVM to deal with this? - - workLog = plan.createCrawlWorkLog(); - crawlDataDir = plan.crawl.getDir(); + // maybe need to set -Xss for JVM to deal with this? + pool = new ThreadPoolExecutor( + CrawlLimiter.maxPoolSize /128, + CrawlLimiter.maxPoolSize, + 5, TimeUnit.MINUTES, + new LinkedBlockingQueue<>(32) + ); } public static void main(String... 
args) throws Exception { @@ -77,46 +101,65 @@ public class CrawlerMain implements AutoCloseable { System.setProperty("sun.net.client.defaultConnectTimeout", "30000"); System.setProperty("sun.net.client.defaultReadTimeout", "30000"); - if (args.length != 1) { - System.err.println("Arguments: crawl-plan.yaml"); - System.exit(0); - } - var plan = new CrawlPlanLoader().load(Path.of(args[0])); + Injector injector = Guice.createInjector( + new CrawlerModule(), + new DatabaseModule() + ); + var crawler = injector.getInstance(CrawlerMain.class); - heartbeat = new ProcessHeartbeat(new ProcessConfiguration("crawler", 0, UUID.randomUUID()), - new DatabaseModule().provideConnection()); + var instructions = crawler.fetchInstructions(); + try { + crawler.run(instructions.getPlan()); + instructions.ok(); + } + catch (Exception ex) { + System.err.println("Crawler failed"); + ex.printStackTrace(); + instructions.err(); + } - try (var crawler = new CrawlerMain(plan)) { - heartbeat.start(); - crawler.run(); - } - finally { - heartbeat.shutDown(); - } + TimeUnit.SECONDS.sleep(5); System.exit(0); } - public void run() throws InterruptedException { - // First a validation run to ensure the file is all good to parse - logger.info("Validating JSON"); - int countTotal = 0; - int countProcessed = 0; + public void run(CrawlPlan plan) throws InterruptedException, IOException { - for (var unused : plan.crawlingSpecificationIterable()) { - countTotal++; + heartbeat.start(); + try { + // First a validation run to ensure the file is all good to parse + logger.info("Validating JSON"); + + + workLog = plan.createCrawlWorkLog(); + crawlDataDir = plan.crawl.getDir(); + + int countTotal = 0; + for (var unused : plan.crawlingSpecificationIterable()) { + countTotal++; + } + totalTasks = countTotal; + + logger.info("Let's go"); + + for (var spec : plan.crawlingSpecificationIterable()) { + startCrawlTask(plan, spec); + } + + pool.shutdown(); + do { + System.out.println("Waiting for pool to terminate... 
" + pool.getActiveCount() + " remaining"); + } while (!pool.awaitTermination(60, TimeUnit.SECONDS)); } - - logger.info("Let's go"); - - for (var spec : plan.crawlingSpecificationIterable()) { - heartbeat.setProgress(countProcessed / (double) countTotal); - startCrawlTask(spec); + finally { + heartbeat.shutDown(); } } + CrawledDomainReader reader = new CrawledDomainReader(); - private void startCrawlTask(CrawlingSpecification crawlingSpecification) { + + private void startCrawlTask(CrawlPlan plan, CrawlingSpecification crawlingSpecification) { if (!processedIds.add(crawlingSpecification.id)) { @@ -132,28 +175,41 @@ public class CrawlerMain implements AutoCloseable { return; } + var limits = crawlLimiter.getTaskLimits(CrawlerOutputFile.getOutputFile(crawlDataDir, crawlingSpecification)); + try { - taskSem.acquire(); + crawlLimiter.acquire(limits); } catch (InterruptedException e) { throw new RuntimeException(e); } pool.execute(() -> { try { - fetchDomain(crawlingSpecification); + fetchDomain(crawlingSpecification, limits); + heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks); } finally { - taskSem.release(); + crawlLimiter.release(limits); } }); } - private void fetchDomain(CrawlingSpecification specification) { + + private void fetchDomain(CrawlingSpecification specification, CrawlLimiter.CrawlTaskLimits limits) { if (workLog.isJobFinished(specification.id)) return; HttpFetcher fetcher = new HttpFetcherImpl(userAgent.uaString(), dispatcher, connectionPool); + // Read the previous crawl's data for this domain, if it exists and has a reasonable size + Optional domain; + if (limits.isRefreshable()) { + domain = reader.readOptionally(limits.refreshPath()); + if (domain.isPresent()) { + specification = specification.withOldData(domain.get()); + } + } + try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id)) { var retreiver = new CrawlerRetreiver(fetcher, specification, writer::accept); @@ 
-167,6 +223,65 @@ public class CrawlerMain implements AutoCloseable { } } + private static class CrawlRequest { + private final CrawlPlan plan; + private final MqMessage message; + private final MqSingleShotInbox inbox; + + CrawlRequest(CrawlPlan plan, MqMessage message, MqSingleShotInbox inbox) { + this.plan = plan; + this.message = message; + this.inbox = inbox; + } + + public CrawlPlan getPlan() { + return plan; + } + + public void ok() { + inbox.sendResponse(message, MqInboxResponse.ok()); + } + public void err() { + inbox.sendResponse(message, MqInboxResponse.err()); + } + + } + + private CrawlRequest fetchInstructions() throws Exception { + + var inbox = messageQueueFactory.createSingleShotInbox(CRAWLER_INBOX, UUID.randomUUID()); + + var msgOpt = getMessage(inbox, nu.marginalia.mqapi.crawling.CrawlRequest.class.getSimpleName()); + var msg = msgOpt.orElseThrow(() -> new RuntimeException("No message received")); + + var request = gson.fromJson(msg.payload(), nu.marginalia.mqapi.crawling.CrawlRequest.class); + + var specData = fileStorageService.getStorage(request.specStorage); + var crawlData = fileStorageService.getStorage(request.crawlStorage); + + var plan = new CrawlPlan(specData.asPath().resolve("crawler.spec").toString(), + new CrawlPlan.WorkDir(crawlData.path(), "crawler.log"), + null); + + return new CrawlRequest(plan, msg, inbox); + } + + private Optional getMessage(MqSingleShotInbox inbox, String expectedFunction) throws SQLException, InterruptedException { + var opt = inbox.waitForMessage(30, TimeUnit.SECONDS); + if (opt.isPresent()) { + if (!opt.get().function().equals(expectedFunction)) { + throw new RuntimeException("Unexpected function: " + opt.get().function()); + } + return opt; + } + else { + var stolenMessage = inbox.stealMessage(msg -> msg.function().equals(expectedFunction)); + stolenMessage.ifPresent(mqMessage -> logger.info("Stole message {}", mqMessage)); + return stolenMessage; + } + } + + public void close() throws Exception { 
logger.info("Awaiting termination"); pool.shutdown(); @@ -176,8 +291,6 @@ public class CrawlerMain implements AutoCloseable { workLog.close(); dispatcher.executorService().shutdownNow(); - - } } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerModule.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerModule.java new file mode 100644 index 00000000..ebf6d33f --- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerModule.java @@ -0,0 +1,24 @@ +package nu.marginalia.crawl; + +import com.google.gson.Gson; +import com.google.inject.AbstractModule; +import lombok.SneakyThrows; +import nu.marginalia.ProcessConfiguration; +import nu.marginalia.UserAgent; +import nu.marginalia.WmsaHome; +import nu.marginalia.model.gson.GsonFactory; + +import java.util.UUID; + +public class CrawlerModule extends AbstractModule { + @SneakyThrows + public void configure() { + bind(Gson.class).toInstance(createGson()); + bind(UserAgent.class).toInstance(WmsaHome.getUserAgent()); + bind(ProcessConfiguration.class).toInstance(new ProcessConfiguration("crawler", 0, UUID.randomUUID())); + } + + private Gson createGson() { + return GsonFactory.get(); + } +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java new file mode 100644 index 00000000..cc827084 --- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java @@ -0,0 +1,123 @@ +package nu.marginalia.crawl.retreival; + +import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.crawling.model.CrawledDomain; +import nu.marginalia.model.EdgeUrl; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.net.URISyntaxException; +import java.util.*; +import java.util.stream.Collectors; + +/** A 
reference to a domain that has been crawled before. */ +public class CrawlDataReference { + private final Logger logger = LoggerFactory.getLogger(CrawlDataReference.class); + final Map documents; + final Map etags; + final Map lastModified; + final Set previouslyDeadUrls = new HashSet<>(); + + CrawlDataReference(CrawledDomain referenceDomain) { + + if (referenceDomain == null || referenceDomain.doc == null) { + documents = Collections.emptyMap(); + etags = Collections.emptyMap(); + lastModified = Collections.emptyMap(); + return; + } + + documents = new HashMap<>(referenceDomain.doc.size()); + etags = new HashMap<>(referenceDomain.doc.size()); + lastModified = new HashMap<>(referenceDomain.doc.size()); + + for (var doc : referenceDomain.doc) { + try { + addReference(doc); + } catch (URISyntaxException ex) { + logger.warn("Failed to add reference document {}", doc.url); + } + } + } + + private void addReference(CrawledDocument doc) throws URISyntaxException { + var url = new EdgeUrl(doc.url); + + if (doc.httpStatus == 404) { + previouslyDeadUrls.add(url); + return; + } + + if (doc.httpStatus != 200) { + return; + } + + + documents.put(url, doc); + + String headers = doc.headers; + if (headers != null) { + String[] headersLines = headers.split("\n"); + + String lastmod = null; + String etag = null; + + for (String line : headersLines) { + if (line.toLowerCase().startsWith("etag:")) { + etag = line.substring(5).trim(); + } + if (line.toLowerCase().startsWith("last-modified:")) { + lastmod = line.substring(14).trim(); + } + } + + if (lastmod != null) { + lastModified.put(url, lastmod); + } + if (etag != null) { + etags.put(url, etag); + } + } + } + + public boolean isPreviouslyDead(EdgeUrl url) { + return previouslyDeadUrls.contains(url); + } + public int size() { + return documents.size(); + } + + public String getEtag(EdgeUrl url) { + return etags.get(url); + } + + public String getLastModified(EdgeUrl url) { + return lastModified.get(url); + } + + public Map 
allDocuments() { + return documents; + } + + + public Map sample(int sampleSize) { + return documents.entrySet().stream().limit(sampleSize).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + } + + public void evict() { + documents.clear(); + etags.clear(); + lastModified.clear(); + } + + public CrawledDocument getDoc(EdgeUrl top) { + return documents.get(top); + } + + // This bit of manual housekeeping is needed to keep the memory footprint low + public void dispose(EdgeUrl url) { + documents.remove(url); + etags.remove(url); + lastModified.remove(url); + } +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index 3af0110a..52927f38 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -10,6 +10,7 @@ import nu.marginalia.crawling.model.spec.CrawlingSpecification; import nu.marginalia.link_parser.LinkParser; import nu.marginalia.crawling.model.*; import nu.marginalia.ip_blocklist.UrlBlocklist; +import nu.marginalia.lsh.EasyLSH; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import org.jsoup.Jsoup; @@ -57,6 +58,7 @@ public class CrawlerRetreiver { private final SitemapRetriever sitemapRetriever; private final DomainCrawlFrontier crawlFrontier; + private final CrawlDataReference oldCrawlData; int errorCount = 0; @@ -64,6 +66,7 @@ public class CrawlerRetreiver { CrawlingSpecification specs, Consumer writer) { this.fetcher = fetcher; + this.oldCrawlData = new CrawlDataReference(specs.oldData); id = specs.id; domain = specs.domain; @@ -73,9 +76,9 @@ public class CrawlerRetreiver { this.crawlFrontier = new DomainCrawlFrontier(new EdgeDomain(domain), specs.urls, specs.crawlDepth); sitemapRetriever = 
fetcher.createSitemapRetriever(); + // We must always crawl the index page first, this is assumed when fingerprinting the server var fst = crawlFrontier.peek(); if (fst != null) { - // Ensure the index page is always crawled var root = fst.withPathAndParam("/", null); @@ -141,6 +144,29 @@ public class CrawlerRetreiver { var robotsRules = fetcher.fetchRobotRules(crawlFrontier.peek().domain); + CrawlDataComparison comparison = compareWithOldData(robotsRules); + logger.info("Comparison result for {} : {}", domain, comparison); + + // If we have reference data, we will always grow the crawl depth a bit + if (oldCrawlData.size() > 0) { + crawlFrontier.increaseDepth(1.5); + } + + // When the reference data doesn't appear to have changed, we'll forego + // re-fetching it and just use the old data + if (comparison == CrawlDataComparison.NO_CHANGES) { + oldCrawlData.allDocuments().forEach((url, doc) -> { + if (crawlFrontier.addVisited(url)) { + doc.recrawlState = "RETAINED"; + crawledDomainWriter.accept(doc); + } + }); + + // We don't need to hold onto this in RAM anymore + oldCrawlData.evict(); + } + + downloadSitemaps(robotsRules); sniffRootDocument(); @@ -161,18 +187,31 @@ public class CrawlerRetreiver { continue; } + // Don't re-fetch links that were previously found dead as it's very unlikely that a + // 404:ing link will suddenly start working at a later point + if (oldCrawlData.isPreviouslyDead(top)) + continue; + + // Check the link filter if the endpoint should be fetched based on site-type if (!crawlFrontier.filterLink(top)) continue; + + // Check vs blocklist if (urlBlocklist.isUrlBlocked(top)) continue; + if (!isAllowedProtocol(top.proto)) continue; + + // Check if the URL is too long to insert into the DB if (top.toString().length() > 255) continue; + if (!crawlFrontier.addVisited(top)) continue; - if (fetchDocument(top, crawlDelay)) { + + if (fetchDocument(top, crawlDelay).isPresent()) { fetchedCount++; } } @@ -184,6 +223,76 @@ public class CrawlerRetreiver { 
return fetchedCount; } + private CrawlDataComparison compareWithOldData(SimpleRobotRules robotsRules) { + + int numGoodDocuments = oldCrawlData.size(); + + if (numGoodDocuments == 0) + return CrawlDataComparison.NO_OLD_DATA; + + if (numGoodDocuments < 10) + return CrawlDataComparison.SMALL_SAMPLE; + + // We fetch a sample of the data to assess how much it has changed + int sampleSize = (int) Math.min(20, 0.25 * numGoodDocuments); + Map referenceUrls = oldCrawlData.sample(sampleSize); + + int differences = 0; + + long crawlDelay = robotsRules.getCrawlDelay(); + for (var url : referenceUrls.keySet()) { + + var docMaybe = fetchDocument(url, crawlDelay); + if (docMaybe.isEmpty()) { + differences++; + continue; + } + + var newDoc = docMaybe.get(); + var referenceDoc = referenceUrls.get(url); + + // This looks like a bug but it is not, we want to compare references + // to detect if the page has bounced off etag or last-modified headers + // to avoid having to do a full content comparison + if (newDoc == referenceDoc) + continue; + + if (newDoc.httpStatus != referenceDoc.httpStatus) { + differences++; + continue; + } + + if (newDoc.documentBody == null) { + differences++; + continue; + } + + long referenceLsh = hashDoc(referenceDoc); + long newLsh = hashDoc(newDoc); + + if (EasyLSH.hammingDistance(referenceLsh, newLsh) > 5) { + differences++; + } + } + if (differences > sampleSize/4) { + return CrawlDataComparison.CHANGES_FOUND; + } + else { + return CrawlDataComparison.NO_CHANGES; + } + } + + private static final HashFunction hasher = Hashing.murmur3_128(0); + private long hashDoc(CrawledDocument doc) { + var hash = new EasyLSH(); + long val = 0; + for (var b : doc.documentBody.decode().getBytes()) { + val = val << 8 | (b & 0xFF); + hash.addUnordered(hasher.hashLong(val).asLong()); + } + return hash.get(); + } + private void downloadSitemaps(SimpleRobotRules robotsRules) { List sitemaps = robotsRules.getSitemaps(); @@ -235,7 +344,7 @@ public class CrawlerRetreiver { try 
{ logger.debug("Configuring link filter"); - var url = crawlFrontier.peek(); + var url = crawlFrontier.peek().withPathAndParam("/", null); var maybeSample = fetchUrl(url).filter(sample -> sample.httpStatus == 200); if (maybeSample.isEmpty()) @@ -273,7 +382,7 @@ public class CrawlerRetreiver { } } - private boolean fetchDocument(EdgeUrl top, long crawlDelay) { + private Optional fetchDocument(EdgeUrl top, long crawlDelay) { logger.debug("Fetching {}", top); long startTime = System.currentTimeMillis(); @@ -282,9 +391,14 @@ public class CrawlerRetreiver { if (doc.isPresent()) { var d = doc.get(); crawledDomainWriter.accept(d); + oldCrawlData.dispose(top); if (d.url != null) { - EdgeUrl.parse(d.url).ifPresent(crawlFrontier::addVisited); + // We may have redirected to a different path + EdgeUrl.parse(d.url).ifPresent(url -> { + crawlFrontier.addVisited(url); + oldCrawlData.dispose(url); + }); } if ("ERROR".equals(d.crawlerStatus) && d.httpStatus != 404) { @@ -296,7 +410,7 @@ public class CrawlerRetreiver { long crawledTime = System.currentTimeMillis() - startTime; delay(crawlDelay, crawledTime); - return doc.isPresent(); + return doc; } private boolean isAllowedProtocol(String proto) { @@ -333,7 +447,20 @@ public class CrawlerRetreiver { private CrawledDocument fetchContent(EdgeUrl top) { for (int i = 0; i < 2; i++) { try { - return fetcher.fetchContent(top); + var doc = fetcher.fetchContent(top, oldCrawlData.getEtag(top), oldCrawlData.getLastModified(top)); + + doc.recrawlState = "NEW"; + + if (doc.httpStatus == 304) { + var referenceData = oldCrawlData.getDoc(top); + if (referenceData != null) { + referenceData.recrawlState = "304/UNCHANGED"; + return referenceData; + } + } + + + return doc; } catch (RateLimitException ex) { slowDown = true; @@ -443,4 +570,12 @@ public class CrawlerRetreiver { .build(); } + + enum CrawlDataComparison { + NO_OLD_DATA, + SMALL_SAMPLE, + CHANGES_FOUND, + NO_CHANGES + }; + } diff --git 
a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java index b6e23f0c..7d5fc214 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java @@ -17,7 +17,7 @@ public class DomainCrawlFrontier { private Predicate linkFilter = url -> true; - final int depth; + private int depth; public DomainCrawlFrontier(EdgeDomain thisDomain, Collection urls, int depth) { this.thisDomain = thisDomain; @@ -32,6 +32,9 @@ public class DomainCrawlFrontier { } } + public void increaseDepth(double depthIncreaseFactor) { + depth = (int)(depth * depthIncreaseFactor); + } public void setLinkFilter(Predicate linkFilter) { this.linkFilter = linkFilter; } @@ -80,6 +83,9 @@ public class DomainCrawlFrontier { if (queue.size() + visited.size() >= depth + 100) return; + if (visited.contains(url.toString())) + return; + if (known.add(url.toString())) { queue.addLast(url); } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java index 1f630ac5..7f588783 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java @@ -18,7 +18,7 @@ public interface HttpFetcher { FetchResult probeDomain(EdgeUrl url); - CrawledDocument fetchContent(EdgeUrl url) throws RateLimitException; + CrawledDocument fetchContent(EdgeUrl url, String etag, String lastMod) throws RateLimitException; SimpleRobotRules fetchRobotRules(EdgeDomain domain); diff --git 
a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java index 55a6d296..36c8bd34 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java @@ -125,29 +125,20 @@ public class HttpFetcherImpl implements HttpFetcher { } } - private Request createHeadRequest(EdgeUrl url) { - return new Request.Builder().head().addHeader("User-agent", userAgent) - .url(url.toString()) - .addHeader("Accept-Encoding", "gzip") - .build(); - } - - private Request createGetRequest(EdgeUrl url) { - return new Request.Builder().get().addHeader("User-agent", userAgent) - .url(url.toString()) - .addHeader("Accept-Encoding", "gzip") - .build(); - - } @Override @SneakyThrows - public CrawledDocument fetchContent(EdgeUrl url) throws RateLimitException { + public CrawledDocument fetchContent(EdgeUrl url, String etag, String lastMod) throws RateLimitException { if (contentTypeLogic.isUrlLikeBinary(url)) { logger.debug("Probing suspected binary {}", url); - var head = createHeadRequest(url); + var headBuilder = new Request.Builder().head() + .addHeader("User-agent", userAgent) + .url(url.toString()) + .addHeader("Accept-Encoding", "gzip"); + + var head = headBuilder.build(); var call = client.newCall(head); try (var rsp = call.execute()) { @@ -165,7 +156,15 @@ public class HttpFetcherImpl implements HttpFetcher { } } - var get = createGetRequest(url); + var getBuilder = new Request.Builder().get(); + getBuilder.addHeader("User-agent", userAgent) + .url(url.toString()) + .addHeader("Accept-Encoding", "gzip"); + + if (etag != null) getBuilder.addHeader("If-None-Match", etag); + if (lastMod != null) getBuilder.addHeader("If-Modified-Since", lastMod); + + var get = 
getBuilder.build(); var call = client.newCall(get); try (var rsp = call.execute()) { @@ -315,7 +314,7 @@ public class HttpFetcherImpl implements HttpFetcher { private Optional fetchRobotsForProto(String proto, EdgeDomain domain) { try { var url = new EdgeUrl(proto, domain, null, "/robots.txt", null); - return Optional.of(parseRobotsTxt(fetchContent(url))); + return Optional.of(parseRobotsTxt(fetchContent(url, null, null))); } catch (Exception ex) { return Optional.empty(); diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java index f6c2f3a4..2ea9c763 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java @@ -29,14 +29,14 @@ class HttpFetcherTest { @Test void fetchUTF8() throws URISyntaxException, RateLimitException { var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler"); - var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu")); + var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), null, null); System.out.println(str.contentType); } @Test void fetchText() throws URISyntaxException, RateLimitException { var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler"); - var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt")); + var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), null, null); System.out.println(str); } } \ No newline at end of file diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java index 7462b62c..f580a123 100644 --- 
a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java @@ -33,7 +33,6 @@ public class CrawlerMockFetcherTest { Map mockData = new HashMap<>(); HttpFetcher fetcherMock = new MockFetcher(); - SitemapRetriever sitemapRetriever = new SitemapRetriever(); @AfterEach public void tearDown() { @@ -74,7 +73,7 @@ public class CrawlerMockFetcherTest { registerUrlClasspathData(new EdgeUrl("https://startrek.website/c/startrek"), "mock-crawl-data/lemmy/c_startrek.html"); registerUrlClasspathData(new EdgeUrl("https://startrek.website/post/108995"), "mock-crawl-data/lemmy/108995.html"); - new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 10, "startrek.website", new ArrayList<>()), out::add) + new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 10, "startrek.website", new ArrayList<>(), null), out::add) .withNoDelay() .fetch(); @@ -87,7 +86,7 @@ public class CrawlerMockFetcherTest { registerUrlClasspathData(new EdgeUrl("https://en.wikipedia.org/"), "mock-crawl-data/mediawiki/index.html"); - new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 10, "en.wikipedia.org", new ArrayList<>()), out::add) + new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 10, "en.wikipedia.org", new ArrayList<>(), null), out::add) .withNoDelay() .fetch(); @@ -102,7 +101,7 @@ public class CrawlerMockFetcherTest { registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/telegram-channel-to-idle-on/3501"), "mock-crawl-data/discourse/telegram.html"); registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/combined-mode-but-grid/4489"), "mock-crawl-data/discourse/grid.html"); - new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 100, "community.tt-rss.org", new ArrayList<>()), out::add) + new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 100, 
"community.tt-rss.org", new ArrayList<>(), null), out::add) .withNoDelay() .fetch(); @@ -127,7 +126,7 @@ public class CrawlerMockFetcherTest { } @Override - public CrawledDocument fetchContent(EdgeUrl url) { + public CrawledDocument fetchContent(EdgeUrl url, String etag, String lastModified) { logger.info("Fetching {}", url); if (mockData.containsKey(url)) { return mockData.get(url); diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java index 64c7e890..741c8704 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java @@ -6,12 +6,15 @@ import nu.marginalia.crawl.retreival.CrawlerRetreiver; import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.crawling.model.spec.CrawlingSpecification; import nu.marginalia.crawling.model.SerializableCrawlData; import org.junit.jupiter.api.*; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.stream.Collectors; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -95,4 +98,36 @@ class CrawlerRetreiverTest { ); } + @Test + public void testRecrawl() { + + var specs = CrawlingSpecification + .builder() + .id("whatever") + .crawlDepth(12) + .domain("www.marginalia.nu") + .urls(List.of("https://www.marginalia.nu/some-dead-link")) + .build(); + + + Map, List> data = new HashMap<>(); + + new CrawlerRetreiver(httpFetcher, specs, d -> { + data.computeIfAbsent(d.getClass(), k->new ArrayList<>()).add(d); + if (d instanceof CrawledDocument doc) { 
+ System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus); + } + }).fetch(); + + CrawledDomain domain = (CrawledDomain) data.get(CrawledDomain.class).get(0); + domain.doc = data.get(CrawledDocument.class).stream().map(CrawledDocument.class::cast).collect(Collectors.toList()); + + var newSpec = specs.withOldData(domain); + + new CrawlerRetreiver(httpFetcher, newSpec, d -> { + if (d instanceof CrawledDocument doc) { + System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus); + } + }).fetch(); + } } \ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java index 49ed3dff..82869816 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java @@ -82,6 +82,8 @@ public class ControlService extends Service { Spark.post("/public/fsms/:fsm/start", controlActorService::startFsm, redirectToProcesses); Spark.post("/public/fsms/:fsm/stop", controlActorService::stopFsm, redirectToProcesses); + Spark.post("/public/storage/:fid/crawl", controlActorService::triggerCrawling, redirectToProcesses); + Spark.post("/public/storage/:fid/recrawl", controlActorService::triggerRecrawling, redirectToProcesses); Spark.post("/public/storage/:fid/process", controlActorService::triggerProcessing, redirectToProcesses); Spark.post("/public/storage/:fid/load", controlActorService::loadProcessedData, redirectToProcesses); diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java index c470341c..bfa90be1 100644 --- 
a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java @@ -4,6 +4,8 @@ import com.google.gson.Gson; import com.google.inject.Inject; import com.google.inject.Singleton; import lombok.SneakyThrows; +import nu.marginalia.control.actor.task.CrawlActor; +import nu.marginalia.control.actor.task.RecrawlActor; import nu.marginalia.control.model.Actor; import nu.marginalia.control.actor.monitor.*; import nu.marginalia.control.actor.monitor.ConverterMonitorActor; @@ -22,6 +24,7 @@ import java.util.Map; import java.util.UUID; import java.util.stream.Collectors; +/** This class is responsible for starting and stopping the various actors in the controller service */ @Singleton public class ControlActors { private final ServiceEventLog eventLog; @@ -35,7 +38,10 @@ public class ControlActors { GsonFactory gsonFactory, BaseServiceParams baseServiceParams, ReconvertAndLoadActor reconvertAndLoadActor, + CrawlActor crawlActor, + RecrawlActor recrawlActor, ConverterMonitorActor converterMonitorFSM, + CrawlerMonitorActor crawlerMonitorActor, LoaderMonitorActor loaderMonitor, MessageQueueMonitorActor messageQueueMonitor, ProcessLivenessMonitorActor processMonitorFSM, @@ -45,9 +51,12 @@ public class ControlActors { this.eventLog = baseServiceParams.eventLog; this.gson = gsonFactory.get(); + register(Actor.CRAWL, crawlActor); + register(Actor.RECRAWL, recrawlActor); register(Actor.RECONVERT_LOAD, reconvertAndLoadActor); register(Actor.CONVERTER_MONITOR, converterMonitorFSM); register(Actor.LOADER_MONITOR, loaderMonitor); + register(Actor.CRAWLER_MONITOR, crawlerMonitorActor); register(Actor.MESSAGE_QUEUE_MONITOR, messageQueueMonitor); register(Actor.PROCESS_LIVENESS_MONITOR, processMonitorFSM); register(Actor.FILE_STORAGE_MONITOR, fileStorageMonitorActor); @@ -100,9 +109,6 @@ public class ControlActors { Map.Entry::getKey, e -> 
e.getValue().getState()) ); } - public MachineState getActorStates(Actor actor) { - return stateMachines.get(actor).getState(); - } public AbstractStateGraph getActorDefinition(Actor actor) { return actorDefinitions.get(actor); diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/AbstractProcessSpawnerActor.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/AbstractProcessSpawnerActor.java index 7b5b1e11..0f608138 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/AbstractProcessSpawnerActor.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/AbstractProcessSpawnerActor.java @@ -64,17 +64,28 @@ public class AbstractProcessSpawnerActor extends AbstractStateGraph { description = """ Monitors the inbox of the process for messages. If a message is found, transition to RUN. + The state takes an optional Integer parameter errorAttempts + that is passed to run. errorAttempts is set to zero after + a few seconds of silence. """ ) - public void monitor() throws SQLException, InterruptedException { + public void monitor(Integer errorAttempts) throws SQLException, InterruptedException { + if (errorAttempts == null) { + errorAttempts = 0; + } for (;;) { var messages = persistence.eavesdrop(inboxName, 1); if (messages.isEmpty() && !processService.isRunning(processId)) { TimeUnit.SECONDS.sleep(5); + + if (errorAttempts > 0) { // Reset the error counter if there is silence in the inbox + transition(MONITOR, 0); + } + // else continue } else { - transition(RUN); + transition(RUN, errorAttempts); } } } @@ -87,7 +98,7 @@ public class AbstractProcessSpawnerActor extends AbstractStateGraph { If the process fails, retransition to RUN up to MAX_ATTEMPTS times. After MAX_ATTEMPTS at restarting the process, transition to ERROR. If the process is cancelled, transition to ABORTED. 
- If the process is successful, transition to MONITOR. + If the process is successful, transition to MONITOR(errorAttempts). """ ) public void run(Integer attempts) throws Exception { @@ -108,7 +119,7 @@ public class AbstractProcessSpawnerActor extends AbstractStateGraph { transition(ABORTED); } - transition(MONITOR); + transition(MONITOR, attempts); } @TerminalState(name = ABORTED, description = "The process was manually aborted") diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/CrawlerMonitorActor.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/CrawlerMonitorActor.java new file mode 100644 index 00000000..f50f7b73 --- /dev/null +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/CrawlerMonitorActor.java @@ -0,0 +1,25 @@ +package nu.marginalia.control.actor.monitor; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.control.svc.ProcessService; +import nu.marginalia.mq.persistence.MqPersistence; +import nu.marginalia.mqapi.ProcessInboxNames; +import nu.marginalia.mqsm.StateFactory; + +@Singleton +public class CrawlerMonitorActor extends AbstractProcessSpawnerActor { + + @Inject + public CrawlerMonitorActor(StateFactory stateFactory, + MqPersistence persistence, + ProcessService processService) { + super(stateFactory, + persistence, + processService, + ProcessInboxNames.CRAWLER_INBOX, + ProcessService.ProcessId.CRAWLER); + } + + +} diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlActor.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlActor.java new file mode 100644 index 00000000..4db5b3e1 --- /dev/null +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlActor.java @@ -0,0 +1,171 @@ +package nu.marginalia.control.actor.task; + 
+import com.google.gson.Gson; +import com.google.inject.Inject; +import com.google.inject.Singleton; +import lombok.AllArgsConstructor; +import lombok.NoArgsConstructor; +import lombok.With; +import nu.marginalia.control.svc.ProcessOutboxFactory; +import nu.marginalia.control.svc.ProcessService; +import nu.marginalia.db.storage.FileStorageService; +import nu.marginalia.db.storage.model.FileStorageBaseType; +import nu.marginalia.db.storage.model.FileStorageId; +import nu.marginalia.db.storage.model.FileStorageType; +import nu.marginalia.index.client.IndexClient; +import nu.marginalia.index.client.IndexMqEndpoints; +import nu.marginalia.mq.MqMessage; +import nu.marginalia.mq.MqMessageState; +import nu.marginalia.mq.outbox.MqOutbox; +import nu.marginalia.mqapi.converting.ConvertRequest; +import nu.marginalia.mqapi.crawling.CrawlRequest; +import nu.marginalia.mqapi.loading.LoadRequest; +import nu.marginalia.mqsm.StateFactory; +import nu.marginalia.mqsm.graph.AbstractStateGraph; +import nu.marginalia.mqsm.graph.GraphState; +import nu.marginalia.mqsm.graph.ResumeBehavior; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.nio.file.Files; +import java.nio.file.StandardCopyOption; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; + +@Singleton +public class CrawlActor extends AbstractStateGraph { + + // STATES + + public static final String INITIAL = "INITIAL"; + public static final String CRAWL = "CRAWL"; + public static final String CRAWL_WAIT = "CRAWL-WAIT"; + public static final String END = "END"; + private final ProcessService processService; + private final MqOutbox mqCrawlerOutbox; + private final FileStorageService storageService; + private final Gson gson; + private final Logger logger = LoggerFactory.getLogger(getClass()); + + + @AllArgsConstructor @With @NoArgsConstructor + public static class Message { + public FileStorageId crawlSpecId = null; + public FileStorageId crawlStorageId = null; + public 
long crawlerMsgId = 0L; + }; + + @Inject + public CrawlActor(StateFactory stateFactory, + ProcessService processService, + ProcessOutboxFactory processOutboxFactory, + FileStorageService storageService, + Gson gson + ) + { + super(stateFactory); + this.processService = processService; + this.mqCrawlerOutbox = processOutboxFactory.createCrawlerOutbox(); + this.storageService = storageService; + this.gson = gson; + } + + @GraphState(name = INITIAL, + next = CRAWL, + description = """ + Validate the input and transition to CRAWL + """) + public Message init(FileStorageId crawlStorageId) throws Exception { + if (null == crawlStorageId) { + error("This Actor requires a FileStorageId to be passed in as a parameter to INITIAL"); + } + + var storage = storageService.getStorage(crawlStorageId); + + if (storage == null) error("Bad storage id"); + if (storage.type() != FileStorageType.CRAWL_SPEC) error("Bad storage type " + storage.type()); + + return new Message().withCrawlSpecId(crawlStorageId); + } + + @GraphState(name = CRAWL, + next = CRAWL_WAIT, + resume = ResumeBehavior.ERROR, + description = """ + Allocate a storage area for the crawled data, + then send a crawl request to the crawler and transition to CRAWL_WAIT. 
+ """ + ) + public Message crawl(Message message) throws Exception { + // Create processed data area + + var toCrawl = storageService.getStorage(message.crawlSpecId); + + var base = storageService.getStorageBase(FileStorageBaseType.SLOW); + var dataArea = storageService.allocateTemporaryStorage( + base, + FileStorageType.CRAWL_DATA, + "crawl-data", + toCrawl.description()); + + storageService.relateFileStorages(toCrawl.id(), dataArea.id()); + + // Pre-send convert request + var request = new CrawlRequest(message.crawlSpecId, dataArea.id()); + long id = mqCrawlerOutbox.sendAsync(CrawlRequest.class.getSimpleName(), gson.toJson(request)); + + return message + .withCrawlStorageId(dataArea.id()) + .withCrawlerMsgId(id); + } + + @GraphState( + name = CRAWL_WAIT, + next = END, + resume = ResumeBehavior.RETRY, + description = """ + Wait for the crawler to finish retreiving the data. + """ + ) + public Message crawlerWait(Message message) throws Exception { + var rsp = waitResponse(mqCrawlerOutbox, ProcessService.ProcessId.CRAWLER, message.crawlerMsgId); + + if (rsp.state() != MqMessageState.OK) + error("Crawler failed"); + + return message; + } + + + public MqMessage waitResponse(MqOutbox outbox, ProcessService.ProcessId processId, long id) throws Exception { + if (!waitForProcess(processId, TimeUnit.SECONDS, 30)) { + error("Process " + processId + " did not launch"); + } + for (;;) { + try { + return outbox.waitResponse(id, 1, TimeUnit.SECONDS); + } + catch (TimeoutException ex) { + // Maybe the process died, wait a moment for it to restart + if (!waitForProcess(processId, TimeUnit.SECONDS, 30)) { + error("Process " + processId + " died and did not re-launch"); + } + } + } + } + + public boolean waitForProcess(ProcessService.ProcessId processId, TimeUnit unit, int duration) throws InterruptedException { + + // Wait for process to start + long deadline = System.currentTimeMillis() + unit.toMillis(duration); + while (System.currentTimeMillis() < deadline) { + if 
(processService.isRunning(processId)) + return true; + + TimeUnit.SECONDS.sleep(1); + } + + return false; + } + +} diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/ReconvertAndLoadActor.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/ReconvertAndLoadActor.java index 2ffde9b2..96730aa2 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/ReconvertAndLoadActor.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/ReconvertAndLoadActor.java @@ -118,6 +118,8 @@ public class ReconvertAndLoadActor extends AbstractStateGraph { var processedArea = storageService.allocateTemporaryStorage(base, FileStorageType.PROCESSED_DATA, "processed-data", "Processed Data; " + toProcess.description()); + storageService.relateFileStorages(toProcess.id(), processedArea.id()); + // Pre-send convert request var request = new ConvertRequest(message.crawlStorageId, processedArea.id()); long id = mqConverterOutbox.sendAsync(ConvertRequest.class.getSimpleName(), gson.toJson(request)); diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/RecrawlActor.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/RecrawlActor.java new file mode 100644 index 00000000..bfa847f2 --- /dev/null +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/RecrawlActor.java @@ -0,0 +1,185 @@ +package nu.marginalia.control.actor.task; + +import com.google.gson.Gson; +import com.google.inject.Inject; +import com.google.inject.Singleton; +import lombok.AllArgsConstructor; +import lombok.NoArgsConstructor; +import lombok.With; +import nu.marginalia.control.svc.ProcessOutboxFactory; +import nu.marginalia.control.svc.ProcessService; +import nu.marginalia.db.storage.FileStorageService; +import 
nu.marginalia.db.storage.model.FileStorage; +import nu.marginalia.db.storage.model.FileStorageId; +import nu.marginalia.db.storage.model.FileStorageType; +import nu.marginalia.index.client.IndexClient; +import nu.marginalia.mq.MqMessage; +import nu.marginalia.mq.MqMessageState; +import nu.marginalia.mq.outbox.MqOutbox; +import nu.marginalia.mqapi.crawling.CrawlRequest; +import nu.marginalia.mqsm.StateFactory; +import nu.marginalia.mqsm.graph.AbstractStateGraph; +import nu.marginalia.mqsm.graph.GraphState; +import nu.marginalia.mqsm.graph.ResumeBehavior; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.nio.file.Files; +import java.sql.SQLException; +import java.util.Optional; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; + +@Singleton +public class RecrawlActor extends AbstractStateGraph { + + // STATES + + public static final String INITIAL = "INITIAL"; + public static final String CRAWL = "CRAWL"; + public static final String CRAWL_WAIT = "CRAWL-WAIT"; + public static final String END = "END"; + private final ProcessService processService; + private final MqOutbox mqCrawlerOutbox; + private final FileStorageService storageService; + private final Gson gson; + private final Logger logger = LoggerFactory.getLogger(getClass()); + + + @AllArgsConstructor @With @NoArgsConstructor + public static class RecrawlMessage { + public FileStorageId crawlSpecId = null; + public FileStorageId crawlStorageId = null; + public long crawlerMsgId = 0L; + }; + + public static RecrawlMessage recrawlFromCrawlData(FileStorageId crawlData) { + return new RecrawlMessage(null, crawlData, 0L); + } + public static RecrawlMessage recrawlFromCrawlDataAndCralSpec(FileStorageId crawlData, FileStorageId crawlSpec) { + return new RecrawlMessage(crawlSpec, crawlData, 0L); + } + + @Inject + public RecrawlActor(StateFactory stateFactory, + ProcessService processService, + ProcessOutboxFactory processOutboxFactory, + FileStorageService 
storageService, + Gson gson + ) + { + super(stateFactory); + this.processService = processService; + this.mqCrawlerOutbox = processOutboxFactory.createCrawlerOutbox(); + this.storageService = storageService; + this.gson = gson; + } + + @GraphState(name = INITIAL, + next = CRAWL, + description = """ + Validate the input and transition to CRAWL + """) + public RecrawlMessage init(RecrawlMessage recrawlMessage) throws Exception { + if (null == recrawlMessage) { + error("This Actor requires a message as an argument"); + } + + + var crawlStorage = storageService.getStorage(recrawlMessage.crawlStorageId); + FileStorage specStorage; + + if (recrawlMessage.crawlSpecId != null) { + specStorage = storageService.getStorage(recrawlMessage.crawlSpecId); + } + else { + specStorage = getSpec(crawlStorage).orElse(null); + } + + if (specStorage == null) error("Bad storage id"); + if (specStorage.type() != FileStorageType.CRAWL_SPEC) error("Bad storage type " + specStorage.type()); + if (crawlStorage == null) error("Bad storage id"); + if (crawlStorage.type() != FileStorageType.CRAWL_DATA) error("Bad storage type " + specStorage.type()); + + Files.deleteIfExists(crawlStorage.asPath().resolve("crawler.log")); + + return recrawlMessage + .withCrawlSpecId(specStorage.id()); + } + + private Optional getSpec(FileStorage crawlStorage) throws SQLException { + return storageService.getSourceFromStorage(crawlStorage) + .stream() + .filter(storage -> storage.type().equals(FileStorageType.CRAWL_SPEC)) + .findFirst(); + } + + @GraphState(name = CRAWL, + next = CRAWL_WAIT, + resume = ResumeBehavior.ERROR, + description = """ + Send a crawl request to the crawler and transition to CRAWL_WAIT. 
+ """ + ) + public RecrawlMessage crawl(RecrawlMessage recrawlMessage) throws Exception { + // Create processed data area + + var toCrawl = storageService.getStorage(recrawlMessage.crawlSpecId); + + // Pre-send crawl request + var request = new CrawlRequest(recrawlMessage.crawlSpecId, recrawlMessage.crawlStorageId); + long id = mqCrawlerOutbox.sendAsync(CrawlRequest.class.getSimpleName(), gson.toJson(request)); + + return recrawlMessage.withCrawlerMsgId(id); + } + + @GraphState( + name = CRAWL_WAIT, + next = END, + resume = ResumeBehavior.RETRY, + description = """ + Wait for the crawler to finish retreiving the data. + """ + ) + public RecrawlMessage crawlerWait(RecrawlMessage recrawlMessage) throws Exception { + var rsp = waitResponse(mqCrawlerOutbox, ProcessService.ProcessId.CRAWLER, recrawlMessage.crawlerMsgId); + + if (rsp.state() != MqMessageState.OK) + error("Crawler failed"); + + return recrawlMessage; + } + + + public MqMessage waitResponse(MqOutbox outbox, ProcessService.ProcessId processId, long id) throws Exception { + if (!waitForProcess(processId, TimeUnit.SECONDS, 30)) { + error("Process " + processId + " did not launch"); + } + for (;;) { + try { + return outbox.waitResponse(id, 1, TimeUnit.SECONDS); + } + catch (TimeoutException ex) { + // Maybe the process died, wait a moment for it to restart + if (!waitForProcess(processId, TimeUnit.SECONDS, 30)) { + error("Process " + processId + " died and did not re-launch"); + } + } + } + } + + public boolean waitForProcess(ProcessService.ProcessId processId, TimeUnit unit, int duration) throws InterruptedException { + + // Wait for process to start + long deadline = System.currentTimeMillis() + unit.toMillis(duration); + while (System.currentTimeMillis() < deadline) { + if (processService.isRunning(processId)) + return true; + + TimeUnit.SECONDS.sleep(1); + } + + return false; + } + +} diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/Actor.java 
b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/Actor.java index dcced17e..83d0b810 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/Actor.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/Actor.java @@ -1,9 +1,12 @@ package nu.marginalia.control.model; public enum Actor { + CRAWL, + RECRAWL, RECONVERT_LOAD, CONVERTER_MONITOR, LOADER_MONITOR, + CRAWLER_MONITOR, MESSAGE_QUEUE_MONITOR, PROCESS_LIVENESS_MONITOR, FILE_STORAGE_MONITOR diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithActions.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithActions.java index 674e92bc..4ef9a394 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithActions.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithActions.java @@ -4,6 +4,13 @@ import nu.marginalia.db.storage.model.FileStorage; import nu.marginalia.db.storage.model.FileStorageType; public record FileStorageWithActions(FileStorage storage) { + public boolean isCrawlable() { + return storage.type() == FileStorageType.CRAWL_SPEC; + } + public boolean isRecrawlable() { + return storage.type() == FileStorageType.CRAWL_DATA; + } + public boolean isLoadable() { return storage.type() == FileStorageType.PROCESSED_DATA; } diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlActorService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlActorService.java index d39f9d4f..c7bab07f 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlActorService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlActorService.java @@ 
-4,6 +4,7 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.control.actor.ControlActors; import nu.marginalia.control.actor.task.ReconvertAndLoadActor; +import nu.marginalia.control.actor.task.RecrawlActor; import nu.marginalia.control.model.Actor; import nu.marginalia.control.model.ActorRunState; import nu.marginalia.control.model.ActorStateGraph; @@ -43,16 +44,33 @@ public class ControlActorService { return ""; } + public Object triggerCrawling(Request request, Response response) throws Exception { + controlActors.start( + Actor.CRAWL, + FileStorageId.parse(request.params("fid")) + ); + return ""; + } + + public Object triggerRecrawling(Request request, Response response) throws Exception { + controlActors.start( + Actor.RECRAWL, + RecrawlActor.recrawlFromCrawlData( + FileStorageId.parse(request.params("fid")) + ) + ); + return ""; + } public Object triggerProcessing(Request request, Response response) throws Exception { controlActors.start( Actor.RECONVERT_LOAD, - FileStorageId.of(Integer.parseInt(request.params("fid"))) + FileStorageId.parse(request.params("fid")) ); return ""; } public Object loadProcessedData(Request request, Response response) throws Exception { - var fid = FileStorageId.of(Integer.parseInt(request.params("fid"))); + var fid = FileStorageId.parse(request.params("fid")); // Start the FSM from the intermediate state that triggers the load controlActors.startFrom( diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessOutboxFactory.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessOutboxFactory.java index 4c296069..52808aef 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessOutboxFactory.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessOutboxFactory.java @@ -24,4 +24,8 @@ public class ProcessOutboxFactory { public 
MqOutbox createLoaderOutbox() { return new MqOutbox(persistence, ProcessInboxNames.LOADER_INBOX, params.configuration.serviceName(), params.configuration.instanceUuid()); } + + public MqOutbox createCrawlerOutbox() { + return new MqOutbox(persistence, ProcessInboxNames.CRAWLER_INBOX, params.configuration.serviceName(), params.configuration.instanceUuid()); + } } diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/storage.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/storage.hdb index 1674d6f5..7f748489 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/storage.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/storage.hdb @@ -34,6 +34,11 @@ {{#each storage}} + {{#if isCrawlable}} +
    + +
    + {{/if}} {{#if isLoadable}}
    @@ -44,6 +49,11 @@
    {{/if}} + {{#if isRecrawlable}} +
    + +
    + {{/if}} {{#if isDeletable}}
    diff --git a/code/tools/crawl-job-extractor/src/test/java/nu/marginalia/crawl/CrawlJobSpecWriterTest.java b/code/tools/crawl-job-extractor/src/test/java/nu/marginalia/crawl/CrawlJobSpecWriterTest.java index 38cfc4fb..7fd5922f 100644 --- a/code/tools/crawl-job-extractor/src/test/java/nu/marginalia/crawl/CrawlJobSpecWriterTest.java +++ b/code/tools/crawl-job-extractor/src/test/java/nu/marginalia/crawl/CrawlJobSpecWriterTest.java @@ -31,9 +31,9 @@ public class CrawlJobSpecWriterTest { @Test public void testReadWrite() throws IOException { try (CrawlJobSpecWriter writer = new CrawlJobSpecWriter(tempFile)) { - writer.accept(new CrawlingSpecification("first",1, "test1", List.of("a", "b", "c"))); - writer.accept(new CrawlingSpecification("second",1, "test2", List.of("a", "b", "c", "d"))); - writer.accept(new CrawlingSpecification("third",1, "test3", List.of("a", "b"))); + writer.accept(new CrawlingSpecification("first",1, "test1", List.of("a", "b", "c"), null)); + writer.accept(new CrawlingSpecification("second",1, "test2", List.of("a", "b", "c", "d"), null)); + writer.accept(new CrawlingSpecification("third",1, "test3", List.of("a", "b"), null)); } List outputs = new ArrayList<>(); diff --git a/run/env/service.env b/run/env/service.env index dfa012b3..ac745577 100644 --- a/run/env/service.env +++ b/run/env/service.env @@ -1,3 +1,4 @@ WMSA_HOME=run/ CONTROL_SERVICE_OPTS="-DdistPath=/dist" -CONVERTER_OPTS="-ea -Xmx16G -XX:-CompactStrings -XX:+UseParallelGC -XX:GCTimeRatio=14 -XX:ParallelGCThreads=15" \ No newline at end of file +CONVERTER_OPTS="-ea -Xmx16G -XX:-CompactStrings -XX:+UseParallelGC -XX:GCTimeRatio=14 -XX:ParallelGCThreads=15" +CRAWLER_OPTS="-Xmx16G -XX:+UseParallelGC -XX:GCTimeRatio=14 -XX:ParallelGCThreads=15" \ No newline at end of file From 8f455f3b6dbd93cd3b0b65e3307edffaf0d6671a Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 21 Jul 2023 14:12:32 +0200 Subject: [PATCH 066/157] (control) Aborting a process spawner actor cancels the message to 
the actor. --- .../monitor/AbstractProcessSpawnerActor.java | 23 +++++++++++++++++++ .../control/actor/task/RecrawlActor.java | 7 +----- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/AbstractProcessSpawnerActor.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/AbstractProcessSpawnerActor.java index 0f608138..35a4ed03 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/AbstractProcessSpawnerActor.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/AbstractProcessSpawnerActor.java @@ -3,6 +3,7 @@ package nu.marginalia.control.actor.monitor; import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.control.svc.ProcessService; +import nu.marginalia.mq.MqMessageState; import nu.marginalia.mq.persistence.MqPersistence; import nu.marginalia.mqsm.StateFactory; import nu.marginalia.mqsm.graph.AbstractStateGraph; @@ -115,13 +116,35 @@ public class AbstractProcessSpawnerActor extends AbstractStateGraph { } } catch (InterruptedException ex) { + // We get this exception when the process is cancelled by the user + processService.kill(processId); + setCurrentMessageToDead(); + transition(ABORTED); } transition(MONITOR, attempts); } + /** Sets the message to dead in the database to avoid + * the service respawning on the same task when we + * re-enable this actor */ + private void setCurrentMessageToDead() { + try { + var messages = persistence.eavesdrop(inboxName, 1); + + if (messages.isEmpty()) // Possibly a race condition where the task is already finished + return; + + var theMessage = messages.iterator().next(); + persistence.updateMessageState(theMessage.msgId(), MqMessageState.DEAD); + } + catch (SQLException ex) { + logger.error("Tried but failed to set the message for " + processId + " to 
dead", ex); + } + } + @TerminalState(name = ABORTED, description = "The process was manually aborted") public void aborted() throws Exception {} diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/RecrawlActor.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/RecrawlActor.java index bfa847f2..a04dd6bf 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/RecrawlActor.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/RecrawlActor.java @@ -85,7 +85,6 @@ public class RecrawlActor extends AbstractStateGraph { error("This Actor requires a message as an argument"); } - var crawlStorage = storageService.getStorage(recrawlMessage.crawlStorageId); FileStorage specStorage; @@ -122,10 +121,6 @@ public class RecrawlActor extends AbstractStateGraph { """ ) public RecrawlMessage crawl(RecrawlMessage recrawlMessage) throws Exception { - // Create processed data area - - var toCrawl = storageService.getStorage(recrawlMessage.crawlSpecId); - // Pre-send crawl request var request = new CrawlRequest(recrawlMessage.crawlSpecId, recrawlMessage.crawlStorageId); long id = mqCrawlerOutbox.sendAsync(CrawlRequest.class.getSimpleName(), gson.toJson(request)); @@ -138,7 +133,7 @@ public class RecrawlActor extends AbstractStateGraph { next = END, resume = ResumeBehavior.RETRY, description = """ - Wait for the crawler to finish retreiving the data. + Wait for the crawler to finish retrieving the data. 
""" ) public RecrawlMessage crawlerWait(RecrawlMessage recrawlMessage) throws Exception { From 7bc1cff286e5b073743367e7f9b6c02d9b9e576e Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 21 Jul 2023 14:28:37 +0200 Subject: [PATCH 067/157] (minor) code cleanup --- .../monitor/AbstractProcessSpawnerActor.java | 2 ++ .../control/svc/HeartbeatService.java | 4 ++-- .../control/svc/ProcessOutboxFactory.java | 19 +++++++++++++++---- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/AbstractProcessSpawnerActor.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/AbstractProcessSpawnerActor.java index 35a4ed03..4ff3cde8 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/AbstractProcessSpawnerActor.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/AbstractProcessSpawnerActor.java @@ -149,6 +149,8 @@ public class AbstractProcessSpawnerActor extends AbstractStateGraph { public void aborted() throws Exception {} + /** Encapsulates the execution of the process in a separate thread so that + * we can interrupt the thread if the process is cancelled */ private class TaskExecution { private final AtomicBoolean error = new AtomicBoolean(false); public TaskExecution() throws ExecutionException, InterruptedException { diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/HeartbeatService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/HeartbeatService.java index 8a8a693e..74a504b2 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/HeartbeatService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/HeartbeatService.java @@ -28,7 +28,7 @@ public class HeartbeatService { try (var 
conn = dataSource.getConnection(); var stmt = conn.prepareStatement(""" - SELECT SERVICE_NAME, SERVICE_BASE, INSTANCE, ALIVE, + SELECT SERVICE_NAME, SERVICE_BASE, INSTANCE, ALIVE, TIMESTAMPDIFF(MICROSECOND, HEARTBEAT_TIME, CURRENT_TIMESTAMP(6)) AS TSDIFF FROM SERVICE_HEARTBEAT """)) { @@ -56,7 +56,7 @@ public class HeartbeatService { try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(""" - SELECT PROCESS_NAME, PROCESS_BASE, INSTANCE, STATUS, PROGRESS, + SELECT PROCESS_NAME, PROCESS_BASE, INSTANCE, STATUS, PROGRESS, TIMESTAMPDIFF(MICROSECOND, HEARTBEAT_TIME, CURRENT_TIMESTAMP(6)) AS TSDIFF FROM PROCESS_HEARTBEAT """)) { diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessOutboxFactory.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessOutboxFactory.java index 52808aef..fb5598a9 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessOutboxFactory.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessOutboxFactory.java @@ -19,13 +19,24 @@ public class ProcessOutboxFactory { } public MqOutbox createConverterOutbox() { - return new MqOutbox(persistence, ProcessInboxNames.CONVERTER_INBOX, params.configuration.serviceName(), params.configuration.instanceUuid()); + return new MqOutbox(persistence, + ProcessInboxNames.CONVERTER_INBOX, + params.configuration.serviceName(), + params.configuration.instanceUuid() + ); } public MqOutbox createLoaderOutbox() { - return new MqOutbox(persistence, ProcessInboxNames.LOADER_INBOX, params.configuration.serviceName(), params.configuration.instanceUuid()); + return new MqOutbox(persistence, + ProcessInboxNames.LOADER_INBOX, + params.configuration.serviceName(), + params.configuration.instanceUuid() + ); } - public MqOutbox createCrawlerOutbox() { - return new MqOutbox(persistence, ProcessInboxNames.CRAWLER_INBOX, 
params.configuration.serviceName(), params.configuration.instanceUuid()); + return new MqOutbox(persistence, + ProcessInboxNames.CRAWLER_INBOX, + params.configuration.serviceName(), + params.configuration.instanceUuid() + ); } } From 58f2f86ea8607db9a01b1b581cd0da017e9fcf74 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 21 Jul 2023 19:47:52 +0200 Subject: [PATCH 068/157] (crawler) Don't read all the data into RAM when doing a refresh-crawl --- .../crawling/io/CrawledDomainReader.java | 41 ++++ .../nu/marginalia/crawl/CrawlLimiter.java | 45 +--- .../java/nu/marginalia/crawl/CrawlerMain.java | 24 +- .../crawl/retreival/CrawlerRetreiver.java | 227 ++++++++++-------- .../crawl/retreival/fetcher/ContentTags.java | 24 ++ .../crawl/retreival/fetcher/HttpFetcher.java | 2 +- .../retreival/fetcher/HttpFetcherImpl.java | 31 ++- .../marginalia/crawling/HttpFetcherTest.java | 5 +- .../retreival/CrawlerMockFetcherTest.java | 7 +- .../retreival/CrawlerRetreiverTest.java | 18 +- 10 files changed, 249 insertions(+), 175 deletions(-) create mode 100644 code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/ContentTags.java diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java index 67b95484..abc524ac 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java @@ -2,8 +2,10 @@ package nu.marginalia.crawling.io; import com.github.luben.zstd.ZstdInputStream; import com.google.gson.Gson; +import lombok.SneakyThrows; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDomain; +import nu.marginalia.crawling.model.SerializableCrawlData; import nu.marginalia.model.gson.GsonFactory; import 
org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -14,6 +16,7 @@ import java.io.IOException; import java.io.InputStreamReader; import java.nio.file.Path; import java.util.ArrayList; +import java.util.Iterator; import java.util.List; import java.util.Optional; import java.util.concurrent.ForkJoinPool; @@ -27,6 +30,44 @@ public class CrawledDomainReader { public CrawledDomainReader() { } + public Iterator createIterator(Path path) throws IOException { + BufferedReader br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(path.toFile())))); + + return new Iterator<>() { + SerializableCrawlData next; + + @Override + @SneakyThrows + public boolean hasNext() { + String identifier = br.readLine(); + if (identifier == null) { + br.close(); + return false; + } + String data = br.readLine(); + if (data == null) { + br.close(); + return false; + } + + if (identifier.equals(CrawledDomain.SERIAL_IDENTIFIER)) { + next = gson.fromJson(data, CrawledDomain.class); + } else if (identifier.equals(CrawledDocument.SERIAL_IDENTIFIER)) { + next = gson.fromJson(data, CrawledDocument.class); + } + else { + throw new IllegalStateException("Unknown identifier: " + identifier); + } + return true; + } + + @Override + public SerializableCrawlData next() { + return next; + } + }; + } + public CrawledDomain read(Path path) throws IOException { DomainDataAssembler domainData = new DomainDataAssembler(); diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java index 29f02e4f..7285b0c5 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java @@ -8,65 +8,22 @@ import java.util.concurrent.Semaphore; public class CrawlLimiter { public static final int maxPoolSize = Integer.getInteger("crawler.pool-size", 512); - // 
We'll round up to this size when we're crawling a new domain to prevent - // too many concurrent connections - public static final int minCrawlDataSizeKb = 128; // 100 Kb - - // The largest size on disk where we'll permit a refresh crawl - // (these files easily grow into the gigabytes, we don't want that in RAM) - public static final int maxRefreshableCrawlDataSizeKBytes = 1024*128; // 128 Mb - - // This limits how many concurrent crawl tasks we can have running at once - // based on their size on disk. The on-disk size is compressed, and the - // in-ram size is partially compressed (i.e. only the document body); so - // maybe a fair estimate is something like 2-4x this figure for RAM usage - // - public static final int maxConcurrentCrawlTaskSizeKb = 512*1024; // 512 Mb - - static { - // Sanity check; if this is false we'll get a deadlock on taskSemRAM - assert maxConcurrentCrawlTaskSizeKb >= maxRefreshableCrawlDataSizeKBytes - : "maxConcurrentCrawlTaskSizeKb must be larger than maxRefreshableCrawlDataSizeKBytes"; - } - public record CrawlTaskLimits(Path refreshPath, boolean isRefreshable, int taskSize) {} - // We use two semaphores to keep track of the number of concurrent crawls; - // first a RAM sempahore to limit the amount of RAM used by refresh crawls. 
- // then a count semaphore to limit the number of concurrent threads (this keeps the connection count manageable) - private final Semaphore taskSemRAM = new Semaphore(maxConcurrentCrawlTaskSizeKb); private final Semaphore taskSemCount = new Semaphore(maxPoolSize); public CrawlTaskLimits getTaskLimits(Path fileName) { - long size; - - try { - size = Math.max(minCrawlDataSizeKb, Files.size(fileName) / 1024); - } catch (IOException ex) { - // If we can't read the file, we'll assume it's small since we won't be able to read it later for the refresh either - return new CrawlTaskLimits(null,false, minCrawlDataSizeKb); - } - - // We'll only permit refresh crawls if the file is small enough - boolean isRefreshable = size < maxRefreshableCrawlDataSizeKBytes; - - // We'll truncate this down to maxRefreshableCrawlDataSizeKBytes to ensure - // it's possible to acquire the RAM semaphore - int effectiveSize = (int) Math.min(maxRefreshableCrawlDataSizeKBytes, size); - - return new CrawlTaskLimits(fileName, isRefreshable, effectiveSize); + return new CrawlTaskLimits(fileName, true, 1); } public void acquire(CrawlTaskLimits properties) throws InterruptedException { // It's very important that we acquire the RAM semaphore first to avoid a deadlock - taskSemRAM.acquire(properties.taskSize); taskSemCount.acquire(1); } public void release(CrawlTaskLimits properties) { taskSemCount.release(1); - taskSemRAM.release(properties.taskSize); } } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java index 3dd096cb..6fafb128 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java @@ -10,6 +10,7 @@ import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; import nu.marginalia.crawling.io.CrawledDomainReader; import 
nu.marginalia.crawling.io.CrawlerOutputFile; import nu.marginalia.crawling.model.CrawledDomain; +import nu.marginalia.crawling.model.SerializableCrawlData; import nu.marginalia.db.storage.FileStorageService; import nu.marginalia.mq.MessageQueueFactory; import nu.marginalia.mq.MqMessage; @@ -32,10 +33,7 @@ import org.slf4j.LoggerFactory; import java.io.IOException; import java.nio.file.Path; import java.sql.SQLException; -import java.util.HashSet; -import java.util.Optional; -import java.util.Set; -import java.util.UUID; +import java.util.*; import java.util.concurrent.*; import java.util.concurrent.atomic.AtomicInteger; @@ -201,19 +199,23 @@ public class CrawlerMain implements AutoCloseable { HttpFetcher fetcher = new HttpFetcherImpl(userAgent.uaString(), dispatcher, connectionPool); - // Read the previous crawl's data for this domain, if it exists and has a reasonable size - Optional domain; - if (limits.isRefreshable()) { - domain = reader.readOptionally(limits.refreshPath()); - if (domain.isPresent()) { - specification = specification.withOldData(domain.get()); + Iterator iterator; + try { + if (limits.isRefreshable()) { + iterator = reader.createIterator(limits.refreshPath()); } + else { + iterator = Collections.emptyIterator(); + } + } catch (IOException e) { + logger.warn("Failed to read previous crawl data for {}", specification.domain); + iterator = Collections.emptyIterator(); } try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id)) { var retreiver = new CrawlerRetreiver(fetcher, specification, writer::accept); - int size = retreiver.fetch(); + int size = retreiver.fetch(iterator); workLog.setJobToFinished(specification.id, writer.getOutputFile().toString(), size); diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index 52927f38..8091dac8 
100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -4,6 +4,7 @@ import com.google.common.hash.HashFunction; import com.google.common.hash.Hashing; import crawlercommons.robots.SimpleRobotRules; import lombok.SneakyThrows; +import nu.marginalia.crawl.retreival.fetcher.ContentTags; import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawl.retreival.fetcher.SitemapRetriever; import nu.marginalia.crawling.model.spec.CrawlingSpecification; @@ -18,6 +19,7 @@ import org.jsoup.nodes.Document; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import javax.annotation.Nullable; import java.net.InetAddress; import java.net.UnknownHostException; import java.time.LocalDateTime; @@ -58,15 +60,13 @@ public class CrawlerRetreiver { private final SitemapRetriever sitemapRetriever; private final DomainCrawlFrontier crawlFrontier; - private final CrawlDataReference oldCrawlData; - int errorCount = 0; + private String retainedTag = "RETAINED/304"; public CrawlerRetreiver(HttpFetcher fetcher, CrawlingSpecification specs, Consumer writer) { this.fetcher = fetcher; - this.oldCrawlData = new CrawlDataReference(specs.oldData); id = specs.id; domain = specs.domain; @@ -97,10 +97,14 @@ public class CrawlerRetreiver { } public int fetch() { + return fetch(Collections.emptyIterator()); + } + + public int fetch(Iterator oldCrawlData) { final DomainProber.ProbeResult probeResult = domainProber.probeDomain(fetcher, domain, crawlFrontier.peek()); if (probeResult instanceof DomainProber.ProbeResultOk) { - return crawlDomain(); + return crawlDomain(oldCrawlData); } // handle error cases for probe @@ -137,44 +141,29 @@ public class CrawlerRetreiver { throw new IllegalStateException("Unknown probe result: " + probeResult); }; - private int crawlDomain() { + private int crawlDomain(Iterator oldCrawlData) 
{ String ip = findIp(domain); assert !crawlFrontier.isEmpty(); var robotsRules = fetcher.fetchRobotRules(crawlFrontier.peek().domain); + long crawlDelay = robotsRules.getCrawlDelay(); - CrawlDataComparison comparison = compareWithOldData(robotsRules); - logger.info("Comparison result for {} : {}", domain, comparison); + sniffRootDocument(); - // If we have reference data, we will always grow the crawl depth a bit - if (oldCrawlData.size() > 0) { + // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified + int recrawled = recrawl(oldCrawlData, robotsRules, crawlDelay); + + if (recrawled > 0) { + // If we have reference data, we will always grow the crawl depth a bit crawlFrontier.increaseDepth(1.5); } - // When the reference data doesn't appear to have changed, we'll forego - // re-fetching it and just use the old data - if (comparison == CrawlDataComparison.NO_CHANGES) { - oldCrawlData.allDocuments().forEach((url, doc) -> { - if (crawlFrontier.addVisited(url)) { - doc.recrawlState = "RETAINED"; - crawledDomainWriter.accept(doc); - } - }); - - // We don't need to hold onto this in RAM anymore - oldCrawlData.evict(); - } - - downloadSitemaps(robotsRules); - sniffRootDocument(); - - long crawlDelay = robotsRules.getCrawlDelay(); CrawledDomain ret = new CrawledDomain(id, domain, null, CrawlerDomainStatus.OK.name(), null, ip, new ArrayList<>(), null); - int fetchedCount = 0; + int fetchedCount = recrawled; while (!crawlFrontier.isEmpty() && !crawlFrontier.isCrawlDepthReached() @@ -187,11 +176,6 @@ public class CrawlerRetreiver { continue; } - // Don't re-fetch links that were previously found dead as it's very unlikely that a - // 404:ing link will suddenly start working at a later point - if (oldCrawlData.isPreviouslyDead(top)) - continue; - // Check the link filter if the endpoint should be fetched based on site-type if (!crawlFrontier.filterLink(top)) continue; @@ -211,7 +195,7 @@ public class CrawlerRetreiver { continue; 
- if (fetchDocument(top, crawlDelay).isPresent()) { + if (fetchDocument(top, null, crawlDelay).isPresent()) { fetchedCount++; } } @@ -223,63 +207,69 @@ public class CrawlerRetreiver { return fetchedCount; } - private CrawlDataComparison compareWithOldData(SimpleRobotRules robotsRules) { + private int recrawl(Iterator oldCrawlData, + SimpleRobotRules robotsRules, + long crawlDelay) { + int recrawled = 0; + int retained = 0; - int numGoodDocuments = oldCrawlData.size(); + while (oldCrawlData.hasNext()) { + if (!(oldCrawlData.next() instanceof CrawledDocument doc)) continue; - if (numGoodDocuments == 0) - return CrawlDataComparison.NO_OLD_DATA; + // This Shouldn't Happen (TM) + var urlMaybe = EdgeUrl.parse(doc.url); + if (urlMaybe.isEmpty()) continue; + var url = urlMaybe.get(); - if (numGoodDocuments < 10) - return CrawlDataComparison.SMALL_SAMPLE; - - // We fetch a sample of the data to assess how much it has changed - int sampleSize = (int) Math.min(20, 0.25 * numGoodDocuments); - Map referenceUrls = oldCrawlData.sample(sampleSize); - - int differences = 0; - - long crawlDelay = robotsRules.getCrawlDelay(); - for (var url : referenceUrls.keySet()) { - - var docMaybe = fetchDocument(url, crawlDelay); - if (docMaybe.isEmpty()) { - differences++; + // If we've previously 404:d on this URL, we'll refrain from trying to fetch it again + if (doc.httpStatus == 404) { + crawlFrontier.addVisited(url); continue; } - var newDoc = docMaybe.get(); - var referenceDoc = referenceUrls.get(url); + if (doc.httpStatus != 200) continue; - // This looks like a bug but it is not, we want to compare references - // to detect if the page has bounced off etag or last-modified headers - // to avoid having to do a full content comparison - if (newDoc == referenceDoc) + if (!robotsRules.isAllowed(url.toString())) { + crawledDomainWriter.accept(createRobotsError(url)); + continue; + } + if (!crawlFrontier.filterLink(url)) + continue; + if (!crawlFrontier.addVisited(url)) continue; - if 
(newDoc.httpStatus != referenceDoc.httpStatus) { - differences++; + + if (recrawled > 10 + && retained > 0.9 * recrawled + && Math.random() < 0.75) + { + logger.info("Direct-loading {}", url); + + // Since it looks like most of these documents haven't changed, + // we'll load the documents directly; but we do this in a random + // fashion to make sure we eventually catch changes over time + + crawledDomainWriter.accept(doc); + crawlFrontier.addVisited(url); continue; } - if (newDoc.documentBody == null) { - differences++; - continue; + + // GET the document with the stored document as a reference + // providing etag and last-modified headers, so we can recycle the + // document if it hasn't changed without actually downloading it + + var fetchedDocOpt = fetchDocument(url, doc, crawlDelay); + if (fetchedDocOpt.isEmpty()) continue; + + if (Objects.equals(fetchedDocOpt.get().recrawlState, retainedTag)) { + retained ++; } - long referenceLsh = hashDoc(referenceDoc); - long newLsh = hashDoc(newDoc); - - if (EasyLSH.hammingDistance(referenceLsh, newLsh) > 5) { - differences++; - } - } - if (differences > sampleSize/4) { - return CrawlDataComparison.CHANGES_FOUND; - } - else { - return CrawlDataComparison.NO_CHANGES; + recrawled ++; } + + return recrawled; } private static final HashFunction hasher = Hashing.murmur3_128(0); @@ -346,7 +336,7 @@ public class CrawlerRetreiver { var url = crawlFrontier.peek().withPathAndParam("/", null); - var maybeSample = fetchUrl(url).filter(sample -> sample.httpStatus == 200); + var maybeSample = fetchUrl(url, null).filter(sample -> sample.httpStatus == 200); if (maybeSample.isEmpty()) return; var sample = maybeSample.get(); @@ -382,23 +372,21 @@ public class CrawlerRetreiver { } } - private Optional fetchDocument(EdgeUrl top, long crawlDelay) { + private Optional fetchDocument(EdgeUrl top, + @Nullable CrawledDocument reference, + long crawlDelay) { logger.debug("Fetching {}", top); long startTime = System.currentTimeMillis(); - var doc = 
fetchUrl(top); + var doc = fetchUrl(top, reference); if (doc.isPresent()) { var d = doc.get(); crawledDomainWriter.accept(d); - oldCrawlData.dispose(top); if (d.url != null) { // We may have redirected to a different path - EdgeUrl.parse(d.url).ifPresent(url -> { - crawlFrontier.addVisited(url); - oldCrawlData.dispose(url); - }); + EdgeUrl.parse(d.url).ifPresent(crawlFrontier::addVisited); } if ("ERROR".equals(d.crawlerStatus) && d.httpStatus != 404) { @@ -418,14 +406,31 @@ public class CrawlerRetreiver { || proto.equalsIgnoreCase("https"); } - private Optional fetchUrl(EdgeUrl top) { + private Optional fetchUrl(EdgeUrl top, @Nullable CrawledDocument reference) { try { - var doc = fetchContent(top); + var contentTags = getContentTags(reference); + var fetchedDoc = fetchContent(top, contentTags); + CrawledDocument doc; + + // HTTP status 304 is NOT MODIFIED, which means the document is the same as it was when + // we fetched it last time. We can recycle the reference document. + if (reference != null + && fetchedDoc.httpStatus == 304) + { + doc = reference; + doc.recrawlState = retainedTag; + doc.timestamp = LocalDateTime.now().toString(); + } + else { + doc = fetchedDoc; + } if (doc.documentBody != null) { - doc.documentBodyHash = createHash(doc.documentBody.decode()); + var decoded = doc.documentBody.decode(); - Optional parsedDoc = parseDoc(doc); + doc.documentBodyHash = createHash(decoded); + + Optional parsedDoc = parseDoc(decoded); EdgeUrl url = new EdgeUrl(doc.url); parsedDoc.ifPresent(parsed -> findLinks(url, parsed)); @@ -443,23 +448,37 @@ public class CrawlerRetreiver { } + private ContentTags getContentTags(@Nullable CrawledDocument reference) { + if (null == reference) + return ContentTags.empty(); + + String headers = reference.headers; + if (headers == null) + return ContentTags.empty(); + + String[] headersLines = headers.split("\n"); + + String lastmod = null; + String etag = null; + + for (String line : headersLines) { + if 
(line.toLowerCase().startsWith("etag:")) { + etag = line.substring(5).trim(); + } + if (line.toLowerCase().startsWith("last-modified:")) { + lastmod = line.substring(14).trim(); + } + } + + return new ContentTags(etag, lastmod); + } + @SneakyThrows - private CrawledDocument fetchContent(EdgeUrl top) { + private CrawledDocument fetchContent(EdgeUrl top, ContentTags tags) { for (int i = 0; i < 2; i++) { try { - var doc = fetcher.fetchContent(top, oldCrawlData.getEtag(top), oldCrawlData.getLastModified(top)); - + var doc = fetcher.fetchContent(top, tags); doc.recrawlState = "NEW"; - - if (doc.httpStatus == 304) { - var referenceData = oldCrawlData.getDoc(top); - if (referenceData != null) { - referenceData.recrawlState = "304/UNCHANGED"; - return referenceData; - } - } - - return doc; } catch (RateLimitException ex) { @@ -478,10 +497,8 @@ public class CrawlerRetreiver { return hashMethod.hashUnencodedChars(documentBodyHash).toString(); } - private Optional parseDoc(CrawledDocument doc) { - if (doc.documentBody == null) - return Optional.empty(); - return Optional.of(Jsoup.parse(doc.documentBody.decode())); + private Optional parseDoc(String decoded) { + return Optional.of(Jsoup.parse(decoded)); } private void findLinks(EdgeUrl baseUrl, Document parsed) { diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/ContentTags.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/ContentTags.java new file mode 100644 index 00000000..e1df86c8 --- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/ContentTags.java @@ -0,0 +1,24 @@ +package nu.marginalia.crawl.retreival.fetcher; + +import okhttp3.Request; + +/** Encapsulates request modifiers; the ETag and Last-Modified tags for a resource */ +public record ContentTags(String etag, String lastMod) { + public static ContentTags empty() { + return new ContentTags(null, null); + } + + public boolean 
isPresent() { + return etag != null || lastMod != null; + } + + public boolean isEmpty() { + return etag == null && lastMod == null; + } + + /** Paints the tags onto the request builder. */ + public void paint(Request.Builder getBuilder) { + if (etag != null) getBuilder.addHeader("If-None-Match", etag); + if (lastMod != null) getBuilder.addHeader("If-Modified-Since", lastMod); + } +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java index 7f588783..11ad272e 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java @@ -18,7 +18,7 @@ public interface HttpFetcher { FetchResult probeDomain(EdgeUrl url); - CrawledDocument fetchContent(EdgeUrl url, String etag, String lastMod) throws RateLimitException; + CrawledDocument fetchContent(EdgeUrl url, ContentTags tags) throws RateLimitException; SimpleRobotRules fetchRobotRules(EdgeDomain domain); diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java index 36c8bd34..be6a6a06 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java @@ -128,9 +128,15 @@ public class HttpFetcherImpl implements HttpFetcher { @Override @SneakyThrows - public CrawledDocument fetchContent(EdgeUrl url, String etag, String lastMod) throws RateLimitException { + public CrawledDocument fetchContent(EdgeUrl url, + ContentTags contentTags) + throws RateLimitException + { - if 
(contentTypeLogic.isUrlLikeBinary(url)) { + // We don't want to waste time and resources on URLs that are not HTML, so if the file ending + // looks like it might be something else, we perform a HEAD first to check the content type + if (contentTags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url)) + { logger.debug("Probing suspected binary {}", url); var headBuilder = new Request.Builder().head() @@ -146,6 +152,21 @@ public class HttpFetcherImpl implements HttpFetcher { if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) { return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, "Early probe failed"); } + + // Update the URL to the final URL of the HEAD request, otherwise we might end up doing + + // HEAD 301 url1 -> url2 + // HEAD 200 url2 + // GET 301 url1 -> url2 + // GET 200 url2 + + // which is not what we want. Overall we want to do as few requests as possible to not raise + // too many eyebrows when looking at the logs on the target server. Overall it's probably desirable + // that it looks like the traffic makes sense, as opposed to looking like a broken bot. 
+ + var redirectUrl = new EdgeUrl(rsp.request().url().toString()); + if (Objects.equals(redirectUrl.domain, url.domain)) + url = redirectUrl; } catch (SocketTimeoutException ex) { return createTimeoutErrorRsp(url, ex); @@ -157,12 +178,12 @@ public class HttpFetcherImpl implements HttpFetcher { } var getBuilder = new Request.Builder().get(); + getBuilder.addHeader("User-agent", userAgent) .url(url.toString()) .addHeader("Accept-Encoding", "gzip"); - if (etag != null) getBuilder.addHeader("If-None-Match", etag); - if (lastMod != null) getBuilder.addHeader("If-Modified-Since", lastMod); + contentTags.paint(getBuilder); var get = getBuilder.build(); var call = client.newCall(get); @@ -314,7 +335,7 @@ public class HttpFetcherImpl implements HttpFetcher { private Optional fetchRobotsForProto(String proto, EdgeDomain domain) { try { var url = new EdgeUrl(proto, domain, null, "/robots.txt", null); - return Optional.of(parseRobotsTxt(fetchContent(url, null, null))); + return Optional.of(parseRobotsTxt(fetchContent(url, ContentTags.empty()))); } catch (Exception ex) { return Optional.empty(); diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java index 2ea9c763..5893910f 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java @@ -2,6 +2,7 @@ package nu.marginalia.crawling; import lombok.SneakyThrows; import nu.marginalia.crawl.retreival.RateLimitException; +import nu.marginalia.crawl.retreival.fetcher.ContentTags; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; import nu.marginalia.crawl.retreival.logic.ContentTypeLogic; import nu.marginalia.model.EdgeUrl; @@ -29,14 +30,14 @@ class HttpFetcherTest { @Test void fetchUTF8() throws URISyntaxException, RateLimitException { var fetcher = 
new HttpFetcherImpl("nu.marginalia.edge-crawler"); - var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), null, null); + var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), ContentTags.empty()); System.out.println(str.contentType); } @Test void fetchText() throws URISyntaxException, RateLimitException { var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler"); - var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), null, null); + var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), ContentTags.empty()); System.out.println(str); } } \ No newline at end of file diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java index f580a123..59e3c45e 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java @@ -4,10 +4,7 @@ import crawlercommons.robots.SimpleRobotRules; import lombok.SneakyThrows; import nu.marginalia.bigstring.BigString; import nu.marginalia.crawl.retreival.CrawlerRetreiver; -import nu.marginalia.crawl.retreival.fetcher.FetchResult; -import nu.marginalia.crawl.retreival.fetcher.FetchResultState; -import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; -import nu.marginalia.crawl.retreival.fetcher.SitemapRetriever; +import nu.marginalia.crawl.retreival.fetcher.*; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawlerDocumentStatus; import nu.marginalia.crawling.model.SerializableCrawlData; @@ -126,7 +123,7 @@ public class CrawlerMockFetcherTest { } @Override - public CrawledDocument fetchContent(EdgeUrl url, String etag, String lastModified) { + public CrawledDocument 
fetchContent(EdgeUrl url, ContentTags tags) { logger.info("Fetching {}", url); if (mockData.containsKey(url)) { return mockData.get(url); diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java index 741c8704..009e9084 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java @@ -5,12 +5,18 @@ import nu.marginalia.WmsaHome; import nu.marginalia.crawl.retreival.CrawlerRetreiver; import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; +import nu.marginalia.crawling.io.CrawledDomainReader; +import nu.marginalia.crawling.io.CrawledDomainWriter; +import nu.marginalia.crawling.io.CrawlerOutputFile; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.crawling.model.spec.CrawlingSpecification; import nu.marginalia.crawling.model.SerializableCrawlData; import org.junit.jupiter.api.*; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -99,7 +105,7 @@ class CrawlerRetreiverTest { } @Test - public void testRecrawl() { + public void testRecrawl() throws IOException { var specs = CrawlingSpecification .builder() @@ -110,6 +116,8 @@ class CrawlerRetreiverTest { .build(); + Path out = Files.createTempDirectory("crawling-process"); + var writer = new CrawledDomainWriter(out, "test", "123456"); Map, List> data = new HashMap<>(); new CrawlerRetreiver(httpFetcher, specs, d -> { @@ -117,7 +125,12 @@ class CrawlerRetreiverTest { if (d instanceof CrawledDocument doc) { System.out.println(doc.url + ": " + 
doc.recrawlState + "\t" + doc.httpStatus); } + writer.accept(d); }).fetch(); + writer.close(); + + var reader = new CrawledDomainReader(); + var iter = reader.createIterator(CrawlerOutputFile.getOutputFile(out, "123456", "test")); CrawledDomain domain = (CrawledDomain) data.get(CrawledDomain.class).get(0); domain.doc = data.get(CrawledDocument.class).stream().map(CrawledDocument.class::cast).collect(Collectors.toList()); @@ -128,6 +141,7 @@ class CrawlerRetreiverTest { if (d instanceof CrawledDocument doc) { System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus); } - }).fetch(); + }).fetch(iter); + } } \ No newline at end of file From 995657c6ce3cdcca5d08629982f3161294257da9 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 21 Jul 2023 19:50:35 +0200 Subject: [PATCH 069/157] (big-string) Make big-string disable:able --- .../src/main/java/nu/marginalia/bigstring/BigString.java | 5 ++++- doc/system-properties.md | 6 ++++++ run/env/service.env | 2 +- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/code/libraries/big-string/src/main/java/nu/marginalia/bigstring/BigString.java b/code/libraries/big-string/src/main/java/nu/marginalia/bigstring/BigString.java index f1533977..55a26cd7 100644 --- a/code/libraries/big-string/src/main/java/nu/marginalia/bigstring/BigString.java +++ b/code/libraries/big-string/src/main/java/nu/marginalia/bigstring/BigString.java @@ -1,8 +1,11 @@ package nu.marginalia.bigstring; public interface BigString { + + boolean disableBigString = Boolean.getBoolean("bigstring.disabled"); + static BigString encode(String stringValue) { - if (stringValue.length() > 64) { + if (!disableBigString && stringValue.length() > 64) { return new CompressedBigString(stringValue); } else { diff --git a/doc/system-properties.md b/doc/system-properties.md index f28eaca3..e79228bd 100644 --- a/doc/system-properties.md +++ b/doc/system-properties.md @@ -29,3 +29,9 @@ These are JVM system properties used by each service 
|lexiconSizeHint| 800000000 | The default size of the lexicon | |local-index-path| /some/path | Selects the location the loader will write index data | |crawl.rootDirRewrite|/some/path|Sets the base directory of a crawl plan | + +## Other + +|flag| values | description | +|---|------------|---------------------------------------------| +|bigstring.disabled| true/false | Disables transparent big string compression | \ No newline at end of file diff --git a/run/env/service.env b/run/env/service.env index ac745577..5553f603 100644 --- a/run/env/service.env +++ b/run/env/service.env @@ -1,4 +1,4 @@ WMSA_HOME=run/ CONTROL_SERVICE_OPTS="-DdistPath=/dist" CONVERTER_OPTS="-ea -Xmx16G -XX:-CompactStrings -XX:+UseParallelGC -XX:GCTimeRatio=14 -XX:ParallelGCThreads=15" -CRAWLER_OPTS="-Xmx16G -XX:+UseParallelGC -XX:GCTimeRatio=14 -XX:ParallelGCThreads=15" \ No newline at end of file +CRAWLER_OPTS="-Dbigstring.disabled=true -Xmx16G -XX:+UseParallelGC -XX:GCTimeRatio=14 -XX:ParallelGCThreads=15" \ No newline at end of file From d6b07e4d01d756331936b00552363312cde7e831 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 21 Jul 2023 19:56:16 +0200 Subject: [PATCH 070/157] (controller) Improve the storage interface --- code/libraries/big-string/readme.md | 4 + .../nu/marginalia/control/ControlService.java | 26 +++++- .../model/FileStorageWithRelatedEntries.java | 10 +++ .../svc/ControlFileStorageService.java | 71 +++++++++++++--- .../main/resources/static/control/style.css | 13 ++- .../resources/templates/control/index.hdb | 2 +- .../control/partials/storage-table.hdb | 34 ++++++++ .../control/partials/storage-types.hdb | 6 ++ .../templates/control/storage-crawls.hdb | 28 +++++++ .../templates/control/storage-details.hdb | 81 +++++++++++++++++++ .../templates/control/storage-overview.hdb | 54 +++++++++++++ .../templates/control/storage-processed.hdb | 26 ++++++ .../templates/control/storage-specs.hdb | 64 +++++++++++++++ .../resources/templates/control/storage.hdb | 78 
------------------ 14 files changed, 406 insertions(+), 91 deletions(-) create mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithRelatedEntries.java create mode 100644 code/services-satellite/control-service/src/main/resources/templates/control/partials/storage-table.hdb create mode 100644 code/services-satellite/control-service/src/main/resources/templates/control/partials/storage-types.hdb create mode 100644 code/services-satellite/control-service/src/main/resources/templates/control/storage-crawls.hdb create mode 100644 code/services-satellite/control-service/src/main/resources/templates/control/storage-details.hdb create mode 100644 code/services-satellite/control-service/src/main/resources/templates/control/storage-overview.hdb create mode 100644 code/services-satellite/control-service/src/main/resources/templates/control/storage-processed.hdb create mode 100644 code/services-satellite/control-service/src/main/resources/templates/control/storage-specs.hdb delete mode 100644 code/services-satellite/control-service/src/main/resources/templates/control/storage.hdb diff --git a/code/libraries/big-string/readme.md b/code/libraries/big-string/readme.md index 84fab2a2..f03c64ad 100644 --- a/code/libraries/big-string/readme.md +++ b/code/libraries/big-string/readme.md @@ -4,6 +4,10 @@ Microlibrary that offers string compression. This is useful when having to load of HTML documents in memory during conversion. XML has been described as the opposite of a compression scheme, and as a result, HTML compresses ridiculously well. +## Configuration + +If the Java property 'bigstring.disabled' is set to true, the BigString class will not compress strings. 
+ ## Demo ```java diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java index 82869816..eb43f9cb 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java @@ -5,6 +5,8 @@ import com.google.inject.Inject; import nu.marginalia.client.ServiceMonitors; import nu.marginalia.control.model.Actor; import nu.marginalia.control.svc.*; +import nu.marginalia.db.storage.model.FileStorageId; +import nu.marginalia.db.storage.model.FileStorageType; import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.renderer.RendererFactory; import nu.marginalia.service.server.*; @@ -15,6 +17,7 @@ import spark.Response; import spark.Spark; import java.io.IOException; +import java.sql.SQLException; import java.util.Map; public class ControlService extends Service { @@ -53,7 +56,11 @@ public class ControlService extends Service { var serviceByIdRenderer = rendererFactory.renderer("control/service-by-id"); var actorsRenderer = rendererFactory.renderer("control/actors"); var actorDetailsRenderer = rendererFactory.renderer("control/actor-details"); - var storageRenderer = rendererFactory.renderer("control/storage"); + var storageRenderer = rendererFactory.renderer("control/storage-overview"); + var storageSpecsRenderer = rendererFactory.renderer("control/storage-specs"); + var storageCrawlsRenderer = rendererFactory.renderer("control/storage-crawls"); + var storageProcessedRenderer = rendererFactory.renderer("control/storage-processed"); + var storageDetailsRenderer = rendererFactory.renderer("control/storage-details"); this.controlActorService = controlActorService; @@ -74,6 +81,11 @@ public class ControlService extends Service { Spark.get("/public/actors", this::processesModel, 
actorsRenderer::render); Spark.get("/public/actors/:fsm", this::actorDetailsModel, actorDetailsRenderer::render); Spark.get("/public/storage", this::storageModel, storageRenderer::render); + Spark.get("/public/storage/specs", this::storageModelSpecs, storageSpecsRenderer::render); + Spark.get("/public/storage/crawls", this::storageModelCrawls, storageCrawlsRenderer::render); + Spark.get("/public/storage/processed", this::storageModelProcessed, storageProcessedRenderer::render); + Spark.get("/public/storage/:id", this::storageDetailsModel, storageDetailsRenderer::render); + final HtmlRedirect redirectToServices = new HtmlRedirect("/services"); final HtmlRedirect redirectToProcesses = new HtmlRedirect("/actors"); @@ -118,6 +130,18 @@ public class ControlService extends Service { return Map.of("storage", controlFileStorageService.getStorageList()); } + private Object storageDetailsModel(Request request, Response response) throws SQLException { + return Map.of("storage", controlFileStorageService.getFileStorageWithRelatedEntries(FileStorageId.parse(request.params("id")))); + } + private Object storageModelSpecs(Request request, Response response) { + return Map.of("storage", controlFileStorageService.getStorageList(FileStorageType.CRAWL_SPEC)); + } + private Object storageModelCrawls(Request request, Response response) { + return Map.of("storage", controlFileStorageService.getStorageList(FileStorageType.CRAWL_DATA)); + } + private Object storageModelProcessed(Request request, Response response) { + return Map.of("storage", controlFileStorageService.getStorageList(FileStorageType.PROCESSED_DATA)); + } private Object servicesModel(Request request, Response response) { return Map.of("services", heartbeatService.getServiceHeartbeats(), "events", eventLogService.getLastEntries(20)); diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithRelatedEntries.java 
b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithRelatedEntries.java new file mode 100644 index 00000000..28afba5d --- /dev/null +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithRelatedEntries.java @@ -0,0 +1,10 @@ +package nu.marginalia.control.model; + +import nu.marginalia.db.storage.model.FileStorage; +import nu.marginalia.db.storage.model.FileStorageType; + +import java.util.List; + +public record FileStorageWithRelatedEntries(FileStorageWithActions self, List related) { + +} diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlFileStorageService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlFileStorageService.java index 982c42e0..db122a7c 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlFileStorageService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlFileStorageService.java @@ -4,15 +4,9 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; -import nu.marginalia.control.model.FileStorageBaseWithStorage; -import nu.marginalia.control.model.FileStorageWithActions; -import nu.marginalia.control.model.ProcessHeartbeat; -import nu.marginalia.control.model.ServiceHeartbeat; +import nu.marginalia.control.model.*; import nu.marginalia.db.storage.FileStorageService; -import nu.marginalia.db.storage.model.FileStorage; -import nu.marginalia.db.storage.model.FileStorageBase; -import nu.marginalia.db.storage.model.FileStorageBaseId; -import nu.marginalia.db.storage.model.FileStorageId; +import nu.marginalia.db.storage.model.*; import spark.Request; import spark.Response; @@ -49,9 +43,17 @@ public class ControlFileStorageService { @SneakyThrows public List getStorageList() { - Map 
fileStorageBaseByBaseId = new HashMap<>(); - Map> fileStoragByBaseId = new HashMap<>(); + var storageIds = getFileStorageIds(); + return makeFileStorageBaseWithStorage(storageIds); + } + @SneakyThrows + public List getStorageList(FileStorageType type) { + var storageIds = getFileStorageIds(type); + return makeFileStorageBaseWithStorage(storageIds); + } + + private List getFileStorageIds() throws SQLException { List storageIds = new ArrayList<>(); try (var conn = dataSource.getConnection(); @@ -62,6 +64,29 @@ public class ControlFileStorageService { } } + return storageIds; + } + + private List getFileStorageIds(FileStorageType type) throws SQLException { + List storageIds = new ArrayList<>(); + + try (var conn = dataSource.getConnection(); + var storageByIdStmt = conn.prepareStatement("SELECT ID FROM FILE_STORAGE WHERE TYPE = ?")) { + storageByIdStmt.setString(1, type.name()); + var rs = storageByIdStmt.executeQuery(); + while (rs.next()) { + storageIds.add(new FileStorageId(rs.getLong("ID"))); + } + } + + return storageIds; + } + + private List makeFileStorageBaseWithStorage(List storageIds) throws SQLException { + + Map fileStorageBaseByBaseId = new HashMap<>(); + Map> fileStoragByBaseId = new HashMap<>(); + for (var id : storageIds) { var storage = fileStorageService.getStorage(id); fileStorageBaseByBaseId.computeIfAbsent(storage.base().id(), k -> storage.base()); @@ -79,5 +104,31 @@ public class ControlFileStorageService { return result; } + public FileStorageWithRelatedEntries getFileStorageWithRelatedEntries(FileStorageId id) throws SQLException { + var storage = fileStorageService.getStorage(id); + var related = getRelatedEntries(id); + return new FileStorageWithRelatedEntries(new FileStorageWithActions(storage), related); + } + private List getRelatedEntries(FileStorageId id) { + List ret = new ArrayList<>(); + try (var conn = dataSource.getConnection(); + var relatedIds = conn.prepareStatement(""" + (SELECT SOURCE_ID AS ID FROM FILE_STORAGE_RELATION WHERE 
TARGET_ID = ?) + UNION + (SELECT TARGET_ID AS ID FROM FILE_STORAGE_RELATION WHERE SOURCE_ID = ?) + """)) + { + + relatedIds.setLong(1, id.id()); + relatedIds.setLong(2, id.id()); + var rs = relatedIds.executeQuery(); + while (rs.next()) { + ret.add(fileStorageService.getStorage(new FileStorageId(rs.getLong("ID")))); + } + } catch (SQLException throwables) { + throwables.printStackTrace(); + } + return ret; + } } diff --git a/code/services-satellite/control-service/src/main/resources/static/control/style.css b/code/services-satellite/control-service/src/main/resources/static/control/style.css index e4be767f..4056c91e 100644 --- a/code/services-satellite/control-service/src/main/resources/static/control/style.css +++ b/code/services-satellite/control-service/src/main/resources/static/control/style.css @@ -8,6 +8,17 @@ body { grid-template-areas: "left right"; } +section nav.tabs > a { + color: #000; + text-decoration: none; + background-color: #ccc; + padding: 0.5ch; + border-radius: .5ch; +} +section nav.tabs a.selected { + background-color: #eee; +} + .toggle-switch-off { border-left: 5px solid #f00; width: 8ch; @@ -37,7 +48,7 @@ table { } th { text-align: left; } td,th { padding-right: 1ch; border: 1px solid #ccc; } -tr:nth-last-of-type(2n) { +tr:nth-of-type(2n) { background-color: #eee; } body > nav { diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/index.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/index.hdb index b1034529..5e72a451 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/index.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/index.hdb @@ -11,5 +11,5 @@

    Overview

    - + diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/partials/storage-table.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/partials/storage-table.hdb new file mode 100644 index 00000000..9be012e5 --- /dev/null +++ b/code/services-satellite/control-service/src/main/resources/templates/control/partials/storage-table.hdb @@ -0,0 +1,34 @@ + + {{#each storage}} + + + + + + + + + + + + + + + + + + + + + {{#each storage}} + + + + + + + {{/each}} + {{/each}} +
    TypeNamePathMust CleanPermit Temp
    {{base.type}}{{base.name}}{{base.path}}{{base.mustClean}}{{base.permitTemp}}
    TypePathDescription
    + Info + {{storage.type}}{{storage.path}}{{storage.description}}
    \ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/partials/storage-types.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/partials/storage-types.hdb new file mode 100644 index 00000000..575797f9 --- /dev/null +++ b/code/services-satellite/control-service/src/main/resources/templates/control/partials/storage-types.hdb @@ -0,0 +1,6 @@ + \ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/storage-crawls.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/storage-crawls.hdb new file mode 100644 index 00000000..627072a3 --- /dev/null +++ b/code/services-satellite/control-service/src/main/resources/templates/control/storage-crawls.hdb @@ -0,0 +1,28 @@ + + + + Control Service + + + + + {{> control/partials/nav}} +
    + {{> control/partials/storage-types}} +

    Crawl Data

    + {{> control/partials/storage-table}} + +

    About

    +

    Crawl data is the content of websites that have been downloaded by the crawler.

    +

    Crawl data can be turned into processed data, and loaded into the index to make + it searchable.

    +
    + + + + + \ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/storage-details.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/storage-details.hdb new file mode 100644 index 00000000..9038d510 --- /dev/null +++ b/code/services-satellite/control-service/src/main/resources/templates/control/storage-details.hdb @@ -0,0 +1,81 @@ + + + + Control Service + + + + + {{> control/partials/nav}} +
    + {{> control/partials/storage-types}} +

    Storage Details

    + {{#with storage.self.storage}} + + + + + + + + + + + +
    TypePathDetails
    {{type}}{{path}}{{description}}
    + {{/with}} +

    Actions

    + {{#with storage.self}} + {{#if isCrawlable}} + + Perform a full re-crawl of this data:
    + + {{/if}} + {{#if isLoadable}} +
    + Load this data into index:
    +
    + {{/if}} + {{#if isConvertible}} +
    + Process and load this data into index:
    +
    + {{/if}} + {{#if isRecrawlable}} +
    + Perform a re-crawl of this data:
    +
    + {{/if}} + {{#if isDeletable}} +
    + Delete this data:
    +
    + {{/if}} + {{/with}} + {{#if storage.related}} +

    Related

    + + + + + + + {{#each storage.related}} + + + + + + {{/each}} +
    TypePathDetails
    {{type}}{{path}}{{description}}
    + {{/if}} +
    + + + + + \ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/storage-overview.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/storage-overview.hdb new file mode 100644 index 00000000..7d978fb9 --- /dev/null +++ b/code/services-satellite/control-service/src/main/resources/templates/control/storage-overview.hdb @@ -0,0 +1,54 @@ + + + + Control Service + + + + + {{> control/partials/nav}} +
    + {{> control/partials/storage-types}} +

    Storage

    + + {{#each storage}} + + + + + + + + + + + + + + + + + + + + + {{#each storage}} + + + + + + + {{/each}} + {{/each}} +
    TypeNamePathMust CleanPermit Temp
    {{base.type}}{{base.name}}{{base.path}}{{base.mustClean}}{{base.permitTemp}}
    TypePathDescription
    + {{storage.type}}{{storage.path}}{{storage.description}}
    +
    + + + + \ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/storage-processed.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/storage-processed.hdb new file mode 100644 index 00000000..9a0da6c7 --- /dev/null +++ b/code/services-satellite/control-service/src/main/resources/templates/control/storage-processed.hdb @@ -0,0 +1,26 @@ + + + + Control Service + + + + + {{> control/partials/nav}} +
    + {{> control/partials/storage-types}} +

    Processed Data

    + {{> control/partials/storage-table}} + +

    About

    +

    Processed data is crawl data that has been analyzed, and had its keywords extracted, + and is ready to be loaded into the index.

    +
    + + + + \ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/storage-specs.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/storage-specs.hdb new file mode 100644 index 00000000..c1e64963 --- /dev/null +++ b/code/services-satellite/control-service/src/main/resources/templates/control/storage-specs.hdb @@ -0,0 +1,64 @@ + + + + Control Service + + + + + {{> control/partials/nav}} +
    + {{> control/partials/storage-types}} + +

    Crawl Specifications

    + {{> control/partials/storage-table}} + +

    About

    + +

    Crawling specifications are a work order for the crawler, in essence a list of domains that are to be crawled, + combined with a list of known URLs for each domain, and instructions on how deep to crawl. The crawler requires + a specification in order to understand what to do. +

    +

    + A crawling specification can either be generated from the links in the database, or from a list of domains + provided via a URL that links to a text file. +

    +

    Create New Specification

    + +

    To create a new specification fill out the form below.

    +
    +
    +
    +
    +

    (This is how you'll be able to find the + specification later so give it a good and descriptive name)

    + +

    Source

    +
    +
    + +
    + +
    +
    + +
    + + + + \ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/storage.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/storage.hdb deleted file mode 100644 index 7f748489..00000000 --- a/code/services-satellite/control-service/src/main/resources/templates/control/storage.hdb +++ /dev/null @@ -1,78 +0,0 @@ - - - - Control Service - - - - - {{> control/partials/nav}} -
    -

    Storage

    - - {{#each storage}} - - - - - - - - - - - - - - - - - - - - - {{#each storage}} - - - - - - - {{/each}} - {{/each}} -
    TypeNamePathMust CleanPermit Temp
    {{base.type}}{{base.name}}{{base.path}}{{base.mustClean}}{{base.permitTemp}}
    TypePathDescription
    - {{#if isCrawlable}} -
    - -
    - {{/if}} - {{#if isLoadable}} -
    - -
    - {{/if}} - {{#if isConvertible}} -
    - -
    - {{/if}} - {{#if isRecrawlable}} -
    - -
    - {{/if}} - {{#if isDeletable}} -
    - -
    - {{/if}} -
    {{storage.type}}{{storage.path}}{{storage.description}}
    -
    - - - - \ No newline at end of file From e22e65eee466c298d7ef4476b84bac9adc7d2dff Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 22 Jul 2023 14:20:52 +0200 Subject: [PATCH 071/157] (index) Fix bug related to debug print statements --- .../index/svc/SearchTermsService.java | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchTermsService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchTermsService.java index 7e5cad50..944517d6 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchTermsService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchTermsService.java @@ -50,8 +50,8 @@ public class SearchTermsService { } // we don't care if we can't find these: - addEachTerm(excludes, request.searchTermsExclude); - addEachTerm(priority, request.searchTermsPriority); + addEachNonMandatoryTerm(excludes, request.searchTermsExclude); + addEachNonMandatoryTerm(priority, request.searchTermsPriority); return new SearchIndexSearchTerms(includes, excludes, priority, coherences); } @@ -59,11 +59,11 @@ public class SearchTermsService { private boolean addEachTerm(IntList ret, List words) { boolean success = true; - for (var exclude : words) { - var word = lookUpWord(exclude); + for (var word : words) { + var termId = lookUpWord(word); - if (word.isPresent()) { - lookUpWord(exclude).ifPresent(ret::add); + if (termId.isPresent()) { + lookUpWord(word).ifPresent(ret::add); } else { success = false; @@ -72,6 +72,12 @@ public class SearchTermsService { return success; } + private void addEachNonMandatoryTerm(IntList ret, List words) { + for (var word : words) { + ret.add(lexicon.get(word)); + } + } + public OptionalInt lookUpWord(String s) { int ret = lexicon.get(s); From 9e4aa7da7c87ef05489d3d3c6ce3987b218ed65b Mon Sep 17 00:00:00 2001 From: Viktor Lofgren 
Date: Sat, 22 Jul 2023 15:14:21 +0200 Subject: [PATCH 072/157] (crawler) Support for X-Robots-Tag --- .../retreival/fetcher/HttpFetcherImpl.java | 60 ++++++++++++++++++- .../fetcher/HttpFetcherImplTest.java | 36 +++++++++++ 2 files changed, 95 insertions(+), 1 deletion(-) create mode 100644 code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImplTest.java diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java index be6a6a06..025c0aa9 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java @@ -17,6 +17,7 @@ import nu.marginalia.crawl.retreival.logic.ContentTypeLogic; import nu.marginalia.crawl.retreival.logic.ContentTypeParser; import okhttp3.*; import org.apache.commons.io.input.BOMInputStream; +import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -24,7 +25,6 @@ import javax.net.ssl.X509TrustManager; import java.io.IOException; import java.net.SocketTimeoutException; import java.net.URISyntaxException; -import java.net.URL; import java.nio.charset.Charset; import java.nio.charset.IllegalCharsetNameException; import java.nio.charset.StandardCharsets; @@ -273,6 +273,17 @@ public class HttpFetcherImpl implements HttpFetcher { return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CHARSET, ""); } + if (!isXRobotsTagsPermitted(rsp.headers("X-Robots-Tag"), userAgent)) { + return CrawledDocument.builder() + .crawlerStatus(CrawlerDocumentStatus.ROBOTS_TXT.name()) + .crawlerStatusDesc("X-Robots-Tag") + .url(responseUrl.toString()) + .httpStatus(-1) + .timestamp(LocalDateTime.now().toString()) + .headers(rsp.headers().toString()) + 
.build(); + } + var strData = getStringData(data, contentType); var canonical = rsp.header("rel=canonical", ""); @@ -288,6 +299,53 @@ public class HttpFetcherImpl implements HttpFetcher { .build(); } + /** Check X-Robots-Tag header tag to see if we are allowed to index this page. + *

    + * Reference: https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag + * + * @param xRobotsHeaderTags List of X-Robots-Tag values + * @param userAgent User agent string + * @return true if we are allowed to index this page + */ + // Visible for tests + public static boolean isXRobotsTagsPermitted(List xRobotsHeaderTags, String userAgent) { + boolean isPermittedGeneral = true; + boolean isPermittedMarginalia = false; + boolean isForbiddenMarginalia = false; + + for (String header : xRobotsHeaderTags) { + if (header.indexOf(':') >= 0) { + String[] parts = StringUtils.split(header, ":", 2); + + if (parts.length < 2) + continue; + + // Is this relevant to us? + if (!Objects.equals(parts[0].trim(), userAgent)) + continue; + + if (parts[1].contains("noindex")) + isForbiddenMarginalia = true; + else if (parts[1].contains("none")) + isForbiddenMarginalia = true; + else if (parts[1].contains("all")) + isPermittedMarginalia = true; + } + else { + if (header.contains("noindex")) + isPermittedGeneral = false; + if (header.contains("none")) + isPermittedGeneral = false; + } + } + + if (isPermittedMarginalia) + return true; + if (isForbiddenMarginalia) + return false; + return isPermittedGeneral; + } + private String getStringData(byte[] data, ContentType contentType) { Charset charset; try { diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImplTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImplTest.java new file mode 100644 index 00000000..27b55760 --- /dev/null +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImplTest.java @@ -0,0 +1,36 @@ +package nu.marginalia.crawl.retreival.fetcher; + +import org.junit.jupiter.api.Test; + +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +class 
HttpFetcherImplTest { + + @Test + public void testXRobotsTag() { + assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("foo:"), "search.marginalia.nu")); + assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of(":bar"), "search.marginalia.nu")); + assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of(":"), "search.marginalia.nu")); + assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of(""), "search.marginalia.nu")); + + assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("doindex"), "search.marginalia.nu")); + assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("noindex"), "search.marginalia.nu")); + assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("none"), "search.marginalia.nu")); + assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: noindex"), "search.marginalia.nu")); + assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none"), "search.marginalia.nu")); + assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("googlebot: noindex"), "search.marginalia.nu")); + + assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("noindex", "search.marginalia.nu: all"), "search.marginalia.nu")); + assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("none", "search.marginalia.nu: all"), "search.marginalia.nu")); + assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("none", "search.marginalia.nu: none"), "search.marginalia.nu")); + assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("all", "search.marginalia.nu: none"), "search.marginalia.nu")); + assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: all", "noindex"), "search.marginalia.nu")); + assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: all", "none"), "search.marginalia.nu")); + assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none", "none"), "search.marginalia.nu")); + 
assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none", "all"), "search.marginalia.nu")); + } + +} \ No newline at end of file From c069c8c1824f4d55d80451bfc47c58a104b717b3 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 22 Jul 2023 18:36:29 +0200 Subject: [PATCH 073/157] (crawler) Clean up crawl data reference and recrawl logic --- .../crawling/io/CrawledDomainReader.java | 6 +- .../model/spec/CrawlingSpecification.java | 2 - .../nu/marginalia/crawl/CrawlLimiter.java | 11 +- .../java/nu/marginalia/crawl/CrawlerMain.java | 37 +++-- .../crawl/retreival/CrawlDataReference.java | 138 ++++++------------ .../crawl/retreival/CrawlerRetreiver.java | 19 ++- .../retreival/CrawlerMockFetcherTest.java | 6 +- .../retreival/CrawlerRetreiverTest.java | 13 +- .../crawl/CrawlJobSpecWriterTest.java | 6 +- 9 files changed, 94 insertions(+), 144 deletions(-) diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java index abc524ac..c3dddb3c 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java @@ -6,6 +6,7 @@ import lombok.SneakyThrows; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.crawling.model.SerializableCrawlData; +import nu.marginalia.crawling.model.spec.CrawlingSpecification; import nu.marginalia.model.gson.GsonFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -30,7 +31,10 @@ public class CrawledDomainReader { public CrawledDomainReader() { } - public Iterator createIterator(Path path) throws IOException { + public Iterator createIterator(Path basePath, CrawlingSpecification spec) throws IOException { + + final var path = 
CrawlerOutputFile.getOutputFile(basePath, spec.id, spec.domain); + BufferedReader br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(path.toFile())))); return new Iterator<>() { diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/spec/CrawlingSpecification.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/spec/CrawlingSpecification.java index f6001166..718e2d7f 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/spec/CrawlingSpecification.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/spec/CrawlingSpecification.java @@ -18,8 +18,6 @@ public class CrawlingSpecification { public String domain; public List urls; - public CrawledDomain oldData; - @Override public String toString() { return String.format(getClass().getSimpleName() + "[" + id + "/" + domain + ": " + crawlDepth + "[ " + urls.size() + "]"); diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java index 7285b0c5..e987c926 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java @@ -8,22 +8,15 @@ import java.util.concurrent.Semaphore; public class CrawlLimiter { public static final int maxPoolSize = Integer.getInteger("crawler.pool-size", 512); - public record CrawlTaskLimits(Path refreshPath, boolean isRefreshable, int taskSize) {} - private final Semaphore taskSemCount = new Semaphore(maxPoolSize); - public CrawlTaskLimits getTaskLimits(Path fileName) { - return new CrawlTaskLimits(fileName, true, 1); - } - - - public void acquire(CrawlTaskLimits properties) throws InterruptedException { + public void acquire() throws InterruptedException { // It's very important that 
we acquire the RAM semaphore first to avoid a deadlock taskSemCount.acquire(1); } - public void release(CrawlTaskLimits properties) { + public void release() { taskSemCount.release(1); } } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java index 6fafb128..2c7e6e41 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java @@ -6,6 +6,7 @@ import com.google.inject.Inject; import com.google.inject.Injector; import nu.marginalia.UserAgent; import nu.marginalia.WmsaHome; +import nu.marginalia.crawl.retreival.CrawlDataReference; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; import nu.marginalia.crawling.io.CrawledDomainReader; import nu.marginalia.crawling.io.CrawlerOutputFile; @@ -173,49 +174,37 @@ public class CrawlerMain implements AutoCloseable { return; } - var limits = crawlLimiter.getTaskLimits(CrawlerOutputFile.getOutputFile(crawlDataDir, crawlingSpecification)); - try { - crawlLimiter.acquire(limits); + crawlLimiter.acquire(); } catch (InterruptedException e) { throw new RuntimeException(e); } pool.execute(() -> { try { - fetchDomain(crawlingSpecification, limits); + fetchDomain(crawlingSpecification); heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks); } finally { - crawlLimiter.release(limits); + crawlLimiter.release(); } }); } - private void fetchDomain(CrawlingSpecification specification, CrawlLimiter.CrawlTaskLimits limits) { + private void fetchDomain(CrawlingSpecification specification) { if (workLog.isJobFinished(specification.id)) return; HttpFetcher fetcher = new HttpFetcherImpl(userAgent.uaString(), dispatcher, connectionPool); - Iterator iterator; - try { - if (limits.isRefreshable()) { - iterator = reader.createIterator(limits.refreshPath()); - } - else { - 
iterator = Collections.emptyIterator(); - } - } catch (IOException e) { - logger.warn("Failed to read previous crawl data for {}", specification.domain); - iterator = Collections.emptyIterator(); - } try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id)) { var retreiver = new CrawlerRetreiver(fetcher, specification, writer::accept); - int size = retreiver.fetch(iterator); + CrawlDataReference reference = getReference(specification); + + int size = retreiver.fetch(reference); workLog.setJobToFinished(specification.id, writer.getOutputFile().toString(), size); @@ -225,6 +214,16 @@ public class CrawlerMain implements AutoCloseable { } } + private CrawlDataReference getReference(CrawlingSpecification specification) { + try { + var iterator = reader.createIterator(crawlDataDir, specification); + return new CrawlDataReference(iterator); + } catch (IOException e) { + logger.warn("Failed to read previous crawl data for {}", specification.domain); + return new CrawlDataReference(); + } + } + private static class CrawlRequest { private final CrawlPlan plan; private final MqMessage message; diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java index cc827084..4c4a33d8 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java @@ -1,123 +1,73 @@ package nu.marginalia.crawl.retreival; +import com.google.common.hash.HashCode; +import com.google.common.hash.HashFunction; +import com.google.common.hash.Hashing; +import nu.marginalia.bigstring.BigString; import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.crawling.model.CrawledDomain; -import nu.marginalia.model.EdgeUrl; +import 
nu.marginalia.crawling.model.SerializableCrawlData; +import nu.marginalia.lsh.EasyLSH; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.net.URISyntaxException; +import javax.annotation.Nullable; import java.util.*; -import java.util.stream.Collectors; /** A reference to a domain that has been crawled before. */ public class CrawlDataReference { private final Logger logger = LoggerFactory.getLogger(CrawlDataReference.class); - final Map documents; - final Map etags; - final Map lastModified; - final Set previouslyDeadUrls = new HashSet<>(); - CrawlDataReference(CrawledDomain referenceDomain) { + private final Iterator data; + private final HashFunction hashFunction = Hashing.murmur3_128(); - if (referenceDomain == null || referenceDomain.doc == null) { - documents = Collections.emptyMap(); - etags = Collections.emptyMap(); - lastModified = Collections.emptyMap(); - return; - } + public CrawlDataReference(Iterator data) { + this.data = data; + } - documents = new HashMap<>(referenceDomain.doc.size()); - etags = new HashMap<>(referenceDomain.doc.size()); - lastModified = new HashMap<>(referenceDomain.doc.size()); + public CrawlDataReference() { + this(Collections.emptyIterator()); + } - for (var doc : referenceDomain.doc) { - try { - addReference(doc); - } catch (URISyntaxException ex) { - logger.warn("Failed to add reference document {}", doc.url); + @Nullable + public CrawledDocument nextDocument() { + while (data.hasNext()) { + if (data.next() instanceof CrawledDocument doc) { + return doc; } } + return null; } - private void addReference(CrawledDocument doc) throws URISyntaxException { - var url = new EdgeUrl(doc.url); + public boolean isContentSame(CrawledDocument one, CrawledDocument other) { + assert one.documentBody != null; + assert other.documentBody != null; - if (doc.httpStatus == 404) { - previouslyDeadUrls.add(url); - return; - } + final long contentHashOne = contentHash(one.documentBody); + final long contentHashOther = 
contentHash(other.documentBody); - if (doc.httpStatus != 200) { - return; - } + return EasyLSH.hammingDistance(contentHashOne, contentHashOther) < 4; + } - documents.put(url, doc); + private long contentHash(BigString documentBody) { + String content = documentBody.decode(); + EasyLSH hash = new EasyLSH(); + int next = 0; - String headers = doc.headers; - if (headers != null) { - String[] headersLines = headers.split("\n"); - - String lastmod = null; - String etag = null; - - for (String line : headersLines) { - if (line.toLowerCase().startsWith("etag:")) { - etag = line.substring(5).trim(); - } - if (line.toLowerCase().startsWith("last-modified:")) { - lastmod = line.substring(14).trim(); - } - } - - if (lastmod != null) { - lastModified.put(url, lastmod); - } - if (etag != null) { - etags.put(url, etag); + boolean isInTag = false; + for (int i = 0; i < content.length(); i++) { + char c = content.charAt(i); + if (c == '<') { + isInTag = true; + } else if (c == '>') { + isInTag = false; + } else if (!isInTag) { + next = (next << 8) | (byte) c; + hash.addHashUnordered(hashFunction.hashInt(next).asInt()); } } + + return hash.get(); } - public boolean isPreviouslyDead(EdgeUrl url) { - return previouslyDeadUrls.contains(url); - } - public int size() { - return documents.size(); - } - - public String getEtag(EdgeUrl url) { - return etags.get(url); - } - - public String getLastModified(EdgeUrl url) { - return lastModified.get(url); - } - - public Map allDocuments() { - return documents; - } - - - public Map sample(int sampleSize) { - return documents.entrySet().stream().limit(sampleSize).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); - } - - public void evict() { - documents.clear(); - etags.clear(); - lastModified.clear(); - } - - public CrawledDocument getDoc(EdgeUrl top) { - return documents.get(top); - } - - // This bit of manual housekeeping is needed to keep the memory footprint low - public void dispose(EdgeUrl url) { - documents.remove(url); - 
etags.remove(url); - lastModified.remove(url); - } } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index 8091dac8..48587bdb 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -97,10 +97,10 @@ public class CrawlerRetreiver { } public int fetch() { - return fetch(Collections.emptyIterator()); + return fetch(new CrawlDataReference()); } - public int fetch(Iterator oldCrawlData) { + public int fetch(CrawlDataReference oldCrawlData) { final DomainProber.ProbeResult probeResult = domainProber.probeDomain(fetcher, domain, crawlFrontier.peek()); if (probeResult instanceof DomainProber.ProbeResultOk) { @@ -141,7 +141,7 @@ public class CrawlerRetreiver { throw new IllegalStateException("Unknown probe result: " + probeResult); }; - private int crawlDomain(Iterator oldCrawlData) { + private int crawlDomain(CrawlDataReference oldCrawlData) { String ip = findIp(domain); assert !crawlFrontier.isEmpty(); @@ -207,14 +207,18 @@ public class CrawlerRetreiver { return fetchedCount; } - private int recrawl(Iterator oldCrawlData, + private int recrawl(CrawlDataReference oldCrawlData, SimpleRobotRules robotsRules, long crawlDelay) { int recrawled = 0; int retained = 0; - while (oldCrawlData.hasNext()) { - if (!(oldCrawlData.next() instanceof CrawledDocument doc)) continue; + for (;;) { + CrawledDocument doc = oldCrawlData.nextDocument(); + + if (doc == null) { + break; + } // This Shouldn't Happen (TM) var urlMaybe = EdgeUrl.parse(doc.url); @@ -265,6 +269,9 @@ public class CrawlerRetreiver { if (Objects.equals(fetchedDocOpt.get().recrawlState, retainedTag)) { retained ++; } + else if (oldCrawlData.isContentSame(doc, fetchedDocOpt.get())) { + retained ++; + } 
recrawled ++; } diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java index 59e3c45e..ea1dd08e 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java @@ -70,7 +70,7 @@ public class CrawlerMockFetcherTest { registerUrlClasspathData(new EdgeUrl("https://startrek.website/c/startrek"), "mock-crawl-data/lemmy/c_startrek.html"); registerUrlClasspathData(new EdgeUrl("https://startrek.website/post/108995"), "mock-crawl-data/lemmy/108995.html"); - new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 10, "startrek.website", new ArrayList<>(), null), out::add) + new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 10, "startrek.website", new ArrayList<>()), out::add) .withNoDelay() .fetch(); @@ -83,7 +83,7 @@ public class CrawlerMockFetcherTest { registerUrlClasspathData(new EdgeUrl("https://en.wikipedia.org/"), "mock-crawl-data/mediawiki/index.html"); - new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 10, "en.wikipedia.org", new ArrayList<>(), null), out::add) + new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 10, "en.wikipedia.org", new ArrayList<>()), out::add) .withNoDelay() .fetch(); @@ -98,7 +98,7 @@ public class CrawlerMockFetcherTest { registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/telegram-channel-to-idle-on/3501"), "mock-crawl-data/discourse/telegram.html"); registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/combined-mode-but-grid/4489"), "mock-crawl-data/discourse/grid.html"); - new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 100, "community.tt-rss.org", new ArrayList<>(), null), out::add) + new 
CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 100, "community.tt-rss.org", new ArrayList<>()), out::add) .withNoDelay() .fetch(); diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java index 009e9084..2a37707f 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java @@ -2,6 +2,7 @@ package nu.marginalia.crawling.retreival; import lombok.SneakyThrows; import nu.marginalia.WmsaHome; +import nu.marginalia.crawl.retreival.CrawlDataReference; import nu.marginalia.crawl.retreival.CrawlerRetreiver; import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; @@ -109,7 +110,7 @@ class CrawlerRetreiverTest { var specs = CrawlingSpecification .builder() - .id("whatever") + .id("123456") .crawlDepth(12) .domain("www.marginalia.nu") .urls(List.of("https://www.marginalia.nu/some-dead-link")) @@ -117,7 +118,7 @@ class CrawlerRetreiverTest { Path out = Files.createTempDirectory("crawling-process"); - var writer = new CrawledDomainWriter(out, "test", "123456"); + var writer = new CrawledDomainWriter(out, "www.marginalia.nu", "123456"); Map, List> data = new HashMap<>(); new CrawlerRetreiver(httpFetcher, specs, d -> { @@ -130,18 +131,16 @@ class CrawlerRetreiverTest { writer.close(); var reader = new CrawledDomainReader(); - var iter = reader.createIterator(CrawlerOutputFile.getOutputFile(out, "123456", "test")); + var iter = reader.createIterator(out, specs); CrawledDomain domain = (CrawledDomain) data.get(CrawledDomain.class).get(0); domain.doc = data.get(CrawledDocument.class).stream().map(CrawledDocument.class::cast).collect(Collectors.toList()); - var newSpec = 
specs.withOldData(domain); - - new CrawlerRetreiver(httpFetcher, newSpec, d -> { + new CrawlerRetreiver(httpFetcher, specs, d -> { if (d instanceof CrawledDocument doc) { System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus); } - }).fetch(iter); + }).fetch(new CrawlDataReference(iter)); } } \ No newline at end of file diff --git a/code/tools/crawl-job-extractor/src/test/java/nu/marginalia/crawl/CrawlJobSpecWriterTest.java b/code/tools/crawl-job-extractor/src/test/java/nu/marginalia/crawl/CrawlJobSpecWriterTest.java index 7fd5922f..38cfc4fb 100644 --- a/code/tools/crawl-job-extractor/src/test/java/nu/marginalia/crawl/CrawlJobSpecWriterTest.java +++ b/code/tools/crawl-job-extractor/src/test/java/nu/marginalia/crawl/CrawlJobSpecWriterTest.java @@ -31,9 +31,9 @@ public class CrawlJobSpecWriterTest { @Test public void testReadWrite() throws IOException { try (CrawlJobSpecWriter writer = new CrawlJobSpecWriter(tempFile)) { - writer.accept(new CrawlingSpecification("first",1, "test1", List.of("a", "b", "c"), null)); - writer.accept(new CrawlingSpecification("second",1, "test2", List.of("a", "b", "c", "d"), null)); - writer.accept(new CrawlingSpecification("third",1, "test3", List.of("a", "b"), null)); + writer.accept(new CrawlingSpecification("first",1, "test1", List.of("a", "b", "c"))); + writer.accept(new CrawlingSpecification("second",1, "test2", List.of("a", "b", "c", "d"))); + writer.accept(new CrawlingSpecification("third",1, "test3", List.of("a", "b"))); } List outputs = new ArrayList<>(); From 69f333c0bf8bf76da9f7bc1a91be285c6284886a Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 23 Jul 2023 18:59:14 +0200 Subject: [PATCH 074/157] (crawler) Clean up and refactor the code a bit --- .../crawl/retreival/CrawlDataReference.java | 20 +- .../crawl/retreival/CrawlDelayTimer.java | 57 ++++ .../crawl/retreival/CrawlerRetreiver.java | 265 ++++++++---------- .../crawl/retreival/DomainCrawlFrontier.java | 2 +- 
.../retreival/CrawlerMockFetcherTest.java | 3 - .../retreival/CrawlerRetreiverTest.java | 5 +- 6 files changed, 190 insertions(+), 162 deletions(-) create mode 100644 code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java index 4c4a33d8..8f331a65 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java @@ -1,24 +1,19 @@ package nu.marginalia.crawl.retreival; -import com.google.common.hash.HashCode; import com.google.common.hash.HashFunction; import com.google.common.hash.Hashing; import nu.marginalia.bigstring.BigString; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.SerializableCrawlData; import nu.marginalia.lsh.EasyLSH; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import javax.annotation.Nullable; import java.util.*; /** A reference to a domain that has been crawled before. 
*/ public class CrawlDataReference { - private final Logger logger = LoggerFactory.getLogger(CrawlDataReference.class); private final Iterator data; - private final HashFunction hashFunction = Hashing.murmur3_128(); public CrawlDataReference(Iterator data) { this.data = data; @@ -38,7 +33,7 @@ public class CrawlDataReference { return null; } - public boolean isContentSame(CrawledDocument one, CrawledDocument other) { + public boolean isContentBodySame(CrawledDocument one, CrawledDocument other) { assert one.documentBody != null; assert other.documentBody != null; @@ -48,13 +43,15 @@ public class CrawlDataReference { return EasyLSH.hammingDistance(contentHashOne, contentHashOther) < 4; } - private long contentHash(BigString documentBody) { String content = documentBody.decode(); EasyLSH hash = new EasyLSH(); int next = 0; boolean isInTag = false; + + // In a naive best-effort fashion, extract the text + // content of the document and feed it into the LSH for (int i = 0; i < content.length(); i++) { char c = content.charAt(i); if (c == '<') { @@ -62,12 +59,17 @@ public class CrawlDataReference { } else if (c == '>') { isInTag = false; } else if (!isInTag) { - next = (next << 8) | (byte) c; - hash.addHashUnordered(hashFunction.hashInt(next).asInt()); + next = (next << 8) | (c & 0xff); + hash.addHashUnordered(hashInt(next)); } } return hash.get(); } + private final HashFunction hashFunction = Hashing.murmur3_128(); + private int hashInt(int v) { + return hashFunction.hashInt(v).asInt(); + } + } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java new file mode 100644 index 00000000..ca2494dc --- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java @@ -0,0 +1,57 @@ +package nu.marginalia.crawl.retreival; + +import lombok.SneakyThrows; + +import static 
java.lang.Math.max; +import static java.lang.Math.min; + +public class CrawlDelayTimer { + + // When no crawl delay is specified, lean toward twice the fetch+process time, within these limits: + private static final long DEFAULT_CRAWL_DELAY_MIN_MS = Long.getLong("defaultCrawlDelay", 1000); + private static final long DEFAULT_CRAWL_DELAY_MAX_MS = Long.getLong("defaultCrawlDelaySlow", 2500); + + /** Flag to indicate that the crawler should slow down, e.g. from 429s */ + private boolean slowDown = false; + + private final long delayTime; + + public CrawlDelayTimer(long delayTime) { + this.delayTime = delayTime; + } + + @SneakyThrows + public void delay(long spentTime) { + long sleepTime = delayTime; + + if (sleepTime >= 1) { + if (spentTime > sleepTime) + return; + + Thread.sleep(min(sleepTime - spentTime, 5000)); + } + else if (slowDown) { + // Additional delay when the server is signalling it wants slower requests + Thread.sleep( DEFAULT_CRAWL_DELAY_MIN_MS); + } + else { + // When no crawl delay is specified, lean toward twice the fetch+process time, + // within sane limits. This means slower servers get slower crawling, and faster + // servers get faster crawling. 
+ + sleepTime = spentTime * 2; + sleepTime = min(sleepTime, DEFAULT_CRAWL_DELAY_MAX_MS); + sleepTime = max(sleepTime, DEFAULT_CRAWL_DELAY_MIN_MS); + + if (spentTime > sleepTime) + return; + + Thread.sleep(sleepTime - spentTime); + } + } + + /** Increase the delay between requests if the server is signalling it wants slower requests with HTTP 429 */ + public void slowDown() { + slowDown = true; + } +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index 48587bdb..ebdbd4f0 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -11,7 +11,6 @@ import nu.marginalia.crawling.model.spec.CrawlingSpecification; import nu.marginalia.link_parser.LinkParser; import nu.marginalia.crawling.model.*; import nu.marginalia.ip_blocklist.UrlBlocklist; -import nu.marginalia.lsh.EasyLSH; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import org.jsoup.Jsoup; @@ -26,25 +25,12 @@ import java.time.LocalDateTime; import java.util.*; import java.util.function.Consumer; -import static java.lang.Math.max; -import static java.lang.Math.min; - public class CrawlerRetreiver { - private static final long DEFAULT_CRAWL_DELAY_MIN_MS = Long.getLong("defaultCrawlDelay", 1000); - private static final long DEFAULT_CRAWL_DELAY_MAX_MS = Long.getLong("defaultCrawlDelaySlow", 2500); private static final int MAX_ERRORS = 20; private final HttpFetcher fetcher; - - /** Flag to indicate that the crawler should slow down, e.g. 
from 429s */ - private boolean slowDown = false; - - - /** Testing flag to disable crawl delay (otherwise crawler tests take several minutes) */ - private boolean testFlagIgnoreDelay = false; - private final String id; private final String domain; private final Consumer crawledDomainWriter; @@ -61,7 +47,12 @@ public class CrawlerRetreiver { private final DomainCrawlFrontier crawlFrontier; int errorCount = 0; - private String retainedTag = "RETAINED/304"; + + /** recrawlState tag for documents that had a HTTP status 304 */ + private static final String documentWasRetainedTag = "RETAINED/304"; + + /** recrawlState tag for documents that had a 200 status but were identical to a previous version */ + private static final String documentWasSameTag = "SAME-BY-COMPARISON"; public CrawlerRetreiver(HttpFetcher fetcher, CrawlingSpecification specs, @@ -91,11 +82,6 @@ public class CrawlerRetreiver { } } - public CrawlerRetreiver withNoDelay() { - testFlagIgnoreDelay = true; - return this; - } - public int fetch() { return fetch(new CrawlDataReference()); } @@ -146,13 +132,13 @@ public class CrawlerRetreiver { assert !crawlFrontier.isEmpty(); - var robotsRules = fetcher.fetchRobotRules(crawlFrontier.peek().domain); - long crawlDelay = robotsRules.getCrawlDelay(); + final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(crawlFrontier.peek().domain); + final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay()); - sniffRootDocument(); + sniffRootDocument(delayTimer); // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified - int recrawled = recrawl(oldCrawlData, robotsRules, crawlDelay); + int recrawled = recrawl(oldCrawlData, robotsRules, delayTimer); if (recrawled > 0) { // If we have reference data, we will always grow the crawl depth a bit @@ -195,7 +181,7 @@ public class CrawlerRetreiver { continue; - if (fetchDocument(top, null, crawlDelay).isPresent()) { + if (fetchWriteAndSleep(top, delayTimer, 
DocumentWithReference.empty()).isPresent()) { fetchedCount++; } } @@ -207,9 +193,10 @@ public class CrawlerRetreiver { return fetchedCount; } + /** Performs a re-crawl of old documents, comparing etags and last-modified */ private int recrawl(CrawlDataReference oldCrawlData, SimpleRobotRules robotsRules, - long crawlDelay) { + CrawlDelayTimer delayTimer) { int recrawled = 0; int retained = 0; @@ -247,8 +234,6 @@ public class CrawlerRetreiver { && retained > 0.9 * recrawled && Math.random() < 0.75) { - logger.info("Direct-loading {}", url); - // Since it looks like most of these documents haven't changed, // we'll load the documents directly; but we do this in a random // fashion to make sure we eventually catch changes over time @@ -263,15 +248,13 @@ public class CrawlerRetreiver { // providing etag and last-modified headers, so we can recycle the // document if it hasn't changed without actually downloading it - var fetchedDocOpt = fetchDocument(url, doc, crawlDelay); + var fetchedDocOpt = fetchWriteAndSleep(url, + delayTimer, + new DocumentWithReference(doc, oldCrawlData)); if (fetchedDocOpt.isEmpty()) continue; - if (Objects.equals(fetchedDocOpt.get().recrawlState, retainedTag)) { - retained ++; - } - else if (oldCrawlData.isContentSame(doc, fetchedDocOpt.get())) { - retained ++; - } + if (documentWasRetainedTag.equals(fetchedDocOpt.get().recrawlState)) retained ++; + else if (documentWasSameTag.equals(fetchedDocOpt.get().recrawlState)) retained ++; recrawled ++; } @@ -279,18 +262,6 @@ public class CrawlerRetreiver { return recrawled; } - private static final HashFunction hasher = Hashing.murmur3_128(0); - private long hashDoc(CrawledDocument doc) { - var hash = new EasyLSH(); - long val = 0; - for (var b : doc.documentBody.decode().getBytes()) { - val = val << 8 | (b & 0xFF); - hash.addUnordered(hasher.hashLong(val).asLong()); - } - return hash.get(); - } - - private void downloadSitemaps(SimpleRobotRules robotsRules) { List sitemaps = 
robotsRules.getSitemaps(); if (sitemaps.isEmpty()) { @@ -337,13 +308,13 @@ public class CrawlerRetreiver { logger.debug("Queue is now {}", crawlFrontier.queueSize()); } - private void sniffRootDocument() { + private void sniffRootDocument(CrawlDelayTimer delayTimer) { try { logger.debug("Configuring link filter"); var url = crawlFrontier.peek().withPathAndParam("/", null); - var maybeSample = fetchUrl(url, null).filter(sample -> sample.httpStatus == 200); + var maybeSample = fetchUrl(url, delayTimer, DocumentWithReference.empty()).filter(sample -> sample.httpStatus == 200); if (maybeSample.isEmpty()) return; var sample = maybeSample.get(); @@ -379,33 +350,41 @@ public class CrawlerRetreiver { } } - private Optional fetchDocument(EdgeUrl top, - @Nullable CrawledDocument reference, - long crawlDelay) { + private Optional fetchWriteAndSleep(EdgeUrl top, + CrawlDelayTimer timer, + DocumentWithReference reference) { logger.debug("Fetching {}", top); long startTime = System.currentTimeMillis(); - var doc = fetchUrl(top, reference); - if (doc.isPresent()) { - var d = doc.get(); - crawledDomainWriter.accept(d); + var docOpt = fetchUrl(top, timer, reference); - if (d.url != null) { - // We may have redirected to a different path - EdgeUrl.parse(d.url).ifPresent(crawlFrontier::addVisited); + if (docOpt.isPresent()) { + var doc = docOpt.get(); + + if (!Objects.equals(doc.recrawlState, documentWasRetainedTag) + && reference.isContentBodySame(doc)) + { + // The document didn't change since the last time + doc.recrawlState = documentWasSameTag; } - if ("ERROR".equals(d.crawlerStatus) && d.httpStatus != 404) { + crawledDomainWriter.accept(doc); + + if (doc.url != null) { + // We may have redirected to a different path + EdgeUrl.parse(doc.url).ifPresent(crawlFrontier::addVisited); + } + + if ("ERROR".equals(doc.crawlerStatus) && doc.httpStatus != 404) { errorCount++; } } - long crawledTime = System.currentTimeMillis() - startTime; - delay(crawlDelay, crawledTime); + 
timer.delay(System.currentTimeMillis() - startTime); - return doc; + return docOpt; } private boolean isAllowedProtocol(String proto) { @@ -413,35 +392,23 @@ public class CrawlerRetreiver { || proto.equalsIgnoreCase("https"); } - private Optional fetchUrl(EdgeUrl top, @Nullable CrawledDocument reference) { + private Optional fetchUrl(EdgeUrl top, CrawlDelayTimer timer, DocumentWithReference reference) { try { - var contentTags = getContentTags(reference); - var fetchedDoc = fetchContent(top, contentTags); - CrawledDocument doc; + var contentTags = reference.getContentTags(); + var fetchedDoc = tryDownload(top, timer, contentTags); - // HTTP status 304 is NOT MODIFIED, which means the document is the same as it was when - // we fetched it last time. We can recycle the reference document. - if (reference != null - && fetchedDoc.httpStatus == 304) - { - doc = reference; - doc.recrawlState = retainedTag; - doc.timestamp = LocalDateTime.now().toString(); - } - else { - doc = fetchedDoc; - } + CrawledDocument doc = reference.replaceOn304(fetchedDoc); if (doc.documentBody != null) { var decoded = doc.documentBody.decode(); doc.documentBodyHash = createHash(decoded); - Optional parsedDoc = parseDoc(decoded); + var parsedDoc = Jsoup.parse(decoded); EdgeUrl url = new EdgeUrl(doc.url); - parsedDoc.ifPresent(parsed -> findLinks(url, parsed)); - parsedDoc.flatMap(parsed -> findCanonicalUrl(url, parsed)) + findLinks(url, parsedDoc); + findCanonicalUrl(url, parsedDoc) .ifPresent(canonicalLink -> doc.canonicalUrl = canonicalLink.toString()); } @@ -455,33 +422,9 @@ public class CrawlerRetreiver { } - private ContentTags getContentTags(@Nullable CrawledDocument reference) { - if (null == reference) - return ContentTags.empty(); - - String headers = reference.headers; - if (headers == null) - return ContentTags.empty(); - - String[] headersLines = headers.split("\n"); - - String lastmod = null; - String etag = null; - - for (String line : headersLines) { - if 
(line.toLowerCase().startsWith("etag:")) { - etag = line.substring(5).trim(); - } - if (line.toLowerCase().startsWith("last-modified:")) { - lastmod = line.substring(14).trim(); - } - } - - return new ContentTags(etag, lastmod); - } @SneakyThrows - private CrawledDocument fetchContent(EdgeUrl top, ContentTags tags) { + private CrawledDocument tryDownload(EdgeUrl top, CrawlDelayTimer timer, ContentTags tags) { for (int i = 0; i < 2; i++) { try { var doc = fetcher.fetchContent(top, tags); @@ -489,7 +432,8 @@ public class CrawlerRetreiver { return doc; } catch (RateLimitException ex) { - slowDown = true; + timer.slowDown(); + int delay = ex.retryAfter(); if (delay > 0 && delay < 5000) { Thread.sleep(delay); @@ -504,10 +448,6 @@ public class CrawlerRetreiver { return hashMethod.hashUnencodedChars(documentBodyHash).toString(); } - private Optional parseDoc(String decoded) { - return Optional.of(Jsoup.parse(decoded)); - } - private void findLinks(EdgeUrl baseUrl, Document parsed) { baseUrl = linkParser.getBaseLink(parsed, baseUrl); @@ -547,36 +487,6 @@ public class CrawlerRetreiver { } } - @SneakyThrows - private void delay(long sleepTime, long spentTime) { - if (testFlagIgnoreDelay) - return; - - if (sleepTime >= 1) { - if (spentTime > sleepTime) - return; - - Thread.sleep(min(sleepTime - spentTime, 5000)); - } - else if (slowDown) { - Thread.sleep( 1000); - } - else { - // When no crawl delay is specified, lean toward twice the fetch+process time, - // within sane limits. This means slower servers get slower crawling, and faster - // servers get faster crawling. 
- - sleepTime = spentTime * 2; - sleepTime = min(sleepTime, DEFAULT_CRAWL_DELAY_MAX_MS); - sleepTime = max(sleepTime, DEFAULT_CRAWL_DELAY_MIN_MS); - - if (spentTime > sleepTime) - return; - - Thread.sleep(sleepTime - spentTime); - } - } - private CrawledDocument createRobotsError(EdgeUrl url) { return CrawledDocument.builder() .url(url.toString()) @@ -594,12 +504,71 @@ public class CrawlerRetreiver { .build(); } + private record DocumentWithReference( + @Nullable CrawledDocument doc, + @Nullable CrawlDataReference reference) { - enum CrawlDataComparison { - NO_OLD_DATA, - SMALL_SAMPLE, - CHANGES_FOUND, - NO_CHANGES - }; + private static final DocumentWithReference emptyInstance = new DocumentWithReference(null, null); + public static DocumentWithReference empty() { + return emptyInstance; + } + + public boolean isContentBodySame(CrawledDocument newDoc) { + if (reference == null) + return false; + if (doc == null) + return false; + + return reference.isContentBodySame(doc, newDoc); + } + + private ContentTags getContentTags() { + if (null == doc) + return ContentTags.empty(); + + String headers = doc.headers; + if (headers == null) + return ContentTags.empty(); + + String[] headersLines = headers.split("\n"); + + String lastmod = null; + String etag = null; + + for (String line : headersLines) { + if (line.toLowerCase().startsWith("etag:")) { + etag = line.substring(5).trim(); + } + if (line.toLowerCase().startsWith("last-modified:")) { + lastmod = line.substring(14).trim(); + } + } + + return new ContentTags(etag, lastmod); + } + + public boolean isEmpty() { + return doc == null || reference == null; + } + + /** If the provided document has HTTP status 304, and the reference document is provided, + * return the reference document; otherwise return the provided document. 
+ */ + public CrawledDocument replaceOn304(CrawledDocument fetchedDoc) { + + if (doc == null) + return fetchedDoc; + + // HTTP status 304 is NOT MODIFIED, which means the document is the same as it was when + // we fetched it last time. We can recycle the reference document. + if (fetchedDoc.httpStatus != 304) + return fetchedDoc; + + var ret = doc; + ret.recrawlState = documentWasRetainedTag; + ret.timestamp = LocalDateTime.now().toString(); + return ret; + } + } } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java index 7d5fc214..4b9cc265 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java @@ -80,7 +80,7 @@ public class DomainCrawlFrontier { return; // reduce memory usage by not growing queue huge when crawling large sites - if (queue.size() + visited.size() >= depth + 100) + if (queue.size() + visited.size() >= depth + 1000) return; if (visited.contains(url.toString())) diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java index ea1dd08e..ae8e4679 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java @@ -71,7 +71,6 @@ public class CrawlerMockFetcherTest { registerUrlClasspathData(new EdgeUrl("https://startrek.website/post/108995"), "mock-crawl-data/lemmy/108995.html"); new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 10, "startrek.website", new ArrayList<>()), out::add) 
- .withNoDelay() .fetch(); out.forEach(System.out::println); @@ -84,7 +83,6 @@ public class CrawlerMockFetcherTest { registerUrlClasspathData(new EdgeUrl("https://en.wikipedia.org/"), "mock-crawl-data/mediawiki/index.html"); new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 10, "en.wikipedia.org", new ArrayList<>()), out::add) - .withNoDelay() .fetch(); out.forEach(System.out::println); @@ -99,7 +97,6 @@ public class CrawlerMockFetcherTest { registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/combined-mode-but-grid/4489"), "mock-crawl-data/discourse/grid.html"); new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 100, "community.tt-rss.org", new ArrayList<>()), out::add) - .withNoDelay() .fetch(); out.forEach(System.out::println); diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java index 2a37707f..bb4dd6f4 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java @@ -118,13 +118,16 @@ class CrawlerRetreiverTest { Path out = Files.createTempDirectory("crawling-process"); - var writer = new CrawledDomainWriter(out, "www.marginalia.nu", "123456"); + var writer = new CrawledDomainWriter(out, specs.domain, specs.id); Map, List> data = new HashMap<>(); new CrawlerRetreiver(httpFetcher, specs, d -> { data.computeIfAbsent(d.getClass(), k->new ArrayList<>()).add(d); if (d instanceof CrawledDocument doc) { System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus); + if (Math.random() > 0.5) { + doc.headers = ""; + } } writer.accept(d); }).fetch(); From 35b29e4f9e49cdda78b71b57b8942f5fd25217ac Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 23 Jul 2023 19:06:37 
+0200 Subject: [PATCH 075/157] (crawler) Clean up and refactor the code a bit --- .../nu/marginalia/crawling/io/CrawledDomainWriter.java | 7 ++++--- .../src/main/java/nu/marginalia/crawl/CrawlerMain.java | 2 +- .../crawling/retreival/CrawlerRetreiverTest.java | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainWriter.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainWriter.java index 83582212..1598428a 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainWriter.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainWriter.java @@ -4,6 +4,7 @@ import com.github.luben.zstd.ZstdOutputStream; import com.google.gson.Gson; import lombok.SneakyThrows; import nu.marginalia.crawling.model.SerializableCrawlData; +import nu.marginalia.crawling.model.spec.CrawlingSpecification; import nu.marginalia.model.gson.GsonFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -25,15 +26,15 @@ public class CrawledDomainWriter implements AutoCloseable { private final Path tmpFile; private final Path outputFile; - public CrawledDomainWriter(Path outputDir, String name, String id) throws IOException { + public CrawledDomainWriter(Path outputDir, CrawlingSpecification spec) throws IOException { this.outputDir = outputDir; if (!Files.isDirectory(outputDir)) { throw new IllegalArgumentException("Output dir " + outputDir + " does not exist"); } - tmpFile = getOutputFile(id, name + "_tmp"); - outputFile = getOutputFile(id, name); + tmpFile = getOutputFile(spec.id, spec.domain + "_tmp"); + outputFile = getOutputFile(spec.id, spec.domain); writer = new OutputStreamWriter(new ZstdOutputStream(new BufferedOutputStream(Files.newOutputStream(tmpFile, StandardOpenOption.CREATE, StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING)))); } 
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java index 2c7e6e41..f633a294 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java @@ -199,7 +199,7 @@ public class CrawlerMain implements AutoCloseable { HttpFetcher fetcher = new HttpFetcherImpl(userAgent.uaString(), dispatcher, connectionPool); - try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id)) { + try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification)) { var retreiver = new CrawlerRetreiver(fetcher, specification, writer::accept); CrawlDataReference reference = getReference(specification); diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java index bb4dd6f4..96f475a9 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java @@ -118,7 +118,7 @@ class CrawlerRetreiverTest { Path out = Files.createTempDirectory("crawling-process"); - var writer = new CrawledDomainWriter(out, specs.domain, specs.id); + var writer = new CrawledDomainWriter(out, specs); Map, List> data = new HashMap<>(); new CrawlerRetreiver(httpFetcher, specs, d -> { From 789e8eea8544fe8b545af4c02b3555ca2f0fd4a4 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 23 Jul 2023 19:08:38 +0200 Subject: [PATCH 076/157] (crawler) Clean up and refactor the code a bit --- .../crawling/io/CrawledDomainWriter.java | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 
deletions(-) diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainWriter.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainWriter.java index 1598428a..f431538c 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainWriter.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainWriter.java @@ -6,8 +6,6 @@ import lombok.SneakyThrows; import nu.marginalia.crawling.model.SerializableCrawlData; import nu.marginalia.crawling.model.spec.CrawlingSpecification; import nu.marginalia.model.gson.GsonFactory; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import java.io.BufferedOutputStream; import java.io.IOException; @@ -21,10 +19,9 @@ import java.nio.file.StandardOpenOption; public class CrawledDomainWriter implements AutoCloseable { private final Path outputDir; private final Gson gson = GsonFactory.get(); - private static final Logger logger = LoggerFactory.getLogger(CrawledDomainWriter.class); private final Writer writer; private final Path tmpFile; - private final Path outputFile; + private final Path actualFile; public CrawledDomainWriter(Path outputDir, CrawlingSpecification spec) throws IOException { this.outputDir = outputDir; @@ -33,14 +30,19 @@ public class CrawledDomainWriter implements AutoCloseable { throw new IllegalArgumentException("Output dir " + outputDir + " does not exist"); } + + // Do the actual writing to a temporary file first, then move it to the actual file when close() is invoked + // this lets us read the old file and compare its contents while writing the new file. It also guards against + // half-written files if the process is killed. 
+ tmpFile = getOutputFile(spec.id, spec.domain + "_tmp"); - outputFile = getOutputFile(spec.id, spec.domain); + actualFile = getOutputFile(spec.id, spec.domain); writer = new OutputStreamWriter(new ZstdOutputStream(new BufferedOutputStream(Files.newOutputStream(tmpFile, StandardOpenOption.CREATE, StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING)))); } public Path getOutputFile() { - return outputFile; + return actualFile; } @SneakyThrows @@ -57,7 +59,7 @@ public class CrawledDomainWriter implements AutoCloseable { @Override public void close() throws IOException { - Files.move(tmpFile, outputFile, StandardCopyOption.REPLACE_EXISTING); + Files.move(tmpFile, actualFile, StandardCopyOption.REPLACE_EXISTING); writer.close(); } } From bc330acfc914a41acc9a65b2c4713f84b8e5cdf2 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 23 Jul 2023 19:26:16 +0200 Subject: [PATCH 077/157] (control) Better refresh script that doesn't cause weird artifacts --- .../src/main/resources/static/control/refresh.js | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/code/services-satellite/control-service/src/main/resources/static/control/refresh.js b/code/services-satellite/control-service/src/main/resources/static/control/refresh.js index 457bb0e6..0ee10bbf 100644 --- a/code/services-satellite/control-service/src/main/resources/static/control/refresh.js +++ b/code/services-satellite/control-service/src/main/resources/static/control/refresh.js @@ -4,11 +4,19 @@ function refresh(ids) { .then(html => { const parser = new DOMParser(); const newDocument = parser.parseFromString(html, "text/html"); - console.log(newDocument); ids.forEach(id => { const newElement = newDocument.getElementById(id); - document.getElementById(id).innerHTML = newDocument.getElementById(id).innerHTML; + const targetElement = document.getElementById(id); + + if (newElement == null) + return; + if (targetElement == null) + return; + + if 
(!newElement.isEqualNode(targetElement)) { + targetElement.replaceWith(document.importNode(newElement, true)) + } }); }) .catch(error => { From 7470c170b1736dd001cbba5281c9b53aa47f4a8a Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 24 Jul 2023 15:06:57 +0200 Subject: [PATCH 078/157] (minor) EdgeUrl.parse() should deal with null --- .../model/src/main/java/nu/marginalia/model/EdgeUrl.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/code/common/model/src/main/java/nu/marginalia/model/EdgeUrl.java b/code/common/model/src/main/java/nu/marginalia/model/EdgeUrl.java index 19a9eb1b..90181263 100644 --- a/code/common/model/src/main/java/nu/marginalia/model/EdgeUrl.java +++ b/code/common/model/src/main/java/nu/marginalia/model/EdgeUrl.java @@ -5,6 +5,7 @@ import lombok.Getter; import lombok.Setter; import nu.marginalia.util.QueryParams; +import javax.annotation.Nullable; import java.net.MalformedURLException; import java.net.URI; import java.net.URISyntaxException; @@ -33,8 +34,12 @@ public class EdgeUrl { this(new URI(urlencodeFixer(url))); } - public static Optional parse(String url) { + public static Optional parse(@Nullable String url) { try { + if (null == url) { + return Optional.empty(); + } + return Optional.of(new EdgeUrl(url)); } catch (URISyntaxException e) { return Optional.empty(); From a56953c7989968a4582b117c9df54ed11fc99064 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 24 Jul 2023 15:25:09 +0200 Subject: [PATCH 079/157] (converter, WIP) Refactor converter to not have to load everything into RAM. 
--- .../crawling/io/CrawledDomainReader.java | 11 +- .../crawling/model/CrawledDocument.java | 7 +- .../src/main/java/plan/CrawlPlan.java | 32 +++- .../marginalia/converting/ConverterMain.java | 21 +-- .../compiler/DocumentsCompiler.java | 1 - .../converting/compiler/UrlsCompiler.java | 35 ++-- .../converting/model/ProcessedDomain.java | 13 +- .../processor/DocumentProcessor.java | 17 +- .../converting/processor/DomainProcessor.java | 156 ++++++++---------- .../processor/logic/FeatureExtractor.java | 5 +- .../AbstractDocumentProcessorPlugin.java | 8 +- .../plugin/HtmlDocumentProcessorPlugin.java | 7 +- .../PlainTextDocumentProcessorPlugin.java | 5 +- .../converting/ConvertingIntegrationTest.java | 23 ++- ...CrawlingThenConvertingIntegrationTest.java | 6 +- .../crawl/retreival/CrawlDataReference.java | 3 +- .../crawl/retreival/CrawlerRetreiver.java | 8 +- .../retreival/fetcher/HttpFetcherImpl.java | 4 +- .../retreival/CrawlerMockFetcherTest.java | 5 +- .../tools/experiments/AdblockExperiment.java | 2 +- .../experiments/DebugConverterExperiment.java | 2 +- .../SentenceStatisticsExperiment.java | 2 +- .../experiments/SiteStatisticsExperiment.java | 12 +- .../tools/experiments/TopicExperiment.java | 2 +- .../tools/TermFrequencyExtractor.java | 2 +- run/env/service.env | 2 +- 26 files changed, 194 insertions(+), 197 deletions(-) diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java index c3dddb3c..1753f7c9 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java @@ -31,11 +31,9 @@ public class CrawledDomainReader { public CrawledDomainReader() { } - public Iterator createIterator(Path basePath, CrawlingSpecification spec) throws IOException { + public 
Iterator createIterator(Path fullPath) throws IOException { - final var path = CrawlerOutputFile.getOutputFile(basePath, spec.id, spec.domain); - - BufferedReader br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(path.toFile())))); + BufferedReader br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(fullPath.toFile())))); return new Iterator<>() { SerializableCrawlData next; @@ -71,6 +69,11 @@ public class CrawledDomainReader { } }; } + + public Iterator createIterator(Path basePath, CrawlingSpecification spec) throws IOException { + + return createIterator(CrawlerOutputFile.getOutputFile(basePath, spec.id, spec.domain)); + } public CrawledDomain read(Path path) throws IOException { DomainDataAssembler domainData = new DomainDataAssembler(); diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java index 0066ddf2..94d13235 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java @@ -21,7 +21,7 @@ public class CrawledDocument implements SerializableCrawlData { public String crawlerStatusDesc; public String headers; - public BigString documentBody; + public String documentBody; public String documentBodyHash; public String canonicalUrl; @@ -35,9 +35,4 @@ public class CrawledDocument implements SerializableCrawlData { return SERIAL_IDENTIFIER; } - /** Remove all large data from this object to save memory */ - public void dispose() { - documentBody = null; - headers = null; - } } diff --git a/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java b/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java index 655525d6..f1d71f37 100644 --- 
a/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java +++ b/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java @@ -5,22 +5,18 @@ import lombok.NoArgsConstructor; import lombok.ToString; import nu.marginalia.crawling.io.CrawledDomainReader; import nu.marginalia.crawling.model.CrawledDomain; +import nu.marginalia.crawling.model.SerializableCrawlData; import nu.marginalia.crawling.model.spec.CrawlerSpecificationLoader; import nu.marginalia.crawling.model.spec.CrawlingSpecification; import nu.marginalia.process.log.WorkLog; -import nu.marginalia.process.log.WorkLogEntry; -import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.FileNotFoundException; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.util.Iterator; -import java.util.function.Consumer; import java.util.function.Predicate; -import java.util.stream.Stream; import java.util.Optional; @AllArgsConstructor @NoArgsConstructor @ToString @@ -122,4 +118,30 @@ public class CrawlPlan { return reader.readOptionally(path); }); } + + + public Iterable> crawlDataIterable(Predicate idPredicate) { + final CrawledDomainReader reader = new CrawledDomainReader(); + + return WorkLog.iterableMap(crawl.getLogFile(), + entry -> { + if (!idPredicate.test(entry.id())) { + return Optional.empty(); + } + + var path = getCrawledFilePath(entry.path()); + + if (!Files.exists(path)) { + logger.warn("File not found: {}", path); + return Optional.empty(); + } + + try { + return Optional.of(reader.createIterator(path)); + } + catch (IOException ex) { + return Optional.empty(); + } + }); + } } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java index 55c022ba..be617817 100644 --- 
a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java @@ -4,6 +4,7 @@ import com.google.gson.Gson; import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; +import nu.marginalia.crawling.model.SerializableCrawlData; import nu.marginalia.db.storage.FileStorageService; import nu.marginalia.mq.MessageQueueFactory; import nu.marginalia.mq.MqMessage; @@ -23,12 +24,12 @@ import org.slf4j.LoggerFactory; import java.io.IOException; import java.sql.SQLException; +import java.util.Iterator; import java.util.List; import java.util.Optional; import java.util.UUID; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; -import java.util.function.Predicate; import static nu.marginalia.mqapi.ProcessInboxNames.CONVERTER_INBOX; @@ -101,20 +102,14 @@ public class ConverterMain { int totalDomains = plan.countCrawledDomains(); AtomicInteger processedDomains = new AtomicInteger(0); - var pipe = new ParallelPipe("Converter", 16, 4, 2) { + var pipe = new ParallelPipe, ProcessingInstructions>("Converter", 16, 4, 2) { @Override - protected ProcessingInstructions onProcess(CrawledDomain domainData) { - Thread.currentThread().setName("Converter:Processor["+domainData.domain+"] - " + domainData.size()); - try { - var processed = processor.process(domainData); - var compiled = compiler.compile(processed); + protected ProcessingInstructions onProcess(Iterator dataStream) { + var processed = processor.process(dataStream); + var compiled = compiler.compile(processed); - return new ProcessingInstructions(domainData.id, compiled); - } - finally { - Thread.currentThread().setName("Converter:Processor[IDLE]"); - } + return new ProcessingInstructions(processed.id, compiled); } @Override @@ -140,7 +135,7 @@ public class ConverterMain { processedDomains.set(processLog.countFinishedJobs()); 
heartbeat.setProgress(processedDomains.get() / (double) totalDomains); - for (var domain : plan.domainsIterable(id -> !processLog.isJobFinished(id))) + for (var domain : plan.crawlDataIterable(id -> !processLog.isJobFinished(id))) { pipe.accept(domain); } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java index 36b112fa..3849f015 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java @@ -3,7 +3,6 @@ package nu.marginalia.converting.compiler; import nu.marginalia.converting.instruction.Instruction; import nu.marginalia.converting.instruction.instructions.LoadKeywords; import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument; -import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.model.crawl.HtmlFeature; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/UrlsCompiler.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/UrlsCompiler.java index 4d05a35d..d5184cfc 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/UrlsCompiler.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/UrlsCompiler.java @@ -6,6 +6,8 @@ import nu.marginalia.converting.instruction.instructions.LoadUrl; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.util.ArrayList; import java.util.HashSet; @@ -15,30 +17,39 
@@ import java.util.Set; public class UrlsCompiler { private static final int MAX_INTERNAL_LINKS = 25; + private final Logger logger = LoggerFactory.getLogger(getClass()); public void compile(List ret, List documents) { Set seenUrls = new HashSet<>(documents.size()*4); Set seenDomains = new HashSet<>(documents.size()); for (var doc : documents) { + if (doc.url == null) { + logger.warn("Discovered document with null URL"); + continue; + } + seenUrls.add(doc.url); - if (doc.details != null) { + if (doc.details == null) { + continue; + } - for (var url : doc.details.linksExternal) { - if (seenDomains.add(url.domain)) { - seenUrls.add(url); - } + // Add *some* external links; to avoid loading too many and gunking up the database with nonsense, + // only permit this once per external domain per crawled domain + for (var url : doc.details.linksExternal) { + if (seenDomains.add(url.domain)) { + seenUrls.add(url); } + } - if (doc.isOk()) { - // Don't load more than a few from linksInternal, grows too big for no reason - var linksToAdd = new ArrayList<>(doc.details.linksInternal); - if (linksToAdd.size() > MAX_INTERNAL_LINKS) { - linksToAdd.subList(MAX_INTERNAL_LINKS, linksToAdd.size()).clear(); - } - seenUrls.addAll(linksToAdd); + if (doc.isOk()) { + // Don't load more than a few from linksInternal, grows too big for no reason + var linksToAdd = new ArrayList<>(doc.details.linksInternal); + if (linksToAdd.size() > MAX_INTERNAL_LINKS) { + linksToAdd.subList(MAX_INTERNAL_LINKS, linksToAdd.size()).clear(); } + seenUrls.addAll(linksToAdd); } } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDomain.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDomain.java index 95b66a02..e445d5b2 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDomain.java +++ 
b/code/processes/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDomain.java @@ -6,7 +6,6 @@ import nu.marginalia.model.crawl.DomainIndexingState; import java.util.List; import java.util.Optional; -import java.util.OptionalDouble; @ToString public class ProcessedDomain { @@ -16,17 +15,7 @@ public class ProcessedDomain { public DomainIndexingState state; public EdgeDomain redirect; public String ip; - - public OptionalDouble averageQuality() { - if (documents == null) { - return OptionalDouble.empty(); - } - return documents.stream() - .map(ProcessedDocument::quality) - .filter(OptionalDouble::isPresent) - .mapToDouble(OptionalDouble::getAsDouble) - .average(); - } + public String id; public int size() { return Optional.ofNullable(documents).map(List::size).orElse(1); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java index b7ac1767..82e9c5d7 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java @@ -2,7 +2,6 @@ package nu.marginalia.converting.processor; import com.google.inject.Inject; import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.crawling.model.CrawlerDocumentStatus; import nu.marginalia.model.crawl.UrlIndexingState; import nu.marginalia.converting.model.DisqualifiedException; @@ -38,11 +37,14 @@ public class DocumentProcessor { processorPlugins.add(plainTextDocumentProcessorPlugin); } - public ProcessedDocument process(CrawledDocument crawledDocument, CrawledDomain crawledDomain) { + public ProcessedDocument process(CrawledDocument crawledDocument) { ProcessedDocument ret = new ProcessedDocument(); try { - 
processDocument(crawledDocument, crawledDomain, ret); + // We must always provide the URL, even if we don't process the document + ret.url = getDocumentUrl(crawledDocument); + + processDocument(crawledDocument, ret); } catch (DisqualifiedException ex) { ret.state = UrlIndexingState.DISQUALIFIED; @@ -53,13 +55,12 @@ public class DocumentProcessor { ret.state = UrlIndexingState.DISQUALIFIED; ret.stateReason = DisqualifiedException.DisqualificationReason.PROCESSING_EXCEPTION.toString(); logger.info("Failed to convert " + crawledDocument.url, ex); - ex.printStackTrace(); } return ret; } - private void processDocument(CrawledDocument crawledDocument, CrawledDomain crawledDomain, ProcessedDocument ret) throws URISyntaxException, DisqualifiedException { + private void processDocument(CrawledDocument crawledDocument, ProcessedDocument ret) throws URISyntaxException, DisqualifiedException { var crawlerStatus = CrawlerDocumentStatus.valueOf(crawledDocument.crawlerStatus); if (crawlerStatus != CrawlerDocumentStatus.OK) { @@ -74,15 +75,11 @@ public class DocumentProcessor { throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.CONTENT_TYPE); } - - ret.url = getDocumentUrl(crawledDocument); ret.state = crawlerStatusToUrlState(crawledDocument.crawlerStatus, crawledDocument.httpStatus); final var plugin = findPlugin(crawledDocument); - AbstractDocumentProcessorPlugin.DetailsWithWords detailsWithWords = plugin.createDetails(crawledDomain, crawledDocument); - - crawledDocument.dispose(); + AbstractDocumentProcessorPlugin.DetailsWithWords detailsWithWords = plugin.createDetails(crawledDocument); ret.details = detailsWithWords.details(); ret.words = detailsWithWords.words(); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index dcdda943..64682319 100644 --- 
a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -1,18 +1,18 @@ package nu.marginalia.converting.processor; -import com.google.common.base.Strings; import com.google.inject.Inject; +import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.processor.logic.links.LinkGraph; -import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.crawling.model.CrawledDomain; -import nu.marginalia.crawling.model.CrawlerDocumentStatus; -import nu.marginalia.crawling.model.CrawlerDomainStatus; +import nu.marginalia.crawling.model.*; import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import nu.marginalia.converting.processor.logic.links.TopKeywords; import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator; +import nu.marginalia.model.crawl.HtmlFeature; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.util.*; @@ -21,6 +21,8 @@ public class DomainProcessor { private final SiteWords siteWords; private final LshDocumentDeduplicator documentDeduplicator; + private final Logger logger = LoggerFactory.getLogger(getClass()); + @Inject public DomainProcessor(DocumentProcessor documentProcessor, SiteWords siteWords, @@ -30,44 +32,85 @@ public class DomainProcessor { this.documentDeduplicator = documentDeduplicator; } - public ProcessedDomain process(CrawledDomain crawledDomain) { + public ProcessedDomain process(Iterator dataStream) { var ret = new ProcessedDomain(); + List docs = new ArrayList<>(); + boolean cookies = false; + String ip = ""; + while (dataStream.hasNext()) { + var data = dataStream.next(); - ret.domain = new EdgeDomain(crawledDomain.domain); - ret.ip = crawledDomain.ip; + if (data instanceof 
CrawledDomain crawledDomain) { + ret.domain = new EdgeDomain(crawledDomain.domain); + ret.ip = crawledDomain.ip; + ret.id = crawledDomain.id; - if (crawledDomain.redirectDomain != null) { - ret.redirect = new EdgeDomain(crawledDomain.redirectDomain); - } + cookies = Objects.requireNonNullElse(crawledDomain.cookies, Collections.emptyList()).size() > 0; + ip = crawledDomain.ip; - if (crawledDomain.doc != null) { - ret.documents = new ArrayList<>(crawledDomain.doc.size()); - - fixBadCanonicalTags(crawledDomain.doc); - - for (var doc : crawledDomain.doc) { - var processedDoc = documentProcessor.process(doc, crawledDomain); - - if (processedDoc.url != null) { - ret.documents.add(processedDoc); + if (crawledDomain.redirectDomain != null) { + ret.redirect = new EdgeDomain(crawledDomain.redirectDomain); } - + ret.documents = docs; + ret.state = getState(crawledDomain.crawlerStatus); } + else if (data instanceof CrawledDocument doc) { + try { + if (doc.url == null) + continue; + fixBadCanonicalTag(doc); - documentDeduplicator.deduplicate(ret.documents); - - calculateStatistics(ret); - } - else { - ret.documents = Collections.emptyList(); + docs.add(documentProcessor.process(doc)); + } + catch (Exception ex) { + logger.warn("Failed to process " + doc.url, ex); + } + } } - ret.state = getState(crawledDomain.crawlerStatus); + // Add late keywords and features from domain-level information + + List terms = new ArrayList<>(); + terms.add("ip:"+ip); + if (cookies) + terms.add(HtmlFeature.COOKIES.getKeyword()); + + for (var document : ret.documents) { + if (document.details == null) + continue; + + if (cookies) + document.details.features.add(HtmlFeature.COOKIES); + + document.words.addAllSyntheticTerms(terms); + } + + documentDeduplicator.deduplicate(ret.documents); + calculateStatistics(ret); return ret; } + private void fixBadCanonicalTag(CrawledDocument doc) { + // Some sites have a canonical tag that points to a different domain, + // but our loader can not support this, so 
we point these back to the + // original url. + + var canonicalOpt = EdgeUrl.parse(doc.canonicalUrl); + if (canonicalOpt.isEmpty()) return; + + var urlOpt = EdgeUrl.parse(doc.url); + if (urlOpt.isEmpty()) return; + + var urlActual = urlOpt.get(); + var canonicalActual = canonicalOpt.get(); + + if (!Objects.equals(urlActual.domain, canonicalActual.domain)) { + doc.canonicalUrl = doc.url; + } + } + private void calculateStatistics(ProcessedDomain ret) { LinkGraph linkGraph = new LinkGraph(); TopKeywords topKeywords = new TopKeywords(); @@ -91,61 +134,6 @@ public class DomainProcessor { siteWords.flagAdjacentWords(topKeywords, invertedLinkGraph, ret); } - - private void fixBadCanonicalTags(List docs) { - Map> seenCanonicals = new HashMap<>(); - Set seenUrls = new HashSet<>(); - - // Sometimes sites set a blanket canonical link to their root page - // this removes such links from consideration - - for (var document : docs) { - if (!Strings.isNullOrEmpty(document.canonicalUrl) - && !Objects.equals(document.canonicalUrl, document.url)) { - seenCanonicals.computeIfAbsent(document.canonicalUrl, url -> new HashSet<>()).add(document.documentBodyHash); - } - seenUrls.add(document.url); - } - - for (var document : docs) { - if (!Strings.isNullOrEmpty(document.canonicalUrl) - && !Objects.equals(document.canonicalUrl, document.url) - && seenCanonicals.getOrDefault(document.canonicalUrl, Collections.emptySet()).size() > 1) { - - if (seenUrls.add(document.canonicalUrl)) { - document.canonicalUrl = document.url; - } - else { - document.crawlerStatus = CrawlerDocumentStatus.BAD_CANONICAL.name(); - } - } - } - - for (var document : docs) { - if (!Strings.isNullOrEmpty(document.canonicalUrl) - && !Objects.equals(document.canonicalUrl, document.url) - && seenCanonicals.getOrDefault(document.canonicalUrl, Collections.emptySet()).size() > 1) { - document.canonicalUrl = document.url; - } - } - - // Ignore canonical URL if it points to a different domain - // ... 
this confuses the hell out of the loader - for (var document : docs) { - if (Strings.isNullOrEmpty(document.canonicalUrl)) - continue; - - Optional cUrl = EdgeUrl.parse(document.canonicalUrl); - Optional dUrl = EdgeUrl.parse(document.url); - - if (cUrl.isPresent() && dUrl.isPresent() - && !Objects.equals(cUrl.get().domain, dUrl.get().domain)) - { - document.canonicalUrl = document.url; - } - } - } - private DomainIndexingState getState(String crawlerStatus) { return switch (CrawlerDomainStatus.valueOf(crawlerStatus)) { case OK -> DomainIndexingState.ACTIVE; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java index 57a98879..c431e94b 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java @@ -65,7 +65,7 @@ public class FeatureExtractor { this.googleAnwersSpamDetector = googleAnwersSpamDetector; } - public Set getFeatures(CrawledDomain domain, Document doc, DocumentLanguageData dld) { + public Set getFeatures(Document doc, DocumentLanguageData dld) { final Set features = new HashSet<>(); final Elements scriptTags = doc.getElementsByTag("script"); @@ -279,9 +279,6 @@ public class FeatureExtractor { } } - if (!domain.cookies.isEmpty()) - features.add(HtmlFeature.COOKIES); - if (recipeDetector.testP(dld) > 0.5) features.add(HtmlFeature.CATEGORY_FOOD); // these should be mutually exclusive diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java index c49d365f..14fd12ad 100644 --- 
a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java @@ -19,7 +19,7 @@ import java.util.*; public abstract class AbstractDocumentProcessorPlugin { protected LanguageFilter languageFilter = new LanguageFilter(); - public abstract DetailsWithWords createDetails(CrawledDomain crawledDomain, CrawledDocument crawledDocument) throws DisqualifiedException, URISyntaxException; + public abstract DetailsWithWords createDetails(CrawledDocument crawledDocument) throws DisqualifiedException, URISyntaxException; public abstract boolean isApplicable(CrawledDocument doc); protected void checkDocumentLanguage(DocumentLanguageData dld) throws DisqualifiedException { @@ -44,12 +44,6 @@ public abstract class AbstractDocumentProcessorPlugin { tagWords.add(key + ":" + value.toString().toLowerCase()); } - public MetaTagsBuilder addDomainCrawlData(CrawledDomain domain) { - add("ip", domain.ip); - - return this; - } - public MetaTagsBuilder addUrl(EdgeUrl url) { add("proto", url.proto); add("site", url.domain); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java index c2119688..8fb2b801 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java @@ -94,10 +94,10 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin } @Override - public DetailsWithWords createDetails(CrawledDomain crawledDomain, CrawledDocument crawledDocument) + public 
DetailsWithWords createDetails(CrawledDocument crawledDocument) throws DisqualifiedException, URISyntaxException { - String documentBody = crawledDocument.documentBody.decode(); + String documentBody = crawledDocument.documentBody; if (languageFilter.isBlockedUnicodeRange(documentBody)) { throw new DisqualifiedException(DisqualificationReason.LANGUAGE); @@ -141,7 +141,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin throw new DisqualifiedException(DisqualificationReason.QUALITY); } - final Set features = featureExtractor.getFeatures(crawledDomain, doc, dld); + final Set features = featureExtractor.getFeatures(doc, dld); ret.features = features; ret.hashCode = dld.localitySensitiveHashCode(); @@ -159,7 +159,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin ret.generator = generatorParts.type(); var tagWords = new MetaTagsBuilder() - .addDomainCrawlData(crawledDomain) .addPubDate(pubDate) .addUrl(url) .addFeatures(features) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java index e7d0a9a1..1dac05f1 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java @@ -55,10 +55,10 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP } @Override - public DetailsWithWords createDetails(CrawledDomain crawledDomain, CrawledDocument crawledDocument) + public DetailsWithWords createDetails(CrawledDocument crawledDocument) throws DisqualifiedException, URISyntaxException { - String documentBody = crawledDocument.documentBody.decode(); + String 
documentBody = crawledDocument.documentBody; if (languageFilter.isBlockedUnicodeRange(documentBody)) { throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LANGUAGE); @@ -97,7 +97,6 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url); var tagWords = new MetaTagsBuilder() - .addDomainCrawlData(crawledDomain) .addPubDate(pubDate) .addUrl(url) .addFeatures(ret.features) diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java index 67aa5299..8cf3a397 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java @@ -9,6 +9,7 @@ import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.processor.DomainProcessor; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDomain; +import nu.marginalia.crawling.model.SerializableCrawlData; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.model.crawl.PubDate; @@ -40,18 +41,17 @@ public class ConvertingIntegrationTest { public void testEmptyDomain() { var docs = new ArrayList(); - var ret = domainProcessor.process( - new CrawledDomain("123", "memex.marginalia.nu", null, "OK", "-", "127.0.0.1", - docs, Collections.emptyList())); + var domain = new CrawledDomain("123", "memex.marginalia.nu", null, "OK", "-", "127.0.0.1", + docs, Collections.emptyList()); + var ret = domainProcessor.process(asSerializableCrawlData(domain)); assertEquals(ret.state, DomainIndexingState.ACTIVE); assertEquals(ret.domain, new EdgeDomain("memex.marginalia.nu")); 
assertTrue(ret.documents.isEmpty()); } - @Test public void testMemexMarginaliaNuDateInternalConsistency() throws IOException { - var ret = domainProcessor.process(readMarginaliaWorkingSet()); + var ret = domainProcessor.process(asSerializableCrawlData(readMarginaliaWorkingSet())); ret.documents.stream().filter(ProcessedDocument::isProcessedFully).forEach(doc -> { int year = PubDate.fromYearByte(doc.details.metadata.year()); Integer yearMeta = doc.details.pubYear; @@ -64,7 +64,7 @@ public class ConvertingIntegrationTest { @Test public void testMemexMarginaliaNu() throws IOException { - var ret = domainProcessor.process(readMarginaliaWorkingSet()); + var ret = domainProcessor.process(asSerializableCrawlData(readMarginaliaWorkingSet())); assertEquals(ret.state, DomainIndexingState.ACTIVE); assertEquals(ret.domain, new EdgeDomain("memex.marginalia.nu")); @@ -110,7 +110,7 @@ public class ConvertingIntegrationTest { "OK", "", "", - BigString.encode(readClassPathFile(p.toString())), + readClassPathFile(p.toString()), Double.toString(Math.random()), "https://memex.marginalia.nu/" + file, null, @@ -133,4 +133,13 @@ public class ConvertingIntegrationTest { return new String(Objects.requireNonNull(ClassLoader.getSystemResourceAsStream(s)).readAllBytes()); } + + private Iterator asSerializableCrawlData(CrawledDomain domain) { + List data = new ArrayList<>(); + if (domain.doc != null) { + data.addAll(domain.doc); + } + data.add(domain); + return data.iterator(); + } } diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java index 890a1081..9a79e9e9 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java +++ 
b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java @@ -55,7 +55,11 @@ public class CrawlingThenConvertingIntegrationTest { CrawledDomain domain = crawl(specs); - var output = domainProcessor.process(domain); + List data = new ArrayList<>(); + data.add(domain); + data.addAll(domain.doc); + + var output = domainProcessor.process(data.iterator()); for (var doc : output.documents) { if (doc.isOk()) { diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java index 8f331a65..adb25752 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java @@ -43,8 +43,7 @@ public class CrawlDataReference { return EasyLSH.hammingDistance(contentHashOne, contentHashOther) < 4; } - private long contentHash(BigString documentBody) { - String content = documentBody.decode(); + private long contentHash(String content) { EasyLSH hash = new EasyLSH(); int next = 0; diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index ebdbd4f0..87251059 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -323,7 +323,7 @@ public class CrawlerRetreiver { return; // Sniff the software based on the sample document - var doc = Jsoup.parse(sample.documentBody.decode()); + var doc = Jsoup.parse(sample.documentBody); crawlFrontier.setLinkFilter(linkFilterSelector.selectFilter(doc)); for (var link : 
doc.getElementsByTag("link")) { @@ -400,11 +400,9 @@ public class CrawlerRetreiver { CrawledDocument doc = reference.replaceOn304(fetchedDoc); if (doc.documentBody != null) { - var decoded = doc.documentBody.decode(); + doc.documentBodyHash = createHash(doc.documentBody); - doc.documentBodyHash = createHash(decoded); - - var parsedDoc = Jsoup.parse(decoded); + var parsedDoc = Jsoup.parse(doc.documentBody); EdgeUrl url = new EdgeUrl(doc.url); findLinks(url, parsedDoc); diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java index 025c0aa9..02cba42c 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java @@ -295,7 +295,7 @@ public class HttpFetcherImpl implements HttpFetcher { .canonicalUrl(canonical) .httpStatus(rsp.code()) .url(responseUrl.toString()) - .documentBody(BigString.encode(strData)) + .documentBody(strData) .build(); } @@ -402,7 +402,7 @@ public class HttpFetcherImpl implements HttpFetcher { private SimpleRobotRules parseRobotsTxt(CrawledDocument doc) { return robotsParser.parseContent(doc.url, - doc.documentBody.decode().getBytes(), + doc.documentBody.getBytes(), doc.contentType, userAgent); } diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java index ae8e4679..fee1d44a 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java @@ -43,13 +43,12 @@ public class 
CrawlerMockFetcherTest { .contentType("text/html") .httpStatus(200) .crawlerStatus(CrawlerDocumentStatus.OK.name()) - .documentBody(BigString.encode(documentData)) + .documentBody(documentData) .build()); } @SneakyThrows private void registerUrlClasspathData(EdgeUrl url, String path) { - var data = BigString.encode(CommonTestData.loadTestData(path)); mockData.put(url, CrawledDocument.builder() .crawlId("1") @@ -57,7 +56,7 @@ public class CrawlerMockFetcherTest { .contentType("text/html") .httpStatus(200) .crawlerStatus(CrawlerDocumentStatus.OK.name()) - .documentBody(data) + .documentBody(CommonTestData.loadTestData(path)) .build()); } diff --git a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/AdblockExperiment.java b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/AdblockExperiment.java index 4228ed6b..da2a9272 100644 --- a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/AdblockExperiment.java +++ b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/AdblockExperiment.java @@ -32,7 +32,7 @@ public class AdblockExperiment extends Experiment { } private void processDocument(CrawledDocument doc) { - Document parsedDocument = Jsoup.parse(doc.documentBody.decode()); + Document parsedDocument = Jsoup.parse(doc.documentBody); if (simulator.hasAds(parsedDocument)) { System.out.println(doc.url); diff --git a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/DebugConverterExperiment.java b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/DebugConverterExperiment.java index 452be709..3a318dc3 100644 --- a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/DebugConverterExperiment.java +++ b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/DebugConverterExperiment.java @@ -26,7 +26,7 @@ public class DebugConverterExperiment extends Experiment { for (var doc : domain.doc) { 
if (doc.documentBody == null) continue; - var parsed = Jsoup.parse(doc.documentBody.decode()); + var parsed = Jsoup.parse(doc.documentBody); var tagExtractor = new BlogSpecialization.BlogTagExtractor(); parsed.traverse(tagExtractor); diff --git a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java index 7bf2f784..44f3cf18 100644 --- a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java +++ b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java @@ -41,7 +41,7 @@ public class SentenceStatisticsExperiment extends Experiment { for (var doc : domain.doc) { if (doc.documentBody == null) continue; - var parsed = Jsoup.parse(doc.documentBody.decode()); + var parsed = Jsoup.parse(doc.documentBody); parsed.body().filter(new DomPruningFilter(0.5)); diff --git a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/SiteStatisticsExperiment.java b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/SiteStatisticsExperiment.java index 3ac38b40..2882d0f2 100644 --- a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/SiteStatisticsExperiment.java +++ b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/SiteStatisticsExperiment.java @@ -31,12 +31,12 @@ public class SiteStatisticsExperiment extends Experiment { @Override public boolean process(CrawledDomain domain) { - var ret = domainProcessor.process(domain); - - ret.documents.stream() - .filter(ProcessedDocument::isProcessedFully) - .sorted(Comparator.comparing(doc -> doc.details.metadata.topology())) - .forEach(doc -> System.out.println(doc.url + ":" + doc.details.metadata)); +// var ret = domainProcessor.process(domain); +// +// ret.documents.stream() +// 
.filter(ProcessedDocument::isProcessedFully) +// .sorted(Comparator.comparing(doc -> doc.details.metadata.topology())) +// .forEach(doc -> System.out.println(doc.url + ":" + doc.details.metadata)); return true; } diff --git a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/TopicExperiment.java b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/TopicExperiment.java index e70df91c..f81bbcd2 100644 --- a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/TopicExperiment.java +++ b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/TopicExperiment.java @@ -37,7 +37,7 @@ public class TopicExperiment extends Experiment { for (var doc : domain.doc) { if (doc.documentBody == null) continue; - var parsed = Jsoup.parse(doc.documentBody.decode()); + var parsed = Jsoup.parse(doc.documentBody); parsed.body().filter(new DomPruningFilter(0.5)); var dld = se.extractSentences(parsed); diff --git a/code/tools/term-frequency-extractor/src/main/java/nu/marginalia/tools/TermFrequencyExtractor.java b/code/tools/term-frequency-extractor/src/main/java/nu/marginalia/tools/TermFrequencyExtractor.java index ece6a507..c5a52dd3 100644 --- a/code/tools/term-frequency-extractor/src/main/java/nu/marginalia/tools/TermFrequencyExtractor.java +++ b/code/tools/term-frequency-extractor/src/main/java/nu/marginalia/tools/TermFrequencyExtractor.java @@ -58,7 +58,7 @@ public class TermFrequencyExtractor { continue; docCount.incrementAndGet(); - Document parsed = Jsoup.parse(doc.documentBody.decode()); + Document parsed = Jsoup.parse(doc.documentBody); parsed.body().filter(new DomPruningFilter(0.5)); DocumentLanguageData dld = se.get().extractSentences(parsed); diff --git a/run/env/service.env b/run/env/service.env index 5553f603..ac745577 100644 --- a/run/env/service.env +++ b/run/env/service.env @@ -1,4 +1,4 @@ WMSA_HOME=run/ CONTROL_SERVICE_OPTS="-DdistPath=/dist" CONVERTER_OPTS="-ea -Xmx16G 
-XX:-CompactStrings -XX:+UseParallelGC -XX:GCTimeRatio=14 -XX:ParallelGCThreads=15" -CRAWLER_OPTS="-Dbigstring.disabled=true -Xmx16G -XX:+UseParallelGC -XX:GCTimeRatio=14 -XX:ParallelGCThreads=15" \ No newline at end of file +CRAWLER_OPTS="-Xmx16G -XX:+UseParallelGC -XX:GCTimeRatio=14 -XX:ParallelGCThreads=15" \ No newline at end of file From 667b0ca0b03feb295cd6c7c27ba98abc948c4977 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 24 Jul 2023 16:28:30 +0200 Subject: [PATCH 080/157] (converter, WIP) Refactor CrawledDomainReader to not return iterators. Instead return a closable class SerializableCrawlDataStream. --- .../crawling/io/CrawledDomainReader.java | 98 ++++++++++--------- .../io/SerializableCrawlDataStream.java | 45 +++++++++ .../src/main/java/plan/CrawlPlan.java | 7 +- .../marginalia/converting/ConverterMain.java | 8 +- .../converting/processor/DomainProcessor.java | 5 +- .../converting/ConvertingIntegrationTest.java | 7 +- ...CrawlingThenConvertingIntegrationTest.java | 3 +- .../java/nu/marginalia/crawl/CrawlerMain.java | 7 +- .../crawl/retreival/CrawlDataReference.java | 22 +++-- .../retreival/CrawlerRetreiverTest.java | 5 +- 10 files changed, 130 insertions(+), 77 deletions(-) create mode 100644 code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/SerializableCrawlDataStream.java diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java index 1753f7c9..76e37acc 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java @@ -2,7 +2,6 @@ package nu.marginalia.crawling.io; import com.github.luben.zstd.ZstdInputStream; import com.google.gson.Gson; -import lombok.SneakyThrows; import 
nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.crawling.model.SerializableCrawlData; @@ -11,13 +10,9 @@ import nu.marginalia.model.gson.GsonFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.BufferedReader; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStreamReader; +import java.io.*; import java.nio.file.Path; import java.util.ArrayList; -import java.util.Iterator; import java.util.List; import java.util.Optional; import java.util.concurrent.ForkJoinPool; @@ -31,48 +26,12 @@ public class CrawledDomainReader { public CrawledDomainReader() { } - public Iterator createIterator(Path fullPath) throws IOException { - - BufferedReader br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(fullPath.toFile())))); - - return new Iterator<>() { - SerializableCrawlData next; - - @Override - @SneakyThrows - public boolean hasNext() { - String identifier = br.readLine(); - if (identifier == null) { - br.close(); - return false; - } - String data = br.readLine(); - if (data == null) { - br.close(); - return false; - } - - if (identifier.equals(CrawledDomain.SERIAL_IDENTIFIER)) { - next = gson.fromJson(data, CrawledDomain.class); - } else if (identifier.equals(CrawledDocument.SERIAL_IDENTIFIER)) { - next = gson.fromJson(data, CrawledDocument.class); - } - else { - throw new IllegalStateException("Unknown identifier: " + identifier); - } - return true; - } - - @Override - public SerializableCrawlData next() { - return next; - } - }; + public SerializableCrawlDataStream createDataStream(Path fullPath) throws IOException { + return new FileReadingSerializableCrawlDataStream(gson, fullPath.toFile()); } - public Iterator createIterator(Path basePath, CrawlingSpecification spec) throws IOException { - - return createIterator(CrawlerOutputFile.getOutputFile(basePath, spec.id, spec.domain)); + public 
SerializableCrawlDataStream createDataStream(Path basePath, CrawlingSpecification spec) throws IOException { + return createDataStream(CrawlerOutputFile.getOutputFile(basePath, spec.id, spec.domain)); } public CrawledDomain read(Path path) throws IOException { @@ -138,4 +97,51 @@ public class CrawledDomainReader { return domainPrototype; } } + + private static class FileReadingSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream { + private final Gson gson; + private final BufferedReader bufferedReader; + private SerializableCrawlData next = null; + + public FileReadingSerializableCrawlDataStream(Gson gson, File file) throws IOException { + this.gson = gson; + bufferedReader = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(file)))); + } + + @Override + public SerializableCrawlData next() throws IOException { + if (hasNext()) { + var ret = next; + next = null; + return ret; + } + throw new IllegalStateException("No more data"); + } + + @Override + public boolean hasNext() throws IOException { + if (next != null) + return true; + + String identifier = bufferedReader.readLine(); + if (identifier == null) return false; + String data = bufferedReader.readLine(); + if (data == null) return false; + + if (identifier.equals(CrawledDomain.SERIAL_IDENTIFIER)) { + next = gson.fromJson(data, CrawledDomain.class); + } else if (identifier.equals(CrawledDocument.SERIAL_IDENTIFIER)) { + next = gson.fromJson(data, CrawledDocument.class); + } + else { + throw new IllegalStateException("Unknown identifier: " + identifier); + } + return true; + } + + @Override + public void close() throws Exception { + bufferedReader.close(); + } + } } diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/SerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/SerializableCrawlDataStream.java new file mode 100644 index 00000000..e68526b1 --- /dev/null 
+++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/SerializableCrawlDataStream.java @@ -0,0 +1,45 @@ +package nu.marginalia.crawling.io; + +import nu.marginalia.crawling.model.SerializableCrawlData; + +import java.io.IOException; +import java.util.Iterator; + +/** Closable iterator over serialized crawl data + * The data may appear in any order, and the iterator must be closed. + * */ +public interface SerializableCrawlDataStream { + static SerializableCrawlDataStream empty() { + return new SerializableCrawlDataStream() { + @Override + public SerializableCrawlData next() throws IOException { + throw new IllegalStateException("No more data"); + } + + @Override + public boolean hasNext() throws IOException { + return false; + } + }; + } + + // for testing + static SerializableCrawlDataStream fromIterator(Iterator iterator) { + return new SerializableCrawlDataStream() { + @Override + public SerializableCrawlData next() throws IOException { + return iterator.next(); + } + + @Override + public boolean hasNext() throws IOException { + return iterator.hasNext(); + } + }; + + } + + SerializableCrawlData next() throws IOException; + + boolean hasNext() throws IOException; +} diff --git a/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java b/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java index f1d71f37..0f6d66ea 100644 --- a/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java +++ b/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java @@ -4,8 +4,8 @@ import lombok.AllArgsConstructor; import lombok.NoArgsConstructor; import lombok.ToString; import nu.marginalia.crawling.io.CrawledDomainReader; +import nu.marginalia.crawling.io.SerializableCrawlDataStream; import nu.marginalia.crawling.model.CrawledDomain; -import nu.marginalia.crawling.model.SerializableCrawlData; import nu.marginalia.crawling.model.spec.CrawlerSpecificationLoader; import 
nu.marginalia.crawling.model.spec.CrawlingSpecification; import nu.marginalia.process.log.WorkLog; @@ -15,7 +15,6 @@ import org.slf4j.LoggerFactory; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import java.util.Iterator; import java.util.function.Predicate; import java.util.Optional; @@ -120,7 +119,7 @@ public class CrawlPlan { } - public Iterable> crawlDataIterable(Predicate idPredicate) { + public Iterable crawlDataIterable(Predicate idPredicate) { final CrawledDomainReader reader = new CrawledDomainReader(); return WorkLog.iterableMap(crawl.getLogFile(), @@ -137,7 +136,7 @@ public class CrawlPlan { } try { - return Optional.of(reader.createIterator(path)); + return Optional.of(reader.createDataStream(path)); } catch (IOException ex) { return Optional.empty(); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java index be617817..f35740ce 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java @@ -4,7 +4,7 @@ import com.google.gson.Gson; import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; -import nu.marginalia.crawling.model.SerializableCrawlData; +import nu.marginalia.crawling.io.SerializableCrawlDataStream; import nu.marginalia.db.storage.FileStorageService; import nu.marginalia.mq.MessageQueueFactory; import nu.marginalia.mq.MqMessage; @@ -17,14 +17,12 @@ import plan.CrawlPlan; import nu.marginalia.converting.compiler.InstructionsCompiler; import nu.marginalia.converting.instruction.Instruction; import nu.marginalia.converting.processor.DomainProcessor; -import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.util.ParallelPipe; import org.slf4j.Logger; import 
org.slf4j.LoggerFactory; import java.io.IOException; import java.sql.SQLException; -import java.util.Iterator; import java.util.List; import java.util.Optional; import java.util.UUID; @@ -102,10 +100,10 @@ public class ConverterMain { int totalDomains = plan.countCrawledDomains(); AtomicInteger processedDomains = new AtomicInteger(0); - var pipe = new ParallelPipe, ProcessingInstructions>("Converter", 16, 4, 2) { + var pipe = new ParallelPipe("Converter", 16, 4, 2) { @Override - protected ProcessingInstructions onProcess(Iterator dataStream) { + protected ProcessingInstructions onProcess(SerializableCrawlDataStream dataStream) { var processed = processor.process(dataStream); var compiled = compiler.compile(processed); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index 64682319..26ade3c6 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -1,8 +1,10 @@ package nu.marginalia.converting.processor; import com.google.inject.Inject; +import lombok.SneakyThrows; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.processor.logic.links.LinkGraph; +import nu.marginalia.crawling.io.SerializableCrawlDataStream; import nu.marginalia.crawling.model.*; import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.converting.model.ProcessedDomain; @@ -32,7 +34,8 @@ public class DomainProcessor { this.documentDeduplicator = documentDeduplicator; } - public ProcessedDomain process(Iterator dataStream) { + @SneakyThrows + public ProcessedDomain process(SerializableCrawlDataStream dataStream) { var ret = new ProcessedDomain(); List docs = new ArrayList<>(); diff --git 
a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java index 8cf3a397..850b6ec2 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java @@ -3,10 +3,10 @@ package nu.marginalia.converting; import com.google.inject.Guice; import com.google.inject.Injector; -import nu.marginalia.bigstring.BigString; import nu.marginalia.converting.model.HtmlStandard; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.processor.DomainProcessor; +import nu.marginalia.crawling.io.SerializableCrawlDataStream; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.crawling.model.SerializableCrawlData; @@ -134,12 +134,13 @@ public class ConvertingIntegrationTest { } - private Iterator asSerializableCrawlData(CrawledDomain domain) { + private SerializableCrawlDataStream asSerializableCrawlData(CrawledDomain domain) { List data = new ArrayList<>(); if (domain.doc != null) { data.addAll(domain.doc); } data.add(domain); - return data.iterator(); + + return SerializableCrawlDataStream.fromIterator(data.iterator()); } } diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java index 9a79e9e9..f4aaf351 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java @@ -8,6 +8,7 @@ import 
nu.marginalia.converting.processor.DomainProcessor; import nu.marginalia.crawl.retreival.CrawlerRetreiver; import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; +import nu.marginalia.crawling.io.SerializableCrawlDataStream; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.crawling.model.SerializableCrawlData; @@ -59,7 +60,7 @@ public class CrawlingThenConvertingIntegrationTest { data.add(domain); data.addAll(domain.doc); - var output = domainProcessor.process(data.iterator()); + var output = domainProcessor.process(SerializableCrawlDataStream.fromIterator(data.iterator())); for (var doc : output.documents) { if (doc.isOk()) { diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java index f633a294..c06e610b 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java @@ -9,9 +9,6 @@ import nu.marginalia.WmsaHome; import nu.marginalia.crawl.retreival.CrawlDataReference; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; import nu.marginalia.crawling.io.CrawledDomainReader; -import nu.marginalia.crawling.io.CrawlerOutputFile; -import nu.marginalia.crawling.model.CrawledDomain; -import nu.marginalia.crawling.model.SerializableCrawlData; import nu.marginalia.db.storage.FileStorageService; import nu.marginalia.mq.MessageQueueFactory; import nu.marginalia.mq.MqMessage; @@ -216,8 +213,8 @@ public class CrawlerMain implements AutoCloseable { private CrawlDataReference getReference(CrawlingSpecification specification) { try { - var iterator = reader.createIterator(crawlDataDir, specification); - return new CrawlDataReference(iterator); + var dataStream = 
reader.createDataStream(crawlDataDir, specification); + return new CrawlDataReference(dataStream); } catch (IOException e) { logger.warn("Failed to read previous crawl data for {}", specification.domain); return new CrawlDataReference(); diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java index adb25752..13d17dfc 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java @@ -2,34 +2,38 @@ package nu.marginalia.crawl.retreival; import com.google.common.hash.HashFunction; import com.google.common.hash.Hashing; -import nu.marginalia.bigstring.BigString; +import nu.marginalia.crawling.io.SerializableCrawlDataStream; import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.crawling.model.SerializableCrawlData; import nu.marginalia.lsh.EasyLSH; import javax.annotation.Nullable; -import java.util.*; +import java.io.IOException; /** A reference to a domain that has been crawled before. 
*/ public class CrawlDataReference { - private final Iterator data; + private final SerializableCrawlDataStream data; - public CrawlDataReference(Iterator data) { + public CrawlDataReference(SerializableCrawlDataStream data) { this.data = data; } public CrawlDataReference() { - this(Collections.emptyIterator()); + this(SerializableCrawlDataStream.empty()); } @Nullable public CrawledDocument nextDocument() { - while (data.hasNext()) { - if (data.next() instanceof CrawledDocument doc) { - return doc; + try { + while (data.hasNext()) { + if (data.next() instanceof CrawledDocument doc) { + return doc; + } } } + catch (IOException ex) { + ex.printStackTrace(); + } return null; } diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java index 96f475a9..48aa39c9 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java @@ -8,7 +8,6 @@ import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; import nu.marginalia.crawling.io.CrawledDomainReader; import nu.marginalia.crawling.io.CrawledDomainWriter; -import nu.marginalia.crawling.io.CrawlerOutputFile; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.crawling.model.spec.CrawlingSpecification; @@ -134,7 +133,7 @@ class CrawlerRetreiverTest { writer.close(); var reader = new CrawledDomainReader(); - var iter = reader.createIterator(out, specs); + var stream = reader.createDataStream(out, specs); CrawledDomain domain = (CrawledDomain) data.get(CrawledDomain.class).get(0); domain.doc = 
data.get(CrawledDocument.class).stream().map(CrawledDocument.class::cast).collect(Collectors.toList()); @@ -143,7 +142,7 @@ class CrawlerRetreiverTest { if (d instanceof CrawledDocument doc) { System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus); } - }).fetch(new CrawlDataReference(iter)); + }).fetch(new CrawlDataReference(stream)); } } \ No newline at end of file From 09fd0a1d0e72849def9a988f50e8343363aa5b6c Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 24 Jul 2023 17:04:42 +0200 Subject: [PATCH 081/157] (converter) Automatically clean stale file storage records if they disappear on disk --- .../db/storage/FileStorageService.java | 40 +++++++++++++++++++ .../db/storage/FileStorageServiceTest.java | 2 +- .../monitor/FileStorageMonitorActor.java | 33 ++++++++++++--- 3 files changed, 69 insertions(+), 6 deletions(-) diff --git a/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageService.java b/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageService.java index 334643b1..2ce1b4d1 100644 --- a/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageService.java +++ b/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageService.java @@ -342,4 +342,44 @@ public class FileStorageService { stmt.executeUpdate(); } } + + public List getEachFileStorage() { + List ret = new ArrayList<>(); + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT PATH, TYPE, DESCRIPTION, ID, BASE_ID + FROM FILE_STORAGE_VIEW + """)) { + + long storageId; + long baseId; + String path; + String description; + FileStorageType type; + + try (var rs = stmt.executeQuery()) { + while (rs.next()) { + baseId = rs.getLong("BASE_ID"); + storageId = rs.getLong("ID"); + path = rs.getString("PATH"); + type = FileStorageType.valueOf(rs.getString("TYPE")); + description = rs.getString("DESCRIPTION"); + + var base = getStorageBase(new FileStorageBaseId(baseId)); + + ret.add(new FileStorage( 
+ new FileStorageId(storageId), + base, + type, + path, + description + )); + } + } + } catch (SQLException e) { + e.printStackTrace(); + } + + return ret; + } } diff --git a/code/common/db/src/test/java/nu/marginalia/db/storage/FileStorageServiceTest.java b/code/common/db/src/test/java/nu/marginalia/db/storage/FileStorageServiceTest.java index cfd1df26..43d99d7d 100644 --- a/code/common/db/src/test/java/nu/marginalia/db/storage/FileStorageServiceTest.java +++ b/code/common/db/src/test/java/nu/marginalia/db/storage/FileStorageServiceTest.java @@ -131,7 +131,7 @@ public class FileStorageServiceTest { var base = storage.createStorageBase(name, createTempDir(), FileStorageBaseType.SLOW, false, false); - var created = storage.allocatePermanentStorage(base, "xyz", FileStorageType.CRAWL_DATA, "thisShouldFail"); + var created = storage.allocatePermanentStorage(base, "xyz", FileStorageType.CRAWL_DATA, "thisShouldSucceed"); tempDirs.add(created.asPath()); var actual = storage.getStorage(created.id()); diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/FileStorageMonitorActor.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/FileStorageMonitorActor.java index dc6dd69d..663fa9d8 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/FileStorageMonitorActor.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/FileStorageMonitorActor.java @@ -15,6 +15,8 @@ import org.slf4j.LoggerFactory; import java.nio.file.Files; import java.nio.file.Path; +import java.sql.SQLException; +import java.util.List; import java.util.Optional; import java.util.concurrent.TimeUnit; @@ -27,6 +29,7 @@ public class FileStorageMonitorActor extends AbstractStateGraph { private static final String INITIAL = "INITIAL"; private static final String MONITOR = "MONITOR"; private static final String PURGE = "PURGE"; + 
private static final String REMOVE_STALE = "REMOVE-STALE"; private static final String END = "END"; private final FileStorageService fileStorageService; @@ -42,7 +45,10 @@ public class FileStorageMonitorActor extends AbstractStateGraph { public void init() { } - @GraphState(name = MONITOR, next = PURGE, resume = ResumeBehavior.RETRY, transitions = { PURGE }, + @GraphState(name = MONITOR, + next = PURGE, + resume = ResumeBehavior.RETRY, + transitions = { PURGE, REMOVE_STALE }, description = """ Monitor the file storage and trigger at transition to PURGE if any file storage area has been marked for deletion. @@ -52,12 +58,17 @@ public class FileStorageMonitorActor extends AbstractStateGraph { for (;;) { Optional toDeleteOpt = fileStorageService.findFileStorageToDelete(); - if (toDeleteOpt.isEmpty()) { - TimeUnit.SECONDS.sleep(10); - } - else { + if (toDeleteOpt.isPresent()) { transition(PURGE, toDeleteOpt.get().id()); } + + List allStorageItems = fileStorageService.getEachFileStorage(); + var missing = allStorageItems.stream().filter(storage -> !Files.exists(storage.asPath())).findAny(); + if (missing.isPresent()) { + transition(REMOVE_STALE, missing.get().id()); + } + + TimeUnit.SECONDS.sleep(10); } } @@ -79,4 +90,16 @@ public class FileStorageMonitorActor extends AbstractStateGraph { fileStorageService.removeFileStorage(storage.id()); } + + @GraphState( + name = REMOVE_STALE, + next = MONITOR, + resume = ResumeBehavior.RETRY, + description = """ + Remove file storage from the database if it doesn't exist on disk. 
+ """ + ) + public void removeStale(FileStorageId id) throws SQLException { + fileStorageService.removeFileStorage(id); + } } From fd44e09ebd03f9fca564c7708dcc435f4d8f745e Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 24 Jul 2023 18:37:35 +0200 Subject: [PATCH 082/157] (loader) Don't delete the entire link database when the loader runs --- .../compiler/InstructionsCompiler.java | 9 +-------- .../nu/marginalia/loading/LoaderMain.java | 19 ------------------- .../loader/SqlLoadProcessedDomain.java | 3 +++ .../loader/SqlLoadProcessedDomainTest.java | 5 +++++ 4 files changed, 9 insertions(+), 27 deletions(-) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/InstructionsCompiler.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/InstructionsCompiler.java index a7076334..71bf7785 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/InstructionsCompiler.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/InstructionsCompiler.java @@ -8,7 +8,6 @@ import nu.marginalia.converting.model.ProcessedDomain; import java.util.ArrayList; import java.util.Collections; import java.util.List; -import java.util.Objects; import static java.util.Objects.requireNonNullElse; @@ -39,6 +38,7 @@ public class InstructionsCompiler { public List compile(ProcessedDomain domain) { List ret = new ArrayList<>(domain.size()*4); + // Guaranteed to always be first ret.add(new LoadProcessedDomain(domain.domain, domain.state, domain.ip)); if (domain.documents != null) { @@ -46,7 +46,6 @@ public class InstructionsCompiler { documentsCompiler.compile(ret, domain.documents); feedsCompiler.compile(ret, domain.documents); - linksCompiler.compile(ret, domain.domain, domain.documents); } if (domain.redirect != null) { @@ -57,10 +56,4 @@ public class InstructionsCompiler { return ret; } - - - - - - } diff --git 
a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java index 5dff9388..fc169461 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java @@ -4,7 +4,6 @@ import com.google.gson.Gson; import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; -import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; import nu.marginalia.db.storage.FileStorageService; import nu.marginalia.loading.loader.IndexLoadKeywords; @@ -62,7 +61,6 @@ public class LoaderMain { @Inject public LoaderMain(ConvertedDomainReader instructionsReader, - HikariDataSource dataSource, LoaderFactory loaderFactory, ProcessHeartbeat heartbeat, MessageQueueFactory messageQueueFactory, @@ -81,27 +79,10 @@ public class LoaderMain { heartbeat.start(); - nukeTables(dataSource); - processorThread = new Thread(this::processor, "Processor Thread"); processorThread.start(); } - private void nukeTables(HikariDataSource dataSource) { - try (var conn = dataSource.getConnection(); - var stmt = conn.createStatement()) { - stmt.execute("SET FOREIGN_KEY_CHECKS = 0"); - stmt.execute("TRUNCATE TABLE EC_PAGE_DATA"); - stmt.execute("TRUNCATE TABLE EC_URL"); - stmt.execute("TRUNCATE TABLE EC_DOMAIN_LINK"); - stmt.execute("TRUNCATE TABLE DOMAIN_METADATA"); - stmt.execute("SET FOREIGN_KEY_CHECKS = 1"); - } - catch (SQLException ex) { - throw new RuntimeException(ex); - } - } - @SneakyThrows public void run(LoadRequest instructions) { var plan = instructions.getPlan(); diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDomain.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDomain.java index 1e1998c7..c06ff84c 100644 --- 
a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDomain.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDomain.java @@ -32,6 +32,9 @@ public class SqlLoadProcessedDomain { IN DID INT, IN IP VARCHAR(48)) BEGIN + DELETE FROM DOMAIN_METADATA WHERE ID=DID; + DELETE FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=DID; + DELETE FROM EC_URL WHERE DOMAIN_ID = DID; UPDATE EC_DOMAIN SET INDEX_DATE=NOW(), STATE=ST, DOMAIN_ALIAS=NULL, INDEXED=GREATEST(INDEXED,IDX), IP=IP WHERE ID=DID; DELETE FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=DID; END diff --git a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDomainTest.java b/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDomainTest.java index df61cf50..b595c1fa 100644 --- a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDomainTest.java +++ b/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDomainTest.java @@ -53,6 +53,11 @@ class SqlLoadProcessedDomainTest { var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource)); loader.load(loaderData, new EdgeDomain("www.marginalia.nu"), DomainIndexingState.BLOCKED, "127.0.0.1"); } + @Test + public void loadProcessedDomainTwice() { + var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource)); + loader.load(loaderData, new EdgeDomain("www.marginalia.nu"), DomainIndexingState.BLOCKED, "127.0.0.1"); + } @Test public void loadProcessedDomaiWithExtremelyLongIP() { From 507f26ad47cf94bb7967451a2fb39ee5d2240aa4 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 25 Jul 2023 20:41:43 +0200 Subject: [PATCH 083/157] (converter) Refactor converter to not keep instructions list in RAM. (converter) Refactor converter to not keep instructions list in RAM. (converter) Refactor converter to not keep instructions list in RAM. 
--- .../java/nu/marginalia/util/ParallelPipe.java | 112 ------------------ .../marginalia/converting/ConverterMain.java | 87 +++++++------- ...ter.java => InstructionWriterFactory.java} | 70 +++++++---- .../compiler/DocumentsCompiler.java | 15 +-- .../compiler/DomainMetadataCompiler.java | 5 +- .../converting/compiler/FeedsCompiler.java | 5 +- .../compiler/InstructionsCompiler.java | 21 ++-- .../converting/compiler/LinksCompiler.java | 5 +- .../converting/compiler/RedirectCompiler.java | 9 +- .../converting/compiler/UrlsCompiler.java | 7 +- .../control/svc/ProcessService.java | 1 + 11 files changed, 126 insertions(+), 211 deletions(-) delete mode 100644 code/common/process/src/main/java/nu/marginalia/util/ParallelPipe.java rename code/processes/converting-process/src/main/java/nu/marginalia/converting/{InstructionWriter.java => InstructionWriterFactory.java} (65%) diff --git a/code/common/process/src/main/java/nu/marginalia/util/ParallelPipe.java b/code/common/process/src/main/java/nu/marginalia/util/ParallelPipe.java deleted file mode 100644 index fc95debe..00000000 --- a/code/common/process/src/main/java/nu/marginalia/util/ParallelPipe.java +++ /dev/null @@ -1,112 +0,0 @@ -package nu.marginalia.util; - -import lombok.SneakyThrows; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.ArrayList; -import java.util.List; -import java.util.concurrent.LinkedBlockingQueue; -import java.util.concurrent.TimeUnit; - -/** Generalization of the workflow
    - * -- single provider thread reading sequentially from disk
    - * -> multiple independent CPU-bound processing tasks
    - * -> single consumer thread writing to network/disk
    - *

    - */ -public abstract class ParallelPipe { - private final LinkedBlockingQueue inputs; - private final LinkedBlockingQueue intermediates; - - private final Logger logger = LoggerFactory.getLogger(getClass()); - - private final List processThreads = new ArrayList<>(); - private final Thread receiverThread; - - private volatile boolean expectingInput = true; - private volatile boolean expectingOutput = true; - - public ParallelPipe(String name, int numberOfThreads, int inputQueueSize, int intermediateQueueSize) { - inputs = new LinkedBlockingQueue<>(inputQueueSize); - intermediates = new LinkedBlockingQueue<>(intermediateQueueSize); - - for (int i = 0; i < numberOfThreads; i++) { - processThreads.add(new Thread(this::runProcessThread, name + "-process["+i+"]")); - } - receiverThread = new Thread(this::runReceiverThread, name + "-receiver"); - - processThreads.forEach(Thread::start); - receiverThread.start(); - } - - public void clearQueues() { - inputs.clear(); - intermediates.clear(); - } - - @SneakyThrows - private void runProcessThread() { - while (expectingInput || !inputs.isEmpty()) { - var in = inputs.poll(10, TimeUnit.SECONDS); - - if (in != null) { - try { - var ret = onProcess(in); - if (ret != null) { - intermediates.put(ret); - } - } - catch (InterruptedException ex) { - throw ex; - } - catch (Exception ex) { - logger.error("Exception", ex); - } - - } - } - - logger.info("Terminating {}", Thread.currentThread().getName()); - } - - @SneakyThrows - private void runReceiverThread() { - while (expectingOutput || !inputs.isEmpty() || !intermediates.isEmpty()) { - var intermediate = intermediates.poll(997, TimeUnit.MILLISECONDS); - if (intermediate != null) { - try { - onReceive(intermediate); - } - catch (Exception ex) { - logger.error("Exception", ex); - } - } - } - - logger.info("Terminating {}", Thread.currentThread().getName()); - } - - /** Begin processing an item */ - @SneakyThrows - public void accept(INPUT input) { - inputs.put(input); - } - - /** 
The meat of the processor thread runtime */ - protected abstract INTERMEDIATE onProcess(INPUT input) throws Exception; - - /** The meat of the consumer thread runtime */ - protected abstract void onReceive(INTERMEDIATE intermediate) throws Exception; - - public void join() throws InterruptedException { - expectingInput = false; - - for (var thread : processThreads) { - thread.join(); - } - - expectingOutput = false; - receiverThread.join(); - } -} diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java index f35740ce..9c8373e1 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java @@ -4,7 +4,7 @@ import com.google.gson.Gson; import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; -import nu.marginalia.crawling.io.SerializableCrawlDataStream; +import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.db.storage.FileStorageService; import nu.marginalia.mq.MessageQueueFactory; import nu.marginalia.mq.MqMessage; @@ -17,7 +17,6 @@ import plan.CrawlPlan; import nu.marginalia.converting.compiler.InstructionsCompiler; import nu.marginalia.converting.instruction.Instruction; import nu.marginalia.converting.processor.DomainProcessor; -import nu.marginalia.util.ParallelPipe; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -26,6 +25,9 @@ import java.sql.SQLException; import java.util.List; import java.util.Optional; import java.util.UUID; +import java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.Semaphore; +import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; @@ -55,7 +57,7 @@ public class ConverterMain { var request 
= converter.fetchInstructions(); try { - converter.load(request); + converter.convert(request); request.ok(); } catch (Exception ex) { @@ -87,58 +89,64 @@ public class ConverterMain { heartbeat.start(); } - - - public void load(ConvertRequest request) throws Exception { + public void convert(ConvertRequest request) throws Exception { var plan = request.getPlan(); + final int maxPoolSize = 16; + try (WorkLog processLog = plan.createProcessWorkLog(); ConversionLog log = new ConversionLog(plan.process.getDir())) { - var instructionWriter = new InstructionWriter(log, plan.process.getDir(), gson); + var instructionWriter = new InstructionWriterFactory(log, plan.process.getDir(), gson); + + Semaphore semaphore = new Semaphore(maxPoolSize); + var pool = new ThreadPoolExecutor( + maxPoolSize/4, + maxPoolSize, + 5, TimeUnit.MINUTES, + new LinkedBlockingQueue<>(8) + ); int totalDomains = plan.countCrawledDomains(); AtomicInteger processedDomains = new AtomicInteger(0); - var pipe = new ParallelPipe("Converter", 16, 4, 2) { - - @Override - protected ProcessingInstructions onProcess(SerializableCrawlDataStream dataStream) { - var processed = processor.process(dataStream); - var compiled = compiler.compile(processed); - - return new ProcessingInstructions(processed.id, compiled); - } - - @Override - protected void onReceive(ProcessingInstructions processedInstructions) throws IOException { - Thread.currentThread().setName("Converter:Receiver["+processedInstructions.id+"]"); - try { - var instructions = processedInstructions.instructions; - instructions.removeIf(Instruction::isNoOp); - - String where = instructionWriter.accept(processedInstructions.id, instructions); - processLog.setJobToFinished(processedInstructions.id, where, instructions.size()); - - heartbeat.setProgress(processedDomains.incrementAndGet() / (double) totalDomains); - } - finally { - Thread.currentThread().setName("Converter:Receiver[IDLE]"); - } - } - - }; - // Advance the progress bar to the current 
position if this is a resumption processedDomains.set(processLog.countFinishedJobs()); heartbeat.setProgress(processedDomains.get() / (double) totalDomains); for (var domain : plan.crawlDataIterable(id -> !processLog.isJobFinished(id))) { - pipe.accept(domain); + semaphore.acquire(); + pool.execute(() -> { + try { + ProcessedDomain processed = processor.process(domain); + + final String where; + final int size; + + try (var writer = instructionWriter.createInstructionsForDomainWriter(processed.id)) { + compiler.compile(processed, writer::accept); + where = writer.getFileName(); + size = writer.getSize(); + } + + processLog.setJobToFinished(processed.id, where, size); + heartbeat.setProgress(processedDomains.incrementAndGet() / (double) totalDomains); + } + catch (IOException ex) { + logger.warn("IO exception in converter", ex); + } + finally { + semaphore.release(); + } + }); } - pipe.join(); + pool.shutdown(); + do { + System.out.println("Waiting for pool to terminate... " + pool.getActiveCount() + " remaining"); + } while (!pool.awaitTermination(60, TimeUnit.SECONDS)); + request.ok(); } catch (Exception e) { @@ -205,7 +213,4 @@ public class ConverterMain { } } - - record ProcessingInstructions(String id, List instructions) {} - } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/InstructionWriter.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/InstructionWriterFactory.java similarity index 65% rename from code/processes/converting-process/src/main/java/nu/marginalia/converting/InstructionWriter.java rename to code/processes/converting-process/src/main/java/nu/marginalia/converting/InstructionWriterFactory.java index 826c41cd..e6009d0e 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/InstructionWriter.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/InstructionWriterFactory.java @@ -15,22 +15,18 @@ import 
nu.marginalia.model.EdgeUrl; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.BufferedOutputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.OutputStreamWriter; +import java.io.*; import java.nio.file.Files; import java.nio.file.Path; -import java.util.List; -public class InstructionWriter { +public class InstructionWriterFactory { - private ConversionLog log; + private final ConversionLog log; private final Path outputDir; private final Gson gson; - private static final Logger logger = LoggerFactory.getLogger(InstructionWriter.class); + private static final Logger logger = LoggerFactory.getLogger(InstructionWriterFactory.class); - public InstructionWriter(ConversionLog log, Path outputDir, Gson gson) { + public InstructionWriterFactory(ConversionLog log, Path outputDir, Gson gson) { this.log = log; this.outputDir = outputDir; this.gson = gson; @@ -40,29 +36,57 @@ public class InstructionWriter { } } - public String accept(String id, List instructionList) throws IOException { + public InstructionWriter createInstructionsForDomainWriter(String id) throws IOException { Path outputFile = getOutputFile(id); + return new InstructionWriter(outputFile); + } - if (Files.exists(outputFile)) { - Files.delete(outputFile); + public class InstructionWriter implements AutoCloseable { + private final OutputStreamWriter outputStream; + private final String where; + private final SummarizingInterpreter summary = new SummarizingInterpreter(); + + private int size = 0; + + + InstructionWriter(Path filename) throws IOException { + where = filename.getFileName().toString(); + Files.deleteIfExists(filename); + outputStream = new OutputStreamWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(filename.toFile())))); } - try (var outputStream = new OutputStreamWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outputFile.toFile()))))) { + public void accept(Instruction instruction) { 
+ if (instruction.isNoOp()) return; - SummarizingInterpreter summary = new SummarizingInterpreter(instructionList); - logger.info("Writing {} - {} - {}", id, instructionList.size(), summary); + instruction.apply(summary); + instruction.apply(log); - for (var instr : instructionList) { - instr.apply(log); + size++; - outputStream.append(instr.tag().name()); + try { + outputStream.append(instruction.tag().name()); outputStream.append(' '); - gson.toJson(instr, outputStream); + gson.toJson(instruction, outputStream); outputStream.append('\n'); } + catch (IOException ex) { + logger.warn("IO exception writing instruction", ex); + } } - return outputFile.getFileName().toString(); + @Override + public void close() throws IOException { + logger.info("Wrote {} - {} - {}", where, size, summary); + outputStream.close(); + } + + public String getFileName() { + return where; + } + + public int getSize() { + return size; + } } private Path getOutputFile(String id) throws IOException { @@ -79,12 +103,6 @@ public class InstructionWriter { private static class SummarizingInterpreter implements Interpreter { - private SummarizingInterpreter(List instructions) { - for (var i : instructions) { - i.apply(this); - } - } - private String domainName; private int ok = 0; private int error = 0; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java index 3849f015..881a1a33 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java @@ -7,34 +7,35 @@ import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.model.crawl.HtmlFeature; import java.util.List; +import java.util.function.Consumer; public class DocumentsCompiler { - public void 
compile(List ret, List documents) { + public void compile(Consumer instructionConsumer, List documents) { for (var doc : documents) { - compileDocumentDetails(ret, doc); + compileDocumentDetails(instructionConsumer, doc); } for (var doc : documents) { - compileWords(ret, doc); + compileWords(instructionConsumer, doc); } } - private void compileDocumentDetails(List ret, ProcessedDocument doc) { + private void compileDocumentDetails(Consumer instructionConsumer, ProcessedDocument doc) { var details = doc.details; if (details != null) { - ret.add(new LoadProcessedDocument(doc.url, doc.state, details.title, details.description, HtmlFeature.encode(details.features), details.standard.name(), details.length, details.hashCode, details.quality, details.pubYear)); + instructionConsumer.accept(new LoadProcessedDocument(doc.url, doc.state, details.title, details.description, HtmlFeature.encode(details.features), details.standard.name(), details.length, details.hashCode, details.quality, details.pubYear)); } } - private void compileWords(List ret, ProcessedDocument doc) { + private void compileWords(Consumer instructionConsumer, ProcessedDocument doc) { var words = doc.words; if (words != null) { - ret.add(new LoadKeywords(doc.url, doc.details.metadata, words.build())); + instructionConsumer.accept(new LoadKeywords(doc.url, doc.details.metadata, words.build())); } } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DomainMetadataCompiler.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DomainMetadataCompiler.java index e80f42eb..74ae5816 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DomainMetadataCompiler.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DomainMetadataCompiler.java @@ -11,11 +11,12 @@ import java.util.HashSet; import java.util.List; import java.util.Optional; import java.util.Set; 
+import java.util.function.Consumer; public class DomainMetadataCompiler { - public void compile(List ret, EdgeDomain domain, @NotNull List documents) { + public void compile(Consumer instructionConsumer, EdgeDomain domain, @NotNull List documents) { int visitedUrls = 0; int goodUrls = 0; @@ -36,7 +37,7 @@ public class DomainMetadataCompiler { .ifPresent(knownUrls::addAll); } - ret.add(new LoadDomainMetadata(domain, knownUrls.size(), goodUrls, visitedUrls)); + instructionConsumer.accept(new LoadDomainMetadata(domain, knownUrls.size(), goodUrls, visitedUrls)); } } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/FeedsCompiler.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/FeedsCompiler.java index 64779a0f..2c111ea2 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/FeedsCompiler.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/FeedsCompiler.java @@ -7,10 +7,11 @@ import nu.marginalia.model.EdgeUrl; import java.util.List; import java.util.Objects; +import java.util.function.Consumer; public class FeedsCompiler { - public void compile(List ret, List documents) { + public void compile(Consumer instructionConsumer, List documents) { EdgeUrl[] feeds = documents.stream().map(doc -> doc.details) .filter(Objects::nonNull) @@ -18,6 +19,6 @@ public class FeedsCompiler { .distinct() .toArray(EdgeUrl[]::new); - ret.add(new LoadRssFeed(feeds)); + instructionConsumer.accept(new LoadRssFeed(feeds)); } } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/InstructionsCompiler.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/InstructionsCompiler.java index 71bf7785..9b32ed8d 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/InstructionsCompiler.java +++ 
b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/InstructionsCompiler.java @@ -8,6 +8,7 @@ import nu.marginalia.converting.model.ProcessedDomain; import java.util.ArrayList; import java.util.Collections; import java.util.List; +import java.util.function.Consumer; import static java.util.Objects.requireNonNullElse; @@ -35,25 +36,21 @@ public class InstructionsCompiler { this.redirectCompiler = redirectCompiler; } - public List compile(ProcessedDomain domain) { - List ret = new ArrayList<>(domain.size()*4); - + public void compile(ProcessedDomain domain, Consumer instructionConsumer) { // Guaranteed to always be first - ret.add(new LoadProcessedDomain(domain.domain, domain.state, domain.ip)); + instructionConsumer.accept(new LoadProcessedDomain(domain.domain, domain.state, domain.ip)); if (domain.documents != null) { - urlsCompiler.compile(ret, domain.documents); - documentsCompiler.compile(ret, domain.documents); + urlsCompiler.compile(instructionConsumer, domain.documents); + documentsCompiler.compile(instructionConsumer, domain.documents); - feedsCompiler.compile(ret, domain.documents); - linksCompiler.compile(ret, domain.domain, domain.documents); + feedsCompiler.compile(instructionConsumer, domain.documents); + linksCompiler.compile(instructionConsumer, domain.domain, domain.documents); } if (domain.redirect != null) { - redirectCompiler.compile(ret, domain.domain, domain.redirect); + redirectCompiler.compile(instructionConsumer, domain.domain, domain.redirect); } - domainMetadataCompiler.compile(ret, domain.domain, requireNonNullElse(domain.documents, Collections.emptyList())); - - return ret; + domainMetadataCompiler.compile(instructionConsumer, domain.domain, requireNonNullElse(domain.documents, Collections.emptyList())); } } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/LinksCompiler.java 
b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/LinksCompiler.java index a578602d..e100cb86 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/LinksCompiler.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/LinksCompiler.java @@ -8,10 +8,11 @@ import nu.marginalia.model.EdgeDomain; import java.util.List; import java.util.Objects; +import java.util.function.Consumer; public class LinksCompiler { - public void compile(List ret, EdgeDomain from, List documents) { + public void compile(Consumer instructionConsumer, EdgeDomain from, List documents) { DomainLink[] links = documents.stream().map(doc -> doc.details) .filter(Objects::nonNull) @@ -21,6 +22,6 @@ public class LinksCompiler { .map(domain -> new DomainLink(from, domain)) .toArray(DomainLink[]::new); - ret.add(new LoadDomainLink(links)); + instructionConsumer.accept(new LoadDomainLink(links)); } } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/RedirectCompiler.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/RedirectCompiler.java index b14dedca..dcd0201f 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/RedirectCompiler.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/RedirectCompiler.java @@ -8,12 +8,13 @@ import nu.marginalia.converting.instruction.instructions.LoadDomainRedirect; import nu.marginalia.model.EdgeDomain; import java.util.List; +import java.util.function.Consumer; public class RedirectCompiler { - public void compile(List ret, EdgeDomain from, EdgeDomain to) { - ret.add(new LoadDomain(to)); - ret.add(new LoadDomainLink(new DomainLink(from, to))); - ret.add(new LoadDomainRedirect(new DomainLink(from, to))); + public void compile(Consumer instructionConsumer, EdgeDomain from, EdgeDomain to) { + 
instructionConsumer.accept(new LoadDomain(to)); + instructionConsumer.accept(new LoadDomainLink(new DomainLink(from, to))); + instructionConsumer.accept(new LoadDomainRedirect(new DomainLink(from, to))); } } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/UrlsCompiler.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/UrlsCompiler.java index d5184cfc..ba347058 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/UrlsCompiler.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/UrlsCompiler.java @@ -13,13 +13,14 @@ import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; +import java.util.function.Consumer; public class UrlsCompiler { private static final int MAX_INTERNAL_LINKS = 25; private final Logger logger = LoggerFactory.getLogger(getClass()); - public void compile(List ret, List documents) { + public void compile(Consumer instructionConsumer, List documents) { Set seenUrls = new HashSet<>(documents.size()*4); Set seenDomains = new HashSet<>(documents.size()); @@ -53,8 +54,8 @@ public class UrlsCompiler { } } - ret.add(new LoadDomain(seenDomains.toArray(EdgeDomain[]::new))); - ret.add(new LoadUrl(seenUrls.toArray(EdgeUrl[]::new))); + instructionConsumer.accept(new LoadDomain(seenDomains.toArray(EdgeDomain[]::new))); + instructionConsumer.accept(new LoadUrl(seenUrls.toArray(EdgeUrl[]::new))); } } diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java index 124a2a49..032f2c23 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java @@ -119,6 
+119,7 @@ public class ProcessService { } opts.put("WMSA_HOME", WMSA_HOME); opts.put("JAVA_HOME", System.getenv("JAVA_HOME")); + opts.put("JAVA_OPTS", ""); opts.put("CONVERTER_OPTS", System.getenv("CONVERTER_OPTS")); opts.put("LOADER_OPTS", System.getenv("LOADER_OPTS")); opts.put("CRAWLER_OPTS", System.getenv("CRAWLER_OPTS")); From 19c2ceec9b32ee64e8d430df5cdc860ca6653195 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 26 Jul 2023 11:50:23 +0200 Subject: [PATCH 084/157] (converter) Use Marginalia Yellow for control service --- .../control-service/src/main/resources/static/control/style.css | 1 + 1 file changed, 1 insertion(+) diff --git a/code/services-satellite/control-service/src/main/resources/static/control/style.css b/code/services-satellite/control-service/src/main/resources/static/control/style.css index 4056c91e..e3722cb4 100644 --- a/code/services-satellite/control-service/src/main/resources/static/control/style.css +++ b/code/services-satellite/control-service/src/main/resources/static/control/style.css @@ -1,4 +1,5 @@ body { + background-color: #f8f8ee; font-family: sans-serif; line-height: 1.6; From a5d980ee561865d1ea24746a95995ce9f34ac2a3 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 26 Jul 2023 15:46:22 +0200 Subject: [PATCH 085/157] (converter) Hook crawl job extractor and adjacencies calculator into control service. 
--- build.gradle | 8 ++ .../nu/marginalia/control/ControlService.java | 2 +- .../control/actor/ControlActors.java | 10 +- .../actor/task/CrawlJobExtractorActor.java | 135 ++++++++++++++++++ .../TriggerAdjacencyCalculationActor.java | 59 ++++++++ .../nu/marginalia/control/model/Actor.java | 4 +- .../control/model/ProcessHeartbeat.java | 2 + .../control/svc/ControlActorService.java | 24 ++++ .../control/svc/ProcessService.java | 20 ++- .../crawl/CrawlJobExtractorMain.java | 2 + .../build.gradle | 1 + .../WebsiteAdjacenciesCalculator.java | 31 ++-- 12 files changed, 273 insertions(+), 25 deletions(-) create mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlJobExtractorActor.java create mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/TriggerAdjacencyCalculationActor.java diff --git a/build.gradle b/build.gradle index 49bf7b98..ea0db6be 100644 --- a/build.gradle +++ b/build.gradle @@ -26,6 +26,14 @@ tasks.register('dist', Copy) { from tarTree("$buildDir/dist/loader-process.tar") into "$projectDir/run/dist/" } + copy { + from tarTree("$buildDir/dist/website-adjacencies-calculator.tar") + into "$projectDir/run/dist/" + } + copy { + from tarTree("$buildDir/dist/crawl-job-extractor-process.tar") + into "$projectDir/run/dist/" + } } } idea { diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java index eb43f9cb..cc2e74fd 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java @@ -93,12 +93,12 @@ public class ControlService extends Service { Spark.post("/public/fsms/:fsm/start", controlActorService::startFsm, redirectToProcesses); Spark.post("/public/fsms/:fsm/stop", 
controlActorService::stopFsm, redirectToProcesses); - Spark.post("/public/storage/:fid/crawl", controlActorService::triggerCrawling, redirectToProcesses); Spark.post("/public/storage/:fid/recrawl", controlActorService::triggerRecrawling, redirectToProcesses); Spark.post("/public/storage/:fid/process", controlActorService::triggerProcessing, redirectToProcesses); Spark.post("/public/storage/:fid/load", controlActorService::loadProcessedData, redirectToProcesses); + Spark.post("/public/storage/specs", controlActorService::createCrawlSpecification, redirectToStorage); Spark.post("/public/storage/:fid/delete", controlFileStorageService::flagFileForDeletionRequest, redirectToStorage); Spark.get("/public/:resource", this::serveStatic); diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java index bfa90be1..052ca2cb 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java @@ -4,13 +4,11 @@ import com.google.gson.Gson; import com.google.inject.Inject; import com.google.inject.Singleton; import lombok.SneakyThrows; -import nu.marginalia.control.actor.task.CrawlActor; -import nu.marginalia.control.actor.task.RecrawlActor; +import nu.marginalia.control.actor.task.*; import nu.marginalia.control.model.Actor; import nu.marginalia.control.actor.monitor.*; import nu.marginalia.control.actor.monitor.ConverterMonitorActor; import nu.marginalia.control.actor.monitor.LoaderMonitorActor; -import nu.marginalia.control.actor.task.ReconvertAndLoadActor; import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.mq.MessageQueueFactory; import nu.marginalia.mqsm.StateMachine; @@ -45,7 +43,9 @@ public class ControlActors { LoaderMonitorActor loaderMonitor, 
MessageQueueMonitorActor messageQueueMonitor, ProcessLivenessMonitorActor processMonitorFSM, - FileStorageMonitorActor fileStorageMonitorActor + FileStorageMonitorActor fileStorageMonitorActor, + TriggerAdjacencyCalculationActor triggerAdjacencyCalculationActor, + CrawlJobExtractorActor crawlJobExtractorActor ) { this.messageQueueFactory = messageQueueFactory; this.eventLog = baseServiceParams.eventLog; @@ -60,6 +60,8 @@ public class ControlActors { register(Actor.MESSAGE_QUEUE_MONITOR, messageQueueMonitor); register(Actor.PROCESS_LIVENESS_MONITOR, processMonitorFSM); register(Actor.FILE_STORAGE_MONITOR, fileStorageMonitorActor); + register(Actor.ADJACENCY_CALCULATION, triggerAdjacencyCalculationActor); + register(Actor.CRAWL_JOB_EXTRACTOR, crawlJobExtractorActor); } private void register(Actor process, AbstractStateGraph graph) { diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlJobExtractorActor.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlJobExtractorActor.java new file mode 100644 index 00000000..df86da38 --- /dev/null +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlJobExtractorActor.java @@ -0,0 +1,135 @@ +package nu.marginalia.control.actor.task; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.control.svc.ControlFileStorageService; +import nu.marginalia.control.svc.ProcessService; +import nu.marginalia.db.storage.FileStorageService; +import nu.marginalia.db.storage.model.FileStorage; +import nu.marginalia.db.storage.model.FileStorageBaseType; +import nu.marginalia.db.storage.model.FileStorageType; +import nu.marginalia.mqsm.StateFactory; +import nu.marginalia.mqsm.graph.AbstractStateGraph; +import nu.marginalia.mqsm.graph.GraphState; +import nu.marginalia.mqsm.graph.ResumeBehavior; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import 
java.net.URL; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.atomic.AtomicBoolean; + +@Singleton +public class CrawlJobExtractorActor extends AbstractStateGraph { + + private final Logger logger = LoggerFactory.getLogger(getClass()); + // STATES + + public static final String INITIAL = "INITIAL"; + public static final String CREATE_FROM_DB = "CREATE_FROM_DB"; + public static final String CREATE_FROM_LINK = "CREATE_FROM_LINK"; + public static final String END = "END"; + private final ProcessService processService; + private final FileStorageService fileStorageService; + private final ControlFileStorageService controlFileStorageService; + private final ExecutorService executor = Executors.newSingleThreadExecutor(); + + @Inject + public CrawlJobExtractorActor(StateFactory stateFactory, + ProcessService processService, + FileStorageService fileStorageService, + ControlFileStorageService controlFileStorageService + ) { + super(stateFactory); + this.processService = processService; + this.fileStorageService = fileStorageService; + this.controlFileStorageService = controlFileStorageService; + } + + public record CrawlJobExtractorArguments(String description) { } + public record CrawlJobExtractorArgumentsWithURL(String description, String url) { } + @GraphState(name = INITIAL, next = END) + public void initial() throws Exception { error("This state does nothing"); } + + @GraphState(name = CREATE_FROM_LINK, next = END, + resume = ResumeBehavior.ERROR, + description = """ + Download a list of URLs as provided, + and then spawn a CrawlJobExtractor process, + then wait for it to finish. 
+ """ + ) + public void createFromFromLink(CrawlJobExtractorArgumentsWithURL arg) throws Exception { + if (arg == null) { + error("This actor requires a CrawlJobExtractorArgumentsWithURL argument"); + } + + var base = fileStorageService.getStorageBase(FileStorageBaseType.SLOW); + var storage = fileStorageService.allocateTemporaryStorage(base, FileStorageType.CRAWL_SPEC, "crawl-spec", arg.description()); + + Path urlsTxt = storage.asPath().resolve("urls.txt"); + + try (var os = Files.newOutputStream(urlsTxt, StandardOpenOption.CREATE_NEW); + var is = new URL(arg.url()).openStream()) + { + is.transferTo(os); + } + catch (Exception ex) { + controlFileStorageService.flagFileForDeletion(storage.id()); + error("Error downloading " + arg.url()); + } + + final Path path = storage.asPath(); + + run(storage, path.resolve("crawler.spec").toString(), + "-f", urlsTxt.toString()); + } + + + @GraphState(name = CREATE_FROM_DB, next = END, + resume = ResumeBehavior.ERROR, + description = """ + Spawns a CrawlJobExtractor process that loads data from the link database, and wait for it to finish. + """ + ) + public void createFromDB(CrawlJobExtractorArguments arg) throws Exception { + if (arg == null) { + error("This actor requires a CrawlJobExtractorArguments argument"); + } + + var base = fileStorageService.getStorageBase(FileStorageBaseType.SLOW); + var storage = fileStorageService.allocateTemporaryStorage(base, FileStorageType.CRAWL_SPEC, "crawl-spec", arg.description()); + + final Path path = storage.asPath(); + + run(storage, + path.resolve("crawler.spec").toString()); + } + + private void run(FileStorage storage, String... 
args) throws Exception { + + AtomicBoolean hasError = new AtomicBoolean(false); + var future = executor.submit(() -> { + try { + processService.trigger(ProcessService.ProcessId.CRAWL_JOB_EXTRACTOR, + args); + } + catch (Exception ex) { + logger.warn("Error in creating crawl job", ex); + hasError.set(true); + } + }); + future.get(); + + if (hasError.get()) { + controlFileStorageService.flagFileForDeletion(storage.id()); + error("Error triggering adjacency calculation"); + } + + } + +} diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/TriggerAdjacencyCalculationActor.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/TriggerAdjacencyCalculationActor.java new file mode 100644 index 00000000..8861cc07 --- /dev/null +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/TriggerAdjacencyCalculationActor.java @@ -0,0 +1,59 @@ +package nu.marginalia.control.actor.task; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.control.svc.ProcessService; +import nu.marginalia.mqsm.StateFactory; +import nu.marginalia.mqsm.graph.AbstractStateGraph; +import nu.marginalia.mqsm.graph.GraphState; +import nu.marginalia.mqsm.graph.ResumeBehavior; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.atomic.AtomicBoolean; + +@Singleton +public class TriggerAdjacencyCalculationActor extends AbstractStateGraph { + + private final Logger logger = LoggerFactory.getLogger(getClass()); + // STATES + + private static final String INITIAL = "INITIAL"; + private static final String END = "END"; + private final ProcessService processService; + private final ExecutorService executor = Executors.newSingleThreadExecutor(); + + @Inject + public TriggerAdjacencyCalculationActor(StateFactory stateFactory, + 
ProcessService processService) { + super(stateFactory); + this.processService = processService; + } + + @GraphState(name = INITIAL, next = END, + resume = ResumeBehavior.ERROR, + description = """ + Spawns a WebsitesAdjacenciesCalculator process and waits for it to finish. + """ + ) + public void init() throws Exception { + AtomicBoolean hasError = new AtomicBoolean(false); + var future = executor.submit(() -> { + try { + processService.trigger(ProcessService.ProcessId.ADJACENCIES_CALCULATOR, "load"); + } + catch (Exception ex) { + logger.warn("Error triggering adjacency calculation", ex); + hasError.set(true); + } + }); + future.get(); + + if (hasError.get()) { + error("Error triggering adjacency calculation"); + } + } + +} diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/Actor.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/Actor.java index 83d0b810..755d67a1 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/Actor.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/Actor.java @@ -9,7 +9,9 @@ public enum Actor { CRAWLER_MONITOR, MESSAGE_QUEUE_MONITOR, PROCESS_LIVENESS_MONITOR, - FILE_STORAGE_MONITOR + FILE_STORAGE_MONITOR, + ADJACENCY_CALCULATION, + CRAWL_JOB_EXTRACTOR ; diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java index 47640dde..9b0b8b0a 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java @@ -42,6 +42,8 @@ public record ProcessHeartbeat( case "converter" -> ProcessService.ProcessId.CONVERTER; case "crawler" -> ProcessService.ProcessId.CRAWLER; 
case "loader" -> ProcessService.ProcessId.LOADER; + case "website-adjacencies-calculator" -> ProcessService.ProcessId.ADJACENCIES_CALCULATOR; + case "crawl-job-extractor" -> ProcessService.ProcessId.CRAWL_JOB_EXTRACTOR; default -> throw new RuntimeException("Unknown process base: " + processBase); }; } diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlActorService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlActorService.java index c7bab07f..25461e58 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlActorService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlActorService.java @@ -3,6 +3,7 @@ package nu.marginalia.control.svc; import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.control.actor.ControlActors; +import nu.marginalia.control.actor.task.CrawlJobExtractorActor; import nu.marginalia.control.actor.task.ReconvertAndLoadActor; import nu.marginalia.control.actor.task.RecrawlActor; import nu.marginalia.control.model.Actor; @@ -94,4 +95,27 @@ public class ControlActorService { }).toList(); } + public Object createCrawlSpecification(Request request, Response response) throws Exception { + final String description = request.queryParams("description"); + final String url = request.queryParams("url"); + final String source = request.queryParams("source"); + + if ("db".equals(source)) { + controlActors.startFrom(Actor.CRAWL_JOB_EXTRACTOR, + CrawlJobExtractorActor.CREATE_FROM_DB, + new CrawlJobExtractorActor.CrawlJobExtractorArguments(description) + ); + } + else if ("download".equals(source)) { + controlActors.startFrom(Actor.CRAWL_JOB_EXTRACTOR, + CrawlJobExtractorActor.CREATE_FROM_LINK, + new CrawlJobExtractorActor.CrawlJobExtractorArgumentsWithURL(description, url) + ); + } + else { + throw new IllegalArgumentException("Unknown 
source: " + source); + } + + return ""; + } } \ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java index 032f2c23..0281ed43 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java @@ -7,7 +7,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.slf4j.Marker; import org.slf4j.MarkerFactory; -import spark.utils.IOUtils; import javax.inject.Inject; import javax.inject.Singleton; @@ -33,7 +32,11 @@ public class ProcessService { public enum ProcessId { CRAWLER("crawler-process/bin/crawler-process"), CONVERTER("converter-process/bin/converter-process"), - LOADER("loader-process/bin/loader-process"); + LOADER("loader-process/bin/loader-process"), + ADJACENCIES_CALCULATOR("website-adjacencies-calculator/bin/website-adjacencies-calculator"), + CRAWL_JOB_EXTRACTOR("crawl-job-extractor-process/bin/crawl-job-extractor-process"), + + ; public final String path; ProcessId(String path) { @@ -49,10 +52,17 @@ public class ProcessService { } public boolean trigger(ProcessId processId) throws Exception { + return trigger(processId, new String[0]); + } + + public boolean trigger(ProcessId processId, String... 
parameters) throws Exception { String processPath = processPath(processId); - String[] args = new String[] { - processPath - }; + String[] args = new String[parameters.length + 1]; + + args[0] = processPath; + for (int i = 0; i < parameters.length; i++) + args[i+1] = parameters[i]; + String[] env = env(); Process process; diff --git a/code/tools/crawl-job-extractor/src/main/java/nu/marginalia/crawl/CrawlJobExtractorMain.java b/code/tools/crawl-job-extractor/src/main/java/nu/marginalia/crawl/CrawlJobExtractorMain.java index e898293b..9693e2ae 100644 --- a/code/tools/crawl-job-extractor/src/main/java/nu/marginalia/crawl/CrawlJobExtractorMain.java +++ b/code/tools/crawl-job-extractor/src/main/java/nu/marginalia/crawl/CrawlJobExtractorMain.java @@ -28,6 +28,8 @@ public class CrawlJobExtractorMain { return; } + // TODO (2023-06-26) figure out whether this needs a ProcessHeartbeat + String[] targetDomains = getTargetDomains(Arrays.copyOfRange(args, 1, args.length)); try (CrawlJobSpecWriter out = new CrawlJobSpecWriter(outFile)) diff --git a/code/tools/website-adjacencies-calculator/build.gradle b/code/tools/website-adjacencies-calculator/build.gradle index 99fca87e..90b20e73 100644 --- a/code/tools/website-adjacencies-calculator/build.gradle +++ b/code/tools/website-adjacencies-calculator/build.gradle @@ -19,6 +19,7 @@ java { dependencies { implementation project(':code:common:model') implementation project(':code:common:db') + implementation project(':code:common:process') implementation project(':code:common:service') implementation libs.lombok diff --git a/code/tools/website-adjacencies-calculator/src/main/java/nu/marginalia/adjacencies/WebsiteAdjacenciesCalculator.java b/code/tools/website-adjacencies-calculator/src/main/java/nu/marginalia/adjacencies/WebsiteAdjacenciesCalculator.java index f6a4022f..12348543 100644 --- a/code/tools/website-adjacencies-calculator/src/main/java/nu/marginalia/adjacencies/WebsiteAdjacenciesCalculator.java +++ 
b/code/tools/website-adjacencies-calculator/src/main/java/nu/marginalia/adjacencies/WebsiteAdjacenciesCalculator.java @@ -2,9 +2,11 @@ package nu.marginalia.adjacencies; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; +import nu.marginalia.ProcessConfiguration; import nu.marginalia.db.DbDomainQueries; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.id.EdgeId; +import nu.marginalia.process.control.ProcessHeartbeat; import nu.marginalia.service.module.DatabaseModule; import java.sql.SQLException; @@ -58,30 +60,22 @@ public class WebsiteAdjacenciesCalculator { } @SneakyThrows - public void loadAll() { + public void loadAll(ProcessHeartbeat processHeartbeat) { AdjacenciesLoader loader = new AdjacenciesLoader(dataSource); - var executor = Executors.newFixedThreadPool(16); - var ids = adjacenciesData.getIdsList(); - - ProgressPrinter progressPrinter = new ProgressPrinter(ids.size()); - progressPrinter.start(); - + int total = adjacenciesData.getIdsList().size(); + AtomicInteger progress = new AtomicInteger(0); IntStream.of(adjacenciesData.getIdsList().toArray()).parallel() .filter(domainAliases::isNotAliased) .forEach(id -> { findAdjacent(id, loader::load); - progressPrinter.advance(); + processHeartbeat.setProgress(progress.incrementAndGet() / (double) total); }); - progressPrinter.stop(); - executor.shutdown(); System.out.println("Waiting for wrap-up"); loader.stop(); - - } private static class ProgressPrinter { @@ -192,10 +186,19 @@ public class WebsiteAdjacenciesCalculator { public static void main(String[] args) throws SQLException { DatabaseModule dm = new DatabaseModule(); - var main = new WebsiteAdjacenciesCalculator(dm.provideConnection()); + var dataSource = dm.provideConnection(); + + var main = new WebsiteAdjacenciesCalculator(dataSource); if (args.length == 1 && "load".equals(args[0])) { - main.loadAll(); + var processHeartbeat = new ProcessHeartbeat( + new ProcessConfiguration("website-adjacencies-calculator", 0, 
UUID.randomUUID()), + dataSource + ); + + processHeartbeat.start(); + main.loadAll(processHeartbeat); + processHeartbeat.shutDown(); return; } From 66bb12e55a4be2adbfbdd74d0bbe331bd5c8cb41 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 26 Jul 2023 21:59:35 +0200 Subject: [PATCH 086/157] (converter) File listing and download for file storage --- .../nu/marginalia/control/ControlService.java | 1 + .../control/model/FileStorageFileModel.java | 15 ++++ .../model/FileStorageWithRelatedEntries.java | 5 +- .../svc/ControlFileStorageService.java | 78 +++++++++++++++++-- .../templates/control/storage-details.hdb | 23 +++++- 5 files changed, 115 insertions(+), 7 deletions(-) create mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageFileModel.java diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java index cc2e74fd..3411058c 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java @@ -85,6 +85,7 @@ public class ControlService extends Service { Spark.get("/public/storage/crawls", this::storageModelCrawls, storageCrawlsRenderer::render); Spark.get("/public/storage/processed", this::storageModelProcessed, storageProcessedRenderer::render); Spark.get("/public/storage/:id", this::storageDetailsModel, storageDetailsRenderer::render); + Spark.get("/public/storage/:id/file", controlFileStorageService::downloadFileFromStorage); final HtmlRedirect redirectToServices = new HtmlRedirect("/services"); diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageFileModel.java 
b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageFileModel.java new file mode 100644 index 00000000..c8b513ee --- /dev/null +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageFileModel.java @@ -0,0 +1,15 @@ +package nu.marginalia.control.model; + +import nu.marginalia.db.storage.model.FileStorage; + +import java.util.List; + +public record FileStorageFileModel(String filename, + String type, + String size + ) { + + public boolean isDownloadable() { + return type.equals("file"); + } +} diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithRelatedEntries.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithRelatedEntries.java index 28afba5d..608ccdca 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithRelatedEntries.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithRelatedEntries.java @@ -5,6 +5,9 @@ import nu.marginalia.db.storage.model.FileStorageType; import java.util.List; -public record FileStorageWithRelatedEntries(FileStorageWithActions self, List related) { +public record FileStorageWithRelatedEntries(FileStorageWithActions self, + List related, + List files + ) { } diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlFileStorageService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlFileStorageService.java index db122a7c..06bf240d 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlFileStorageService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlFileStorageService.java @@ -7,19 +7,23 @@ import lombok.SneakyThrows; import nu.marginalia.control.model.*; import 
nu.marginalia.db.storage.FileStorageService; import nu.marginalia.db.storage.model.*; +import nu.marginalia.mqsm.graph.AbstractStateGraph; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import spark.Request; import spark.Response; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; import java.sql.SQLException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; @Singleton public class ControlFileStorageService { private final HikariDataSource dataSource; private final FileStorageService fileStorageService; + private final Logger logger = LoggerFactory.getLogger(getClass()); @Inject public ControlFileStorageService(HikariDataSource dataSource, FileStorageService fileStorageService) { @@ -107,9 +111,47 @@ public class ControlFileStorageService { public FileStorageWithRelatedEntries getFileStorageWithRelatedEntries(FileStorageId id) throws SQLException { var storage = fileStorageService.getStorage(id); var related = getRelatedEntries(id); - return new FileStorageWithRelatedEntries(new FileStorageWithActions(storage), related); + + List files = new ArrayList<>(); + + try (var filesStream = Files.list(storage.asPath())) { + filesStream + .map(this::createFileModel) + .sorted(Comparator + .comparing(FileStorageFileModel::type) + .thenComparing(FileStorageFileModel::filename) + ) + .forEach(files::add); + } + catch (IOException ex) { + logger.error("Failed to list files in storage", ex); + } + + return new FileStorageWithRelatedEntries(new FileStorageWithActions(storage), related, files); } + private FileStorageFileModel createFileModel(Path p) { + try { + String type = Files.isRegularFile(p) ? 
"file" : "directory"; + String size; + if (Files.isDirectory(p)) { + size = "-"; + } + else { + long sizeBytes = Files.size(p); + + if (sizeBytes < 1024) size = sizeBytes + " B"; + else if (sizeBytes < 1024 * 1024) size = sizeBytes / 1024 + " KB"; + else if (sizeBytes < 1024 * 1024 * 1024) size = sizeBytes / (1024 * 1024) + " MB"; + else size = sizeBytes / (1024 * 1024 * 1024) + " GB"; + } + + return new FileStorageFileModel(p.toFile().getName(), type, size); + } + catch (IOException ex) { + throw new RuntimeException(ex); + } + } private List getRelatedEntries(FileStorageId id) { List ret = new ArrayList<>(); try (var conn = dataSource.getConnection(); @@ -131,4 +173,30 @@ public class ControlFileStorageService { } return ret; } + + public Object downloadFileFromStorage(Request request, Response response) throws SQLException { + var fileStorageId = FileStorageId.parse(request.params("id")); + String filename = request.queryParams("name"); + + Path root = fileStorageService.getStorage(fileStorageId).asPath(); + Path filePath = root.resolve(filename).normalize(); + + if (!filePath.startsWith(root)) { + response.status(403); + return ""; + } + + if (filePath.toString().endsWith(".txt") || filePath.toString().endsWith(".log")) response.type("text/plain"); + else response.type("application/octet-stream"); + + try (var is = Files.newInputStream(filePath)) { + is.transferTo(response.raw().getOutputStream()); + } + catch (IOException ex) { + logger.error("Failed to download file", ex); + throw new RuntimeException(ex); + } + + return ""; + } } diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/storage-details.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/storage-details.hdb index 9038d510..ec7d4ef0 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/storage-details.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/storage-details.hdb @@ -24,7
+24,28 @@ {{/with}} -

    Actions

    + + {{#if storage.files}} +

    Contents

    + + + + + + + {{#each storage.files}} + + + {{else}} {{filename}} {{/if}} + + + + {{/each}} +
    File NameTypeSize
    + {{#if downloadable}}{{filename}}{{type}}{{size}}
    + {{/if}} + +

    Actions

    {{#with storage.self}} {{#if isCrawlable}}
    From 92cac528134982f6c21bf8c6e6ce75dfff851892 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 28 Jul 2023 12:03:51 +0200 Subject: [PATCH 087/157] (mq) Add indexes to MESSAGE_QUEUE --- .../db/src/main/resources/sql/current/12-message-queue.sql | 3 +++ .../db/src/main/resources/sql/migrations/04-message-queue.sql | 3 +++ 2 files changed, 6 insertions(+) diff --git a/code/common/db/src/main/resources/sql/current/12-message-queue.sql b/code/common/db/src/main/resources/sql/current/12-message-queue.sql index 25bdc636..6e628e80 100644 --- a/code/common/db/src/main/resources/sql/current/12-message-queue.sql +++ b/code/common/db/src/main/resources/sql/current/12-message-queue.sql @@ -16,3 +16,6 @@ CREATE TABLE IF NOT EXISTS MESSAGE_QUEUE ( UPDATED_TIME TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6) COMMENT 'Time of last update', TTL INT COMMENT 'Time to live in seconds' ); + +CREATE INDEX MESSAGE_QUEUE_STATE_IDX ON MESSAGE_QUEUE(STATE); +CREATE INDEX MESSAGE_QUEUE_OI_TICK_IDX ON MESSAGE_QUEUE(OWNER_INSTANCE, OWNER_TICK); diff --git a/code/common/db/src/main/resources/sql/migrations/04-message-queue.sql b/code/common/db/src/main/resources/sql/migrations/04-message-queue.sql index 25bdc636..6e628e80 100644 --- a/code/common/db/src/main/resources/sql/migrations/04-message-queue.sql +++ b/code/common/db/src/main/resources/sql/migrations/04-message-queue.sql @@ -16,3 +16,6 @@ CREATE TABLE IF NOT EXISTS MESSAGE_QUEUE ( UPDATED_TIME TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6) COMMENT 'Time of last update', TTL INT COMMENT 'Time to live in seconds' ); + +CREATE INDEX MESSAGE_QUEUE_STATE_IDX ON MESSAGE_QUEUE(STATE); +CREATE INDEX MESSAGE_QUEUE_OI_TICK_IDX ON MESSAGE_QUEUE(OWNER_INSTANCE, OWNER_TICK); From 27e781761d248dfed19be4b5ccc1d239ecdbee96 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 28 Jul 2023 12:04:23 +0200 Subject: [PATCH 088/157] (mq single shot inbox) Flag messages as OK if there is no recipient --- 
.../marginalia/mq/inbox/MqSingleShotInbox.java | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSingleShotInbox.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSingleShotInbox.java index 85f7e2f5..edecf9d4 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSingleShotInbox.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSingleShotInbox.java @@ -2,6 +2,7 @@ package nu.marginalia.mq.inbox; import lombok.SneakyThrows; import nu.marginalia.mq.MqMessage; +import nu.marginalia.mq.MqMessageState; import nu.marginalia.mq.persistence.MqPersistence; import java.sql.SQLException; @@ -71,9 +72,24 @@ public class MqSingleShotInbox { return Optional.empty(); } + /** Send a response to the specified message. If the original message has no response inbox, + * the original message will be marked as OK instead. + * + * @param originalMessage The original message + * @param response The response + */ public void sendResponse(MqMessage originalMessage, MqInboxResponse response) { try { - persistence.sendResponse(originalMessage.msgId(), response.state(), response.message()); + if (!originalMessage.expectsResponse()) { + // If the original message doesn't expect a response, we can just mark it as OK, + // since the sendResponse method will fail explosively since it can't insert a response + // to a non-existent inbox. 
+ + persistence.updateMessageState(originalMessage.msgId(), MqMessageState.OK); + } + else { + persistence.sendResponse(originalMessage.msgId(), response.state(), response.message()); + } } catch (SQLException e) { throw new RuntimeException(e); } From 77d5e39fe02dd5348999bc27c3713b3daf6304d9 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 28 Jul 2023 18:09:48 +0200 Subject: [PATCH 089/157] Make processed data Serializable --- .../src/main/java/nu/marginalia/model/EdgeDomain.java | 4 +++- .../model/src/main/java/nu/marginalia/model/EdgeUrl.java | 3 ++- .../java/nu/marginalia/model/idx/DocumentMetadata.java | 5 ++++- .../nu/marginalia/keyword/model/DocumentKeywords.java | 8 +++++--- .../nu/marginalia/converting/instruction/Instruction.java | 4 +++- .../converting/instruction/instructions/DomainLink.java | 4 +++- 6 files changed, 20 insertions(+), 8 deletions(-) diff --git a/code/common/model/src/main/java/nu/marginalia/model/EdgeDomain.java b/code/common/model/src/main/java/nu/marginalia/model/EdgeDomain.java index 96a44718..50d84e11 100644 --- a/code/common/model/src/main/java/nu/marginalia/model/EdgeDomain.java +++ b/code/common/model/src/main/java/nu/marginalia/model/EdgeDomain.java @@ -3,13 +3,15 @@ package nu.marginalia.model; import lombok.*; import javax.annotation.Nonnull; +import java.io.Serializable; import java.util.Objects; import java.util.function.Predicate; import java.util.regex.Pattern; @AllArgsConstructor @Getter @Setter @Builder -public class EdgeDomain { +public class EdgeDomain implements Serializable { + @Nonnull public final String subDomain; @Nonnull diff --git a/code/common/model/src/main/java/nu/marginalia/model/EdgeUrl.java b/code/common/model/src/main/java/nu/marginalia/model/EdgeUrl.java index 90181263..9def0480 100644 --- a/code/common/model/src/main/java/nu/marginalia/model/EdgeUrl.java +++ b/code/common/model/src/main/java/nu/marginalia/model/EdgeUrl.java @@ -6,6 +6,7 @@ import lombok.Setter; import 
nu.marginalia.util.QueryParams; import javax.annotation.Nullable; +import java.io.Serializable; import java.net.MalformedURLException; import java.net.URI; import java.net.URISyntaxException; @@ -15,7 +16,7 @@ import java.util.Optional; import java.util.regex.Pattern; @Getter @Setter @Builder -public class EdgeUrl { +public class EdgeUrl implements Serializable { public final String proto; public final EdgeDomain domain; public final Integer port; diff --git a/code/common/model/src/main/java/nu/marginalia/model/idx/DocumentMetadata.java b/code/common/model/src/main/java/nu/marginalia/model/idx/DocumentMetadata.java index fc49b300..0b1fe480 100644 --- a/code/common/model/src/main/java/nu/marginalia/model/idx/DocumentMetadata.java +++ b/code/common/model/src/main/java/nu/marginalia/model/idx/DocumentMetadata.java @@ -2,6 +2,7 @@ package nu.marginalia.model.idx; import nu.marginalia.model.crawl.PubDate; +import java.io.Serializable; import java.util.EnumSet; import java.util.Set; @@ -15,7 +16,9 @@ public record DocumentMetadata(int avgSentLength, int year, int sets, int quality, - byte flags) { + byte flags) + implements Serializable +{ public String toString() { StringBuilder sb = new StringBuilder(getClass().getSimpleName()); diff --git a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/model/DocumentKeywords.java b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/model/DocumentKeywords.java index 55622cb8..5d611cc9 100644 --- a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/model/DocumentKeywords.java +++ b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/model/DocumentKeywords.java @@ -3,11 +3,13 @@ package nu.marginalia.keyword.model; import nu.marginalia.model.idx.WordMetadata; +import java.io.Serializable; import java.util.Arrays; -public record DocumentKeywords( - String[] keywords, - long[] metadata) { +public record DocumentKeywords(String[] 
keywords, + long[] metadata) +implements Serializable +{ @Override public String toString() { diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/Instruction.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/Instruction.java index 4964c9b1..b36ef217 100644 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/Instruction.java +++ b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/Instruction.java @@ -1,6 +1,8 @@ package nu.marginalia.converting.instruction; -public interface Instruction { +import java.io.Serializable; + +public interface Instruction extends Serializable { void apply(Interpreter interpreter); boolean isNoOp(); diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/DomainLink.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/DomainLink.java index c33f9892..22230a37 100644 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/DomainLink.java +++ b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/DomainLink.java @@ -2,5 +2,7 @@ package nu.marginalia.converting.instruction.instructions; import nu.marginalia.model.EdgeDomain; -public record DomainLink(EdgeDomain from, EdgeDomain to) { +import java.io.Serializable; + +public record DomainLink(EdgeDomain from, EdgeDomain to) implements Serializable { } From 9288d311d4723cf13acc9d8744d6ecd6e8fd4d39 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 28 Jul 2023 18:10:34 +0200 Subject: [PATCH 090/157] Add buffering to index journal writer --- .../journal/writer/IndexJournalWriterImpl.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/writer/IndexJournalWriterImpl.java b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/writer/IndexJournalWriterImpl.java index 4406350f..2fee64fe 100644 --- a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/writer/IndexJournalWriterImpl.java +++ b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/writer/IndexJournalWriterImpl.java @@ -32,7 +32,7 @@ public class IndexJournalWriterImpl implements IndexJournalWriter{ writeHeaderPlaceholder(fileStream); - outputStream = new DataOutputStream(new ZstdOutputStream(fileStream)); + outputStream = new DataOutputStream(new ZstdOutputStream(new BufferedOutputStream(fileStream))); } private static void writeHeaderPlaceholder(OutputStream fileStream) throws IOException { From f11103d31de7823c095223bdae0e538b1db515a4 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 28 Jul 2023 18:14:43 +0200 Subject: [PATCH 091/157] (WIP) Make it possible to sideload encyclopedia data. This is mostly a pilot track for sideloading other large websites. Also change converter to produce a more compact output (java serialization instead of json).
--- .../mqapi/converting/ConvertAction.java | 6 + .../mqapi/converting/ConvertRequest.java | 2 + .../processes/converting-process/build.gradle | 1 + .../marginalia/converting/ConverterMain.java | 143 +++++++--- .../converting/InstructionWriterFactory.java | 14 +- .../compiler/DocumentsCompiler.java | 4 +- .../compiler/DomainMetadataCompiler.java | 4 + .../compiler/InstructionsCompiler.java | 41 ++- .../converting/compiler/UrlsCompiler.java | 21 +- .../converting/processor/DomainProcessor.java | 3 +- .../EncyclopediaMarginaliaNuSideloader.java | 247 ++++++++++++++++++ .../converting/sideload/SideloadSource.java | 15 ++ .../sideload/SideloadSourceFactory.java | 23 ++ .../loading/ConvertedDomainReader.java | 65 +++-- .../nu/marginalia/loading/LoaderMain.java | 38 ++- .../nu/marginalia/loading/loader/Loader.java | 40 +-- .../loader/SqlLoadProcessedDocument.java | 2 +- .../loader/SqlLoadProcessedDomain.java | 9 +- .../loading/loader/SqlLoadUrls.java | 66 +++-- .../loader/SqlLoadProcessedDomainTest.java | 9 +- .../actor/task/ReconvertAndLoadActor.java | 6 +- settings.gradle | 2 + 22 files changed, 618 insertions(+), 143 deletions(-) create mode 100644 code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/converting/ConvertAction.java create mode 100644 code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/EncyclopediaMarginaliaNuSideloader.java create mode 100644 code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSource.java create mode 100644 code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java diff --git a/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/converting/ConvertAction.java b/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/converting/ConvertAction.java new file mode 100644 index 00000000..0c3f575a --- /dev/null +++ b/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/converting/ConvertAction.java @@ -0,0 +1,6 @@ 
+package nu.marginalia.mqapi.converting; + +public enum ConvertAction { + ConvertCrawlData, + SideloadEncyclopedia +} diff --git a/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/converting/ConvertRequest.java b/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/converting/ConvertRequest.java index 64091146..abacf8af 100644 --- a/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/converting/ConvertRequest.java +++ b/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/converting/ConvertRequest.java @@ -5,6 +5,8 @@ import nu.marginalia.db.storage.model.FileStorageId; @AllArgsConstructor public class ConvertRequest { + public final ConvertAction action; + public final String inputSource; public final FileStorageId crawlStorage; public final FileStorageId processedDataStorage; } diff --git a/code/processes/converting-process/build.gradle b/code/processes/converting-process/build.gradle index b85a829b..a14ee596 100644 --- a/code/processes/converting-process/build.gradle +++ b/code/processes/converting-process/build.gradle @@ -79,6 +79,7 @@ dependencies { implementation libs.crawlercommons implementation libs.commons.lang3 + implementation libs.sqlite testImplementation libs.bundles.slf4j.test testImplementation libs.bundles.junit diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java index 9c8373e1..c7584a6c 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java @@ -5,24 +5,26 @@ import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; import nu.marginalia.converting.model.ProcessedDomain; +import nu.marginalia.converting.sideload.SideloadSource; +import nu.marginalia.converting.sideload.SideloadSourceFactory; 
import nu.marginalia.db.storage.FileStorageService; import nu.marginalia.mq.MessageQueueFactory; import nu.marginalia.mq.MqMessage; import nu.marginalia.mq.inbox.MqInboxResponse; import nu.marginalia.mq.inbox.MqSingleShotInbox; +import nu.marginalia.mqapi.converting.ConvertAction; import nu.marginalia.process.control.ProcessHeartbeat; import nu.marginalia.process.log.WorkLog; import nu.marginalia.service.module.DatabaseModule; import plan.CrawlPlan; import nu.marginalia.converting.compiler.InstructionsCompiler; -import nu.marginalia.converting.instruction.Instruction; import nu.marginalia.converting.processor.DomainProcessor; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; +import java.nio.file.Path; import java.sql.SQLException; -import java.util.List; import java.util.Optional; import java.util.UUID; import java.util.concurrent.LinkedBlockingQueue; @@ -34,7 +36,6 @@ import java.util.concurrent.atomic.AtomicInteger; import static nu.marginalia.mqapi.ProcessInboxNames.CONVERTER_INBOX; public class ConverterMain { - private static final Logger logger = LoggerFactory.getLogger(ConverterMain.class); private final DomainProcessor processor; private final InstructionsCompiler compiler; @@ -42,10 +43,9 @@ public class ConverterMain { private final ProcessHeartbeat heartbeat; private final MessageQueueFactory messageQueueFactory; private final FileStorageService fileStorageService; + private final SideloadSourceFactory sideloadSourceFactory; public static void main(String... 
args) throws Exception { - - Injector injector = Guice.createInjector( new ConverterModule(), new DatabaseModule() @@ -55,15 +55,9 @@ public class ConverterMain { logger.info("Starting pipe"); - var request = converter.fetchInstructions(); - try { - converter.convert(request); - request.ok(); - } - catch (Exception ex) { - logger.error("Conversion failed", ex); - request.err(); - } + converter + .fetchInstructions() + .execute(converter); logger.info("Finished"); @@ -77,21 +71,42 @@ public class ConverterMain { Gson gson, ProcessHeartbeat heartbeat, MessageQueueFactory messageQueueFactory, - FileStorageService fileStorageService - ) { + FileStorageService fileStorageService, + SideloadSourceFactory sideloadSourceFactory + ) + { this.processor = processor; this.compiler = compiler; this.gson = gson; this.heartbeat = heartbeat; this.messageQueueFactory = messageQueueFactory; this.fileStorageService = fileStorageService; + this.sideloadSourceFactory = sideloadSourceFactory; heartbeat.start(); } - public void convert(ConvertRequest request) throws Exception { + public void convert(SideloadSource sideloadSource, Path writeDir) throws Exception { + int maxPoolSize = 16; - var plan = request.getPlan(); + try (WorkLog workLog = new WorkLog(writeDir.resolve("processor.log")); + ConversionLog conversionLog = new ConversionLog(writeDir)) { + var instructionWriter = new InstructionWriterFactory(conversionLog, writeDir, gson); + + final String where; + final int size; + + try (var writer = instructionWriter.createInstructionsForDomainWriter(sideloadSource.getId())) { + compiler.compileStreaming(sideloadSource, writer::accept); + where = writer.getFileName(); + size = writer.getSize(); + } + + workLog.setJobToFinished(sideloadSource.getId(), where, size); + } + } + + public void convert(CrawlPlan plan) throws Exception { final int maxPoolSize = 16; @@ -146,29 +161,19 @@ public class ConverterMain { do { System.out.println("Waiting for pool to terminate... 
" + pool.getActiveCount() + " remaining"); } while (!pool.awaitTermination(60, TimeUnit.SECONDS)); - - request.ok(); - } - catch (Exception e) { - request.err(); - throw e; } } - private static class ConvertRequest { - private final CrawlPlan plan; + private abstract static class ConvertRequest { private final MqMessage message; private final MqSingleShotInbox inbox; - ConvertRequest(CrawlPlan plan, MqMessage message, MqSingleShotInbox inbox) { - this.plan = plan; + private ConvertRequest(MqMessage message, MqSingleShotInbox inbox) { this.message = message; this.inbox = inbox; } - public CrawlPlan getPlan() { - return plan; - } + public abstract void execute(ConverterMain converterMain) throws Exception; public void ok() { inbox.sendResponse(message, MqInboxResponse.ok()); @@ -176,9 +181,55 @@ public class ConverterMain { public void err() { inbox.sendResponse(message, MqInboxResponse.err()); } - } + private static class SideloadAction extends ConvertRequest { + + private final SideloadSource sideloadSource; + private final Path workDir; + + SideloadAction(SideloadSource sideloadSource, + Path workDir, + MqMessage message, MqSingleShotInbox inbox) { + super(message, inbox); + this.sideloadSource = sideloadSource; + this.workDir = workDir; + } + + @Override + public void execute(ConverterMain converterMain) throws Exception { + try { + converterMain.convert(sideloadSource, workDir); + ok(); + } + catch (Exception ex) { + logger.error("Error sideloading", ex); + err(); + } + } + } + + private static class ConvertCrawlDataAction extends ConvertRequest { + private final CrawlPlan plan; + + private ConvertCrawlDataAction(CrawlPlan plan, MqMessage message, MqSingleShotInbox inbox) { + super(message, inbox); + this.plan = plan; + } + + @Override + public void execute(ConverterMain converterMain) throws Exception { + try { + converterMain.convert(plan); + ok(); + } + catch (Exception ex) { + err(); + } + } + } + + private ConvertRequest fetchInstructions() throws Exception 
{ var inbox = messageQueueFactory.createSingleShotInbox(CONVERTER_INBOX, UUID.randomUUID()); @@ -188,14 +239,30 @@ public class ConverterMain { var request = gson.fromJson(msg.payload(), nu.marginalia.mqapi.converting.ConvertRequest.class); - var crawlData = fileStorageService.getStorage(request.crawlStorage); - var processData = fileStorageService.getStorage(request.processedDataStorage); + if (request.action == ConvertAction.ConvertCrawlData) { - var plan = new CrawlPlan(null, - new CrawlPlan.WorkDir(crawlData.path(), "crawler.log"), - new CrawlPlan.WorkDir(processData.path(), "processor.log")); + var crawlData = fileStorageService.getStorage(request.crawlStorage); + var processData = fileStorageService.getStorage(request.processedDataStorage); - return new ConvertRequest(plan, msg, inbox); + var plan = new CrawlPlan(null, + new CrawlPlan.WorkDir(crawlData.path(), "crawler.log"), + new CrawlPlan.WorkDir(processData.path(), "processor.log")); + + return new ConvertCrawlDataAction(plan, msg, inbox); + } + + if (request.action == ConvertAction.SideloadEncyclopedia) { + var processData = fileStorageService.getStorage(request.processedDataStorage); + var filePath = Path.of(request.inputSource); + + return new SideloadAction(sideloadSourceFactory.sideloadEncyclopediaMarginaliaNu(filePath), + processData.asPath(), + msg, inbox); + } + + else { + throw new RuntimeException("Unknown action: " + request.action); + } } private Optional getMessage(MqSingleShotInbox inbox, String expectedFunction) throws SQLException, InterruptedException { diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/InstructionWriterFactory.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/InstructionWriterFactory.java index e6009d0e..fee4fc19 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/InstructionWriterFactory.java +++ 
b/code/processes/converting-process/src/main/java/nu/marginalia/converting/InstructionWriterFactory.java @@ -42,7 +42,7 @@ public class InstructionWriterFactory { } public class InstructionWriter implements AutoCloseable { - private final OutputStreamWriter outputStream; + private final ObjectOutputStream outputStream; private final String where; private final SummarizingInterpreter summary = new SummarizingInterpreter(); @@ -52,7 +52,7 @@ public class InstructionWriterFactory { InstructionWriter(Path filename) throws IOException { where = filename.getFileName().toString(); Files.deleteIfExists(filename); - outputStream = new OutputStreamWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(filename.toFile())))); + outputStream = new ObjectOutputStream(new ZstdOutputStream(new FileOutputStream(filename.toFile()))); } public void accept(Instruction instruction) { @@ -64,10 +64,12 @@ public class InstructionWriterFactory { size++; try { - outputStream.append(instruction.tag().name()); - outputStream.append(' '); - gson.toJson(instruction, outputStream); - outputStream.append('\n'); + outputStream.writeObject(instruction); + + // Reset the stream to avoid keeping references to the objects + // (as this will cause the memory usage to grow indefinitely when + // writing huge amounts of data) + outputStream.reset(); } catch (IOException ex) { logger.warn("IO exception writing instruction", ex); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java index 881a1a33..9bc3f6b3 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java @@ -23,7 +23,7 @@ public class DocumentsCompiler { } - private void 
compileDocumentDetails(Consumer instructionConsumer, ProcessedDocument doc) { + public void compileDocumentDetails(Consumer instructionConsumer, ProcessedDocument doc) { var details = doc.details; if (details != null) { @@ -31,7 +31,7 @@ public class DocumentsCompiler { } } - private void compileWords(Consumer instructionConsumer, ProcessedDocument doc) { + public void compileWords(Consumer instructionConsumer, ProcessedDocument doc) { var words = doc.words; if (words != null) { diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DomainMetadataCompiler.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DomainMetadataCompiler.java index 74ae5816..3909edb1 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DomainMetadataCompiler.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DomainMetadataCompiler.java @@ -40,4 +40,8 @@ public class DomainMetadataCompiler { instructionConsumer.accept(new LoadDomainMetadata(domain, knownUrls.size(), goodUrls, visitedUrls)); } + public void compileFake(Consumer instructionConsumer, EdgeDomain domain, int countAll, int countGood) { + instructionConsumer.accept(new LoadDomainMetadata(domain, countAll, countGood, countAll)); + } + } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/InstructionsCompiler.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/InstructionsCompiler.java index 9b32ed8d..87f28e3c 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/InstructionsCompiler.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/InstructionsCompiler.java @@ -3,11 +3,15 @@ package nu.marginalia.converting.compiler; import com.google.inject.Inject; import nu.marginalia.converting.instruction.Instruction; import 
nu.marginalia.converting.instruction.instructions.LoadProcessedDomain; +import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.model.ProcessedDomain; +import nu.marginalia.converting.sideload.SideloadSource; +import nu.marginalia.model.EdgeUrl; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; -import java.util.ArrayList; import java.util.Collections; -import java.util.List; +import java.util.Iterator; import java.util.function.Consumer; import static java.util.Objects.requireNonNullElse; @@ -20,6 +24,8 @@ public class InstructionsCompiler { private final LinksCompiler linksCompiler; private final RedirectCompiler redirectCompiler; + private final Logger logger = LoggerFactory.getLogger(InstructionsCompiler.class); + @Inject public InstructionsCompiler(UrlsCompiler urlsCompiler, DocumentsCompiler documentsCompiler, @@ -53,4 +59,35 @@ public class InstructionsCompiler { domainMetadataCompiler.compile(instructionConsumer, domain.domain, requireNonNullElse(domain.documents, Collections.emptyList())); } + + public void compileStreaming(SideloadSource sideloadSource, + Consumer instructionConsumer) { + ProcessedDomain domain = sideloadSource.getDomain(); + Iterator urlsIterator = sideloadSource.getUrlsIterator(); + Iterator documentsIterator = sideloadSource.getDocumentsStream(); + + // Guaranteed to always be first + instructionConsumer.accept(new LoadProcessedDomain(domain.domain, domain.state, domain.ip)); + + int countAll = 0; + int countGood = 0; + + logger.info("Writing domains"); + urlsCompiler.compileJustDomain(instructionConsumer, domain.domain); + logger.info("Writing urls"); + urlsCompiler.compileJustUrls(instructionConsumer, urlsIterator); + + logger.info("Writing docs"); + + while (documentsIterator.hasNext()) { + var doc = documentsIterator.next(); + countAll++; + if (doc.isOk()) countGood++; + + documentsCompiler.compileDocumentDetails(instructionConsumer, doc); + 
documentsCompiler.compileWords(instructionConsumer, doc); + } + + domainMetadataCompiler.compileFake(instructionConsumer, domain.domain, countAll, countGood); + } } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/UrlsCompiler.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/UrlsCompiler.java index ba347058..34e243b3 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/UrlsCompiler.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/UrlsCompiler.java @@ -9,10 +9,7 @@ import nu.marginalia.model.EdgeUrl; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.ArrayList; -import java.util.HashSet; -import java.util.List; -import java.util.Set; +import java.util.*; import java.util.function.Consumer; public class UrlsCompiler { @@ -58,4 +55,20 @@ public class UrlsCompiler { instructionConsumer.accept(new LoadUrl(seenUrls.toArray(EdgeUrl[]::new))); } + public void compileJustUrls(Consumer instructionConsumer, Iterator urlsIterator) { + var urls = new ArrayList(1000); + + while (urlsIterator.hasNext()) { + if (urls.size() >= 1000) { + instructionConsumer.accept(new LoadUrl(urls.toArray(EdgeUrl[]::new))); + urls.clear(); + } + + urls.add(urlsIterator.next()); + } + } + + public void compileJustDomain(Consumer instructionConsumer, EdgeDomain domain) { + instructionConsumer.accept(new LoadDomain(domain)); + } } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index 26ade3c6..e313bcdf 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -28,7 
+28,8 @@ public class DomainProcessor { @Inject public DomainProcessor(DocumentProcessor documentProcessor, SiteWords siteWords, - LshDocumentDeduplicator documentDeduplicator) { + LshDocumentDeduplicator documentDeduplicator) + { this.documentProcessor = documentProcessor; this.siteWords = siteWords; this.documentDeduplicator = documentDeduplicator; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/EncyclopediaMarginaliaNuSideloader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/EncyclopediaMarginaliaNuSideloader.java new file mode 100644 index 00000000..ef5a5874 --- /dev/null +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/EncyclopediaMarginaliaNuSideloader.java @@ -0,0 +1,247 @@ +package nu.marginalia.converting.sideload; + +import com.github.luben.zstd.ZstdInputStream; +import com.google.gson.Gson; +import lombok.SneakyThrows; +import nu.marginalia.converting.model.DisqualifiedException; +import nu.marginalia.converting.model.ProcessedDocument; +import nu.marginalia.converting.model.ProcessedDomain; +import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin; +import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.crawl.DomainIndexingState; +import nu.marginalia.model.crawl.UrlIndexingState; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.URISyntaxException; +import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; +import java.nio.file.Path; +import java.sql.*; +import java.time.LocalDateTime; +import java.util.Iterator; +import java.util.List; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.Semaphore; +import 
java.util.concurrent.atomic.AtomicBoolean; + +/** This is an experimental sideloader for encyclopedia.marginalia.nu's database; + * (which serves as a way of loading wikipedia's zim files without binding to GPL2'd code) + * + * See https://github.com/MarginaliaSearch/encyclopedia.marginalia.nu for extracting the data + */ +public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoCloseable { + + private final Connection connection; + private final Gson gson; + private final HtmlDocumentProcessorPlugin htmlProcessorPlugin; + + public EncyclopediaMarginaliaNuSideloader(Path pathToDbFile, + Gson gson, + HtmlDocumentProcessorPlugin htmlProcessorPlugin) throws SQLException { + this.gson = gson; + this.htmlProcessorPlugin = htmlProcessorPlugin; + String sqliteDbString = "jdbc:sqlite:" + pathToDbFile.toString(); + + connection = DriverManager.getConnection(sqliteDbString); + + } + + @Override + public ProcessedDomain getDomain() { + var ret = new ProcessedDomain(); + + ret.domain = new EdgeDomain("encyclopedia.marginalia.nu"); + ret.id = "encyclopedia.marginalia.nu"; + ret.ip = "127.0.0.1"; + ret.state = DomainIndexingState.ACTIVE; + + return ret; + } + + @Override + @SneakyThrows + public Iterator getUrlsIterator() { + EdgeUrl base = new EdgeUrl("https://encyclopedia.marginalia.nu/"); + + return new SqlQueryIterator<>(connection.prepareStatement(""" + SELECT url, html FROM articles + """)) + { + @Override + public EdgeUrl convert(ResultSet rs) throws Exception { + var path = URLEncoder.encode(rs.getString("url"), StandardCharsets.UTF_8); + + return base.withPathAndParam("/article/"+path, null); + } + }; + } + + + @SneakyThrows + @Override + public Iterator getDocumentsStream() { + LinkedBlockingQueue docs = new LinkedBlockingQueue<>(32); + AtomicBoolean isFinished = new AtomicBoolean(false); + + ExecutorService executorService = Executors.newFixedThreadPool(16); + Semaphore sem = new Semaphore(16); + + executorService.submit(() -> { + try { + var 
stmt = connection.prepareStatement(""" + SELECT url,title,html FROM articles + """); + stmt.setFetchSize(100); + + var rs = stmt.executeQuery(); + while (rs.next()) { + var articleParts = fromCompressedJson(rs.getBytes("html"), ArticleParts.class); + String title = rs.getString("title"); + String url = rs.getString("url"); + + sem.acquire(); + + executorService.submit(() -> { + try { + docs.add(convertDocument(articleParts.parts, title, url)); + } catch (URISyntaxException | DisqualifiedException e) { + e.printStackTrace(); + } finally { + sem.release(); + } + }); + } + + stmt.close(); + } + catch (Exception e) { + e.printStackTrace(); + } + finally { + isFinished.set(true); + } + }); + + return new Iterator<>() { + @Override + public boolean hasNext() { + return !isFinished.get() || !docs.isEmpty() || sem.availablePermits() < 16; + } + + @SneakyThrows + @Override + public ProcessedDocument next() { + return docs.take(); + } + }; + } + + private ProcessedDocument convertDocument(List parts, String title, String url) throws URISyntaxException, DisqualifiedException { + String fullUrl = "https://encyclopedia.marginalia.nu/article/"+url; + + StringBuilder fullHtml = new StringBuilder(); + fullHtml.append("").append(title).append(""); + for (String part : parts) { + fullHtml.append("

    "); + fullHtml.append(part); + fullHtml.append("

    "); + } + fullHtml.append(""); + + var crawledDoc = new CrawledDocument( + "encyclopedia.marginalia.nu", + fullUrl, + "text/html", + LocalDateTime.now().toString(), + 200, + "OK", + "NP", + "", + fullHtml.toString(), + Integer.toHexString(fullHtml.hashCode()), + fullUrl, + "", + "SIDELOAD" + ); + + var ret = new ProcessedDocument(); + try { + var details = htmlProcessorPlugin.createDetails(crawledDoc); + + ret.words = details.words(); + ret.details = details.details(); + ret.url = new EdgeUrl(fullUrl); + ret.state = UrlIndexingState.OK; + ret.stateReason = "SIDELOAD"; + } + catch (Exception e) { + ret.url = new EdgeUrl(fullUrl); + ret.state = UrlIndexingState.DISQUALIFIED; + ret.stateReason = "SIDELOAD"; + } + + return ret; + + } + + private T fromCompressedJson(byte[] stream, Class type) throws IOException { + return gson.fromJson(new InputStreamReader(new ZstdInputStream(new ByteArrayInputStream(stream))), type); + } + + private record ArticleParts(List parts) {} + + @Override + public String getId() { + return "encyclopedia.marginalia.nu"; + } + + @Override + public void close() throws Exception { + connection.close(); + } + + private abstract static class SqlQueryIterator implements Iterator { + PreparedStatement stmt; + ResultSet rs; + T next = null; + + public SqlQueryIterator(PreparedStatement stmt) throws SQLException { + this.stmt = stmt; + stmt.setFetchSize(1000); + rs = stmt.executeQuery(); + } + + @SneakyThrows + @Override + public boolean hasNext() { + if (next != null) { + return true; + } + if (!rs.next()) { + stmt.close(); + return false; + } + + next = convert(rs); + + return true; + } + + public abstract T convert(ResultSet rs) throws Exception; + + @Override + public T next () { + if (!hasNext()) + throw new IllegalStateException("No next element"); + var ret = next; + next = null; + return ret; + } + } +} diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSource.java 
b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSource.java new file mode 100644 index 00000000..d23a81ae --- /dev/null +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSource.java @@ -0,0 +1,15 @@ +package nu.marginalia.converting.sideload; + +import nu.marginalia.converting.model.ProcessedDocument; +import nu.marginalia.converting.model.ProcessedDomain; +import nu.marginalia.model.EdgeUrl; + +import java.util.Iterator; + +public interface SideloadSource { + ProcessedDomain getDomain(); + Iterator getUrlsIterator(); + Iterator getDocumentsStream(); + + String getId(); +} diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java new file mode 100644 index 00000000..83c629d3 --- /dev/null +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java @@ -0,0 +1,23 @@ +package nu.marginalia.converting.sideload; + +import com.google.gson.Gson; +import com.google.inject.Inject; +import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin; + +import java.nio.file.Path; +import java.sql.SQLException; + +public class SideloadSourceFactory { + private final Gson gson; + private final HtmlDocumentProcessorPlugin htmlProcessorPlugin; + + @Inject + public SideloadSourceFactory(Gson gson, HtmlDocumentProcessorPlugin htmlProcessorPlugin) { + this.gson = gson; + this.htmlProcessorPlugin = htmlProcessorPlugin; + } + + public SideloadSource sideloadEncyclopediaMarginaliaNu(Path pathToDbFile) throws SQLException { + return new EncyclopediaMarginaliaNuSideloader(pathToDbFile, gson, htmlProcessorPlugin); + } +} diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/ConvertedDomainReader.java 
b/code/processes/loading-process/src/main/java/nu/marginalia/loading/ConvertedDomainReader.java index 6b9dfbbd..1c06510e 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/ConvertedDomainReader.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/ConvertedDomainReader.java @@ -2,10 +2,8 @@ package nu.marginalia.loading; import com.github.luben.zstd.ZstdInputStream; import com.google.gson.Gson; -import com.google.gson.JsonParseException; +import lombok.SneakyThrows; import nu.marginalia.converting.instruction.Instruction; -import nu.marginalia.converting.instruction.InstructionTag; -import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -13,6 +11,7 @@ import javax.inject.Inject; import java.io.*; import java.nio.file.Path; import java.util.ArrayList; +import java.util.Iterator; import java.util.List; public class ConvertedDomainReader { @@ -27,30 +26,48 @@ public class ConvertedDomainReader { public List read(Path path, int cntHint) throws IOException { List ret = new ArrayList<>(cntHint); - try (var br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new BufferedInputStream(new FileInputStream(path.toFile())))))) { - String line; - for (;;) { - line = br.readLine(); - - if (line == null) { - break; - } - if (line.isBlank()) { - continue; - } - var parts= line.split(" ", 2); - var type = InstructionTag.valueOf(parts[0]).clazz; - - try { - ret.add(gson.fromJson(parts[1], type)); - } - catch (NullPointerException|JsonParseException ex) { - logger.warn("Failed to deserialize {} {}", type.getSimpleName(), StringUtils.abbreviate(parts[1], 255)); - logger.warn("Json error", ex); - } + try (var or = new ObjectInputStream(new ZstdInputStream(new FileInputStream(path.toFile())))) { + var object = or.readObject(); + if (object instanceof Instruction is) { + ret.add(is); } + } catch (ClassNotFoundException e) { + throw new RuntimeException(e); } return ret; } + 
+ public Iterator createIterator(Path path) throws IOException { + var or = new ObjectInputStream(new ZstdInputStream(new BufferedInputStream(new FileInputStream(path.toFile())))); + + return new Iterator<>() { + Instruction next; + @SneakyThrows + @Override + public boolean hasNext() { + if (next != null) + return true; + + try { + next = (Instruction) or.readObject(); + return true; + } + catch (java.io.EOFException ex) { + or.close(); + return false; + } + } + + @Override + public Instruction next() { + if (next != null || hasNext()) { + var ret = next; + next = null; + return ret; + } + throw new IllegalStateException(); + } + }; + } } diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java index fc169461..c8441330 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java @@ -23,7 +23,7 @@ import org.slf4j.LoggerFactory; import java.nio.file.Path; import java.sql.SQLException; -import java.util.List; +import java.util.Iterator; import java.util.Optional; import java.util.UUID; import java.util.concurrent.LinkedBlockingQueue; @@ -55,8 +55,13 @@ public class LoaderMain { ); var instance = injector.getInstance(LoaderMain.class); - var instructions = instance.fetchInstructions(); - instance.run(instructions); + try { + var instructions = instance.fetchInstructions(); + instance.run(instructions); + } + catch (Exception ex) { + logger.error("Error running loader", ex); + } } @Inject @@ -101,7 +106,19 @@ public class LoaderMain { for (var entry : WorkLog.iterable(logFile)) { heartbeat.setProgress(loaded++ / (double) loadTotal); - load(plan, entry.path(), entry.cnt()); + var loader = loaderFactory.create(entry.cnt()); + Path destDir = plan.getProcessedFilePath(entry.path()); + + var instructionsIter = 
instructionsReader.createIterator(destDir); + while (instructionsIter.hasNext()) { + var next = instructionsIter.next(); + try { + next.apply(loader); + } + catch (Exception ex) { + logger.error("Failed to load instruction {}", next); + } + } } running = false; @@ -110,6 +127,7 @@ public class LoaderMain { // This needs to be done in order to have a readable index journal indexLoadKeywords.close(); + logger.info("Loading finished"); } catch (Exception ex) { logger.error("Failed to load", ex); @@ -119,6 +137,7 @@ public class LoaderMain { finally { heartbeat.shutDown(); } + System.exit(0); } @@ -128,7 +147,7 @@ public class LoaderMain { Path destDir = plan.getProcessedFilePath(path); try { var loader = loaderFactory.create(cnt); - var instructions = instructionsReader.read(destDir, cnt); + var instructions = instructionsReader.createIterator(destDir); processQueue.put(new LoadJob(path, loader, instructions)); } catch (Exception e) { logger.error("Failed to load " + destDir, e); @@ -137,15 +156,16 @@ public class LoaderMain { static final TaskStats taskStats = new TaskStats(100); - private record LoadJob(String path, Loader loader, List instructionList) { + private record LoadJob(String path, Loader loader, Iterator instructionIterator) { public void run() { long startTime = System.currentTimeMillis(); - for (var i : instructionList) { + while (instructionIterator.hasNext()) { + var next = instructionIterator.next(); try { - i.apply(loader); + next.apply(loader); } catch (Exception ex) { - logger.error("Failed to load instruction {}", i); + logger.error("Failed to load instruction {}", next); } } diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java index 21216b35..96c5a21c 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java +++ 
b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java @@ -86,40 +86,24 @@ public class Loader implements Interpreter { @Override public void loadProcessedDocument(LoadProcessedDocument document) { - deferralCheck(document.url()); - processedDocumentList.add(document); + if (processedDocumentList.size() > 100) { + sqlLoadProcessedDocument.load(data, processedDocumentList); + processedDocumentList.clear(); + } } @Override public void loadProcessedDocumentWithError(LoadProcessedDocumentWithError document) { - deferralCheck(document.url()); - processedDocumentWithErrorList.add(document); - } - - private void deferralCheck(EdgeUrl url) { - if (data.getDomainId(url.domain) <= 0) - deferredDomains.add(url.domain); - - if (data.getUrlId(url) <= 0) - deferredUrls.add(url); + if (processedDocumentWithErrorList.size() > 100) { + sqlLoadProcessedDocument.loadWithError(data, processedDocumentWithErrorList); + processedDocumentWithErrorList.clear(); + } } @Override public void loadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) { - // This is a bit of a bandaid safeguard against a bug in - // in the converter, shouldn't be necessary in the future - if (!deferredDomains.isEmpty()) { - loadDomain(deferredDomains.toArray(EdgeDomain[]::new)); - deferredDomains.clear(); - } - - if (!deferredUrls.isEmpty()) { - loadUrl(deferredUrls.toArray(EdgeUrl[]::new)); - deferredUrls.clear(); - } - try { indexLoadKeywords.load(data, url, metadata, words); } catch (InterruptedException e) { @@ -140,8 +124,12 @@ public class Loader implements Interpreter { public void finish() { // Some work needs to be processed out of order for the database relations to work out - sqlLoadProcessedDocument.load(data, processedDocumentList); - sqlLoadProcessedDocument.loadWithError(data, processedDocumentWithErrorList); + if (processedDocumentList.size() > 0) { + sqlLoadProcessedDocument.load(data, processedDocumentList); + } + if 
(processedDocumentWithErrorList.size() > 0) { + sqlLoadProcessedDocument.loadWithError(data, processedDocumentWithErrorList); + } } } diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDocument.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDocument.java index 2aec488d..02c4202c 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDocument.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDocument.java @@ -73,7 +73,7 @@ public class SqlLoadProcessedDocument { int urlId = data.getUrlId(doc.url()); if (urlId <= 0) { logger.warn("Failed to resolve ID for URL {}", doc.url()); - return; + continue; } stmt.setInt(1, urlId); diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDomain.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDomain.java index c06ff84c..df598b14 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDomain.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDomain.java @@ -14,12 +14,14 @@ import java.sql.SQLException; public class SqlLoadProcessedDomain { private final HikariDataSource dataSource; private final SqlLoadDomains loadDomains; + private final SqlLoadUrls loadUrls; private static final Logger logger = LoggerFactory.getLogger(SqlLoadProcessedDomain.class); @Inject - public SqlLoadProcessedDomain(HikariDataSource dataSource, SqlLoadDomains loadDomains) { + public SqlLoadProcessedDomain(HikariDataSource dataSource, SqlLoadDomains loadDomains, SqlLoadUrls loadUrls) { this.dataSource = dataSource; this.loadDomains = loadDomains; + this.loadUrls = loadUrls; try (var conn = dataSource.getConnection()) { @@ -34,7 +36,7 @@ public class 
SqlLoadProcessedDomain { BEGIN DELETE FROM DOMAIN_METADATA WHERE ID=DID; DELETE FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=DID; - DELETE FROM EC_URL WHERE DOMAIN_ID = DID; + DELETE FROM EC_PAGE_DATA WHERE ID IN (SELECT ID FROM EC_URL WHERE DOMAIN_ID = DID); UPDATE EC_DOMAIN SET INDEX_DATE=NOW(), STATE=ST, DOMAIN_ALIAS=NULL, INDEXED=GREATEST(INDEXED,IDX), IP=IP WHERE ID=DID; DELETE FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=DID; END @@ -47,6 +49,7 @@ public class SqlLoadProcessedDomain { } public void load(LoaderData data, EdgeDomain domain, DomainIndexingState state, String ip) { + data.setTargetDomain(domain); loadDomains.load(data, domain); @@ -63,6 +66,8 @@ public class SqlLoadProcessedDomain { if (rc < 1) { logger.warn("load({},{}) -- bad rowcount {}", domain, state, rc); } + + loadUrls.loadUrlsForDomain(data, domain, 0); } catch (SQLException ex) { logger.warn("SQL error initializing domain", ex); diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadUrls.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadUrls.java index 18bd32b1..a0b0f8cb 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadUrls.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadUrls.java @@ -30,23 +30,35 @@ public class SqlLoadUrls { public void load(LoaderData data, EdgeUrl[] urls) { Set affectedDomains = new HashSet<>(); + if (urls.length == 0) + return; + + int maxOldId = 0; try (var conn = dataSource.getConnection(); var insertCall = conn.prepareStatement("INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PARAM,PATH_HASH) VALUES (?,?,?,?,?,?)"); - var queryCall = conn.prepareStatement("SELECT ID, PROTO, PATH, PARAM FROM EC_URL WHERE DOMAIN_ID=?") - ) + var queryMaxId = conn.prepareStatement("SELECT MAX(ID) FROM EC_URL")) { conn.setAutoCommit(false); + var rs = queryMaxId.executeQuery(); + if (rs.next()) { + maxOldId = 
rs.getInt(1); + } int cnt = 0; int batchOffset = 0; + for (var url : urls) { + if (data.getUrlId(url) != 0) + continue; if (url.path.length() >= 255) { - logger.debug("Skipping bad URL {}", url); + logger.info("Skipping bad URL {}", url); continue; } + var domainId = data.getDomainId(url.domain); + affectedDomains.add(url.domain); insertCall.setString(1, url.proto); - insertCall.setInt(2, data.getDomainId(url.domain)); + insertCall.setInt(2, domainId); if (url.port != null) { insertCall.setInt(3, url.port); } @@ -58,10 +70,8 @@ public class SqlLoadUrls { insertCall.setLong(6, hashPath(url.path, url.param)); insertCall.addBatch(); - if (cnt++ == 1000) { + if (++cnt == 1000) { var ret = insertCall.executeBatch(); - conn.commit(); - for (int rv = 0; rv < cnt; rv++) { if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) { logger.warn("load({}) -- bad row count {}", urls[batchOffset + rv], ret[rv]); @@ -72,10 +82,9 @@ public class SqlLoadUrls { cnt = 0; } } + if (cnt > 0) { var ret = insertCall.executeBatch(); - conn.commit(); - for (int rv = 0; rv < cnt; rv++) { if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) { logger.warn("load({}) -- bad row count {}", urls[batchOffset + rv], ret[rv]); @@ -83,24 +92,12 @@ public class SqlLoadUrls { } } + conn.commit(); conn.setAutoCommit(true); - for (var domain : affectedDomains) { - queryCall.setInt(1, data.getDomainId(domain)); - var rsp = queryCall.executeQuery(); - rsp.setFetchSize(1000); - - while (rsp.next()) { - int urlId = rsp.getInt(1); - String proto = rsp.getString(2); - String path = rsp.getString(3); - String param = rsp.getString(4); - - data.addUrl(new EdgeUrl(proto, domain, null, path, param), urlId); - } + loadUrlsForDomain(data, domain, maxOldId); } - } catch (SQLException ex) { logger.warn("SQL error inserting URLs", ex); @@ -121,4 +118,27 @@ public class SqlLoadUrls { return pathHash + murmur3_128.hashString(queryParam, StandardCharsets.UTF_8).padToLong(); } } + + /** Loads urlIDs for the domain into `data` from the 
database, starting at URL ID minId. */ + public void loadUrlsForDomain(LoaderData data, EdgeDomain domain, int minId) throws SQLException { + try (var conn = dataSource.getConnection(); + var queryCall = conn.prepareStatement("SELECT ID, PROTO, PATH, PARAM FROM EC_URL WHERE DOMAIN_ID=? AND ID > ?")) { + + queryCall.setInt(1, data.getDomainId(domain)); + queryCall.setInt(2, minId); + + var rsp = queryCall.executeQuery(); + rsp.setFetchSize(1000); + + while (rsp.next()) { + int urlId = rsp.getInt(1); + String proto = rsp.getString(2); + String path = rsp.getString(3); + String param = rsp.getString(4); + + data.addUrl(new EdgeUrl(proto, domain, null, path, param), urlId); + } + } + + } } diff --git a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDomainTest.java b/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDomainTest.java index b595c1fa..75c74752 100644 --- a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDomainTest.java +++ b/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDomainTest.java @@ -5,6 +5,7 @@ import nu.marginalia.loading.loader.LoaderData; import nu.marginalia.loading.loader.SqlLoadDomains; import nu.marginalia.loading.loader.SqlLoadProcessedDomain; import nu.marginalia.converting.instruction.instructions.DomainLink; +import nu.marginalia.loading.loader.SqlLoadUrls; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.crawl.DomainIndexingState; import org.junit.jupiter.api.AfterEach; @@ -50,18 +51,18 @@ class SqlLoadProcessedDomainTest { @Test public void loadProcessedDomain() { - var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource)); + var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource), new SqlLoadUrls(dataSource)); loader.load(loaderData, new EdgeDomain("www.marginalia.nu"), DomainIndexingState.BLOCKED, "127.0.0.1"); } @Test public void 
loadProcessedDomainTwice() { - var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource)); + var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource), new SqlLoadUrls(dataSource)); loader.load(loaderData, new EdgeDomain("www.marginalia.nu"), DomainIndexingState.BLOCKED, "127.0.0.1"); } @Test public void loadProcessedDomaiWithExtremelyLongIP() { - var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource)); + var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource), new SqlLoadUrls(dataSource)); String ip = Stream.generate(() -> "127.").limit(1024).collect(Collectors.joining()); @@ -70,7 +71,7 @@ class SqlLoadProcessedDomainTest { @Test public void loadDomainAlias() { - var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource)); + var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource), new SqlLoadUrls(dataSource)); loader.loadAlias(loaderData, new DomainLink(new EdgeDomain("memex.marginalia.nu"), new EdgeDomain("www.marginalia.nu"))); } } \ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/ReconvertAndLoadActor.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/ReconvertAndLoadActor.java index 96730aa2..c6d020e9 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/ReconvertAndLoadActor.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/ReconvertAndLoadActor.java @@ -10,6 +10,7 @@ import nu.marginalia.control.svc.ProcessOutboxFactory; import nu.marginalia.control.svc.ProcessService; import nu.marginalia.index.client.IndexClient; import nu.marginalia.index.client.IndexMqEndpoints; +import nu.marginalia.mqapi.converting.ConvertAction; import nu.marginalia.mqapi.converting.ConvertRequest; import 
nu.marginalia.mqapi.loading.LoadRequest; import nu.marginalia.db.storage.FileStorageService; @@ -121,7 +122,10 @@ public class ReconvertAndLoadActor extends AbstractStateGraph { storageService.relateFileStorages(toProcess.id(), processedArea.id()); // Pre-send convert request - var request = new ConvertRequest(message.crawlStorageId, processedArea.id()); + var request = new ConvertRequest(ConvertAction.ConvertCrawlData, + null, + message.crawlStorageId, + processedArea.id()); long id = mqConverterOutbox.sendAsync(ConvertRequest.class.getSimpleName(), gson.toJson(request)); return message diff --git a/settings.gradle b/settings.gradle index 7e6d02a0..131b449e 100644 --- a/settings.gradle +++ b/settings.gradle @@ -175,6 +175,8 @@ dependencyResolutionManagement { library('handlebars','com.github.jknack','handlebars').version('4.3.1') library('handlebars.markdown','com.github.jknack','handlebars-markdown').version('4.2.1') + library('sqlite','org.xerial','sqlite-jdbc').version('3.41.2.1') + bundle('slf4j', ['slf4j.api', 'log4j.api', 'log4j.core', 'log4j.slf4j']) bundle('slf4j.test', ['slf4j.jdk14']) bundle('prometheus', ['prometheus', 'prometheus-servlet', 'prometheus-server', 'prometheus-hotspot']) From e237df4a109f2c3206de510b2bb86f72dc8a4fb5 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 28 Jul 2023 18:15:16 +0200 Subject: [PATCH 092/157] (converter) Use a dumb thread pool instead of Java's executor service. 
--- .../nu/marginalia/crawl/CrawlLimiter.java | 78 ++++++++++++- .../java/nu/marginalia/crawl/CrawlerMain.java | 40 +++---- .../nu/marginalia/crawl/DumbThreadPool.java | 109 ++++++++++++++++++ .../crawl/retreival/CrawlerRetreiver.java | 4 + .../retreival/fetcher/HttpFetcherImpl.java | 26 ++++- .../marginalia/crawling/RssCrawlerTest.java | 3 - 6 files changed, 225 insertions(+), 35 deletions(-) create mode 100644 code/processes/crawling-process/src/main/java/nu/marginalia/crawl/DumbThreadPool.java diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java index e987c926..200a41a7 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java @@ -1,19 +1,87 @@ package nu.marginalia.crawl; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; +import lombok.SneakyThrows; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import java.util.concurrent.Semaphore; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; public class CrawlLimiter { - public static final int maxPoolSize = Integer.getInteger("crawler.pool-size", 512); + public static final int maxPoolSize = Integer.getInteger("crawler.pool-size", 256); + + // Thresholds for throttling task-spawning. 
Note there's a bit of hysteresis to this + private static final long THROTTLE_TRIGGER_FREE_RAM = 2 * 1024 * 1024 * 1024L; + private static final long THROTTLE_RELEASE_FREE_RAM = 4 * 1024 * 1024 * 1024L; private final Semaphore taskSemCount = new Semaphore(maxPoolSize); + // When set to true, the crawler will wait before starting additional tasks + private final AtomicBoolean throttle = new AtomicBoolean(false); + private static final Logger logger = LoggerFactory.getLogger(CrawlLimiter.class); + + public CrawlLimiter() { + Thread monitorThread = new Thread(this::monitor, "Memory Monitor"); + monitorThread.setDaemon(true); + monitorThread.start(); + } + + + @SneakyThrows + public void monitor() { + for (;;) { + synchronized (throttle) { + boolean oldThrottle = throttle.get(); + boolean newThrottle = oldThrottle; + + if (Runtime.getRuntime().maxMemory() == Long.MAX_VALUE) { + // According to the spec this may happen, although it seems to rarely + // be the case in practice + logger.warn("Memory based throttling disabled (set Xmx)"); + return; + } + + final long freeMemory = Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory(); + + if (oldThrottle && freeMemory > THROTTLE_RELEASE_FREE_RAM) { + newThrottle = false; + logger.warn("Memory based throttling released"); + } + else if (!oldThrottle && freeMemory < THROTTLE_TRIGGER_FREE_RAM) { + newThrottle = true; + logger.warn("Memory based throttling triggered"); + } + + + throttle.set(newThrottle); + + if (!newThrottle) { + throttle.notifyAll(); + } + if (newThrottle != oldThrottle) { + logger.warn("Memory based throttling set to {}", newThrottle); + } + } + + TimeUnit.SECONDS.sleep(1); + } + } + + private void waitForEnoughRAM() throws InterruptedException { + while (!throttle.get()) { + synchronized (throttle) { + throttle.wait(30000); + } + } + } public void acquire() throws InterruptedException { - // It's very important that we acquire the RAM semaphore first to avoid a deadlock 
taskSemCount.acquire(1); + + if (taskSemCount.availablePermits() < maxPoolSize / 2) { + waitForEnoughRAM(); + } } public void release() { diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java index c06e610b..e062f8a9 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java @@ -53,9 +53,8 @@ public class CrawlerMain implements AutoCloseable { private final MessageQueueFactory messageQueueFactory; private final FileStorageService fileStorageService; private final Gson gson; - private final ThreadPoolExecutor pool; + private final DumbThreadPool pool; - public final CrawlLimiter crawlLimiter = new CrawlLimiter(); private final Set processedIds = new HashSet<>(); final AbortMonitor abortMonitor = AbortMonitor.getInstance(); @@ -76,12 +75,7 @@ public class CrawlerMain implements AutoCloseable { this.gson = gson; // maybe need to set -Xss for JVM to deal with this? - pool = new ThreadPoolExecutor( - CrawlLimiter.maxPoolSize /128, - CrawlLimiter.maxPoolSize, - 5, TimeUnit.MINUTES, - new LinkedBlockingQueue<>(32) - ); + pool = new DumbThreadPool(CrawlLimiter.maxPoolSize, 8); } public static void main(String... args) throws Exception { @@ -142,7 +136,7 @@ public class CrawlerMain implements AutoCloseable { startCrawlTask(plan, spec); } - pool.shutdown(); + pool.shutDown(); do { System.out.println("Waiting for pool to terminate... 
" + pool.getActiveCount() + " remaining"); } while (!pool.awaitTermination(60, TimeUnit.SECONDS)); @@ -172,20 +166,19 @@ public class CrawlerMain implements AutoCloseable { } try { - crawlLimiter.acquire(); - } catch (InterruptedException e) { - throw new RuntimeException(e); + pool.submit(() -> { + try { + Thread.currentThread().setName("crawling:" + crawlingSpecification.domain); + fetchDomain(crawlingSpecification); + heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks); + } finally { + Thread.currentThread().setName("[idle]"); + } + }); + } + catch (InterruptedException ex) { + throw new RuntimeException(ex); } - - pool.execute(() -> { - try { - fetchDomain(crawlingSpecification); - heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks); - } - finally { - crawlLimiter.release(); - } - }); } @@ -195,7 +188,6 @@ public class CrawlerMain implements AutoCloseable { HttpFetcher fetcher = new HttpFetcherImpl(userAgent.uaString(), dispatcher, connectionPool); - try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification)) { var retreiver = new CrawlerRetreiver(fetcher, specification, writer::accept); @@ -282,7 +274,7 @@ public class CrawlerMain implements AutoCloseable { public void close() throws Exception { logger.info("Awaiting termination"); - pool.shutdown(); + pool.shutDown(); while (!pool.awaitTermination(1, TimeUnit.SECONDS)); logger.info("All finished"); diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/DumbThreadPool.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/DumbThreadPool.java new file mode 100644 index 00000000..7a56be74 --- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/DumbThreadPool.java @@ -0,0 +1,109 @@ +package nu.marginalia.crawl; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.LinkedBlockingQueue; 
+import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; + +/** A simple thread pool implementation that will never invoke + * a task in the calling thread like {@link java.util.concurrent.ThreadPoolExecutor} + * does when the queue is full. Instead, it will block until a thread + * becomes available to run the task. This is useful for coarse grained + * tasks where the calling thread might otherwise block for hours. + */ +public class DumbThreadPool { + private final List workers = new ArrayList<>(); + private final LinkedBlockingQueue tasks; + private volatile boolean shutDown = false; + private final AtomicInteger taskCount = new AtomicInteger(0); + private final Logger logger = LoggerFactory.getLogger(DumbThreadPool.class); + + public DumbThreadPool(int poolSize, int queueSize) { + tasks = new LinkedBlockingQueue<>(queueSize); + + for (int i = 0; i < poolSize; i++) { + Thread worker = new Thread(this::worker, "Crawler Thread " + i); + worker.setDaemon(true); + worker.start(); + workers.add(worker); + } + + } + + public void submit(Runnable runnable) throws InterruptedException { + tasks.put(runnable); + } + + public void shutDown() { + this.shutDown = true; + } + + public void shutDownNow() { + this.shutDown = true; + for (Thread worker : workers) { + worker.interrupt(); + } + } + + private void worker() { + while (!shutDown) { + try { + Runnable task = tasks.poll(1, TimeUnit.SECONDS); + if (task == null) { + continue; + } + + try { + taskCount.incrementAndGet(); + task.run(); + } + catch (Exception ex) { + logger.warn("Error executing task", ex); + } + finally { + taskCount.decrementAndGet(); + } + } + + catch (InterruptedException ex) { + logger.warn("Thread pool worker interrupted", ex); + return; + } + } + } + + + public boolean awaitTermination(int i, TimeUnit timeUnit) { + final long start = System.currentTimeMillis(); + final long deadline = start + timeUnit.toMillis(i); + + for (var thread : workers) { + if 
(!thread.isAlive()) + continue; + + long timeRemaining = deadline - System.currentTimeMillis(); + + if (timeRemaining <= 0) + return false; + + try { + thread.join(timeRemaining); + } + catch (InterruptedException ex) { + logger.warn("Interrupted while waiting for thread pool to terminate", ex); + return false; + } + } + + return true; + } + + public int getActiveCount() { + return taskCount.get(); + } + +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index 87251059..3549b25b 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -516,6 +516,10 @@ public class CrawlerRetreiver { return false; if (doc == null) return false; + if (doc.documentBody == null) + return false; + if (newDoc.documentBody == null) + return false; return reference.isContentBodySame(doc, newDoc); } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java index 02cba42c..3bff814a 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java @@ -21,10 +21,13 @@ import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import javax.net.ssl.SSLException; import javax.net.ssl.X509TrustManager; +import java.io.EOFException; import java.io.IOException; import java.net.SocketTimeoutException; import java.net.URISyntaxException; +import java.net.UnknownHostException; import java.nio.charset.Charset; import 
java.nio.charset.IllegalCharsetNameException; import java.nio.charset.StandardCharsets; @@ -120,7 +123,7 @@ public class HttpFetcherImpl implements HttpFetcher { return probeDomain(new EdgeUrl("https", url.domain, url.port, url.path, url.param)); } - logger.info("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage()); + logger.info("Error during fetching", ex); return new FetchResult(FetchResultState.ERROR, url.domain); } } @@ -197,11 +200,18 @@ public class HttpFetcherImpl implements HttpFetcher { catch (SocketTimeoutException ex) { return createTimeoutErrorRsp(url, ex); } - catch (IllegalCharsetNameException ex) { + catch (IllegalCharsetNameException | SSLException | EOFException ex) { + // This is a bit of a grab-bag of errors that crop up + // IllegalCharsetName is egg on our face, + // but SSLException and EOFException are probably the server's fault + return createHardErrorRsp(url, ex); } + catch (UnknownHostException ex) { + return createUnknownHostError(url, ex); + } catch (Exception ex) { - logger.error("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage()); + logger.error("Error during fetching", ex); return createHardErrorRsp(url, ex); } } @@ -214,6 +224,16 @@ public class HttpFetcherImpl implements HttpFetcher { .url(url.toString()) .build(); } + + private CrawledDocument createUnknownHostError(EdgeUrl url, Exception why) { + return CrawledDocument.builder() + .crawlerStatus(CrawlerDocumentStatus.ERROR.toString()) + .crawlerStatusDesc("Unknown Host") + .timestamp(LocalDateTime.now().toString()) + .url(url.toString()) + .build(); + } + private CrawledDocument createTimeoutErrorRsp(EdgeUrl url, Exception why) { return CrawledDocument.builder() .crawlerStatus("Timeout") diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/RssCrawlerTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/RssCrawlerTest.java index 86caa3c7..05de76dc 100644 --- 
a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/RssCrawlerTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/RssCrawlerTest.java @@ -59,7 +59,4 @@ class RssCrawlerTest { return urls; } - - - } \ No newline at end of file From 01476577b8c62e069e486fbcd23f7d2755870f8e Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 28 Jul 2023 22:00:07 +0200 Subject: [PATCH 093/157] (loader) Speed up loading back to original speeds with a cascading DELETE FROM EC_URL rather than EC_PAGE_DATA. * Also clean up code and have proper rollbacks for transactions. --- .../nu/marginalia/loading/LoaderMain.java | 3 + .../loader/SqlLoadProcessedDocument.java | 93 ++++++++-------- .../loader/SqlLoadProcessedDomain.java | 36 +++--- .../loading/loader/SqlLoadUrls.java | 103 ++++++++++-------- 4 files changed, 129 insertions(+), 106 deletions(-) diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java index c8441330..68bcf8c4 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java @@ -57,6 +57,7 @@ public class LoaderMain { var instance = injector.getInstance(LoaderMain.class); try { var instructions = instance.fetchInstructions(); + logger.info("Instructions received"); instance.run(instructions); } catch (Exception ex) { @@ -103,6 +104,7 @@ public class LoaderMain { LoaderMain.loadTotal = loadTotal; + logger.info("Loading {} files", loadTotal); for (var entry : WorkLog.iterable(logFile)) { heartbeat.setProgress(loaded++ / (double) loadTotal); @@ -130,6 +132,7 @@ public class LoaderMain { logger.info("Loading finished"); } catch (Exception ex) { + ex.printStackTrace(); logger.error("Failed to load", ex); instructions.err(); throw ex; diff --git 
a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDocument.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDocument.java index 02c4202c..909ec986 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDocument.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDocument.java @@ -64,61 +64,66 @@ public class SqlLoadProcessedDocument { public void load(LoaderData data, List documents) { - try (var conn = dataSource.getConnection(); - var stmt = conn.prepareCall("CALL INSERT_PAGE_VISIT(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")) { - conn.setAutoCommit(false); + try (var conn = dataSource.getConnection()) { + try (var insertCall = conn.prepareCall("CALL INSERT_PAGE_VISIT(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)") + ) { + conn.setAutoCommit(false); - int cnt = 0; int batchOffset = 0; - for (var doc : documents) { - int urlId = data.getUrlId(doc.url()); - if (urlId <= 0) { - logger.warn("Failed to resolve ID for URL {}", doc.url()); - continue; - } + int cnt = 0; + int batchOffset = 0; + for (var doc : documents) { + int urlId = data.getUrlId(doc.url()); + if (urlId <= 0) { + logger.warn("Failed to resolve ID for URL {}", doc.url()); + continue; + } - stmt.setInt(1, urlId); - stmt.setString(2, doc.state().name()); - stmt.setString(3, doc.title()); - stmt.setString(4, StringUtils.truncate(doc.description(), 255)); - stmt.setInt(5, doc.length()); - stmt.setInt(6, doc.htmlFeatures()); - stmt.setString(7, doc.standard()); - stmt.setDouble(8, doc.quality()); - stmt.setLong(9, doc.hash()); - if (doc.pubYear() != null) { - stmt.setShort(10, (short) doc.pubYear().intValue()); - } - else { - stmt.setInt(10, Types.SMALLINT); - } - stmt.addBatch(); + insertCall.setInt(1, urlId); + insertCall.setString(2, doc.state().name()); + insertCall.setString(3, doc.title()); + insertCall.setString(4, 
StringUtils.truncate(doc.description(), 255)); + insertCall.setInt(5, doc.length()); + insertCall.setInt(6, doc.htmlFeatures()); + insertCall.setString(7, doc.standard()); + insertCall.setDouble(8, doc.quality()); + insertCall.setLong(9, doc.hash()); + if (doc.pubYear() != null) { + insertCall.setShort(10, (short) doc.pubYear().intValue()); + } else { + insertCall.setInt(10, Types.SMALLINT); + } + insertCall.addBatch(); - if (++cnt == 100) { - var ret = stmt.executeBatch(); + if (++cnt == 100) { + var ret = insertCall.executeBatch(); + conn.commit(); + + for (int rv = 0; rv < cnt; rv++) { + if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) { + logger.warn("load({}) -- bad row count {}", documents.get(batchOffset + rv), ret[rv]); + } + } + + cnt = 0; + batchOffset += 100; + } + } + if (cnt > 0) { + var ret = insertCall.executeBatch(); conn.commit(); - for (int rv = 0; rv < cnt; rv++) { if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) { logger.warn("load({}) -- bad row count {}", documents.get(batchOffset + rv), ret[rv]); } } - - cnt = 0; - batchOffset += 100; } - } - if (cnt > 0) { - var ret = stmt.executeBatch(); - conn.commit(); - for (int rv = 0; rv < cnt; rv++) { - if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) { - logger.warn("load({}) -- bad row count {}", documents.get(batchOffset + rv), ret[rv]); - } - } - } - - conn.setAutoCommit(true); + conn.setAutoCommit(true); + } + catch (SQLException ex) { + conn.rollback(); + throw ex; + } } catch (SQLException ex) { logger.warn("SQL error inserting document", ex); diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDomain.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDomain.java index df598b14..9ac576af 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDomain.java +++ 
b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDomain.java @@ -27,6 +27,10 @@ public class SqlLoadProcessedDomain { try (var conn = dataSource.getConnection()) { try (var stmt = conn.createStatement()) { stmt.execute("DROP PROCEDURE IF EXISTS INITIALIZE_DOMAIN"); + + // Note that there should be no need to delete from EC_PAGE_DATA here as it's done via their + // CASCADE DELETE constraint on EC_URL. + stmt.execute(""" CREATE PROCEDURE INITIALIZE_DOMAIN ( IN ST ENUM('ACTIVE', 'EXHAUSTED', 'SPECIAL', 'SOCIAL_MEDIA', 'BLOCKED', 'REDIR', 'ERROR', 'UNKNOWN'), @@ -36,7 +40,7 @@ public class SqlLoadProcessedDomain { BEGIN DELETE FROM DOMAIN_METADATA WHERE ID=DID; DELETE FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=DID; - DELETE FROM EC_PAGE_DATA WHERE ID IN (SELECT ID FROM EC_URL WHERE DOMAIN_ID = DID); + DELETE FROM EC_URL WHERE DOMAIN_ID=DID; UPDATE EC_DOMAIN SET INDEX_DATE=NOW(), STATE=ST, DOMAIN_ALIAS=NULL, INDEXED=GREATEST(INDEXED,IDX), IP=IP WHERE ID=DID; DELETE FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=DID; END @@ -54,20 +58,24 @@ public class SqlLoadProcessedDomain { loadDomains.load(data, domain); - try (var conn = dataSource.getConnection(); - var initCall = conn.prepareCall("CALL INITIALIZE_DOMAIN(?,?,?,?)")) - { - initCall.setString(1, state.name()); - initCall.setInt(2, 1 + data.sizeHint / 100); - initCall.setInt(3, data.getDomainId(domain)); - initCall.setString(4, StringUtils.truncate(ip, 48)); - int rc = initCall.executeUpdate(); - conn.commit(); - if (rc < 1) { - logger.warn("load({},{}) -- bad rowcount {}", domain, state, rc); - } + try (var conn = dataSource.getConnection()) { + try (var initCall = conn.prepareCall("CALL INITIALIZE_DOMAIN(?,?,?,?)")) { + initCall.setString(1, state.name()); + initCall.setInt(2, 1 + data.sizeHint / 100); + initCall.setInt(3, data.getDomainId(domain)); + initCall.setString(4, StringUtils.truncate(ip, 48)); + int rc = initCall.executeUpdate(); + conn.commit(); + if (rc < 1) { + 
logger.warn("load({},{}) -- bad rowcount {}", domain, state, rc); + } - loadUrls.loadUrlsForDomain(data, domain, 0); + loadUrls.loadUrlsForDomain(data, domain, 0); + } + catch (SQLException ex) { + conn.rollback(); + throw ex; + } } catch (SQLException ex) { logger.warn("SQL error initializing domain", ex); diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadUrls.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadUrls.java index a0b0f8cb..922baf91 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadUrls.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadUrls.java @@ -34,69 +34,76 @@ public class SqlLoadUrls { return; int maxOldId = 0; - try (var conn = dataSource.getConnection(); - var insertCall = conn.prepareStatement("INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PARAM,PATH_HASH) VALUES (?,?,?,?,?,?)"); - var queryMaxId = conn.prepareStatement("SELECT MAX(ID) FROM EC_URL")) - { - conn.setAutoCommit(false); - var rs = queryMaxId.executeQuery(); - if (rs.next()) { - maxOldId = rs.getInt(1); - } + try (var conn = dataSource.getConnection()) { - int cnt = 0; int batchOffset = 0; + try (var insertStmt = conn.prepareStatement("INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PARAM,PATH_HASH) VALUES (?,?,?,?,?,?)"); + var queryMaxId = conn.prepareStatement("SELECT MAX(ID) FROM EC_URL")) { - for (var url : urls) { - if (data.getUrlId(url) != 0) - continue; - if (url.path.length() >= 255) { - logger.info("Skipping bad URL {}", url); - continue; + conn.setAutoCommit(false); + + var rs = queryMaxId.executeQuery(); + if (rs.next()) { + maxOldId = rs.getInt(1); } - var domainId = data.getDomainId(url.domain); - affectedDomains.add(url.domain); + int cnt = 0; + int batchOffset = 0; - insertCall.setString(1, url.proto); - insertCall.setInt(2, domainId); - if (url.port != null) { - insertCall.setInt(3, 
url.port); + for (var url : urls) { + if (data.getUrlId(url) != 0) + continue; + if (url.path.length() >= 255) { + logger.info("Skipping bad URL {}", url); + continue; + } + var domainId = data.getDomainId(url.domain); + + affectedDomains.add(url.domain); + + insertStmt.setString(1, url.proto); + insertStmt.setInt(2, domainId); + if (url.port != null) { + insertStmt.setInt(3, url.port); + } else { + insertStmt.setNull(3, Types.INTEGER); + } + insertStmt.setString(4, url.path); + insertStmt.setString(5, url.param); + insertStmt.setLong(6, hashPath(url.path, url.param)); + insertStmt.addBatch(); + + if (++cnt == 1000) { + var ret = insertStmt.executeBatch(); + for (int rv = 0; rv < cnt; rv++) { + if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) { + logger.warn("load({}) -- bad row count {}", urls[batchOffset + rv], ret[rv]); + } + } + + batchOffset += cnt; + cnt = 0; + } } - else { - insertCall.setNull(3, Types.INTEGER); - } - insertCall.setString(4, url.path); - insertCall.setString(5, url.param); - insertCall.setLong(6, hashPath(url.path, url.param)); - insertCall.addBatch(); - if (++cnt == 1000) { - var ret = insertCall.executeBatch(); + if (cnt > 0) { + var ret = insertStmt.executeBatch(); for (int rv = 0; rv < cnt; rv++) { if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) { logger.warn("load({}) -- bad row count {}", urls[batchOffset + rv], ret[rv]); } } + } - batchOffset += cnt; - cnt = 0; + conn.commit(); + conn.setAutoCommit(true); + + for (var domain : affectedDomains) { + loadUrlsForDomain(data, domain, maxOldId); } } - - if (cnt > 0) { - var ret = insertCall.executeBatch(); - for (int rv = 0; rv < cnt; rv++) { - if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) { - logger.warn("load({}) -- bad row count {}", urls[batchOffset + rv], ret[rv]); - } - } - } - - conn.commit(); - conn.setAutoCommit(true); - - for (var domain : affectedDomains) { - loadUrlsForDomain(data, domain, maxOldId); + catch (SQLException ex) { + conn.rollback(); + throw ex; } } catch 
(SQLException ex) { From 866db6c63f0496c8e410cab0631f14642d4bb160 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 28 Jul 2023 22:02:05 +0200 Subject: [PATCH 094/157] (control) Dialog for updating message state; clean up file view. --- .../mq/persistence/MqPersistence.java | 69 ++++++++++++++++++- .../nu/marginalia/control/ControlService.java | 15 +++- .../control/model/FileStorageFileModel.java | 14 +--- .../svc/ControlFileStorageService.java | 10 ++- .../control/dialog-update-message-state.hdb | 41 +++++++++++ .../control/partials/message-queue-table.hdb | 62 +---------------- .../templates/control/storage-details.hdb | 7 +- 7 files changed, 134 insertions(+), 84 deletions(-) create mode 100644 code/services-satellite/control-service/src/main/resources/templates/control/dialog-update-message-state.hdb diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java index dce9d402..3413ffea 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java @@ -12,6 +12,8 @@ import java.sql.SQLException; import java.time.Duration; import java.util.*; +import static nu.marginalia.mq.MqMessageState.NEW; + @Singleton public class MqPersistence { private final HikariDataSource dataSource; @@ -100,12 +102,18 @@ public class MqPersistence { /** Modifies the state of a message by id */ public void updateMessageState(long id, MqMessageState mqMessageState) throws SQLException { + if (NEW == mqMessageState) { + reinitializeMessage(id); + return; + } + try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(""" UPDATE MESSAGE_QUEUE SET STATE=?, UPDATED_TIME=CURRENT_TIMESTAMP(6) WHERE ID=? 
""")) { + stmt.setString(1, mqMessageState.name()); stmt.setLong(2, id); @@ -115,6 +123,26 @@ public class MqPersistence { } } + /** Sets the message to 'NEW' state and removes any owner */ + public void reinitializeMessage(long id) throws SQLException { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + UPDATE MESSAGE_QUEUE + SET STATE='NEW', + OWNER_INSTANCE=NULL, + OWNER_TICK=NULL, + UPDATED_TIME=CURRENT_TIMESTAMP(6) + WHERE ID=? + """)) { + + stmt.setLong(1, id); + + if (stmt.executeUpdate() != 1) { + throw new IllegalArgumentException("No rows updated"); + } + } + } + /** Creates a new message in the queue referencing as a reply to an existing message * This message will have it's RELATED_ID set to the original message's ID. */ @@ -207,7 +235,8 @@ public class MqPersistence { AND RECIPIENT_INBOX=? LIMIT ? """) - ) { + ) + { queryStmt.setString(1, inboxName); queryStmt.setInt(2, n); var rs = queryStmt.executeQuery(); @@ -230,9 +259,44 @@ public class MqPersistence { } return messages; + } + } -} + public MqMessage getMessage(long id) throws SQLException { + try (var conn = dataSource.getConnection(); + var queryStmt = conn.prepareStatement(""" + SELECT + ID, + RELATED_ID, + FUNCTION, + PAYLOAD, + STATE, + SENDER_INBOX IS NOT NULL AS EXPECTS_RESPONSE + FROM MESSAGE_QUEUE + WHERE ID=? 
+ """) + ) + { + queryStmt.setLong(1, id); + var rs = queryStmt.executeQuery(); + + if (rs.next()) { + long msgId = rs.getLong("ID"); + long relatedId = rs.getLong("RELATED_ID"); + + String function = rs.getString("FUNCTION"); + String payload = rs.getString("PAYLOAD"); + + MqMessageState state = MqMessageState.valueOf(rs.getString("STATE")); + boolean expectsResponse = rs.getBoolean("EXPECTS_RESPONSE"); + + return new MqMessage(msgId, relatedId, function, payload, state, expectsResponse); + } + } + + throw new IllegalArgumentException("No message with id " + id); + } /** Marks unclaimed messages addressed to this inbox with instanceUUID and tick, * then returns these messages. */ @@ -378,4 +442,5 @@ public class MqPersistence { } } + } diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java index 3411058c..e1efc3e4 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java @@ -8,6 +8,8 @@ import nu.marginalia.control.svc.*; import nu.marginalia.db.storage.model.FileStorageId; import nu.marginalia.db.storage.model.FileStorageType; import nu.marginalia.model.gson.GsonFactory; +import nu.marginalia.mq.MqMessageState; +import nu.marginalia.mq.persistence.MqPersistence; import nu.marginalia.renderer.RendererFactory; import nu.marginalia.service.server.*; import org.slf4j.Logger; @@ -43,7 +45,8 @@ public class ControlService extends Service { ControlActorService controlActorService, StaticResources staticResources, MessageQueueViewService messageQueueViewService, - ControlFileStorageService controlFileStorageService + ControlFileStorageService controlFileStorageService, + MqPersistence persistence ) throws IOException { super(params); @@ -60,7 +63,9 @@ public class 
ControlService extends Service { var storageSpecsRenderer = rendererFactory.renderer("control/storage-specs"); var storageCrawlsRenderer = rendererFactory.renderer("control/storage-crawls"); var storageProcessedRenderer = rendererFactory.renderer("control/storage-processed"); + var storageDetailsRenderer = rendererFactory.renderer("control/storage-details"); + var updateMessageStateRenderer = rendererFactory.renderer("control/dialog-update-message-state"); this.controlActorService = controlActorService; @@ -102,6 +107,14 @@ public class ControlService extends Service { Spark.post("/public/storage/specs", controlActorService::createCrawlSpecification, redirectToStorage); Spark.post("/public/storage/:fid/delete", controlFileStorageService::flagFileForDeletionRequest, redirectToStorage); + Spark.get("/public/message/:id/state", (rq, rsp) -> persistence.getMessage(Long.parseLong(rq.params("id"))), updateMessageStateRenderer::render); + Spark.post("/public/message/:id/state", (rq, rsp) -> { + MqMessageState state = MqMessageState.valueOf(rq.queryParams("state")); + long id = Long.parseLong(rq.params("id")); + persistence.updateMessageState(id, state); + return ""; + }, redirectToProcesses); + Spark.get("/public/:resource", this::serveStatic); monitors.subscribe(this::logMonitorStateChange); diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageFileModel.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageFileModel.java index c8b513ee..41da73e8 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageFileModel.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageFileModel.java @@ -1,15 +1,7 @@ package nu.marginalia.control.model; -import nu.marginalia.db.storage.model.FileStorage; - -import java.util.List; - public record FileStorageFileModel(String filename, - String type, 
- String size - ) { - - public boolean isDownloadable() { - return type.equals("file"); - } + String mTime, + String size) +{ } diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlFileStorageService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlFileStorageService.java index 06bf240d..f80287f4 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlFileStorageService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlFileStorageService.java @@ -116,11 +116,9 @@ public class ControlFileStorageService { try (var filesStream = Files.list(storage.asPath())) { filesStream + .filter(Files::isRegularFile) .map(this::createFileModel) - .sorted(Comparator - .comparing(FileStorageFileModel::type) - .thenComparing(FileStorageFileModel::filename) - ) + .sorted(Comparator.comparing(FileStorageFileModel::filename)) .forEach(files::add); } catch (IOException ex) { @@ -132,7 +130,7 @@ public class ControlFileStorageService { private FileStorageFileModel createFileModel(Path p) { try { - String type = Files.isRegularFile(p) ? 
"file" : "directory"; + String mTime = Files.getLastModifiedTime(p).toInstant().toString(); String size; if (Files.isDirectory(p)) { size = "-"; @@ -146,7 +144,7 @@ public class ControlFileStorageService { else size = sizeBytes / (1024 * 1024 * 1024) + " GB"; } - return new FileStorageFileModel(p.toFile().getName(), type, size); + return new FileStorageFileModel(p.toFile().getName(), mTime, size); } catch (IOException ex) { throw new RuntimeException(ex); diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/dialog-update-message-state.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/dialog-update-message-state.hdb new file mode 100644 index 00000000..e5ee806f --- /dev/null +++ b/code/services-satellite/control-service/src/main/resources/templates/control/dialog-update-message-state.hdb @@ -0,0 +1,41 @@ + + + +Update ID + +{{> control/partials/nav}} +
    +

    Update Message State

    +

    Update the state of a message in the message queue. This may be useful to prevent an actor +from resuming an action when this is not desirable. Setting an old message to 'NEW' will +erase information about its owner, and inboxes will consider the message new again.

    + +
    + +

    +
    + +

    +
    + +

    +
    + +

    +
    + +
    +
    +
    + + + + +
    + \ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/partials/message-queue-table.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/partials/message-queue-table.hdb index 5c3397f4..cc8d98a2 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/partials/message-queue-table.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/partials/message-queue-table.hdb @@ -11,8 +11,8 @@ {{#each messages}} - {{stateCode}} {{state}} - {{id}} + {{stateCode}} {{state}} + {{id}} {{recipientInbox}} {{function}} @@ -30,61 +30,3 @@ {{/each}} - - -

    Edit Message

    -
    -
    - - - - - - - - - - - - -
    - -
    -
    - -
    -
    -
    -
    - - \ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/storage-details.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/storage-details.hdb index ec7d4ef0..c811b478 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/storage-details.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/storage-details.hdb @@ -30,15 +30,14 @@ - + {{#each storage.files}} - {{else}} {{filename}} {{/if}} - + {{filename}} + {{/each}} From 9ad32ee9c72e1c2262f24d3f9c7b5cf8b507850d Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 29 Jul 2023 19:16:00 +0200 Subject: [PATCH 095/157] (control) Be more clear about when a process exits and why. --- .../java/nu/marginalia/control/svc/ProcessService.java | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java index 0281ed43..9496c2a1 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java @@ -84,7 +84,6 @@ public class ProcessService { var os = new BufferedReader(new InputStreamReader(process.getInputStream())) ) { eventLog.logEvent("PROCESS-STARTED", processId.toString()); - process.onExit().whenComplete((p,t) -> eventLog.logEvent("PROCESS-EXIT", processId.toString())); while (process.isAlive()) { if (es.ready()) @@ -93,9 +92,16 @@ public class ProcessService { logger.info(processMarker, os.readLine()); } - return 0 == process.waitFor(); + final int returnCode = process.waitFor(); + logger.info("Process {} terminated with code {}", processId, returnCode); + return 0 == returnCode; + } + 
catch (Exception ex) { + logger.info("Process {} terminated with code exception", processId); + throw ex; } finally { + eventLog.logEvent("PROCESS-EXIT", processId.toString()); processes.remove(processId); } From 7611b7900d8595321735f54c6b118f0e8b8785d8 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 29 Jul 2023 19:16:31 +0200 Subject: [PATCH 096/157] (crawler) Reduce long term memory allocation in DomainCrawlFrontier (crawler) Reduce long term memory allocation in DomainCrawlFrontier --- code/processes/crawling-process/build.gradle | 2 + .../crawl/retreival/DomainCrawlFrontier.java | 62 ++++++++++++++----- .../retreival/DomainCrawlFrontierTest.java | 32 ++++++++++ 3 files changed, 82 insertions(+), 14 deletions(-) create mode 100644 code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/DomainCrawlFrontierTest.java diff --git a/code/processes/crawling-process/build.gradle b/code/processes/crawling-process/build.gradle index 48068620..fcc7862d 100644 --- a/code/processes/crawling-process/build.gradle +++ b/code/processes/crawling-process/build.gradle @@ -52,6 +52,8 @@ dependencies { implementation libs.jsoup implementation libs.opencsv implementation libs.rxjava + implementation libs.fastutil + implementation libs.bundles.mariadb testImplementation libs.bundles.slf4j.test diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java index 4b9cc265..4b1b9ad1 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java @@ -1,16 +1,28 @@ package nu.marginalia.crawl.retreival; +import com.google.common.hash.HashFunction; +import it.unimi.dsi.fastutil.longs.LongOpenHashSet; import nu.marginalia.ip_blocklist.UrlBlocklist; import 
nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; +import java.net.URISyntaxException; import java.util.*; import java.util.function.Predicate; public class DomainCrawlFrontier { - private final LinkedList queue = new LinkedList<>(); - private final HashSet visited; - private final HashSet known; + private final ArrayDeque queue; + + // To save the number of strings kept in memory, + // do an approximate check using 64 bit hashes instead + // .. + // This isn't perfect, and may lead to false positives, + // but this is relatively unlikely, since the cardinality of these + // need to be in the billions to approach Birthday Paradox + // territory + private final LongOpenHashSet visited; + private final LongOpenHashSet known; + private final HashFunction hasher = com.google.common.hash.Hashing.murmur3_128(); private final EdgeDomain thisDomain; private final UrlBlocklist urlBlocklist; @@ -24,8 +36,9 @@ public class DomainCrawlFrontier { this.urlBlocklist = new UrlBlocklist(); this.depth = depth; - visited = new HashSet<>((int)(urls.size() * 1.5)); - known = new HashSet<>(urls.size() * 10); + queue = new ArrayDeque<>(10 + (int) (urls.size()*1.2)); + visited = new LongOpenHashSet(10 + (int)(urls.size() * 1.5)); + known = new LongOpenHashSet(10 + urls.size() * 2); for (String urlStr : urls) { EdgeUrl.parse(urlStr).ifPresent(this::addToQueue); @@ -48,21 +61,42 @@ public class DomainCrawlFrontier { } public void addFirst(EdgeUrl url) { - if (known.add(url.toString())) { - queue.addFirst(url); + if (addKnown(url)) { + queue.addFirst(url.toString()); } } public EdgeUrl takeNextUrl() { - return queue.removeFirst(); + try { + return new EdgeUrl(queue.removeFirst()); + } catch (URISyntaxException e) { + // This should never happen since we only add urls via EdgeUrl.toString() + throw new RuntimeException(e); + } } public EdgeUrl peek() { - return queue.peek(); + try { + return new EdgeUrl(queue.peek()); + } catch (URISyntaxException e) { + // This should never 
happen since we only add urls via EdgeUrl.toString() + throw new RuntimeException(e); + } } public boolean addVisited(EdgeUrl url) { - return visited.add(url.toString()); + long hashCode = hasher.hashUnencodedChars(url.toString()).padToLong(); + + return visited.add(hashCode); + } + public boolean addKnown(EdgeUrl url) { + long hashCode = hasher.hashUnencodedChars(url.toString()).padToLong(); + return known.add(hashCode); + } + + boolean isVisited(EdgeUrl url) { + long hashCode = hasher.hashUnencodedChars(url.toString()).padToLong(); + return visited.contains(hashCode); } public boolean filterLink(EdgeUrl url) { @@ -80,14 +114,14 @@ public class DomainCrawlFrontier { return; // reduce memory usage by not growing queue huge when crawling large sites - if (queue.size() + visited.size() >= depth + 1000) + if (queue.size() + visited.size() >= depth + 200) return; - if (visited.contains(url.toString())) + if (isVisited(url)) return; - if (known.add(url.toString())) { - queue.addLast(url); + if (addKnown(url)) { + queue.addLast(url.toString()); } } diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/DomainCrawlFrontierTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/DomainCrawlFrontierTest.java new file mode 100644 index 00000000..1396444b --- /dev/null +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/DomainCrawlFrontierTest.java @@ -0,0 +1,32 @@ +package nu.marginalia.crawl.retreival; + +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; +import org.junit.jupiter.api.Test; + +import java.net.URISyntaxException; +import java.util.Set; + +import static org.junit.jupiter.api.Assertions.*; + +class DomainCrawlFrontierTest { + + @Test + public void testVisited() throws URISyntaxException { + var dcf = new DomainCrawlFrontier(new EdgeDomain("example.com"), Set.of(), 100); + + assertTrue(dcf.addVisited(new EdgeUrl("https://example.com"))); + 
assertTrue(dcf.isVisited(new EdgeUrl("https://example.com"))); + assertFalse(dcf.addVisited(new EdgeUrl("https://example.com"))); + } + + @Test + public void testKnown() throws URISyntaxException { + var dcf = new DomainCrawlFrontier(new EdgeDomain("example.com"), Set.of(), 100); + + assertTrue(dcf.addKnown(new EdgeUrl("https://example.com"))); + assertFalse(dcf.addKnown(new EdgeUrl("https://example.com/"))); + assertTrue(dcf.addKnown(new EdgeUrl("https://example.com/index.html"))); + assertFalse(dcf.addKnown(new EdgeUrl("https://example.com"))); + } +} \ No newline at end of file From d2b6b2044c5f703a4ac3050227b034ad8cb6c265 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 29 Jul 2023 19:16:53 +0200 Subject: [PATCH 097/157] (crawler) Reduce log spam in HttpFetcherImpl --- .../crawl/retreival/fetcher/HttpFetcherImpl.java | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java index 3bff814a..b0b0fd9d 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java @@ -12,7 +12,6 @@ import nu.marginalia.crawling.model.CrawlerDocumentStatus; import nu.marginalia.crawling.model.ContentType; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.bigstring.BigString; import nu.marginalia.crawl.retreival.logic.ContentTypeLogic; import nu.marginalia.crawl.retreival.logic.ContentTypeParser; import okhttp3.*; @@ -25,9 +24,7 @@ import javax.net.ssl.SSLException; import javax.net.ssl.X509TrustManager; import java.io.EOFException; import java.io.IOException; -import java.net.SocketTimeoutException; -import java.net.URISyntaxException; 
-import java.net.UnknownHostException; +import java.net.*; import java.nio.charset.Charset; import java.nio.charset.IllegalCharsetNameException; import java.nio.charset.StandardCharsets; @@ -200,16 +197,16 @@ public class HttpFetcherImpl implements HttpFetcher { catch (SocketTimeoutException ex) { return createTimeoutErrorRsp(url, ex); } - catch (IllegalCharsetNameException | SSLException | EOFException ex) { + catch (UnknownHostException ex) { + return createUnknownHostError(url, ex); + } + catch (SocketException | ProtocolException | IllegalCharsetNameException | SSLException | EOFException ex) { // This is a bit of a grab-bag of errors that crop up // IllegalCharsetName is egg on our face, // but SSLException and EOFException are probably the server's fault return createHardErrorRsp(url, ex); } - catch (UnknownHostException ex) { - return createUnknownHostError(url, ex); - } catch (Exception ex) { logger.error("Error during fetching", ex); return createHardErrorRsp(url, ex); From 05ba3bab96696fe684827fa6f185b34a0452b857 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 29 Jul 2023 19:17:19 +0200 Subject: [PATCH 098/157] (crawler) Make SitemapRetriever abort on too large sitemaps. 
--- .../retreival/fetcher/SitemapRetriever.java | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/SitemapRetriever.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/SitemapRetriever.java index bb2d2898..90b26a88 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/SitemapRetriever.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/SitemapRetriever.java @@ -5,7 +5,6 @@ import nu.marginalia.model.EdgeUrl; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import javax.inject.Singleton; import java.io.FileNotFoundException; import java.io.IOException; import java.util.*; @@ -44,14 +43,19 @@ public class SitemapRetriever { final List urlsList = new ArrayList<>(10000); final Set seenUrls = new HashSet<>(); - final LinkedList maps = new LinkedList<>(); + final ArrayDeque maps = new ArrayDeque<>(); maps.add(map); - while (!maps.isEmpty()) { + while (!maps.isEmpty() && seenSiteMapUrls.size() > 2) { if (urlsList.size() >= 10000) break; + // This is some weird site that too many sitemaps + // ... 
it's causing us to run out of memory + if (seenSiteMapUrls.size() > 25) + break; + var firstMap = maps.removeFirst(); if (!seenSiteMapUrls.add(firstMap.getUrl().toString())) { @@ -74,7 +78,12 @@ public class SitemapRetriever { } else if (map instanceof SiteMapIndex index) { var sitemaps = index.getSitemaps(false); - maps.addAll(sitemaps); + for (var sitemap : sitemaps) { + // Limit how many sitemaps we can add to the queue + if (maps.size() < 25) { + maps.add(sitemap); + } + } } else { logger.warn("Unknown sitemap type: {}", map.getClass()); From d3f01bd1714abac891fcc7b2ba6c015a911e5880 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 29 Jul 2023 19:17:38 +0200 Subject: [PATCH 099/157] (crawler, converter) Remove monkey patched gson from dependencies --- code/process-models/converting-model/build.gradle | 2 +- code/processes/converting-process/build.gradle | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/code/process-models/converting-model/build.gradle b/code/process-models/converting-model/build.gradle index ae48aa32..11426794 100644 --- a/code/process-models/converting-model/build.gradle +++ b/code/process-models/converting-model/build.gradle @@ -12,7 +12,7 @@ java { } dependencies { - implementation project(':third-party:monkey-patch-gson') + //implementation project(':third-party:monkey-patch-gson') implementation project(':code:common:db') implementation project(':code:common:model') diff --git a/code/processes/converting-process/build.gradle b/code/processes/converting-process/build.gradle index a14ee596..b8e199f3 100644 --- a/code/processes/converting-process/build.gradle +++ b/code/processes/converting-process/build.gradle @@ -20,7 +20,7 @@ tasks.distZip.enabled = false dependencies { - implementation project(':third-party:monkey-patch-gson') + //implementation project(':third-party:monkey-patch-gson') implementation project(':code:common:process') From ee143bbc4852f37026b750af76f858dc647bcf1a Mon Sep 17 00:00:00 2001 From: 
Viktor Lofgren Date: Sat, 29 Jul 2023 19:19:09 +0200 Subject: [PATCH 100/157] (crawler, converter) Fix so that DumbThreadPool actually waits for termination as intended. --- .../marginalia/converting/ConverterMain.java | 18 +-- .../marginalia/converting/DumbThreadPool.java | 118 ++++++++++++++++++ .../nu/marginalia/crawl/DumbThreadPool.java | 25 ++-- 3 files changed, 139 insertions(+), 22 deletions(-) create mode 100644 code/processes/converting-process/src/main/java/nu/marginalia/converting/DumbThreadPool.java diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java index c7584a6c..99445d81 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java @@ -108,19 +108,13 @@ public class ConverterMain { public void convert(CrawlPlan plan) throws Exception { - final int maxPoolSize = 16; + final int maxPoolSize = Runtime.getRuntime().availableProcessors(); try (WorkLog processLog = plan.createProcessWorkLog(); ConversionLog log = new ConversionLog(plan.process.getDir())) { var instructionWriter = new InstructionWriterFactory(log, plan.process.getDir(), gson); - Semaphore semaphore = new Semaphore(maxPoolSize); - var pool = new ThreadPoolExecutor( - maxPoolSize/4, - maxPoolSize, - 5, TimeUnit.MINUTES, - new LinkedBlockingQueue<>(8) - ); + var pool = new DumbThreadPool(maxPoolSize, 2); int totalDomains = plan.countCrawledDomains(); AtomicInteger processedDomains = new AtomicInteger(0); @@ -131,8 +125,7 @@ public class ConverterMain { for (var domain : plan.crawlDataIterable(id -> !processLog.isJobFinished(id))) { - semaphore.acquire(); - pool.execute(() -> { + pool.submit(() -> { try { ProcessedDomain processed = processor.process(domain); @@ -151,13 +144,10 @@ public class ConverterMain { 
catch (IOException ex) { logger.warn("IO exception in converter", ex); } - finally { - semaphore.release(); - } }); } - pool.shutdown(); + pool.shutDown(); do { System.out.println("Waiting for pool to terminate... " + pool.getActiveCount() + " remaining"); } while (!pool.awaitTermination(60, TimeUnit.SECONDS)); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/DumbThreadPool.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/DumbThreadPool.java new file mode 100644 index 00000000..3175fec7 --- /dev/null +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/DumbThreadPool.java @@ -0,0 +1,118 @@ +package nu.marginalia.converting; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; + +/** A simple thread pool implementation that will never invoke + * a task in the calling thread like {@link java.util.concurrent.ThreadPoolExecutor} + * does when the queue is full. Instead, it will block until a thread + * becomes available to run the task. This is useful for coarse grained + * tasks where the calling thread might otherwise block for hours. 
+ */ +public class DumbThreadPool { + private final List workers = new ArrayList<>(); + private final LinkedBlockingQueue tasks; + private volatile boolean shutDown = false; + private final AtomicInteger taskCount = new AtomicInteger(0); + private final Logger logger = LoggerFactory.getLogger(DumbThreadPool.class); + + public DumbThreadPool(int poolSize, int queueSize) { + tasks = new LinkedBlockingQueue<>(queueSize); + + for (int i = 0; i < poolSize; i++) { + Thread worker = new Thread(this::worker, "Crawler Thread " + i); + worker.setDaemon(true); + worker.start(); + workers.add(worker); + } + + } + + public void submit(Runnable runnable) throws InterruptedException { + tasks.put(runnable); + } + + public void shutDown() { + this.shutDown = true; + } + + public void shutDownNow() { + this.shutDown = true; + for (Thread worker : workers) { + worker.interrupt(); + } + } + + private void worker() { + while (!shutDown) { + try { + Runnable task = tasks.poll(1, TimeUnit.SECONDS); + if (task == null) { + continue; + } + + try { + taskCount.incrementAndGet(); + task.run(); + } + catch (Exception ex) { + logger.warn("Error executing task", ex); + } + finally { + taskCount.decrementAndGet(); + } + } + + catch (InterruptedException ex) { + logger.warn("Thread pool worker interrupted", ex); + return; + } + } + } + + + /** Wait for all tasks to complete up to the specified timeout, + * then return true if all tasks completed, false otherwise. + */ + public boolean awaitTermination(int i, TimeUnit timeUnit) throws InterruptedException { + final long start = System.currentTimeMillis(); + final long deadline = start + timeUnit.toMillis(i); + + for (var thread : workers) { + if (!thread.isAlive()) + continue; + + long timeRemaining = deadline - System.currentTimeMillis(); + if (timeRemaining <= 0) + return false; + + thread.join(timeRemaining); + if (thread.isAlive()) + return false; + } + + // Doublecheck the bookkeeping so we didn't mess up. 
This may mean you have to Ctrl+C the process + // if you see this warning forever, but for the crawler this is preferable to terminating early + // and missing tasks. (maybe some cosmic ray or OOM condition or X-Files baddie of the week killed a + // thread so hard and it didn't invoke finally and didn't decrement the task count) + + int activeCount = getActiveCount(); + if (activeCount != 0) { + logger.warn("Thread pool terminated with {} active threads(?!) -- check what's going on with jstack and kill manually", activeCount); + return false; + } + + return true; + } + + public int getActiveCount() { + return taskCount.get(); + } + +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/DumbThreadPool.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/DumbThreadPool.java index 7a56be74..676e3286 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/DumbThreadPool.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/DumbThreadPool.java @@ -77,7 +77,10 @@ public class DumbThreadPool { } - public boolean awaitTermination(int i, TimeUnit timeUnit) { + /** Wait for all tasks to complete up to the specified timeout, + * then return true if all tasks completed, false otherwise. + */ + public boolean awaitTermination(int i, TimeUnit timeUnit) throws InterruptedException { final long start = System.currentTimeMillis(); final long deadline = start + timeUnit.toMillis(i); @@ -86,17 +89,23 @@ public class DumbThreadPool { continue; long timeRemaining = deadline - System.currentTimeMillis(); - if (timeRemaining <= 0) return false; - try { - thread.join(timeRemaining); - } - catch (InterruptedException ex) { - logger.warn("Interrupted while waiting for thread pool to terminate", ex); + thread.join(timeRemaining); + if (thread.isAlive()) return false; - } + } + + // Doublecheck the bookkeeping so we didn't mess up. 
This may mean you have to Ctrl+C the process + // if you see this warning forever, but for the crawler this is preferable to terminating early + // and missing tasks. (maybe some cosmic ray or OOM condition or X-Files baddie of the week killed a + // thread so hard and it didn't invoke finally and didn't decrement the task count) + + int activeCount = getActiveCount(); + if (activeCount != 0) { + logger.warn("Thread pool terminated with {} active threads(?!) -- check what's going on with jstack and kill manually", activeCount); + return false; } return true; From 2a6183f9e000ac4397357e14df183b9012c69508 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 29 Jul 2023 19:20:09 +0200 Subject: [PATCH 101/157] (crawler) Dynamic throttling of the number of active crawl jobs permitted to spawn; reduce queue size. --- .../nu/marginalia/crawl/CrawlLimiter.java | 20 +++++-------------- .../java/nu/marginalia/crawl/CrawlerMain.java | 9 ++++++++- 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java index 200a41a7..dd2122be 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java @@ -12,8 +12,8 @@ public class CrawlLimiter { public static final int maxPoolSize = Integer.getInteger("crawler.pool-size", 256); // Thresholds for throttling task-spawning. 
Note there's a bit of hysteresis to this - private static final long THROTTLE_TRIGGER_FREE_RAM = 2 * 1024 * 1024 * 1024L; - private static final long THROTTLE_RELEASE_FREE_RAM = 4 * 1024 * 1024 * 1024L; + private final long THROTTLE_TRIGGER_FREE_RAM = Runtime.getRuntime().maxMemory() / 4; + private final long THROTTLE_RELEASE_FREE_RAM = Runtime.getRuntime().maxMemory() / 2; private final Semaphore taskSemCount = new Semaphore(maxPoolSize); @@ -68,23 +68,13 @@ public class CrawlLimiter { } } - private void waitForEnoughRAM() throws InterruptedException { - while (!throttle.get()) { + @SneakyThrows + public void waitForEnoughRAM() { + while (throttle.get()) { synchronized (throttle) { throttle.wait(30000); } } } - public void acquire() throws InterruptedException { - taskSemCount.acquire(1); - - if (taskSemCount.availablePermits() < maxPoolSize / 2) { - waitForEnoughRAM(); - } - } - - public void release() { - taskSemCount.release(1); - } } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java index e062f8a9..dccbb69d 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java @@ -61,6 +61,7 @@ public class CrawlerMain implements AutoCloseable { volatile int totalTasks; final AtomicInteger tasksDone = new AtomicInteger(0); + private final CrawlLimiter limiter = new CrawlLimiter(); @Inject public CrawlerMain(UserAgent userAgent, @@ -75,7 +76,7 @@ public class CrawlerMain implements AutoCloseable { this.gson = gson; // maybe need to set -Xss for JVM to deal with this? - pool = new DumbThreadPool(CrawlLimiter.maxPoolSize, 8); + pool = new DumbThreadPool(CrawlLimiter.maxPoolSize, 1); } public static void main(String... 
args) throws Exception { @@ -136,6 +137,8 @@ public class CrawlerMain implements AutoCloseable { startCrawlTask(plan, spec); } + logger.info("Shutting down the pool, waiting for tasks to complete..."); + pool.shutDown(); do { System.out.println("Waiting for pool to terminate... " + pool.getActiveCount() + " remaining"); @@ -167,6 +170,8 @@ public class CrawlerMain implements AutoCloseable { try { pool.submit(() -> { + limiter.waitForEnoughRAM(); + try { Thread.currentThread().setName("crawling:" + crawlingSpecification.domain); fetchDomain(crawlingSpecification); @@ -200,6 +205,8 @@ public class CrawlerMain implements AutoCloseable { logger.info("Fetched {}", specification.domain); } catch (Exception e) { logger.error("Error fetching domain", e); + } finally { + logger.info("Done with {}", specification.domain); } } From aba134284f50cb4f0ee7053ac4f50749336b0636 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 29 Jul 2023 19:22:58 +0200 Subject: [PATCH 102/157] (crawler) Reduce log spam --- .../src/main/java/nu/marginalia/crawl/CrawlerMain.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java index dccbb69d..438bee66 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java @@ -204,9 +204,7 @@ public class CrawlerMain implements AutoCloseable { logger.info("Fetched {}", specification.domain); } catch (Exception e) { - logger.error("Error fetching domain", e); - } finally { - logger.info("Done with {}", specification.domain); + logger.error("Error fetching domain " + specification.domain, e); } } From 730e8f74e4755bd2f0795299461a19806682cd3e Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 30 Jul 2023 14:19:55 +0200 Subject: [PATCH 103/157] 
(crawler) Even more memory optimizations. * Fix minor resource leak in zstd streams * Use pools for zstd streams * Reduce the SSL session cache size --- .../crawling/io/CrawledDomainReader.java | 15 +++++++++++---- .../crawling/io/CrawledDomainWriter.java | 4 +++- .../io/SerializableCrawlDataStream.java | 6 +++++- .../model/spec/CrawlerSpecificationLoader.java | 4 +++- .../marginalia/converting/ConversionLog.java | 4 ++-- .../java/nu/marginalia/crawl/CrawlerMain.java | 18 ++++++++++++------ .../crawl/retreival/CrawlDataReference.java | 6 +++++- .../crawl/retreival/fetcher/NoSecuritySSL.java | 12 +++++++++++- .../loading/ConvertedDomainReader.java | 5 +++-- 9 files changed, 55 insertions(+), 19 deletions(-) diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java index 76e37acc..15f54d23 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java @@ -1,5 +1,6 @@ package nu.marginalia.crawling.io; +import com.github.luben.zstd.RecyclingBufferPool; import com.github.luben.zstd.ZstdInputStream; import com.google.gson.Gson; import nu.marginalia.crawling.model.CrawledDocument; @@ -37,7 +38,7 @@ public class CrawledDomainReader { public CrawledDomain read(Path path) throws IOException { DomainDataAssembler domainData = new DomainDataAssembler(); - try (var br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(path.toFile()))))) { + try (var br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(path.toFile()), RecyclingBufferPool.INSTANCE)))) { String line; while ((line = br.readLine()) != null) { if (line.startsWith("//")) { @@ -105,7 +106,7 @@ public class CrawledDomainReader { public 
FileReadingSerializableCrawlDataStream(Gson gson, File file) throws IOException { this.gson = gson; - bufferedReader = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(file)))); + bufferedReader = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(file), RecyclingBufferPool.INSTANCE))); } @Override @@ -124,9 +125,15 @@ public class CrawledDomainReader { return true; String identifier = bufferedReader.readLine(); - if (identifier == null) return false; + if (identifier == null) { + bufferedReader.close(); + return false; + } String data = bufferedReader.readLine(); - if (data == null) return false; + if (data == null) { + bufferedReader.close(); + return false; + } if (identifier.equals(CrawledDomain.SERIAL_IDENTIFIER)) { next = gson.fromJson(data, CrawledDomain.class); diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainWriter.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainWriter.java index f431538c..bc83c10b 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainWriter.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainWriter.java @@ -1,5 +1,6 @@ package nu.marginalia.crawling.io; +import com.github.luben.zstd.RecyclingBufferPool; import com.github.luben.zstd.ZstdOutputStream; import com.google.gson.Gson; import lombok.SneakyThrows; @@ -38,7 +39,8 @@ public class CrawledDomainWriter implements AutoCloseable { tmpFile = getOutputFile(spec.id, spec.domain + "_tmp"); actualFile = getOutputFile(spec.id, spec.domain); writer = new OutputStreamWriter(new ZstdOutputStream(new BufferedOutputStream(Files.newOutputStream(tmpFile, - StandardOpenOption.CREATE, StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING)))); + StandardOpenOption.CREATE, StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING)), + 
RecyclingBufferPool.INSTANCE)); } public Path getOutputFile() { diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/SerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/SerializableCrawlDataStream.java index e68526b1..7d4fb28c 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/SerializableCrawlDataStream.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/SerializableCrawlDataStream.java @@ -8,7 +8,7 @@ import java.util.Iterator; /** Closable iterator over serialized crawl data * The data may appear in any order, and the iterator must be closed. * */ -public interface SerializableCrawlDataStream { +public interface SerializableCrawlDataStream extends AutoCloseable { static SerializableCrawlDataStream empty() { return new SerializableCrawlDataStream() { @Override @@ -20,6 +20,8 @@ public interface SerializableCrawlDataStream { public boolean hasNext() throws IOException { return false; } + + public void close() {} }; } @@ -35,6 +37,8 @@ public interface SerializableCrawlDataStream { public boolean hasNext() throws IOException { return iterator.hasNext(); } + + public void close() {} }; } diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/spec/CrawlerSpecificationLoader.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/spec/CrawlerSpecificationLoader.java index 2ea956d5..d5d4e482 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/spec/CrawlerSpecificationLoader.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/spec/CrawlerSpecificationLoader.java @@ -1,5 +1,6 @@ package nu.marginalia.crawling.model.spec; +import com.github.luben.zstd.RecyclingBufferPool; import com.github.luben.zstd.ZstdInputStream; import com.google.gson.Gson; import 
com.google.gson.JsonStreamParser; @@ -17,7 +18,8 @@ public class CrawlerSpecificationLoader { @SneakyThrows public static Iterable asIterable(Path inputSpec) { - var inputStream = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(inputSpec.toFile())))); + var inputStream = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(inputSpec.toFile()), + RecyclingBufferPool.INSTANCE))); var parser = new JsonStreamParser(inputStream); return () -> new Iterator<>() { diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConversionLog.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConversionLog.java index 58aa8b04..2c2ffb95 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConversionLog.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConversionLog.java @@ -1,5 +1,6 @@ package nu.marginalia.converting; +import com.github.luben.zstd.RecyclingBufferPool; import com.github.luben.zstd.ZstdOutputStream; import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.model.idx.DocumentMetadata; @@ -27,8 +28,7 @@ public class ConversionLog implements AutoCloseable, Interpreter { String fileName = String.format("conversion-log-%s.zstd", LocalDateTime.now().toEpochSecond(ZoneOffset.UTC)); Path logFile = rootDir.resolve(fileName); - writer = new PrintWriter(new ZstdOutputStream( - new BufferedOutputStream(Files.newOutputStream(logFile, StandardOpenOption.WRITE, StandardOpenOption.CREATE)))); + writer = new PrintWriter(new ZstdOutputStream(new BufferedOutputStream(Files.newOutputStream(logFile, StandardOpenOption.WRITE, StandardOpenOption.CREATE)), RecyclingBufferPool.INSTANCE)); } @Override diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java index 
438bee66..835f3b0e 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java @@ -55,7 +55,7 @@ public class CrawlerMain implements AutoCloseable { private final Gson gson; private final DumbThreadPool pool; - private final Set processedIds = new HashSet<>(); + private final Set processingIds = new HashSet<>(); final AbortMonitor abortMonitor = AbortMonitor.getInstance(); @@ -92,6 +92,9 @@ public class CrawlerMain implements AutoCloseable { System.setProperty("sun.net.client.defaultConnectTimeout", "30000"); System.setProperty("sun.net.client.defaultReadTimeout", "30000"); + // We don't want to use too much memory caching sessions for https + System.setProperty("javax.net.ssl.sessionCacheSize", "2048"); + Injector injector = Guice.createInjector( new CrawlerModule(), new DatabaseModule() @@ -154,7 +157,7 @@ public class CrawlerMain implements AutoCloseable { private void startCrawlTask(CrawlPlan plan, CrawlingSpecification crawlingSpecification) { - if (!processedIds.add(crawlingSpecification.id)) { + if (workLog.isJobFinished(crawlingSpecification.id) || !processingIds.add(crawlingSpecification.id)) { // This is a duplicate id, so we ignore it. 
Otherwise we'd end crawling the same site twice, // and if we're really unlucky, we might end up writing to the same output file from multiple @@ -193,11 +196,10 @@ public class CrawlerMain implements AutoCloseable { HttpFetcher fetcher = new HttpFetcherImpl(userAgent.uaString(), dispatcher, connectionPool); - try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification)) { + try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification); + CrawlDataReference reference = getReference(specification)) + { var retreiver = new CrawlerRetreiver(fetcher, specification, writer::accept); - - CrawlDataReference reference = getReference(specification); - int size = retreiver.fetch(reference); workLog.setJobToFinished(specification.id, writer.getOutputFile().toString(), size); @@ -206,6 +208,10 @@ public class CrawlerMain implements AutoCloseable { } catch (Exception e) { logger.error("Error fetching domain " + specification.domain, e); } + finally { + // We don't need to double-count these; it's also kept int he workLog + processingIds.remove(specification.id); + } } private CrawlDataReference getReference(CrawlingSpecification specification) { diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java index 13d17dfc..985bfc39 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java @@ -10,7 +10,7 @@ import javax.annotation.Nullable; import java.io.IOException; /** A reference to a domain that has been crawled before. 
*/ -public class CrawlDataReference { +public class CrawlDataReference implements AutoCloseable { private final SerializableCrawlDataStream data; @@ -75,4 +75,8 @@ public class CrawlDataReference { return hashFunction.hashInt(v).asInt(); } + @Override + public void close() throws Exception { + data.close(); + } } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/NoSecuritySSL.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/NoSecuritySSL.java index a52251bc..06f106fc 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/NoSecuritySSL.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/NoSecuritySSL.java @@ -33,7 +33,17 @@ public class NoSecuritySSL { // Install the all-trusting trust manager final SSLContext sslContext = SSLContext.getInstance("SSL"); sslContext.init(null, trustAllCerts, new java.security.SecureRandom()); - // Create an ssl socket factory with our all-trusting manager + + var clientSessionContext = sslContext.getClientSessionContext(); + + System.out.println("Default session cache size: " + clientSessionContext.getSessionCacheSize()); + System.out.println("Session timeout: " + clientSessionContext.getSessionTimeout()); + + // The default value for this is very high and will use a crapload of memory + // since the crawler will be making a lot of requests to various hosts + clientSessionContext.setSessionCacheSize(2048); + + // Create a ssl socket factory with our all-trusting manager return sslContext.getSocketFactory(); } diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/ConvertedDomainReader.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/ConvertedDomainReader.java index 1c06510e..91875169 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/ConvertedDomainReader.java +++ 
b/code/processes/loading-process/src/main/java/nu/marginalia/loading/ConvertedDomainReader.java @@ -1,5 +1,6 @@ package nu.marginalia.loading; +import com.github.luben.zstd.RecyclingBufferPool; import com.github.luben.zstd.ZstdInputStream; import com.google.gson.Gson; import lombok.SneakyThrows; @@ -26,7 +27,7 @@ public class ConvertedDomainReader { public List read(Path path, int cntHint) throws IOException { List ret = new ArrayList<>(cntHint); - try (var or = new ObjectInputStream(new ZstdInputStream(new FileInputStream(path.toFile())))) { + try (var or = new ObjectInputStream(new ZstdInputStream(new FileInputStream(path.toFile()), RecyclingBufferPool.INSTANCE))) { var object = or.readObject(); if (object instanceof Instruction is) { ret.add(is); @@ -39,7 +40,7 @@ public class ConvertedDomainReader { } public Iterator createIterator(Path path) throws IOException { - var or = new ObjectInputStream(new ZstdInputStream(new BufferedInputStream(new FileInputStream(path.toFile())))); + var or = new ObjectInputStream(new ZstdInputStream(new BufferedInputStream(new FileInputStream(path.toFile())), RecyclingBufferPool.INSTANCE)); return new Iterator<>() { Instruction next; From caf3d231a85aaccf07d42937806ac21d5219c6c5 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 30 Jul 2023 16:53:13 +0200 Subject: [PATCH 104/157] (crawler) Fix rare issue with NPEs if the crawl queue is empty --- .../nu/marginalia/crawl/retreival/DomainCrawlFrontier.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java index 4b1b9ad1..30902a8e 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java @@ -77,6 +77,9 @@ public 
class DomainCrawlFrontier { public EdgeUrl peek() { try { + if (queue.peek() == null) { + return null; + } return new EdgeUrl(queue.peek()); } catch (URISyntaxException e) { // This should never happen since we only add urls via EdgeUrl.toString() From 5c071ce4d3ed1c27a403faca0510493e8f57bfe0 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 30 Jul 2023 16:53:39 +0200 Subject: [PATCH 105/157] (crawler) Clean up the code and remove unnecessary logging --- .../nu/marginalia/process/log/WorkLog.java | 15 +- .../crawling/io/CrawledDomainReader.java | 5 +- .../io/SerializableCrawlDataStream.java | 33 ++-- .../java/nu/marginalia/crawl/CrawlerMain.java | 147 ++++++++---------- .../nu/marginalia/crawl/DumbThreadPool.java | 11 +- .../retreival/fetcher/NoSecuritySSL.java | 3 - 6 files changed, 102 insertions(+), 112 deletions(-) diff --git a/code/common/process/src/main/java/nu/marginalia/process/log/WorkLog.java b/code/common/process/src/main/java/nu/marginalia/process/log/WorkLog.java index b74ab5b4..9be31d17 100644 --- a/code/common/process/src/main/java/nu/marginalia/process/log/WorkLog.java +++ b/code/common/process/src/main/java/nu/marginalia/process/log/WorkLog.java @@ -3,6 +3,7 @@ package nu.marginalia.process.log; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.Closeable; import java.io.FileOutputStream; import java.io.IOException; import java.nio.charset.StandardCharsets; @@ -11,7 +12,6 @@ import java.nio.file.Path; import java.time.LocalDateTime; import java.util.*; import java.util.function.Function; -import java.util.regex.Pattern; /** WorkLog is a journal of work done by a process, * so that it can be resumed after a crash or termination. @@ -25,7 +25,7 @@ import java.util.regex.Pattern; *

    * */ -public class WorkLog implements AutoCloseable { +public class WorkLog implements AutoCloseable, Closeable { private final Set finishedJobs = new HashSet<>(); private final FileOutputStream logWriter; private final Logger logger = LoggerFactory.getLogger(getClass()); @@ -89,9 +89,14 @@ public class WorkLog implements AutoCloseable { } @Override - public void close() throws Exception { - logWriter.flush(); - logWriter.close(); + public void close() { + try { + logWriter.flush(); + logWriter.close(); + } + catch (IOException e) { + logger.error("Error closing work log", e); + } } public int countFinishedJobs() { diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java index 15f54d23..82a8823b 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java @@ -27,14 +27,17 @@ public class CrawledDomainReader { public CrawledDomainReader() { } + /** An iterator-like access to domain data This must be closed otherwise it will leak off-heap memory! */ public SerializableCrawlDataStream createDataStream(Path fullPath) throws IOException { return new FileReadingSerializableCrawlDataStream(gson, fullPath.toFile()); } + /** An iterator-like access to domain data. This must be closed otherwise it will leak off-heap memory! */ public SerializableCrawlDataStream createDataStream(Path basePath, CrawlingSpecification spec) throws IOException { return createDataStream(CrawlerOutputFile.getOutputFile(basePath, spec.id, spec.domain)); } - + + /** Read the entirety of the domain data into memory. 
This uses a lot of RAM */ public CrawledDomain read(Path path) throws IOException { DomainDataAssembler domainData = new DomainDataAssembler(); diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/SerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/SerializableCrawlDataStream.java index 7d4fb28c..3aecc0fc 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/SerializableCrawlDataStream.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/SerializableCrawlDataStream.java @@ -7,20 +7,24 @@ import java.util.Iterator; /** Closable iterator over serialized crawl data * The data may appear in any order, and the iterator must be closed. + * + * @see CrawledDomainReader * */ public interface SerializableCrawlDataStream extends AutoCloseable { + + + SerializableCrawlData next() throws IOException; + + boolean hasNext() throws IOException; + + + // Dummy iterator over nothing static SerializableCrawlDataStream empty() { return new SerializableCrawlDataStream() { @Override - public SerializableCrawlData next() throws IOException { - throw new IllegalStateException("No more data"); - } - + public SerializableCrawlData next() throws IOException { throw new IllegalStateException("No more data"); } @Override - public boolean hasNext() throws IOException { - return false; - } - + public boolean hasNext() throws IOException { return false;} public void close() {} }; } @@ -29,21 +33,12 @@ public interface SerializableCrawlDataStream extends AutoCloseable { static SerializableCrawlDataStream fromIterator(Iterator iterator) { return new SerializableCrawlDataStream() { @Override - public SerializableCrawlData next() throws IOException { - return iterator.next(); - } - + public SerializableCrawlData next() { return iterator.next(); } @Override - public boolean hasNext() throws IOException { - return iterator.hasNext(); - } - + 
public boolean hasNext() { return iterator.hasNext(); } public void close() {} }; } - SerializableCrawlData next() throws IOException; - - boolean hasNext() throws IOException; } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java index 835f3b0e..758c6d39 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java @@ -37,11 +37,10 @@ import java.util.concurrent.atomic.AtomicInteger; import static nu.marginalia.mqapi.ProcessInboxNames.CRAWLER_INBOX; -public class CrawlerMain implements AutoCloseable { +public class CrawlerMain { private final Logger logger = LoggerFactory.getLogger(getClass()); private Path crawlDataDir; - private WorkLog workLog; private final ProcessHeartbeat heartbeat; private final ConnectionPool connectionPool = new ConnectionPool(5, 10, TimeUnit.SECONDS); @@ -56,6 +55,7 @@ public class CrawlerMain implements AutoCloseable { private final DumbThreadPool pool; private final Set processingIds = new HashSet<>(); + private final CrawledDomainReader reader = new CrawledDomainReader(); final AbortMonitor abortMonitor = AbortMonitor.getInstance(); @@ -120,12 +120,10 @@ public class CrawlerMain implements AutoCloseable { public void run(CrawlPlan plan) throws InterruptedException, IOException { heartbeat.start(); - try { + try (WorkLog workLog = plan.createCrawlWorkLog()) { // First a validation run to ensure the file is all good to parse logger.info("Validating JSON"); - - workLog = plan.createCrawlWorkLog(); crawlDataDir = plan.crawl.getDir(); int countTotal = 0; @@ -136,8 +134,26 @@ public class CrawlerMain implements AutoCloseable { logger.info("Let's go"); - for (var spec : plan.crawlingSpecificationIterable()) { - startCrawlTask(plan, spec); + for (var crawlingSpecification : 
plan.crawlingSpecificationIterable()) { + + if (!abortMonitor.isAlive()) + break; + + // Check #1: Have we already crawled this site? Check is necessary for resuming a craw after a crash or something + if (workLog.isJobFinished(crawlingSpecification.id)) { + continue; + } + + // Check #2: Have we already started this crawl (but not finished it)? + // This shouldn't realistically happen, but if it does, we need to ignore it, otherwise + // we'd end crawling the same site twice and might end up writing to the same output + // file from multiple threads with complete bit salad as a result. + if (!processingIds.add(crawlingSpecification.id)) { + logger.error("Ignoring duplicate id: {}", crawlingSpecification.id); + continue; + } + + pool.submit(new CrawlTask(crawlingSpecification, workLog)); } logger.info("Shutting down the pool, waiting for tasks to complete..."); @@ -152,77 +168,59 @@ public class CrawlerMain implements AutoCloseable { } } - CrawledDomainReader reader = new CrawledDomainReader(); + class CrawlTask implements DumbThreadPool.Task { + private final CrawlingSpecification specification; + private final WorkLog workLog; - private void startCrawlTask(CrawlPlan plan, CrawlingSpecification crawlingSpecification) { - - if (workLog.isJobFinished(crawlingSpecification.id) || !processingIds.add(crawlingSpecification.id)) { - - // This is a duplicate id, so we ignore it. Otherwise we'd end crawling the same site twice, - // and if we're really unlucky, we might end up writing to the same output file from multiple - // threads with complete bit salad as a result. 
- - logger.error("Ignoring duplicate id: {}", crawlingSpecification.id); - return; + CrawlTask(CrawlingSpecification specification, WorkLog workLog) { + this.specification = specification; + this.workLog = workLog; } - if (!abortMonitor.isAlive()) { - return; + @Override + public void run() throws Exception { + + limiter.waitForEnoughRAM(); + + HttpFetcher fetcher = new HttpFetcherImpl(userAgent.uaString(), dispatcher, connectionPool); + + try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification); + CrawlDataReference reference = getReference(specification)) + { + Thread.currentThread().setName("crawling:" + specification.domain); + + var retreiver = new CrawlerRetreiver(fetcher, specification, writer::accept); + int size = retreiver.fetch(reference); + + workLog.setJobToFinished(specification.id, writer.getOutputFile().toString(), size); + heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks); + + logger.info("Fetched {}", specification.domain); + + } catch (Exception e) { + logger.error("Error fetching domain " + specification.domain, e); + } + finally { + // We don't need to double-count these; it's also kept int he workLog + processingIds.remove(specification.id); + Thread.currentThread().setName("[idle]"); + } } - try { - pool.submit(() -> { - limiter.waitForEnoughRAM(); + private CrawlDataReference getReference(CrawlingSpecification specification) { + try { + var dataStream = reader.createDataStream(crawlDataDir, specification); + return new CrawlDataReference(dataStream); + } catch (IOException e) { + logger.warn("Failed to read previous crawl data for {}", specification.domain); + return new CrawlDataReference(); + } + } - try { - Thread.currentThread().setName("crawling:" + crawlingSpecification.domain); - fetchDomain(crawlingSpecification); - heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks); - } finally { - Thread.currentThread().setName("[idle]"); - } - }); - } - catch 
(InterruptedException ex) { - throw new RuntimeException(ex); - } } - private void fetchDomain(CrawlingSpecification specification) { - if (workLog.isJobFinished(specification.id)) - return; - - HttpFetcher fetcher = new HttpFetcherImpl(userAgent.uaString(), dispatcher, connectionPool); - - try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification); - CrawlDataReference reference = getReference(specification)) - { - var retreiver = new CrawlerRetreiver(fetcher, specification, writer::accept); - int size = retreiver.fetch(reference); - - workLog.setJobToFinished(specification.id, writer.getOutputFile().toString(), size); - - logger.info("Fetched {}", specification.domain); - } catch (Exception e) { - logger.error("Error fetching domain " + specification.domain, e); - } - finally { - // We don't need to double-count these; it's also kept int he workLog - processingIds.remove(specification.id); - } - } - - private CrawlDataReference getReference(CrawlingSpecification specification) { - try { - var dataStream = reader.createDataStream(crawlDataDir, specification); - return new CrawlDataReference(dataStream); - } catch (IOException e) { - logger.warn("Failed to read previous crawl data for {}", specification.domain); - return new CrawlDataReference(); - } - } private static class CrawlRequest { private final CrawlPlan plan; @@ -252,6 +250,7 @@ public class CrawlerMain implements AutoCloseable { var inbox = messageQueueFactory.createSingleShotInbox(CRAWLER_INBOX, UUID.randomUUID()); + logger.info("Waiting for instructions"); var msgOpt = getMessage(inbox, nu.marginalia.mqapi.crawling.CrawlRequest.class.getSimpleName()); var msg = msgOpt.orElseThrow(() -> new RuntimeException("No message received")); @@ -282,16 +281,4 @@ public class CrawlerMain implements AutoCloseable { } } - - public void close() throws Exception { - logger.info("Awaiting termination"); - pool.shutDown(); - - while (!pool.awaitTermination(1, TimeUnit.SECONDS)); - 
logger.info("All finished"); - - workLog.close(); - dispatcher.executorService().shutdownNow(); - } - } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/DumbThreadPool.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/DumbThreadPool.java index 676e3286..f82c460e 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/DumbThreadPool.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/DumbThreadPool.java @@ -17,7 +17,7 @@ import java.util.concurrent.atomic.AtomicInteger; */ public class DumbThreadPool { private final List workers = new ArrayList<>(); - private final LinkedBlockingQueue tasks; + private final LinkedBlockingQueue tasks; private volatile boolean shutDown = false; private final AtomicInteger taskCount = new AtomicInteger(0); private final Logger logger = LoggerFactory.getLogger(DumbThreadPool.class); @@ -34,8 +34,8 @@ public class DumbThreadPool { } - public void submit(Runnable runnable) throws InterruptedException { - tasks.put(runnable); + public void submit(Task task) throws InterruptedException { + tasks.put(task); } public void shutDown() { @@ -52,7 +52,7 @@ public class DumbThreadPool { private void worker() { while (!shutDown) { try { - Runnable task = tasks.poll(1, TimeUnit.SECONDS); + Task task = tasks.poll(1, TimeUnit.SECONDS); if (task == null) { continue; } @@ -115,4 +115,7 @@ public class DumbThreadPool { return taskCount.get(); } + public interface Task { + void run() throws Exception; + } } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/NoSecuritySSL.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/NoSecuritySSL.java index 06f106fc..f86d2c48 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/NoSecuritySSL.java +++ 
b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/NoSecuritySSL.java @@ -36,9 +36,6 @@ public class NoSecuritySSL { var clientSessionContext = sslContext.getClientSessionContext(); - System.out.println("Default session cache size: " + clientSessionContext.getSessionCacheSize()); - System.out.println("Session timeout: " + clientSessionContext.getSessionTimeout()); - // The default value for this is very high and will use a crapload of memory // since the crawler will be making a lot of requests to various hosts clientSessionContext.setSessionCacheSize(2048); From 6ff7e9648f4649a6a2ae3ebfec2398e79e6ae992 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 30 Jul 2023 16:54:02 +0200 Subject: [PATCH 106/157] (crawler) Use and pass the proper environment variables to the processes. --- .../main/java/nu/marginalia/control/svc/ProcessService.java | 6 +++--- run/env/service.env | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java index 9496c2a1..7e322913 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java @@ -136,9 +136,9 @@ public class ProcessService { opts.put("WMSA_HOME", WMSA_HOME); opts.put("JAVA_HOME", System.getenv("JAVA_HOME")); opts.put("JAVA_OPTS", ""); - opts.put("CONVERTER_OPTS", System.getenv("CONVERTER_OPTS")); - opts.put("LOADER_OPTS", System.getenv("LOADER_OPTS")); - opts.put("CRAWLER_OPTS", System.getenv("CRAWLER_OPTS")); + opts.put("CONVERTER_PROCESS_OPTS", System.getenv("CONVERTER_PROCESS_OPTS")); + opts.put("LOADER_PROCESS_OPTS", System.getenv("LOADER_PROCESS_OPTS")); + opts.put("CRAWLER_PROCESS_OPTS", 
System.getenv("CRAWLER_PROCESS_OPTS")); return opts.entrySet().stream().map(e -> e.getKey() + "=" + e.getValue()).toArray(String[]::new); } diff --git a/run/env/service.env b/run/env/service.env index ac745577..9f5ad323 100644 --- a/run/env/service.env +++ b/run/env/service.env @@ -1,4 +1,4 @@ WMSA_HOME=run/ CONTROL_SERVICE_OPTS="-DdistPath=/dist" -CONVERTER_OPTS="-ea -Xmx16G -XX:-CompactStrings -XX:+UseParallelGC -XX:GCTimeRatio=14 -XX:ParallelGCThreads=15" -CRAWLER_OPTS="-Xmx16G -XX:+UseParallelGC -XX:GCTimeRatio=14 -XX:ParallelGCThreads=15" \ No newline at end of file +CONVERTER_PROCESS_OPTS="-ea -Xmx16G -XX:-CompactStrings -XX:+UseParallelGC -XX:GCTimeRatio=14 -XX:ParallelGCThreads=15" +CRAWLER_PROCESS_OPTS="-Xmx16G -XX:+UseParallelGC -XX:GCTimeRatio=14 -XX:ParallelGCThreads=15" \ No newline at end of file From 5411950b872e6a64cc98ed32217d9ec643b85fa4 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 31 Jul 2023 10:31:29 +0200 Subject: [PATCH 107/157] (minor) Tidy up EdgeDomain class a bit, no functional difference --- .../main/java/nu/marginalia/model/EdgeDomain.java | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/code/common/model/src/main/java/nu/marginalia/model/EdgeDomain.java b/code/common/model/src/main/java/nu/marginalia/model/EdgeDomain.java index 50d84e11..88dea9c7 100644 --- a/code/common/model/src/main/java/nu/marginalia/model/EdgeDomain.java +++ b/code/common/model/src/main/java/nu/marginalia/model/EdgeDomain.java @@ -162,22 +162,16 @@ public class EdgeDomain implements Serializable { public boolean equals(final Object o) { if (o == this) return true; - if (!(o instanceof EdgeDomain)) return false; - final EdgeDomain other = (EdgeDomain) o; - if (!other.canEqual((Object) this)) return false; + if (!(o instanceof EdgeDomain other)) return false; final String this$subDomain = this.getSubDomain(); final String other$subDomain = other.getSubDomain(); - if (!this$subDomain.equalsIgnoreCase(other$subDomain)) return 
false; + if (!Objects.equals(this$subDomain,other$subDomain)) return false; final String this$domain = this.getDomain(); final String other$domain = other.getDomain(); - if (!this$domain.equalsIgnoreCase(other$domain)) return false; + if (!Objects.equals(this$domain,other$domain)) return false; return true; } - protected boolean canEqual(final Object other) { - return other instanceof EdgeDomain; - } - public int hashCode() { final int PRIME = 59; int result = 1; From 6f4e767a04da6c15289de734c8e3ab48080dd8d8 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 31 Jul 2023 10:31:46 +0200 Subject: [PATCH 108/157] (minor) Re-enable monkey-patch-json for converter --- code/processes/converting-process/build.gradle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/processes/converting-process/build.gradle b/code/processes/converting-process/build.gradle index b8e199f3..a14ee596 100644 --- a/code/processes/converting-process/build.gradle +++ b/code/processes/converting-process/build.gradle @@ -20,7 +20,7 @@ tasks.distZip.enabled = false dependencies { - //implementation project(':third-party:monkey-patch-gson') + implementation project(':third-party:monkey-patch-gson') implementation project(':code:common:process') From 9786f822206d6fd6e13e6a7e4989c0c3f06c954b Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 31 Jul 2023 10:32:23 +0200 Subject: [PATCH 109/157] Fix environment variables to processes so jmc works --- docker-compose.yml | 1 + run/env/service.env | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 4aff54db..0c5c3fd1 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -81,6 +81,7 @@ services: - "127.0.0.1:5090:5090" - "127.0.0.1:4090:5000" - "127.0.0.1:7090:4000" + - "127.0.0.1:7099:4001" depends_on: - mariadb mariadb: diff --git a/run/env/service.env b/run/env/service.env index 9f5ad323..9e982a7f 100644 --- a/run/env/service.env +++ b/run/env/service.env 
@@ -1,4 +1,5 @@ WMSA_HOME=run/ CONTROL_SERVICE_OPTS="-DdistPath=/dist" -CONVERTER_PROCESS_OPTS="-ea -Xmx16G -XX:-CompactStrings -XX:+UseParallelGC -XX:GCTimeRatio=14 -XX:ParallelGCThreads=15" -CRAWLER_PROCESS_OPTS="-Xmx16G -XX:+UseParallelGC -XX:GCTimeRatio=14 -XX:ParallelGCThreads=15" \ No newline at end of file +CONVERTER_PROCESS_OPTS="-ea -Xmx16G -XX:-CompactStrings -XX:+UseParallelGC -XX:GCTimeRatio=14 -XX:ParallelGCThreads=15 -Dservice-host=0.0.0.0 -ea -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.port=4001 -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false" +CRAWLER_PROCESS_OPTS="-Xmx16G -XX:+UseParallelGC -XX:GCTimeRatio=14 -XX:ParallelGCThreads=15 -Dservice-host=0.0.0.0 -ea -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.port=4001 -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false" +LOADER_PROCESS_OPTS="-Xmx16G -XX:+UseParallelGC -XX:GCTimeRatio=14 -XX:ParallelGCThreads=15 -Dservice-host=0.0.0.0 -ea -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.port=4001 -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false" \ No newline at end of file From cd90ca820f513551931de96219102e281249ba00 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 31 Jul 2023 10:32:47 +0200 Subject: [PATCH 110/157] YAGNI filter over ConverterDomainTypes --- .../processor/ConverterDomainTypes.java | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/ConverterDomainTypes.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/ConverterDomainTypes.java index 95a1b5fd..a8f3db7a 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/ConverterDomainTypes.java +++ 
b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/ConverterDomainTypes.java @@ -9,8 +9,8 @@ import org.slf4j.LoggerFactory; import java.io.IOException; import java.sql.SQLException; -import java.util.HashMap; -import java.util.Map; +import java.util.HashSet; +import java.util.Set; /** Converter-side wrapper for of common:db's DomainTypes, * which is a list of domains of a known type (e.g. blog) @@ -18,11 +18,7 @@ import java.util.Map; @Singleton public class ConverterDomainTypes { private final Logger logger = LoggerFactory.getLogger(ConverterDomainTypes.class); - private final Map domainTypes = new HashMap<>(); - - private enum DomainType { - BLOG - } + private final Set blogs = new HashSet<>(10000, 0.5f); @Inject public ConverterDomainTypes(DomainTypes types) throws SQLException { @@ -40,14 +36,13 @@ public class ConverterDomainTypes { } for (var item : allBlogs) { - domainTypes.put(new EdgeDomain(item), DomainType.BLOG); + blogs.add(new EdgeDomain(item)); } - logger.info("Loaded {} domain types", domainTypes.size()); - + logger.info("Loaded {} domain types", blogs.size()); } public boolean isBlog(EdgeDomain domain) { - return domainTypes.get(domain) == DomainType.BLOG; + return blogs.contains(domain); } } From 1c948eb3d862e0dd39d1d4635a6db99eabb067a2 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 31 Jul 2023 10:33:15 +0200 Subject: [PATCH 111/157] (minor) Alter DumbThreadPool in Converter to not claim the threads are crawlers. 
--- .../src/main/java/nu/marginalia/converting/DumbThreadPool.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/DumbThreadPool.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/DumbThreadPool.java index 3175fec7..2ea1f071 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/DumbThreadPool.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/DumbThreadPool.java @@ -26,7 +26,7 @@ public class DumbThreadPool { tasks = new LinkedBlockingQueue<>(queueSize); for (int i = 0; i < poolSize; i++) { - Thread worker = new Thread(this::worker, "Crawler Thread " + i); + Thread worker = new Thread(this::worker, "Converter Thread " + i); worker.setDaemon(true); worker.start(); workers.add(worker); From 37c4cc68ed3897c273e919e3dffea83c94e7d4d2 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 31 Jul 2023 10:34:42 +0200 Subject: [PATCH 112/157] TODO --- .../src/main/java/nu/marginalia/converting/DumbThreadPool.java | 1 + .../src/main/java/nu/marginalia/crawl/DumbThreadPool.java | 1 + 2 files changed, 2 insertions(+) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/DumbThreadPool.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/DumbThreadPool.java index 2ea1f071..95cbf14a 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/DumbThreadPool.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/DumbThreadPool.java @@ -15,6 +15,7 @@ import java.util.concurrent.atomic.AtomicInteger; * becomes available to run the task. This is useful for coarse grained * tasks where the calling thread might otherwise block for hours. 
*/ +// TODO: This class exists in crawler as well, should probably be broken out into a common library; use the one from crawler instead public class DumbThreadPool { private final List workers = new ArrayList<>(); private final LinkedBlockingQueue tasks; diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/DumbThreadPool.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/DumbThreadPool.java index f82c460e..076eb9e5 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/DumbThreadPool.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/DumbThreadPool.java @@ -15,6 +15,7 @@ import java.util.concurrent.atomic.AtomicInteger; * becomes available to run the task. This is useful for coarse grained * tasks where the calling thread might otherwise block for hours. */ +// TODO: This class exists in converter as well, should probably be broken out into a common library; use this version public class DumbThreadPool { private final List workers = new ArrayList<>(); private final LinkedBlockingQueue tasks; From 12bd74d4f344b8e70e1d8f4939e8401f2d1aeefe Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 31 Jul 2023 10:56:16 +0200 Subject: [PATCH 113/157] Clean up ProcessService --- .../control/svc/ProcessService.java | 62 +++++++++++-------- 1 file changed, 37 insertions(+), 25 deletions(-) diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java index 7e322913..e38c6d97 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java @@ -14,9 +14,7 @@ import java.io.BufferedReader; import java.io.InputStreamReader; import java.nio.file.Files; import java.nio.file.Path; 
-import java.util.Arrays; -import java.util.HashMap; -import java.util.Map; +import java.util.*; import java.util.concurrent.ConcurrentHashMap; @Singleton @@ -56,14 +54,9 @@ public class ProcessService { } public boolean trigger(ProcessId processId, String... parameters) throws Exception { - String processPath = processPath(processId); - String[] args = new String[parameters.length + 1]; - - args[0] = processPath; - for (int i = 0; i < parameters.length; i++) - args[i+1] = parameters[i]; - - String[] env = env(); + final String processPath = distPath.resolve(processId.path).toString(); + final String[] env = createEnvironmentVariables(); + final String[] args = createCommandArguments(processPath, parameters); Process process; @@ -72,8 +65,8 @@ public class ProcessService { return false; } + logger.info("Starting process: {}: {} // {}", processId, Arrays.toString(args), Arrays.toString(env)); - logger.info("Starting process: {}", processId + ": " + Arrays.toString(args) + " // " + Arrays.toString(env)); synchronized (processes) { if (processes.containsKey(processId)) return false; process = Runtime.getRuntime().exec(args, env); @@ -104,8 +97,13 @@ public class ProcessService { eventLog.logEvent("PROCESS-EXIT", processId.toString()); processes.remove(processId); } + } - + private String[] createCommandArguments(String processPath, String[] parameters) { + final String[] args = new String[parameters.length + 1]; + args[0] = processPath; + System.arraycopy(parameters, 0, args, 1, parameters.length); + return args; } public boolean isRunning(ProcessId processId) { @@ -122,24 +120,38 @@ public class ProcessService { return true; } - private String processPath(ProcessId id) { - return distPath.resolve(id.path).toString(); - } + /** These environment variables are propagated from the parent process to the child process, + * along with WMSA_HOME, but it has special logic */ + private final List propagatedEnvironmentVariables = List.of( + "JAVA_HOME", + 
"CONVERTER_PROCESS_OPTS", + "LOADER_PROCESS_OPTS", + "CRAWLER_PROCESS_OPTS"); - private String[] env() { + private String[] createEnvironmentVariables() { + List opts = new ArrayList<>(); - Map opts = new HashMap<>(); String WMSA_HOME = System.getenv("WMSA_HOME"); + if (WMSA_HOME == null || WMSA_HOME.isBlank()) { WMSA_HOME = "/var/lib/wmsa"; } - opts.put("WMSA_HOME", WMSA_HOME); - opts.put("JAVA_HOME", System.getenv("JAVA_HOME")); - opts.put("JAVA_OPTS", ""); - opts.put("CONVERTER_PROCESS_OPTS", System.getenv("CONVERTER_PROCESS_OPTS")); - opts.put("LOADER_PROCESS_OPTS", System.getenv("LOADER_PROCESS_OPTS")); - opts.put("CRAWLER_PROCESS_OPTS", System.getenv("CRAWLER_PROCESS_OPTS")); - return opts.entrySet().stream().map(e -> e.getKey() + "=" + e.getValue()).toArray(String[]::new); + opts.add(env2str("WMSA_HOME", WMSA_HOME)); + opts.add(env2str("JAVA_OPTS", "")); // We explicitly empty this to avoid inheriting the parent process' JAVA_OPTS + + for (String envKey : propagatedEnvironmentVariables) { + String envValue = System.getenv(envKey); + if (envValue != null && !envValue.isBlank()) { + opts.add(env2str(envKey, envValue)); + } + } + + return opts.toArray(String[]::new); } + + private String env2str(String key, String val) { + return key + "=" + val; + } + } From 6b5fb0f841932afd18b1ee9b29a1bbbf3114f86d Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 31 Jul 2023 14:18:52 +0200 Subject: [PATCH 114/157] (control) Disable the start button for actors that aren't directly initializable. (control) Disable the start button for actors that aren't directly initializable. 
--- .../java/nu/marginalia/mqsm/StateMachine.java | 7 +++++++ .../marginalia/mqsm/graph/AbstractStateGraph.java | 15 +++++++++++++++ .../marginalia/control/actor/ControlActors.java | 4 ++++ .../marginalia/control/model/ActorRunState.java | 2 +- .../control/svc/ControlActorService.java | 3 ++- .../templates/control/partials/actors-table.hdb | 14 +++++++++++++- 6 files changed, 42 insertions(+), 3 deletions(-) diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java index 1ef80abb..a2567698 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java @@ -42,6 +42,7 @@ public class StateMachine { private final List> stateChangeListeners = new ArrayList<>(); private final Map allStates = new HashMap<>(); + private final boolean isDirectlyInitializable; public StateMachine(MessageQueueFactory messageQueueFactory, String queueName, @@ -57,6 +58,7 @@ public class StateMachine { registerStates(List.of(errorState, finalState, resumingState)); registerStates(stateGraph); + isDirectlyInitializable = stateGraph.isDirectlyInitializable(); for (var declaredState : stateGraph.declaredStates()) { if (!allStates.containsKey(declaredState.name())) { @@ -339,6 +341,11 @@ public class StateMachine { smInbox.abortCurrentTask(); } + /** Returns true if there is an INITIAL state that requires no parameters */ + public boolean isDirectlyInitializable() { + return isDirectlyInitializable; + } + private class StateEventSubscription implements MqSubscription { @Override diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/AbstractStateGraph.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/AbstractStateGraph.java index 098c4333..477788ef 100644 --- 
a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/AbstractStateGraph.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/AbstractStateGraph.java @@ -39,6 +39,21 @@ public abstract class AbstractStateGraph { throw new ControlFlowException("ERROR", ex.getClass().getSimpleName() + ":" + ex.getMessage()); } + /** Check whether there is an INITIAL state that can be directly initialized + * without declared parameters. */ + public boolean isDirectlyInitializable() { + for (var method : getClass().getMethods()) { + var gs = method.getAnnotation(GraphState.class); + if (gs == null) { + continue; + } + if ("INITIAL".equals(gs.name()) && method.getParameterCount() == 0) { + return true; + } + } + return false; + } + public Set declaredStates() { Set ret = new HashSet<>(); diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java index 052ca2cb..12f15bf9 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java @@ -112,6 +112,10 @@ public class ControlActors { ); } + public boolean isDirectlyInitializable(Actor actor) { + return actorDefinitions.get(actor).isDirectlyInitializable(); + } + public AbstractStateGraph getActorDefinition(Actor actor) { return actorDefinitions.get(actor); } diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ActorRunState.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ActorRunState.java index 69903ef0..152af472 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ActorRunState.java +++ 
b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ActorRunState.java @@ -1,6 +1,6 @@ package nu.marginalia.control.model; -public record ActorRunState(String name, String state, boolean terminal) { +public record ActorRunState(String name, String state, boolean terminal, boolean canStart) { public String stateIcon() { if (terminal) { return "\uD83D\uDE34"; diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlActorService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlActorService.java index 25461e58..aa0ed905 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlActorService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlActorService.java @@ -90,8 +90,9 @@ public class ControlActorService { final String machineName = e.getKey().name(); final String stateName = state.name(); final boolean terminal = state.isFinal(); + final boolean canStart = controlActors.isDirectlyInitializable(e.getKey()) && terminal; - return new ActorRunState(machineName, stateName, terminal); + return new ActorRunState(machineName, stateName, terminal, canStart); }).toList(); } diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/partials/actors-table.hdb b/code/services-satellite/control-service/src/main/resources/templates/control/partials/actors-table.hdb index 9fe27aa6..e77b7b70 100644 --- a/code/services-satellite/control-service/src/main/resources/templates/control/partials/actors-table.hdb +++ b/code/services-satellite/control-service/src/main/resources/templates/control/partials/actors-table.hdb @@ -23,7 +23,19 @@ action="/fsms/{{name}}/start" method="post" onsubmit="return toggleActorSwitch('{{name}}')"> - + {{/if}} From c9d7635370d0e48e801778be955b9071efd4037b Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 31 Jul 
2023 14:20:17 +0200 Subject: [PATCH 115/157] (control) Aborting an actor that waits on a process request terminates the running job. (control) Aborting an actor that waits on a process request terminates the running job. --- .../mqsm/graph/ControlFlowException.java | 2 +- .../monitor/AbstractProcessSpawnerActor.java | 18 ++++- .../actor/task/ActorProcessWatcher.java | 76 +++++++++++++++++++ .../control/actor/task/CrawlActor.java | 53 ++----------- .../actor/task/CrawlJobExtractorActor.java | 3 - .../actor/task/ReconvertAndLoadActor.java | 43 ++--------- .../control/actor/task/RecrawlActor.java | 41 +--------- 7 files changed, 106 insertions(+), 130 deletions(-) create mode 100644 code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/ActorProcessWatcher.java diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/ControlFlowException.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/ControlFlowException.java index aece44ea..5354a54a 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/ControlFlowException.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/ControlFlowException.java @@ -1,6 +1,6 @@ package nu.marginalia.mqsm.graph; -class ControlFlowException extends RuntimeException { +public class ControlFlowException extends RuntimeException { private final String state; private final Object payload; diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/AbstractProcessSpawnerActor.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/AbstractProcessSpawnerActor.java index 4ff3cde8..6031f9d9 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/AbstractProcessSpawnerActor.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/AbstractProcessSpawnerActor.java 
@@ -107,12 +107,22 @@ public class AbstractProcessSpawnerActor extends AbstractStateGraph { attempts = 0; try { + long startTime = System.currentTimeMillis(); var exec = new TaskExecution(); + long endTime = System.currentTimeMillis(); + if (exec.isError()) { - if (attempts < MAX_ATTEMPTS) - transition(RUN, attempts + 1); - else - transition(ERROR); + if (attempts < MAX_ATTEMPTS) transition(RUN, attempts + 1); + else error(); + } + else if (endTime - startTime < TimeUnit.SECONDS.toMillis(10)) { + // To avoid boot loops, we transition to error if the process + // didn't run for longer than 10 seconds. This might happen if + // the process crashes before it can reach the heartbeat and inbox + // stages of execution. In this case it would not report having acted + // on its message, and the process would be restarted forever without + // the attempts counter incrementing. + error("Process terminated within 10 seconds of starting"); } } catch (InterruptedException ex) { diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/ActorProcessWatcher.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/ActorProcessWatcher.java new file mode 100644 index 00000000..33f96f6b --- /dev/null +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/ActorProcessWatcher.java @@ -0,0 +1,76 @@ +package nu.marginalia.control.actor.task; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.control.svc.ProcessService; +import nu.marginalia.mq.MqMessage; +import nu.marginalia.mq.outbox.MqOutbox; +import nu.marginalia.mqsm.graph.ControlFlowException; + +import java.sql.SQLException; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; + +@Singleton +public class ActorProcessWatcher { + + private final ProcessService processService; + + @Inject + public ActorProcessWatcher(ProcessService 
processService) { + this.processService = processService; + } + + /** Wait for a process to start, and then wait for a response from the process, + * periodically checking that the process is still running. If the process dies, + * and does not respawn, or does not start at all, a control flow exception is thrown + * that will cause the actor to transition to ERROR. + *

    + * When interrupted, the process is killed and the message is marked as dead. + */ + public MqMessage waitResponse(MqOutbox outbox, ProcessService.ProcessId processId, long id) + throws ControlFlowException, InterruptedException, SQLException + { + if (!waitForProcess(processId, TimeUnit.SECONDS, 30)) { + throw new ControlFlowException("ERROR", + "Process " + processId + " did not launch"); + } + for (;;) { + try { + return outbox.waitResponse(id, 5, TimeUnit.SECONDS); + } + catch (InterruptedException ex) { + // Here we mark the message as dead, as it's the user that has aborted the process + // This will prevent the monitor process from attempting to respawn the process as we kill it + + outbox.flagAsDead(id); + processService.kill(processId); + + throw ex; + } + catch (TimeoutException ex) { + // Maybe the process died, wait a moment for it to restart + if (!waitForProcess(processId, TimeUnit.SECONDS, 30)) { + throw new ControlFlowException("ERROR", + "Process " + processId + " died and did not re-launch"); + } + } + } + } + + /** Wait the specified time for the specified process to start running (does not start the process) */ + private boolean waitForProcess(ProcessService.ProcessId processId, TimeUnit unit, int duration) throws InterruptedException { + + // Wait for process to start + long deadline = System.currentTimeMillis() + unit.toMillis(duration); + while (System.currentTimeMillis() < deadline) { + if (processService.isRunning(processId)) + return true; + + TimeUnit.SECONDS.sleep(1); + } + + return false; + } + +} diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlActor.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlActor.java index 4db5b3e1..a37639a6 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlActor.java +++ 
b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlActor.java @@ -12,14 +12,9 @@ import nu.marginalia.db.storage.FileStorageService; import nu.marginalia.db.storage.model.FileStorageBaseType; import nu.marginalia.db.storage.model.FileStorageId; import nu.marginalia.db.storage.model.FileStorageType; -import nu.marginalia.index.client.IndexClient; -import nu.marginalia.index.client.IndexMqEndpoints; -import nu.marginalia.mq.MqMessage; import nu.marginalia.mq.MqMessageState; import nu.marginalia.mq.outbox.MqOutbox; -import nu.marginalia.mqapi.converting.ConvertRequest; import nu.marginalia.mqapi.crawling.CrawlRequest; -import nu.marginalia.mqapi.loading.LoadRequest; import nu.marginalia.mqsm.StateFactory; import nu.marginalia.mqsm.graph.AbstractStateGraph; import nu.marginalia.mqsm.graph.GraphState; @@ -27,11 +22,6 @@ import nu.marginalia.mqsm.graph.ResumeBehavior; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.nio.file.Files; -import java.nio.file.StandardCopyOption; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.TimeoutException; - @Singleton public class CrawlActor extends AbstractStateGraph { @@ -41,12 +31,13 @@ public class CrawlActor extends AbstractStateGraph { public static final String CRAWL = "CRAWL"; public static final String CRAWL_WAIT = "CRAWL-WAIT"; public static final String END = "END"; - private final ProcessService processService; private final MqOutbox mqCrawlerOutbox; private final FileStorageService storageService; private final Gson gson; private final Logger logger = LoggerFactory.getLogger(getClass()); + private final ActorProcessWatcher processWatcher; + @AllArgsConstructor @With @NoArgsConstructor public static class Message { @@ -57,17 +48,16 @@ public class CrawlActor extends AbstractStateGraph { @Inject public CrawlActor(StateFactory stateFactory, - ProcessService processService, ProcessOutboxFactory processOutboxFactory, FileStorageService 
storageService, - Gson gson - ) + Gson gson, + ActorProcessWatcher processWatcher) { super(stateFactory); - this.processService = processService; this.mqCrawlerOutbox = processOutboxFactory.createCrawlerOutbox(); this.storageService = storageService; this.gson = gson; + this.processWatcher = processWatcher; } @GraphState(name = INITIAL, @@ -128,7 +118,7 @@ public class CrawlActor extends AbstractStateGraph { """ ) public Message crawlerWait(Message message) throws Exception { - var rsp = waitResponse(mqCrawlerOutbox, ProcessService.ProcessId.CRAWLER, message.crawlerMsgId); + var rsp = processWatcher.waitResponse(mqCrawlerOutbox, ProcessService.ProcessId.CRAWLER, message.crawlerMsgId); if (rsp.state() != MqMessageState.OK) error("Crawler failed"); @@ -137,35 +127,4 @@ public class CrawlActor extends AbstractStateGraph { } - public MqMessage waitResponse(MqOutbox outbox, ProcessService.ProcessId processId, long id) throws Exception { - if (!waitForProcess(processId, TimeUnit.SECONDS, 30)) { - error("Process " + processId + " did not launch"); - } - for (;;) { - try { - return outbox.waitResponse(id, 1, TimeUnit.SECONDS); - } - catch (TimeoutException ex) { - // Maybe the process died, wait a moment for it to restart - if (!waitForProcess(processId, TimeUnit.SECONDS, 30)) { - error("Process " + processId + " died and did not re-launch"); - } - } - } - } - - public boolean waitForProcess(ProcessService.ProcessId processId, TimeUnit unit, int duration) throws InterruptedException { - - // Wait for process to start - long deadline = System.currentTimeMillis() + unit.toMillis(duration); - while (System.currentTimeMillis() < deadline) { - if (processService.isRunning(processId)) - return true; - - TimeUnit.SECONDS.sleep(1); - } - - return false; - } - } diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlJobExtractorActor.java 
b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlJobExtractorActor.java index df86da38..621e06e1 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlJobExtractorActor.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlJobExtractorActor.java @@ -29,7 +29,6 @@ public class CrawlJobExtractorActor extends AbstractStateGraph { private final Logger logger = LoggerFactory.getLogger(getClass()); // STATES - public static final String INITIAL = "INITIAL"; public static final String CREATE_FROM_DB = "CREATE_FROM_DB"; public static final String CREATE_FROM_LINK = "CREATE_FROM_LINK"; public static final String END = "END"; @@ -52,8 +51,6 @@ public class CrawlJobExtractorActor extends AbstractStateGraph { public record CrawlJobExtractorArguments(String description) { } public record CrawlJobExtractorArgumentsWithURL(String description, String url) { } - @GraphState(name = INITIAL, next = END) - public void initial() throws Exception { error("This state does nothing"); } @GraphState(name = CREATE_FROM_LINK, next = END, resume = ResumeBehavior.ERROR, diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/ReconvertAndLoadActor.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/ReconvertAndLoadActor.java index c6d020e9..f296ca6f 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/ReconvertAndLoadActor.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/ReconvertAndLoadActor.java @@ -17,7 +17,6 @@ import nu.marginalia.db.storage.FileStorageService; import nu.marginalia.db.storage.model.FileStorageBaseType; import nu.marginalia.db.storage.model.FileStorageId; import nu.marginalia.db.storage.model.FileStorageType; -import nu.marginalia.mq.MqMessage; import 
nu.marginalia.mq.MqMessageState; import nu.marginalia.mq.outbox.MqOutbox; import nu.marginalia.mqsm.StateFactory; @@ -29,8 +28,6 @@ import org.slf4j.LoggerFactory; import java.nio.file.Files; import java.nio.file.StandardCopyOption; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.TimeoutException; @Singleton public class ReconvertAndLoadActor extends AbstractStateGraph { @@ -49,7 +46,7 @@ public class ReconvertAndLoadActor extends AbstractStateGraph { public static final String REINDEX = "REINDEX"; public static final String REINDEX_WAIT = "REINDEX-WAIT"; public static final String END = "END"; - private final ProcessService processService; + private final ActorProcessWatcher processWatcher; private final MqOutbox mqConverterOutbox; private final MqOutbox mqLoaderOutbox; private final MqOutbox indexOutbox; @@ -68,7 +65,7 @@ public class ReconvertAndLoadActor extends AbstractStateGraph { @Inject public ReconvertAndLoadActor(StateFactory stateFactory, - ProcessService processService, + ActorProcessWatcher processWatcher, ProcessOutboxFactory processOutboxFactory, FileStorageService storageService, IndexClient indexClient, @@ -76,8 +73,8 @@ public class ReconvertAndLoadActor extends AbstractStateGraph { ) { super(stateFactory); + this.processWatcher = processWatcher; this.indexOutbox = indexClient.outbox(); - this.processService = processService; this.mqConverterOutbox = processOutboxFactory.createConverterOutbox(); this.mqLoaderOutbox = processOutboxFactory.createLoaderOutbox(); this.storageService = storageService; @@ -142,7 +139,7 @@ public class ReconvertAndLoadActor extends AbstractStateGraph { """ ) public Message reconvertWait(Message message) throws Exception { - var rsp = waitResponse(mqConverterOutbox, ProcessService.ProcessId.CONVERTER, message.converterMsgId); + var rsp = processWatcher.waitResponse(mqConverterOutbox, ProcessService.ProcessId.CONVERTER, message.converterMsgId); if (rsp.state() != MqMessageState.OK) error("Converter 
failed"); @@ -176,7 +173,7 @@ public class ReconvertAndLoadActor extends AbstractStateGraph { """ ) public void loadWait(Message message) throws Exception { - var rsp = waitResponse(mqLoaderOutbox, ProcessService.ProcessId.LOADER, message.loaderMsgId); + var rsp = processWatcher.waitResponse(mqLoaderOutbox, ProcessService.ProcessId.LOADER, message.loaderMsgId); if (rsp.state() != MqMessageState.OK) error("Loader failed"); @@ -260,35 +257,5 @@ public class ReconvertAndLoadActor extends AbstractStateGraph { } } - public MqMessage waitResponse(MqOutbox outbox, ProcessService.ProcessId processId, long id) throws Exception { - if (!waitForProcess(processId, TimeUnit.SECONDS, 30)) { - error("Process " + processId + " did not launch"); - } - for (;;) { - try { - return outbox.waitResponse(id, 1, TimeUnit.SECONDS); - } - catch (TimeoutException ex) { - // Maybe the process died, wait a moment for it to restart - if (!waitForProcess(processId, TimeUnit.SECONDS, 30)) { - error("Process " + processId + " died and did not re-launch"); - } - } - } - } - - public boolean waitForProcess(ProcessService.ProcessId processId, TimeUnit unit, int duration) throws InterruptedException { - - // Wait for process to start - long deadline = System.currentTimeMillis() + unit.toMillis(duration); - while (System.currentTimeMillis() < deadline) { - if (processService.isRunning(processId)) - return true; - - TimeUnit.SECONDS.sleep(1); - } - - return false; - } } diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/RecrawlActor.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/RecrawlActor.java index a04dd6bf..03178226 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/RecrawlActor.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/RecrawlActor.java @@ -39,11 +39,10 @@ public class RecrawlActor extends 
AbstractStateGraph { public static final String CRAWL = "CRAWL"; public static final String CRAWL_WAIT = "CRAWL-WAIT"; public static final String END = "END"; - private final ProcessService processService; private final MqOutbox mqCrawlerOutbox; private final FileStorageService storageService; private final Gson gson; - private final Logger logger = LoggerFactory.getLogger(getClass()); + private final ActorProcessWatcher processWatcher; @AllArgsConstructor @With @NoArgsConstructor @@ -62,14 +61,14 @@ public class RecrawlActor extends AbstractStateGraph { @Inject public RecrawlActor(StateFactory stateFactory, - ProcessService processService, + ActorProcessWatcher processWatcher, ProcessOutboxFactory processOutboxFactory, FileStorageService storageService, Gson gson ) { super(stateFactory); - this.processService = processService; + this.processWatcher = processWatcher; this.mqCrawlerOutbox = processOutboxFactory.createCrawlerOutbox(); this.storageService = storageService; this.gson = gson; @@ -137,7 +136,7 @@ public class RecrawlActor extends AbstractStateGraph { """ ) public RecrawlMessage crawlerWait(RecrawlMessage recrawlMessage) throws Exception { - var rsp = waitResponse(mqCrawlerOutbox, ProcessService.ProcessId.CRAWLER, recrawlMessage.crawlerMsgId); + var rsp = processWatcher.waitResponse(mqCrawlerOutbox, ProcessService.ProcessId.CRAWLER, recrawlMessage.crawlerMsgId); if (rsp.state() != MqMessageState.OK) error("Crawler failed"); @@ -145,36 +144,4 @@ public class RecrawlActor extends AbstractStateGraph { return recrawlMessage; } - - public MqMessage waitResponse(MqOutbox outbox, ProcessService.ProcessId processId, long id) throws Exception { - if (!waitForProcess(processId, TimeUnit.SECONDS, 30)) { - error("Process " + processId + " did not launch"); - } - for (;;) { - try { - return outbox.waitResponse(id, 1, TimeUnit.SECONDS); - } - catch (TimeoutException ex) { - // Maybe the process died, wait a moment for it to restart - if (!waitForProcess(processId, 
TimeUnit.SECONDS, 30)) { - error("Process " + processId + " died and did not re-launch"); - } - } - } - } - - public boolean waitForProcess(ProcessService.ProcessId processId, TimeUnit unit, int duration) throws InterruptedException { - - // Wait for process to start - long deadline = System.currentTimeMillis() + unit.toMillis(duration); - while (System.currentTimeMillis() < deadline) { - if (processService.isRunning(processId)) - return true; - - TimeUnit.SECONDS.sleep(1); - } - - return false; - } - } From d95f01b7014787cddd853170383da1daa842ba5c Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 31 Jul 2023 14:20:27 +0200 Subject: [PATCH 116/157] (control) Reduce log spam in control svc --- .../nu/marginalia/service/server/Service.java | 30 +++++++++++++------ .../nu/marginalia/control/ControlService.java | 17 +++++++++++ 2 files changed, 38 insertions(+), 9 deletions(-) diff --git a/code/common/service/src/main/java/nu/marginalia/service/server/Service.java b/code/common/service/src/main/java/nu/marginalia/service/server/Service.java index ebd75753..4185aad6 100644 --- a/code/common/service/src/main/java/nu/marginalia/service/server/Service.java +++ b/code/common/service/src/main/java/nu/marginalia/service/server/Service.java @@ -119,11 +119,7 @@ public class Service { Spark.halt(403); } - String url = request.pathInfo(); - if (request.queryString() != null) { - url = url + "?" 
+ request.queryString(); - } - logger.info(httpMarker, "PUBLIC {}: {} {}", Context.fromRequest(request).getContextId(), request.requestMethod(), url); + logRequest(request); } private Object isInitialized(Request request, Response response) { @@ -168,9 +164,8 @@ public class Service { request_counter_bad.labels(serviceName).inc(); } - if (null != request.headers("X-Public")) { - logger.info(httpMarker, "RSP {}", response.status()); - } + logResponse(request, response); + } private void paintThreadName(Request request, String prefix) { @@ -178,7 +173,7 @@ public class Service { Thread.currentThread().setName(prefix + ctx.getContextId()); } - private void handleException(Exception ex, Request request, Response response) { + protected void handleException(Exception ex, Request request, Response response) { request_counter_err.labels(serviceName).inc(); if (ex instanceof MessagingException) { logger.error("{} {}", ex.getClass().getSimpleName(), ex.getMessage()); @@ -188,4 +183,21 @@ public class Service { } } + /** Log the request on the HTTP log */ + protected void logRequest(Request request) { + String url = request.pathInfo(); + if (request.queryString() != null) { + url = url + "?" 
+ request.queryString(); + } + + logger.info(httpMarker, "PUBLIC {}: {} {}", Context.fromRequest(request).getContextId(), request.requestMethod(), url); + } + + /** Log the response on the HTTP log */ + protected void logResponse(Request request, Response response) { + if (null != request.headers("X-Public")) { + logger.info(httpMarker, "RSP {}", response.status()); + } + } + } diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java index e1efc3e4..49e5cd19 100644 --- a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java +++ b/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java @@ -120,6 +120,23 @@ public class ControlService extends Service { monitors.subscribe(this::logMonitorStateChange); } + @Override + public void logRequest(Request request) { + if ("GET".equals(request.requestMethod())) + return; + + super.logRequest(request); + } + + @Override + public void logResponse(Request request, Response response) { + if ("GET".equals(request.requestMethod())) + return; + + super.logResponse(request, response); + } + + private Object messageModel(Request request, Response response) { var message = messageQueueViewService.getMessage(Long.parseLong(request.params("id"))); if (message != null) { From 2f8488610a43d61dfd82ff1b792ece5e5ffd4d91 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 31 Jul 2023 14:22:24 +0200 Subject: [PATCH 117/157] (loader) Fix bug where trailing deferred domain meta inserts weren't executed --- .../nu/marginalia/mq/outbox/MqOutbox.java | 8 +- .../converting/instruction/Interpreter.java | 20 +-- .../marginalia/converting/ConversionLog.java | 26 ---- .../converting/InstructionWriterFactory.java | 28 ++--- .../nu/marginalia/loading/LoaderMain.java | 117 ++++++------------ 
.../nu/marginalia/loading/loader/Loader.java | 10 +- 6 files changed, 69 insertions(+), 140 deletions(-) diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java index d604a585..3f3362f1 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java @@ -118,7 +118,11 @@ public class MqOutbox { } - /** Blocks until a response arrives for the given message id or the timeout passes */ + /** Blocks until a response arrives for the given message id or the timeout passes. + *

    + * @throws TimeoutException if the timeout passes before a response arrives. + * @throws InterruptedException if the thread is interrupted while waiting. + */ public MqMessage waitResponse(long id, int timeout, TimeUnit unit) throws TimeoutException, SQLException, InterruptedException { long deadline = System.currentTimeMillis() + unit.toMillis(timeout); @@ -160,7 +164,9 @@ public class MqOutbox { public void flagAsBad(long id) throws SQLException { persistence.updateMessageState(id, MqMessageState.ERR); } + public void flagAsDead(long id) throws SQLException { persistence.updateMessageState(id, MqMessageState.DEAD); } + } \ No newline at end of file diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/Interpreter.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/Interpreter.java index 4583f31d..248ea38d 100644 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/Interpreter.java +++ b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/Interpreter.java @@ -10,18 +10,18 @@ import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument; import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError; public interface Interpreter { - void loadUrl(EdgeUrl[] url); - void loadDomain(EdgeDomain[] domain); - void loadRssFeed(EdgeUrl[] rssFeed); - void loadDomainLink(DomainLink[] links); + default void loadUrl(EdgeUrl[] url) {} + default void loadDomain(EdgeDomain[] domain) {} + default void loadRssFeed(EdgeUrl[] rssFeed) {} + default void loadDomainLink(DomainLink[] links) {} - void loadProcessedDomain(EdgeDomain domain, DomainIndexingState state, String ip); - void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument); - void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError); + default void 
loadProcessedDomain(EdgeDomain domain, DomainIndexingState state, String ip) {} + default void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument) {} + default void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError) {} - void loadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words); + default void loadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) {} - void loadDomainRedirect(DomainLink link); + default void loadDomainRedirect(DomainLink link) {} - void loadDomainMetadata(EdgeDomain domain, int knownUrls, int goodUrls, int visitedUrls); + default void loadDomainMetadata(EdgeDomain domain, int knownUrls, int goodUrls, int visitedUrls) {} } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConversionLog.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConversionLog.java index 2c2ffb95..10c11e21 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConversionLog.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConversionLog.java @@ -36,35 +36,9 @@ public class ConversionLog implements AutoCloseable, Interpreter { writer.close(); } - @Override - public void loadUrl(EdgeUrl[] url) {} - - @Override - public void loadDomain(EdgeDomain[] domain) {} - - @Override - public void loadRssFeed(EdgeUrl[] rssFeed) {} - - @Override - public void loadDomainLink(DomainLink[] links) {} - - @Override - public void loadProcessedDomain(EdgeDomain domain, DomainIndexingState state, String ip) {} - - @Override - public void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument) {} - @Override public synchronized void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError) { writer.printf("%s\t%s\n", loadProcessedDocumentWithError.url(), loadProcessedDocumentWithError.reason()); } - @Override - public void 
loadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) {} - - @Override - public void loadDomainRedirect(DomainLink link) {} - - @Override - public void loadDomainMetadata(EdgeDomain domain, int knownUrls, int goodUrls, int visitedUrls) {} } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/InstructionWriterFactory.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/InstructionWriterFactory.java index fee4fc19..08f842c6 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/InstructionWriterFactory.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/InstructionWriterFactory.java @@ -109,22 +109,16 @@ public class InstructionWriterFactory { private int ok = 0; private int error = 0; + int keywords = 0; + int documents = 0; + public String toString() { + // This shouldn't happen (TM) + assert keywords == documents : "keywords != documents"; + return String.format("%s - %d %d", domainName, ok, error); } - @Override - public void loadUrl(EdgeUrl[] url) {} - - @Override - public void loadDomain(EdgeDomain[] domain) {} - - @Override - public void loadRssFeed(EdgeUrl[] rssFeed) {} - - @Override - public void loadDomainLink(DomainLink[] links) {} - @Override public void loadProcessedDomain(EdgeDomain domain, DomainIndexingState state, String ip) { this.domainName = domain.toString(); @@ -132,20 +126,14 @@ public class InstructionWriterFactory { @Override public void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument) { - - } - - @Override - public void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError) { + documents++; } @Override public void loadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) { + keywords++; } - @Override - public void loadDomainRedirect(DomainLink link) {} - @Override public void loadDomainMetadata(EdgeDomain domain, int 
knownUrls, int goodUrls, int visitedUrls) { ok += goodUrls; diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java index 68bcf8c4..21b0b1ec 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java @@ -1,12 +1,18 @@ package nu.marginalia.loading; +import com.google.common.collect.Sets; import com.google.gson.Gson; import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; import lombok.SneakyThrows; +import nu.marginalia.converting.instruction.Interpreter; +import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument; import nu.marginalia.db.storage.FileStorageService; +import nu.marginalia.keyword.model.DocumentKeywords; import nu.marginalia.loading.loader.IndexLoadKeywords; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.mq.MessageQueueFactory; import nu.marginalia.mq.MqMessage; import nu.marginalia.mq.inbox.MqInboxResponse; @@ -14,19 +20,17 @@ import nu.marginalia.mq.inbox.MqSingleShotInbox; import nu.marginalia.process.control.ProcessHeartbeat; import nu.marginalia.process.log.WorkLog; import plan.CrawlPlan; -import nu.marginalia.loading.loader.Loader; import nu.marginalia.loading.loader.LoaderFactory; -import nu.marginalia.converting.instruction.Instruction; import nu.marginalia.service.module.DatabaseModule; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.nio.file.Path; import java.sql.SQLException; -import java.util.Iterator; +import java.util.HashSet; import java.util.Optional; +import java.util.Set; import java.util.UUID; -import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.TimeUnit; import static 
nu.marginalia.mqapi.ProcessInboxNames.LOADER_INBOX; @@ -42,9 +46,6 @@ public class LoaderMain { private final FileStorageService fileStorageService; private final IndexLoadKeywords indexLoadKeywords; private final Gson gson; - private volatile boolean running = true; - - final Thread processorThread; public static void main(String... args) throws Exception { new org.mariadb.jdbc.Driver(); @@ -84,9 +85,6 @@ public class LoaderMain { this.gson = gson; heartbeat.start(); - - processorThread = new Thread(this::processor, "Processor Thread"); - processorThread.start(); } @SneakyThrows @@ -94,6 +92,7 @@ public class LoaderMain { var plan = instructions.getPlan(); var logFile = plan.process.getLogFile(); + TaskStats taskStats = new TaskStats(100); try { int loadTotal = 0; int loaded = 0; @@ -102,29 +101,37 @@ public class LoaderMain { loadTotal++; } - LoaderMain.loadTotal = loadTotal; - logger.info("Loading {} files", loadTotal); for (var entry : WorkLog.iterable(logFile)) { - heartbeat.setProgress(loaded++ / (double) loadTotal); + InstructionCounter instructionCounter = new InstructionCounter(); + + heartbeat.setProgress(loaded++ / (double) loadTotal); + long startTime = System.currentTimeMillis(); - var loader = loaderFactory.create(entry.cnt()); Path destDir = plan.getProcessedFilePath(entry.path()); - var instructionsIter = instructionsReader.createIterator(destDir); - while (instructionsIter.hasNext()) { - var next = instructionsIter.next(); - try { - next.apply(loader); - } - catch (Exception ex) { - logger.error("Failed to load instruction {}", next); + try (var loader = loaderFactory.create(entry.cnt())) { + var instructionsIter = instructionsReader.createIterator(destDir); + + while (instructionsIter.hasNext()) { + var next = instructionsIter.next(); + try { + next.apply(instructionCounter); + next.apply(loader); + } catch (Exception ex) { + logger.error("Failed to load instruction {}", next); + } } } + + long endTime = System.currentTimeMillis(); + long loadTime 
= endTime - startTime; + taskStats.observe(endTime - startTime); + + logger.info("Loaded {}/{} : {} ({}) {}ms {} l/s", taskStats.getCount(), + loadTotal, destDir, instructionCounter.getCount(), loadTime, taskStats.avgTime()); } - running = false; - processorThread.join(); instructions.ok(); // This needs to be done in order to have a readable index journal @@ -144,59 +151,6 @@ public class LoaderMain { System.exit(0); } - private volatile static int loadTotal; - - private void load(CrawlPlan plan, String path, int cnt) { - Path destDir = plan.getProcessedFilePath(path); - try { - var loader = loaderFactory.create(cnt); - var instructions = instructionsReader.createIterator(destDir); - processQueue.put(new LoadJob(path, loader, instructions)); - } catch (Exception e) { - logger.error("Failed to load " + destDir, e); - } - } - - static final TaskStats taskStats = new TaskStats(100); - - private record LoadJob(String path, Loader loader, Iterator instructionIterator) { - public void run() { - long startTime = System.currentTimeMillis(); - while (instructionIterator.hasNext()) { - var next = instructionIterator.next(); - try { - next.apply(loader); - } - catch (Exception ex) { - logger.error("Failed to load instruction {}", next); - } - } - - loader.finish(); - long loadTime = System.currentTimeMillis() - startTime; - taskStats.observe(loadTime); - logger.info("Loaded {}/{} : {} ({}) {}ms {} l/s", taskStats.getCount(), - loadTotal, path, loader.data.sizeHint, loadTime, taskStats.avgTime()); - } - - } - - private static final LinkedBlockingQueue processQueue = new LinkedBlockingQueue<>(2); - - private void processor() { - try { - while (running || !processQueue.isEmpty()) { - LoadJob job = processQueue.poll(1, TimeUnit.SECONDS); - - if (job != null) { - job.run(); - } - } - } catch (InterruptedException e) { - throw new RuntimeException(e); - } - - } private static class LoadRequest { private final CrawlPlan plan; private final MqMessage message; @@ -258,4 +212,13 @@ 
public class LoaderMain { } } + public class InstructionCounter implements Interpreter { + private int count = 0; + public void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument) { + count++; + } + public int getCount() { + return count; + } + } } diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java index 96c5a21c..d6f97076 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java @@ -15,7 +15,7 @@ import org.slf4j.LoggerFactory; import java.util.ArrayList; import java.util.List; -public class Loader implements Interpreter { +public class Loader implements Interpreter, AutoCloseable { private final SqlLoadUrls sqlLoadUrls; private final SqlLoadDomains sqlLoadDomains; private final SqlLoadDomainLinks sqlLoadDomainLinks; @@ -30,8 +30,6 @@ public class Loader implements Interpreter { private final List processedDocumentList; private final List processedDocumentWithErrorList; - private final List deferredDomains = new ArrayList<>(); - private final List deferredUrls = new ArrayList<>(); public final LoaderData data; @@ -87,6 +85,7 @@ public class Loader implements Interpreter { @Override public void loadProcessedDocument(LoadProcessedDocument document) { processedDocumentList.add(document); + if (processedDocumentList.size() > 100) { sqlLoadProcessedDocument.load(data, processedDocumentList); processedDocumentList.clear(); @@ -96,6 +95,7 @@ public class Loader implements Interpreter { @Override public void loadProcessedDocumentWithError(LoadProcessedDocumentWithError document) { processedDocumentWithErrorList.add(document); + if (processedDocumentWithErrorList.size() > 100) { sqlLoadProcessedDocument.loadWithError(data, processedDocumentWithErrorList); processedDocumentWithErrorList.clear(); 
@@ -121,9 +121,7 @@ public class Loader implements Interpreter { sqlLoadDomainMetadata.load(data, domain, knownUrls, goodUrls, visitedUrls); } - public void finish() { - // Some work needs to be processed out of order for the database relations to work out - + public void close() { if (processedDocumentList.size() > 0) { sqlLoadProcessedDocument.load(data, processedDocumentList); } From 8f0cbf267bcab6fa6363d2a9ef4dfe91e99ef312 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 31 Jul 2023 14:23:49 +0200 Subject: [PATCH 118/157] (loader) Perform instruction reads in a separate thread for extra vroom vroom --- .../loading/ConvertedDomainReader.java | 114 +++++++++++------- 1 file changed, 68 insertions(+), 46 deletions(-) diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/ConvertedDomainReader.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/ConvertedDomainReader.java index 91875169..86b9db1f 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/ConvertedDomainReader.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/ConvertedDomainReader.java @@ -2,73 +2,95 @@ package nu.marginalia.loading; import com.github.luben.zstd.RecyclingBufferPool; import com.github.luben.zstd.ZstdInputStream; -import com.google.gson.Gson; import lombok.SneakyThrows; import nu.marginalia.converting.instruction.Instruction; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import javax.inject.Inject; import java.io.*; +import java.lang.ref.Cleaner; import java.nio.file.Path; -import java.util.ArrayList; import java.util.Iterator; -import java.util.List; +import java.util.concurrent.*; +import java.util.concurrent.atomic.AtomicBoolean; public class ConvertedDomainReader { + private final ExecutorService executorService = Executors.newSingleThreadExecutor(); private static final Logger logger = LoggerFactory.getLogger(ConvertedDomainReader.class); - private final Gson gson; - 
@Inject - public ConvertedDomainReader(Gson gson) { - this.gson = gson; + /** Creates a new iterator over Path. The implementation will try to read the file in a separate thread, and + * will block until the first instruction is available. Iterator$hasNext may block. + */ + public Iterator createIterator(Path path) { + return new PrefetchingInstructionIterator(path); } - public List read(Path path, int cntHint) throws IOException { - List ret = new ArrayList<>(cntHint); + class PrefetchingInstructionIterator implements Iterator { - try (var or = new ObjectInputStream(new ZstdInputStream(new FileInputStream(path.toFile()), RecyclingBufferPool.INSTANCE))) { - var object = or.readObject(); - if (object instanceof Instruction is) { - ret.add(is); - } - } catch (ClassNotFoundException e) { - throw new RuntimeException(e); + private final LinkedBlockingQueue queue = new LinkedBlockingQueue<>(16); + private final AtomicBoolean finished = new AtomicBoolean(false); + + private Instruction next = null; + + public PrefetchingInstructionIterator(Path path) { + Future future = executorService.submit(() -> readerThread(path)); + + // Cancel the future if the iterator is garbage collected + // to reduce the risk of leaking resources; as the worker thread + // will spin forever on put if the queue is full. 
+ Cleaner.create().register(this, () -> { + future.cancel(true); + }); } - return ret; - } - - public Iterator createIterator(Path path) throws IOException { - var or = new ObjectInputStream(new ZstdInputStream(new BufferedInputStream(new FileInputStream(path.toFile())), RecyclingBufferPool.INSTANCE)); - - return new Iterator<>() { - Instruction next; - @SneakyThrows - @Override - public boolean hasNext() { - if (next != null) - return true; - - try { - next = (Instruction) or.readObject(); - return true; + private Object readerThread(Path path) { + try (var or = new ObjectInputStream(new ZstdInputStream(new BufferedInputStream(new FileInputStream(path.toFile())), RecyclingBufferPool.INSTANCE))) { + for (; ; ) { + var nextObject = or.readObject(); + if (nextObject instanceof Instruction is) { + queue.put(is); + } else { + logger.warn("Spurious object in file: {}", nextObject.getClass().getSimpleName()); + } } - catch (java.io.EOFException ex) { - or.close(); - return false; + } catch (EOFException ex) { + // Expected + return null; + } catch (ClassNotFoundException | IOException | InterruptedException e) { + logger.warn("Error reading file " + path, e); + throw new RuntimeException(e); + } finally { + finished.set(true); + } + } + + @SneakyThrows + @Override + public boolean hasNext() { + if (next != null) + return true; + + // As long as the worker is still running, we'll do a blocking poll to wait for the next instruction + // (but we wake up every second to check if the worker is still running) + while (!finished.get()) { + if (null != (next = queue.poll(1, TimeUnit.SECONDS))) { + return true; } } - @Override - public Instruction next() { - if (next != null || hasNext()) { - var ret = next; - next = null; - return ret; - } - throw new IllegalStateException(); + // If the worker is not running, we just drain the queue without waiting + return null != (next = queue.poll()); + } + + @Override + public Instruction next() { + if (next != null || hasNext()) { + try { 
return next; } + finally { next = null; } } - }; + throw new IllegalStateException(); + } + } + } From 86a5cc5c5f3bd4fd81fb30371a7865f362ca300c Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 1 Aug 2023 14:57:40 +0200 Subject: [PATCH 119/157] (hash) Modified version of common codec's Murmur3 hash --- settings.gradle | 2 + third-party/README.md | 1 + third-party/commons-codec/build.gradle | 20 ++ third-party/commons-codec/readme.md | 34 +++ .../nu/marginalia/hash/MurmurHashBench.java | 105 +++++++ .../nu/marginalia/hash/MurmurHash3_128.java | 277 ++++++++++++++++++ 6 files changed, 439 insertions(+) create mode 100644 third-party/commons-codec/build.gradle create mode 100644 third-party/commons-codec/readme.md create mode 100644 third-party/commons-codec/src/jmh/java/nu/marginalia/hash/MurmurHashBench.java create mode 100644 third-party/commons-codec/src/main/java/nu/marginalia/hash/MurmurHash3_128.java diff --git a/settings.gradle b/settings.gradle index 131b449e..62bc0f34 100644 --- a/settings.gradle +++ b/settings.gradle @@ -80,6 +80,7 @@ include 'third-party:openzim' include 'third-party:count-min-sketch' include 'third-party:monkey-patch-opennlp' include 'third-party:monkey-patch-gson' +include 'third-party:commons-codec' dependencyResolutionManagement { @@ -142,6 +143,7 @@ dependencyResolutionManagement { library('commons.lang3', 'org.apache.commons','commons-lang3').version('3.12.0') library('commons.compress','org.apache.commons','commons-compress').version('1.21') library('commons.io','commons-io','commons-io').version('2.11.0') + library('commons.codec', 'commons-codec', 'commons-codec').version('1.16.0') library('ffi','com.github.jnr','jnr-ffi').version('2.2.12') library('databind','com.fasterxml.jackson.core','jackson-databind').version('2.13.2.1') diff --git a/third-party/README.md b/third-party/README.md index c31ca585..d6b8a834 100644 --- a/third-party/README.md +++ b/third-party/README.md @@ -10,6 +10,7 @@ or lack an artifact, or to override 
some default that is inappropriate for the t * [PorterStemmer](porterstemmer/) - LGPL3 * [Uppend](uppend/) - MIT * [OpenZIM](openzim/) - GPL-2.0 +* [Commons Codec](commons-codec/) - Apache 2.0 ### Repackaged * [SymSpell](symspell/) - LGPL-3.0 diff --git a/third-party/commons-codec/build.gradle b/third-party/commons-codec/build.gradle new file mode 100644 index 00000000..600269c8 --- /dev/null +++ b/third-party/commons-codec/build.gradle @@ -0,0 +1,20 @@ +plugins { + id 'java' + id "me.champeau.jmh" version "0.6.6" +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +dependencies { + jmhImplementation project(':code:libraries:language-processing') + jmhImplementation libs.guava + jmhImplementation libs.commons.codec +} + +test { + useJUnitPlatform() +} diff --git a/third-party/commons-codec/readme.md b/third-party/commons-codec/readme.md new file mode 100644 index 00000000..71232ae7 --- /dev/null +++ b/third-party/commons-codec/readme.md @@ -0,0 +1,34 @@ +# Commons Codec + +License: [APL 2.0](http://www.apache.org/licenses/LICENSE-2.0) + +This package contains a heavily modified version of the Murmur3 hash from [commons-codec](https://commons.apache.org/proper/commons-codec/) +that cuts some corners but outperforms both Commons Codec and Guava fairly significantly for the particular use cases +we care about being fast: Hashing ASCII/Latin1 strings into a well behaving 64-bit hash. + +The method `hashLowerBytes(String data)` performs a zero allocation and zero conversion hash of +the *lower bytes* of the characters in the provided string. For ASCII, Latin1, or other 8 bit encodings +this is identical to hashing the entire string. For other use cases, especially away from the +Latin scripts, this function is possibly a foot-gun. + +The method `hashNearlyASCII(String data)` is the same as above, except it's +seeded with Java String's hashCode(). 
This is a very non-standard modification that +makes it a bit better at dealing with other encodings without measurable performance +impact. + +The method `long hash(byte[] data)` hashes the entire byte array. + +A non-standard behavior is that the hash function folds the 128 bit +hash into a 64 bit hash by xor:ing the 128 bit parts. + +## Performance Benchmarks + +| Algorithm | Ops/s | Remark | +|--------------------|-------------------|-----------------------------------------------------------------| +| Guava | 12,114 ± 439 | allocates byte buffers internally | +| Commons Codec | 29,224 ± 1,080 | String.getBytes() penalty, long\[2\] allocation, possibly elided | +| MH hash | 30,885 ± 847 | String.getBytes() penalty, zero allocations | +| MH hashNearlyASCII | 50,018 ± 399 | Zero allocations, worse characteristics outside Latin1/ASCII | +| MH hashLowerBytes | 50,533 ± 478 | Zero allocations, only works for Latin1/ASCII | +| String.hashCode() | 567,381 ± 136,185 | Zero allocations, much weaker algo | + diff --git a/third-party/commons-codec/src/jmh/java/nu/marginalia/hash/MurmurHashBench.java b/third-party/commons-codec/src/jmh/java/nu/marginalia/hash/MurmurHashBench.java new file mode 100644 index 00000000..a4cc3029 --- /dev/null +++ b/third-party/commons-codec/src/jmh/java/nu/marginalia/hash/MurmurHashBench.java @@ -0,0 +1,105 @@ +package nu.marginalia.hash; + +import com.google.common.hash.HashFunction; +import com.google.common.hash.Hashing; +import org.apache.commons.codec.digest.MurmurHash3; +import org.openjdk.jmh.annotations.*; + +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; + +public class MurmurHashBench { + + private static HashFunction guavaHashFunction = Hashing.murmur3_128(); + private static MurmurHash3_128 marginaliahash = new MurmurHash3_128(); + + @State(Scope.Benchmark) + public static class BenchState { + + 
List strings; + + @Setup(Level.Trial) + public void doSetup() { + strings = new ArrayList<>(); + try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("dictionary/en-1000"), + "Could not load word frequency table"); + var br = new BufferedReader(new InputStreamReader(resource)) + ) { + for (;;) { + String s = br.readLine(); + if (s == null) { + break; + } + strings.add(s.toLowerCase()); + } + } + catch (Exception ex) { + throw new RuntimeException(ex); + } + } + } + + @Benchmark + @BenchmarkMode(Mode.Throughput) + public long benchGuava(BenchState state) { + long total = 0; + for (var string : state.strings) { + total += guavaHashFunction.hashUnencodedChars(string).padToLong(); + } + return total; + } + + @Benchmark + @BenchmarkMode(Mode.Throughput) + public long benchCommonCodec(BenchState state) { + long total = 0; + for (var string : state.strings) { + total += MurmurHash3.hash128x64(string.getBytes(StandardCharsets.UTF_8))[0]; + } + return total; + } + + @Benchmark + @BenchmarkMode(Mode.Throughput) + public long benchMarginalia_hashNonStandardASCIIOnlyDirect(BenchState state) { + long total = 0; + for (var string : state.strings) { + total += marginaliahash.hashLowerBytes(string); + } + return total; + } + + @Benchmark + @BenchmarkMode(Mode.Throughput) + public long benchMarginalia_hashStandard(BenchState state) { + long total = 0; + for (var string : state.strings) { + total += marginaliahash.hash(string.getBytes(StandardCharsets.UTF_8)); + } + return total; + } + + @Benchmark + @BenchmarkMode(Mode.Throughput) + public long benchJavaStringHash(BenchState state) { + long total = 0; + for (var string : state.strings) { + total += string.hashCode(); + } + return total; + } + + @Benchmark + @BenchmarkMode(Mode.Throughput) + public long benchWeakNonAscii(BenchState state) { + long total = 0; + for (var string : state.strings) { + total += marginaliahash.hashNearlyASCII(string); + } + return total; + } +} diff --git 
a/third-party/commons-codec/src/main/java/nu/marginalia/hash/MurmurHash3_128.java b/third-party/commons-codec/src/main/java/nu/marginalia/hash/MurmurHash3_128.java new file mode 100644 index 00000000..cd767d10 --- /dev/null +++ b/third-party/commons-codec/src/main/java/nu/marginalia/hash/MurmurHash3_128.java @@ -0,0 +1,277 @@ +package nu.marginalia.hash; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** A modified version of Commons Codec's murmur hash + * that minimizes allocations. + * */ +public class MurmurHash3_128 { + + /** + * A default seed to use for the murmur hash algorithm. + * Has the value {@code 104729}. + */ + public static final int DEFAULT_SEED = 104729; + + // Constants for 128-bit variant + private static final long C1 = 0x87c37b91114253d5L; + private static final long C2 = 0x4cf5ad432745937fL; + private static final int R1 = 31; + private static final int R2 = 27; + private static final int R3 = 33; + private static final int M = 5; + private static final int N1 = 0x52dce729; + private static final int N2 = 0x38495ab5; + + /** Assumes data is ASCII, or at the very least that you only care about the lower + * bytes of your string (which may be fine for hashing mostly latin script). + *

    + * Fold the 128 bit hash into 64 bits by xor:ing msw and lsw + */ + public long hashLowerBytes(String data) { + return hash64(data, 0, data.length(), DEFAULT_SEED); + } + + /** Like hashASCIIOnly except seeded with the Java String.hashCode() + * to provide better behavior for non-ASCII strings. It's much worse + * than doing it properly, but better than not doing this. + */ + public long hashNearlyASCII(String data) { + return hash64(data, 0, data.length(), data.hashCode()); + } + + /** Hash the bytes; fold the 128 bit hash into 64 bits by xor:ing msw and lsw */ + public long hash(byte[] data) { + return hash64(data, 0, data.length, DEFAULT_SEED); + } + + private static long hash64(final CharSequence data, final int offset, final int length, final long seed) { + long h1 = seed; + long h2 = seed; + final int nblocks = length >> 4; + + // body + for (int i = 0; i < nblocks; i++) { + final int index = offset + (i << 4); + long k1 = getLittleEndianLong(data, index); + long k2 = getLittleEndianLong(data, index + 8); + + // mix functions for k1 + k1 *= C1; + k1 = Long.rotateLeft(k1, R1); + k1 *= C2; + h1 ^= k1; + h1 = Long.rotateLeft(h1, R2); + h1 += h2; + h1 = h1 * M + N1; + + // mix functions for k2 + k2 *= C2; + k2 = Long.rotateLeft(k2, R3); + k2 *= C1; + h2 ^= k2; + h2 = Long.rotateLeft(h2, R1); + h2 += h1; + h2 = h2 * M + N2; + } + + // tail + long k1 = 0; + long k2 = 0; + final int index = offset + (nblocks << 4); + switch (offset + length - index) { + case 15: + k2 ^= ((long) data.charAt(index + 14) & 0xff) << 48; + case 14: + k2 ^= ((long) data.charAt(index + 13) & 0xff) << 40; + case 13: + k2 ^= ((long) data.charAt(index + 12) & 0xff) << 32; + case 12: + k2 ^= ((long) data.charAt(index + 11) & 0xff) << 24; + case 11: + k2 ^= ((long) data.charAt(index + 10) & 0xff) << 16; + case 10: + k2 ^= ((long) data.charAt(index + 9) & 0xff) << 8; + case 9: + k2 ^= data.charAt(index + 8) & 0xff; + k2 *= C2; + k2 = Long.rotateLeft(k2, R3); + k2 *= C1; + h2 ^= k2; + + case 
8: + k1 ^= ((long) data.charAt(index + 7) & 0xff) << 56; + case 7: + k1 ^= ((long) data.charAt(index + 6) & 0xff) << 48; + case 6: + k1 ^= ((long) data.charAt(index + 5) & 0xff) << 40; + case 5: + k1 ^= ((long) data.charAt(index + 4) & 0xff) << 32; + case 4: + k1 ^= ((long) data.charAt(index + 3) & 0xff) << 24; + case 3: + k1 ^= ((long) data.charAt(index + 2) & 0xff) << 16; + case 2: + k1 ^= ((long) data.charAt(index + 1) & 0xff) << 8; + case 1: + k1 ^= data.charAt(index) & 0xff; + k1 *= C1; + k1 = Long.rotateLeft(k1, R1); + k1 *= C2; + h1 ^= k1; + } + + // finalization + h1 ^= length; + h2 ^= length; + + h1 += h2; + h2 += h1; + + h1 = fmix64(h1); + h2 = fmix64(h2); + + h1 += h2; + h2 += h1; + + return h1^h2; // non-standard 128->64 bit transformation + } + + private static long hash64(final byte[] data, final int offset, final int length, final long seed) { + long h1 = seed; + long h2 = seed; + final int nblocks = length >> 4; + + // body + for (int i = 0; i < nblocks; i++) { + final int index = offset + (i << 4); + long k1 = getLittleEndianLong(data, index); + long k2 = getLittleEndianLong(data, index + 8); + + // mix functions for k1 + k1 *= C1; + k1 = Long.rotateLeft(k1, R1); + k1 *= C2; + h1 ^= k1; + h1 = Long.rotateLeft(h1, R2); + h1 += h2; + h1 = h1 * M + N1; + + // mix functions for k2 + k2 *= C2; + k2 = Long.rotateLeft(k2, R3); + k2 *= C1; + h2 ^= k2; + h2 = Long.rotateLeft(h2, R1); + h2 += h1; + h2 = h2 * M + N2; + } + + // tail + long k1 = 0; + long k2 = 0; + final int index = offset + (nblocks << 4); + switch (offset + length - index) { + case 15: + k2 ^= ((long) data[index + 14] & 0xff) << 48; + case 14: + k2 ^= ((long) data[index + 13] & 0xff) << 40; + case 13: + k2 ^= ((long) data[index + 12] & 0xff) << 32; + case 12: + k2 ^= ((long) data[index + 11] & 0xff) << 24; + case 11: + k2 ^= ((long) data[index + 10] & 0xff) << 16; + case 10: + k2 ^= ((long) data[index + 9] & 0xff) << 8; + case 9: + k2 ^= data[index + 8] & 0xff; + k2 *= C2; + k2 = 
Long.rotateLeft(k2, R3); + k2 *= C1; + h2 ^= k2; + + case 8: + k1 ^= ((long) data[index + 7] & 0xff) << 56; + case 7: + k1 ^= ((long) data[index + 6] & 0xff) << 48; + case 6: + k1 ^= ((long) data[index + 5] & 0xff) << 40; + case 5: + k1 ^= ((long) data[index + 4] & 0xff) << 32; + case 4: + k1 ^= ((long) data[index + 3] & 0xff) << 24; + case 3: + k1 ^= ((long) data[index + 2] & 0xff) << 16; + case 2: + k1 ^= ((long) data[index + 1] & 0xff) << 8; + case 1: + k1 ^= data[index] & 0xff; + k1 *= C1; + k1 = Long.rotateLeft(k1, R1); + k1 *= C2; + h1 ^= k1; + } + + // finalization + h1 ^= length; + h2 ^= length; + + h1 += h2; + h2 += h1; + + h1 = fmix64(h1); + h2 = fmix64(h2); + + h1 += h2; + h2 += h1; + + return h1^h2; // non-standard 128->64 bit transformation + } + + private static long getLittleEndianLong(final CharSequence data, final int index) { + return (((long) data.charAt(index ) & 0xff) ) | + (((long) data.charAt(index + 1) & 0xff) << 8) | + (((long) data.charAt(index + 2) & 0xff) << 16) | + (((long) data.charAt(index + 3) & 0xff) << 24) | + (((long) data.charAt(index + 4) & 0xff) << 32) | + (((long) data.charAt(index + 5) & 0xff) << 40) | + (((long) data.charAt(index + 6) & 0xff) << 48) | + (((long) data.charAt(index + 7) & 0xff) << 56); + } + + private static long getLittleEndianLong(final byte[] data, final int index) { + return (((long) data[index ] & 0xff) ) | + (((long) data[index + 1] & 0xff) << 8) | + (((long) data[index + 2] & 0xff) << 16) | + (((long) data[index + 3] & 0xff) << 24) | + (((long) data[index + 4] & 0xff) << 32) | + (((long) data[index + 5] & 0xff) << 40) | + (((long) data[index + 6] & 0xff) << 48) | + (((long) data[index + 7] & 0xff) << 56); + } + private static long fmix64(long hash) { + hash ^= (hash >>> 33); + hash *= 0xff51afd7ed558ccdL; + hash ^= (hash >>> 33); + hash *= 0xc4ceb9fe1a85ec53L; + hash ^= (hash >>> 33); + return hash; + } + +} From ea66195b9714954bc52d8178e4dcf7df335fb283 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren 
Date: Tue, 1 Aug 2023 15:00:15 +0200 Subject: [PATCH 120/157] (loader) Optimize loader by using zstd's direct streaming writer and the Murmur3_128 string hash --- .../forward/ForwardIndexConverterTest.java | 1 - .../journal/model/IndexJournalEntryData.java | 8 +- .../journal/writer/IndexJournalWriter.java | 3 - .../writer/IndexJournalWriterImpl.java | 105 ++++++++++++------ .../index/journal/IndexJournalTest.java | 1 - .../ReverseIndexFullConverterTest2.java | 2 +- .../ReverseIndexPriorityConverterTest2.java | 2 +- code/processes/loading-process/build.gradle | 2 +- .../loading/loader/IndexLoadKeywords.java | 10 +- .../loader/LoaderIndexJournalWriter.java | 35 +++++- .../loading/loader/SqlLoadUrls.java | 24 ++-- 11 files changed, 124 insertions(+), 69 deletions(-) diff --git a/code/features-index/index-forward/src/test/java/nu/marginalia/index/forward/ForwardIndexConverterTest.java b/code/features-index/index-forward/src/test/java/nu/marginalia/index/forward/ForwardIndexConverterTest.java index c2411575..33acceea 100644 --- a/code/features-index/index-forward/src/test/java/nu/marginalia/index/forward/ForwardIndexConverterTest.java +++ b/code/features-index/index-forward/src/test/java/nu/marginalia/index/forward/ForwardIndexConverterTest.java @@ -64,7 +64,6 @@ class ForwardIndexConverterTest { keywordLexicon.commitToDisk(); - writer.forceWrite(); writer.close(); diff --git a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryData.java b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryData.java index 423626ce..f24be823 100644 --- a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryData.java +++ b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryData.java @@ -8,7 +8,7 @@ import java.util.Iterator; public class IndexJournalEntryData implements Iterable { private final int 
size; - private final long[] underlyingArray; + public final long[] underlyingArray; public static final int MAX_LENGTH = 1000; public static final int ENTRY_SIZE = 2; @@ -23,11 +23,6 @@ public class IndexJournalEntryData implements Iterable= size) throw new ArrayIndexOutOfBoundsException(); @@ -37,7 +32,6 @@ public class IndexJournalEntryData implements Iterable 0 && i < entry.size()) { + dataBuffer.putLong(entry.underlyingArray[i++]); + } + } numEntries++; } - @Override - public void forceWrite() throws IOException { - outputStream.flush(); - - try (var raf = new RandomAccessFile(outputFile.toFile(), "rws")) { - raf.writeLong(numEntries); - raf.writeLong(lexicon.size()); - } - } - - @Override - public void flushWords() { - lexicon.commitToDisk(); - } - public void close() throws IOException { - forceWrite(); + dataBuffer.flip(); + compressingStream.compress(dataBuffer); + dataBuffer.clear(); + compressingStream.flush(); + compressingStream.close(); - outputStream.close(); + + // Finalize the file by writing a header + + ByteBuffer header = ByteBuffer.allocate(16); + header.putLong(numEntries); + header.putLong(lexicon.size()); + header.flip(); + + while (header.position() < header.limit()) { + fileChannel.write(header, header.position()); + } + + fileChannel.close(); } } diff --git a/code/features-index/index-journal/src/test/java/nu/marginalia/index/journal/IndexJournalTest.java b/code/features-index/index-journal/src/test/java/nu/marginalia/index/journal/IndexJournalTest.java index 67b23dee..9cb96781 100644 --- a/code/features-index/index-journal/src/test/java/nu/marginalia/index/journal/IndexJournalTest.java +++ b/code/features-index/index-journal/src/test/java/nu/marginalia/index/journal/IndexJournalTest.java @@ -41,7 +41,6 @@ public class IndexJournalTest { .add(5, 5) .add(6, 6) .build()); - journalWriter.forceWrite(); journalWriter.close(); reader = new IndexJournalReaderSingleCompressedFile(tempFile); diff --git 
a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest2.java b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest2.java index 4488912b..a99ab674 100644 --- a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest2.java +++ b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest2.java @@ -76,7 +76,7 @@ class ReverseIndexFullConverterTest2 { keywordLexicon.commitToDisk(); Thread.sleep(1000); - writer.forceWrite(); + writer.close(); var reader = new IndexJournalReaderSingleCompressedFile(indexFile); diff --git a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexPriorityConverterTest2.java b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexPriorityConverterTest2.java index d634c175..1f9763c8 100644 --- a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexPriorityConverterTest2.java +++ b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexPriorityConverterTest2.java @@ -76,7 +76,7 @@ class ReverseIndexPriorityConverterTest2 { keywordLexicon.commitToDisk(); Thread.sleep(1000); - writer.forceWrite(); + writer.close(); var reader = new IndexJournalReaderSingleCompressedFile(indexFile); diff --git a/code/processes/loading-process/build.gradle b/code/processes/loading-process/build.gradle index d204247d..0a89c350 100644 --- a/code/processes/loading-process/build.gradle +++ b/code/processes/loading-process/build.gradle @@ -31,7 +31,7 @@ dependencies { implementation project(':code:features-index:lexicon') implementation project(':code:features-index:index-journal') implementation project(':code:libraries:language-processing') - + implementation project(':third-party:commons-codec') testImplementation 
project(':code:services-core:search-service') implementation project(':code:process-models:crawling-model') diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/IndexLoadKeywords.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/IndexLoadKeywords.java index dd627f85..7374c0a3 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/IndexLoadKeywords.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/IndexLoadKeywords.java @@ -16,7 +16,7 @@ public class IndexLoadKeywords implements Runnable { private static final Logger logger = LoggerFactory.getLogger(IndexLoadKeywords.class); private final LinkedBlockingQueue insertQueue = new LinkedBlockingQueue<>(32); - private final LoaderIndexJournalWriter client; + private final LoaderIndexJournalWriter journalWriter; private record InsertTask(int urlId, int domainId, DocumentMetadata metadata, DocumentKeywords wordSet) {} @@ -25,8 +25,8 @@ public class IndexLoadKeywords implements Runnable { private volatile boolean canceled = false; @Inject - public IndexLoadKeywords(LoaderIndexJournalWriter client) { - this.client = client; + public IndexLoadKeywords(LoaderIndexJournalWriter journalWriter) { + this.journalWriter = journalWriter; runThread = new Thread(this, getClass().getSimpleName()); runThread.start(); } @@ -36,7 +36,7 @@ public class IndexLoadKeywords implements Runnable { while (!canceled) { var data = insertQueue.poll(1, TimeUnit.SECONDS); if (data != null) { - client.putWords(new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), data.metadata(), data.wordSet); + journalWriter.putWords(new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), data.metadata(), data.wordSet); } } } @@ -45,7 +45,7 @@ public class IndexLoadKeywords implements Runnable { if (!canceled) { canceled = true; runThread.join(); - client.close(); + journalWriter.close(); } } diff --git 
a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java index 14962f9b..87b00192 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java @@ -2,6 +2,7 @@ package nu.marginalia.loading.loader; import com.google.inject.Inject; import com.google.inject.Singleton; +import lombok.SneakyThrows; import nu.marginalia.db.storage.FileStorageService; import nu.marginalia.db.storage.model.FileStorageType; import nu.marginalia.dict.OffHeapDictionaryHashMap; @@ -25,6 +26,7 @@ import java.nio.file.Files; import java.nio.file.attribute.PosixFilePermissions; import java.sql.SQLException; import java.util.Arrays; +import java.util.concurrent.*; @Singleton public class LoaderIndexJournalWriter { @@ -51,6 +53,12 @@ public class LoaderIndexJournalWriter { indexWriter = new IndexJournalWriterImpl(lexicon, indexPath); } + private final LinkedBlockingQueue keywordInsertTaskQueue = + new LinkedBlockingQueue<>(65536); + private final ExecutorService keywordInsertionExecutor = + new ThreadPoolExecutor(8, 16, 1, TimeUnit.MINUTES, keywordInsertTaskQueue); + + @SneakyThrows public void putWords(EdgeId domain, EdgeId url, DocumentMetadata metadata, DocumentKeywords wordSet) { @@ -62,16 +70,29 @@ public class LoaderIndexJournalWriter { return; } + // Due to the very bursty access patterns of this method, doing the actual insertions in separate threads + // with a chonky work queue is a fairly decent improvement for (var chunk : KeywordListChunker.chopList(wordSet, IndexJournalEntryData.MAX_LENGTH)) { - - var entry = new IndexJournalEntryData(getOrInsertWordIds(chunk.keywords(), chunk.metadata())); - var header = new IndexJournalEntryHeader(domain, url, metadata.encode()); - - 
indexWriter.put(header, entry); + try { + keywordInsertionExecutor.execute(() -> loadWords(domain, url, metadata, chunk)); /* execute(), not submit(): the Future returned by submit() was discarded, which silently swallowed any exception thrown by loadWords */ + } + catch (RejectedExecutionException ex) { + loadWords(domain, url, metadata, chunk); + } } } + private void loadWords(EdgeId domain, + EdgeId url, + DocumentMetadata metadata, + DocumentKeywords wordSet) { + var entry = new IndexJournalEntryData(getOrInsertWordIds(wordSet.keywords(), wordSet.metadata())); + var header = new IndexJournalEntryHeader(domain, url, metadata.encode()); + + indexWriter.put(header, entry); + } + private long[] getOrInsertWordIds(String[] words, long[] meta) { long[] ids = new long[words.length*2]; int putIdx = 0; @@ -93,6 +114,10 @@ public class LoaderIndexJournalWriter { } public void close() throws Exception { + keywordInsertionExecutor.shutdown(); + while (!keywordInsertionExecutor.awaitTermination(1, TimeUnit.DAYS)) { + // ...? + } indexWriter.close(); lexicon.close(); } diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadUrls.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadUrls.java index 922baf91..4ef1509e 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadUrls.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadUrls.java @@ -1,15 +1,13 @@ package nu.marginalia.loading.loader; -import com.google.common.hash.HashFunction; -import com.google.common.hash.Hashing; import com.google.inject.Inject; import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.hash.MurmurHash3_128; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.nio.charset.StandardCharsets; import java.sql.SQLException; import java.sql.Types; import java.util.HashSet; @@ -26,6 +24,7 @@ public class SqlLoadUrls { public SqlLoadUrls(HikariDataSource dataSource) { this.dataSource =
dataSource; } + private final MurmurHash3_128 murmurHash = new MurmurHash3_128(); public void load(LoaderData data, EdgeUrl[] urls) { Set affectedDomains = new HashSet<>(); @@ -52,6 +51,7 @@ public class SqlLoadUrls { for (var url : urls) { if (data.getUrlId(url) != 0) continue; + if (url.path.length() >= 255) { logger.info("Skipping bad URL {}", url); continue; @@ -114,16 +114,16 @@ public class SqlLoadUrls { } } - private static final HashFunction murmur3_128 = Hashing.murmur3_128(); + /* We use a uniqueness constraint on DOMAIN_ID and this hash instead of on the PATH and PARAM + * fields as the uniqueness index grows absurdly large for some reason, possibly due to the prevalent + * shared leading substrings in paths? + */ private long hashPath(String path, String queryParam) { - long pathHash = murmur3_128.hashString(path, StandardCharsets.UTF_8).padToLong(); - - if (queryParam == null) { - return pathHash; - } - else { - return pathHash + murmur3_128.hashString(queryParam, StandardCharsets.UTF_8).padToLong(); + long hash = murmurHash.hashNearlyASCII(path); + if (queryParam != null) { + hash ^= murmurHash.hashNearlyASCII(queryParam); } + return hash; } /** Loads urlIDs for the domain into `data` from the database, starting at URL ID minId. */ @@ -131,11 +131,11 @@ public class SqlLoadUrls { try (var conn = dataSource.getConnection(); var queryCall = conn.prepareStatement("SELECT ID, PROTO, PATH, PARAM FROM EC_URL WHERE DOMAIN_ID=? 
AND ID > ?")) { + queryCall.setFetchSize(1000); queryCall.setInt(1, data.getDomainId(domain)); queryCall.setInt(2, minId); var rsp = queryCall.executeQuery(); - rsp.setFetchSize(1000); while (rsp.next()) { int urlId = rsp.getInt(1); From b08e302dd52a87bb9290e7117414d8589fbf37aa Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 1 Aug 2023 15:01:48 +0200 Subject: [PATCH 121/157] (lexicon) Optimize lexicon by using Murmur3_128's hash function --- code/features-index/lexicon/build.gradle | 1 + .../java/nu/marginalia/lexicon/KeywordLexicon.java | 12 ++++++------ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/code/features-index/lexicon/build.gradle b/code/features-index/lexicon/build.gradle index 18da060e..131b0bf6 100644 --- a/code/features-index/lexicon/build.gradle +++ b/code/features-index/lexicon/build.gradle @@ -22,6 +22,7 @@ dependencies { implementation libs.prometheus implementation libs.guava implementation libs.fastutil + implementation project(':third-party:commons-codec') testImplementation libs.bundles.slf4j.test testImplementation libs.bundles.junit diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexicon.java b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexicon.java index 4929b9c1..84507511 100644 --- a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexicon.java +++ b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexicon.java @@ -1,10 +1,9 @@ package nu.marginalia.lexicon; -import com.google.common.hash.HashFunction; -import com.google.common.hash.Hashing; import io.prometheus.client.Gauge; import lombok.SneakyThrows; import nu.marginalia.dict.DictionaryMap; +import nu.marginalia.hash.MurmurHash3_128; import nu.marginalia.lexicon.journal.KeywordLexiconJournal; import nu.marginalia.lexicon.journal.KeywordLexiconJournalFingerprint; import org.slf4j.Logger; @@ -37,7 +36,6 @@ public class KeywordLexicon implements 
AutoCloseable { private final Logger logger = LoggerFactory.getLogger(getClass()); private static final AtomicInteger instances = new AtomicInteger(); - private final HashFunction hashFunction = Hashing.murmur3_128(); private static final Gauge request_time_metrics = Gauge.build("wmsa_edge_index_dictionary_size", "Dictionary Size") @@ -46,6 +44,8 @@ public class KeywordLexicon implements AutoCloseable { private volatile KeywordLexiconJournalFingerprint fingerprint = null; + private final MurmurHash3_128 hasher = new MurmurHash3_128(); + @SneakyThrows public KeywordLexicon(KeywordLexiconJournal keywordLexiconJournal) { @@ -74,7 +74,7 @@ public class KeywordLexicon implements AutoCloseable { lock.lock(); try { reverseIndex.clear(); - journal.loadFile(bytes -> reverseIndex.put(hashFunction.hashBytes(bytes).padToLong())); + journal.loadFile(bytes -> reverseIndex.put(hasher.hash(bytes))); fingerprint = journal.journalFingerprint(); } finally { @@ -95,7 +95,7 @@ public class KeywordLexicon implements AutoCloseable { return DictionaryMap.NO_VALUE; } - final long key = hashFunction.hashBytes(bytes).padToLong(); + final long key = hasher.hash(bytes); int idx = getReadOnly(key); @@ -130,7 +130,7 @@ public class KeywordLexicon implements AutoCloseable { /** Get method that does not modify the lexicon if the word is not present */ public int getReadOnly(String word) { final byte[] bytes = word.getBytes(StandardCharsets.UTF_8); - return getReadOnly(hashFunction.hashBytes(bytes).padToLong()); + return getReadOnly(hasher.hash(bytes)); } /** Get method that does not modify the lexicon if the word is not present */ From c1ea60b3994e315489a942d8048bb366fcdbdb61 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 1 Aug 2023 15:05:04 +0200 Subject: [PATCH 122/157] (db) Default values for storage base --- .../resources/sql/current/13-file-storage.sql | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git 
a/code/common/db/src/main/resources/sql/current/13-file-storage.sql b/code/common/db/src/main/resources/sql/current/13-file-storage.sql index b2063fc8..4c4be35d 100644 --- a/code/common/db/src/main/resources/sql/current/13-file-storage.sql +++ b/code/common/db/src/main/resources/sql/current/13-file-storage.sql @@ -41,3 +41,28 @@ AS SELECT BASE.ID AS BASE_ID FROM FILE_STORAGE STORAGE INNER JOIN FILE_STORAGE_BASE BASE ON STORAGE.BASE_ID=BASE.ID; + +INSERT INTO FILE_STORAGE_BASE(NAME, PATH, TYPE, PERMIT_TEMP) +VALUES +('Index Storage', '/vol', 'SSD_FAST', false), +('Data Storage', '/samples', 'SLOW', false); + +INSERT INTO FILE_STORAGE(BASE_ID, PATH, DESCRIPTION, TYPE) +SELECT ID, 'iw', "Index Staging Area", 'INDEX_STAGING' +FROM FILE_STORAGE_BASE WHERE NAME='Index Storage'; /* NAME must match the 'Index Storage' base row inserted above, otherwise the SELECT yields no rows and nothing is inserted */ + +INSERT INTO FILE_STORAGE(BASE_ID, PATH, DESCRIPTION, TYPE) +SELECT ID, 'ir', "Index Live Area", 'INDEX_LIVE' +FROM FILE_STORAGE_BASE WHERE NAME='Index Storage'; + +INSERT INTO FILE_STORAGE(BASE_ID, PATH, DESCRIPTION, TYPE) +SELECT ID, 'lw', "Lexicon Staging Area", 'LEXICON_STAGING' +FROM FILE_STORAGE_BASE WHERE NAME='Index Storage'; + +INSERT INTO FILE_STORAGE(BASE_ID, PATH, DESCRIPTION, TYPE) +SELECT ID, 'lr', "Lexicon Live Area", 'LEXICON_LIVE' +FROM FILE_STORAGE_BASE WHERE NAME='Index Storage'; + +INSERT INTO FILE_STORAGE(BASE_ID, PATH, DESCRIPTION, TYPE) +SELECT ID, 'ss', "Search Sets", 'SEARCH_SETS' +FROM FILE_STORAGE_BASE WHERE NAME='Index Storage'; \ No newline at end of file From 36a23707c111e36671425b7eaea6342819cdbc83 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 1 Aug 2023 15:49:50 +0200 Subject: [PATCH 123/157] (control) Control service should be a core service.
--- .../control-service/build.gradle | 0 code/services-core/control-service/readme.md | 9 +++++++++ .../src/main/java/nu/marginalia/control/ControlMain.java | 0 .../java/nu/marginalia/control/ControlProcessModule.java | 0 .../main/java/nu/marginalia/control/ControlService.java | 0 .../main/java/nu/marginalia/control/HtmlRedirect.java | 0 .../java/nu/marginalia/control/actor/ControlActors.java | 0 .../actor/monitor/AbstractProcessSpawnerActor.java | 0 .../control/actor/monitor/ConverterMonitorActor.java | 0 .../control/actor/monitor/CrawlerMonitorActor.java | 0 .../control/actor/monitor/FileStorageMonitorActor.java | 0 .../control/actor/monitor/LoaderMonitorActor.java | 0 .../control/actor/monitor/MessageQueueMonitorActor.java | 0 .../actor/monitor/ProcessLivenessMonitorActor.java | 0 .../control/actor/task/ActorProcessWatcher.java | 0 .../nu/marginalia/control/actor/task/CrawlActor.java | 0 .../control/actor/task/CrawlJobExtractorActor.java | 0 .../control/actor/task/ReconvertAndLoadActor.java | 0 .../nu/marginalia/control/actor/task/RecrawlActor.java | 0 .../actor/task/TriggerAdjacencyCalculationActor.java | 0 .../src/main/java/nu/marginalia/control/model/Actor.java | 0 .../java/nu/marginalia/control/model/ActorRunState.java | 0 .../java/nu/marginalia/control/model/ActorState.java | 0 .../nu/marginalia/control/model/ActorStateGraph.java | 0 .../java/nu/marginalia/control/model/EventLogEntry.java | 0 .../control/model/FileStorageBaseWithStorage.java | 0 .../marginalia/control/model/FileStorageFileModel.java | 0 .../marginalia/control/model/FileStorageWithActions.java | 0 .../control/model/FileStorageWithRelatedEntries.java | 0 .../nu/marginalia/control/model/MessageQueueEntry.java | 0 .../nu/marginalia/control/model/ProcessHeartbeat.java | 0 .../nu/marginalia/control/model/ServiceHeartbeat.java | 0 .../nu/marginalia/control/svc/ControlActorService.java | 0 .../control/svc/ControlFileStorageService.java | 0 .../java/nu/marginalia/control/svc/EventLogService.java | 0 
.../java/nu/marginalia/control/svc/HeartbeatService.java | 0 .../marginalia/control/svc/MessageQueueViewService.java | 0 .../nu/marginalia/control/svc/ProcessOutboxFactory.java | 0 .../java/nu/marginalia/control/svc/ProcessService.java | 0 .../src/main/resources/static/control/refresh.js | 0 .../src/main/resources/static/control/style.css | 0 .../main/resources/templates/control/actor-details.hdb | 0 .../src/main/resources/templates/control/actors.hdb | 0 .../templates/control/dialog-update-message-state.hdb | 0 .../src/main/resources/templates/control/index.hdb | 0 .../templates/control/partials/actor-state-graph.hdb | 0 .../templates/control/partials/actors-table.hdb | 0 .../templates/control/partials/events-table.hdb | 0 .../templates/control/partials/message-queue-table.hdb | 0 .../main/resources/templates/control/partials/nav.hdb | 0 .../templates/control/partials/processes-table.hdb | 0 .../templates/control/partials/services-table.hdb | 0 .../templates/control/partials/storage-table.hdb | 0 .../templates/control/partials/storage-types.hdb | 0 .../main/resources/templates/control/service-by-id.hdb | 0 .../src/main/resources/templates/control/services.hdb | 0 .../main/resources/templates/control/storage-crawls.hdb | 0 .../main/resources/templates/control/storage-details.hdb | 0 .../resources/templates/control/storage-overview.hdb | 0 .../resources/templates/control/storage-processed.hdb | 0 .../main/resources/templates/control/storage-specs.hdb | 0 code/services-core/readme.md | 3 +++ settings.gradle | 2 +- 63 files changed, 13 insertions(+), 1 deletion(-) rename code/{services-satellite => services-core}/control-service/build.gradle (100%) create mode 100644 code/services-core/control-service/readme.md rename code/{services-satellite => services-core}/control-service/src/main/java/nu/marginalia/control/ControlMain.java (100%) rename code/{services-satellite => services-core}/control-service/src/main/java/nu/marginalia/control/ControlProcessModule.java (100%) 
rename code/{services-satellite => services-core}/control-service/src/main/java/nu/marginalia/control/ControlService.java (100%) rename code/{services-satellite => services-core}/control-service/src/main/java/nu/marginalia/control/HtmlRedirect.java (100%) rename code/{services-satellite => services-core}/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java (100%) rename code/{services-satellite => services-core}/control-service/src/main/java/nu/marginalia/control/actor/monitor/AbstractProcessSpawnerActor.java (100%) rename code/{services-satellite => services-core}/control-service/src/main/java/nu/marginalia/control/actor/monitor/ConverterMonitorActor.java (100%) rename code/{services-satellite => services-core}/control-service/src/main/java/nu/marginalia/control/actor/monitor/CrawlerMonitorActor.java (100%) rename code/{services-satellite => services-core}/control-service/src/main/java/nu/marginalia/control/actor/monitor/FileStorageMonitorActor.java (100%) rename code/{services-satellite => services-core}/control-service/src/main/java/nu/marginalia/control/actor/monitor/LoaderMonitorActor.java (100%) rename code/{services-satellite => services-core}/control-service/src/main/java/nu/marginalia/control/actor/monitor/MessageQueueMonitorActor.java (100%) rename code/{services-satellite => services-core}/control-service/src/main/java/nu/marginalia/control/actor/monitor/ProcessLivenessMonitorActor.java (100%) rename code/{services-satellite => services-core}/control-service/src/main/java/nu/marginalia/control/actor/task/ActorProcessWatcher.java (100%) rename code/{services-satellite => services-core}/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlActor.java (100%) rename code/{services-satellite => services-core}/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlJobExtractorActor.java (100%) rename code/{services-satellite => 
services-core}/control-service/src/main/java/nu/marginalia/control/actor/task/ReconvertAndLoadActor.java (100%) rename code/{services-satellite => services-core}/control-service/src/main/java/nu/marginalia/control/actor/task/RecrawlActor.java (100%) rename code/{services-satellite => services-core}/control-service/src/main/java/nu/marginalia/control/actor/task/TriggerAdjacencyCalculationActor.java (100%) rename code/{services-satellite => services-core}/control-service/src/main/java/nu/marginalia/control/model/Actor.java (100%) rename code/{services-satellite => services-core}/control-service/src/main/java/nu/marginalia/control/model/ActorRunState.java (100%) rename code/{services-satellite => services-core}/control-service/src/main/java/nu/marginalia/control/model/ActorState.java (100%) rename code/{services-satellite => services-core}/control-service/src/main/java/nu/marginalia/control/model/ActorStateGraph.java (100%) rename code/{services-satellite => services-core}/control-service/src/main/java/nu/marginalia/control/model/EventLogEntry.java (100%) rename code/{services-satellite => services-core}/control-service/src/main/java/nu/marginalia/control/model/FileStorageBaseWithStorage.java (100%) rename code/{services-satellite => services-core}/control-service/src/main/java/nu/marginalia/control/model/FileStorageFileModel.java (100%) rename code/{services-satellite => services-core}/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithActions.java (100%) rename code/{services-satellite => services-core}/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithRelatedEntries.java (100%) rename code/{services-satellite => services-core}/control-service/src/main/java/nu/marginalia/control/model/MessageQueueEntry.java (100%) rename code/{services-satellite => services-core}/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java (100%) rename code/{services-satellite => 
services-core}/control-service/src/main/java/nu/marginalia/control/model/ServiceHeartbeat.java (100%) rename code/{services-satellite => services-core}/control-service/src/main/java/nu/marginalia/control/svc/ControlActorService.java (100%) rename code/{services-satellite => services-core}/control-service/src/main/java/nu/marginalia/control/svc/ControlFileStorageService.java (100%) rename code/{services-satellite => services-core}/control-service/src/main/java/nu/marginalia/control/svc/EventLogService.java (100%) rename code/{services-satellite => services-core}/control-service/src/main/java/nu/marginalia/control/svc/HeartbeatService.java (100%) rename code/{services-satellite => services-core}/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueViewService.java (100%) rename code/{services-satellite => services-core}/control-service/src/main/java/nu/marginalia/control/svc/ProcessOutboxFactory.java (100%) rename code/{services-satellite => services-core}/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java (100%) rename code/{services-satellite => services-core}/control-service/src/main/resources/static/control/refresh.js (100%) rename code/{services-satellite => services-core}/control-service/src/main/resources/static/control/style.css (100%) rename code/{services-satellite => services-core}/control-service/src/main/resources/templates/control/actor-details.hdb (100%) rename code/{services-satellite => services-core}/control-service/src/main/resources/templates/control/actors.hdb (100%) rename code/{services-satellite => services-core}/control-service/src/main/resources/templates/control/dialog-update-message-state.hdb (100%) rename code/{services-satellite => services-core}/control-service/src/main/resources/templates/control/index.hdb (100%) rename code/{services-satellite => services-core}/control-service/src/main/resources/templates/control/partials/actor-state-graph.hdb (100%) rename code/{services-satellite => 
services-core}/control-service/src/main/resources/templates/control/partials/actors-table.hdb (100%) rename code/{services-satellite => services-core}/control-service/src/main/resources/templates/control/partials/events-table.hdb (100%) rename code/{services-satellite => services-core}/control-service/src/main/resources/templates/control/partials/message-queue-table.hdb (100%) rename code/{services-satellite => services-core}/control-service/src/main/resources/templates/control/partials/nav.hdb (100%) rename code/{services-satellite => services-core}/control-service/src/main/resources/templates/control/partials/processes-table.hdb (100%) rename code/{services-satellite => services-core}/control-service/src/main/resources/templates/control/partials/services-table.hdb (100%) rename code/{services-satellite => services-core}/control-service/src/main/resources/templates/control/partials/storage-table.hdb (100%) rename code/{services-satellite => services-core}/control-service/src/main/resources/templates/control/partials/storage-types.hdb (100%) rename code/{services-satellite => services-core}/control-service/src/main/resources/templates/control/service-by-id.hdb (100%) rename code/{services-satellite => services-core}/control-service/src/main/resources/templates/control/services.hdb (100%) rename code/{services-satellite => services-core}/control-service/src/main/resources/templates/control/storage-crawls.hdb (100%) rename code/{services-satellite => services-core}/control-service/src/main/resources/templates/control/storage-details.hdb (100%) rename code/{services-satellite => services-core}/control-service/src/main/resources/templates/control/storage-overview.hdb (100%) rename code/{services-satellite => services-core}/control-service/src/main/resources/templates/control/storage-processed.hdb (100%) rename code/{services-satellite => services-core}/control-service/src/main/resources/templates/control/storage-specs.hdb (100%) diff --git 
a/code/services-satellite/control-service/build.gradle b/code/services-core/control-service/build.gradle similarity index 100% rename from code/services-satellite/control-service/build.gradle rename to code/services-core/control-service/build.gradle diff --git a/code/services-core/control-service/readme.md b/code/services-core/control-service/readme.md new file mode 100644 index 00000000..c775d365 --- /dev/null +++ b/code/services-core/control-service/readme.md @@ -0,0 +1,9 @@ +# Control Service + +The control service provides an operator's user interface, and is responsible for orchestrating the various processes of the system. + + +## See Also + +* [processes](../../processes) +* [common/message-queue](../../common/message-queue) - The Message Queue and MQFSM abstractions \ No newline at end of file diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlMain.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlMain.java similarity index 100% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlMain.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/ControlMain.java diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlProcessModule.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlProcessModule.java similarity index 100% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlProcessModule.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/ControlProcessModule.java diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java similarity index 100% rename from 
code/services-satellite/control-service/src/main/java/nu/marginalia/control/ControlService.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/HtmlRedirect.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/HtmlRedirect.java similarity index 100% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/HtmlRedirect.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/HtmlRedirect.java diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java similarity index 100% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/AbstractProcessSpawnerActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/AbstractProcessSpawnerActor.java similarity index 100% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/AbstractProcessSpawnerActor.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/AbstractProcessSpawnerActor.java diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/ConverterMonitorActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/ConverterMonitorActor.java similarity index 100% rename from 
code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/ConverterMonitorActor.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/ConverterMonitorActor.java diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/CrawlerMonitorActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/CrawlerMonitorActor.java similarity index 100% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/CrawlerMonitorActor.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/CrawlerMonitorActor.java diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/FileStorageMonitorActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/FileStorageMonitorActor.java similarity index 100% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/FileStorageMonitorActor.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/FileStorageMonitorActor.java diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/LoaderMonitorActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/LoaderMonitorActor.java similarity index 100% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/LoaderMonitorActor.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/LoaderMonitorActor.java diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/MessageQueueMonitorActor.java 
b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/MessageQueueMonitorActor.java similarity index 100% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/MessageQueueMonitorActor.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/MessageQueueMonitorActor.java diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/ProcessLivenessMonitorActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/ProcessLivenessMonitorActor.java similarity index 100% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/monitor/ProcessLivenessMonitorActor.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/ProcessLivenessMonitorActor.java diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/ActorProcessWatcher.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ActorProcessWatcher.java similarity index 100% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/ActorProcessWatcher.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ActorProcessWatcher.java diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlActor.java similarity index 100% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlActor.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlActor.java diff --git 
a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlJobExtractorActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlJobExtractorActor.java similarity index 100% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlJobExtractorActor.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlJobExtractorActor.java diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/ReconvertAndLoadActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ReconvertAndLoadActor.java similarity index 100% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/ReconvertAndLoadActor.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ReconvertAndLoadActor.java diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/RecrawlActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/RecrawlActor.java similarity index 100% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/RecrawlActor.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/RecrawlActor.java diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/TriggerAdjacencyCalculationActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/TriggerAdjacencyCalculationActor.java similarity index 100% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/actor/task/TriggerAdjacencyCalculationActor.java rename to 
code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/TriggerAdjacencyCalculationActor.java diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/Actor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/Actor.java similarity index 100% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/Actor.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/model/Actor.java diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ActorRunState.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/ActorRunState.java similarity index 100% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ActorRunState.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/model/ActorRunState.java diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ActorState.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/ActorState.java similarity index 100% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ActorState.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/model/ActorState.java diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ActorStateGraph.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/ActorStateGraph.java similarity index 100% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ActorStateGraph.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/model/ActorStateGraph.java diff --git 
a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/EventLogEntry.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/EventLogEntry.java similarity index 100% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/EventLogEntry.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/model/EventLogEntry.java diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageBaseWithStorage.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/FileStorageBaseWithStorage.java similarity index 100% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageBaseWithStorage.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/model/FileStorageBaseWithStorage.java diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageFileModel.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/FileStorageFileModel.java similarity index 100% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageFileModel.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/model/FileStorageFileModel.java diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithActions.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithActions.java similarity index 100% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithActions.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithActions.java diff --git 
a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithRelatedEntries.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithRelatedEntries.java similarity index 100% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithRelatedEntries.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithRelatedEntries.java diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/MessageQueueEntry.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/MessageQueueEntry.java similarity index 100% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/MessageQueueEntry.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/model/MessageQueueEntry.java diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java similarity index 100% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ServiceHeartbeat.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/ServiceHeartbeat.java similarity index 100% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/model/ServiceHeartbeat.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/model/ServiceHeartbeat.java diff --git 
a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlActorService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActorService.java similarity index 100% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlActorService.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActorService.java diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlFileStorageService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlFileStorageService.java similarity index 100% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ControlFileStorageService.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlFileStorageService.java diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/EventLogService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/EventLogService.java similarity index 100% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/EventLogService.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/svc/EventLogService.java diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/HeartbeatService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/HeartbeatService.java similarity index 100% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/HeartbeatService.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/svc/HeartbeatService.java diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueViewService.java 
b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueViewService.java similarity index 100% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueViewService.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueViewService.java diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessOutboxFactory.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ProcessOutboxFactory.java similarity index 100% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessOutboxFactory.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ProcessOutboxFactory.java diff --git a/code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java similarity index 100% rename from code/services-satellite/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java diff --git a/code/services-satellite/control-service/src/main/resources/static/control/refresh.js b/code/services-core/control-service/src/main/resources/static/control/refresh.js similarity index 100% rename from code/services-satellite/control-service/src/main/resources/static/control/refresh.js rename to code/services-core/control-service/src/main/resources/static/control/refresh.js diff --git a/code/services-satellite/control-service/src/main/resources/static/control/style.css b/code/services-core/control-service/src/main/resources/static/control/style.css similarity index 100% rename from code/services-satellite/control-service/src/main/resources/static/control/style.css rename to 
code/services-core/control-service/src/main/resources/static/control/style.css diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/actor-details.hdb b/code/services-core/control-service/src/main/resources/templates/control/actor-details.hdb similarity index 100% rename from code/services-satellite/control-service/src/main/resources/templates/control/actor-details.hdb rename to code/services-core/control-service/src/main/resources/templates/control/actor-details.hdb diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/actors.hdb b/code/services-core/control-service/src/main/resources/templates/control/actors.hdb similarity index 100% rename from code/services-satellite/control-service/src/main/resources/templates/control/actors.hdb rename to code/services-core/control-service/src/main/resources/templates/control/actors.hdb diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/dialog-update-message-state.hdb b/code/services-core/control-service/src/main/resources/templates/control/dialog-update-message-state.hdb similarity index 100% rename from code/services-satellite/control-service/src/main/resources/templates/control/dialog-update-message-state.hdb rename to code/services-core/control-service/src/main/resources/templates/control/dialog-update-message-state.hdb diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/index.hdb b/code/services-core/control-service/src/main/resources/templates/control/index.hdb similarity index 100% rename from code/services-satellite/control-service/src/main/resources/templates/control/index.hdb rename to code/services-core/control-service/src/main/resources/templates/control/index.hdb diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/partials/actor-state-graph.hdb 
b/code/services-core/control-service/src/main/resources/templates/control/partials/actor-state-graph.hdb similarity index 100% rename from code/services-satellite/control-service/src/main/resources/templates/control/partials/actor-state-graph.hdb rename to code/services-core/control-service/src/main/resources/templates/control/partials/actor-state-graph.hdb diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/partials/actors-table.hdb b/code/services-core/control-service/src/main/resources/templates/control/partials/actors-table.hdb similarity index 100% rename from code/services-satellite/control-service/src/main/resources/templates/control/partials/actors-table.hdb rename to code/services-core/control-service/src/main/resources/templates/control/partials/actors-table.hdb diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/partials/events-table.hdb b/code/services-core/control-service/src/main/resources/templates/control/partials/events-table.hdb similarity index 100% rename from code/services-satellite/control-service/src/main/resources/templates/control/partials/events-table.hdb rename to code/services-core/control-service/src/main/resources/templates/control/partials/events-table.hdb diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/partials/message-queue-table.hdb b/code/services-core/control-service/src/main/resources/templates/control/partials/message-queue-table.hdb similarity index 100% rename from code/services-satellite/control-service/src/main/resources/templates/control/partials/message-queue-table.hdb rename to code/services-core/control-service/src/main/resources/templates/control/partials/message-queue-table.hdb diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/partials/nav.hdb b/code/services-core/control-service/src/main/resources/templates/control/partials/nav.hdb similarity index 100% rename 
from code/services-satellite/control-service/src/main/resources/templates/control/partials/nav.hdb rename to code/services-core/control-service/src/main/resources/templates/control/partials/nav.hdb diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/partials/processes-table.hdb b/code/services-core/control-service/src/main/resources/templates/control/partials/processes-table.hdb similarity index 100% rename from code/services-satellite/control-service/src/main/resources/templates/control/partials/processes-table.hdb rename to code/services-core/control-service/src/main/resources/templates/control/partials/processes-table.hdb diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/partials/services-table.hdb b/code/services-core/control-service/src/main/resources/templates/control/partials/services-table.hdb similarity index 100% rename from code/services-satellite/control-service/src/main/resources/templates/control/partials/services-table.hdb rename to code/services-core/control-service/src/main/resources/templates/control/partials/services-table.hdb diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/partials/storage-table.hdb b/code/services-core/control-service/src/main/resources/templates/control/partials/storage-table.hdb similarity index 100% rename from code/services-satellite/control-service/src/main/resources/templates/control/partials/storage-table.hdb rename to code/services-core/control-service/src/main/resources/templates/control/partials/storage-table.hdb diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/partials/storage-types.hdb b/code/services-core/control-service/src/main/resources/templates/control/partials/storage-types.hdb similarity index 100% rename from code/services-satellite/control-service/src/main/resources/templates/control/partials/storage-types.hdb rename to 
code/services-core/control-service/src/main/resources/templates/control/partials/storage-types.hdb diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/service-by-id.hdb b/code/services-core/control-service/src/main/resources/templates/control/service-by-id.hdb similarity index 100% rename from code/services-satellite/control-service/src/main/resources/templates/control/service-by-id.hdb rename to code/services-core/control-service/src/main/resources/templates/control/service-by-id.hdb diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/services.hdb b/code/services-core/control-service/src/main/resources/templates/control/services.hdb similarity index 100% rename from code/services-satellite/control-service/src/main/resources/templates/control/services.hdb rename to code/services-core/control-service/src/main/resources/templates/control/services.hdb diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/storage-crawls.hdb b/code/services-core/control-service/src/main/resources/templates/control/storage-crawls.hdb similarity index 100% rename from code/services-satellite/control-service/src/main/resources/templates/control/storage-crawls.hdb rename to code/services-core/control-service/src/main/resources/templates/control/storage-crawls.hdb diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/storage-details.hdb b/code/services-core/control-service/src/main/resources/templates/control/storage-details.hdb similarity index 100% rename from code/services-satellite/control-service/src/main/resources/templates/control/storage-details.hdb rename to code/services-core/control-service/src/main/resources/templates/control/storage-details.hdb diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/storage-overview.hdb 
b/code/services-core/control-service/src/main/resources/templates/control/storage-overview.hdb similarity index 100% rename from code/services-satellite/control-service/src/main/resources/templates/control/storage-overview.hdb rename to code/services-core/control-service/src/main/resources/templates/control/storage-overview.hdb diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/storage-processed.hdb b/code/services-core/control-service/src/main/resources/templates/control/storage-processed.hdb similarity index 100% rename from code/services-satellite/control-service/src/main/resources/templates/control/storage-processed.hdb rename to code/services-core/control-service/src/main/resources/templates/control/storage-processed.hdb diff --git a/code/services-satellite/control-service/src/main/resources/templates/control/storage-specs.hdb b/code/services-core/control-service/src/main/resources/templates/control/storage-specs.hdb similarity index 100% rename from code/services-satellite/control-service/src/main/resources/templates/control/storage-specs.hdb rename to code/services-core/control-service/src/main/resources/templates/control/storage-specs.hdb diff --git a/code/services-core/readme.md b/code/services-core/readme.md index 1af591d3..8a66f0c5 100644 --- a/code/services-core/readme.md +++ b/code/services-core/readme.md @@ -8,5 +8,8 @@ The cores services constitute the main functionality of the search engine. * The [index-service](index-service/) contains the indexes, it answers questions about which documents contain which terms. +* The [control-service](control-service/) provides an operator's user interface, and is responsible + for orchestrating the various processes of the system. + * The [assistant-service](assistant-service/) helps the search service with spelling suggestions other peripheral functionality. 
\ No newline at end of file diff --git a/settings.gradle b/settings.gradle index 62bc0f34..be5dd603 100644 --- a/settings.gradle +++ b/settings.gradle @@ -3,11 +3,11 @@ rootProject.name = 'marginalia.nu' include 'code:services-core:index-service' include 'code:services-core:assistant-service' include 'code:services-core:search-service' +include 'code:services-core:control-service' include 'code:services-satellite:api-service' include 'code:services-satellite:dating-service' include 'code:services-satellite:explorer-service' -include 'code:services-satellite:control-service' include 'code:libraries:array' include 'code:libraries:btree' From 2e29038ecd8d4ee42a7543fe3d2882d06c509ae2 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 1 Aug 2023 15:50:08 +0200 Subject: [PATCH 124/157] (db) Fix broken insert statement, move file storage defaults to a separate file. --- .../resources/sql/current/13-file-storage.sql | 25 ------------------- .../14-file-storage-default-values.sql | 24 ++++++++++++++++++ 2 files changed, 24 insertions(+), 25 deletions(-) create mode 100644 code/common/db/src/main/resources/sql/current/14-file-storage-default-values.sql diff --git a/code/common/db/src/main/resources/sql/current/13-file-storage.sql b/code/common/db/src/main/resources/sql/current/13-file-storage.sql index 4c4be35d..b2063fc8 100644 --- a/code/common/db/src/main/resources/sql/current/13-file-storage.sql +++ b/code/common/db/src/main/resources/sql/current/13-file-storage.sql @@ -41,28 +41,3 @@ AS SELECT BASE.ID AS BASE_ID FROM FILE_STORAGE STORAGE INNER JOIN FILE_STORAGE_BASE BASE ON STORAGE.BASE_ID=BASE.ID; - -INSERT INTO FILE_STORAGE_BASE(NAME, PATH, TYPE, PERMIT_TEMP) -VALUES -('Index Storage', '/vol', 'SSD_FAST', false), -('Data Storage', '/samples', 'SLOW', false); - -INSERT INTO FILE_STORAGE(BASE_ID, PATH, DESCRIPTION, TYPE) -SELECT ID, 'iw', "Index Staging Area", 'INDEX_STAGING' -FROM FILE_STORAGE_BASE WHERE NAME='IndexData'; - -INSERT INTO FILE_STORAGE(BASE_ID, PATH, 
DESCRIPTION, TYPE) -SELECT ID, 'ir', "Index Live Area", 'INDEX_LIVE' -FROM FILE_STORAGE_BASE WHERE NAME='IndexData'; - -INSERT INTO FILE_STORAGE(BASE_ID, PATH, DESCRIPTION, TYPE) -SELECT ID, 'lw', "Lexicon Staging Area", 'LEXICON_STAGING' -FROM FILE_STORAGE_BASE WHERE NAME='IndexData'; - -INSERT INTO FILE_STORAGE(BASE_ID, PATH, DESCRIPTION, TYPE) -SELECT ID, 'lr', "Lexicon Live Area", 'LEXICON_LIVE' -FROM FILE_STORAGE_BASE WHERE NAME='IndexData'; - -INSERT INTO FILE_STORAGE(BASE_ID, PATH, DESCRIPTION, TYPE) -SELECT ID, 'ss', "Search Sets", 'SEARCH_SETS' -FROM FILE_STORAGE_BASE WHERE NAME='IndexData'; \ No newline at end of file diff --git a/code/common/db/src/main/resources/sql/current/14-file-storage-default-values.sql b/code/common/db/src/main/resources/sql/current/14-file-storage-default-values.sql new file mode 100644 index 00000000..e82d5901 --- /dev/null +++ b/code/common/db/src/main/resources/sql/current/14-file-storage-default-values.sql @@ -0,0 +1,24 @@ +INSERT IGNORE INTO FILE_STORAGE_BASE(NAME, PATH, TYPE, PERMIT_TEMP) +VALUES +('Index Storage', '/vol', 'SSD_INDEX', false), +('Data Storage', '/samples', 'SLOW', false); + +INSERT IGNORE INTO FILE_STORAGE(BASE_ID, PATH, DESCRIPTION, TYPE) +SELECT ID, 'iw', "Index Staging Area", 'INDEX_STAGING' +FROM FILE_STORAGE_BASE WHERE NAME='IndexData'; + +INSERT IGNORE INTO FILE_STORAGE(BASE_ID, PATH, DESCRIPTION, TYPE) +SELECT ID, 'ir', "Index Live Area", 'INDEX_LIVE' +FROM FILE_STORAGE_BASE WHERE NAME='IndexData'; + +INSERT IGNORE INTO FILE_STORAGE(BASE_ID, PATH, DESCRIPTION, TYPE) +SELECT ID, 'lw', "Lexicon Staging Area", 'LEXICON_STAGING' +FROM FILE_STORAGE_BASE WHERE NAME='IndexData'; + +INSERT IGNORE INTO FILE_STORAGE(BASE_ID, PATH, DESCRIPTION, TYPE) +SELECT ID, 'lr', "Lexicon Live Area", 'LEXICON_LIVE' +FROM FILE_STORAGE_BASE WHERE NAME='IndexData'; + +INSERT IGNORE INTO FILE_STORAGE(BASE_ID, PATH, DESCRIPTION, TYPE) +SELECT ID, 'ss', "Search Sets", 'SEARCH_SETS' +FROM FILE_STORAGE_BASE WHERE NAME='IndexData'; 
\ No newline at end of file From 58556af6c7c6c233a0f1fa1bda31d6c01cb2e0ab Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 1 Aug 2023 17:08:42 +0200 Subject: [PATCH 125/157] (db) Use flyway for database migrations. --- code/common/db/build.gradle | 15 ++++ code/common/db/readme.md | 23 +++++- .../migration/V23_06_0_000__base.sql} | 0 .../migration/V23_06_0_001__blacklist.sql} | 0 .../migration/V23_06_0_002__dictionary.sql} | 0 .../migration/V23_06_0_003__crawl-queue.sql} | 0 .../migration/V23_06_0_004__screenshot.sql} | 0 .../V23_06_0_005__domain_complaint.sql} | 0 .../migration/V23_06_0_006__api_key.sql} | 0 .../migration/V23_06_0_007__neighbors.sql} | 0 .../V23_06_0_008__random_domains.sql} | 0 .../migration/V23_06_0_009__news_feed.sql} | 0 .../migration/V23_07_0_001__domain_type.sql} | 0 .../V23_07_0_002__service_status.sql} | 0 .../V23_07_0_003__message_queue.sql} | 0 .../migration/V23_07_0_004__file_storage.sql} | 0 ...07_0_005__file_storage_default_values.sql} | 10 +-- ...3_07_0_006__message_queue_default_jobs.sql | 7 ++ .../sql/migrations/00-news-items.sql | 76 ------------------- .../resources/sql/migrations/01-domain.sql | 1 - .../sql/migrations/03-service-status.sql | 27 ------- .../sql/migrations/04-message-queue.sql | 21 ----- .../nu/marginalia/db/DomainTypesTest.java | 2 +- .../db/storage/FileStorageServiceTest.java | 2 +- .../nu/marginalia/mq/outbox/MqOutboxTest.java | 2 +- .../mq/persistence/MqPersistenceTest.java | 2 +- .../mqsm/StateMachineErrorTest.java | 2 +- .../marginalia/mqsm/StateMachineNullTest.java | 2 +- .../mqsm/StateMachineResumeTest.java | 2 +- .../nu/marginalia/mqsm/StateMachineTest.java | 2 +- .../loader/SqlLoadDomainLinksTest.java | 2 +- .../marginalia/loader/SqlLoadDomainsTest.java | 2 +- .../loader/SqlLoadProcessedDocumentTest.java | 2 +- .../loader/SqlLoadProcessedDomainTest.java | 2 +- .../nu/marginalia/loader/SqlLoadUrlsTest.java | 2 +- .../api/svc/LicenseServiceTest.java | 3 +- run/readme.md | 39 +++------- 37 files
changed, 72 insertions(+), 176 deletions(-) rename code/common/db/src/main/resources/{sql/current/00-base.sql => db/migration/V23_06_0_000__base.sql} (100%) rename code/common/db/src/main/resources/{sql/current/01-blacklist.sql => db/migration/V23_06_0_001__blacklist.sql} (100%) rename code/common/db/src/main/resources/{sql/current/02-dictionary.sql => db/migration/V23_06_0_002__dictionary.sql} (100%) rename code/common/db/src/main/resources/{sql/current/03-crawl-queue.sql => db/migration/V23_06_0_003__crawl-queue.sql} (100%) rename code/common/db/src/main/resources/{sql/current/04-screenshot.sql => db/migration/V23_06_0_004__screenshot.sql} (100%) rename code/common/db/src/main/resources/{sql/current/05-domain-complaint.sql => db/migration/V23_06_0_005__domain_complaint.sql} (100%) rename code/common/db/src/main/resources/{sql/current/06-api-key.sql => db/migration/V23_06_0_006__api_key.sql} (100%) rename code/common/db/src/main/resources/{sql/current/07-neighbors.sql => db/migration/V23_06_0_007__neighbors.sql} (100%) rename code/common/db/src/main/resources/{sql/current/08-random-domains.sql => db/migration/V23_06_0_008__random_domains.sql} (100%) rename code/common/db/src/main/resources/{sql/current/09-news-feed.sql => db/migration/V23_06_0_009__news_feed.sql} (100%) rename code/common/db/src/main/resources/{sql/current/10-domain-type.sql => db/migration/V23_07_0_001__domain_type.sql} (100%) rename code/common/db/src/main/resources/{sql/current/11-service-status.sql => db/migration/V23_07_0_002__service_status.sql} (100%) rename code/common/db/src/main/resources/{sql/current/12-message-queue.sql => db/migration/V23_07_0_003__message_queue.sql} (100%) rename code/common/db/src/main/resources/{sql/current/13-file-storage.sql => db/migration/V23_07_0_004__file_storage.sql} (100%) rename code/common/db/src/main/resources/{sql/current/14-file-storage-default-values.sql => db/migration/V23_07_0_005__file_storage_default_values.sql} (75%) create mode 100644 
code/common/db/src/main/resources/db/migration/V23_07_0_006__message_queue_default_jobs.sql delete mode 100644 code/common/db/src/main/resources/sql/migrations/00-news-items.sql delete mode 100644 code/common/db/src/main/resources/sql/migrations/01-domain.sql delete mode 100644 code/common/db/src/main/resources/sql/migrations/03-service-status.sql delete mode 100644 code/common/db/src/main/resources/sql/migrations/04-message-queue.sql diff --git a/code/common/db/build.gradle b/code/common/db/build.gradle index b7e3f0ef..52f63895 100644 --- a/code/common/db/build.gradle +++ b/code/common/db/build.gradle @@ -2,6 +2,7 @@ plugins { id 'java' id "io.freefair.lombok" version "5.3.3.3" id 'jvm-test-suite' + id "org.flywaydb.flyway" version "8.2.0" } @@ -11,6 +12,10 @@ java { } } +configurations { + flywayMigration.extendsFrom(implementation) +} + dependencies { implementation project(':code:common:model') @@ -29,6 +34,7 @@ dependencies { implementation libs.rxjava implementation libs.bundles.mariadb + flywayMigration 'org.flywaydb:flyway-mysql:9.8.1' testImplementation libs.bundles.slf4j.test testImplementation libs.bundles.junit @@ -40,6 +46,15 @@ dependencies { testImplementation 'org.testcontainers:junit-jupiter:1.17.4' } +flyway { + + url = 'jdbc:mariadb://localhost:3306/WMSA_prod' + user = 'wmsa' + password = 'wmsa' + schemas = ['WMSA_prod'] + configurations = [ 'compileClasspath', 'flywayMigration' ] + locations = ['filesystem:src/main/resources/db/migration'] +} test { maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1 diff --git a/code/common/db/readme.md b/code/common/db/readme.md index 886153b6..ad62169d 100644 --- a/code/common/db/readme.md +++ b/code/common/db/readme.md @@ -2,10 +2,29 @@ This module primarily contains SQL files for the URLs database. The most central tables are `EC_DOMAIN`, `EC_URL` and `EC_PAGE_DATA`. 
+## Flyway + +The system uses flyway to track database changes and allow easy migrations, this is accessible via gradle tasks. + +* `flywayMigrate` +* `flywayBaseline` +* `flywayRepair` +* `flywayClean` (dangerous as in wipes your entire database) + +Refer to the [Flyway documentation](https://documentation.red-gate.com/fd/flyway-documentation-138346877.html) for guidance. +It's well documented and these are probably the only four tasks you'll ever need. + +If you are not running the system via docker, you need to provide alternative connection details than +the defaults (TODO: how?). + +The migration files are in [resources/db/migration](src/main/resources/db/migration). The file name convention +incorporates the project's cal-ver versioning; and are applied in lexicographical order. + + VYY_MM_v_nnn__description.sql + ## Central Paths -* [current](src/main/resources/sql/current) - The current database model -* [migrations](src/main/resources/sql/migrations) +* [migrations](src/main/resources/db/migration) - Flyway migrations ## See Also diff --git a/code/common/db/src/main/resources/sql/current/00-base.sql b/code/common/db/src/main/resources/db/migration/V23_06_0_000__base.sql similarity index 100% rename from code/common/db/src/main/resources/sql/current/00-base.sql rename to code/common/db/src/main/resources/db/migration/V23_06_0_000__base.sql diff --git a/code/common/db/src/main/resources/sql/current/01-blacklist.sql b/code/common/db/src/main/resources/db/migration/V23_06_0_001__blacklist.sql similarity index 100% rename from code/common/db/src/main/resources/sql/current/01-blacklist.sql rename to code/common/db/src/main/resources/db/migration/V23_06_0_001__blacklist.sql diff --git a/code/common/db/src/main/resources/sql/current/02-dictionary.sql b/code/common/db/src/main/resources/db/migration/V23_06_0_002__dictionary.sql similarity index 100% rename from code/common/db/src/main/resources/sql/current/02-dictionary.sql rename to 
code/common/db/src/main/resources/db/migration/V23_06_0_002__dictionary.sql diff --git a/code/common/db/src/main/resources/sql/current/03-crawl-queue.sql b/code/common/db/src/main/resources/db/migration/V23_06_0_003__crawl-queue.sql similarity index 100% rename from code/common/db/src/main/resources/sql/current/03-crawl-queue.sql rename to code/common/db/src/main/resources/db/migration/V23_06_0_003__crawl-queue.sql diff --git a/code/common/db/src/main/resources/sql/current/04-screenshot.sql b/code/common/db/src/main/resources/db/migration/V23_06_0_004__screenshot.sql similarity index 100% rename from code/common/db/src/main/resources/sql/current/04-screenshot.sql rename to code/common/db/src/main/resources/db/migration/V23_06_0_004__screenshot.sql diff --git a/code/common/db/src/main/resources/sql/current/05-domain-complaint.sql b/code/common/db/src/main/resources/db/migration/V23_06_0_005__domain_complaint.sql similarity index 100% rename from code/common/db/src/main/resources/sql/current/05-domain-complaint.sql rename to code/common/db/src/main/resources/db/migration/V23_06_0_005__domain_complaint.sql diff --git a/code/common/db/src/main/resources/sql/current/06-api-key.sql b/code/common/db/src/main/resources/db/migration/V23_06_0_006__api_key.sql similarity index 100% rename from code/common/db/src/main/resources/sql/current/06-api-key.sql rename to code/common/db/src/main/resources/db/migration/V23_06_0_006__api_key.sql diff --git a/code/common/db/src/main/resources/sql/current/07-neighbors.sql b/code/common/db/src/main/resources/db/migration/V23_06_0_007__neighbors.sql similarity index 100% rename from code/common/db/src/main/resources/sql/current/07-neighbors.sql rename to code/common/db/src/main/resources/db/migration/V23_06_0_007__neighbors.sql diff --git a/code/common/db/src/main/resources/sql/current/08-random-domains.sql b/code/common/db/src/main/resources/db/migration/V23_06_0_008__random_domains.sql similarity index 100% rename from 
code/common/db/src/main/resources/sql/current/08-random-domains.sql rename to code/common/db/src/main/resources/db/migration/V23_06_0_008__random_domains.sql diff --git a/code/common/db/src/main/resources/sql/current/09-news-feed.sql b/code/common/db/src/main/resources/db/migration/V23_06_0_009__news_feed.sql similarity index 100% rename from code/common/db/src/main/resources/sql/current/09-news-feed.sql rename to code/common/db/src/main/resources/db/migration/V23_06_0_009__news_feed.sql diff --git a/code/common/db/src/main/resources/sql/current/10-domain-type.sql b/code/common/db/src/main/resources/db/migration/V23_07_0_001__domain_type.sql similarity index 100% rename from code/common/db/src/main/resources/sql/current/10-domain-type.sql rename to code/common/db/src/main/resources/db/migration/V23_07_0_001__domain_type.sql diff --git a/code/common/db/src/main/resources/sql/current/11-service-status.sql b/code/common/db/src/main/resources/db/migration/V23_07_0_002__service_status.sql similarity index 100% rename from code/common/db/src/main/resources/sql/current/11-service-status.sql rename to code/common/db/src/main/resources/db/migration/V23_07_0_002__service_status.sql diff --git a/code/common/db/src/main/resources/sql/current/12-message-queue.sql b/code/common/db/src/main/resources/db/migration/V23_07_0_003__message_queue.sql similarity index 100% rename from code/common/db/src/main/resources/sql/current/12-message-queue.sql rename to code/common/db/src/main/resources/db/migration/V23_07_0_003__message_queue.sql diff --git a/code/common/db/src/main/resources/sql/current/13-file-storage.sql b/code/common/db/src/main/resources/db/migration/V23_07_0_004__file_storage.sql similarity index 100% rename from code/common/db/src/main/resources/sql/current/13-file-storage.sql rename to code/common/db/src/main/resources/db/migration/V23_07_0_004__file_storage.sql diff --git a/code/common/db/src/main/resources/sql/current/14-file-storage-default-values.sql 
b/code/common/db/src/main/resources/db/migration/V23_07_0_005__file_storage_default_values.sql similarity index 75% rename from code/common/db/src/main/resources/sql/current/14-file-storage-default-values.sql rename to code/common/db/src/main/resources/db/migration/V23_07_0_005__file_storage_default_values.sql index e82d5901..8d591965 100644 --- a/code/common/db/src/main/resources/sql/current/14-file-storage-default-values.sql +++ b/code/common/db/src/main/resources/db/migration/V23_07_0_005__file_storage_default_values.sql @@ -5,20 +5,20 @@ VALUES INSERT IGNORE INTO FILE_STORAGE(BASE_ID, PATH, DESCRIPTION, TYPE) SELECT ID, 'iw', "Index Staging Area", 'INDEX_STAGING' -FROM FILE_STORAGE_BASE WHERE NAME='IndexData'; +FROM FILE_STORAGE_BASE WHERE NAME='Index Storage'; INSERT IGNORE INTO FILE_STORAGE(BASE_ID, PATH, DESCRIPTION, TYPE) SELECT ID, 'ir', "Index Live Area", 'INDEX_LIVE' -FROM FILE_STORAGE_BASE WHERE NAME='IndexData'; +FROM FILE_STORAGE_BASE WHERE NAME='Index Storage'; INSERT IGNORE INTO FILE_STORAGE(BASE_ID, PATH, DESCRIPTION, TYPE) SELECT ID, 'lw', "Lexicon Staging Area", 'LEXICON_STAGING' -FROM FILE_STORAGE_BASE WHERE NAME='IndexData'; +FROM FILE_STORAGE_BASE WHERE NAME='Index Storage'; INSERT IGNORE INTO FILE_STORAGE(BASE_ID, PATH, DESCRIPTION, TYPE) SELECT ID, 'lr', "Lexicon Live Area", 'LEXICON_LIVE' -FROM FILE_STORAGE_BASE WHERE NAME='IndexData'; +FROM FILE_STORAGE_BASE WHERE NAME='Index Storage'; INSERT IGNORE INTO FILE_STORAGE(BASE_ID, PATH, DESCRIPTION, TYPE) SELECT ID, 'ss', "Search Sets", 'SEARCH_SETS' -FROM FILE_STORAGE_BASE WHERE NAME='IndexData'; \ No newline at end of file +FROM FILE_STORAGE_BASE WHERE NAME='Index Storage'; \ No newline at end of file diff --git a/code/common/db/src/main/resources/db/migration/V23_07_0_006__message_queue_default_jobs.sql b/code/common/db/src/main/resources/db/migration/V23_07_0_006__message_queue_default_jobs.sql new file mode 100644 index 00000000..9b78e3b4 --- /dev/null +++ 
b/code/common/db/src/main/resources/db/migration/V23_07_0_006__message_queue_default_jobs.sql @@ -0,0 +1,7 @@ +INSERT INTO MESSAGE_QUEUE(RECIPIENT_INBOX,FUNCTION,PAYLOAD) VALUES + ('fsm:converter_monitor','INITIAL',''), + ('fsm:loader_monitor','INITIAL',''), + ('fsm:crawler_monitor','INITIAL',''), + ('fsm:message_queue_monitor','INITIAL',''), + ('fsm:process_liveness_monitor','INITIAL',''), + ('fsm:file_storage_monitor','INITIAL',''); diff --git a/code/common/db/src/main/resources/sql/migrations/00-news-items.sql b/code/common/db/src/main/resources/sql/migrations/00-news-items.sql deleted file mode 100644 index 4f237b67..00000000 --- a/code/common/db/src/main/resources/sql/migrations/00-news-items.sql +++ /dev/null @@ -1,76 +0,0 @@ - -INSERT IGNORE INTO SEARCH_NEWS_FEED(TITLE, LINK, SOURCE, LIST_DATE) VALUES ( -'A search engine that favors text-heavy sites and punishes modern web design', -'https://news.ycombinator.com/item?id=28550764', -'Hacker News', -'2021-09-16' -); - -INSERT IGNORE INTO SEARCH_NEWS_FEED(TITLE, LINK, SOURCE, LIST_DATE) VALUES ( -'A Search Engine Designed To Surprise You', -'https://onezero.medium.com/a-search-engine-designed-to-surprise-you-b81944ed5c06', -'Clive Thompson OneZero', -'2021-09-16' -); - -INSERT IGNORE INTO SEARCH_NEWS_FEED(TITLE, LINK, SOURCE, LIST_DATE) VALUES ( -'🎂 First anniversary! 
🎊', -'https://memex.marginalia.nu/log/49-marginalia-1-year.gmi', -null, -'2022-02-26'); - -INSERT IGNORE INTO SEARCH_NEWS_FEED(TITLE, LINK, SOURCE, LIST_DATE) VALUES ( -'Marginalia Search - Serendipity Engineering', -'https://www.metafilter.com/194653/Marginalia-Search-Serendipity-Engineering', -'MetaFilter', -'2022-03-09'); - -INSERT IGNORE INTO SEARCH_NEWS_FEED(TITLE, LINK, SOURCE, LIST_DATE) VALUES ( -'What Google Search Isn\'t Showing You', -'https://www.newyorker.com/culture/infinite-scroll/what-google-search-isnt-showing-you', -'The New Yorker 🎩', -'2022-03-10' -); - -INSERT IGNORE INTO SEARCH_NEWS_FEED(TITLE, LINK, SOURCE, LIST_DATE) VALUES ( -'You Should Check Out the Indie Web 🎞️', -'https://www.youtube.com/watch?v=rTSEr0cRJY8', -'YouTube, You\'ve Got Kat', -'2022-03-15' -); - -INSERT IGNORE INTO SEARCH_NEWS_FEED(TITLE, LINK, SOURCE, LIST_DATE) VALUES ( -'Marginalia Goes Open Source', -'https://news.ycombinator.com/item?id=31536626', -'Hacker News', -'2022-05-28' -); - -INSERT IGNORE INTO SEARCH_NEWS_FEED(TITLE, LINK, SOURCE, LIST_DATE) VALUES ( -'Kritik an Googles Suche - Platzhirsch auf dem Nebenschauplatz', -'https://www.deutschlandfunkkultur.de/google-suche-100.html', -'Deutschlandfunk Kultur 🇩🇪', -'2022-08-18' -); - -INSERT IGNORE INTO SEARCH_NEWS_FEED(TITLE, LINK, SOURCE, LIST_DATE) VALUES ( -'Google ei enää tideä', -'https://www.hs.fi/visio/art-2000009139237.html', -'Helsing Sanomat 🇫🇮', -'2022-10-19' -); - -INSERT IGNORE INTO SEARCH_NEWS_FEED(TITLE, LINK, SOURCE, LIST_DATE) VALUES ( -'Marginalia\'s Index Reaches 100,000,000 Documents 🎊', -'https://memex.marginalia.nu/log/64-hundred-million.gmi', -null, -'2022-10-21' -); - -INSERT IGNORE INTO SEARCH_NEWS_FEED(TITLE, LINK, SOURCE, LIST_DATE) VALUES ( -'Marginalia Receives NLnet grant', -'https://memex.marginalia.nu/log/74-marginalia-2-years.gmi', -null, -'2023-02-26' -); - diff --git a/code/common/db/src/main/resources/sql/migrations/01-domain.sql 
b/code/common/db/src/main/resources/sql/migrations/01-domain.sql deleted file mode 100644 index 0402fecb..00000000 --- a/code/common/db/src/main/resources/sql/migrations/01-domain.sql +++ /dev/null @@ -1 +0,0 @@ -ALTER TABLE EC_DOMAIN MODIFY COLUMN IP VARCHAR(48); \ No newline at end of file diff --git a/code/common/db/src/main/resources/sql/migrations/03-service-status.sql b/code/common/db/src/main/resources/sql/migrations/03-service-status.sql deleted file mode 100644 index a5d392c5..00000000 --- a/code/common/db/src/main/resources/sql/migrations/03-service-status.sql +++ /dev/null @@ -1,27 +0,0 @@ -CREATE TABLE IF NOT EXISTS SERVICE_HEARTBEAT ( - SERVICE_NAME VARCHAR(255) PRIMARY KEY COMMENT "Full name of the service, including node id if applicable, e.g. search-service:0", - SERVICE_BASE VARCHAR(255) NOT NULL COMMENT "Base name of the service, e.g. search-service", - INSTANCE VARCHAR(255) NOT NULL COMMENT "UUID of the service instance", - ALIVE BOOLEAN NOT NULL DEFAULT TRUE COMMENT "Set to false when the service is doing an orderly shutdown", - HEARTBEAT_TIME TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6) COMMENT "Service was last seen at this point" -); - -CREATE TABLE IF NOT EXISTS PROCESS_HEARTBEAT ( - PROCESS_NAME VARCHAR(255) PRIMARY KEY COMMENT "Full name of the process, including node id if applicable, e.g. converter:0", - PROCESS_BASE VARCHAR(255) NOT NULL COMMENT "Base name of the process, e.g. 
converter", - INSTANCE VARCHAR(255) NOT NULL COMMENT "UUID of the process instance", - STATUS ENUM ('STARTING', 'RUNNING', 'STOPPED') NOT NULL DEFAULT 'STARTING' COMMENT "Status of the process", - PROGRESS INT NOT NULL DEFAULT 0 COMMENT "Progress of the process", - HEARTBEAT_TIME TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6) COMMENT "Process was last seen at this point" -); - -CREATE TABLE IF NOT EXISTS SERVICE_EVENTLOG( - ID BIGINT AUTO_INCREMENT PRIMARY KEY COMMENT "Unique id", - SERVICE_NAME VARCHAR(255) NOT NULL COMMENT "Full name of the service, including node id if applicable, e.g. search-service:0", - SERVICE_BASE VARCHAR(255) NOT NULL COMMENT "Base name of the service, e.g. search-service", - INSTANCE VARCHAR(255) NOT NULL COMMENT "UUID of the service instance", - EVENT_TIME TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6) COMMENT "Event time", - EVENT_TYPE VARCHAR(255) NOT NULL COMMENT "Event type", - EVENT_MESSAGE VARCHAR(255) NOT NULL COMMENT "Event message" -); - diff --git a/code/common/db/src/main/resources/sql/migrations/04-message-queue.sql b/code/common/db/src/main/resources/sql/migrations/04-message-queue.sql deleted file mode 100644 index 6e628e80..00000000 --- a/code/common/db/src/main/resources/sql/migrations/04-message-queue.sql +++ /dev/null @@ -1,21 +0,0 @@ -CREATE TABLE IF NOT EXISTS MESSAGE_QUEUE ( - ID BIGINT AUTO_INCREMENT PRIMARY KEY COMMENT 'Unique id', - RELATED_ID BIGINT NOT NULL DEFAULT -1 COMMENT 'Unique id a related message', - SENDER_INBOX VARCHAR(255) COMMENT 'Name of the sender inbox', - RECIPIENT_INBOX VARCHAR(255) NOT NULL COMMENT 'Name of the recipient inbox', - FUNCTION VARCHAR(255) NOT NULL COMMENT 'Which function to run', - PAYLOAD TEXT COMMENT 'Message to recipient', - -- These fields are used to avoid double processing of messages - -- instance marks the unique instance of the party, and the tick marks - -- the current polling iteration. Both are necessary. 
- OWNER_INSTANCE VARCHAR(255) COMMENT 'Instance UUID corresponding to the party that has claimed the message', - OWNER_TICK BIGINT DEFAULT -1 COMMENT 'Used by recipient to determine which messages it has processed', - STATE ENUM('NEW', 'ACK', 'OK', 'ERR', 'DEAD') - NOT NULL DEFAULT 'NEW' COMMENT 'Processing state', - CREATED_TIME TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6) COMMENT 'Time of creation', - UPDATED_TIME TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6) COMMENT 'Time of last update', - TTL INT COMMENT 'Time to live in seconds' -); - -CREATE INDEX MESSAGE_QUEUE_STATE_IDX ON MESSAGE_QUEUE(STATE); -CREATE INDEX MESSAGE_QUEUE_OI_TICK_IDX ON MESSAGE_QUEUE(OWNER_INSTANCE, OWNER_TICK); diff --git a/code/common/db/src/test/java/nu/marginalia/db/DomainTypesTest.java b/code/common/db/src/test/java/nu/marginalia/db/DomainTypesTest.java index 0829f6f5..387c880e 100644 --- a/code/common/db/src/test/java/nu/marginalia/db/DomainTypesTest.java +++ b/code/common/db/src/test/java/nu/marginalia/db/DomainTypesTest.java @@ -24,7 +24,7 @@ public class DomainTypesTest { .withDatabaseName("WMSA_prod") .withUsername("wmsa") .withPassword("wmsa") - .withInitScript("sql/current/10-domain-type.sql") + .withInitScript("db/migration/V23_07_0_001__domain_type.sql") .withNetworkAliases("mariadb"); static HikariDataSource dataSource; diff --git a/code/common/db/src/test/java/nu/marginalia/db/storage/FileStorageServiceTest.java b/code/common/db/src/test/java/nu/marginalia/db/storage/FileStorageServiceTest.java index 43d99d7d..bd7d6c8a 100644 --- a/code/common/db/src/test/java/nu/marginalia/db/storage/FileStorageServiceTest.java +++ b/code/common/db/src/test/java/nu/marginalia/db/storage/FileStorageServiceTest.java @@ -32,7 +32,7 @@ public class FileStorageServiceTest { .withDatabaseName("WMSA_prod") .withUsername("wmsa") .withPassword("wmsa") - .withInitScript("sql/current/13-file-storage.sql") + .withInitScript("db/migration/V23_07_0_004__file_storage.sql") 
.withNetworkAliases("mariadb"); static HikariDataSource dataSource; diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqOutboxTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqOutboxTest.java index 4411df25..ea2105bd 100644 --- a/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqOutboxTest.java +++ b/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqOutboxTest.java @@ -27,7 +27,7 @@ public class MqOutboxTest { .withDatabaseName("WMSA_prod") .withUsername("wmsa") .withPassword("wmsa") - .withInitScript("sql/current/12-message-queue.sql") + .withInitScript("db/migration/V23_07_0_003__message_queue.sql") .withNetworkAliases("mariadb"); static HikariDataSource dataSource; diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mq/persistence/MqPersistenceTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mq/persistence/MqPersistenceTest.java index 4b93fa5e..bab700c0 100644 --- a/code/common/message-queue/src/test/java/nu/marginalia/mq/persistence/MqPersistenceTest.java +++ b/code/common/message-queue/src/test/java/nu/marginalia/mq/persistence/MqPersistenceTest.java @@ -24,7 +24,7 @@ public class MqPersistenceTest { .withDatabaseName("WMSA_prod") .withUsername("wmsa") .withPassword("wmsa") - .withInitScript("sql/current/12-message-queue.sql") + .withInitScript("db/migration/V23_07_0_003__message_queue.sql") .withNetworkAliases("mariadb"); static HikariDataSource dataSource; diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineErrorTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineErrorTest.java index 863e1ce0..bc9ce5b8 100644 --- a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineErrorTest.java +++ b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineErrorTest.java @@ -32,7 +32,7 @@ public class StateMachineErrorTest { .withDatabaseName("WMSA_prod") 
.withUsername("wmsa") .withPassword("wmsa") - .withInitScript("sql/current/12-message-queue.sql") + .withInitScript("db/migration/V23_07_0_003__message_queue.sql") .withNetworkAliases("mariadb"); static HikariDataSource dataSource; diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineNullTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineNullTest.java index 301b75a1..e48e6cb5 100644 --- a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineNullTest.java +++ b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineNullTest.java @@ -29,7 +29,7 @@ public class StateMachineNullTest { .withDatabaseName("WMSA_prod") .withUsername("wmsa") .withPassword("wmsa") - .withInitScript("sql/current/12-message-queue.sql") + .withInitScript("db/migration/V23_07_0_003__message_queue.sql") .withNetworkAliases("mariadb"); static HikariDataSource dataSource; diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineResumeTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineResumeTest.java index dadc8b87..1ba7e5c5 100644 --- a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineResumeTest.java +++ b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineResumeTest.java @@ -33,7 +33,7 @@ public class StateMachineResumeTest { .withDatabaseName("WMSA_prod") .withUsername("wmsa") .withPassword("wmsa") - .withInitScript("sql/current/12-message-queue.sql") + .withInitScript("db/migration/V23_07_0_003__message_queue.sql") .withNetworkAliases("mariadb"); static HikariDataSource dataSource; diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineTest.java index 360df468..37cb6cce 100644 --- a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineTest.java +++ 
b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineTest.java @@ -29,7 +29,7 @@ public class StateMachineTest { .withDatabaseName("WMSA_prod") .withUsername("wmsa") .withPassword("wmsa") - .withInitScript("sql/current/12-message-queue.sql") + .withInitScript("db/migration/V23_07_0_003__message_queue.sql") .withNetworkAliases("mariadb"); static HikariDataSource dataSource; diff --git a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadDomainLinksTest.java b/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadDomainLinksTest.java index f80a54dc..a8d85699 100644 --- a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadDomainLinksTest.java +++ b/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadDomainLinksTest.java @@ -22,7 +22,7 @@ class SqlLoadDomainLinksTest { .withDatabaseName("WMSA_prod") .withUsername("wmsa") .withPassword("wmsa") - .withInitScript("sql/current/00-base.sql") + .withInitScript("db/migration/V23_06_0_000__base.sql") .withNetworkAliases("mariadb"); HikariDataSource dataSource; diff --git a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadDomainsTest.java b/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadDomainsTest.java index 21fc2902..16d52d33 100644 --- a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadDomainsTest.java +++ b/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadDomainsTest.java @@ -19,7 +19,7 @@ class SqlLoadDomainsTest { .withDatabaseName("WMSA_prod") .withUsername("wmsa") .withPassword("wmsa") - .withInitScript("sql/current/00-base.sql") + .withInitScript("db/migration/V23_06_0_000__base.sql") .withNetworkAliases("mariadb"); @Test diff --git a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDocumentTest.java 
b/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDocumentTest.java index 0abea35c..e9dd92b6 100644 --- a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDocumentTest.java +++ b/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDocumentTest.java @@ -33,7 +33,7 @@ class SqlLoadProcessedDocumentTest { .withDatabaseName("WMSA_prod") .withUsername("wmsa") .withPassword("wmsa") - .withInitScript("sql/current/00-base.sql") + .withInitScript("db/migration/V23_06_0_000__base.sql") .withNetworkAliases("mariadb"); HikariDataSource dataSource; diff --git a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDomainTest.java b/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDomainTest.java index 75c74752..0ef662eb 100644 --- a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDomainTest.java +++ b/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDomainTest.java @@ -27,7 +27,7 @@ class SqlLoadProcessedDomainTest { .withDatabaseName("WMSA_prod") .withUsername("wmsa") .withPassword("wmsa") - .withInitScript("sql/current/00-base.sql") + .withInitScript("db/migration/V23_06_0_000__base.sql") .withNetworkAliases("mariadb"); HikariDataSource dataSource; diff --git a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadUrlsTest.java b/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadUrlsTest.java index cc5c1381..7fece308 100644 --- a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadUrlsTest.java +++ b/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadUrlsTest.java @@ -24,7 +24,7 @@ class SqlLoadUrlsTest { .withDatabaseName("WMSA_prod") .withUsername("wmsa") .withPassword("wmsa") - .withInitScript("sql/current/00-base.sql") + 
.withInitScript("db/migration/V23_06_0_000__base.sql") .withNetworkAliases("mariadb"); HikariDataSource dataSource; diff --git a/code/services-satellite/api-service/src/test/java/nu/marginalia/api/svc/LicenseServiceTest.java b/code/services-satellite/api-service/src/test/java/nu/marginalia/api/svc/LicenseServiceTest.java index 3277dc99..0b9d846c 100644 --- a/code/services-satellite/api-service/src/test/java/nu/marginalia/api/svc/LicenseServiceTest.java +++ b/code/services-satellite/api-service/src/test/java/nu/marginalia/api/svc/LicenseServiceTest.java @@ -3,7 +3,6 @@ package nu.marginalia.api.svc; import com.zaxxer.hikari.HikariConfig; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; -import net.bytebuddy.utility.dispatcher.JavaDispatcher; import org.junit.jupiter.api.*; import org.testcontainers.containers.MariaDBContainer; import org.testcontainers.junit.jupiter.Container; @@ -22,7 +21,7 @@ class LicenseServiceTest { .withDatabaseName("WMSA_prod") .withUsername("wmsa") .withPassword("wmsa") - .withInitScript("sql/current/06-api-key.sql") + .withInitScript("db/migration/V23_06_0_006__api_key.sql") .withNetworkAliases("mariadb"); private static LicenseService service; diff --git a/run/readme.md b/run/readme.md index 3249c8d7..9015629b 100644 --- a/run/readme.md +++ b/run/readme.md @@ -17,7 +17,7 @@ follow these steps. You're assumed to sit in the project root the whole time. 1. Run the one-time setup, it will create the basic runtime directory structure and download some models and data that doesn't -come with the git repo. +come with the git repo because git deals poorly with large binary files. ``` $ run/setup.sh @@ -29,48 +29,29 @@ $ run/setup.sh $ ./gradlew assemble docker ``` -3. Download a sample of crawl data, process it and stick the metadata -into the database. The data is only downloaded once. Grab a cup of coffee, this takes a few minutes. -This needs to be done whenever the crawler or processor has changed. - +3. 
Initialize the database ``` $ docker-compose up -d mariadb -$ run/reconvert.sh +$ ./gradlew flywayMigrate ``` 4. Bring the system online. We'll run it in the foreground in the terminal this time because it's educational to see the logs. Add `-d` to run in the background. - ``` $ docker-compose up ``` -5. Since we've just processed new crawl data, the system needs to construct static -indexes. Wait for the line 'Auto-conversion finished!' +5. You should now be able to access the system. -When all is done, it should be possible to visit -[http://localhost:8080](http://localhost:8080) and try a few searches! +| Address | Description | +|-------------------------|------------------| +| https://localhost:8080/ | User-facing GUI | +| https://localhost:8081/ | Operator's GUI | +6. Download Sample Data -## Other Crawl Data - -By default, `reconvert.sh` will load the medium dataset. This is appropriate for a demo, -but other datasets also exist. - -| Set | Description | -|-----|----------------------------------------------------------------------------| -| s | 1000 domains, suitable for low-end machines | -| m | 2000 domains | -| l | 5000 domains | -| xl | 50,000 domains, basically pre-prod.
    Warning: 5h+ processing time | - -To switch datasets, run e.g. - -```shell -$ docker-compose up -d mariadb -$ ./run/reconvert.sh l -``` +TODO: How? ## Experiment Runner From e5c9791b14d7de5bc25c000066a0a891559b8d82 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 1 Aug 2023 17:28:29 +0200 Subject: [PATCH 126/157] (crawler) Fix rare ConcurrentModificationError due to HashSet --- .../src/main/java/nu/marginalia/crawl/CrawlerMain.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java index 758c6d39..fd936a7a 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java @@ -54,7 +54,7 @@ public class CrawlerMain { private final Gson gson; private final DumbThreadPool pool; - private final Set processingIds = new HashSet<>(); + private final Map processingIds = new ConcurrentHashMap<>(); private final CrawledDomainReader reader = new CrawledDomainReader(); final AbortMonitor abortMonitor = AbortMonitor.getInstance(); @@ -148,7 +148,7 @@ public class CrawlerMain { // This shouldn't realistically happen, but if it does, we need to ignore it, otherwise // we'd end crawling the same site twice and might end up writing to the same output // file from multiple threads with complete bit salad as a result. - if (!processingIds.add(crawlingSpecification.id)) { + if (processingIds.put(crawlingSpecification.id, "") != null) { logger.error("Ignoring duplicate id: {}", crawlingSpecification.id); continue; } From 483c2dbb4401085bd33d83afd0f9bc48128a507f Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 1 Aug 2023 17:34:25 +0200 Subject: [PATCH 127/157] (conf) Change default user-agent to not associate it with the project; remove unused disks.properties file. 
--- run/template/conf/disks.properties | 7 ------- run/template/conf/user-agent | 2 +- 2 files changed, 1 insertion(+), 8 deletions(-) delete mode 100644 run/template/conf/disks.properties diff --git a/run/template/conf/disks.properties b/run/template/conf/disks.properties deleted file mode 100644 index d48b2864..00000000 --- a/run/template/conf/disks.properties +++ /dev/null @@ -1,7 +0,0 @@ -resource-store=/vol/res - -index-write=/vol/iw -index-read=/vol/ir - -tmp-slow=/vol/tmps -tmp-fast=/vol/tmpf diff --git a/run/template/conf/user-agent b/run/template/conf/user-agent index d2bc09f9..3b865ea0 100644 --- a/run/template/conf/user-agent +++ b/run/template/conf/user-agent @@ -1 +1 @@ -test.marginalia.nu \ No newline at end of file +PoorlyConfiguredWebCrawler \ No newline at end of file From 867410c66b0e5a7760dee9cab340d7f7c42c2a2a Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 1 Aug 2023 18:05:43 +0200 Subject: [PATCH 128/157] (file-storage) Automatic file storage discovery via manifest file --- .../db/storage/FileStorageManifest.java | 51 +++++++++++++++ .../db/storage/FileStorageService.java | 65 +++++++++++++++++-- ..._07_0_005__file_storage_default_values.sql | 2 +- .../monitor/FileStorageMonitorActor.java | 3 + 4 files changed, 113 insertions(+), 8 deletions(-) create mode 100644 code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageManifest.java diff --git a/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageManifest.java b/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageManifest.java new file mode 100644 index 00000000..f002a47d --- /dev/null +++ b/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageManifest.java @@ -0,0 +1,51 @@ +package nu.marginalia.db.storage; + +import com.google.gson.Gson; +import nu.marginalia.db.storage.model.FileStorage; +import nu.marginalia.db.storage.model.FileStorageType; +import nu.marginalia.model.gson.GsonFactory; +import org.slf4j.Logger; +import 
org.slf4j.LoggerFactory; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; +import java.util.Optional; + +record FileStorageManifest(FileStorageType type, String description) { + private static final Gson gson = GsonFactory.get(); + private static final String fileName = "marginalia-manifest.json"; + private static final Logger logger = LoggerFactory.getLogger(FileStorageManifest.class); + + public static Optional find(Path directory) { + Path expectedFileName = directory.resolve(fileName); + + if (!Files.isRegularFile(expectedFileName) || + !Files.isReadable(expectedFileName)) { + return Optional.empty(); + } + + try (var reader = Files.newBufferedReader(expectedFileName)) { + return Optional.of(gson.fromJson(reader, FileStorageManifest.class)); + } + catch (Exception e) { + logger.warn("Failed to read manifest " + expectedFileName, e); + return Optional.empty(); + } + } + + public void write(FileStorage dir) { + Path expectedFileName = dir.asPath().resolve(fileName); + + try (var writer = Files.newBufferedWriter(expectedFileName, + StandardOpenOption.CREATE, + StandardOpenOption.TRUNCATE_EXISTING)) + { + gson.toJson(this, writer); + } + catch (Exception e) { + logger.warn("Failed to write manifest " + expectedFileName, e); + } + } + +} diff --git a/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageService.java b/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageService.java index 2ce1b4d1..e136dd0b 100644 --- a/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageService.java +++ b/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageService.java @@ -2,25 +2,26 @@ package nu.marginalia.db.storage; import com.zaxxer.hikari.HikariDataSource; import nu.marginalia.db.storage.model.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import javax.inject.Inject; import javax.inject.Singleton; +import java.io.File; +import java.io.FileFilter; import 
java.io.FileNotFoundException; import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; +import java.nio.file.*; import java.nio.file.attribute.PosixFilePermissions; import java.sql.SQLException; -import java.util.ArrayList; -import java.util.List; -import java.util.Optional; +import java.util.*; /** Manages file storage for processes and services */ @Singleton public class FileStorageService { private final HikariDataSource dataSource; - + private final Logger logger = LoggerFactory.getLogger(FileStorageService.class); @Inject public FileStorageService(HikariDataSource dataSource) { this.dataSource = dataSource; @@ -65,6 +66,49 @@ public class FileStorageService { return null; } + public void synchronizeStorageManifests(FileStorageBase base) { + Set ignoredPaths = new HashSet<>(); + + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT PATH FROM FILE_STORAGE WHERE BASE_ID = ? + """)) { + stmt.setLong(1, base.id().id()); + var rs = stmt.executeQuery(); + while (rs.next()) { + ignoredPaths.add(rs.getString(1)); + } + } catch (SQLException e) { + throw new RuntimeException(e); + } + + File basePathFile = Path.of(base.path()).toFile(); + File[] files = basePathFile.listFiles(pathname -> pathname.isDirectory() && !ignoredPaths.contains(pathname.getName())); + if (files == null) return; + for (File file : files) { + var maybeManifest = FileStorageManifest.find(file.toPath()); + if (maybeManifest.isEmpty()) continue; + var manifest = maybeManifest.get(); + + logger.info("Discovered new file storage: " + file.getName() + " (" + manifest.type() + ")"); + + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + INSERT INTO FILE_STORAGE(BASE_ID, PATH, TYPE, DESCRIPTION) + VALUES (?, ?, ?, ?) 
+ """)) { + stmt.setLong(1, base.id().id()); + stmt.setString(2, file.getName()); + stmt.setString(3, manifest.type().name()); + stmt.setString(4, manifest.description()); + stmt.execute(); + conn.commit(); + + } catch (SQLException e) { + throw new RuntimeException(e); + } + } + } public void relateFileStorages(FileStorageId source, FileStorageId target) { try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(""" @@ -198,7 +242,14 @@ public class FileStorageService { var rs = query.executeQuery(); if (rs.next()) { - return getStorage(new FileStorageId(rs.getLong("ID"))); + var storage = getStorage(new FileStorageId(rs.getLong("ID"))); + + // Write a manifest file so we can pick this up later without needing to insert it into DB + // (e.g. when loading from outside the system) + var manifest = new FileStorageManifest(type, description); + manifest.write(storage); + + return storage; } } diff --git a/code/common/db/src/main/resources/db/migration/V23_07_0_005__file_storage_default_values.sql b/code/common/db/src/main/resources/db/migration/V23_07_0_005__file_storage_default_values.sql index 8d591965..3803911f 100644 --- a/code/common/db/src/main/resources/db/migration/V23_07_0_005__file_storage_default_values.sql +++ b/code/common/db/src/main/resources/db/migration/V23_07_0_005__file_storage_default_values.sql @@ -1,7 +1,7 @@ INSERT IGNORE INTO FILE_STORAGE_BASE(NAME, PATH, TYPE, PERMIT_TEMP) VALUES ('Index Storage', '/vol', 'SSD_INDEX', false), -('Data Storage', '/samples', 'SLOW', false); +('Data Storage', '/samples', 'SLOW', true); INSERT IGNORE INTO FILE_STORAGE(BASE_ID, PATH, DESCRIPTION, TYPE) SELECT ID, 'iw', "Index Staging Area", 'INDEX_STAGING' diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/FileStorageMonitorActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/FileStorageMonitorActor.java index 663fa9d8..9f2ced26 100644 --- 
a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/FileStorageMonitorActor.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/FileStorageMonitorActor.java @@ -4,6 +4,7 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.db.storage.FileStorageService; import nu.marginalia.db.storage.model.FileStorage; +import nu.marginalia.db.storage.model.FileStorageBaseType; import nu.marginalia.db.storage.model.FileStorageId; import nu.marginalia.mqsm.StateFactory; import nu.marginalia.mqsm.graph.AbstractStateGraph; @@ -68,6 +69,8 @@ public class FileStorageMonitorActor extends AbstractStateGraph { transition(REMOVE_STALE, missing.get().id()); } + fileStorageService.synchronizeStorageManifests(fileStorageService.getStorageBase(FileStorageBaseType.SLOW)); + TimeUnit.SECONDS.sleep(10); } } From 659d2134ba365bb6366addf85dcc938846134195 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 1 Aug 2023 22:32:30 +0200 Subject: [PATCH 129/157] (file-storage) Deprecate mustClean flag --- .../db/storage/FileStorageService.java | 18 +++++++----------- .../db/storage/model/FileStorageBase.java | 2 -- .../migration/V23_07_0_004__file_storage.sql | 1 - .../db/storage/FileStorageServiceTest.java | 9 ++++----- 4 files changed, 11 insertions(+), 19 deletions(-) diff --git a/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageService.java b/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageService.java index e136dd0b..e78090c9 100644 --- a/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageService.java +++ b/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageService.java @@ -8,7 +8,6 @@ import org.slf4j.LoggerFactory; import javax.inject.Inject; import javax.inject.Singleton; import java.io.File; -import java.io.FileFilter; import java.io.FileNotFoundException; import java.io.IOException; import java.nio.file.*; @@ 
-46,7 +45,7 @@ public class FileStorageService { public FileStorageBase getStorageBase(FileStorageBaseId type) throws SQLException { try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(""" - SELECT ID, NAME, PATH, TYPE, MUST_CLEAN, PERMIT_TEMP + SELECT ID, NAME, PATH, TYPE, PERMIT_TEMP FROM FILE_STORAGE_BASE WHERE ID = ? """)) { stmt.setLong(1, type.id()); @@ -57,8 +56,7 @@ public class FileStorageService { FileStorageBaseType.valueOf(rs.getString(4)), rs.getString(2), rs.getString(3), - rs.getBoolean(5), - rs.getBoolean(6) + rs.getBoolean(5) ); } } @@ -156,7 +154,7 @@ public class FileStorageService { public FileStorageBase getStorageBase(FileStorageBaseType type) throws SQLException { try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(""" - SELECT ID, NAME, PATH, TYPE, MUST_CLEAN, PERMIT_TEMP + SELECT ID, NAME, PATH, TYPE, PERMIT_TEMP FROM FILE_STORAGE_BASE WHERE TYPE = ? """)) { stmt.setString(1, type.name()); @@ -167,8 +165,7 @@ public class FileStorageService { FileStorageBaseType.valueOf(rs.getString(4)), rs.getString(2), rs.getString(3), - rs.getBoolean(5), - rs.getBoolean(6) + rs.getBoolean(5) ); } } @@ -176,7 +173,7 @@ public class FileStorageService { return null; } - public FileStorageBase createStorageBase(String name, Path path, FileStorageBaseType type, boolean mustClean, boolean permitTemp) throws SQLException, FileNotFoundException { + public FileStorageBase createStorageBase(String name, Path path, FileStorageBaseType type, boolean permitTemp) throws SQLException, FileNotFoundException { if (!Files.exists(path)) { throw new FileNotFoundException("Storage base path does not exist: " + path); @@ -184,14 +181,13 @@ public class FileStorageService { try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(""" - INSERT INTO FILE_STORAGE_BASE(NAME, PATH, TYPE, MUST_CLEAN, PERMIT_TEMP) + INSERT INTO FILE_STORAGE_BASE(NAME, PATH, TYPE, PERMIT_TEMP) VALUES (?, ?, ?, ?, ?) 
""")) { stmt.setString(1, name); stmt.setString(2, path.toString()); stmt.setString(3, type.name()); - stmt.setBoolean(4, mustClean); - stmt.setBoolean(5, permitTemp); + stmt.setBoolean(4, permitTemp); int update = stmt.executeUpdate(); if (update < 0) { diff --git a/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageBase.java b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageBase.java index 96f09698..1e8245ad 100644 --- a/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageBase.java +++ b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageBase.java @@ -9,14 +9,12 @@ import java.nio.file.Path; * @param type the type of the storage base * @param name the name of the storage base * @param path the path of the storage base - * @param mustClean if true, the storage is small and *must* be cleaned after use * @param permitTemp if true, the storage may be used for temporary files */ public record FileStorageBase(FileStorageBaseId id, FileStorageBaseType type, String name, String path, - boolean mustClean, boolean permitTemp ) { public Path asPath() { diff --git a/code/common/db/src/main/resources/db/migration/V23_07_0_004__file_storage.sql b/code/common/db/src/main/resources/db/migration/V23_07_0_004__file_storage.sql index b2063fc8..d6de88a5 100644 --- a/code/common/db/src/main/resources/db/migration/V23_07_0_004__file_storage.sql +++ b/code/common/db/src/main/resources/db/migration/V23_07_0_004__file_storage.sql @@ -3,7 +3,6 @@ CREATE TABLE IF NOT EXISTS FILE_STORAGE_BASE ( NAME VARCHAR(255) NOT NULL UNIQUE, PATH VARCHAR(255) NOT NULL UNIQUE COMMENT 'The path to the storage base', TYPE ENUM ('SSD_INDEX', 'SSD_WORK', 'SLOW', 'BACKUP') NOT NULL, - MUST_CLEAN BOOLEAN NOT NULL DEFAULT FALSE COMMENT 'If true, the storage must be cleaned after use', PERMIT_TEMP BOOLEAN NOT NULL DEFAULT FALSE COMMENT 'If true, the storage can be used for temporary files' ) CHARACTER SET utf8mb4 diff --git 
a/code/common/db/src/test/java/nu/marginalia/db/storage/FileStorageServiceTest.java b/code/common/db/src/test/java/nu/marginalia/db/storage/FileStorageServiceTest.java index bd7d6c8a..92020f32 100644 --- a/code/common/db/src/test/java/nu/marginalia/db/storage/FileStorageServiceTest.java +++ b/code/common/db/src/test/java/nu/marginalia/db/storage/FileStorageServiceTest.java @@ -97,11 +97,10 @@ public class FileStorageServiceTest { String name = "test-" + UUID.randomUUID(); var storage = new FileStorageService(dataSource); - var base = storage.createStorageBase(name, createTempDir(), FileStorageBaseType.SLOW, false, false); + var base = storage.createStorageBase(name, createTempDir(), FileStorageBaseType.SLOW, false); Assertions.assertEquals(name, base.name()); Assertions.assertEquals(FileStorageBaseType.SLOW, base.type()); - Assertions.assertFalse(base.mustClean()); Assertions.assertFalse(base.permitTemp()); } @Test @@ -110,7 +109,7 @@ public class FileStorageServiceTest { var storage = new FileStorageService(dataSource); - var base = storage.createStorageBase(name, createTempDir(), FileStorageBaseType.SLOW, false, false); + var base = storage.createStorageBase(name, createTempDir(), FileStorageBaseType.SLOW, false); try { storage.allocateTemporaryStorage(base, FileStorageType.CRAWL_DATA, "xyz", "thisShouldFail"); @@ -129,7 +128,7 @@ public class FileStorageServiceTest { var storage = new FileStorageService(dataSource); - var base = storage.createStorageBase(name, createTempDir(), FileStorageBaseType.SLOW, false, false); + var base = storage.createStorageBase(name, createTempDir(), FileStorageBaseType.SLOW, false); var created = storage.allocatePermanentStorage(base, "xyz", FileStorageType.CRAWL_DATA, "thisShouldSucceed"); tempDirs.add(created.asPath()); @@ -144,7 +143,7 @@ public class FileStorageServiceTest { var storage = new FileStorageService(dataSource); - var base = storage.createStorageBase(name, createTempDir(), FileStorageBaseType.SLOW, false, true); + var 
base = storage.createStorageBase(name, createTempDir(), FileStorageBaseType.SLOW, true); var fileStorage = storage.allocateTemporaryStorage(base, FileStorageType.CRAWL_DATA, "xyz", "thisShouldSucceed"); Assertions.assertTrue(Files.exists(fileStorage.asPath())); From 8de3e6ab80456e5d434cbb26799fec9c33f25e19 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 1 Aug 2023 22:33:30 +0200 Subject: [PATCH 130/157] (control) Fix bug where CrawlActor and RecrawlActor would steal each others' mail --- .../actor/task/ActorProcessWatcher.java | 9 +++-- .../control/actor/task/CrawlActor.java | 6 +-- .../actor/task/ReconvertAndLoadActor.java | 8 ++-- .../control/actor/task/RecrawlActor.java | 12 ++---- ...utboxFactory.java => ProcessOutboxes.java} | 37 +++++++++++-------- .../control/svc/ProcessService.java | 2 +- 6 files changed, 37 insertions(+), 37 deletions(-) rename code/services-core/control-service/src/main/java/nu/marginalia/control/svc/{ProcessOutboxFactory.java => ProcessOutboxes.java} (58%) diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ActorProcessWatcher.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ActorProcessWatcher.java index 33f96f6b..e82168f4 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ActorProcessWatcher.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ActorProcessWatcher.java @@ -28,22 +28,23 @@ public class ActorProcessWatcher { *

    * When interrupted, the process is killed and the message is marked as dead. */ - public MqMessage waitResponse(MqOutbox outbox, ProcessService.ProcessId processId, long id) + public MqMessage waitResponse(MqOutbox outbox, ProcessService.ProcessId processId, long msgId) throws ControlFlowException, InterruptedException, SQLException { if (!waitForProcess(processId, TimeUnit.SECONDS, 30)) { throw new ControlFlowException("ERROR", "Process " + processId + " did not launch"); } + for (;;) { try { - return outbox.waitResponse(id, 5, TimeUnit.SECONDS); + return outbox.waitResponse(msgId, 5, TimeUnit.SECONDS); } catch (InterruptedException ex) { // Here we mark the message as dead, as it's the user that has aborted the process // This will prevent the monitor process from attempting to respawn the process as we kill it - outbox.flagAsDead(id); + outbox.flagAsDead(msgId); processService.kill(processId); throw ex; @@ -67,7 +68,7 @@ public class ActorProcessWatcher { if (processService.isRunning(processId)) return true; - TimeUnit.SECONDS.sleep(1); + TimeUnit.MILLISECONDS.sleep(100); } return false; diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlActor.java index a37639a6..48ebbc79 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlActor.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlActor.java @@ -6,7 +6,7 @@ import com.google.inject.Singleton; import lombok.AllArgsConstructor; import lombok.NoArgsConstructor; import lombok.With; -import nu.marginalia.control.svc.ProcessOutboxFactory; +import nu.marginalia.control.svc.ProcessOutboxes; import nu.marginalia.control.svc.ProcessService; import nu.marginalia.db.storage.FileStorageService; import nu.marginalia.db.storage.model.FileStorageBaseType; @@ -48,13 +48,13 @@ public 
class CrawlActor extends AbstractStateGraph { @Inject public CrawlActor(StateFactory stateFactory, - ProcessOutboxFactory processOutboxFactory, + ProcessOutboxes processOutboxes, FileStorageService storageService, Gson gson, ActorProcessWatcher processWatcher) { super(stateFactory); - this.mqCrawlerOutbox = processOutboxFactory.createCrawlerOutbox(); + this.mqCrawlerOutbox = processOutboxes.getCrawlerOutbox(); this.storageService = storageService; this.gson = gson; this.processWatcher = processWatcher; diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ReconvertAndLoadActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ReconvertAndLoadActor.java index f296ca6f..a7589439 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ReconvertAndLoadActor.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ReconvertAndLoadActor.java @@ -6,7 +6,7 @@ import com.google.inject.Singleton; import lombok.AllArgsConstructor; import lombok.NoArgsConstructor; import lombok.With; -import nu.marginalia.control.svc.ProcessOutboxFactory; +import nu.marginalia.control.svc.ProcessOutboxes; import nu.marginalia.control.svc.ProcessService; import nu.marginalia.index.client.IndexClient; import nu.marginalia.index.client.IndexMqEndpoints; @@ -66,7 +66,7 @@ public class ReconvertAndLoadActor extends AbstractStateGraph { @Inject public ReconvertAndLoadActor(StateFactory stateFactory, ActorProcessWatcher processWatcher, - ProcessOutboxFactory processOutboxFactory, + ProcessOutboxes processOutboxes, FileStorageService storageService, IndexClient indexClient, Gson gson @@ -75,8 +75,8 @@ public class ReconvertAndLoadActor extends AbstractStateGraph { super(stateFactory); this.processWatcher = processWatcher; this.indexOutbox = indexClient.outbox(); - this.mqConverterOutbox = processOutboxFactory.createConverterOutbox(); - 
this.mqLoaderOutbox = processOutboxFactory.createLoaderOutbox(); + this.mqConverterOutbox = processOutboxes.getConverterOutbox(); + this.mqLoaderOutbox = processOutboxes.getLoaderOutbox(); this.storageService = storageService; this.gson = gson; } diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/RecrawlActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/RecrawlActor.java index 03178226..9311cead 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/RecrawlActor.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/RecrawlActor.java @@ -6,14 +6,12 @@ import com.google.inject.Singleton; import lombok.AllArgsConstructor; import lombok.NoArgsConstructor; import lombok.With; -import nu.marginalia.control.svc.ProcessOutboxFactory; +import nu.marginalia.control.svc.ProcessOutboxes; import nu.marginalia.control.svc.ProcessService; import nu.marginalia.db.storage.FileStorageService; import nu.marginalia.db.storage.model.FileStorage; import nu.marginalia.db.storage.model.FileStorageId; import nu.marginalia.db.storage.model.FileStorageType; -import nu.marginalia.index.client.IndexClient; -import nu.marginalia.mq.MqMessage; import nu.marginalia.mq.MqMessageState; import nu.marginalia.mq.outbox.MqOutbox; import nu.marginalia.mqapi.crawling.CrawlRequest; @@ -21,14 +19,10 @@ import nu.marginalia.mqsm.StateFactory; import nu.marginalia.mqsm.graph.AbstractStateGraph; import nu.marginalia.mqsm.graph.GraphState; import nu.marginalia.mqsm.graph.ResumeBehavior; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import java.nio.file.Files; import java.sql.SQLException; import java.util.Optional; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.TimeoutException; @Singleton public class RecrawlActor extends AbstractStateGraph { @@ -62,14 +56,14 @@ public class RecrawlActor extends 
AbstractStateGraph { @Inject public RecrawlActor(StateFactory stateFactory, ActorProcessWatcher processWatcher, - ProcessOutboxFactory processOutboxFactory, + ProcessOutboxes processOutboxes, FileStorageService storageService, Gson gson ) { super(stateFactory); this.processWatcher = processWatcher; - this.mqCrawlerOutbox = processOutboxFactory.createCrawlerOutbox(); + this.mqCrawlerOutbox = processOutboxes.getCrawlerOutbox(); this.storageService = storageService; this.gson = gson; } diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ProcessOutboxFactory.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ProcessOutboxes.java similarity index 58% rename from code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ProcessOutboxFactory.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ProcessOutboxes.java index fb5598a9..a8699ab9 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ProcessOutboxFactory.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ProcessOutboxes.java @@ -8,35 +8,40 @@ import nu.marginalia.mq.persistence.MqPersistence; import nu.marginalia.service.server.BaseServiceParams; @Singleton -public class ProcessOutboxFactory { - private final BaseServiceParams params; - private final MqPersistence persistence; +public class ProcessOutboxes { + private final MqOutbox converterOutbox; + private final MqOutbox loaderOutbox; + private final MqOutbox crawlerOutbox; @Inject - public ProcessOutboxFactory(BaseServiceParams params, MqPersistence persistence) { - this.params = params; - this.persistence = persistence; - } - - public MqOutbox createConverterOutbox() { - return new MqOutbox(persistence, + public ProcessOutboxes(BaseServiceParams params, MqPersistence persistence) { + converterOutbox = new MqOutbox(persistence, ProcessInboxNames.CONVERTER_INBOX, 
params.configuration.serviceName(), params.configuration.instanceUuid() ); - } - public MqOutbox createLoaderOutbox() { - return new MqOutbox(persistence, + loaderOutbox = new MqOutbox(persistence, ProcessInboxNames.LOADER_INBOX, params.configuration.serviceName(), params.configuration.instanceUuid() ); - } - public MqOutbox createCrawlerOutbox() { - return new MqOutbox(persistence, + crawlerOutbox = new MqOutbox(persistence, ProcessInboxNames.CRAWLER_INBOX, params.configuration.serviceName(), params.configuration.instanceUuid() ); } + + + public MqOutbox getConverterOutbox() { + return converterOutbox; + } + + public MqOutbox getLoaderOutbox() { + return loaderOutbox; + } + + public MqOutbox getCrawlerOutbox() { + return crawlerOutbox; + } } diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java index e38c6d97..67d284b7 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java @@ -125,7 +125,7 @@ public class ProcessService { private final List propagatedEnvironmentVariables = List.of( "JAVA_HOME", "CONVERTER_PROCESS_OPTS", - "LOADER_PROCESS_OPTS", +// "LOADER_PROCESS_OPTS", "CRAWLER_PROCESS_OPTS"); private String[] createEnvironmentVariables() { From ba724bc1b2e756ce48edfb14cfe250fc32953f04 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 1 Aug 2023 22:47:37 +0200 Subject: [PATCH 131/157] (scripts|docs) Update scripts and documentations for the new operator's gui and file storage workflows. 
--- doc/crawling.md | 75 ++++++++++----------------------- run/download-samples.sh | 40 ++++++++++++++++++ run/readme.md | 26 +++++++++++- run/reconvert.sh | 91 ----------------------------------------- 4 files changed, 86 insertions(+), 146 deletions(-) create mode 100755 run/download-samples.sh delete mode 100755 run/reconvert.sh diff --git a/doc/crawling.md b/doc/crawling.md index 46f476b9..cfb38f9e 100644 --- a/doc/crawling.md +++ b/doc/crawling.md @@ -1,6 +1,6 @@ # Crawling -This document is a first draft. +This document is a draft. ## WARNING Please don't run the crawler unless you intend to actually operate a public @@ -23,6 +23,11 @@ it doesn't need to be extremely fast, but it should be a few terabytes in size. with `noatime` and partitioned with a large block size. It may be a good idea to format the disk with a block size of 4096 bytes. This will reduce the amount of disk space used by the crawler. +Make sure you configure the user-agent properly. This will be used to identify the crawler, +and is matched against the robots.txt file. The crawler will not crawl sites that don't allow it. + +This can be done by editing the file `${WMSA_HOME}/conf/user-agent`. + ## Setup To operate the crawler, you need to set up a filesystem structure. @@ -45,66 +50,28 @@ $ mkdir /data/processed ### Specifications A crawl specification file is a compressed JSON file with each domain name to crawl, as well as -known URLs for each domain. These are created with the [crawl-job-extractor](../tools/crawl-job-extractor/) -tool. +known URLs for each domain. These are created in the `storage -> specifications` view in the operator's gui. -Let's put this in `/data/crawl.spec` +To bootstrap the system, you need a list of known domains. This is just a text file with one domain name per line, +with blanlines and comments starting with `#` ignored. -### Crawl Plan - -You also need a crawl plan. This is a YAML file that specifies where to store the crawl data. 
This -file is also used by the converter. - -This is an example from production. Note that the crawl specification mentioned previously is pointed -to by the `jobSpec` key. - -```yaml -jobSpec: "/data/crawl.spec" -crawl: - dir: "/data/crawl" - logName: "crawler.log" -process: - dir: "/data/processed" - logName: "process.log" -``` - -Let's put it in `/data/crawl-plan.yaml` +Make it available over HTTP(S) and select `Download a list of domains from a URL` in the `Create New Specification` +form. Make sure to give this specification a good description, as it will follow you around for a while. ## Crawling -Run the crawler-process script with the crawl plan as an argument. +Refresh the specification list in the operator's gui. You should see your new specification in the list. +Click the `[Info]` link next to it and select `[Crawl]` under `Actions`. -In practice something like this: - -```bash -screen sudo -u searchengine WMSA_HOME=/path/to/install/dir ./crawler-process /data/crawl-plan.yaml -``` - -This proces will run for a long time, up to a week. It will journal its progress in `crawler.log`, -and if the process should halt somehow, it replay the journal and continue where was. Do give it a -while before restarting though, to not annoy webmasters by re-crawling a bunch of websites. - -The crawler will populate the crawl directory with a directory structure. Note that on mechanical drives, -removing these files will take hours. You probably want a separate hard drive for this as the filesystem -will get severely gunked up. +Depending on the size of the specification, this may take anywhere between a few minutes to a few weeks. +You can follow the progress in the `Actors` view. ## Converting -The converter process takes the same argument as the crawler process. It will read the crawl data -and extract keywords and metadata and save them as compressed JSON models. 
It will create another huge -directory structure in the process directory, and uses its own journal to keep track of progress. +Once the crawl is finished, you can convert the data to a format that can be loaded into the database. +This is done by going to the `storage -> crawl` view in the operator's gui, clicking the `[Info]` link +and pressing `[Convert]` under `Actions`. -```bash -screen sudo -u searchengine WMSA_HOME=/path/to/install/dir ./converter-process /data/crawl-plan.yaml -``` - -**Note:** This process will use *a lot* of CPU. Expect every available core to be at 100% for several days. - -## Loader - -The loader process takes the same argument as the crawler and converter processes. It will read converted -data and insert it into the database and create a lexicon and index journal. - -**Note:** It will wipe the URL database before inserting data. It is a good idea to -bring the entire search-engine offline while this is happening. The loader will run -for a day or so. \ No newline at end of file +The rest of the process should be automatic. Follow the progress in the `Actors` view; the actor +`RECONVERT_LOAD` drives the process. The process can be stopped by terminating this actor. Depending on the +state, it may be necessary to restart from the beginning. \ No newline at end of file diff --git a/run/download-samples.sh b/run/download-samples.sh new file mode 100755 index 00000000..2465c50b --- /dev/null +++ b/run/download-samples.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +set -e + +SAMPLE_NAME=crawl-${1:-m} +SAMPLE_DIR="samples/${SAMPLE_NAME}/" + +function download_model { + model=$1 + url=$2 + + if [ ! 
-f $model ]; then + echo "** Downloading $url" + wget -O $model $url + fi +} + +pushd $(dirname $0) + +if [ -d ${SAMPLE_DIR} ]; then + echo "${SAMPLE_DIR} already exists; remove it if you want to re-download the sample" +fi + +mkdir -p samples/ +SAMPLE_TARBALL=samples/${SAMPLE_NAME}.tar.gz +download_model ${SAMPLE_TARBALL} https://downloads.marginalia.nu/${SAMPLE_TARBALL} || rm ${SAMPLE_TARBALL} + +if [ ! -f ${SAMPLE_TARBALL} ]; then + echo "!! Failed" + exit 255 +fi + +mkdir -p ${SAMPLE_DIR} +tar zxf ${SAMPLE_TARBALL} --strip-components=1 -C ${SAMPLE_DIR} + +cat > "${SAMPLE_DIR}/marginalia-manifest.json" < Date: Tue, 1 Aug 2023 22:50:05 +0200 Subject: [PATCH 132/157] (scripts|docs) Update scripts and documentations for the new operator's gui and file storage workflows. --- doc/crawling.md | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/doc/crawling.md b/doc/crawling.md index cfb38f9e..679f65c6 100644 --- a/doc/crawling.md +++ b/doc/crawling.md @@ -30,22 +30,9 @@ This can be done by editing the file `${WMSA_HOME}/conf/user-agent`. ## Setup -To operate the crawler, you need to set up a filesystem structure. +Ensure that the system is running and go to https://localhost:8081. See the documentation in [run/](../run/) for more information. +By default the system is configured to store data in `run/samples`. (!!!FIXME: How do you change this now?!!!) -You need - -* a directory for crawl data -* a directory for processed data -* a crawl specification file -* a crawl plan file - -Assuming we want to keep our crawl and processed data in -`/data`, then we would create the following directories: - -```bash -$ mkdir /data/crawl -$ mkdir /data/processed -``` ### Specifications From e088eb9ec8f1082b2d4162a3eca1d5c9fbc1d74e Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 1 Aug 2023 22:50:33 +0200 Subject: [PATCH 133/157] (scripts|docs) Update scripts and documentations for the new operator's gui and file storage workflows. 
--- doc/crawling.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/crawling.md b/doc/crawling.md index 679f65c6..cc846720 100644 --- a/doc/crawling.md +++ b/doc/crawling.md @@ -40,7 +40,7 @@ A crawl specification file is a compressed JSON file with each domain name to cr known URLs for each domain. These are created in the `storage -> specifications` view in the operator's gui. To bootstrap the system, you need a list of known domains. This is just a text file with one domain name per line, -with blanlines and comments starting with `#` ignored. +with blanklines and comments starting with `#` ignored. Make it available over HTTP(S) and select `Download a list of domains from a URL` in the `Create New Specification` form. Make sure to give this specification a good description, as it will follow you around for a while. From 7763df071505a0292455dc60529796214afb432d Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 1 Aug 2023 22:52:41 +0200 Subject: [PATCH 134/157] (docs) Add control-service to the main readme.md --- code/readme.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/code/readme.md b/code/readme.md index 1de3e46c..3aca8a37 100644 --- a/code/readme.md +++ b/code/readme.md @@ -14,10 +14,11 @@ A map of the most important components and how they relate can be found below. ### Services * [core services](services-core/) "macroservices", stateful, memory hungry doing heavy lifting. +* * [control-service](services-core/control-service) * * [search](services-core/search-service) * * [index](services-core/index-service) * * [assistant](services-core/assistant-service) -* [sattelite services](services-satellite/) "microservices", stateless providing additional functionality. +* [satellite services](services-satellite/) "microservices", stateless providing additional functionality. 
* * [api](services-satellite/api-service) - public API * * [dating](services-satellite/dating-service) - [https://explore.marginalia.nu/](https://explore.marginalia.nu/) * * [explorer](services-satellite/explorer-service) - [https://explore2.marginalia.nu/](https://explore2.marginalia.nu/) From 9979c9defe1c24099e00f43820c9ff21bffb1d0b Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 2 Aug 2023 20:13:30 +0200 Subject: [PATCH 135/157] (search/index) Add blogosphere filter --- .../model/query/SearchSetIdentifier.java | 1 + .../index/svc/IndexSearchSetsService.java | 32 +++++++++++++++++-- .../search/model/SearchProfile.java | 1 + .../search/svc/SearchApiQueryService.java | 1 + .../templates/search/parts/search-form.hdb | 3 +- 5 files changed, 34 insertions(+), 4 deletions(-) diff --git a/code/api/index-api/src/main/java/nu/marginalia/index/client/model/query/SearchSetIdentifier.java b/code/api/index-api/src/main/java/nu/marginalia/index/client/model/query/SearchSetIdentifier.java index aca5c291..e89d6d8b 100644 --- a/code/api/index-api/src/main/java/nu/marginalia/index/client/model/query/SearchSetIdentifier.java +++ b/code/api/index-api/src/main/java/nu/marginalia/index/client/model/query/SearchSetIdentifier.java @@ -8,6 +8,7 @@ package nu.marginalia.index.client.model.query; public enum SearchSetIdentifier { NONE, RETRO, + BLOGS, ACADEMIA, SMALLWEB } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java index 512c735e..ba671969 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java @@ -2,9 +2,13 @@ package nu.marginalia.index.svc; import com.google.inject.Inject; import com.google.inject.Singleton; +import 
it.unimi.dsi.fastutil.ints.IntOpenHashSet; import lombok.SneakyThrows; +import nu.marginalia.db.DomainTypes; import nu.marginalia.index.IndexServicesFactory; import nu.marginalia.index.searchset.SearchSet; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.id.EdgeIdList; import nu.marginalia.ranking.ReversePageRank; import nu.marginalia.ranking.StandardPageRank; import nu.marginalia.ranking.accumulator.RankingResultHashMapAccumulator; @@ -25,7 +29,7 @@ import java.io.IOException; @Singleton public class IndexSearchSetsService { private final Logger logger = LoggerFactory.getLogger(getClass()); - private final RankingDomainFetcher rankingDomains; + private final DomainTypes domainTypes; private final DbUpdateRanks dbUpdateRanks; private final RankingDomainFetcher similarityDomains; private final RankingSettings rankingSettings; @@ -35,19 +39,21 @@ public class IndexSearchSetsService { private volatile RankingSearchSet retroSet; private volatile RankingSearchSet smallWebSet; private volatile RankingSearchSet academiaSet; + private volatile RankingSearchSet blogsSet; private final SearchSet anySet = new SearchSetAny(); // The ranking value of the domains used in sorting the domains private volatile DomainRankings domainRankings = new DomainRankings(); @Inject - public IndexSearchSetsService(RankingDomainFetcher rankingDomains, + public IndexSearchSetsService(DomainTypes domainTypes, + RankingDomainFetcher rankingDomains, RankingDomainFetcherForSimilarityData similarityDomains, RankingSettings rankingSettings, IndexServicesFactory servicesFactory, DbUpdateRanks dbUpdateRanks) throws IOException { + this.domainTypes = domainTypes; - this.rankingDomains = rankingDomains; this.dbUpdateRanks = dbUpdateRanks; if (similarityDomains.hasData()) { @@ -64,6 +70,7 @@ public class IndexSearchSetsService { smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, servicesFactory.getSearchSetsBase().resolve("small-web.dat")); academiaSet = new 
RankingSearchSet(SearchSetIdentifier.ACADEMIA, servicesFactory.getSearchSetsBase().resolve("academia.dat")); retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, servicesFactory.getSearchSetsBase().resolve("retro.dat")); + blogsSet = new RankingSearchSet(SearchSetIdentifier.BLOGS, servicesFactory.getSearchSetsBase().resolve("blogs.dat")); } public DomainRankings getDomainRankings() { @@ -79,6 +86,7 @@ public class IndexSearchSetsService { case RETRO -> retroSet; case ACADEMIA -> academiaSet; case SMALLWEB -> smallWebSet; + case BLOGS -> blogsSet; }; } @@ -86,6 +94,7 @@ public class IndexSearchSetsService { updateAcademiaDomainsSet(); updateRetroDomainsSet(); updateSmallWebDomainsSet(); + updateBlogsSet(); updateDomainRankings(); } @@ -131,6 +140,23 @@ public class IndexSearchSetsService { } } + @SneakyThrows + public void updateBlogsSet() { + EdgeIdList knownDomains = domainTypes.getKnownDomainsByType(DomainTypes.Type.BLOG); + + if (knownDomains.isEmpty()) { + // FIXME: We don't want to reload the entire list every time, but we do want to do it sometimes. Actor maybe? 
+ domainTypes.reloadDomainsList(DomainTypes.Type.BLOG); + knownDomains = domainTypes.getKnownDomainsByType(DomainTypes.Type.BLOG); + } + + synchronized (this) { + blogsSet = new RankingSearchSet(SearchSetIdentifier.BLOGS, blogsSet.source, new IntOpenHashSet(knownDomains.values())); + blogsSet.write(); + } + } + + @SneakyThrows public void updateAcademiaDomainsSet() { var entry = rankingSettings.academia; diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java index b732cb18..c913d0ce 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java @@ -10,6 +10,7 @@ import java.util.Objects; public enum SearchProfile { DEFAULT("default", SearchSetIdentifier.RETRO), MODERN("modern", SearchSetIdentifier.SMALLWEB), + BLOGOSPHERE("blogosphere", SearchSetIdentifier.BLOGS), CORPO("corpo", SearchSetIdentifier.NONE), YOLO("yolo", SearchSetIdentifier.NONE), VINTAGE("vintage", SearchSetIdentifier.NONE), diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchApiQueryService.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchApiQueryService.java index a84cdaee..ad9d3fd6 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchApiQueryService.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchApiQueryService.java @@ -48,6 +48,7 @@ public class SearchApiQueryService { case "1" -> SearchProfile.MODERN; case "2" -> SearchProfile.DEFAULT; case "3" -> SearchProfile.CORPO_CLEAN; + case "blogosphere" -> SearchProfile.BLOGOSPHERE; default -> SearchProfile.CORPO_CLEAN; }; } diff --git 
a/code/services-core/search-service/src/main/resources/templates/search/parts/search-form.hdb b/code/services-core/search-service/src/main/resources/templates/search/parts/search-form.hdb index fba8f3c7..8d5b4b79 100644 --- a/code/services-core/search-service/src/main/resources/templates/search/parts/search-form.hdb +++ b/code/services-core/search-service/src/main/resources/templates/search/parts/search-form.hdb @@ -9,7 +9,7 @@

    File NameTypeLast Mod Size
    - {{#if downloadable}}{{filename}}{{type}}{{mTime}} {{size}}
    + + + + + + + + + + + {{#each apikeys}} + + + + + + + + + + + {{/each}} +
    Key 
    LicenseNameContactRate
    {{licenseKey}} +
    + +
    +
    {{license}}{{name}}{{email}}{{rate}}
    +

    Add New

    +
    +
    +
    +
    +
    +
    +
    +
    +

    + +
    + + + + + \ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/dialog-update-message-state.hdb b/code/services-core/control-service/src/main/resources/templates/control/dialog-update-message-state.hdb index e5ee806f..13d25615 100644 --- a/code/services-core/control-service/src/main/resources/templates/control/dialog-update-message-state.hdb +++ b/code/services-core/control-service/src/main/resources/templates/control/dialog-update-message-state.hdb @@ -12,16 +12,16 @@ erase information about its owner, and inboxes will consider the message new aga

    -

    +

    -

    +

    -

    +

    -

    +


    @@ -37,5 +37,8 @@ erase information about its owner, and inboxes will consider the message new aga
    +

    Note that while setting a message to NEW or, in some instances, ACK typically causes an Actor + to act on the message, setting a message in ACK to ERR or DEAD will not stop an action that is already running; it will only + prevent that action from being resumed. To stop a running actor, use the Actors view and press the toggle.

    \ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/partials/actors-table.hdb b/code/services-core/control-service/src/main/resources/templates/control/partials/actors-table.hdb index e77b7b70..a09e16a4 100644 --- a/code/services-core/control-service/src/main/resources/templates/control/partials/actors-table.hdb +++ b/code/services-core/control-service/src/main/resources/templates/control/partials/actors-table.hdb @@ -15,7 +15,13 @@ action="/fsms/{{name}}/stop" method="post" onsubmit="return toggleActorSwitch('{{name}}')"> - + {{/unless}} {{#if terminal}} @@ -32,6 +38,7 @@ {{/unless}} {{#if canStart}} value="Off" + title="Start the actor" {{/if}} class="toggle-switch-off" diff --git a/code/services-core/control-service/src/main/resources/templates/control/partials/nav.hdb b/code/services-core/control-service/src/main/resources/templates/control/partials/nav.hdb index 40610960..184f28b9 100644 --- a/code/services-core/control-service/src/main/resources/templates/control/partials/nav.hdb +++ b/code/services-core/control-service/src/main/resources/templates/control/partials/nav.hdb @@ -4,5 +4,8 @@
  • Services
  • Actors
  • Storage
  • +
  • API Keys
  • +
  • Blacklist
  • +
  • Complaints
  • \ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/partials/storage-table.hdb b/code/services-core/control-service/src/main/resources/templates/control/partials/storage-table.hdb index 9be012e5..d46aafa8 100644 --- a/code/services-core/control-service/src/main/resources/templates/control/partials/storage-table.hdb +++ b/code/services-core/control-service/src/main/resources/templates/control/partials/storage-table.hdb @@ -4,20 +4,18 @@ Type Name Path - Must Clean Permit Temp {{base.type}} {{base.name}} {{base.path}} - {{base.mustClean}} {{base.permitTemp}} Type - Path + Path Description {{#each storage}} @@ -26,7 +24,7 @@ Info {{storage.type}} - {{storage.path}} + {{storage.path}} {{storage.description}} {{/each}} diff --git a/code/services-core/control-service/src/main/resources/templates/control/storage-details.hdb b/code/services-core/control-service/src/main/resources/templates/control/storage-details.hdb index c811b478..d4f32718 100644 --- a/code/services-core/control-service/src/main/resources/templates/control/storage-details.hdb +++ b/code/services-core/control-service/src/main/resources/templates/control/storage-details.hdb @@ -47,22 +47,22 @@

    Actions

    {{#with storage.self}} {{#if isCrawlable}} -
    + Perform a full re-crawl of this data:
    {{/if}} {{#if isLoadable}} -
    + Load this data into index:
    {{/if}} {{#if isConvertible}} -
    + Process and load this data into index:
    {{/if}} {{#if isRecrawlable}} -
    + Perform a re-crawl of this data:
    {{/if}} diff --git a/code/services-core/control-service/src/main/resources/templates/control/storage-overview.hdb b/code/services-core/control-service/src/main/resources/templates/control/storage-overview.hdb index 7d978fb9..f80253c7 100644 --- a/code/services-core/control-service/src/main/resources/templates/control/storage-overview.hdb +++ b/code/services-core/control-service/src/main/resources/templates/control/storage-overview.hdb @@ -16,20 +16,18 @@ Type Name Path - Must Clean Permit Temp {{base.type}} {{base.name}} {{base.path}} - {{base.mustClean}} {{base.permitTemp}} Type - Path + Path Description {{#each storage}} @@ -37,7 +35,7 @@ {{storage.type}} - {{storage.path}} + {{storage.path}} {{storage.description}} {{/each}} diff --git a/code/services-core/control-service/src/test/java/nu/marginalia/control/svc/ApiKeyServiceTest.java b/code/services-core/control-service/src/test/java/nu/marginalia/control/svc/ApiKeyServiceTest.java new file mode 100644 index 00000000..94877cb6 --- /dev/null +++ b/code/services-core/control-service/src/test/java/nu/marginalia/control/svc/ApiKeyServiceTest.java @@ -0,0 +1,94 @@ +package nu.marginalia.control.svc; + +import com.zaxxer.hikari.HikariConfig; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.control.model.ApiKeyModel; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.parallel.Execution; +import org.testcontainers.containers.MariaDBContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +import java.util.List; + +import static org.junit.jupiter.api.Assertions.*; +import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD; + +@Testcontainers +@Execution(SAME_THREAD) +@Tag("slow") +public class ApiKeyServiceTest { + @Container + static MariaDBContainer mariaDBContainer = new 
MariaDBContainer<>("mariadb") + .withDatabaseName("WMSA_prod") + .withUsername("wmsa") + .withPassword("wmsa") + .withInitScript("db/migration/V23_06_0_006__api_key.sql") + .withNetworkAliases("mariadb"); + + static HikariDataSource dataSource; + @BeforeAll + public static void setup() { + HikariConfig config = new HikariConfig(); + config.setJdbcUrl(mariaDBContainer.getJdbcUrl()); + config.setUsername("wmsa"); + config.setPassword("wmsa"); + + dataSource = new HikariDataSource(config); + } + + @AfterAll + public static void tearDown() { + dataSource.close(); + mariaDBContainer.close(); + } + + @Test + void getKeys() { + var apiKeyService = new ApiKeyService(dataSource); + apiKeyService.addApiKey("public domain", "bob dobbs", "bob@dobbstown.com", 30); + apiKeyService.addApiKey("public domain", "connie dobbs", "cdobbs@dobbstown.com", 15); + + var keys = apiKeyService.getApiKeys(); + System.out.println(keys); + assertEquals(2, keys.size()); + } + + @Test + void addApiKey() { + var apiKeyService = new ApiKeyService(dataSource); + apiKeyService.addApiKey("public domain", "bob dobbs", "bob@dobbstown.com", 30); + + var keys = apiKeyService.getApiKeys(); + + System.out.println(keys); + assertEquals(1, keys.size()); + + var key = keys.get(0); + + assertEquals("public domain", key.license()); + assertEquals("bob dobbs", key.name()); + assertEquals("bob@dobbstown.com", key.email()); + assertEquals(30, key.rate()); + assertNotNull(key.licenseKey()); + } + + @Test + void deleteApiKey() { + var apiKeyService = new ApiKeyService(dataSource); + apiKeyService.addApiKey("public domain", "bob dobbs", "bob@dobbstown.com", 30); + + List keys = apiKeyService.getApiKeys(); + + assertEquals(1, keys.size()); + + String licenseKey= keys.get(0).licenseKey(); + apiKeyService.deleteApiKey(licenseKey); + + keys = apiKeyService.getApiKeys(); + assertEquals(0, keys.size()); + } +} \ No newline at end of file From c22feaf42ee303f1bb606734a284fb7dbb52d300 Mon Sep 17 00:00:00 2001 From: Viktor 
Lofgren Date: Thu, 3 Aug 2023 17:58:18 +0200 Subject: [PATCH 137/157] (crawl) Make crawler limiter request a GC when throttling --- .../src/main/java/nu/marginalia/crawl/CrawlLimiter.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java index dd2122be..1b61cb0d 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java @@ -51,6 +51,9 @@ public class CrawlLimiter { else if (!oldThrottle && freeMemory < THROTTLE_TRIGGER_FREE_RAM) { newThrottle = true; logger.warn("Memory based throttling triggered"); + + // Try to GC + System.gc(); } From f01f608474cf4ee78b43894a5fb9c77d96eb473b Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 3 Aug 2023 17:58:52 +0200 Subject: [PATCH 138/157] (blacklist) Support blacklists with subdomain --- .../main/java/nu/marginalia/db/DomainBlacklistImpl.java | 2 +- .../search/siteinfo/DomainInformationService.java | 9 +++------ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/code/common/db/src/main/java/nu/marginalia/db/DomainBlacklistImpl.java b/code/common/db/src/main/java/nu/marginalia/db/DomainBlacklistImpl.java index 87be3942..8bfbca7e 100644 --- a/code/common/db/src/main/java/nu/marginalia/db/DomainBlacklistImpl.java +++ b/code/common/db/src/main/java/nu/marginalia/db/DomainBlacklistImpl.java @@ -52,7 +52,7 @@ public class DomainBlacklistImpl implements DomainBlacklist { } try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement("SELECT EC_DOMAIN.ID FROM EC_DOMAIN INNER JOIN EC_DOMAIN_BLACKLIST ON EC_DOMAIN_BLACKLIST.URL_DOMAIN = EC_DOMAIN.DOMAIN_TOP")) { + try (var stmt = connection.prepareStatement("SELECT EC_DOMAIN.ID FROM EC_DOMAIN INNER JOIN EC_DOMAIN_BLACKLIST ON 
(EC_DOMAIN_BLACKLIST.URL_DOMAIN = EC_DOMAIN.DOMAIN_TOP OR EC_DOMAIN_BLACKLIST.URL_DOMAIN = EC_DOMAIN.DOMAIN_NAME)")) { stmt.setFetchSize(1000); var rsp = stmt.executeQuery(); while (rsp.next()) { diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/siteinfo/DomainInformationService.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/siteinfo/DomainInformationService.java index 35ce81b7..7863c17b 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/siteinfo/DomainInformationService.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/siteinfo/DomainInformationService.java @@ -115,14 +115,11 @@ public class DomainInformationService { public boolean isBlacklisted(EdgeDomain domain) { try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN_BLACKLIST WHERE URL_DOMAIN=?")) { + try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN_BLACKLIST WHERE URL_DOMAIN IN (?,?)")) { stmt.setString(1, domain.domain); + stmt.setString(2, domain.toString()); var rsp = stmt.executeQuery(); - if (rsp.next()) { - return true; - } else { - return false; - } + return rsp.next(); } } } From 1d0cea1d5548f5db7f5a0a678eb3913c2f65473d Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 3 Aug 2023 17:59:57 +0200 Subject: [PATCH 139/157] (converter) GUI for dealing with user complaints --- .../nu/marginalia/control/ControlService.java | 49 +++++++- .../model/DomainComplaintCategory.java | 28 +++++ .../control/model/DomainComplaintModel.java | 17 +++ .../control/svc/DomainComplaintService.java | 112 ++++++++++++++++++ .../main/resources/static/control/style.css | 25 ++++ .../resources/templates/control/api-keys.hdb | 2 +- .../templates/control/domain-complaints.hdb | 111 +++++++++++++++++ .../control/partials/message-queue-table.hdb | 3 +- .../search/svc/SearchFlagSiteService.java | 7 +- 9 files changed, 
349 insertions(+), 5 deletions(-) create mode 100644 code/services-core/control-service/src/main/java/nu/marginalia/control/model/DomainComplaintCategory.java create mode 100644 code/services-core/control-service/src/main/java/nu/marginalia/control/model/DomainComplaintModel.java create mode 100644 code/services-core/control-service/src/main/java/nu/marginalia/control/svc/DomainComplaintService.java create mode 100644 code/services-core/control-service/src/main/resources/templates/control/domain-complaints.hdb diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java index c62631ef..1ec0e555 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java @@ -4,9 +4,12 @@ import com.google.gson.Gson; import com.google.inject.Inject; import nu.marginalia.client.ServiceMonitors; import nu.marginalia.control.model.Actor; +import nu.marginalia.control.model.DomainComplaintModel; +import nu.marginalia.control.model.ProcessHeartbeat; import nu.marginalia.control.svc.*; import nu.marginalia.db.storage.model.FileStorageId; import nu.marginalia.db.storage.model.FileStorageType; +import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.mq.MqMessageState; import nu.marginalia.mq.persistence.MqPersistence; @@ -21,7 +24,10 @@ import spark.Spark; import java.io.IOException; import java.sql.SQLException; +import java.util.Comparator; +import java.util.List; import java.util.Map; +import java.util.stream.Collectors; public class ControlService extends Service { @@ -32,6 +38,7 @@ public class ControlService extends Service { private final HeartbeatService heartbeatService; private final EventLogService eventLogService; private final ApiKeyService apiKeyService; + 
private final DomainComplaintService domainComplaintService; private final ControlActorService controlActorService; private final StaticResources staticResources; private final MessageQueueViewService messageQueueViewService; @@ -49,6 +56,7 @@ public class ControlService extends Service { MessageQueueViewService messageQueueViewService, ControlFileStorageService controlFileStorageService, ApiKeyService apiKeyService, + DomainComplaintService domainComplaintService, MqPersistence persistence ) throws IOException { @@ -57,6 +65,7 @@ public class ControlService extends Service { this.heartbeatService = heartbeatService; this.eventLogService = eventLogService; this.apiKeyService = apiKeyService; + this.domainComplaintService = domainComplaintService; var indexRenderer = rendererFactory.renderer("control/index"); var servicesRenderer = rendererFactory.renderer("control/services"); @@ -69,6 +78,7 @@ public class ControlService extends Service { var storageProcessedRenderer = rendererFactory.renderer("control/storage-processed"); var apiKeysRenderer = rendererFactory.renderer("control/api-keys"); + var domainComplaintsRenderer = rendererFactory.renderer("control/domain-complaints"); var storageDetailsRenderer = rendererFactory.renderer("control/storage-details"); var updateMessageStateRenderer = rendererFactory.renderer("control/dialog-update-message-state"); @@ -103,6 +113,7 @@ public class ControlService extends Service { final HtmlRedirect redirectToProcesses = new HtmlRedirect("/actors"); final HtmlRedirect redirectToApiKeys = new HtmlRedirect("/api-keys"); final HtmlRedirect redirectToStorage = new HtmlRedirect("/storage"); + final HtmlRedirect redirectToComplaints = new HtmlRedirect("/complaints"); Spark.post("/public/fsms/:fsm/start", controlActorService::startFsm, redirectToProcesses); Spark.post("/public/fsms/:fsm/stop", controlActorService::stopFsm, redirectToProcesses); @@ -120,6 +131,9 @@ public class ControlService extends Service { // HTML forms don't 
support the DELETE verb :-( Spark.post("/public/api-keys/:key/delete", this::deleteApiKey, redirectToApiKeys); + Spark.get("/public/complaints", this::complaintsModel, domainComplaintsRenderer::render); + Spark.post("/public/complaints/:domain", this::reviewComplaint, redirectToComplaints); + Spark.get("/public/message/:id/state", (rq, rsp) -> persistence.getMessage(Long.parseLong(rq.params("id"))), updateMessageStateRenderer::render); Spark.post("/public/message/:id/state", (rq, rsp) -> { MqMessageState state = MqMessageState.valueOf(rq.queryParams("state")); @@ -133,6 +147,35 @@ public class ControlService extends Service { monitors.subscribe(this::logMonitorStateChange); } + private Object complaintsModel(Request request, Response response) { + Map> complaintsByReviewed = + domainComplaintService.getComplaints().stream().collect(Collectors.partitioningBy(DomainComplaintModel::reviewed)); + + var reviewed = complaintsByReviewed.get(true); + var unreviewed = complaintsByReviewed.get(false); + + reviewed.sort(Comparator.comparing(DomainComplaintModel::reviewDate).reversed()); + unreviewed.sort(Comparator.comparing(DomainComplaintModel::fileDate).reversed()); + + return Map.of("complaintsNew", unreviewed, "complaintsReviewed", reviewed); + } + + private Object reviewComplaint(Request request, Response response) { + var domain = new EdgeDomain(request.params("domain")); + String action = request.queryParams("action"); + + logger.info("Reviewing complaint for domain {} with action {}", domain, action); + + switch (action) { + case "noop" -> domainComplaintService.reviewNoAction(domain); + case "appeal" -> domainComplaintService.approveAppealBlacklisting(domain); + case "blacklist" -> domainComplaintService.blacklistDomain(domain); + default -> throw new UnsupportedOperationException(); + } + + return ""; + } + private Object createApiKey(Request request, Response response) { String license = request.queryParams("license"); String name = request.queryParams("name"); @@ 
-223,7 +266,11 @@ public class ControlService extends Service { } private Object processesModel(Request request, Response response) { - return Map.of("processes", heartbeatService.getProcessHeartbeats(), + var heartbeatsAll = heartbeatService.getProcessHeartbeats(); + var byIsJob = heartbeatsAll.stream().collect(Collectors.partitioningBy(ProcessHeartbeat::isServiceJob)); + + return Map.of("processes", byIsJob.get(false), + "jobs", byIsJob.get(true), "actors", controlActorService.getActorStates(), "messages", messageQueueViewService.getLastEntries(20)); } diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/model/DomainComplaintCategory.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/DomainComplaintCategory.java new file mode 100644 index 00000000..d1743ba9 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/DomainComplaintCategory.java @@ -0,0 +1,28 @@ +package nu.marginalia.control.model; + +public enum DomainComplaintCategory { + SPAM("spam"), + FREEBOOTING("freebooting"), + BROKEN("broken"), + SHOCK("shock"), + BLACKLIST("blacklist"), + UNKNOWN("unknown"); + + private final String categoryName; + + DomainComplaintCategory(String categoryName) { + this.categoryName = categoryName; + } + + public String categoryName() { + return categoryName; + } + public static DomainComplaintCategory fromCategoryName(String categoryName) { + for (DomainComplaintCategory category : values()) { + if (category.categoryName().equals(categoryName)) { + return category; + } + } + return UNKNOWN; + } +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/model/DomainComplaintModel.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/DomainComplaintModel.java new file mode 100644 index 00000000..603b6fc8 --- /dev/null +++ 
b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/DomainComplaintModel.java @@ -0,0 +1,17 @@ +package nu.marginalia.control.model; + +public record DomainComplaintModel(String domain, + DomainComplaintCategory category, + String description, + String sample, + String decision, + String fileDate, + String reviewDate, + boolean reviewed) +{ + + public boolean isAppeal() { + return category == DomainComplaintCategory.BLACKLIST; + } + +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/DomainComplaintService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/DomainComplaintService.java new file mode 100644 index 00000000..bf36bfad --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/DomainComplaintService.java @@ -0,0 +1,112 @@ +package nu.marginalia.control.svc; + +import com.google.inject.Inject; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.control.model.DomainComplaintCategory; +import nu.marginalia.control.model.DomainComplaintModel; +import nu.marginalia.model.EdgeDomain; + +import java.sql.SQLException; +import java.sql.Timestamp; +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; + + +/** Service for handling domain complaints. 
This code has a user-facing counterpart in + * SearchFlagSiteService in search-service + */ +public class DomainComplaintService { + private final HikariDataSource dataSource; + + @Inject + public DomainComplaintService(HikariDataSource dataSource) { + this.dataSource = dataSource; + } + + public List getComplaints() { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT EC_DOMAIN.DOMAIN_NAME AS DOMAIN, CATEGORY, DESCRIPTION, SAMPLE, FILE_DATE, REVIEWED, DECISION, REVIEW_DATE + FROM DOMAIN_COMPLAINT LEFT JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_COMPLAINT.DOMAIN_ID + """)) { + List complaints = new ArrayList<>(); + var rs = stmt.executeQuery(); + while (rs.next()) { + complaints.add(new DomainComplaintModel( + rs.getString("DOMAIN"), + DomainComplaintCategory.fromCategoryName(rs.getString("CATEGORY")), + rs.getString("DESCRIPTION"), + rs.getString("SAMPLE"), + rs.getString("DECISION"), + rs.getTimestamp("FILE_DATE").toLocalDateTime().toString(), + Optional.ofNullable(rs.getTimestamp("REVIEW_DATE")) + .map(Timestamp::toLocalDateTime).map(Object::toString).orElse(null), + rs.getBoolean("REVIEWED") + )); + } + return complaints; + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + } + + public void approveAppealBlacklisting(EdgeDomain domain) { + removeFromBlacklist(domain); + setDecision(domain, "APPROVED"); + } + + public void blacklistDomain(EdgeDomain domain) { + addToBlacklist(domain); + setDecision(domain, "BLACKLISTED"); + } + + public void reviewNoAction(EdgeDomain domain) { + setDecision(domain, "REJECTED"); + } + + private void addToBlacklist(EdgeDomain domain) { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + INSERT IGNORE INTO EC_DOMAIN_BLACKLIST (URL_DOMAIN) VALUES (?) 
+ """)) { + stmt.setString(1, domain.toString()); + stmt.executeUpdate(); + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + } + + private void removeFromBlacklist(EdgeDomain domain) { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + DELETE FROM EC_DOMAIN_BLACKLIST WHERE URL_DOMAIN=? + """)) { + stmt.setString(1, domain.toString()); + stmt.addBatch(); + stmt.setString(1, domain.domain); + stmt.executeBatch(); + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + } + + private void setDecision(EdgeDomain domain, String decision) { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + UPDATE DOMAIN_COMPLAINT SET DECISION=?, REVIEW_DATE=NOW() + WHERE DOMAIN_ID=(SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?) + AND DECISION IS NULL + """)) { + stmt.setString(1, decision); + stmt.setString(2, domain.toString()); + stmt.executeUpdate(); + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + } +} diff --git a/code/services-core/control-service/src/main/resources/static/control/style.css b/code/services-core/control-service/src/main/resources/static/control/style.css index e3722cb4..a248a499 100644 --- a/code/services-core/control-service/src/main/resources/static/control/style.css +++ b/code/services-core/control-service/src/main/resources/static/control/style.css @@ -49,9 +49,34 @@ table { } th { text-align: left; } td,th { padding-right: 1ch; border: 1px solid #ccc; } + tr:nth-of-type(2n) { background-color: #eee; } + + +table.table-rh-2 tr:nth-of-type(4n+1) { background-color: #eee; } +table.table-rh-2 tr:nth-of-type(4n+2) { background-color: #eee; } +table.table-rh-2 tr:nth-of-type(4n+3) { background-color: unset; } +table.table-rh-2 tr:nth-of-type(4n) { background-color: unset; } + +table.table-rh-2 tr:nth-of-type(4n) td, +table.table-rh-2 tr:nth-of-type(4n) th { border-bottom: 1px solid #888; } +table.table-rh-2 tr:nth-of-type(4n+2) td, 
+table.table-rh-2 tr:nth-of-type(4n+2) th { border-bottom: 1px solid #888; } + +table.table-rh-3 tr:nth-of-type(6n+1) { background-color: #eee; } +table.table-rh-3 tr:nth-of-type(6n+2) { background-color: #eee; } +table.table-rh-3 tr:nth-of-type(6n+3) { background-color: #eee; } +table.table-rh-3 tr:nth-of-type(6n+4) { background-color: unset; } +table.table-rh-3 tr:nth-of-type(6n+5) { background-color: unset; } +table.table-rh-3 tr:nth-of-type(6n) { background-color: unset; } + +table.table-rh-3 tr:nth-of-type(6n) td, +table.table-rh-3 tr:nth-of-type(6n) th { border-bottom: 1px solid #888; } +table.table-rh-3 tr:nth-of-type(6n+3) td, +table.table-rh-3 tr:nth-of-type(6n+3) th { border-bottom: 1px solid #888; } + body > nav { grid-area: left; } diff --git a/code/services-core/control-service/src/main/resources/templates/control/api-keys.hdb b/code/services-core/control-service/src/main/resources/templates/control/api-keys.hdb index 5361fb82..e58b6b8a 100644 --- a/code/services-core/control-service/src/main/resources/templates/control/api-keys.hdb +++ b/code/services-core/control-service/src/main/resources/templates/control/api-keys.hdb @@ -10,7 +10,7 @@

    API Keys

    - +
    diff --git a/code/services-core/control-service/src/main/resources/templates/control/domain-complaints.hdb b/code/services-core/control-service/src/main/resources/templates/control/domain-complaints.hdb new file mode 100644 index 00000000..ac1f6c88 --- /dev/null +++ b/code/services-core/control-service/src/main/resources/templates/control/domain-complaints.hdb @@ -0,0 +1,111 @@ + + + + Control Service + + + + +{{> control/partials/nav}} +
    + +

    Domain Complaints

    + {{#unless complaintsNew}} +

    No new complaints!

    + {{/unless}} + {{#if complaintsNew}} +
    Key  
    + + + + + + + + + + + + + + {{#each complaintsNew}} + + + + + + + + + + + + + + {{/each}} +
    DateCategory
    DomainSample
    Description
    {{fileDate}}{{category}} +
    + + + +
    +
    {{domain}}{{sample}}
    {{description}}
    + {{/if}} + + {{#if complaintsReviewed}} +

    Review Log

    + + + + + + + + + + + + + + + {{#each complaintsReviewed}} + + + + + + + + + + + + + {{/each}} +
    Review DateCategoryAction
    DomainSample
    Description
    {{fileDate}}{{category}} + {{decision}} +
    {{domain}}{{sample}}
    {{description}}
    + {{/if}} +
    + + + + \ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/partials/message-queue-table.hdb b/code/services-core/control-service/src/main/resources/templates/control/partials/message-queue-table.hdb index cc8d98a2..b971c928 100644 --- a/code/services-core/control-service/src/main/resources/templates/control/partials/message-queue-table.hdb +++ b/code/services-core/control-service/src/main/resources/templates/control/partials/message-queue-table.hdb @@ -1,6 +1,6 @@

    Message Queue

    - +
    @@ -9,6 +9,7 @@ + {{#each messages}} diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchFlagSiteService.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchFlagSiteService.java index 5eb960a5..33d0165d 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchFlagSiteService.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchFlagSiteService.java @@ -17,6 +17,9 @@ import java.util.Map; import java.util.function.Function; import java.util.stream.Collectors; +/** Service for handling site flagging. This code has an admin-facing counterpart in + * DomainComplaintService in control-service + */ public class SearchFlagSiteService { private final MustacheRenderer formTemplate; private final HikariDataSource dataSource; @@ -83,9 +86,9 @@ public class SearchFlagSiteService { try (var conn = dataSource.getConnection(); var complaintsStmt = conn.prepareStatement(""" - SELECT CATEGORY, FILE_DATE, REVIEWED, DECISION + SELECT CATEGORY, FILE_DATE, REVIEWED, DECISION FROM DOMAIN_COMPLAINT - WHERE DOMAIN_ID=? + WHERE DOMAIN_ID=? 
"""); var stmt = conn.prepareStatement( """ From 624b78ec3a588f4279d309998a79321eea7f84f6 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 4 Aug 2023 14:40:06 +0200 Subject: [PATCH 140/157] (heartbeat) Task heartbeats --- .../migration/V23_07_0_007__task_status.sql | 10 + .../service/control/ServiceHeartbeat.java | 11 +- .../service/control/ServiceTaskHeartbeat.java | 184 ++++++++++++++++++ .../features-index/index-forward/build.gradle | 1 + .../index/forward/ForwardIndexConverter.java | 28 ++- .../features-index/index-reverse/build.gradle | 1 + .../index/full/ReverseIndexFullConverter.java | 47 +++-- .../ReverseIndexPriorityConverter.java | 62 +++--- .../ReverseIndexFullConverterTest.java | 6 +- .../ReverseIndexFullConverterTest2.java | 6 +- .../ReverseIndexPriorityConverterTest2.java | 6 +- .../nu/marginalia/control/ControlService.java | 8 +- .../monitor/ProcessLivenessMonitorActor.java | 33 +++- .../control/model/ProcessHeartbeat.java | 12 +- .../control/model/TaskHeartbeat.java | 29 +++ .../control/svc/HeartbeatService.java | 45 ++++- .../control/svc/ProcessService.java | 3 +- .../resources/templates/control/actors.hdb | 2 +- .../control/partials/processes-table.hdb | 21 +- .../index/IndexServicesFactory.java | 36 +++- .../index/svc/IndexSearchSetsService.java | 36 +++- ...ndexQueryServiceIntegrationTestModule.java | 1 + 22 files changed, 515 insertions(+), 73 deletions(-) create mode 100644 code/common/db/src/main/resources/db/migration/V23_07_0_007__task_status.sql create mode 100644 code/common/service/src/main/java/nu/marginalia/service/control/ServiceTaskHeartbeat.java create mode 100644 code/services-core/control-service/src/main/java/nu/marginalia/control/model/TaskHeartbeat.java diff --git a/code/common/db/src/main/resources/db/migration/V23_07_0_007__task_status.sql b/code/common/db/src/main/resources/db/migration/V23_07_0_007__task_status.sql new file mode 100644 index 00000000..7c7ec175 --- /dev/null +++ 
b/code/common/db/src/main/resources/db/migration/V23_07_0_007__task_status.sql @@ -0,0 +1,10 @@ +CREATE TABLE IF NOT EXISTS TASK_HEARTBEAT ( + TASK_NAME VARCHAR(255) PRIMARY KEY COMMENT "Full name of the task, including node id if applicable, e.g. reconvert:0", + TASK_BASE VARCHAR(255) NOT NULL COMMENT "Base name of the task, e.g. reconvert", + INSTANCE VARCHAR(255) NOT NULL COMMENT "UUID of the task instance", + SERVICE_INSTANCE VARCHAR(255) NOT NULL COMMENT "UUID of the parent service", + STATUS ENUM ('STARTING', 'RUNNING', 'STOPPED') NOT NULL DEFAULT 'STARTING' COMMENT "Status of the task", + PROGRESS INT NOT NULL DEFAULT 0 COMMENT "Progress of the task", + STAGE_NAME VARCHAR(255) DEFAULT "", + HEARTBEAT_TIME TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6) COMMENT "Task was last seen at this point" +); diff --git a/code/common/service/src/main/java/nu/marginalia/service/control/ServiceHeartbeat.java b/code/common/service/src/main/java/nu/marginalia/service/control/ServiceHeartbeat.java index ff5c8755..de146926 100644 --- a/code/common/service/src/main/java/nu/marginalia/service/control/ServiceHeartbeat.java +++ b/code/common/service/src/main/java/nu/marginalia/service/control/ServiceHeartbeat.java @@ -10,7 +10,8 @@ import org.slf4j.LoggerFactory; import java.sql.SQLException; import java.util.concurrent.TimeUnit; -/** This service sends a heartbeat to the database every 5 seconds. +/** This service sends a heartbeat to the database every 5 seconds, + * updating the control service with the liveness information for the service. 
*/ @Singleton public class ServiceHeartbeat { @@ -18,6 +19,7 @@ public class ServiceHeartbeat { private final String serviceName; private final String serviceBase; private final String instanceUUID; + private final ServiceConfiguration configuration; private final HikariDataSource dataSource; @@ -32,6 +34,7 @@ public class ServiceHeartbeat { { this.serviceName = configuration.serviceName() + ":" + configuration.node(); this.serviceBase = configuration.serviceName(); + this.configuration = configuration; this.dataSource = dataSource; this.instanceUUID = configuration.instanceUuid().toString(); @@ -41,6 +44,11 @@ public class ServiceHeartbeat { Runtime.getRuntime().addShutdownHook(new Thread(this::shutDown)); } + public <T extends Enum<T>> ServiceTaskHeartbeat<T> createServiceProcessHeartbeat(Class<T> steps, String processName) { + return new ServiceTaskHeartbeat<>(steps, configuration, processName, dataSource); + } + + public void start() { if (!running) { runnerThread.start(); @@ -142,4 +150,5 @@ public class ServiceHeartbeat { } } } + } diff --git a/code/common/service/src/main/java/nu/marginalia/service/control/ServiceTaskHeartbeat.java b/code/common/service/src/main/java/nu/marginalia/service/control/ServiceTaskHeartbeat.java new file mode 100644 index 00000000..a460bc1c --- /dev/null +++ b/code/common/service/src/main/java/nu/marginalia/service/control/ServiceTaskHeartbeat.java @@ -0,0 +1,184 @@ +package nu.marginalia.service.control; + + +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.service.module.ServiceConfiguration; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.sql.SQLException; +import java.util.UUID; +import java.util.concurrent.TimeUnit; + +/** This object sends a heartbeat to the database every few seconds, + * updating with the progress of a task within a service. Progress is tracked by providing + * enumerations corresponding to the steps in the task. 
It's important they're arranged in the same + * order as the steps in the task in order to get an accurate progress tracking. + */ +public class ServiceTaskHeartbeat<T extends Enum<T>> implements AutoCloseable { + private final Logger logger = LoggerFactory.getLogger(ServiceTaskHeartbeat.class); + private final String taskName; + private final String taskBase; + private final String instanceUUID; + private final HikariDataSource dataSource; + + + private final Thread runnerThread; + private final int heartbeatInterval = Integer.getInteger("mcp.heartbeat.interval", 1); + private final String serviceInstanceUUID; + private final int stepCount; + + private volatile boolean running = false; + private volatile int stepNum = 0; + private volatile String step = "-"; + + ServiceTaskHeartbeat(Class<T> stepClass, + ServiceConfiguration configuration, + String taskName, + HikariDataSource dataSource) + { + this.taskName = configuration.serviceName() + "." + taskName + ":" + configuration.node(); + this.taskBase = configuration.serviceName() + "." + taskName; + this.dataSource = dataSource; + + this.instanceUUID = UUID.randomUUID().toString(); + this.serviceInstanceUUID = configuration.instanceUuid().toString(); + + this.stepCount = stepClass.getEnumConstants().length; + + runnerThread = new Thread(this::run); + runnerThread.start(); + } + + /** Update the progress of the task. This is a fast function that doesn't block; + * the actual update is done in a separate thread. + * + * @param step The current step in the task. 
+ */ + public void progress(T step) { + this.step = step.name(); + + // off by one since we calculate the progress based on the number of steps, + // and Enum.ordinal() is zero-based (so the 5th step in a 5 step task is 4, not 5; resulting in the + // final progress being 80% and not 100%) + + this.stepNum = 1 + step.ordinal(); + + logger.info("ServiceTask {} progress: {}", taskBase, step.name()); + } + + public void shutDown() { + if (!running) + return; + + running = false; + + try { + runnerThread.join(); + heartbeatStop(); + } + catch (InterruptedException|SQLException ex) { + logger.warn("ServiceHeartbeat shutdown failed", ex); + } + } + + private void run() { + if (!running) + running = true; + else + return; + + try { + heartbeatInit(); + + while (running) { + try { + heartbeatUpdate(); + } + catch (SQLException ex) { + logger.warn("ServiceHeartbeat failed to update", ex); + } + + TimeUnit.SECONDS.sleep(heartbeatInterval); + } + } + catch (InterruptedException|SQLException ex) { + logger.error("ServiceHeartbeat caught irrecoverable exception, killing service", ex); + System.exit(255); + } + } + + private void heartbeatInit() throws SQLException { + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement( + """ + INSERT INTO TASK_HEARTBEAT (TASK_NAME, TASK_BASE, INSTANCE, SERVICE_INSTANCE, HEARTBEAT_TIME, STATUS) + VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP(6), 'STARTING') + ON DUPLICATE KEY UPDATE + INSTANCE = ?, + SERVICE_INSTANCE = ?, + HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), + STATUS = 'STARTING' + """ + )) + { + stmt.setString(1, taskName); + stmt.setString(2, taskBase); + stmt.setString(3, instanceUUID); + stmt.setString(4, serviceInstanceUUID); + stmt.setString(5, instanceUUID); + stmt.setString(6, serviceInstanceUUID); + stmt.executeUpdate(); + } + } + } + + private void heartbeatUpdate() throws SQLException { + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement( + """ 
+ UPDATE TASK_HEARTBEAT + SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), + STATUS = 'RUNNING', + PROGRESS = ?, + STAGE_NAME = ? + WHERE INSTANCE = ? + """) + ) + { + stmt.setInt(1, (int) Math.round(100 * stepNum / (double) stepCount)); + stmt.setString(2, step); + stmt.setString(3, instanceUUID); + stmt.executeUpdate(); + } + } + } + + private void heartbeatStop() throws SQLException { + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement( + """ + UPDATE TASK_HEARTBEAT + SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), + STATUS='STOPPED', + PROGRESS = ?, + STAGE_NAME = ? + WHERE INSTANCE = ? + """) + ) + { + stmt.setInt(1, (int) Math.round(100 * stepNum / (double) stepCount)); + stmt.setString( 2, step); + stmt.setString( 3, instanceUUID); + stmt.executeUpdate(); + } + } + } + + @Override + public void close() { + shutDown(); + } + +} + diff --git a/code/features-index/index-forward/build.gradle b/code/features-index/index-forward/build.gradle index e7a34566..299c6496 100644 --- a/code/features-index/index-forward/build.gradle +++ b/code/features-index/index-forward/build.gradle @@ -18,6 +18,7 @@ dependencies { implementation project(':code:features-index:index-journal') implementation project(':code:features-index:lexicon') implementation project(':code:common:model') + implementation project(':code:common:service') implementation project(':third-party:uppend') diff --git a/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexConverter.java b/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexConverter.java index a82d2ea6..1496a653 100644 --- a/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexConverter.java +++ b/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexConverter.java @@ -7,6 +7,7 @@ import nu.marginalia.array.LongArray; import 
nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.ranking.DomainRankings; +import nu.marginalia.service.control.ServiceHeartbeat; import org.roaringbitmap.IntConsumer; import org.roaringbitmap.RoaringBitmap; import org.slf4j.Logger; @@ -19,6 +20,7 @@ import java.nio.file.Path; public class ForwardIndexConverter { + private final ServiceHeartbeat heartbeat; private final File inputFile; private final Logger logger = LoggerFactory.getLogger(getClass()); @@ -28,18 +30,27 @@ public class ForwardIndexConverter { private final DomainRankings domainRankings; - public ForwardIndexConverter( + public ForwardIndexConverter(ServiceHeartbeat heartbeat, File inputFile, Path outputFileDocsId, Path outputFileDocsData, DomainRankings domainRankings ) { + this.heartbeat = heartbeat; this.inputFile = inputFile; this.outputFileDocsId = outputFileDocsId; this.outputFileDocsData = outputFileDocsData; this.domainRankings = domainRankings; } + public enum TaskSteps { + GET_DOC_IDS, + GATHER_OFFSETS, + SUPPLEMENTAL_INDEXES, + FORCE, + FINISHED + } + public void convert() throws IOException { deleteOldFiles(); @@ -53,18 +64,21 @@ public class ForwardIndexConverter { logger.info("Domain Rankings size = {}", domainRankings.size()); - try { + try (var progress = heartbeat.createServiceProcessHeartbeat(TaskSteps.class, "forwardIndexConverter")) { + progress.progress(TaskSteps.GET_DOC_IDS); + LongArray docsFileId = getDocIds(outputFileDocsId, journalReader); + progress.progress(TaskSteps.GATHER_OFFSETS); + // doc ids -> sorted list of ids - logger.info("Gathering Offsets"); Long2IntOpenHashMap docIdToIdx = new Long2IntOpenHashMap((int) docsFileId.size()); docsFileId.forEach(0, docsFileId.size(), (pos, val) -> docIdToIdx.put(val, (int) pos)); - // docIdToIdx -> file offset for id + progress.progress(TaskSteps.SUPPLEMENTAL_INDEXES); - logger.info("Creating Supplementary Indexes"); + // docIdToIdx -> 
file offset for id LongArray docFileData = LongArray.mmapForWriting(outputFileDocsData, ForwardIndexParameters.ENTRY_SIZE * docsFileId.size()); @@ -78,11 +92,15 @@ public class ForwardIndexConverter { docFileData.set(entryOffset + ForwardIndexParameters.DOMAIN_OFFSET, entry.domainId()); }); + progress.progress(TaskSteps.FORCE); + docFileData.force(); docsFileId.force(); docFileData.advice(NativeIO.Advice.DontNeed); docsFileId.advice(NativeIO.Advice.DontNeed); + + progress.progress(TaskSteps.FINISHED); } catch (IOException ex) { logger.error("Failed to convert", ex); throw ex; diff --git a/code/features-index/index-reverse/build.gradle b/code/features-index/index-reverse/build.gradle index 3ef67762..d2e3b233 100644 --- a/code/features-index/index-reverse/build.gradle +++ b/code/features-index/index-reverse/build.gradle @@ -20,6 +20,7 @@ dependencies { implementation project(':code:features-index:index-journal') implementation project(':code:features-index:lexicon') implementation project(':code:common:model') + implementation project(':code:common:service') implementation libs.lombok annotationProcessor libs.lombok diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullConverter.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullConverter.java index 339e1c39..c8cafcde 100644 --- a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullConverter.java +++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullConverter.java @@ -21,11 +21,14 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.StandardOpenOption; +import nu.marginalia.service.control.ServiceHeartbeat; + import static nu.marginalia.index.full.ReverseIndexFullParameters.bTreeContext; public class ReverseIndexFullConverter { private static final int RWF_BIN_SIZE = 10_000_000; + private final ServiceHeartbeat heartbeat; 
private final Path tmpFileDir; private final Logger logger = LoggerFactory.getLogger(getClass()); @@ -36,11 +39,13 @@ public class ReverseIndexFullConverter { private final Path outputFileDocs; private final SortingContext sortingContext; - public ReverseIndexFullConverter(Path tmpFileDir, + public ReverseIndexFullConverter(ServiceHeartbeat heartbeat, + Path tmpFileDir, IndexJournalReader journalReader, DomainRankings domainRankings, Path outputFileWords, Path outputFileDocs) { + this.heartbeat = heartbeat; this.tmpFileDir = tmpFileDir; this.journalReader = journalReader; this.domainRankings = domainRankings; @@ -49,6 +54,18 @@ public class ReverseIndexFullConverter { this.sortingContext = new SortingContext(tmpFileDir, 64_000); } + public enum TaskSteps { + ACCUMULATE_STATISTICS, + INCREMENT_OFFSETS, + COUNT_OFFSETS, + CREATE_INTERMEDIATE_DOCS, + SORT_INTERMEDIATE_DOCS, + SIZING, + FINALIZING_DOCS, + FORCE, + FINISHED, + } + public void convert() throws IOException { deleteOldFiles(); @@ -57,28 +74,32 @@ public class ReverseIndexFullConverter { return; } - final IndexJournalStatistics statistics = journalReader.getStatistics(); - final Path intermediateUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat"); + try (var progress = heartbeat.createServiceProcessHeartbeat(TaskSteps.class, "reverseIndexFullConverter")) { + progress.progress(TaskSteps.ACCUMULATE_STATISTICS); - try { + final IndexJournalStatistics statistics = journalReader.getStatistics(); final long wordsFileSize = statistics.highestWord() + 1; + progress.progress(TaskSteps.INCREMENT_OFFSETS); + logger.debug("Words file size: {}", wordsFileSize); // Create a count of how many documents has contains each word final LongArray wordsOffsets = LongArray.allocate(wordsFileSize); - logger.info("Gathering Offsets"); journalReader.forEachWordId(wordsOffsets::increment); + progress.progress(TaskSteps.COUNT_OFFSETS); + wordsOffsets.transformEach(0, wordsFileSize, new 
CountToOffsetTransformer(ReverseIndexFullParameters.ENTRY_SIZE)); + progress.progress(TaskSteps.CREATE_INTERMEDIATE_DOCS); + // Construct an intermediate representation of the reverse documents index try (FileChannel intermediateDocChannel = (FileChannel) Files.newByteChannel(intermediateUrlsFile, StandardOpenOption.CREATE, StandardOpenOption.READ, StandardOpenOption.WRITE)) { - logger.info("Creating Intermediate Docs File"); // Construct intermediate index try (RandomWriteFunnel intermediateDocumentWriteFunnel = new RandomWriteFunnel(tmpFileDir, RWF_BIN_SIZE); @@ -89,8 +110,7 @@ public class ReverseIndexFullConverter { intermediateDocumentWriteFunnel.write(intermediateDocChannel); } intermediateDocChannel.force(false); - - logger.info("Sorting Intermediate Docs File"); + progress.progress(TaskSteps.SORT_INTERMEDIATE_DOCS); // Sort each segment of the intermediate file { @@ -102,28 +122,29 @@ public class ReverseIndexFullConverter { intermediateDocs.force(); } - - logger.info("Sizing"); + progress.progress(TaskSteps.SIZING); IndexSizeEstimator sizeEstimator = new IndexSizeEstimator( ReverseIndexFullParameters.bTreeContext, ReverseIndexFullParameters.ENTRY_SIZE); wordsOffsets.fold(0, 0, wordsOffsets.size(), sizeEstimator); - - logger.info("Finalizing Docs File"); + progress.progress(TaskSteps.FINALIZING_DOCS); LongArray finalDocs = LongArray.mmapForWriting(outputFileDocs, sizeEstimator.size); // Construct the proper reverse index wordsOffsets.transformEachIO(0, wordsOffsets.size(), new ReverseIndexBTreeTransformer(finalDocs, ReverseIndexFullParameters.ENTRY_SIZE, bTreeContext, intermediateDocChannel)); wordsOffsets.write(outputFileWords); + progress.progress(TaskSteps.FORCE); + // Attempt to clean up before forcing (important disk space preservation) Files.deleteIfExists(intermediateUrlsFile); wordsOffsets.force(); finalDocs.force(); - logger.info("Done"); + + progress.progress(TaskSteps.FINISHED); } } catch (IOException ex) { diff --git 
a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityConverter.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityConverter.java index fbd49405..d5ee0f88 100644 --- a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityConverter.java +++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityConverter.java @@ -12,6 +12,7 @@ import nu.marginalia.index.journal.model.IndexJournalStatistics; import nu.marginalia.index.journal.reader.IndexJournalReader; import nu.marginalia.ranking.DomainRankings; import nu.marginalia.rwf.RandomWriteFunnel; +import nu.marginalia.service.control.ServiceHeartbeat; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -21,9 +22,12 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.StandardOpenOption; +import static nu.marginalia.index.priority.ReverseIndexPriorityParameters.bTreeContext; + public class ReverseIndexPriorityConverter { private static final int RWF_BIN_SIZE = 10_000_000; + private final ServiceHeartbeat heartbeat; private final Path tmpFileDir; private final Logger logger = LoggerFactory.getLogger(getClass()); @@ -34,11 +38,13 @@ public class ReverseIndexPriorityConverter { private final Path outputFileDocs; private final SortingContext sortingContext; - public ReverseIndexPriorityConverter(Path tmpFileDir, + public ReverseIndexPriorityConverter(ServiceHeartbeat heartbeat, + Path tmpFileDir, IndexJournalReader journalReader, DomainRankings domainRankings, Path outputFileWords, Path outputFileDocs) { + this.heartbeat = heartbeat; this.tmpFileDir = tmpFileDir; this.journalReader = journalReader; this.domainRankings = domainRankings; @@ -47,6 +53,18 @@ public class ReverseIndexPriorityConverter { this.sortingContext = new SortingContext(tmpFileDir, 64_000); } + public enum TaskSteps { + ACCUMULATE_STATISTICS, + 
INCREMENT_OFFSETS, + COUNT_OFFSETS, + CREATE_INTERMEDIATE_DOCS, + SORT_INTERMEDIATE_DOCS, + SIZING, + FINALIZING_DOCS, + FORCE, + FINISHED, + } + public void convert() throws IOException { deleteOldFiles(); @@ -55,28 +73,32 @@ public class ReverseIndexPriorityConverter { return; } - final IndexJournalStatistics statistics = journalReader.getStatistics(); - final Path intermediateUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat"); + try (var progress = heartbeat.createServiceProcessHeartbeat(TaskSteps.class, "reverseIndexPriorityConverter")) { + progress.progress(TaskSteps.ACCUMULATE_STATISTICS); - try { + final IndexJournalStatistics statistics = journalReader.getStatistics(); final long wordsFileSize = statistics.highestWord() + 1; + progress.progress(TaskSteps.INCREMENT_OFFSETS); + logger.debug("Words file size: {}", wordsFileSize); // Create a count of how many documents has contains each word final LongArray wordsOffsets = LongArray.allocate(wordsFileSize); - logger.info("Gathering Offsets"); journalReader.forEachWordId(wordsOffsets::increment); + progress.progress(TaskSteps.COUNT_OFFSETS); + wordsOffsets.transformEach(0, wordsFileSize, new CountToOffsetTransformer(ReverseIndexPriorityParameters.ENTRY_SIZE)); + progress.progress(TaskSteps.CREATE_INTERMEDIATE_DOCS); + // Construct an intermediate representation of the reverse documents index try (FileChannel intermediateDocChannel = (FileChannel) Files.newByteChannel(intermediateUrlsFile, StandardOpenOption.CREATE, StandardOpenOption.READ, StandardOpenOption.WRITE)) { - logger.info("Creating Intermediate Docs File"); // Construct intermediate index try (RandomWriteFunnel intermediateDocumentWriteFunnel = new RandomWriteFunnel(tmpFileDir, RWF_BIN_SIZE); @@ -87,8 +109,7 @@ public class ReverseIndexPriorityConverter { intermediateDocumentWriteFunnel.write(intermediateDocChannel); } intermediateDocChannel.force(false); - - logger.info("Sorting Intermediate Docs File"); + 
progress.progress(TaskSteps.SORT_INTERMEDIATE_DOCS); // Sort each segment of the intermediate file { @@ -100,32 +121,29 @@ public class ReverseIndexPriorityConverter { intermediateDocs.force(); } + progress.progress(TaskSteps.SIZING); - logger.info("Sizing"); - - IndexSizeEstimator indexSizeEstimator = new IndexSizeEstimator( - ReverseIndexPriorityParameters.bTreeContext, + IndexSizeEstimator sizeEstimator = new IndexSizeEstimator( + bTreeContext, ReverseIndexPriorityParameters.ENTRY_SIZE); - wordsOffsets.fold(0, 0, wordsOffsets.size(), indexSizeEstimator); + wordsOffsets.fold(0, 0, wordsOffsets.size(), sizeEstimator); + progress.progress(TaskSteps.FINALIZING_DOCS); - logger.info("Finalizing Docs File"); - - LongArray finalDocs = LongArray.mmapForWriting(outputFileDocs, indexSizeEstimator.size); + LongArray finalDocs = LongArray.mmapForWriting(outputFileDocs, sizeEstimator.size); // Construct the proper reverse index - wordsOffsets.transformEachIO(0, wordsOffsets.size(), - new ReverseIndexBTreeTransformer(finalDocs, - ReverseIndexPriorityParameters.ENTRY_SIZE, - ReverseIndexPriorityParameters.bTreeContext, - intermediateDocChannel)); + wordsOffsets.transformEachIO(0, wordsOffsets.size(), new ReverseIndexBTreeTransformer(finalDocs, ReverseIndexPriorityParameters.ENTRY_SIZE, bTreeContext, intermediateDocChannel)); wordsOffsets.write(outputFileWords); + progress.progress(TaskSteps.FORCE); + // Attempt to clean up before forcing (important disk space preservation) Files.deleteIfExists(intermediateUrlsFile); wordsOffsets.force(); finalDocs.force(); - logger.info("Done"); + + progress.progress(TaskSteps.FINISHED); } } catch (IOException ex) { diff --git a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest.java b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest.java index 01df3e2f..6212dc8a 100644 --- 
a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest.java +++ b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest.java @@ -13,9 +13,11 @@ import nu.marginalia.ranking.DomainRankings; import nu.marginalia.lexicon.KeywordLexicon; import nu.marginalia.lexicon.journal.KeywordLexiconJournal; import nu.marginalia.model.idx.DocumentMetadata; +import nu.marginalia.service.control.ServiceHeartbeat; import nu.marginalia.test.TestUtil; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import org.mockito.Mockito; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -84,7 +86,9 @@ class ReverseIndexFullConverterTest { var docsFile = dataDir.resolve("docs.dat"); var journalReader = new IndexJournalReaderSingleCompressedFile(indexFile); - new ReverseIndexFullConverter(tmpDir, journalReader, new DomainRankings(), wordsFile, docsFile) + new ReverseIndexFullConverter( + Mockito.mock(ServiceHeartbeat.class), + tmpDir, journalReader, new DomainRankings(), wordsFile, docsFile) .convert(); var reverseIndexReader = new ReverseIndexFullReader(wordsFile, docsFile); diff --git a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest2.java b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest2.java index a99ab674..ab8be8ea 100644 --- a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest2.java +++ b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest2.java @@ -14,10 +14,12 @@ import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode; import nu.marginalia.ranking.DomainRankings; import nu.marginalia.lexicon.KeywordLexicon; import nu.marginalia.lexicon.journal.KeywordLexiconJournal; +import nu.marginalia.service.control.ServiceHeartbeat; 
import nu.marginalia.test.TestUtil; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import org.mockito.Mockito; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -117,7 +119,7 @@ class ReverseIndexFullConverterTest2 { Path tmpDir = Path.of("/tmp"); - new ReverseIndexFullConverter(tmpDir, new IndexJournalReaderSingleCompressedFile(indexFile), new DomainRankings(), wordsFile, docsFile).convert(); + new ReverseIndexFullConverter(Mockito.mock(ServiceHeartbeat.class), tmpDir, new IndexJournalReaderSingleCompressedFile(indexFile), new DomainRankings(), wordsFile, docsFile).convert(); var reverseReader = new ReverseIndexFullReader(wordsFile, docsFile); @@ -142,7 +144,7 @@ class ReverseIndexFullConverterTest2 { Path tmpDir = Path.of("/tmp"); - new ReverseIndexFullConverter(tmpDir, new IndexJournalReaderSingleCompressedFile(indexFile, null, ReverseIndexPriorityParameters::filterPriorityRecord), new DomainRankings(), wordsFile, docsFile).convert(); + new ReverseIndexFullConverter(Mockito.mock(ServiceHeartbeat.class), tmpDir, new IndexJournalReaderSingleCompressedFile(indexFile, null, ReverseIndexPriorityParameters::filterPriorityRecord), new DomainRankings(), wordsFile, docsFile).convert(); var reverseReader = new ReverseIndexFullReader(wordsFile, docsFile); diff --git a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexPriorityConverterTest2.java b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexPriorityConverterTest2.java index 1f9763c8..29f9959b 100644 --- a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexPriorityConverterTest2.java +++ b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexPriorityConverterTest2.java @@ -14,10 +14,12 @@ import nu.marginalia.lexicon.KeywordLexicon; import nu.marginalia.lexicon.journal.KeywordLexiconJournal; 
import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode; import nu.marginalia.ranking.DomainRankings; +import nu.marginalia.service.control.ServiceHeartbeat; import nu.marginalia.test.TestUtil; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import org.mockito.Mockito; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -117,7 +119,7 @@ class ReverseIndexPriorityConverterTest2 { Path tmpDir = Path.of("/tmp"); - new ReverseIndexPriorityConverter(tmpDir, new IndexJournalReaderSingleCompressedFile(indexFile), new DomainRankings(), wordsFile, docsFile).convert(); + new ReverseIndexPriorityConverter(Mockito.mock(ServiceHeartbeat.class), tmpDir, new IndexJournalReaderSingleCompressedFile(indexFile), new DomainRankings(), wordsFile, docsFile).convert(); var reverseReader = new ReverseIndexPriorityReader(wordsFile, docsFile); @@ -142,7 +144,7 @@ class ReverseIndexPriorityConverterTest2 { Path tmpDir = Path.of("/tmp"); - new ReverseIndexPriorityConverter(tmpDir, new IndexJournalReaderSingleCompressedFile(indexFile, null, ReverseIndexPriorityParameters::filterPriorityRecord), new DomainRankings(), wordsFile, docsFile).convert(); + new ReverseIndexPriorityConverter(Mockito.mock(ServiceHeartbeat.class), tmpDir, new IndexJournalReaderSingleCompressedFile(indexFile, null, ReverseIndexPriorityParameters::filterPriorityRecord), new DomainRankings(), wordsFile, docsFile).convert(); var reverseReader = new ReverseIndexPriorityReader(wordsFile, docsFile); diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java index 1ec0e555..78ec0e5f 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java @@ -266,11 +266,11 @@ public 
class ControlService extends Service { } private Object processesModel(Request request, Response response) { - var heartbeatsAll = heartbeatService.getProcessHeartbeats(); - var byIsJob = heartbeatsAll.stream().collect(Collectors.partitioningBy(ProcessHeartbeat::isServiceJob)); + var processes = heartbeatService.getProcessHeartbeats(); + var jobs = heartbeatService.getTaskHeartbeats(); - return Map.of("processes", byIsJob.get(false), - "jobs", byIsJob.get(true), + return Map.of("processes", processes, + "jobs", jobs, "actors", controlActorService.getActorStates(), "messages", messageQueueViewService.getLastEntries(20)); } diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/ProcessLivenessMonitorActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/ProcessLivenessMonitorActor.java index 1623fd49..1098a085 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/ProcessLivenessMonitorActor.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/ProcessLivenessMonitorActor.java @@ -3,6 +3,7 @@ package nu.marginalia.control.actor.monitor; import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.control.model.ProcessHeartbeat; +import nu.marginalia.control.model.ServiceHeartbeat; import nu.marginalia.control.svc.HeartbeatService; import nu.marginalia.control.svc.ProcessService; import nu.marginalia.mqsm.StateFactory; @@ -11,6 +12,7 @@ import nu.marginalia.mqsm.graph.GraphState; import nu.marginalia.mqsm.graph.ResumeBehavior; import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; @Singleton public class ProcessLivenessMonitorActor extends AbstractStateGraph { @@ -46,12 +48,33 @@ public class ProcessLivenessMonitorActor extends AbstractStateGraph { public void monitor() throws Exception { for (;;) { - var processHeartbeats = 
heartbeatService.getProcessHeartbeats(); + for (var heartbeat : heartbeatService.getProcessHeartbeats()) { + if (!heartbeat.isRunning()) { + continue; + } + + var processId = heartbeat.getProcessId(); + if (null == processId) + continue; + + if (processService.isRunning(processId) && heartbeat.lastSeenMillis() < 10000) { + continue; + } + + heartbeatService.flagProcessAsStopped(heartbeat); + } + + var livingServices = heartbeatService.getServiceHeartbeats().stream() + .filter(ServiceHeartbeat::alive) + .map(ServiceHeartbeat::uuidFull) + .collect(Collectors.toSet()); + + for (var heartbeat : heartbeatService.getTaskHeartbeats()) { + if (!livingServices.contains(heartbeat.serviceUuuidFull())) { + heartbeatService.removeTaskHeartbeat(heartbeat); + } + } - processHeartbeats.stream() - .filter(ProcessHeartbeat::isRunning) - .filter(p -> !processService.isRunning(p.getProcessId())) - .forEach(heartbeatService::flagProcessAsStopped); TimeUnit.SECONDS.sleep(60); } diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java index 9b0b8b0a..f3f43e76 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java @@ -44,7 +44,17 @@ public record ProcessHeartbeat( case "loader" -> ProcessService.ProcessId.LOADER; case "website-adjacencies-calculator" -> ProcessService.ProcessId.ADJACENCIES_CALCULATOR; case "crawl-job-extractor" -> ProcessService.ProcessId.CRAWL_JOB_EXTRACTOR; - default -> throw new RuntimeException("Unknown process base: " + processBase); + default -> null; }; } + + public String displayName() { + var pid = getProcessId(); + if (pid != null) { + return pid.name(); + } + else { + return processBase; + } + } } diff --git 
a/code/services-core/control-service/src/main/java/nu/marginalia/control/model/TaskHeartbeat.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/TaskHeartbeat.java new file mode 100644 index 00000000..84d5bcd5 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/TaskHeartbeat.java @@ -0,0 +1,29 @@ +package nu.marginalia.control.model; + + +public record TaskHeartbeat( + String taskName, + String taskBase, + String serviceUuuidFull, + double lastSeenMillis, + Integer progress, + String stage, + String status +) { + public boolean isStopped() { + return "STOPPED".equals(status); + } + public boolean isRunning() { + return "RUNNING".equals(status); + } + + public String progressStyle() { + if ("RUNNING".equals(status) && progress != null) { + return """ + background: linear-gradient(90deg, #ccc 0%%, #ccc %d%%, #fff %d%%) + """.formatted(progress, progress, progress); + } + return ""; + } + +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/HeartbeatService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/HeartbeatService.java index 74a504b2..1379924a 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/HeartbeatService.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/HeartbeatService.java @@ -5,6 +5,7 @@ import com.google.inject.Singleton; import com.zaxxer.hikari.HikariDataSource; import nu.marginalia.control.model.ProcessHeartbeat; import nu.marginalia.control.model.ServiceHeartbeat; +import nu.marginalia.control.model.TaskHeartbeat; import nu.marginalia.service.control.ServiceEventLog; import java.sql.SQLException; @@ -51,6 +52,49 @@ public class HeartbeatService { return heartbeats; } + public List getTaskHeartbeats() { + List heartbeats = new ArrayList<>(); + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + 
SELECT TASK_NAME, TASK_BASE, SERVICE_INSTANCE, STATUS, STAGE_NAME, PROGRESS, TIMESTAMPDIFF(MICROSECOND, TASK_HEARTBEAT.HEARTBEAT_TIME, CURRENT_TIMESTAMP(6)) AS TSDIFF + FROM TASK_HEARTBEAT + INNER JOIN SERVICE_HEARTBEAT ON SERVICE_HEARTBEAT.`INSTANCE` = SERVICE_INSTANCE + """)) { + var rs = stmt.executeQuery(); + while (rs.next()) { + int progress = rs.getInt("PROGRESS"); + heartbeats.add(new TaskHeartbeat( + rs.getString("TASK_NAME"), + rs.getString("TASK_BASE"), + rs.getString("SERVICE_INSTANCE"), + rs.getLong("TSDIFF") / 1000., + progress < 0 ? null : progress, + rs.getString("STAGE_NAME"), + rs.getString("STATUS") + )); + } + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + return heartbeats; + } + + public void removeTaskHeartbeat(TaskHeartbeat heartbeat) { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + DELETE FROM TASK_HEARTBEAT + WHERE SERVICE_INSTANCE = ? + """)) { + + stmt.setString(1, heartbeat.serviceUuuidFull()); + stmt.executeUpdate(); + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + } + public List getProcessHeartbeats() { List heartbeats = new ArrayList<>(); @@ -99,5 +143,4 @@ public class HeartbeatService { throw new RuntimeException(ex); } } - } diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java index 67d284b7..cea64c9f 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java @@ -32,8 +32,7 @@ public class ProcessService { CONVERTER("converter-process/bin/converter-process"), LOADER("loader-process/bin/loader-process"), ADJACENCIES_CALCULATOR("website-adjacencies-calculator/bin/website-adjacencies-calculator"), - 
CRAWL_JOB_EXTRACTOR("crawl-job-extractor-process/bin/crawl-job-extractor-process"), - + CRAWL_JOB_EXTRACTOR("crawl-job-extractor-process/bin/crawl-job-extractor-process") ; public final String path; diff --git a/code/services-core/control-service/src/main/resources/templates/control/actors.hdb b/code/services-core/control-service/src/main/resources/templates/control/actors.hdb index 9bb0bfd0..c669ce46 100644 --- a/code/services-core/control-service/src/main/resources/templates/control/actors.hdb +++ b/code/services-core/control-service/src/main/resources/templates/control/actors.hdb @@ -16,7 +16,7 @@ \ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/partials/processes-table.hdb b/code/services-core/control-service/src/main/resources/templates/control/partials/processes-table.hdb index 4547e76b..d1a4eeea 100644 --- a/code/services-core/control-service/src/main/resources/templates/control/partials/processes-table.hdb +++ b/code/services-core/control-service/src/main/resources/templates/control/partials/processes-table.hdb @@ -1,5 +1,6 @@

    Processes

    +
    State
    TTL
    Msg ID
    Related ID
    Owner Instance
    Owner Tick
    Created
    Updated
    {{stateCode}} {{state}}
    @@ -10,7 +11,7 @@ {{#each processes}} - + {{/each}} +
    Process ID
    {{processId}}{{displayName}}    {{uuid}} @@ -20,4 +21,22 @@ {{#unless isStopped}}{{lastSeenMillis}}{{/unless}}
    + +

    Jobs

    + + + + + + + + {{#each jobs}} + + + + + + + {{/each}}
    Process IDStatusProgressLast Seen (ms)
    {{taskBase}}{{status}}{{#if progress}}{{progress}}%{{/if}} {{stage}}{{#unless isStopped}}{{lastSeenMillis}}{{/unless}}
    \ No newline at end of file diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexServicesFactory.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexServicesFactory.java index eafd3d57..031fcd2d 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexServicesFactory.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexServicesFactory.java @@ -15,9 +15,9 @@ import nu.marginalia.index.full.ReverseIndexFullConverter; import nu.marginalia.index.priority.ReverseIndexPriorityReader; import nu.marginalia.index.priority.ReverseIndexPriorityParameters; import nu.marginalia.index.full.ReverseIndexFullReader; -import nu.marginalia.lexicon.KeywordLexicon; import nu.marginalia.ranking.DomainRankings; import nu.marginalia.index.index.SearchIndexReader; +import nu.marginalia.service.control.ServiceHeartbeat; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -33,6 +33,7 @@ import java.util.stream.Stream; @Singleton public class IndexServicesFactory { private final Path tmpFileDir; + private final ServiceHeartbeat heartbeat; private final Path liveStorage; private final Path stagingStorage; @@ -55,8 +56,10 @@ public class IndexServicesFactory { @Inject public IndexServicesFactory( + ServiceHeartbeat heartbeat, FileStorageService fileStorageService ) throws IOException, SQLException { + this.heartbeat = heartbeat; liveStorage = fileStorageService.getStorageByType(FileStorageType.INDEX_LIVE).asPath(); stagingStorage = fileStorageService.getStorageByType(FileStorageType.INDEX_STAGING).asPath(); @@ -100,17 +103,34 @@ public class IndexServicesFactory { ).noneMatch(Files::exists); } + enum ConvertSteps { + FORWARD_INDEX, + FULL_REVERSE_INDEX, + PRIORITY_REVERSE_INDEX, + FINISHED + } public void convertIndex(DomainRankings domainRankings) throws IOException { - convertForwardIndex(domainRankings); - convertFullReverseIndex(domainRankings); - 
convertPriorityReverseIndex(domainRankings); + try (var hb = heartbeat.createServiceProcessHeartbeat(ConvertSteps.class, "index-conversion")) { + hb.progress(ConvertSteps.FORWARD_INDEX); + convertForwardIndex(domainRankings); + + hb.progress(ConvertSteps.FULL_REVERSE_INDEX); + convertFullReverseIndex(domainRankings); + + hb.progress(ConvertSteps.PRIORITY_REVERSE_INDEX); + convertPriorityReverseIndex(domainRankings); + + hb.progress(ConvertSteps.FINISHED); + } } private void convertFullReverseIndex(DomainRankings domainRankings) throws IOException { logger.info("Converting full reverse index {}", writerIndexFile); var journalReader = new IndexJournalReaderSingleCompressedFile(writerIndexFile); - var converter = new ReverseIndexFullConverter(tmpFileDir, + var converter = new ReverseIndexFullConverter( + heartbeat, + tmpFileDir, journalReader, domainRankings, revIndexWords.get(NEXT_PART).toPath(), @@ -128,7 +148,8 @@ public class IndexServicesFactory { var journalReader = new IndexJournalReaderSingleCompressedFile(writerIndexFile, null, ReverseIndexPriorityParameters::filterPriorityRecord); - var converter = new ReverseIndexPriorityConverter(tmpFileDir, + var converter = new ReverseIndexPriorityConverter(heartbeat, + tmpFileDir, journalReader, domainRankings, revPrioIndexWords.get(NEXT_PART).toPath(), @@ -144,7 +165,8 @@ public class IndexServicesFactory { logger.info("Converting forward index data {}", writerIndexFile); - new ForwardIndexConverter(writerIndexFile.toFile(), + new ForwardIndexConverter(heartbeat, + writerIndexFile.toFile(), fwdIndexDocId.get(NEXT_PART).toPath(), fwdIndexDocData.get(NEXT_PART).toPath(), domainRankings) diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java index ba671969..79602c22 100644 --- 
a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java @@ -21,6 +21,7 @@ import nu.marginalia.index.config.RankingSettings; import nu.marginalia.ranking.DomainRankings; import nu.marginalia.index.client.model.query.SearchSetIdentifier; import nu.marginalia.index.db.DbUpdateRanks; +import nu.marginalia.service.control.ServiceHeartbeat; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -30,6 +31,7 @@ import java.io.IOException; public class IndexSearchSetsService { private final Logger logger = LoggerFactory.getLogger(getClass()); private final DomainTypes domainTypes; + private final ServiceHeartbeat heartbeat; private final DbUpdateRanks dbUpdateRanks; private final RankingDomainFetcher similarityDomains; private final RankingSettings rankingSettings; @@ -47,12 +49,14 @@ public class IndexSearchSetsService { @Inject public IndexSearchSetsService(DomainTypes domainTypes, + ServiceHeartbeat heartbeat, RankingDomainFetcher rankingDomains, RankingDomainFetcherForSimilarityData similarityDomains, RankingSettings rankingSettings, IndexServicesFactory servicesFactory, DbUpdateRanks dbUpdateRanks) throws IOException { this.domainTypes = domainTypes; + this.heartbeat = heartbeat; this.dbUpdateRanks = dbUpdateRanks; @@ -90,12 +94,34 @@ public class IndexSearchSetsService { }; } + enum RepartitionSteps { + UPDATE_ACADEMIA, + UPDATE_RETRO, + UPDATE_SMALL_WEB, + UPDATE_BLOGS, + UPDATE_RANKINGS, + FINISHED + } public void recalculateAll() { - updateAcademiaDomainsSet(); - updateRetroDomainsSet(); - updateSmallWebDomainsSet(); - updateBlogsSet(); - updateDomainRankings(); + try (var processHeartbeat = heartbeat.createServiceProcessHeartbeat(RepartitionSteps.class, "repartitionAll")) { + + processHeartbeat.progress(RepartitionSteps.UPDATE_ACADEMIA); + updateAcademiaDomainsSet(); + + 
processHeartbeat.progress(RepartitionSteps.UPDATE_RETRO); + updateRetroDomainsSet(); + + processHeartbeat.progress(RepartitionSteps.UPDATE_SMALL_WEB); + updateSmallWebDomainsSet(); + + processHeartbeat.progress(RepartitionSteps.UPDATE_BLOGS); + updateBlogsSet(); + + processHeartbeat.progress(RepartitionSteps.UPDATE_RANKINGS); + updateDomainRankings(); + + processHeartbeat.progress(RepartitionSteps.FINISHED); + } } private void updateDomainRankings() { diff --git a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTestModule.java b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTestModule.java index 997e2a74..1c4c6986 100644 --- a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTestModule.java +++ b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTestModule.java @@ -63,6 +63,7 @@ public class IndexQueryServiceIntegrationTestModule extends AbstractModule { when(fileStorageServiceMock.getStorageByType(FileStorageType.INDEX_STAGING)).thenReturn(new FileStorage(null, null, null, slowDir.toString(), null)); var servicesFactory = new IndexServicesFactory( + Mockito.mock(ServiceHeartbeat.class), fileStorageServiceMock ); bind(IndexServicesFactory.class).toInstance(servicesFactory); From 912129311de84328ecff2929bb16c3d3afaed4ad Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 4 Aug 2023 17:54:18 +0200 Subject: [PATCH 141/157] (control) Message Queue GUI --- .../nu/marginalia/control/ControlService.java | 101 +++++++++++-- .../control/svc/MessageQueueViewService.java | 139 ++++++++++++------ .../control/dialog-update-message-state.hdb | 2 +- .../templates/control/message-queue.hdb | 20 +++ .../templates/control/new-message.hdb | 31 ++++ .../control/partials/message-queue-table.hdb | 29 +++- .../templates/control/partials/nav.hdb | 1 + 
.../control/partials/processes-table.hdb | 5 +- .../templates/control/service-by-id.hdb | 1 + 9 files changed, 267 insertions(+), 62 deletions(-) create mode 100644 code/services-core/control-service/src/main/resources/templates/control/message-queue.hdb create mode 100644 code/services-core/control-service/src/main/resources/templates/control/new-message.hdb diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java index 78ec0e5f..eca19900 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java @@ -5,7 +5,7 @@ import com.google.inject.Inject; import nu.marginalia.client.ServiceMonitors; import nu.marginalia.control.model.Actor; import nu.marginalia.control.model.DomainComplaintModel; -import nu.marginalia.control.model.ProcessHeartbeat; +import nu.marginalia.control.model.MessageQueueEntry; import nu.marginalia.control.svc.*; import nu.marginalia.db.storage.model.FileStorageId; import nu.marginalia.db.storage.model.FileStorageType; @@ -27,6 +27,7 @@ import java.sql.SQLException; import java.util.Comparator; import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.stream.Collectors; public class ControlService extends Service { @@ -80,8 +81,11 @@ public class ControlService extends Service { var apiKeysRenderer = rendererFactory.renderer("control/api-keys"); var domainComplaintsRenderer = rendererFactory.renderer("control/domain-complaints"); + var messageQueueRenderer = rendererFactory.renderer("control/message-queue"); + var storageDetailsRenderer = rendererFactory.renderer("control/storage-details"); var updateMessageStateRenderer = rendererFactory.renderer("control/dialog-update-message-state"); + var newMessageRenderer = 
rendererFactory.renderer("control/new-message"); this.controlActorService = controlActorService; @@ -98,7 +102,7 @@ public class ControlService extends Service { Spark.get("/public/services", this::servicesModel, servicesRenderer::render); Spark.get("/public/services/:id", this::serviceModel, serviceByIdRenderer::render); - Spark.get("/public/messages/:id", this::messageModel, gson::toJson); + Spark.get("/public/messages/:id", this::existingMessageModel, gson::toJson); Spark.get("/public/actors", this::processesModel, actorsRenderer::render); Spark.get("/public/actors/:fsm", this::actorDetailsModel, actorDetailsRenderer::render); Spark.get("/public/storage", this::storageModel, storageRenderer::render); @@ -114,9 +118,38 @@ public class ControlService extends Service { final HtmlRedirect redirectToApiKeys = new HtmlRedirect("/api-keys"); final HtmlRedirect redirectToStorage = new HtmlRedirect("/storage"); final HtmlRedirect redirectToComplaints = new HtmlRedirect("/complaints"); + final HtmlRedirect redirectToMessageQueue = new HtmlRedirect("/message-queue"); Spark.post("/public/fsms/:fsm/start", controlActorService::startFsm, redirectToProcesses); Spark.post("/public/fsms/:fsm/stop", controlActorService::stopFsm, redirectToProcesses); + + Spark.get("/public/message-queue", this::messageQueueModel, messageQueueRenderer::render); + Spark.post("/public/message-queue/", (rq, rsp) -> { + String recipient = rq.queryParams("recipientInbox"); + String sender = rq.queryParams("senderInbox"); + String relatedMessage = rq.queryParams("relatedId"); + String function = rq.queryParams("function"); + String payload = rq.queryParams("payload"); + + persistence.sendNewMessage(recipient, + sender, + relatedMessage == null ? 
null : Long.parseLong(relatedMessage), + function, + payload, + null); + + return ""; + }, redirectToMessageQueue); + Spark.get("/public/message-queue/new", this::newMessageModel, newMessageRenderer::render); + Spark.get("/public/message-queue/:id/reply", this::replyMessageModel, newMessageRenderer::render); + Spark.get("/public/message-queue/:id/edit", (rq, rsp) -> persistence.getMessage(Long.parseLong(rq.params("id"))), updateMessageStateRenderer::render); + Spark.post("/public/message-queue/:id/edit", (rq, rsp) -> { + MqMessageState state = MqMessageState.valueOf(rq.queryParams("state")); + long id = Long.parseLong(rq.params("id")); + persistence.updateMessageState(id, state); + return ""; + }, redirectToMessageQueue); + Spark.post("/public/storage/:fid/crawl", controlActorService::triggerCrawling, redirectToProcesses); Spark.post("/public/storage/:fid/recrawl", controlActorService::triggerRecrawling, redirectToProcesses); Spark.post("/public/storage/:fid/process", controlActorService::triggerProcessing, redirectToProcesses); @@ -134,19 +167,42 @@ public class ControlService extends Service { Spark.get("/public/complaints", this::complaintsModel, domainComplaintsRenderer::render); Spark.post("/public/complaints/:domain", this::reviewComplaint, redirectToComplaints); - Spark.get("/public/message/:id/state", (rq, rsp) -> persistence.getMessage(Long.parseLong(rq.params("id"))), updateMessageStateRenderer::render); - Spark.post("/public/message/:id/state", (rq, rsp) -> { - MqMessageState state = MqMessageState.valueOf(rq.queryParams("state")); - long id = Long.parseLong(rq.params("id")); - persistence.updateMessageState(id, state); - return ""; - }, redirectToProcesses); - Spark.get("/public/:resource", this::serveStatic); monitors.subscribe(this::logMonitorStateChange); } + private Object messageQueueModel(Request request, Response response) { + String inboxParam = request.queryParams("inbox"); + String instanceParam = request.queryParams("instance"); + String 
afterParam = request.queryParams("after"); + + long afterId = Optional.ofNullable(afterParam).map(Long::parseLong).orElse(Long.MAX_VALUE); + + List entries; + + if (inboxParam != null) { + entries = messageQueueViewService.getEntriesForInbox(inboxParam, afterId, 20); + } + else if (instanceParam != null) { + entries = messageQueueViewService.getEntriesForInstance(instanceParam, afterId, 20); + } + else { + entries = messageQueueViewService.getEntries(afterId, 20); + } + + Object next; + + if (entries.size() == 20) + next = entries.stream().mapToLong(MessageQueueEntry::id).min().getAsLong(); + else + next = ""; + + Object prev = afterParam == null ? "" : afterParam; + + return Map.of("messages", entries, "next", next, "prev", prev); + } + private Object complaintsModel(Request request, Response response) { Map> complaintsByReviewed = domainComplaintService.getComplaints().stream().collect(Collectors.partitioningBy(DomainComplaintModel::reviewed)); @@ -224,7 +280,7 @@ public class ControlService extends Service { } - private Object messageModel(Request request, Response response) { + private Object existingMessageModel(Request request, Response response) { var message = messageQueueViewService.getMessage(Long.parseLong(request.params("id"))); if (message != null) { response.type("application/json"); @@ -236,11 +292,34 @@ public class ControlService extends Service { } } + private Object newMessageModel(Request request, Response response) { + String idParam = request.queryParams("id"); + if (null == idParam) + return Map.of("relatedId", "-1"); + + var message = messageQueueViewService.getMessage(Long.parseLong(idParam)); + if (message != null) + return message; + + return Map.of("relatedId", "-1"); + } + private Object replyMessageModel(Request request, Response response) { + String idParam = request.params("id"); + + var message = messageQueueViewService.getMessage(Long.parseLong(idParam)); + + return Map.of("relatedId", message.id(), + "recipientInbox", 
message.senderInbox(), + "function", "REPLY"); + } + + private Object serviceModel(Request request, Response response) { String serviceName = request.params("id"); return Map.of( "id", serviceName, + "messages", messageQueueViewService.getEntriesForInbox(serviceName, Long.MAX_VALUE, 20), "events", eventLogService.getLastEntriesForService(serviceName, 20)); } diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueViewService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueViewService.java index f52ba3a1..8f3a45a7 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueViewService.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueViewService.java @@ -7,6 +7,7 @@ import nu.marginalia.control.model.Actor; import nu.marginalia.control.model.MessageQueueEntry; import nu.marginalia.mqsm.graph.AbstractStateGraph; +import java.sql.ResultSet; import java.sql.SQLException; import java.util.ArrayList; import java.util.List; @@ -34,20 +35,7 @@ public class MessageQueueViewService { List entries = new ArrayList<>(n); var rs = query.executeQuery(); while (rs.next()) { - entries.add(new MessageQueueEntry( - rs.getLong("ID"), - rs.getLong("RELATED_ID"), - rs.getString("SENDER_INBOX"), - rs.getString("RECIPIENT_INBOX"), - rs.getString("FUNCTION"), - rs.getString("PAYLOAD"), - rs.getString("OWNER_INSTANCE"), - rs.getLong("OWNER_TICK"), - rs.getString("STATE"), - rs.getTimestamp("CREATED_TIME").toLocalDateTime().toLocalTime().toString(), - rs.getTimestamp("UPDATED_TIME").toLocalDateTime().toLocalTime().toString(), - rs.getInt("TTL") - )); + entries.add(newEntry(rs)); } return entries; } @@ -68,20 +56,7 @@ public class MessageQueueViewService { var rs = query.executeQuery(); if (rs.next()) { - return new MessageQueueEntry( - rs.getLong("ID"), - rs.getLong("RELATED_ID"), - rs.getString("SENDER_INBOX"), - 
rs.getString("RECIPIENT_INBOX"), - rs.getString("FUNCTION"), - rs.getString("PAYLOAD"), - rs.getString("OWNER_INSTANCE"), - rs.getLong("OWNER_TICK"), - rs.getString("STATE"), - rs.getTimestamp("CREATED_TIME").toLocalDateTime().toLocalTime().toString(), - rs.getTimestamp("UPDATED_TIME").toLocalDateTime().toLocalTime().toString(), - rs.getInt("TTL") - ); + return newEntry(rs); } } catch (SQLException ex) { @@ -105,20 +80,7 @@ public class MessageQueueViewService { List entries = new ArrayList<>(n); var rs = query.executeQuery(); while (rs.next()) { - entries.add(new MessageQueueEntry( - rs.getLong("ID"), - rs.getLong("RELATED_ID"), - rs.getString("SENDER_INBOX"), - rs.getString("RECIPIENT_INBOX"), - rs.getString("FUNCTION"), - rs.getString("PAYLOAD"), - rs.getString("OWNER_INSTANCE"), - rs.getLong("OWNER_TICK"), - rs.getString("STATE"), - rs.getTimestamp("CREATED_TIME").toLocalDateTime().toLocalTime().toString(), - rs.getTimestamp("UPDATED_TIME").toLocalDateTime().toLocalTime().toString(), - rs.getInt("TTL") - )); + entries.add(newEntry(rs)); } return entries; } @@ -126,4 +88,97 @@ public class MessageQueueViewService { throw new RuntimeException(ex); } } + + public List getEntriesForInbox(String inbox, long afterId, int n) { + try (var conn = dataSource.getConnection(); + var query = conn.prepareStatement(""" + SELECT ID, RELATED_ID, SENDER_INBOX, RECIPIENT_INBOX, FUNCTION, PAYLOAD, OWNER_INSTANCE, OWNER_TICK, STATE, CREATED_TIME, UPDATED_TIME, TTL + FROM MESSAGE_QUEUE + WHERE ID < ? AND (RECIPIENT_INBOX = ? OR SENDER_INBOX = ?) + ORDER BY ID DESC + LIMIT ? 
+ """)) { + + query.setLong(1, afterId); + query.setString(2, inbox); + query.setString(3, inbox); + query.setInt(4, n); + + List entries = new ArrayList<>(n); + var rs = query.executeQuery(); + while (rs.next()) { + entries.add(newEntry(rs)); + } + return entries; + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + } + public List getEntriesForInstance(String instance, long afterId, int n) { + try (var conn = dataSource.getConnection(); + var query = conn.prepareStatement(""" + SELECT ID, RELATED_ID, SENDER_INBOX, RECIPIENT_INBOX, FUNCTION, PAYLOAD, OWNER_INSTANCE, OWNER_TICK, STATE, CREATED_TIME, UPDATED_TIME, TTL + FROM MESSAGE_QUEUE + WHERE ID < ? AND OWNER_INSTANCE = ? + ORDER BY ID DESC + LIMIT ? + """)) { + + query.setLong(1, afterId); + query.setString(2, instance); + query.setInt(3, n); + + List entries = new ArrayList<>(n); + var rs = query.executeQuery(); + while (rs.next()) { + entries.add(newEntry(rs)); + } + return entries; + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + } + + public List getEntries(long afterId, int n) { + try (var conn = dataSource.getConnection(); + var query = conn.prepareStatement(""" + SELECT ID, RELATED_ID, SENDER_INBOX, RECIPIENT_INBOX, FUNCTION, PAYLOAD, OWNER_INSTANCE, OWNER_TICK, STATE, CREATED_TIME, UPDATED_TIME, TTL + FROM MESSAGE_QUEUE + WHERE ID < ? + ORDER BY ID DESC + LIMIT ? 
+ """)) { + + query.setLong(1, afterId); + query.setInt(2, n); + + List entries = new ArrayList<>(n); + var rs = query.executeQuery(); + while (rs.next()) { + entries.add(newEntry(rs)); + } + return entries; + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + } + + private MessageQueueEntry newEntry(ResultSet rs) throws SQLException { + return new MessageQueueEntry( + rs.getLong("ID"), + rs.getLong("RELATED_ID"), + rs.getString("SENDER_INBOX"), + rs.getString("RECIPIENT_INBOX"), + rs.getString("FUNCTION"), + rs.getString("PAYLOAD"), + rs.getString("OWNER_INSTANCE"), + rs.getLong("OWNER_TICK"), + rs.getString("STATE"), + rs.getTimestamp("CREATED_TIME").toLocalDateTime().toLocalTime().toString(), + rs.getTimestamp("UPDATED_TIME").toLocalDateTime().toLocalTime().toString(), + rs.getInt("TTL")); + } } diff --git a/code/services-core/control-service/src/main/resources/templates/control/dialog-update-message-state.hdb b/code/services-core/control-service/src/main/resources/templates/control/dialog-update-message-state.hdb index 13d25615..7acae272 100644 --- a/code/services-core/control-service/src/main/resources/templates/control/dialog-update-message-state.hdb +++ b/code/services-core/control-service/src/main/resources/templates/control/dialog-update-message-state.hdb @@ -9,7 +9,7 @@

    Update the state of a message in the message queue. This may be useful to prevent an actor from resuming an action when this is not desirable. Setting an old message to 'NEW' will erase information about its owner, and inboxes will consider the message new again.

    -
    +

    diff --git a/code/services-core/control-service/src/main/resources/templates/control/message-queue.hdb b/code/services-core/control-service/src/main/resources/templates/control/message-queue.hdb new file mode 100644 index 00000000..cc5b5da9 --- /dev/null +++ b/code/services-core/control-service/src/main/resources/templates/control/message-queue.hdb @@ -0,0 +1,20 @@ + + + + Control Service + + + + + {{> control/partials/nav}} +
    + {{> control/partials/message-queue-table }} +
    + + + + \ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/new-message.hdb b/code/services-core/control-service/src/main/resources/templates/control/new-message.hdb new file mode 100644 index 00000000..211d690c --- /dev/null +++ b/code/services-core/control-service/src/main/resources/templates/control/new-message.hdb @@ -0,0 +1,31 @@ + + + +Update ID + +{{> control/partials/nav}} +
    +

    Create Message

    + +
    + +
    +
    + +
    +
    + +
    +
    + +
    +
    + +
    +
    + + + +
    + + \ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/partials/message-queue-table.hdb b/code/services-core/control-service/src/main/resources/templates/control/partials/message-queue-table.hdb index b971c928..6aac4572 100644 --- a/code/services-core/control-service/src/main/resources/templates/control/partials/message-queue-table.hdb +++ b/code/services-core/control-service/src/main/resources/templates/control/partials/message-queue-table.hdb @@ -1,7 +1,9 @@

    Message Queue

    + + @@ -9,25 +11,42 @@ - + + + + {{#each messages}} - + + - + + - + {{/each}} + + + + +
    Action State
    TTL
    Msg ID
    Related ID
    Recipient
    Sender
    Owner Instance
    Owner Tick
    Created
    Updated
    + [Add Message] +
    {{stateCode}} {{state}}[Edit]{{stateCode}} {{state}} {{id}}{{recipientInbox}}{{recipientInbox}} {{function}} -    {{ownerInstance}} +    {{ownerInstance}} {{createdTime}}
    + {{#if senderInbox}}[Reply]{{/if}} + {{ttl}} {{relatedId}}{{senderInbox}}{{senderInbox}} {{payload}} {{ownerTick}} {{updatedTime}}
    + {{#if prev}}Prev{{/if}} + {{#if next}}Next{{/if}} +
    diff --git a/code/services-core/control-service/src/main/resources/templates/control/partials/nav.hdb b/code/services-core/control-service/src/main/resources/templates/control/partials/nav.hdb index 184f28b9..94f3b13a 100644 --- a/code/services-core/control-service/src/main/resources/templates/control/partials/nav.hdb +++ b/code/services-core/control-service/src/main/resources/templates/control/partials/nav.hdb @@ -3,6 +3,7 @@
  • Overview
  • Services
  • Actors
  • +
  • Message Queue
  • Storage
  • API Keys
  • Blacklist
  • diff --git a/code/services-core/control-service/src/main/resources/templates/control/partials/processes-table.hdb b/code/services-core/control-service/src/main/resources/templates/control/partials/processes-table.hdb index d1a4eeea..50ab8d58 100644 --- a/code/services-core/control-service/src/main/resources/templates/control/partials/processes-table.hdb +++ b/code/services-core/control-service/src/main/resources/templates/control/partials/processes-table.hdb @@ -11,10 +11,9 @@ {{#each processes}} - {{displayName}} + {{displayName}} -    - {{uuid}} +   {{uuid}} {{status}} {{#if progress}}{{progress}}%{{/if}} diff --git a/code/services-core/control-service/src/main/resources/templates/control/service-by-id.hdb b/code/services-core/control-service/src/main/resources/templates/control/service-by-id.hdb index 5b1fe6b4..f350ac5a 100644 --- a/code/services-core/control-service/src/main/resources/templates/control/service-by-id.hdb +++ b/code/services-core/control-service/src/main/resources/templates/control/service-by-id.hdb @@ -10,6 +10,7 @@

    Services/{{id}}

    {{> control/partials/events-table }} + {{> control/partials/message-queue-table }}
    From 00eb8b90dcc1f6f96d222abefd338316dd5548bf Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 4 Aug 2023 22:05:29 +0200 Subject: [PATCH 142/157] (control) Message Queue GUI --- .../nu/marginalia/control/ControlService.java | 18 +++++- .../control/model/MessageQueueEntry.java | 3 + .../control/svc/MessageQueueViewService.java | 31 ++++++++++ .../control/dialog-update-message-state.hdb | 44 -------------- .../templates/control/new-message.hdb | 51 ++++++++++------ .../control/partials/message-queue-table.hdb | 23 +++---- .../control/update-message-state.hdb | 60 +++++++++++++++++++ .../templates/control/view-message.hdb | 57 ++++++++++++++++++ 8 files changed, 210 insertions(+), 77 deletions(-) delete mode 100644 code/services-core/control-service/src/main/resources/templates/control/dialog-update-message-state.hdb create mode 100644 code/services-core/control-service/src/main/resources/templates/control/update-message-state.hdb create mode 100644 code/services-core/control-service/src/main/resources/templates/control/view-message.hdb diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java index eca19900..2fc8f121 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java @@ -84,8 +84,9 @@ public class ControlService extends Service { var messageQueueRenderer = rendererFactory.renderer("control/message-queue"); var storageDetailsRenderer = rendererFactory.renderer("control/storage-details"); - var updateMessageStateRenderer = rendererFactory.renderer("control/dialog-update-message-state"); + var updateMessageStateRenderer = rendererFactory.renderer("control/update-message-state"); var newMessageRenderer = rendererFactory.renderer("control/new-message"); + var 
viewMessageRenderer = rendererFactory.renderer("control/view-message"); this.controlActorService = controlActorService; @@ -132,7 +133,7 @@ public class ControlService extends Service { String payload = rq.queryParams("payload"); persistence.sendNewMessage(recipient, - sender, + sender.isBlank() ? null : sender, relatedMessage == null ? null : Long.parseLong(relatedMessage), function, payload, @@ -141,6 +142,11 @@ public class ControlService extends Service { return ""; }, redirectToMessageQueue); Spark.get("/public/message-queue/new", this::newMessageModel, newMessageRenderer::render); + Spark.get("/public/message-queue/:id", + (rq, rsp) -> Map.of("message", messageQueueViewService.getMessage(Long.parseLong(rq.params("id"))), + "relatedMessages", messageQueueViewService.getRelatedMessages(Long.parseLong(rq.params("id")))) + , viewMessageRenderer::render); + Spark.get("/public/message-queue/:id/reply", this::replyMessageModel, newMessageRenderer::render); Spark.get("/public/message-queue/:id/edit", (rq, rsp) -> persistence.getMessage(Long.parseLong(rq.params("id"))), updateMessageStateRenderer::render); Spark.post("/public/message-queue/:id/edit", (rq, rsp) -> { @@ -181,10 +187,13 @@ public class ControlService extends Service { List entries; + String mqFilter = "filter=none"; if (inboxParam != null) { + mqFilter = "inbox=" + inboxParam; entries = messageQueueViewService.getEntriesForInbox(inboxParam, afterId, 20); } else if (instanceParam != null) { + mqFilter = "instance=" + instanceParam; entries = messageQueueViewService.getEntriesForInstance(instanceParam, afterId, 20); } else { @@ -200,7 +209,10 @@ public class ControlService extends Service { Object prev = afterParam == null ? 
"" : afterParam; - return Map.of("messages", entries, "next", next, "prev", prev); + return Map.of("messages", entries, + "next", next, + "prev", prev, + "mqFilter", mqFilter); } private Object complaintsModel(Request request, Response response) { diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/model/MessageQueueEntry.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/MessageQueueEntry.java index 43c5bf07..c90bda76 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/model/MessageQueueEntry.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/MessageQueueEntry.java @@ -15,6 +15,9 @@ public record MessageQueueEntry ( int ttl ) { + public boolean hasRelatedMessage() { + return relatedId > 0; + } public String ownerInstance() { if (ownerInstanceFull == null) { return ""; diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueViewService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueViewService.java index 8f3a45a7..02031c2a 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueViewService.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueViewService.java @@ -166,6 +166,37 @@ public class MessageQueueViewService { } } + public List getRelatedMessages(long relatedId) { + try (var conn = dataSource.getConnection(); + var query = conn.prepareStatement(""" + (SELECT ID, RELATED_ID, SENDER_INBOX, RECIPIENT_INBOX, FUNCTION, PAYLOAD, OWNER_INSTANCE, OWNER_TICK, STATE, CREATED_TIME, UPDATED_TIME, TTL + FROM MESSAGE_QUEUE + WHERE RELATED_ID = ? 
+ ORDER BY ID DESC + LIMIT 100) + UNION + (SELECT ID, RELATED_ID, SENDER_INBOX, RECIPIENT_INBOX, FUNCTION, PAYLOAD, OWNER_INSTANCE, OWNER_TICK, STATE, CREATED_TIME, UPDATED_TIME, TTL + FROM MESSAGE_QUEUE + WHERE ID = (SELECT RELATED_ID FROM MESSAGE_QUEUE WHERE ID=?) + ORDER BY ID DESC + LIMIT 100) + """)) { + + query.setLong(1, relatedId); + query.setLong(2, relatedId); + + List entries = new ArrayList<>(100); + var rs = query.executeQuery(); + while (rs.next()) { + entries.add(newEntry(rs)); + } + return entries; + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + } + private MessageQueueEntry newEntry(ResultSet rs) throws SQLException { return new MessageQueueEntry( rs.getLong("ID"), diff --git a/code/services-core/control-service/src/main/resources/templates/control/dialog-update-message-state.hdb b/code/services-core/control-service/src/main/resources/templates/control/dialog-update-message-state.hdb deleted file mode 100644 index 7acae272..00000000 --- a/code/services-core/control-service/src/main/resources/templates/control/dialog-update-message-state.hdb +++ /dev/null @@ -1,44 +0,0 @@ - - - -Update ID - -{{> control/partials/nav}} -
    -

    Update Message State

    -

    Update the state of a message in the message queue. This may be useful to prevent an actor -from resuming an action when this is not desirable. Setting an old message to 'NEW' will -erase information about its owner, and inboxes will consider the message new again.

    -
    -
    - -
    -
    - -
    -
    - -
    -
    - -
    -
    - -
    -
    -
    - - - -
    -

    Note that while setting a message to NEW or in some instances ACK typically causes an Actor - to act on the message, setting a message in ACK to ERR or DEAD will not stop action, but only - prevent resumption of action. To stop a running actor, use the Actors view and press the toggle.

    -
    - \ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/new-message.hdb b/code/services-core/control-service/src/main/resources/templates/control/new-message.hdb index 211d690c..91242ba4 100644 --- a/code/services-core/control-service/src/main/resources/templates/control/new-message.hdb +++ b/code/services-core/control-service/src/main/resources/templates/control/new-message.hdb @@ -1,30 +1,43 @@ -Update ID +Message Queue | New Message {{> control/partials/nav}}

    Create Message

    -
    - -
    -
    - -
    -
    - -
    -
    - -
    -
    - -
    -
    - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    FieldValue
    + +
    diff --git a/code/services-core/control-service/src/main/resources/templates/control/partials/message-queue-table.hdb b/code/services-core/control-service/src/main/resources/templates/control/partials/message-queue-table.hdb index 6aac4572..d71d0941 100644 --- a/code/services-core/control-service/src/main/resources/templates/control/partials/message-queue-table.hdb +++ b/code/services-core/control-service/src/main/resources/templates/control/partials/message-queue-table.hdb @@ -3,7 +3,6 @@ - @@ -12,16 +11,15 @@ - {{#each messages}} - - + - - + @@ -43,9 +44,9 @@ {{/each}} - diff --git a/code/services-core/control-service/src/main/resources/templates/control/update-message-state.hdb b/code/services-core/control-service/src/main/resources/templates/control/update-message-state.hdb new file mode 100644 index 00000000..7d2a16ee --- /dev/null +++ b/code/services-core/control-service/src/main/resources/templates/control/update-message-state.hdb @@ -0,0 +1,60 @@ + + + +Update ID + +{{> control/partials/nav}} +
    +

    Update Message State

    +

    Update the state of a message in the message queue. This may be useful to prevent an actor +from resuming an action when this is not desirable. Setting an old message to 'NEW' will +erase information about its owner, and inboxes will consider the message new again.

    +
    +
    Action State
    TTL
    Msg ID
    Related ID
    Recipient
    Sender
    Created
    Updated
    + [Add Message]
    [Edit] {{stateCode}} {{state}}{{id}}{{id}} {{recipientInbox}} {{function}} @@ -30,11 +28,14 @@ {{createdTime}}
    - {{#if senderInbox}}[Reply]{{/if}} - {{ttl}}{{relatedId}} + {{#if hasRelatedMessage}} + {{relatedId}} + {{else}} + {{relatedId}} + {{/if}} + {{senderInbox}} {{payload}} {{ownerTick}}
    - {{#if prev}}Prev{{/if}} - {{#if next}}Next{{/if}} + + {{#if prev}}Prev{{/if}} + {{#if next}}Next{{/if}}
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    FieldValue
    + +
    + +

    Note that while setting a message to NEW or in some instances ACK typically causes an Actor + to act on the message, setting a message in ACK to ERR or DEAD will not stop action, but only + prevent resumption of action. To stop a running actor, use the Actors view and press the toggle.

    + + + \ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/view-message.hdb b/code/services-core/control-service/src/main/resources/templates/control/view-message.hdb new file mode 100644 index 00000000..fb52f440 --- /dev/null +++ b/code/services-core/control-service/src/main/resources/templates/control/view-message.hdb @@ -0,0 +1,57 @@ + + + +Message Queue | New Message + +{{> control/partials/nav}} +
    +

    View Message {{id}}

    + {{#with message}} + + + + + + + + + + + + +
    FieldValueAction
    id{{id}}[Copy Message]
    recipientInbox{{recipientInbox}}
    state{{state}}[Edit State]
    senderInbox{{senderInbox}}{{#if senderInbox}}[Reply]{{/if}}
    relatedId + {{#if hasRelatedMessage}} + {{relatedId}} + {{else}} + {{relatedId}} + {{/if}} +
    function{{function}}
    payload + +
    Created{{createdTime}}
    Updated{{updatedTime}}
    + {{/with}} + + {{#if relatedMessages}} +

    Related Messages

    + + + + + + + + + {{#each relatedMessages}} + + + + + + + + {{/each}} +
    IDRecipient InboxSender InboxFunctionState
    {{id}}{{recipientInbox}}{{senderInbox}}{{function}}{{state}}
    + {{/if}} +
    + + + \ No newline at end of file From 08eed17e665fea06fb6961f4478d17dac9abb7c3 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 5 Aug 2023 14:42:16 +0200 Subject: [PATCH 143/157] (api-service) Mq endpoint for flushing caches --- .../src/main/java/nu/marginalia/api/ApiService.java | 9 +++++++++ .../main/java/nu/marginalia/api/svc/LicenseService.java | 5 ++++- .../main/java/nu/marginalia/api/svc/ResponseCache.java | 4 ++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/code/services-satellite/api-service/src/main/java/nu/marginalia/api/ApiService.java b/code/services-satellite/api-service/src/main/java/nu/marginalia/api/ApiService.java index 4da8c0f6..aed2006d 100644 --- a/code/services-satellite/api-service/src/main/java/nu/marginalia/api/ApiService.java +++ b/code/services-satellite/api-service/src/main/java/nu/marginalia/api/ApiService.java @@ -11,6 +11,7 @@ import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.search.client.SearchClient; import nu.marginalia.search.client.model.ApiSearchResults; import nu.marginalia.service.server.*; +import nu.marginalia.service.server.mq.MqNotification; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.slf4j.Marker; @@ -58,6 +59,14 @@ public class ApiService extends Service { Spark.get("/public/api/:key/search/*", this::search, gson::toJson); } + @MqNotification(endpoint = "FLUSH_CACHES") + public void flushCaches(String unusedArg) { + logger.info("Flushing caches"); + + responseCache.flush(); + licenseService.flushCache(); + } + private Object search(Request request, Response response) { String[] args = request.splat(); diff --git a/code/services-satellite/api-service/src/main/java/nu/marginalia/api/svc/LicenseService.java b/code/services-satellite/api-service/src/main/java/nu/marginalia/api/svc/LicenseService.java index f7847a46..7025ae33 100644 --- a/code/services-satellite/api-service/src/main/java/nu/marginalia/api/svc/LicenseService.java +++ 
b/code/services-satellite/api-service/src/main/java/nu/marginalia/api/svc/LicenseService.java @@ -8,7 +8,6 @@ import nu.marginalia.api.model.ApiLicense; import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import spark.Request; import spark.Spark; import java.util.concurrent.ConcurrentHashMap; @@ -57,4 +56,8 @@ public class LicenseService { throw new IllegalStateException("This is unreachable"); } + + public void flushCache() { + licenseCache.clear(); + } } diff --git a/code/services-satellite/api-service/src/main/java/nu/marginalia/api/svc/ResponseCache.java b/code/services-satellite/api-service/src/main/java/nu/marginalia/api/svc/ResponseCache.java index 59c62fe5..032ad9b4 100644 --- a/code/services-satellite/api-service/src/main/java/nu/marginalia/api/svc/ResponseCache.java +++ b/code/services-satellite/api-service/src/main/java/nu/marginalia/api/svc/ResponseCache.java @@ -40,6 +40,10 @@ public class ResponseCache { return license.getKey() + ":" + queryString + ":" + queryParams; } + public void flush() { + cache.invalidateAll(); + } + public void cleanUp() { cache.cleanUp(); } From cdfe284f9a19ce095150b965b90ee169dd7c154a Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 5 Aug 2023 14:42:33 +0200 Subject: [PATCH 144/157] (file storage) File Storage Type for EXPORT data (file storage) File Storage Type for EXPORT data --- .../nu/marginalia/db/storage/model/FileStorageType.java | 1 + .../resources/db/migration/V23_07_0_004__file_storage.sql | 2 +- .../migration/V23_07_0_005__file_storage_default_values.sql | 6 +++++- run/setup.sh | 2 +- 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageType.java b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageType.java index 97eef136..9f512d06 100644 --- a/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageType.java +++ 
b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageType.java @@ -9,5 +9,6 @@ public enum FileStorageType { INDEX_LIVE, LEXICON_LIVE, BACKUP, + EXPORT, SEARCH_SETS } diff --git a/code/common/db/src/main/resources/db/migration/V23_07_0_004__file_storage.sql b/code/common/db/src/main/resources/db/migration/V23_07_0_004__file_storage.sql index d6de88a5..641d0e03 100644 --- a/code/common/db/src/main/resources/db/migration/V23_07_0_004__file_storage.sql +++ b/code/common/db/src/main/resources/db/migration/V23_07_0_004__file_storage.sql @@ -13,7 +13,7 @@ CREATE TABLE IF NOT EXISTS FILE_STORAGE ( BASE_ID BIGINT NOT NULL, PATH VARCHAR(255) NOT NULL COMMENT 'The path to the storage relative to the base', DESCRIPTION VARCHAR(255) NOT NULL, - TYPE ENUM ('CRAWL_SPEC', 'CRAWL_DATA', 'PROCESSED_DATA', 'INDEX_STAGING', 'LEXICON_STAGING', 'INDEX_LIVE', 'LEXICON_LIVE', 'SEARCH_SETS', 'BACKUP') NOT NULL, + TYPE ENUM ('CRAWL_SPEC', 'CRAWL_DATA', 'PROCESSED_DATA', 'INDEX_STAGING', 'LEXICON_STAGING', 'INDEX_LIVE', 'LEXICON_LIVE', 'SEARCH_SETS', 'BACKUP', 'EXPORT') NOT NULL, DO_PURGE BOOLEAN NOT NULL DEFAULT FALSE COMMENT 'If true, the storage may be cleaned', CREATE_DATE TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6), CONSTRAINT CONS UNIQUE (BASE_ID, PATH), diff --git a/code/common/db/src/main/resources/db/migration/V23_07_0_005__file_storage_default_values.sql b/code/common/db/src/main/resources/db/migration/V23_07_0_005__file_storage_default_values.sql index 3803911f..74434055 100644 --- a/code/common/db/src/main/resources/db/migration/V23_07_0_005__file_storage_default_values.sql +++ b/code/common/db/src/main/resources/db/migration/V23_07_0_005__file_storage_default_values.sql @@ -21,4 +21,8 @@ FROM FILE_STORAGE_BASE WHERE NAME='Index Storage'; INSERT IGNORE INTO FILE_STORAGE(BASE_ID, PATH, DESCRIPTION, TYPE) SELECT ID, 'ss', "Search Sets", 'SEARCH_SETS' -FROM FILE_STORAGE_BASE WHERE NAME='Index Storage'; \ No newline at end of file +FROM FILE_STORAGE_BASE 
WHERE NAME='Index Storage'; + +INSERT IGNORE INTO FILE_STORAGE(BASE_ID, PATH, DESCRIPTION, TYPE) +SELECT ID, 'export', "Exported Data", 'EXPORT' +FROM FILE_STORAGE_BASE WHERE TYPE='EXPORT'; \ No newline at end of file diff --git a/run/setup.sh b/run/setup.sh index ba4ac355..24feba85 100755 --- a/run/setup.sh +++ b/run/setup.sh @@ -18,7 +18,7 @@ function download_model { pushd $(dirname $0) -mkdir -p model logs db samples install vol/ir/{0,1}/ vol/{lr,lw} vol/iw/{0,1}/search-sets vol/{tmpf,tmps} data +mkdir -p model logs db samples install vol/ir/{0,1}/ vol/{lr,lw} vol/iw/{0,1}/search-sets vol/{tmpf,tmps} data samples/export download_model model/English.DICT https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/master/Models/POS/English.DICT download_model model/English.RDR https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/master/Models/POS/English.RDR From c2b45bec8d49b414d47e915a388785ddb7965dca Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 5 Aug 2023 14:43:28 +0200 Subject: [PATCH 145/157] (mq) Rename notify to sendNotice to avoid name clash with the java object function --- .../nu/marginalia/mq/outbox/MqOutbox.java | 4 ++-- .../java/nu/marginalia/mqsm/StateMachine.java | 21 +++++++++---------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java index 3f3362f1..40022c11 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java @@ -154,10 +154,10 @@ public class MqOutbox { return Optional.ofNullable(response); } - public long notify(String function, String payload) throws Exception { + public long sendNotice(String function, String payload) throws Exception { return persistence.sendNewMessage(inboxName, null, null, function, payload, null); } - public 
long notify(long relatedId, String function, String payload) throws Exception { + public long sendNotice(long relatedId, String function, String payload) throws Exception { return persistence.sendNewMessage(inboxName, null, relatedId, function, payload, null); } diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java index a2567698..64eea90f 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java @@ -3,7 +3,6 @@ package nu.marginalia.mqsm; import nu.marginalia.mq.MessageQueueFactory; import nu.marginalia.mq.MqMessage; import nu.marginalia.mq.MqMessageState; -import nu.marginalia.mq.inbox.MqInboxIf; import nu.marginalia.mq.inbox.MqInboxResponse; import nu.marginalia.mq.inbox.MqSubscription; import nu.marginalia.mq.inbox.MqSynchronousInbox; @@ -136,7 +135,7 @@ public class StateMachine { notifyAll(); } - smOutbox.notify(transition.state(), transition.message()); + smOutbox.sendNotice(transition.state(), transition.message()); } /** Initialize the state machine. */ @@ -148,7 +147,7 @@ public class StateMachine { notifyAll(); } - smOutbox.notify(transition.state(), transition.message()); + smOutbox.sendNotice(transition.state(), transition.message()); } /** Initialize the state machine. */ @@ -160,7 +159,7 @@ public class StateMachine { notifyAll(); } - smOutbox.notify(transition.state(), transition.message()); + smOutbox.sendNotice(transition.state(), transition.message()); } /** Initialize the state machine. */ @@ -172,7 +171,7 @@ public class StateMachine { notifyAll(); } - smOutbox.notify(transition.state(), transition.message()); + smOutbox.sendNotice(transition.state(), transition.message()); } /** Resume the state machine from the last known state. 
*/ @@ -219,7 +218,7 @@ public class StateMachine { try { if (resumeState.resumeBehavior().equals(ResumeBehavior.ERROR)) { // The message is acknowledged, but the state does not support resuming - smOutbox.notify(expectedMessage.id, "ERROR", "Illegal resumption from ACK'ed state " + message.function()); + smOutbox.sendNotice(expectedMessage.id, "ERROR", "Illegal resumption from ACK'ed state " + message.function()); } else if (resumeState.resumeBehavior().equals(ResumeBehavior.RESTART)) { this.state = resumeState; @@ -227,7 +226,7 @@ public class StateMachine { // The message is already acknowledged, we flag it as dead and then send an identical message smOutbox.flagAsDead(message.msgId()); expectedMessage = ExpectedMessage.responseTo(message); - smOutbox.notify(message.msgId(), "INITIAL", ""); + smOutbox.sendNotice(message.msgId(), "INITIAL", ""); } else { this.state = resumeState; @@ -235,7 +234,7 @@ public class StateMachine { // The message is already acknowledged, we flag it as dead and then send an identical message smOutbox.flagAsDead(message.msgId()); expectedMessage = ExpectedMessage.responseTo(message); - smOutbox.notify(message.msgId(), message.function(), message.payload()); + smOutbox.sendNotice(message.msgId(), message.function(), message.payload()); } } catch (Exception e) { @@ -288,7 +287,7 @@ public class StateMachine { } else { expectedMessage = ExpectedMessage.responseTo(msg); - smOutbox.notify(expectedMessage.id, transition.state(), transition.message()); + smOutbox.sendNotice(expectedMessage.id, transition.state(), transition.message()); } } else { @@ -319,7 +318,7 @@ public class StateMachine { // and also permits the real termination message to have an // unique expected ID - long abortMsgId = smOutbox.notify(expectedMessage.id, "ABORT", "Aborting execution"); + long abortMsgId = smOutbox.sendNotice(expectedMessage.id, "ABORT", "Aborting execution"); // Set it as dead to clean up the queue from mystery ACK messages 
smOutbox.flagAsDead(abortMsgId); @@ -333,7 +332,7 @@ public class StateMachine { expectedMessage = ExpectedMessage.expectId(abortMsgId); // Add a state transition to the final state - smOutbox.notify(abortMsgId, finalState.name(), ""); + smOutbox.sendNotice(abortMsgId, finalState.name(), ""); // Dislodge the current task with an interrupt. // It's actually fine if we accidentally interrupt the wrong thread From bf37a3eb25ab76c8b786b9f21545aa7f1d35c706 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 5 Aug 2023 14:43:57 +0200 Subject: [PATCH 146/157] (search-service) Make flushCaches endpoint a notice and not a request --- .../src/main/java/nu/marginalia/search/SearchService.java | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/SearchService.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/SearchService.java index b6e7a5d2..5fe5751e 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/SearchService.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/SearchService.java @@ -11,7 +11,7 @@ import nu.marginalia.search.db.DbUrlDetailsQuery; import nu.marginalia.search.svc.SearchFrontPageService; import nu.marginalia.search.svc.*; import nu.marginalia.service.server.*; -import nu.marginalia.service.server.mq.MqRequest; +import nu.marginalia.service.server.mq.MqNotification; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import spark.Request; @@ -76,11 +76,10 @@ public class SearchService extends Service { Spark.awaitInitialization(); } - @MqRequest(endpoint = SearchMqEndpoints.FLUSH_CACHES) - public String flushCaches(String unusedArg) { + @MqNotification(endpoint = SearchMqEndpoints.FLUSH_CACHES) + public void flushCaches(String unusedArg) { logger.info("Flushing caches"); dbUrlDetailsQuery.clearCaches(); - return "OK"; } private Object serveStatic(Request request, Response 
response) { From 715d61dfea8dbae13087824db5957c571514e006 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 5 Aug 2023 14:44:16 +0200 Subject: [PATCH 147/157] (mq) Fix bug in notice handling where they were registered on the wrong name --- .../server/mq/ServiceMqSubscription.java | 27 +++++++++++++------ 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/code/common/service/src/main/java/nu/marginalia/service/server/mq/ServiceMqSubscription.java b/code/common/service/src/main/java/nu/marginalia/service/server/mq/ServiceMqSubscription.java index d344d928..61a024f5 100644 --- a/code/common/service/src/main/java/nu/marginalia/service/server/mq/ServiceMqSubscription.java +++ b/code/common/service/src/main/java/nu/marginalia/service/server/mq/ServiceMqSubscription.java @@ -18,29 +18,40 @@ public class ServiceMqSubscription implements MqSubscription { private final Map notifications = new HashMap<>(); private final Service service; + public ServiceMqSubscription(Service service) { this.service = service; + + /* Wire up all methods annotated with @MqRequest and @MqNotification + * to receive corresponding messages from this subscription */ + for (var method : service.getClass().getMethods()) { var annotation = method.getAnnotation(MqRequest.class); if (annotation != null) { requests.put(annotation.endpoint(), method); } - if (method.getAnnotation(MqNotification.class) != null) { - notifications.put(method.getName(), method); + } + + for (var method : service.getClass().getMethods()) { + var annotation = method.getAnnotation(MqNotification.class); + if (annotation != null) { + notifications.put(annotation.endpoint(), method); } } } @Override public boolean filter(MqMessage rawMessage) { - boolean isInteresting = requests.containsKey(rawMessage.function()) - || notifications.containsKey(rawMessage.function()); - - if (!isInteresting) { - logger.warn("Received message for unknown function " + rawMessage.function()); + if 
(requests.containsKey(rawMessage.function())) { + return true; + } + if (notifications.containsKey(rawMessage.function())) { + return true; } - return isInteresting; + logger.warn("Received message for unknown function " + rawMessage.function()); + + return false; } @Override From be444f9172d75c6b7abbbd634bf52b8c8610d344 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 5 Aug 2023 14:44:55 +0200 Subject: [PATCH 148/157] (control) New actions view, re-arrange navigation menu --- .../nu/marginalia/control/ControlService.java | 51 +++-- .../control/actor/ControlActors.java | 6 +- .../control/actor/task/ExportDataActor.java | 192 ++++++++++++++++++ .../control/actor/task/FlushLinkDatabase.java | 85 ++++++++ .../nu/marginalia/control/model/Actor.java | 5 +- .../control/svc/ControlActionsService.java | 111 ++++++++++ .../resources/templates/control/actions.hdb | 103 ++++++++++ .../templates/control/partials/nav.hdb | 12 +- .../templates/control/storage-details.hdb | 75 ++++--- 9 files changed, 592 insertions(+), 48 deletions(-) create mode 100644 code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ExportDataActor.java create mode 100644 code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/FlushLinkDatabase.java create mode 100644 code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActionsService.java create mode 100644 code/services-core/control-service/src/main/resources/templates/control/actions.hdb diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java index 2fc8f121..d239f1aa 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java @@ -58,6 +58,7 @@ public class ControlService extends Service 
{ ControlFileStorageService controlFileStorageService, ApiKeyService apiKeyService, DomainComplaintService domainComplaintService, + ControlActionsService controlActionsService, MqPersistence persistence ) throws IOException { @@ -88,6 +89,8 @@ public class ControlService extends Service { var newMessageRenderer = rendererFactory.renderer("control/new-message"); var viewMessageRenderer = rendererFactory.renderer("control/view-message"); + var actionsViewRenderer = rendererFactory.renderer("control/actions"); + this.controlActorService = controlActorService; this.staticResources = staticResources; @@ -101,28 +104,26 @@ public class ControlService extends Service { Spark.get("/public/", (req, rsp) -> indexRenderer.render(Map.of())); + Spark.get("/public/actions", (rq,rsp) -> new Object() , actionsViewRenderer::render); Spark.get("/public/services", this::servicesModel, servicesRenderer::render); Spark.get("/public/services/:id", this::serviceModel, serviceByIdRenderer::render); Spark.get("/public/messages/:id", this::existingMessageModel, gson::toJson); Spark.get("/public/actors", this::processesModel, actorsRenderer::render); Spark.get("/public/actors/:fsm", this::actorDetailsModel, actorDetailsRenderer::render); - Spark.get("/public/storage", this::storageModel, storageRenderer::render); - Spark.get("/public/storage/specs", this::storageModelSpecs, storageSpecsRenderer::render); - Spark.get("/public/storage/crawls", this::storageModelCrawls, storageCrawlsRenderer::render); - Spark.get("/public/storage/processed", this::storageModelProcessed, storageProcessedRenderer::render); - Spark.get("/public/storage/:id", this::storageDetailsModel, storageDetailsRenderer::render); - Spark.get("/public/storage/:id/file", controlFileStorageService::downloadFileFromStorage); - final HtmlRedirect redirectToServices = new HtmlRedirect("/services"); - final HtmlRedirect redirectToProcesses = new HtmlRedirect("/actors"); + final HtmlRedirect redirectToActors = new 
HtmlRedirect("/actors"); final HtmlRedirect redirectToApiKeys = new HtmlRedirect("/api-keys"); final HtmlRedirect redirectToStorage = new HtmlRedirect("/storage"); final HtmlRedirect redirectToComplaints = new HtmlRedirect("/complaints"); final HtmlRedirect redirectToMessageQueue = new HtmlRedirect("/message-queue"); - Spark.post("/public/fsms/:fsm/start", controlActorService::startFsm, redirectToProcesses); - Spark.post("/public/fsms/:fsm/stop", controlActorService::stopFsm, redirectToProcesses); + // FSMs + + Spark.post("/public/fsms/:fsm/start", controlActorService::startFsm, redirectToActors); + Spark.post("/public/fsms/:fsm/stop", controlActorService::stopFsm, redirectToActors); + + // Message Queue Spark.get("/public/message-queue", this::messageQueueModel, messageQueueRenderer::render); Spark.post("/public/message-queue/", (rq, rsp) -> { @@ -156,14 +157,26 @@ public class ControlService extends Service { return ""; }, redirectToMessageQueue); - Spark.post("/public/storage/:fid/crawl", controlActorService::triggerCrawling, redirectToProcesses); - Spark.post("/public/storage/:fid/recrawl", controlActorService::triggerRecrawling, redirectToProcesses); - Spark.post("/public/storage/:fid/process", controlActorService::triggerProcessing, redirectToProcesses); - Spark.post("/public/storage/:fid/load", controlActorService::loadProcessedData, redirectToProcesses); + // Storage + Spark.get("/public/storage", this::storageModel, storageRenderer::render); + Spark.get("/public/storage/specs", this::storageModelSpecs, storageSpecsRenderer::render); + Spark.get("/public/storage/crawls", this::storageModelCrawls, storageCrawlsRenderer::render); + Spark.get("/public/storage/processed", this::storageModelProcessed, storageProcessedRenderer::render); + Spark.get("/public/storage/:id", this::storageDetailsModel, storageDetailsRenderer::render); + Spark.get("/public/storage/:id/file", controlFileStorageService::downloadFileFromStorage); + + // Storage Actions + + 
Spark.post("/public/storage/:fid/crawl", controlActorService::triggerCrawling, redirectToActors); + Spark.post("/public/storage/:fid/recrawl", controlActorService::triggerRecrawling, redirectToActors); + Spark.post("/public/storage/:fid/process", controlActorService::triggerProcessing, redirectToActors); + Spark.post("/public/storage/:fid/load", controlActorService::loadProcessedData, redirectToActors); Spark.post("/public/storage/specs", controlActorService::createCrawlSpecification, redirectToStorage); Spark.post("/public/storage/:fid/delete", controlFileStorageService::flagFileForDeletionRequest, redirectToStorage); + // API Keys + Spark.get("/public/api-keys", this::apiKeysModel, apiKeysRenderer::render); Spark.post("/public/api-keys", this::createApiKey, redirectToApiKeys); Spark.delete("/public/api-keys/:key", this::deleteApiKey, redirectToApiKeys); @@ -173,6 +186,16 @@ public class ControlService extends Service { Spark.get("/public/complaints", this::complaintsModel, domainComplaintsRenderer::render); Spark.post("/public/complaints/:domain", this::reviewComplaint, redirectToComplaints); + // Actions + + Spark.post("/public/actions/calculate-adjacencies", controlActionsService::calculateAdjacencies, redirectToActors); + Spark.post("/public/actions/repartition-index", controlActionsService::triggerRepartition, redirectToActors); + Spark.post("/public/actions/reconvert-index", controlActionsService::triggerReconversion, redirectToActors); + Spark.post("/public/actions/trigger-data-exports", controlActionsService::triggerDataExports, redirectToActors); + Spark.post("/public/actions/flush-search-caches", controlActionsService::flushSearchCaches, redirectToActors); + Spark.post("/public/actions/flush-api-caches", controlActionsService::flushApiCaches, redirectToActors); + Spark.post("/public/actions/flush-links-database", controlActionsService::flushLinkDatabase, redirectToActors); + Spark.get("/public/:resource", this::serveStatic); 
monitors.subscribe(this::logMonitorStateChange); diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java index 12f15bf9..782b27c1 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java @@ -45,7 +45,9 @@ public class ControlActors { ProcessLivenessMonitorActor processMonitorFSM, FileStorageMonitorActor fileStorageMonitorActor, TriggerAdjacencyCalculationActor triggerAdjacencyCalculationActor, - CrawlJobExtractorActor crawlJobExtractorActor + CrawlJobExtractorActor crawlJobExtractorActor, + ExportDataActor exportDataActor, + FlushLinkDatabase flushLinkDatabase ) { this.messageQueueFactory = messageQueueFactory; this.eventLog = baseServiceParams.eventLog; @@ -62,6 +64,8 @@ public class ControlActors { register(Actor.FILE_STORAGE_MONITOR, fileStorageMonitorActor); register(Actor.ADJACENCY_CALCULATION, triggerAdjacencyCalculationActor); register(Actor.CRAWL_JOB_EXTRACTOR, crawlJobExtractorActor); + register(Actor.EXPORT_DATA, exportDataActor); + register(Actor.FLUSH_LINK_DATABASE, flushLinkDatabase); } private void register(Actor process, AbstractStateGraph graph) { diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ExportDataActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ExportDataActor.java new file mode 100644 index 00000000..10227dc9 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ExportDataActor.java @@ -0,0 +1,192 @@ +package nu.marginalia.control.actor.task; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import com.zaxxer.hikari.HikariDataSource; +import lombok.AllArgsConstructor; +import 
lombok.NoArgsConstructor; +import lombok.With; +import nu.marginalia.db.storage.FileStorageService; +import nu.marginalia.db.storage.model.FileStorageId; +import nu.marginalia.db.storage.model.FileStorageType; +import nu.marginalia.mqsm.StateFactory; +import nu.marginalia.mqsm.graph.AbstractStateGraph; +import nu.marginalia.mqsm.graph.GraphState; +import nu.marginalia.mqsm.graph.ResumeBehavior; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedWriter; +import java.io.OutputStreamWriter; +import java.nio.file.Files; +import java.nio.file.StandardCopyOption; +import java.nio.file.StandardOpenOption; +import java.nio.file.attribute.PosixFilePermissions; +import java.util.zip.GZIPOutputStream; + +@Singleton +public class ExportDataActor extends AbstractStateGraph { + + private static final String blacklistFilename = "blacklist.csv.gz"; + private static final String domainsFilename = "domains.csv.gz"; + private static final String linkGraphFilename = "linkgraph.csv.gz"; + + + // STATES + public static final String INITIAL = "INITIAL"; + public static final String EXPORT_DOMAINS = "EXPORT-DOMAINS"; + public static final String EXPORT_BLACKLIST = "EXPORT-BLACKLIST"; + public static final String EXPORT_LINK_GRAPH = "EXPORT-LINK-GRAPH"; + + public static final String END = "END"; + private final FileStorageService storageService; + private final HikariDataSource dataSource; + private final Logger logger = LoggerFactory.getLogger(getClass()); + + @AllArgsConstructor @With @NoArgsConstructor + public static class Message { + public FileStorageId storageId = null; + }; + + @Inject + public ExportDataActor(StateFactory stateFactory, + FileStorageService storageService, + HikariDataSource dataSource) + { + super(stateFactory); + this.storageService = storageService; + this.dataSource = dataSource; + } + + @GraphState(name = INITIAL, + next = EXPORT_BLACKLIST, + description = """ + Find EXPORT storage area, then transition to EXPORT-BLACKLIST. 
+ """) + public Message init(Integer i) throws Exception { + + var storage = storageService.getStorageByType(FileStorageType.EXPORT); + if (storage == null) error("Bad storage id"); + + return new Message().withStorageId(storage.id()); + } + + @GraphState(name = EXPORT_BLACKLIST, + next = EXPORT_DOMAINS, + resume = ResumeBehavior.ERROR, + description = """ + Export the blacklist from the database to the EXPORT storage area. + """ + ) + public Message exportBlacklist(Message message) throws Exception { + var storage = storageService.getStorage(message.storageId); + var tmpFile = Files.createTempFile(storage.asPath(), "export", ".csv.gz", + PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--"))); + + try (var bw = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(Files.newOutputStream(tmpFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)))); + var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement("SELECT URL_DOMAIN FROM EC_DOMAIN_BLACKLIST"); + ) + { + stmt.setFetchSize(1000); + var rs = stmt.executeQuery(); + while (rs.next()) { + bw.write(rs.getString(1)); + bw.write("\n"); + } + Files.move(tmpFile, storage.asPath().resolve(blacklistFilename), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING); + } + catch (Exception ex) { + logger.error("Failed to export blacklist", ex); + error("Failed to export blacklist"); + } + finally { + Files.deleteIfExists(tmpFile); + } + + return message; + } + + @GraphState( + name = EXPORT_DOMAINS, + next = EXPORT_LINK_GRAPH, + resume = ResumeBehavior.RETRY, + description = """ + Export known domains to the EXPORT storage area. 
+ """ + ) + public Message exportDomains(Message message) throws Exception { + var storage = storageService.getStorage(message.storageId); + var tmpFile = Files.createTempFile(storage.asPath(), "export", ".csv.gz", + PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--"))); + + try (var bw = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(Files.newOutputStream(tmpFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)))); + var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement("SELECT DOMAIN_NAME, ID, INDEXED, STATE FROM EC_DOMAIN"); + ) + { + stmt.setFetchSize(1000); + var rs = stmt.executeQuery(); + while (rs.next()) { + bw.write(rs.getString("DOMAIN_NAME")); + bw.write(","); + bw.write(rs.getString("ID")); + bw.write(","); + bw.write(rs.getString("INDEXED")); + bw.write(","); + bw.write(rs.getString("STATE")); + bw.write("\n"); + } + Files.move(tmpFile, storage.asPath().resolve(domainsFilename), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING); + } + catch (Exception ex) { + logger.error("Failed to export domains", ex); + error("Failed to export domains"); + } + finally { + Files.deleteIfExists(tmpFile); + } + + return message; + } + + @GraphState( + name = EXPORT_LINK_GRAPH, + next = END, + resume = ResumeBehavior.RETRY, + description = """ + Export the domain link graph to the EXPORT storage area. 
+ """ + ) + public Message exportLinkGraph(Message message) throws Exception { + var storage = storageService.getStorage(message.storageId); + var tmpFile = Files.createTempFile(storage.asPath(), "export", ".csv.gz", + PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--"))); + + try (var bw = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(Files.newOutputStream(tmpFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)))); + var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK"); + ) + { + stmt.setFetchSize(1000); + var rs = stmt.executeQuery(); + while (rs.next()) { + bw.write(rs.getString("SOURCE_DOMAIN_ID")); + bw.write(","); + bw.write(rs.getString("DEST_DOMAIN_ID")); + bw.write("\n"); + } + Files.move(tmpFile, storage.asPath().resolve(linkGraphFilename), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING); + } + catch (Exception ex) { + logger.error("Failed to export link graph", ex); + error("Failed to export link graph"); + } + finally { + Files.deleteIfExists(tmpFile); + } + + return message; + } + +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/FlushLinkDatabase.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/FlushLinkDatabase.java new file mode 100644 index 00000000..833efc19 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/FlushLinkDatabase.java @@ -0,0 +1,85 @@ +package nu.marginalia.control.actor.task; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import com.zaxxer.hikari.HikariDataSource; +import lombok.AllArgsConstructor; +import lombok.NoArgsConstructor; +import lombok.With; +import nu.marginalia.db.storage.model.FileStorageId; +import nu.marginalia.mqsm.StateFactory; +import nu.marginalia.mqsm.graph.AbstractStateGraph; 
+import nu.marginalia.mqsm.graph.GraphState; +import nu.marginalia.mqsm.graph.ResumeBehavior; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedWriter; +import java.io.OutputStreamWriter; +import java.nio.file.Files; +import java.nio.file.StandardCopyOption; +import java.nio.file.StandardOpenOption; +import java.nio.file.attribute.PosixFilePermissions; +import java.sql.SQLException; +import java.util.zip.GZIPOutputStream; + +@Singleton +public class FlushLinkDatabase extends AbstractStateGraph { + + + // STATES + public static final String INITIAL = "INITIAL"; + public static final String FLUSH_DATABASE = "FLUSH_DATABASE"; + + public static final String END = "END"; + private final HikariDataSource dataSource; + private final Logger logger = LoggerFactory.getLogger(getClass()); + + @AllArgsConstructor @With @NoArgsConstructor + public static class Message { + public FileStorageId storageId = null; + }; + + @Inject + public FlushLinkDatabase(StateFactory stateFactory, + HikariDataSource dataSource) + { + super(stateFactory); + this.dataSource = dataSource; + } + + @GraphState(name = INITIAL, + next = FLUSH_DATABASE, + description = """ + Initial stage + """) + public void init(Integer i) throws Exception { + + } + + @GraphState(name = FLUSH_DATABASE, + next = END, + resume = ResumeBehavior.ERROR, + description = """ + Truncate the domain and link tables. 
+ """ + ) + public void exportBlacklist() throws Exception { + try (var conn = dataSource.getConnection(); + var stmt = conn.createStatement()) + { + stmt.executeUpdate("SET FOREIGN_KEY_CHECKS = 0"); + stmt.executeUpdate("TRUNCATE TABLE EC_PAGE_DATA"); + stmt.executeUpdate("TRUNCATE TABLE EC_URL"); + stmt.executeUpdate("TRUNCATE TABLE EC_DOMAIN_LINK"); + stmt.executeUpdate("TRUNCATE TABLE DOMAIN_METADATA"); + stmt.executeUpdate("SET FOREIGN_KEY_CHECKS = 1"); + } + catch (SQLException ex) { + logger.error("Failed to truncate tables", ex); + error("Failed to truncate tables"); + } + } + + +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/model/Actor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/Actor.java index 755d67a1..937f8ff8 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/model/Actor.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/Actor.java @@ -11,8 +11,9 @@ public enum Actor { PROCESS_LIVENESS_MONITOR, FILE_STORAGE_MONITOR, ADJACENCY_CALCULATION, - CRAWL_JOB_EXTRACTOR - ; + CRAWL_JOB_EXTRACTOR, + EXPORT_DATA, + FLUSH_LINK_DATABASE; public String id() { diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActionsService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActionsService.java new file mode 100644 index 00000000..eb683c3e --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActionsService.java @@ -0,0 +1,111 @@ +package nu.marginalia.control.svc; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.control.actor.ControlActors; +import nu.marginalia.control.model.Actor; +import nu.marginalia.index.client.IndexClient; +import nu.marginalia.index.client.IndexMqEndpoints; +import nu.marginalia.mq.MessageQueueFactory; +import 
nu.marginalia.mq.outbox.MqOutbox; +import nu.marginalia.search.client.SearchClient; +import nu.marginalia.search.client.SearchMqEndpoints; +import nu.marginalia.service.control.ServiceEventLog; +import nu.marginalia.service.id.ServiceId; +import nu.marginalia.service.server.BaseServiceParams; +import spark.Request; +import spark.Response; +import spark.Spark; + +import java.util.UUID; + +@Singleton +public class ControlActionsService { + + private final ControlActors actors; + private final SearchClient searchClient; + private final IndexClient indexClient; + private final MqOutbox apiOutbox; + private final ServiceEventLog eventLog; + + @Inject + public ControlActionsService(ControlActors actors, + SearchClient searchClient, + IndexClient indexClient, + MessageQueueFactory mqFactory, + ServiceEventLog eventLog) { + + this.actors = actors; + this.searchClient = searchClient; + this.indexClient = indexClient; + this.apiOutbox = createApiOutbox(mqFactory); + this.eventLog = eventLog; + + } + + /** This is a hack to get around the fact that the API service is not a core service + * and lacks a proper internal API + */ + private MqOutbox createApiOutbox(MessageQueueFactory mqFactory) { + String inboxName = ServiceId.Api.name + ":" + "0"; + String outboxName = System.getProperty("service-name", UUID.randomUUID().toString()); + return mqFactory.createOutbox(inboxName, outboxName, UUID.randomUUID()); + } + + public Object calculateAdjacencies(Request request, Response response) throws Exception { + eventLog.logEvent("USER-ACTION", "CALCULATE-ADJACENCIES"); + + actors.start(Actor.ADJACENCY_CALCULATION); + + return ""; + } + + public Object triggerDataExports(Request request, Response response) throws Exception { + eventLog.logEvent("USER-ACTION", "EXPORT-DATA"); + actors.start(Actor.EXPORT_DATA); + + return ""; + } + + public Object flushSearchCaches(Request request, Response response) throws Exception { + eventLog.logEvent("USER-ACTION", "FLUSH-SEARCH-CACHES"); + 
searchClient.outbox().sendNotice(SearchMqEndpoints.FLUSH_CACHES, ""); + + return ""; + } + + public Object flushApiCaches(Request request, Response response) throws Exception { + eventLog.logEvent("USER-ACTION", "FLUSH-API-CACHES"); + apiOutbox.sendNotice("FLUSH_CACHES", ""); + + return ""; + } + + public Object flushLinkDatabase(Request request, Response response) throws Exception { + + String footgunLicense = request.queryParams("footgun-license"); + + if (!"YES".equals(footgunLicense)) { + Spark.halt(403); + return "You must agree to the footgun license to flush the link database"; + } + + eventLog.logEvent("USER-ACTION", "FLUSH-LINK-DATABASE"); + + actors.start(Actor.FLUSH_LINK_DATABASE); + + return ""; + } + + public Object triggerRepartition(Request request, Response response) throws Exception { + indexClient.outbox().sendAsync(IndexMqEndpoints.INDEX_REPARTITION, ""); + + return null; + } + + public Object triggerReconversion(Request request, Response response) throws Exception { + indexClient.outbox().sendAsync(IndexMqEndpoints.INDEX_REINDEX, ""); + + return null; + } +} diff --git a/code/services-core/control-service/src/main/resources/templates/control/actions.hdb b/code/services-core/control-service/src/main/resources/templates/control/actions.hdb new file mode 100644 index 00000000..5efcedbd --- /dev/null +++ b/code/services-core/control-service/src/main/resources/templates/control/actions.hdb @@ -0,0 +1,103 @@ + + + + Control Service + + + + + {{> control/partials/nav}} +
    +

    Actions

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    ActionTrigger
    Trigger Adjacency Calculation

    + This will trigger a recalculation of website similarities, which affects + the rankings calculations. +

    +
    + +
    +
    Repartition Index

    + This will recalculate the rankings and search sets for the index. +

    +
    + +
    +
    Reconvert Index

    + This will reconstruct the index from the index journal. +

    +
    + +
    +
    Flush search-service Caches

    + This will instruct the search-service to flush its caches, + getting rid of any stale data. This may rarely be necessary after + reloading the index. +

    +
    + +
    +
    Flush api-service Caches

    + This will instruct the api-service to flush its caches, + getting rid of any stale data. This will be necessary after + changes to the API licenses directly through the database. +

    +
    + +
    +
    Trigger Data Exports

    + This exports the data from the database into a set of CSV files +

    +
    + +
    +
    + WARNING -- Destructive Actions Below This Line +
    Flush Links Database.

    + This will drop all known URLs and domain links.
    + This action is not reversible. +

    +
    +
    + +

    + +
    +
    +
    + + \ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/partials/nav.hdb b/code/services-core/control-service/src/main/resources/templates/control/partials/nav.hdb index 94f3b13a..05086051 100644 --- a/code/services-core/control-service/src/main/resources/templates/control/partials/nav.hdb +++ b/code/services-core/control-service/src/main/resources/templates/control/partials/nav.hdb @@ -1,12 +1,16 @@ \ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/storage-details.hdb b/code/services-core/control-service/src/main/resources/templates/control/storage-details.hdb index d4f32718..e1574fd5 100644 --- a/code/services-core/control-service/src/main/resources/templates/control/storage-details.hdb +++ b/code/services-core/control-service/src/main/resources/templates/control/storage-details.hdb @@ -45,33 +45,54 @@ {{/if}}

    Actions

    - {{#with storage.self}} - {{#if isCrawlable}} -
    - Perform a full re-crawl of this data:
    -
    - {{/if}} - {{#if isLoadable}} -
    - Load this data into index:
    -
    - {{/if}} - {{#if isConvertible}} -
    - Process and load this data into index:
    -
    - {{/if}} - {{#if isRecrawlable}} -
    - Perform a re-crawl of this data:
    -
    - {{/if}} - {{#if isDeletable}} -
    - Delete this data:
    -
    - {{/if}} - {{/with}} + + + + + + {{#with storage.self}} + {{#if isCrawlable}} + + + + + + + {{/if}} + {{#if isLoadable}} + + + + + + + {{/if}} + {{#if isConvertible}} + + + + + + + {{/if}} + {{#if isRecrawlable}} + + + + + + + {{/if}} + {{#if isDeletable}} + + + + + + + {{/if}} + {{/with}} +
    DescriptionTrigger
    Perform a full re-crawl of this data
    Load this data into index
    Process and load this data into index
    Perform a re-crawl of this data
    Delete this data
    {{#if storage.related}}

    Related

    From 4ab1cd950231fb676113eba5a94515bf75dc24d4 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 7 Aug 2023 12:57:38 +0200 Subject: [PATCH 149/157] (*) last touches --- .../mqapi/converting/ConvertAction.java | 3 +- code/common/message-queue/readme.md | 23 +- .../mq/inbox/MqSingleShotInbox.java | 6 +- .../mq/persistence/MqPersistence.java | 78 +++--- ...ateMachine.java => ActorStateMachine.java} | 14 +- .../mqsm/graph/AbstractStateGraph.java | 10 +- .../mqsm/graph/ControlFlowException.java | 1 + .../nu/marginalia/mqsm/graph/GraphState.java | 1 + ...inalState.java => TerminalGraphState.java} | 2 +- ...t.java => ActorStateMachineErrorTest.java} | 4 +- ...st.java => ActorStateMachineNullTest.java} | 4 +- ....java => ActorStateMachineResumeTest.java} | 12 +- ...neTest.java => ActorStateMachineTest.java} | 10 +- .../model/DocumentKeywordsBuilder.java | 2 +- .../processes/converting-process/build.gradle | 3 + .../marginalia/converting/ConverterMain.java | 9 + .../converting/compiler/UrlsCompiler.java | 3 + .../EncyclopediaMarginaliaNuSideloader.java | 4 +- .../sideload/SideloadSourceFactory.java | 14 +- .../sideload/StackExchange7zReader.java | 229 ++++++++++++++++++ .../sideload/StackexchangeSideloader.java | 149 ++++++++++++ .../sideload/StackexchangeSideloaderTest.java | 21 ++ .../nu/marginalia/loading/LoaderMain.java | 6 +- .../loader/LoaderIndexJournalWriter.java | 11 +- .../nu/marginalia/control/ControlService.java | 16 +- .../control/actor/ControlActors.java | 14 +- .../monitor/AbstractProcessSpawnerActor.java | 4 +- .../control/actor/task/ConvertActor.java | 176 ++++++++++++++ .../TriggerAdjacencyCalculationActor.java | 2 +- ...atabase.java => TruncateLinkDatabase.java} | 13 +- .../nu/marginalia/control/model/Actor.java | 4 +- .../control/svc/ControlActionsService.java | 9 +- .../control/svc/ControlActorService.java | 9 +- .../resources/templates/control/actions.hdb | 10 +- .../resources/templates/control/index.hdb | 10 +- 35 files changed, 780 
insertions(+), 106 deletions(-) rename code/common/message-queue/src/main/java/nu/marginalia/mqsm/{StateMachine.java => ActorStateMachine.java} (97%) rename code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/{TerminalState.java => TerminalGraphState.java} (84%) rename code/common/message-queue/src/test/java/nu/marginalia/mqsm/{StateMachineErrorTest.java => ActorStateMachineErrorTest.java} (95%) rename code/common/message-queue/src/test/java/nu/marginalia/mqsm/{StateMachineNullTest.java => ActorStateMachineNullTest.java} (95%) rename code/common/message-queue/src/test/java/nu/marginalia/mqsm/{StateMachineResumeTest.java => ActorStateMachineResumeTest.java} (89%) rename code/common/message-queue/src/test/java/nu/marginalia/mqsm/{StateMachineTest.java => ActorStateMachineTest.java} (89%) create mode 100644 code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/StackExchange7zReader.java create mode 100644 code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/StackexchangeSideloader.java create mode 100644 code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/StackexchangeSideloaderTest.java create mode 100644 code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ConvertActor.java rename code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/{FlushLinkDatabase.java => TruncateLinkDatabase.java} (83%) diff --git a/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/converting/ConvertAction.java b/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/converting/ConvertAction.java index 0c3f575a..abc571f7 100644 --- a/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/converting/ConvertAction.java +++ b/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/converting/ConvertAction.java @@ -2,5 +2,6 @@ package nu.marginalia.mqapi.converting; public enum ConvertAction { ConvertCrawlData, - 
SideloadEncyclopedia + SideloadEncyclopedia, + SideloadStackexchange } diff --git a/code/common/message-queue/readme.md b/code/common/message-queue/readme.md index cbb5082c..d71459dd 100644 --- a/code/common/message-queue/readme.md +++ b/code/common/message-queue/readme.md @@ -24,8 +24,9 @@ The inbox implementations as well as the outbox can be constructed via the `Mess ## Message Queue State Machine (MQSM) -The MQSM is a finite state machine that is backed by the message queue. The machine itself -is defined through a class that extends the 'AbstractStateGraph'; with state transitions and +The MQSM is a finite state machine backed by the message queue, used to implement an actor-style paradigm. + +The machine itself is defined through a class that extends the 'AbstractStateGraph'; with state transitions and names defined as implementations. Example: @@ -80,4 +81,20 @@ It can not be assumed that the states are invoked within the same process, or ev on the same day, etc. The usual considerations for writing deterministic Java code are advisable unless unavoidable; -all state must be local, don't iterate over hash maps, etc. \ No newline at end of file +all state must be local, don't iterate over hash maps, etc. + +### Create a state machine +To create an ActorStateMachine from the above class, the following code can be used: + +```java +ActorStateMachine actorStateMachine = new ActorStateMachine( + messageQueueFactory, + actorInboxName, + actorInstanceUUID, + new ExampleStateMachine()); + +actorStateMachine.start(); +``` + +The state machine will now run until it reaches the end state, +listening to messages on the inbox for state transitions. 
diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSingleShotInbox.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSingleShotInbox.java index edecf9d4..19645c64 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSingleShotInbox.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSingleShotInbox.java @@ -55,8 +55,10 @@ public class MqSingleShotInbox { /** Steal a message from the inbox, and change the owner to this instance. This is useful - * for resuming an aborted process. - * + * for resuming an aborted process. This should be done judiciously, only in cases where we're certain + * that the original owner is no longer running, as it may otherwise cause duplicate processing, race + * conditions, etc. + *

    * @param predicate A predicate that must be true for the message to be stolen * @return The stolen message, or empty if no message was stolen */ diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java index 3413ffea..030fff81 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java @@ -14,6 +14,10 @@ import java.util.*; import static nu.marginalia.mq.MqMessageState.NEW; +/** A persistence layer for the message queue. + * <p>
    + * All storage operations must be done through this class. + */ @Singleton public class MqPersistence { private final HikariDataSource dataSource; @@ -23,33 +27,6 @@ public class MqPersistence { this.dataSource = dataSource; } - /** Flags messages as dead if they have not been set to a terminal state within a TTL after the last update. */ - public int reapDeadMessages() throws SQLException { - try (var conn = dataSource.getConnection(); - var setToDead = conn.prepareStatement(""" - UPDATE MESSAGE_QUEUE - SET STATE='DEAD', UPDATED_TIME=CURRENT_TIMESTAMP(6) - WHERE STATE IN ('NEW', 'ACK') - AND TTL IS NOT NULL - AND TIMESTAMPDIFF(SECOND, UPDATED_TIME, CURRENT_TIMESTAMP(6)) > TTL - """)) { - return setToDead.executeUpdate(); - } - } - - /** Removes messages that have been set to a terminal state a while after their last update timestamp */ - public int cleanOldMessages() throws SQLException { - try (var conn = dataSource.getConnection(); - var setToDead = conn.prepareStatement(""" - DELETE FROM MESSAGE_QUEUE - WHERE STATE = 'OK' - AND TTL IS NOT NULL - AND TIMESTAMPDIFF(SECOND, UPDATED_TIME, CURRENT_TIMESTAMP(6)) > 3600 - """)) { - return setToDead.executeUpdate(); - } - } - /** * Adds a new message to the message queue. * @@ -100,7 +77,14 @@ public class MqPersistence { } } - /** Modifies the state of a message by id */ + /** Modifies the state of a message by id. + * <p>
    + * If the state is 'NEW', ownership information will be stripped to avoid creating + * a broken message that can't be dequeued because it has an owner. + * + * @param id The id of the message + * @param mqMessageState The new state + * */ public void updateMessageState(long id, MqMessageState mqMessageState) throws SQLException { if (NEW == mqMessageState) { reinitializeMessage(id); @@ -124,7 +108,7 @@ public class MqPersistence { } /** Sets the message to 'NEW' state and removes any owner */ - public void reinitializeMessage(long id) throws SQLException { + private void reinitializeMessage(long id) throws SQLException { try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(""" UPDATE MESSAGE_QUEUE @@ -219,7 +203,9 @@ public class MqPersistence { } } - /** Return up to n unprocessed messages from the specified inbox that are in states 'NEW' or 'ACK' */ + /** Return up to n unprocessed messages from the specified inbox that are in states 'NEW' or 'ACK' + * without updating their ownership information + */ public Collection eavesdrop(String inboxName, int n) throws SQLException { try (var conn = dataSource.getConnection(); var queryStmt = conn.prepareStatement(""" @@ -263,6 +249,11 @@ public class MqPersistence { } + /** Returns the message with the specified ID + * + * @throws SQLException if there is a problem with the database + * @throws IllegalArgumentException if the message doesn't exist + */ public MqMessage getMessage(long id) throws SQLException { try (var conn = dataSource.getConnection(); var queryStmt = conn.prepareStatement(""" @@ -427,6 +418,7 @@ public class MqPersistence { } + /** Modify the message indicated by id to have the given owner information */ public void changeOwner(long id, String instanceUUID, int tick) { try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(""" @@ -443,4 +435,30 @@ public class MqPersistence { } + /** Flags messages as dead if they have not been set to a terminal state 
within a TTL after the last update. */ + public int reapDeadMessages() throws SQLException { + try (var conn = dataSource.getConnection(); + var setToDead = conn.prepareStatement(""" + UPDATE MESSAGE_QUEUE + SET STATE='DEAD', UPDATED_TIME=CURRENT_TIMESTAMP(6) + WHERE STATE IN ('NEW', 'ACK') + AND TTL IS NOT NULL + AND TIMESTAMPDIFF(SECOND, UPDATED_TIME, CURRENT_TIMESTAMP(6)) > TTL + """)) { + return setToDead.executeUpdate(); + } + } + + /** Removes messages that have been set to a terminal state a while after their last update timestamp */ + public int cleanOldMessages() throws SQLException { + try (var conn = dataSource.getConnection(); + var setToDead = conn.prepareStatement(""" + DELETE FROM MESSAGE_QUEUE + WHERE STATE = 'OK' + AND TTL IS NOT NULL + AND TIMESTAMPDIFF(SECOND, UPDATED_TIME, CURRENT_TIMESTAMP(6)) > 3600 + """)) { + return setToDead.executeUpdate(); + } + } } diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/ActorStateMachine.java similarity index 97% rename from code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java rename to code/common/message-queue/src/main/java/nu/marginalia/mqsm/ActorStateMachine.java index 64eea90f..a3f7edbe 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateMachine.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/ActorStateMachine.java @@ -18,12 +18,12 @@ import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import java.util.function.BiConsumer; -/** A state machine that can be used to implement a finite state machine +/** A state machine that can be used to implement an actor * using a message queue as the persistence layer. The state machine is * resilient to crashes and can be resumed from the last state. 
*/ -public class StateMachine { - private final Logger logger = LoggerFactory.getLogger(StateMachine.class); +public class ActorStateMachine { + private final Logger logger = LoggerFactory.getLogger(ActorStateMachine.class); private final MqSynchronousInbox smInbox; private final MqOutbox smOutbox; @@ -43,10 +43,10 @@ public class StateMachine { private final boolean isDirectlyInitializable; - public StateMachine(MessageQueueFactory messageQueueFactory, - String queueName, - UUID instanceUUID, - AbstractStateGraph stateGraph) + public ActorStateMachine(MessageQueueFactory messageQueueFactory, + String queueName, + UUID instanceUUID, + AbstractStateGraph stateGraph) { this.queueName = queueName; diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/AbstractStateGraph.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/AbstractStateGraph.java index 477788ef..977f2ce4 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/AbstractStateGraph.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/AbstractStateGraph.java @@ -66,11 +66,13 @@ public abstract class AbstractStateGraph { return ret; } - public Set terminalStates() { - Set ret = new HashSet<>(); + + + public Set terminalStates() { + Set ret = new HashSet<>(); for (var method : getClass().getMethods()) { - var gs = method.getAnnotation(TerminalState.class); + var gs = method.getAnnotation(TerminalGraphState.class); if (gs != null) { ret.add(gs); } @@ -88,7 +90,7 @@ public abstract class AbstractStateGraph { ret.add(graphState(method, gs)); } - var ts = method.getAnnotation(TerminalState.class); + var ts = method.getAnnotation(TerminalGraphState.class); if (ts != null) { ret.add(stateFactory.create(ts.name(), ResumeBehavior.ERROR, () -> { throw new ControlFlowException(ts.name(), null); diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/ControlFlowException.java 
b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/ControlFlowException.java index 5354a54a..12e5b569 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/ControlFlowException.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/ControlFlowException.java @@ -1,5 +1,6 @@ package nu.marginalia.mqsm.graph; +/** Exception thrown by a state to indicate that the state machine should jump to a different state. */ public class ControlFlowException extends RuntimeException { private final String state; private final Object payload; diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/GraphState.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/GraphState.java index bf7be4a6..e5764dd2 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/GraphState.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/GraphState.java @@ -4,6 +4,7 @@ package nu.marginalia.mqsm.graph; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; +/** Annotation for declaring a state in an actor's state graph. 
*/ @Retention(RetentionPolicy.RUNTIME) public @interface GraphState { String name(); diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/TerminalState.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/TerminalGraphState.java similarity index 84% rename from code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/TerminalState.java rename to code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/TerminalGraphState.java index 46a2be0c..c7b11730 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/TerminalState.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/TerminalGraphState.java @@ -4,7 +4,7 @@ import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; @Retention(RetentionPolicy.RUNTIME) -public @interface TerminalState { +public @interface TerminalGraphState { String name(); String description() default ""; } diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineErrorTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/ActorStateMachineErrorTest.java similarity index 95% rename from code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineErrorTest.java rename to code/common/message-queue/src/test/java/nu/marginalia/mqsm/ActorStateMachineErrorTest.java index bc9ce5b8..3ca46e83 100644 --- a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineErrorTest.java +++ b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/ActorStateMachineErrorTest.java @@ -26,7 +26,7 @@ import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD; @Tag("slow") @Testcontainers @Execution(SAME_THREAD) -public class StateMachineErrorTest { +public class ActorStateMachineErrorTest { @Container static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") .withDatabaseName("WMSA_prod") @@ -85,7 +85,7 @@ public class StateMachineErrorTest { 
@Test public void smResumeResumableFromNew() throws Exception { var stateFactory = new StateFactory(new GsonBuilder().create()); - var sm = new StateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new ErrorHurdles(stateFactory)); + var sm = new ActorStateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new ErrorHurdles(stateFactory)); sm.init(); diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineNullTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/ActorStateMachineNullTest.java similarity index 95% rename from code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineNullTest.java rename to code/common/message-queue/src/test/java/nu/marginalia/mqsm/ActorStateMachineNullTest.java index e48e6cb5..a20c75f0 100644 --- a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineNullTest.java +++ b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/ActorStateMachineNullTest.java @@ -23,7 +23,7 @@ import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD; @Tag("slow") @Testcontainers @Execution(SAME_THREAD) -public class StateMachineNullTest { +public class ActorStateMachineNullTest { @Container static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") .withDatabaseName("WMSA_prod") @@ -83,7 +83,7 @@ public class StateMachineNullTest { var graph = new TestGraph(stateFactory); - var sm = new StateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), graph); + var sm = new ActorStateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), graph); sm.registerStates(graph); sm.init(); diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineResumeTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/ActorStateMachineResumeTest.java similarity index 89% rename from code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineResumeTest.java rename to 
code/common/message-queue/src/test/java/nu/marginalia/mqsm/ActorStateMachineResumeTest.java index 1ba7e5c5..825a4c43 100644 --- a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineResumeTest.java +++ b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/ActorStateMachineResumeTest.java @@ -27,7 +27,7 @@ import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD; @Tag("slow") @Testcontainers @Execution(SAME_THREAD) -public class StateMachineResumeTest { +public class ActorStateMachineResumeTest { @Container static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") .withDatabaseName("WMSA_prod") @@ -86,7 +86,7 @@ public class StateMachineResumeTest { persistence.sendNewMessage(inboxId, null, -1L, "RESUMABLE", "", null); - var sm = new StateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); + var sm = new ActorStateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); sm.join(2, TimeUnit.SECONDS); sm.stop(); @@ -107,7 +107,7 @@ public class StateMachineResumeTest { long id = persistence.sendNewMessage(inboxId, null, -1L, "RESUMABLE", "", null); persistence.updateMessageState(id, MqMessageState.ACK); - var sm = new StateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); + var sm = new ActorStateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); sm.join(4, TimeUnit.SECONDS); sm.stop(); @@ -129,7 +129,7 @@ public class StateMachineResumeTest { persistence.sendNewMessage(inboxId, null, -1L, "NON-RESUMABLE", "", null); - var sm = new StateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); + var sm = new ActorStateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); sm.join(2, TimeUnit.SECONDS); sm.stop(); @@ -151,7 +151,7 @@ public class StateMachineResumeTest 
{ long id = persistence.sendNewMessage(inboxId, null, null, "NON-RESUMABLE", "", null); persistence.updateMessageState(id, MqMessageState.ACK); - var sm = new StateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); + var sm = new ActorStateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); sm.join(2, TimeUnit.SECONDS); sm.stop(); @@ -170,7 +170,7 @@ public class StateMachineResumeTest { var stateFactory = new StateFactory(new GsonBuilder().create()); - var sm = new StateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); + var sm = new ActorStateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); sm.join(2, TimeUnit.SECONDS); sm.stop(); diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/ActorStateMachineTest.java similarity index 89% rename from code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineTest.java rename to code/common/message-queue/src/test/java/nu/marginalia/mqsm/ActorStateMachineTest.java index 37cb6cce..5574c771 100644 --- a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/StateMachineTest.java +++ b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/ActorStateMachineTest.java @@ -23,7 +23,7 @@ import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD; @Tag("slow") @Testcontainers @Execution(SAME_THREAD) -public class StateMachineTest { +public class ActorStateMachineTest { @Container static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") .withDatabaseName("WMSA_prod") @@ -90,7 +90,7 @@ public class StateMachineTest { var graph = new TestGraph(stateFactory); - var sm = new StateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), graph); + var sm = new ActorStateMachine(messageQueueFactory, inboxId, 
UUID.randomUUID(), graph); sm.registerStates(graph); sm.init(); @@ -105,7 +105,7 @@ public class StateMachineTest { @Test public void testStartStopStartStop() throws Exception { var stateFactory = new StateFactory(new GsonBuilder().create()); - var sm = new StateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new TestGraph(stateFactory)); + var sm = new ActorStateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new TestGraph(stateFactory)); sm.init(); @@ -114,7 +114,7 @@ public class StateMachineTest { System.out.println("-------------------- "); - var sm2 = new StateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new TestGraph(stateFactory)); + var sm2 = new ActorStateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new TestGraph(stateFactory)); sm2.join(2, TimeUnit.SECONDS); sm2.stop(); @@ -131,7 +131,7 @@ public class StateMachineTest { persistence.sendNewMessage(inboxId, null, null, "INITIAL", "", null); persistence.sendNewMessage(inboxId, null, null, "INITIAL", "", null); - var sm = new StateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new TestGraph(stateFactory)); + var sm = new ActorStateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new TestGraph(stateFactory)); Thread.sleep(50); diff --git a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java index 0912d459..6ce80372 100644 --- a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java +++ b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java @@ -62,7 +62,7 @@ public class DocumentKeywordsBuilder { words.putIfAbsent(word, 0); } - public void setFlagOnMetadataForWords(WordFlags flag, Set flagWords) { + public void setFlagOnMetadataForWords(WordFlags flag, Collection 
flagWords) { flagWords.forEach(word -> words.mergeLong(word, flag.asBit(), (a, b) -> a|b) ); diff --git a/code/processes/converting-process/build.gradle b/code/processes/converting-process/build.gradle index a14ee596..6d5ce58c 100644 --- a/code/processes/converting-process/build.gradle +++ b/code/processes/converting-process/build.gradle @@ -79,12 +79,15 @@ dependencies { implementation libs.crawlercommons implementation libs.commons.lang3 + implementation libs.commons.compress implementation libs.sqlite testImplementation libs.bundles.slf4j.test testImplementation libs.bundles.junit testImplementation libs.mockito + implementation 'org.tukaani:xz:1.8' + testImplementation project(':code:processes:test-data') testImplementation project(':code:processes:crawling-process') } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java index 99445d81..0dfd816c 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java @@ -250,6 +250,15 @@ public class ConverterMain { msg, inbox); } + if (request.action == ConvertAction.SideloadStackexchange) { + var processData = fileStorageService.getStorage(request.processedDataStorage); + var filePath = Path.of(request.inputSource); + var domainName = filePath.toFile().getName().substring(0, filePath.toFile().getName().lastIndexOf('.')); + return new SideloadAction(sideloadSourceFactory.sideloadStackexchange(filePath, domainName), + processData.asPath(), + msg, inbox); + } + else { throw new RuntimeException("Unknown action: " + request.action); } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/UrlsCompiler.java 
b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/UrlsCompiler.java index 34e243b3..ee4f3cbe 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/UrlsCompiler.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/UrlsCompiler.java @@ -66,6 +66,9 @@ public class UrlsCompiler { urls.add(urlsIterator.next()); } + if (!urls.isEmpty()) { + instructionConsumer.accept(new LoadUrl(urls.toArray(EdgeUrl[]::new))); + } } public void compileJustDomain(Consumer instructionConsumer, EdgeDomain domain) { diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/EncyclopediaMarginaliaNuSideloader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/EncyclopediaMarginaliaNuSideloader.java index ef5a5874..ae07b6c3 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/EncyclopediaMarginaliaNuSideloader.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/EncyclopediaMarginaliaNuSideloader.java @@ -103,7 +103,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC while (rs.next()) { var articleParts = fromCompressedJson(rs.getBytes("html"), ArticleParts.class); String title = rs.getString("title"); - String url = rs.getString("url"); + String url = URLEncoder.encode(rs.getString("url"), StandardCharsets.UTF_8); sem.acquire(); @@ -176,6 +176,8 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC ret.words = details.words(); ret.details = details.details(); + ret.details.metadata = ret.details.metadata + .withSize(10_000_000, Math.max(0, 255 - url.length())); ret.url = new EdgeUrl(fullUrl); ret.state = UrlIndexingState.OK; ret.stateReason = "SIDELOAD"; diff --git 
a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java index 83c629d3..fd709951 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java @@ -3,6 +3,8 @@ package nu.marginalia.converting.sideload; import com.google.gson.Gson; import com.google.inject.Inject; import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin; +import nu.marginalia.keyword.DocumentKeywordExtractor; +import nu.marginalia.language.sentence.SentenceExtractor; import java.nio.file.Path; import java.sql.SQLException; @@ -10,14 +12,24 @@ import java.sql.SQLException; public class SideloadSourceFactory { private final Gson gson; private final HtmlDocumentProcessorPlugin htmlProcessorPlugin; + private final SentenceExtractor sentenceExtractor; + private final DocumentKeywordExtractor documentKeywordExtractor; @Inject - public SideloadSourceFactory(Gson gson, HtmlDocumentProcessorPlugin htmlProcessorPlugin) { + public SideloadSourceFactory(Gson gson, HtmlDocumentProcessorPlugin htmlProcessorPlugin, SentenceExtractor sentenceExtractor, DocumentKeywordExtractor documentKeywordExtractor) { this.gson = gson; this.htmlProcessorPlugin = htmlProcessorPlugin; + this.sentenceExtractor = sentenceExtractor; + this.documentKeywordExtractor = documentKeywordExtractor; } public SideloadSource sideloadEncyclopediaMarginaliaNu(Path pathToDbFile) throws SQLException { return new EncyclopediaMarginaliaNuSideloader(pathToDbFile, gson, htmlProcessorPlugin); } + + /** Do not use, this code isn't finished */ + @Deprecated() + public SideloadSource sideloadStackexchange(Path pathTo7zFile, String domainName) { + return new StackexchangeSideloader(pathTo7zFile, 
domainName, sentenceExtractor, documentKeywordExtractor); + } }
diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/StackExchange7zReader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/StackExchange7zReader.java
new file mode 100644
index 00000000..a3e42e65
--- /dev/null
+++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/StackExchange7zReader.java
@@ -0,0 +1,287 @@
+package nu.marginalia.converting.sideload;
+
+import lombok.SneakyThrows;
+import org.apache.commons.compress.archivers.sevenz.SevenZArchiveEntry;
+import org.apache.commons.compress.archivers.sevenz.SevenZFile;
+
+import javax.xml.namespace.QName;
+import javax.xml.stream.XMLInputFactory;
+import javax.xml.stream.XMLStreamException;
+import java.io.IOException;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+
+/** Reads posts and comments from a Stack Exchange data dump packaged as a 7z archive
+ * containing the entries Posts.xml and Comments.xml.
+ */
+@Deprecated
+public class StackExchange7zReader {
+    private static final String POSTS_ENTRY = "Posts.xml";
+    private static final String COMMENTS_ENTRY = "Comments.xml";
+
+    /** Splits the Tags attribute on its '<' and '>' delimiters; compiled once and reused */
+    private static final Pattern TAG_DELIMITER = Pattern.compile("<|>");
+
+    private final Path pathTo7zFile;
+
+    public StackExchange7zReader(Path pathTo7zFile) {
+        this.pathTo7zFile = pathTo7zFile;
+    }
+
+    /** Collects the Id attribute of every row element in Posts.xml, in document order.
+     *
+     * @return the ids, or an empty list if the archive has no Posts.xml entry
+     * @throws IOException if the archive can't be opened or read
+     * @throws XMLStreamException if Posts.xml can't be parsed
+     */
+    public List<String> getIds() throws IOException, XMLStreamException {
+        try (SevenZFile file = new SevenZFile(pathTo7zFile.toFile())) {
+            SevenZArchiveEntry postsEntry = findEntry(file, POSTS_ENTRY);
+            if (postsEntry != null) {
+                return getIds(file, postsEntry);
+            }
+        }
+        return List.of();
+    }
+
+    private List<String> getIds(SevenZFile file, SevenZArchiveEntry entry) throws IOException, XMLStreamException {
+        List<String> ids = new ArrayList<>(10000);
+
+        XMLInputFactory xmlInputFactory = newXmlInputFactory();
+        var idField = new QName("Id");
+
+        try (var inputStream = file.getInputStream(entry)) {
+            var xmlReader = xmlInputFactory.createXMLEventReader(inputStream);
+
+            while (xmlReader.hasNext()) {
+                var event = xmlReader.nextEvent();
+                if (!event.isStartElement()) continue;
+
+                var startEvent = event.asStartElement();
+                if (!"row".equals(startEvent.getName().getLocalPart())) continue;
+
+                var fieldValue = startEvent.getAttributeByName(idField);
+                if (fieldValue != null) {
+                    ids.add(fieldValue.getValue());
+                }
+            }
+        }
+
+        return ids;
+    }
+
+    /** Opens the archive and returns an iterator over the rows of Posts.xml that have a
+     * score of at least 1 and carry the Title, Id, Body, Tags and CreationDate attributes,
+     * each paired with the rows of Comments.xml whose PostId matches.
+     *
+     * The iterator keeps the archive open and only closes it once hasNext() has returned
+     * false; abandoning it before then leaks the underlying file handles.
+     *
+     * NOTE(review): comments are matched in a single forward pass over both files, which
+     * looks like it assumes both are ordered by ascending post id; comments that arrive
+     * out of that order are dropped. Confirm against the dump format.
+     *
+     * @throws IOException if the archive can't be read, or lacks Posts.xml or Comments.xml
+     * @throws XMLStreamException if an XML reader can't be created
+     */
+    public Iterator<Post> postIterator() throws IOException, XMLStreamException {
+        SevenZFile postsFile = new SevenZFile(pathTo7zFile.toFile());
+        SevenZFile commentsFile;
+        try {
+            // A second handle on the same archive, as the two entries are read interleaved
+            commentsFile = new SevenZFile(pathTo7zFile.toFile());
+        } catch (IOException e) {
+            closeAfterFailure(postsFile, e);
+            throw e;
+        }
+
+        try {
+            SevenZArchiveEntry postsEntry = findEntry(postsFile, POSTS_ENTRY);
+            SevenZArchiveEntry commentsEntry = findEntry(commentsFile, COMMENTS_ENTRY);
+
+            if (postsEntry == null || commentsEntry == null) {
+                throw new IOException("Posts.xml or Comments.xml not found in 7z file");
+            }
+
+            var postsInputStream = postsFile.getInputStream(postsEntry);
+            var commentsInputStream = commentsFile.getInputStream(commentsEntry);
+
+            XMLInputFactory xmlInputFactory = newXmlInputFactory();
+
+            var postsXmlReader = xmlInputFactory.createXMLEventReader(postsInputStream);
+            var commentsXmlReader = xmlInputFactory.createXMLEventReader(commentsInputStream);
+
+            QName titleName = new QName("Title");
+            QName idName = new QName("Id");
+            QName bodyName = new QName("Body");
+            QName tagsName = new QName("Tags");
+            QName creationDateName = new QName("CreationDate");
+            QName scoreName = new QName("Score");
+
+            QName postIdName = new QName("PostId");
+            QName textName = new QName("Text");
+
+            return new Iterator<>() {
+                Post next = null;
+                Comment nextComment = null;
+                boolean exhausted = false;
+
+                @SneakyThrows
+                @Override
+                public boolean hasNext() {
+                    if (next != null)
+                        return true;
+                    if (exhausted)
+                        return false;
+
+                    while (postsXmlReader.hasNext()) {
+                        var event = postsXmlReader.nextEvent();
+                        if (!event.isStartElement()) continue;
+
+                        var startEvent = event.asStartElement();
+                        if (!"row".equals(startEvent.getName().getLocalPart())) continue;
+
+                        var scoreAttribute = startEvent.getAttributeByName(scoreName);
+                        if (scoreAttribute == null) continue;
+                        int score = Integer.parseInt(scoreAttribute.getValue());
+                        if (score < 1) continue;
+
+                        var titleAttribute = startEvent.getAttributeByName(titleName);
+                        if (titleAttribute == null) continue;
+                        String title = titleAttribute.getValue();
+
+                        var idAttribute = startEvent.getAttributeByName(idName);
+                        if (idAttribute == null) continue;
+                        int id = Integer.parseInt(idAttribute.getValue());
+
+                        var bodyAttribute = startEvent.getAttributeByName(bodyName);
+                        if (bodyAttribute == null) continue;
+                        String body = bodyAttribute.getValue();
+
+                        var tagsAttribute = startEvent.getAttributeByName(tagsName);
+                        if (tagsAttribute == null) continue;
+                        String tags = tagsAttribute.getValue();
+                        List<String> tagsParsed = parseTags(tags);
+
+                        var creationDateAttribute = startEvent.getAttributeByName(creationDateName);
+                        if (creationDateAttribute == null) continue;
+                        String creationDate = creationDateAttribute.getValue();
+                        int year = Integer.parseInt(creationDate.substring(0, 4));
+
+                        List<Comment> comments = new ArrayList<>();
+                        do {
+                            if (nextComment == null) continue;
+
+                            if (nextComment.postId > id) {
+                                // Belongs to a later post; hold on to it for the next call
+                                break;
+                            }
+                            if (nextComment.postId == id) {
+                                comments.add(nextComment);
+                                nextComment = null;
+                            }
+                        }
+                        while (readNextComment());
+
+                        next = new Post(title, tagsParsed, year, id, body, comments);
+                        return true;
+                    }
+
+                    exhausted = true;
+
+                    postsInputStream.close();
+                    commentsInputStream.close();
+                    postsFile.close();
+                    commentsFile.close();
+
+                    return false;
+                }
+
+                private boolean readNextComment() throws XMLStreamException {
+                    while (commentsXmlReader.hasNext()) {
+                        var event = commentsXmlReader.nextEvent();
+                        if (!event.isStartElement()) continue;
+
+                        var startEvent = event.asStartElement();
+                        if (!"row".equals(startEvent.getName().getLocalPart())) continue;
+
+                        var postIdAttribute = startEvent.getAttributeByName(postIdName);
+                        if (postIdAttribute == null) continue;
+                        int postId = Integer.parseInt(postIdAttribute.getValue());
+
+                        var textAttribute = startEvent.getAttributeByName(textName);
+                        if (textAttribute == null) continue;
+                        String text = textAttribute.getValue();
+
+                        nextComment = new Comment(postId, text);
+                        return true;
+                    }
+                    return false;
+                }
+
+                @Override
+                public Post next() {
+                    if (hasNext()) {
+                        var ret = next;
+                        next = null;
+                        return ret;
+                    }
+
+                    throw new IllegalStateException("No more posts");
+                }
+            };
+        } catch (IOException | XMLStreamException | RuntimeException e) {
+            closeAfterFailure(postsFile, e);
+            closeAfterFailure(commentsFile, e);
+            throw e;
+        }
+    }
+
+    /** Returns the archive entry with the given name, or null if there is none */
+    private static SevenZArchiveEntry findEntry(SevenZFile archive, String name) {
+        for (SevenZArchiveEntry entry : archive.getEntries()) {
+            if (name.equals(entry.getName())) {
+                return entry;
+            }
+        }
+        return null;
+    }
+
+    /** Creates a StAX factory with DTD processing and external entities switched off,
+     * so that a crafted dump can't make the parser resolve external resources (XXE).
+     */
+    private static XMLInputFactory newXmlInputFactory() {
+        XMLInputFactory factory = XMLInputFactory.newInstance();
+        factory.setProperty(XMLInputFactory.SUPPORT_DTD, false);
+        factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false);
+        return factory;
+    }
+
+    /** Closes the archive while an exception is propagating, recording any failure to close on it */
+    private static void closeAfterFailure(SevenZFile archive, Exception cause) {
+        try {
+            archive.close();
+        } catch (IOException closeFailure) {
+            cause.addSuppressed(closeFailure);
+        }
+    }
+
+    private List<String> parseTags(String tags) {
+        return Arrays.stream(TAG_DELIMITER.split(tags))
+                .filter(s -> !s.isBlank())
+                .collect(Collectors.toList());
+    }
+
+    public record Post(String title, List<String> tags, int year, int id, String body, List<Comment> comments) {
+
+    }
+
+    public record Comment(int postId, String text) {
+
+    }
+}
diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/StackexchangeSideloader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/StackexchangeSideloader.java new file mode 100644 index 00000000..97a37ac9 --- /dev/null +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/StackexchangeSideloader.java @@ -0,0 +1,149 @@ +package nu.marginalia.converting.sideload; + +import lombok.SneakyThrows; +import nu.marginalia.converting.model.*; +import nu.marginalia.keyword.DocumentKeywordExtractor; +import nu.marginalia.language.sentence.SentenceExtractor; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.crawl.DomainIndexingState; +import nu.marginalia.model.crawl.HtmlFeature; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.model.crawl.UrlIndexingState; +import nu.marginalia.model.idx.DocumentFlags; +import nu.marginalia.model.idx.DocumentMetadata; +import nu.marginalia.model.idx.WordFlags; +import
org.apache.commons.lang3.StringUtils; +import org.jsoup.Jsoup; + +import javax.xml.stream.XMLStreamException; +import java.io.IOException; +import java.nio.file.Path; +import java.util.EnumSet; +import java.util.Iterator; +import java.util.List; +import java.util.Optional; + +/** This code is broken */ +@Deprecated() +public class StackexchangeSideloader implements SideloadSource { + private final StackExchange7zReader reader; + private final SentenceExtractor sentenceExtractor; + private final DocumentKeywordExtractor keywordExtractor; + private final String domainName; + + public StackexchangeSideloader(Path pathTo7zFile, + String domainName, + SentenceExtractor sentenceExtractor, + DocumentKeywordExtractor keywordExtractor + ) { + this.domainName = domainName; + reader = new StackExchange7zReader(pathTo7zFile); + this.sentenceExtractor = sentenceExtractor; + this.keywordExtractor = keywordExtractor; + } + + @Override + public ProcessedDomain getDomain() { + var ret = new ProcessedDomain(); + + ret.domain = new EdgeDomain(domainName); + ret.id = domainName; + ret.ip = "127.0.0.1"; + ret.state = DomainIndexingState.ACTIVE; + + return ret; + } + + @SneakyThrows + @Override + public Iterator getUrlsIterator() { + var ids = reader.getIds(); + return ids.stream() + .map(id -> EdgeUrl.parse("https://" + domainName + "/questions/" + id)) + .filter(Optional::isPresent) + .map(Optional::get) + .iterator(); + } + + @Override + public Iterator getDocumentsStream() { + try { + var baseIter = reader.postIterator(); + return new Iterator<>() { + + @Override + public boolean hasNext() { + return baseIter.hasNext(); + } + + @Override + public ProcessedDocument next() { + return convert(baseIter.next()); + } + }; + } catch (IOException e) { + throw new RuntimeException(e); + } catch (XMLStreamException e) { + throw new RuntimeException(e); + } + } + + @SneakyThrows + private ProcessedDocument convert(StackExchange7zReader.Post post) { + String fullUrl = "https://" + domainName + 
"/questions/" + post.id(); + + StringBuilder fullHtml = new StringBuilder(); + fullHtml.append("").append(post.title()).append(""); + fullHtml.append("

    ").append(post.title()).append("

    "); + for (var comment : post.comments()) { + fullHtml.append("

    ").append(comment.text()).append("

    "); + } + fullHtml.append(""); + + var ret = new ProcessedDocument(); + try { + + var url = new EdgeUrl(fullUrl); + var doc = Jsoup.parse(fullHtml.toString()); + var dld = sentenceExtractor.extractSentences(doc); + + ret.url = url; + ret.words = keywordExtractor.extractKeywords(dld, url); + ret.words.addJustNoMeta("site:"+domainName); + ret.words.addJustNoMeta("site:"+url.domain.domain); + ret.words.addJustNoMeta(url.domain.domain); + ret.words.setFlagOnMetadataForWords(WordFlags.Subjects, post.tags()); + ret.details = new ProcessedDocumentDetails(); + ret.details.pubYear = post.year(); + ret.details.quality = 5; + ret.details.metadata = new DocumentMetadata(4, + PubDate.toYearByte(ret.details.pubYear), (int) -ret.details.quality, EnumSet.noneOf(DocumentFlags.class)); + ret.details.features = EnumSet.noneOf(HtmlFeature.class); + ret.details.generator = GeneratorType.DOCS; + ret.details.title = StringUtils.truncate(post.title(), 128); + ret.details.description = StringUtils.truncate(doc.body().text(), 512); + ret.details.length = 128; + + ret.details.standard = HtmlStandard.HTML5; + ret.details.feedLinks = List.of(); + ret.details.linksExternal = List.of(); + ret.details.linksInternal = List.of(); + ret.state = UrlIndexingState.OK; + ret.stateReason = "SIDELOAD"; + } + catch (Exception e) { + ret.url = new EdgeUrl(fullUrl); + ret.state = UrlIndexingState.DISQUALIFIED; + ret.stateReason = "SIDELOAD"; + } + + return ret; + } + + + @Override + public String getId() { + return domainName; + } +} diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/StackexchangeSideloaderTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/StackexchangeSideloaderTest.java new file mode 100644 index 00000000..ee48ccc9 --- /dev/null +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/StackexchangeSideloaderTest.java @@ -0,0 +1,21 @@ +package 
nu.marginalia.converting.sideload; + +import org.junit.jupiter.api.Test; + +import javax.xml.stream.XMLStreamException; +import java.io.IOException; +import java.nio.file.Path; + +class StackexchangeSideloaderTest { + @Test + public void test7zFile() throws IOException, XMLStreamException { + var stackExchangeReader = new StackExchange7zReader(Path.of("/mnt/storage/stackexchange/scifi.meta.stackexchange.com.7z")); + + System.out.println(stackExchangeReader.getIds()); + + var iter = stackExchangeReader.postIterator(); + while (iter.hasNext()) { + System.out.println(iter.next()); + } + } +} \ No newline at end of file diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java index 21b0b1ec..ea643d71 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java @@ -119,7 +119,7 @@ public class LoaderMain { next.apply(instructionCounter); next.apply(loader); } catch (Exception ex) { - logger.error("Failed to load instruction {}", next); + logger.error("Failed to load instruction " + next.getClass().getSimpleName(), ex); } } } @@ -214,9 +214,11 @@ public class LoaderMain { public class InstructionCounter implements Interpreter { private int count = 0; - public void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument) { + + public void loadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) { count++; } + public int getCount() { return count; } diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java index 87b00192..073b5c94 100644 --- 
a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java @@ -43,8 +43,8 @@ public class LoaderIndexJournalWriter { var lexiconPath = lexiconArea.asPath().resolve("dictionary.dat"); var indexPath = indexArea.asPath().resolve("page-index.dat"); - Files.deleteIfExists(lexiconPath); Files.deleteIfExists(indexPath); + Files.deleteIfExists(lexiconPath); Files.createFile(indexPath, PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--"))); Files.createFile(lexiconPath, PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--"))); @@ -62,8 +62,10 @@ public class LoaderIndexJournalWriter { public void putWords(EdgeId domain, EdgeId url, DocumentMetadata metadata, DocumentKeywords wordSet) { - if (wordSet.keywords().length == 0) + if (wordSet.keywords().length == 0) { + logger.info("Skipping zero-length word set for {}:{}", domain, url); return; + } if (domain.id() <= 0 || url.id() <= 0) { logger.warn("Bad ID: {}:{}", domain, url); @@ -87,6 +89,11 @@ public class LoaderIndexJournalWriter { EdgeId url, DocumentMetadata metadata, DocumentKeywords wordSet) { + if (null == metadata) { + logger.warn("Null metadata for {}:{}", domain, url); + return; + } + var entry = new IndexJournalEntryData(getOrInsertWordIds(wordSet.keywords(), wordSet.metadata())); var header = new IndexJournalEntryHeader(domain, url, metadata.encode()); diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java index d239f1aa..40707b1b 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java @@ -102,7 +102,7 @@ public class 
ControlService extends Service { return heartbeatService.getServiceHeartbeats(); }, gson::toJson); - Spark.get("/public/", (req, rsp) -> indexRenderer.render(Map.of())); + Spark.get("/public/", this::overviewModel, indexRenderer::render); Spark.get("/public/actions", (rq,rsp) -> new Object() , actionsViewRenderer::render); Spark.get("/public/services", this::servicesModel, servicesRenderer::render); @@ -190,17 +190,27 @@ public class ControlService extends Service { Spark.post("/public/actions/calculate-adjacencies", controlActionsService::calculateAdjacencies, redirectToActors); Spark.post("/public/actions/repartition-index", controlActionsService::triggerRepartition, redirectToActors); - Spark.post("/public/actions/reconvert-index", controlActionsService::triggerReconversion, redirectToActors); + Spark.post("/public/actions/reconstruct-index", controlActionsService::triggerIndexReconstruction, redirectToActors); Spark.post("/public/actions/trigger-data-exports", controlActionsService::triggerDataExports, redirectToActors); Spark.post("/public/actions/flush-search-caches", controlActionsService::flushSearchCaches, redirectToActors); Spark.post("/public/actions/flush-api-caches", controlActionsService::flushApiCaches, redirectToActors); - Spark.post("/public/actions/flush-links-database", controlActionsService::flushLinkDatabase, redirectToActors); + Spark.post("/public/actions/truncate-links-database", controlActionsService::truncateLinkDatabase, redirectToActors); Spark.get("/public/:resource", this::serveStatic); monitors.subscribe(this::logMonitorStateChange); } + private Object overviewModel(Request request, Response response) { + + return Map.of("processes", heartbeatService.getProcessHeartbeats(), + "jobs", heartbeatService.getTaskHeartbeats(), + "actors", controlActorService.getActorStates(), + "services", heartbeatService.getServiceHeartbeats(), + "events", eventLogService.getLastEntries(20) + ); + } + private Object messageQueueModel(Request request, 
Response response) { String inboxParam = request.queryParams("inbox"); String instanceParam = request.queryParams("instance"); diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java index 782b27c1..74dd3090 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java @@ -11,7 +11,7 @@ import nu.marginalia.control.actor.monitor.ConverterMonitorActor; import nu.marginalia.control.actor.monitor.LoaderMonitorActor; import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.mq.MessageQueueFactory; -import nu.marginalia.mqsm.StateMachine; +import nu.marginalia.mqsm.ActorStateMachine; import nu.marginalia.mqsm.graph.AbstractStateGraph; import nu.marginalia.mqsm.state.MachineState; import nu.marginalia.service.control.ServiceEventLog; @@ -28,13 +28,14 @@ public class ControlActors { private final ServiceEventLog eventLog; private final Gson gson; private final MessageQueueFactory messageQueueFactory; - public Map stateMachines = new HashMap<>(); + public Map stateMachines = new HashMap<>(); public Map actorDefinitions = new HashMap<>(); @Inject public ControlActors(MessageQueueFactory messageQueueFactory, GsonFactory gsonFactory, BaseServiceParams baseServiceParams, + ConvertActor convertActor, ReconvertAndLoadActor reconvertAndLoadActor, CrawlActor crawlActor, RecrawlActor recrawlActor, @@ -47,7 +48,7 @@ public class ControlActors { TriggerAdjacencyCalculationActor triggerAdjacencyCalculationActor, CrawlJobExtractorActor crawlJobExtractorActor, ExportDataActor exportDataActor, - FlushLinkDatabase flushLinkDatabase + TruncateLinkDatabase truncateLinkDatabase ) { this.messageQueueFactory = messageQueueFactory; this.eventLog = baseServiceParams.eventLog; @@ -55,21 +56,24 
@@ public class ControlActors { register(Actor.CRAWL, crawlActor); register(Actor.RECRAWL, recrawlActor); + register(Actor.CONVERT, convertActor); register(Actor.RECONVERT_LOAD, reconvertAndLoadActor); + register(Actor.CONVERTER_MONITOR, converterMonitorFSM); register(Actor.LOADER_MONITOR, loaderMonitor); register(Actor.CRAWLER_MONITOR, crawlerMonitorActor); register(Actor.MESSAGE_QUEUE_MONITOR, messageQueueMonitor); register(Actor.PROCESS_LIVENESS_MONITOR, processMonitorFSM); register(Actor.FILE_STORAGE_MONITOR, fileStorageMonitorActor); + register(Actor.ADJACENCY_CALCULATION, triggerAdjacencyCalculationActor); register(Actor.CRAWL_JOB_EXTRACTOR, crawlJobExtractorActor); register(Actor.EXPORT_DATA, exportDataActor); - register(Actor.FLUSH_LINK_DATABASE, flushLinkDatabase); + register(Actor.TRUNCATE_LINK_DATABASE, truncateLinkDatabase); } private void register(Actor process, AbstractStateGraph graph) { - var sm = new StateMachine(messageQueueFactory, process.id(), UUID.randomUUID(), graph); + var sm = new ActorStateMachine(messageQueueFactory, process.id(), UUID.randomUUID(), graph); sm.listen((function, param) -> logStateChange(process, function)); stateMachines.put(process, sm); diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/AbstractProcessSpawnerActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/AbstractProcessSpawnerActor.java index 6031f9d9..9ac07516 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/AbstractProcessSpawnerActor.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/AbstractProcessSpawnerActor.java @@ -9,7 +9,7 @@ import nu.marginalia.mqsm.StateFactory; import nu.marginalia.mqsm.graph.AbstractStateGraph; import nu.marginalia.mqsm.graph.GraphState; import nu.marginalia.mqsm.graph.ResumeBehavior; -import nu.marginalia.mqsm.graph.TerminalState; +import 
nu.marginalia.mqsm.graph.TerminalGraphState; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -155,7 +155,7 @@ public class AbstractProcessSpawnerActor extends AbstractStateGraph { } } - @TerminalState(name = ABORTED, description = "The process was manually aborted") + @TerminalGraphState(name = ABORTED, description = "The process was manually aborted") public void aborted() throws Exception {} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ConvertActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ConvertActor.java new file mode 100644 index 00000000..f2bb0c6b --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ConvertActor.java @@ -0,0 +1,176 @@ +package nu.marginalia.control.actor.task; + +import com.google.gson.Gson; +import com.google.inject.Inject; +import com.google.inject.Singleton; +import lombok.AllArgsConstructor; +import lombok.NoArgsConstructor; +import lombok.With; +import nu.marginalia.control.svc.ProcessOutboxes; +import nu.marginalia.control.svc.ProcessService; +import nu.marginalia.db.storage.FileStorageService; +import nu.marginalia.db.storage.model.FileStorageBaseType; +import nu.marginalia.db.storage.model.FileStorageId; +import nu.marginalia.db.storage.model.FileStorageType; +import nu.marginalia.mq.MqMessageState; +import nu.marginalia.mq.outbox.MqOutbox; +import nu.marginalia.mqapi.converting.ConvertAction; +import nu.marginalia.mqapi.converting.ConvertRequest; +import nu.marginalia.mqsm.StateFactory; +import nu.marginalia.mqsm.graph.AbstractStateGraph; +import nu.marginalia.mqsm.graph.GraphState; +import nu.marginalia.mqsm.graph.ResumeBehavior; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.nio.file.Files; +import java.nio.file.Path; + +@Singleton +public class ConvertActor extends AbstractStateGraph { + + // STATES + + public static final String CONVERT = 
"CONVERT"; + public static final String CONVERT_ENCYCLOPEDIA = "CONVERT_ENCYCLOPEDIA"; + public static final String CONVERT_STACKEXCHANGE = "CONVERT_STACKEXCHANGE"; + public static final String CONVERT_WAIT = "CONVERT-WAIT"; + + public static final String END = "END"; + private final ActorProcessWatcher processWatcher; + private final MqOutbox mqConverterOutbox; + private final FileStorageService storageService; + private final Gson gson; + private final Logger logger = LoggerFactory.getLogger(getClass()); + + + @AllArgsConstructor @With @NoArgsConstructor + public static class Message { + public FileStorageId crawlStorageId = null; + public FileStorageId processedStorageId = null; + public long converterMsgId = 0L; + public long loaderMsgId = 0L; + }; + + @Inject + public ConvertActor(StateFactory stateFactory, + ActorProcessWatcher processWatcher, + ProcessOutboxes processOutboxes, + FileStorageService storageService, + Gson gson + ) + { + super(stateFactory); + this.processWatcher = processWatcher; + this.mqConverterOutbox = processOutboxes.getConverterOutbox(); + this.storageService = storageService; + this.gson = gson; + } + + @GraphState(name = CONVERT, + next = CONVERT_WAIT, + resume = ResumeBehavior.ERROR, + description = """ + Allocate a storage area for the processed data, + then send a convert request to the converter and transition to RECONVERT_WAIT. 
+ """ + ) + public Long convert(FileStorageId sourceStorageId) throws Exception { + // Create processed data area + + var toProcess = storageService.getStorage(sourceStorageId); + var base = storageService.getStorageBase(FileStorageBaseType.SLOW); + var processedArea = storageService.allocateTemporaryStorage(base, + FileStorageType.PROCESSED_DATA, "processed-data", + "Processed Data; " + toProcess.description()); + + storageService.relateFileStorages(toProcess.id(), processedArea.id()); + + // Pre-send convert request + var request = new ConvertRequest(ConvertAction.ConvertCrawlData, + null, + sourceStorageId, + processedArea.id()); + + return mqConverterOutbox.sendAsync(ConvertRequest.class.getSimpleName(), gson.toJson(request)); + } + + @GraphState(name = CONVERT_ENCYCLOPEDIA, + next = CONVERT_WAIT, + resume = ResumeBehavior.ERROR, + description = """ + Allocate a storage area for the processed data, + then send a convert request to the converter and transition to RECONVERT_WAIT. + """ + ) + public Long convertEncyclopedia(String source) throws Exception { + // Create processed data area + + Path sourcePath = Path.of(source); + if (!Files.exists(sourcePath)) + error("Source path does not exist: " + sourcePath); + + String fileName = sourcePath.toFile().getName(); + + var base = storageService.getStorageBase(FileStorageBaseType.SLOW); + var processedArea = storageService.allocateTemporaryStorage(base, + FileStorageType.PROCESSED_DATA, "processed-data", + "Processed Encylopedia Data; " + fileName); + + // Pre-send convert request + var request = new ConvertRequest(ConvertAction.SideloadEncyclopedia, + sourcePath.toString(), + null, + processedArea.id()); + + return mqConverterOutbox.sendAsync(ConvertRequest.class.getSimpleName(), gson.toJson(request)); + } + + @GraphState(name = CONVERT_STACKEXCHANGE, + next = CONVERT_WAIT, + resume = ResumeBehavior.ERROR, + description = """ + Allocate a storage area for the processed data, + then send a convert request to the 
converter and transition to RECONVERT_WAIT. + """ + ) + public Long convertStackexchange(String source) throws Exception { + // Create processed data area + + Path sourcePath = Path.of(source); + if (!Files.exists(sourcePath)) + error("Source path does not exist: " + sourcePath); + + String fileName = sourcePath.toFile().getName(); + + var base = storageService.getStorageBase(FileStorageBaseType.SLOW); + var processedArea = storageService.allocateTemporaryStorage(base, + FileStorageType.PROCESSED_DATA, "processed-data", + "Processed Stackexchange Data; " + fileName); + + // Pre-send convert request + var request = new ConvertRequest(ConvertAction.SideloadStackexchange, + sourcePath.toString(), + null, + processedArea.id()); + + return mqConverterOutbox.sendAsync(ConvertRequest.class.getSimpleName(), gson.toJson(request)); + } + + @GraphState( + name = CONVERT_WAIT, + next = END, + resume = ResumeBehavior.RETRY, + description = """ + Wait for the converter to finish processing the data. + """ + ) + public void convertWait(Long msgId) throws Exception { + var rsp = processWatcher.waitResponse(mqConverterOutbox, ProcessService.ProcessId.CONVERTER, msgId); + + if (rsp.state() != MqMessageState.OK) + error("Converter failed"); + } + + +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/TriggerAdjacencyCalculationActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/TriggerAdjacencyCalculationActor.java index 8861cc07..367140fa 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/TriggerAdjacencyCalculationActor.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/TriggerAdjacencyCalculationActor.java @@ -38,7 +38,7 @@ public class TriggerAdjacencyCalculationActor extends AbstractStateGraph { Spawns a WebsitesAdjacenciesCalculator process and waits for it to finish. 
""" ) - public void init() throws Exception { + public void init(Integer unused) throws Exception { AtomicBoolean hasError = new AtomicBoolean(false); var future = executor.submit(() -> { try { diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/FlushLinkDatabase.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/TruncateLinkDatabase.java similarity index 83% rename from code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/FlushLinkDatabase.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/TruncateLinkDatabase.java index 833efc19..355620e5 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/FlushLinkDatabase.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/TruncateLinkDatabase.java @@ -14,17 +14,10 @@ import nu.marginalia.mqsm.graph.ResumeBehavior; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.BufferedWriter; -import java.io.OutputStreamWriter; -import java.nio.file.Files; -import java.nio.file.StandardCopyOption; -import java.nio.file.StandardOpenOption; -import java.nio.file.attribute.PosixFilePermissions; import java.sql.SQLException; -import java.util.zip.GZIPOutputStream; @Singleton -public class FlushLinkDatabase extends AbstractStateGraph { +public class TruncateLinkDatabase extends AbstractStateGraph { // STATES @@ -41,8 +34,8 @@ public class FlushLinkDatabase extends AbstractStateGraph { }; @Inject - public FlushLinkDatabase(StateFactory stateFactory, - HikariDataSource dataSource) + public TruncateLinkDatabase(StateFactory stateFactory, + HikariDataSource dataSource) { super(stateFactory); this.dataSource = dataSource; diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/model/Actor.java 
b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/Actor.java index 937f8ff8..cc761073 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/model/Actor.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/Actor.java @@ -13,7 +13,9 @@ public enum Actor { ADJACENCY_CALCULATION, CRAWL_JOB_EXTRACTOR, EXPORT_DATA, - FLUSH_LINK_DATABASE; + TRUNCATE_LINK_DATABASE, + + CONVERT; public String id() { diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActionsService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActionsService.java index eb683c3e..5eaf03c2 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActionsService.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActionsService.java @@ -12,7 +12,6 @@ import nu.marginalia.search.client.SearchClient; import nu.marginalia.search.client.SearchMqEndpoints; import nu.marginalia.service.control.ServiceEventLog; import nu.marginalia.service.id.ServiceId; -import nu.marginalia.service.server.BaseServiceParams; import spark.Request; import spark.Response; import spark.Spark; @@ -81,18 +80,18 @@ public class ControlActionsService { return ""; } - public Object flushLinkDatabase(Request request, Response response) throws Exception { + public Object truncateLinkDatabase(Request request, Response response) throws Exception { String footgunLicense = request.queryParams("footgun-license"); if (!"YES".equals(footgunLicense)) { Spark.halt(403); - return "You must agree to the footgun license to flush the link database"; + return "You must agree to the footgun license to truncate the link database"; } eventLog.logEvent("USER-ACTION", "FLUSH-LINK-DATABASE"); - actors.start(Actor.FLUSH_LINK_DATABASE); + actors.start(Actor.TRUNCATE_LINK_DATABASE); return ""; } @@ -103,7 +102,7 
@@ public class ControlActionsService { return null; } - public Object triggerReconversion(Request request, Response response) throws Exception { + public Object triggerIndexReconstruction(Request request, Response response) throws Exception { indexClient.outbox().sendAsync(IndexMqEndpoints.INDEX_REINDEX, ""); return null; diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActorService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActorService.java index aa0ed905..51b3739c 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActorService.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActorService.java @@ -14,7 +14,7 @@ import nu.marginalia.mqsm.state.MachineState; import spark.Request; import spark.Response; -import java.util.Map; +import java.util.Comparator; @Singleton public class ControlActorService { @@ -84,7 +84,7 @@ public class ControlActorService { } public Object getActorStates() { - return controlActors.getActorStates().entrySet().stream().sorted(Map.Entry.comparingByKey()).map(e -> { + return controlActors.getActorStates().entrySet().stream().map(e -> { final MachineState state = e.getValue(); final String machineName = e.getKey().name(); @@ -93,7 +93,10 @@ public class ControlActorService { final boolean canStart = controlActors.isDirectlyInitializable(e.getKey()) && terminal; return new ActorRunState(machineName, stateName, terminal, canStart); - }).toList(); + }) + .filter(s -> !s.terminal() || s.canStart()) + .sorted(Comparator.comparing(ActorRunState::name)) + .toList(); } public Object createCrawlSpecification(Request request, Response response) throws Exception { diff --git a/code/services-core/control-service/src/main/resources/templates/control/actions.hdb b/code/services-core/control-service/src/main/resources/templates/control/actions.hdb index 5efcedbd..dda6ea0d 100644 
--- a/code/services-core/control-service/src/main/resources/templates/control/actions.hdb +++ b/code/services-core/control-service/src/main/resources/templates/control/actions.hdb @@ -35,12 +35,12 @@
    - @@ -84,12 +84,12 @@ -
    Reconvert Index

    +

    Reconstruct Index

    This will reconstruct the index from the index journal.

    -
    - + +
    Flush Links Database.

    +

    Truncate Links Database.

    This will drop all known URLs and domain links.
    This action is not reversible.

    -
    +


    diff --git a/code/services-core/control-service/src/main/resources/templates/control/index.hdb b/code/services-core/control-service/src/main/resources/templates/control/index.hdb index 5e72a451..43c189fd 100644 --- a/code/services-core/control-service/src/main/resources/templates/control/index.hdb +++ b/code/services-core/control-service/src/main/resources/templates/control/index.hdb @@ -8,8 +8,16 @@ {{> control/partials/nav}}
    -

    Overview

    + {{> control/partials/services-table }} + {{> control/partials/processes-table}} + {{> control/partials/actors-table}} + {{> control/partials/events-table }}
    + From afad4f5ebb6e9d79ed10755aae96f5bbfdb860c9 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 7 Aug 2023 12:59:33 +0200 Subject: [PATCH 150/157] (*) last touches --- .../resources/sql/current/10-domain-type.sql | 19 ------------------- 1 file changed, 19 deletions(-) delete mode 100644 code/common/db/src/main/resources/sql/current/10-domain-type.sql diff --git a/code/common/db/src/main/resources/sql/current/10-domain-type.sql b/code/common/db/src/main/resources/sql/current/10-domain-type.sql deleted file mode 100644 index 2011d1f6..00000000 --- a/code/common/db/src/main/resources/sql/current/10-domain-type.sql +++ /dev/null @@ -1,19 +0,0 @@ -CREATE TABLE IF NOT EXISTS DOMAIN_SELECTION_TYPE ( - ID INT PRIMARY KEY AUTO_INCREMENT, - NAME VARCHAR(255) UNIQUE, - SOURCE VARCHAR(255) NOT NULL -) -CHARACTER SET utf8mb4 -COLLATE utf8mb4_bin; - -CREATE TABLE DOMAIN_SELECTION ( - DOMAIN_NAME VARCHAR(255) PRIMARY KEY, - DOMAIN_TYPE_ID INT, - FOREIGN KEY (DOMAIN_TYPE_ID) REFERENCES DOMAIN_SELECTION_TYPE(ID) ON DELETE CASCADE -) -CHARACTER SET utf8mb4 -COLLATE utf8mb4_unicode_ci; - -INSERT IGNORE INTO DOMAIN_SELECTION_TYPE(NAME, SOURCE) -VALUES ('BLOG', 'https://raw.githubusercontent.com/MarginaliaSearch/submit-site-to-marginalia-search/master/blogs.txt'), - ('TEST', 'https://downloads.marginalia.nu/domain-list-test.txt'); \ No newline at end of file From 71dfe9f33e709552d0b67e71c3e2c92c6205fa2f Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 9 Aug 2023 12:42:01 +0200 Subject: [PATCH 151/157] (control) Clean up the ControlService, move mq-related endpoints to MessageQueueService. 
--- .../nu/marginalia/control/ControlService.java | 130 ++---------------- .../nu/marginalia/control/HtmlRedirect.java | 3 + .../control/{model => actor}/Actor.java | 2 +- .../control/actor/ControlActors.java | 1 - .../monitor/AbstractProcessSpawnerActor.java | 2 +- .../actor/monitor/ConverterMonitorActor.java | 2 +- .../actor/monitor/CrawlerMonitorActor.java | 2 +- .../actor/monitor/LoaderMonitorActor.java | 2 +- .../monitor/ProcessLivenessMonitorActor.java | 3 +- .../actor/task/ActorProcessWatcher.java | 2 +- .../control/actor/task/ConvertActor.java | 4 +- .../control/actor/task/CrawlActor.java | 4 +- .../actor/task/CrawlJobExtractorActor.java | 2 +- .../actor/task/ReconvertAndLoadActor.java | 4 +- .../control/actor/task/RecrawlActor.java | 4 +- .../TriggerAdjacencyCalculationActor.java | 2 +- .../control/model/ProcessHeartbeat.java | 2 +- .../{svc => process}/ProcessOutboxes.java | 2 +- .../{svc => process}/ProcessService.java | 2 +- .../control/svc/ControlActionsService.java | 2 +- .../control/svc/ControlActorService.java | 2 +- ...wService.java => MessageQueueService.java} | 110 ++++++++++++++- 22 files changed, 145 insertions(+), 144 deletions(-) rename code/services-core/control-service/src/main/java/nu/marginalia/control/{model => actor}/Actor.java (91%) rename code/services-core/control-service/src/main/java/nu/marginalia/control/{svc => process}/ProcessOutboxes.java (97%) rename code/services-core/control-service/src/main/java/nu/marginalia/control/{svc => process}/ProcessService.java (99%) rename code/services-core/control-service/src/main/java/nu/marginalia/control/svc/{MessageQueueViewService.java => MessageQueueService.java} (67%) diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java index 40707b1b..671cd584 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java 
+++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java @@ -3,16 +3,13 @@ package nu.marginalia.control; import com.google.gson.Gson; import com.google.inject.Inject; import nu.marginalia.client.ServiceMonitors; -import nu.marginalia.control.model.Actor; +import nu.marginalia.control.actor.Actor; import nu.marginalia.control.model.DomainComplaintModel; -import nu.marginalia.control.model.MessageQueueEntry; import nu.marginalia.control.svc.*; import nu.marginalia.db.storage.model.FileStorageId; import nu.marginalia.db.storage.model.FileStorageType; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.gson.GsonFactory; -import nu.marginalia.mq.MqMessageState; -import nu.marginalia.mq.persistence.MqPersistence; import nu.marginalia.renderer.RendererFactory; import nu.marginalia.service.server.*; import org.eclipse.jetty.util.StringUtil; @@ -27,7 +24,6 @@ import java.sql.SQLException; import java.util.Comparator; import java.util.List; import java.util.Map; -import java.util.Optional; import java.util.stream.Collectors; public class ControlService extends Service { @@ -42,7 +38,7 @@ public class ControlService extends Service { private final DomainComplaintService domainComplaintService; private final ControlActorService controlActorService; private final StaticResources staticResources; - private final MessageQueueViewService messageQueueViewService; + private final MessageQueueService messageQueueService; private final ControlFileStorageService controlFileStorageService; @@ -54,12 +50,11 @@ public class ControlService extends Service { RendererFactory rendererFactory, ControlActorService controlActorService, StaticResources staticResources, - MessageQueueViewService messageQueueViewService, + MessageQueueService messageQueueService, ControlFileStorageService controlFileStorageService, ApiKeyService apiKeyService, DomainComplaintService domainComplaintService, - ControlActionsService controlActionsService, - 
MqPersistence persistence + ControlActionsService controlActionsService ) throws IOException { super(params); @@ -94,7 +89,7 @@ public class ControlService extends Service { this.controlActorService = controlActorService; this.staticResources = staticResources; - this.messageQueueViewService = messageQueueViewService; + this.messageQueueService = messageQueueService; this.controlFileStorageService = controlFileStorageService; Spark.get("/public/heartbeats", (req, res) -> { @@ -107,7 +102,6 @@ public class ControlService extends Service { Spark.get("/public/actions", (rq,rsp) -> new Object() , actionsViewRenderer::render); Spark.get("/public/services", this::servicesModel, servicesRenderer::render); Spark.get("/public/services/:id", this::serviceModel, serviceByIdRenderer::render); - Spark.get("/public/messages/:id", this::existingMessageModel, gson::toJson); Spark.get("/public/actors", this::processesModel, actorsRenderer::render); Spark.get("/public/actors/:fsm", this::actorDetailsModel, actorDetailsRenderer::render); @@ -125,37 +119,13 @@ public class ControlService extends Service { // Message Queue - Spark.get("/public/message-queue", this::messageQueueModel, messageQueueRenderer::render); - Spark.post("/public/message-queue/", (rq, rsp) -> { - String recipient = rq.queryParams("recipientInbox"); - String sender = rq.queryParams("senderInbox"); - String relatedMessage = rq.queryParams("relatedId"); - String function = rq.queryParams("function"); - String payload = rq.queryParams("payload"); - - persistence.sendNewMessage(recipient, - sender.isBlank() ? null : sender, - relatedMessage == null ? 
null : Long.parseLong(relatedMessage), - function, - payload, - null); - - return ""; - }, redirectToMessageQueue); - Spark.get("/public/message-queue/new", this::newMessageModel, newMessageRenderer::render); - Spark.get("/public/message-queue/:id", - (rq, rsp) -> Map.of("message", messageQueueViewService.getMessage(Long.parseLong(rq.params("id"))), - "relatedMessages", messageQueueViewService.getRelatedMessages(Long.parseLong(rq.params("id")))) - , viewMessageRenderer::render); - - Spark.get("/public/message-queue/:id/reply", this::replyMessageModel, newMessageRenderer::render); - Spark.get("/public/message-queue/:id/edit", (rq, rsp) -> persistence.getMessage(Long.parseLong(rq.params("id"))), updateMessageStateRenderer::render); - Spark.post("/public/message-queue/:id/edit", (rq, rsp) -> { - MqMessageState state = MqMessageState.valueOf(rq.queryParams("state")); - long id = Long.parseLong(rq.params("id")); - persistence.updateMessageState(id, state); - return ""; - }, redirectToMessageQueue); + Spark.get("/public/message-queue", messageQueueService::listMessageQueueModel, messageQueueRenderer::render); + Spark.post("/public/message-queue/", messageQueueService::createMessage, redirectToMessageQueue); + Spark.get("/public/message-queue/new", messageQueueService::newMessageModel, newMessageRenderer::render); + Spark.get("/public/message-queue/:id", messageQueueService::viewMessageModel, viewMessageRenderer::render); + Spark.get("/public/message-queue/:id/reply", messageQueueService::replyMessageModel, newMessageRenderer::render); + Spark.get("/public/message-queue/:id/edit", messageQueueService::viewMessageForEditStateModel, updateMessageStateRenderer::render); + Spark.post("/public/message-queue/:id/edit", messageQueueService::editMessageState, redirectToMessageQueue); // Storage Spark.get("/public/storage", this::storageModel, storageRenderer::render); @@ -211,42 +181,6 @@ public class ControlService extends Service { ); } - private Object 
messageQueueModel(Request request, Response response) { - String inboxParam = request.queryParams("inbox"); - String instanceParam = request.queryParams("instance"); - String afterParam = request.queryParams("after"); - - long afterId = Optional.ofNullable(afterParam).map(Long::parseLong).orElse(Long.MAX_VALUE); - - List entries; - - String mqFilter = "filter=none"; - if (inboxParam != null) { - mqFilter = "inbox=" + inboxParam; - entries = messageQueueViewService.getEntriesForInbox(inboxParam, afterId, 20); - } - else if (instanceParam != null) { - mqFilter = "instance=" + instanceParam; - entries = messageQueueViewService.getEntriesForInstance(instanceParam, afterId, 20); - } - else { - entries = messageQueueViewService.getEntries(afterId, 20); - } - - Object next; - - if (entries.size() == 20) - next = entries.stream().mapToLong(MessageQueueEntry::id).min().getAsLong(); - else - next = ""; - - Object prev = afterParam == null ? "" : afterParam; - - return Map.of("messages", entries, - "next", next, - "prev", prev, - "mqFilter", mqFilter); - } private Object complaintsModel(Request request, Response response) { Map> complaintsByReviewed = @@ -325,46 +259,12 @@ public class ControlService extends Service { } - private Object existingMessageModel(Request request, Response response) { - var message = messageQueueViewService.getMessage(Long.parseLong(request.params("id"))); - if (message != null) { - response.type("application/json"); - return message; - } - else { - response.status(404); - return ""; - } - } - - private Object newMessageModel(Request request, Response response) { - String idParam = request.queryParams("id"); - if (null == idParam) - return Map.of("relatedId", "-1"); - - var message = messageQueueViewService.getMessage(Long.parseLong(idParam)); - if (message != null) - return message; - - return Map.of("relatedId", "-1"); - } - private Object replyMessageModel(Request request, Response response) { - String idParam = request.params("id"); - - var 
message = messageQueueViewService.getMessage(Long.parseLong(idParam)); - - return Map.of("relatedId", message.id(), - "recipientInbox", message.senderInbox(), - "function", "REPLY"); - } - - private Object serviceModel(Request request, Response response) { String serviceName = request.params("id"); return Map.of( "id", serviceName, - "messages", messageQueueViewService.getEntriesForInbox(serviceName, Long.MAX_VALUE, 20), + "messages", messageQueueService.getEntriesForInbox(serviceName, Long.MAX_VALUE, 20), "events", eventLogService.getLastEntriesForService(serviceName, 20)); } @@ -396,7 +296,7 @@ public class ControlService extends Service { return Map.of("processes", processes, "jobs", jobs, "actors", controlActorService.getActorStates(), - "messages", messageQueueViewService.getLastEntries(20)); + "messages", messageQueueService.getLastEntries(20)); } private Object actorDetailsModel(Request request, Response response) { final Actor actor = Actor.valueOf(request.params("fsm").toUpperCase()); @@ -405,7 +305,7 @@ public class ControlService extends Service { return Map.of( "actor", actor, "state-graph", controlActorService.getActorStateGraph(actor), - "messages", messageQueueViewService.getLastEntriesForInbox(inbox, 20)); + "messages", messageQueueService.getLastEntriesForInbox(inbox, 20)); } private Object serveStatic(Request request, Response response) { String resource = request.params("resource"); diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/HtmlRedirect.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/HtmlRedirect.java index fd49bd6d..ff1e2368 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/HtmlRedirect.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/HtmlRedirect.java @@ -5,6 +5,9 @@ import spark.ResponseTransformer; public class HtmlRedirect implements ResponseTransformer { private final String html; + /** Because Spark 
doesn't have a redirect method that works with relative URLs + * (without explicitly providing the external address), we use HTML and let the + * browser resolve the relative redirect instead */ public HtmlRedirect(String destination) { this.html = """ diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/model/Actor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/Actor.java similarity index 91% rename from code/services-core/control-service/src/main/java/nu/marginalia/control/model/Actor.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/actor/Actor.java index cc761073..d9002e18 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/model/Actor.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/Actor.java @@ -1,4 +1,4 @@ -package nu.marginalia.control.model; +package nu.marginalia.control.actor; public enum Actor { CRAWL, diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java index 74dd3090..37cd9e9c 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java @@ -5,7 +5,6 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import lombok.SneakyThrows; import nu.marginalia.control.actor.task.*; -import nu.marginalia.control.model.Actor; import nu.marginalia.control.actor.monitor.*; import nu.marginalia.control.actor.monitor.ConverterMonitorActor; import nu.marginalia.control.actor.monitor.LoaderMonitorActor; diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/AbstractProcessSpawnerActor.java 
b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/AbstractProcessSpawnerActor.java index 9ac07516..92bbc1d6 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/AbstractProcessSpawnerActor.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/AbstractProcessSpawnerActor.java @@ -2,7 +2,7 @@ package nu.marginalia.control.actor.monitor; import com.google.inject.Inject; import com.google.inject.Singleton; -import nu.marginalia.control.svc.ProcessService; +import nu.marginalia.control.process.ProcessService; import nu.marginalia.mq.MqMessageState; import nu.marginalia.mq.persistence.MqPersistence; import nu.marginalia.mqsm.StateFactory; diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/ConverterMonitorActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/ConverterMonitorActor.java index b1f37067..158b48ca 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/ConverterMonitorActor.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/ConverterMonitorActor.java @@ -2,7 +2,7 @@ package nu.marginalia.control.actor.monitor; import com.google.inject.Inject; import com.google.inject.Singleton; -import nu.marginalia.control.svc.ProcessService; +import nu.marginalia.control.process.ProcessService; import nu.marginalia.mqapi.ProcessInboxNames; import nu.marginalia.mq.persistence.MqPersistence; import nu.marginalia.mqsm.StateFactory; diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/CrawlerMonitorActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/CrawlerMonitorActor.java index f50f7b73..cc9c73fb 100644 --- 
a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/CrawlerMonitorActor.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/CrawlerMonitorActor.java @@ -2,7 +2,7 @@ package nu.marginalia.control.actor.monitor; import com.google.inject.Inject; import com.google.inject.Singleton; -import nu.marginalia.control.svc.ProcessService; +import nu.marginalia.control.process.ProcessService; import nu.marginalia.mq.persistence.MqPersistence; import nu.marginalia.mqapi.ProcessInboxNames; import nu.marginalia.mqsm.StateFactory; diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/LoaderMonitorActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/LoaderMonitorActor.java index 3b959356..fcf3b895 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/LoaderMonitorActor.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/LoaderMonitorActor.java @@ -2,7 +2,7 @@ package nu.marginalia.control.actor.monitor; import com.google.inject.Inject; import com.google.inject.Singleton; -import nu.marginalia.control.svc.ProcessService; +import nu.marginalia.control.process.ProcessService; import nu.marginalia.mqapi.ProcessInboxNames; import nu.marginalia.mq.persistence.MqPersistence; import nu.marginalia.mqsm.StateFactory; diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/ProcessLivenessMonitorActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/ProcessLivenessMonitorActor.java index 1098a085..4128f6f9 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/ProcessLivenessMonitorActor.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/ProcessLivenessMonitorActor.java @@ 
-2,10 +2,9 @@ package nu.marginalia.control.actor.monitor; import com.google.inject.Inject; import com.google.inject.Singleton; -import nu.marginalia.control.model.ProcessHeartbeat; import nu.marginalia.control.model.ServiceHeartbeat; import nu.marginalia.control.svc.HeartbeatService; -import nu.marginalia.control.svc.ProcessService; +import nu.marginalia.control.process.ProcessService; import nu.marginalia.mqsm.StateFactory; import nu.marginalia.mqsm.graph.AbstractStateGraph; import nu.marginalia.mqsm.graph.GraphState; diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ActorProcessWatcher.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ActorProcessWatcher.java index e82168f4..d6c33608 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ActorProcessWatcher.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ActorProcessWatcher.java @@ -2,7 +2,7 @@ package nu.marginalia.control.actor.task; import com.google.inject.Inject; import com.google.inject.Singleton; -import nu.marginalia.control.svc.ProcessService; +import nu.marginalia.control.process.ProcessService; import nu.marginalia.mq.MqMessage; import nu.marginalia.mq.outbox.MqOutbox; import nu.marginalia.mqsm.graph.ControlFlowException; diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ConvertActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ConvertActor.java index f2bb0c6b..0bcc5293 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ConvertActor.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ConvertActor.java @@ -6,8 +6,8 @@ import com.google.inject.Singleton; import lombok.AllArgsConstructor; import lombok.NoArgsConstructor; import lombok.With; -import 
nu.marginalia.control.svc.ProcessOutboxes; -import nu.marginalia.control.svc.ProcessService; +import nu.marginalia.control.process.ProcessOutboxes; +import nu.marginalia.control.process.ProcessService; import nu.marginalia.db.storage.FileStorageService; import nu.marginalia.db.storage.model.FileStorageBaseType; import nu.marginalia.db.storage.model.FileStorageId; diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlActor.java index 48ebbc79..40f447c1 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlActor.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlActor.java @@ -6,8 +6,8 @@ import com.google.inject.Singleton; import lombok.AllArgsConstructor; import lombok.NoArgsConstructor; import lombok.With; -import nu.marginalia.control.svc.ProcessOutboxes; -import nu.marginalia.control.svc.ProcessService; +import nu.marginalia.control.process.ProcessOutboxes; +import nu.marginalia.control.process.ProcessService; import nu.marginalia.db.storage.FileStorageService; import nu.marginalia.db.storage.model.FileStorageBaseType; import nu.marginalia.db.storage.model.FileStorageId; diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlJobExtractorActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlJobExtractorActor.java index 621e06e1..9cadc49a 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlJobExtractorActor.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlJobExtractorActor.java @@ -3,7 +3,7 @@ package nu.marginalia.control.actor.task; import com.google.inject.Inject; import com.google.inject.Singleton; import 
nu.marginalia.control.svc.ControlFileStorageService; -import nu.marginalia.control.svc.ProcessService; +import nu.marginalia.control.process.ProcessService; import nu.marginalia.db.storage.FileStorageService; import nu.marginalia.db.storage.model.FileStorage; import nu.marginalia.db.storage.model.FileStorageBaseType; diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ReconvertAndLoadActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ReconvertAndLoadActor.java index a7589439..06c982ff 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ReconvertAndLoadActor.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ReconvertAndLoadActor.java @@ -6,8 +6,8 @@ import com.google.inject.Singleton; import lombok.AllArgsConstructor; import lombok.NoArgsConstructor; import lombok.With; -import nu.marginalia.control.svc.ProcessOutboxes; -import nu.marginalia.control.svc.ProcessService; +import nu.marginalia.control.process.ProcessOutboxes; +import nu.marginalia.control.process.ProcessService; import nu.marginalia.index.client.IndexClient; import nu.marginalia.index.client.IndexMqEndpoints; import nu.marginalia.mqapi.converting.ConvertAction; diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/RecrawlActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/RecrawlActor.java index 9311cead..c4253a0d 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/RecrawlActor.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/RecrawlActor.java @@ -6,8 +6,8 @@ import com.google.inject.Singleton; import lombok.AllArgsConstructor; import lombok.NoArgsConstructor; import lombok.With; -import nu.marginalia.control.svc.ProcessOutboxes; -import 
nu.marginalia.control.svc.ProcessService; +import nu.marginalia.control.process.ProcessOutboxes; +import nu.marginalia.control.process.ProcessService; import nu.marginalia.db.storage.FileStorageService; import nu.marginalia.db.storage.model.FileStorage; import nu.marginalia.db.storage.model.FileStorageId; diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/TriggerAdjacencyCalculationActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/TriggerAdjacencyCalculationActor.java index 367140fa..7441b437 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/TriggerAdjacencyCalculationActor.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/TriggerAdjacencyCalculationActor.java @@ -2,7 +2,7 @@ package nu.marginalia.control.actor.task; import com.google.inject.Inject; import com.google.inject.Singleton; -import nu.marginalia.control.svc.ProcessService; +import nu.marginalia.control.process.ProcessService; import nu.marginalia.mqsm.StateFactory; import nu.marginalia.mqsm.graph.AbstractStateGraph; import nu.marginalia.mqsm.graph.GraphState; diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java index f3f43e76..accb3351 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java @@ -1,6 +1,6 @@ package nu.marginalia.control.model; -import nu.marginalia.control.svc.ProcessService; +import nu.marginalia.control.process.ProcessService; public record ProcessHeartbeat( String processId, diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ProcessOutboxes.java 
b/code/services-core/control-service/src/main/java/nu/marginalia/control/process/ProcessOutboxes.java similarity index 97% rename from code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ProcessOutboxes.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/process/ProcessOutboxes.java index a8699ab9..b5b74406 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ProcessOutboxes.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/process/ProcessOutboxes.java @@ -1,4 +1,4 @@ -package nu.marginalia.control.svc; +package nu.marginalia.control.process; import com.google.inject.Inject; import com.google.inject.Singleton; diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/process/ProcessService.java similarity index 99% rename from code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/process/ProcessService.java index cea64c9f..25583f43 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ProcessService.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/process/ProcessService.java @@ -1,4 +1,4 @@ -package nu.marginalia.control.svc; +package nu.marginalia.control.process; import com.google.inject.name.Named; import nu.marginalia.service.control.ServiceEventLog; diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActionsService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActionsService.java index 5eaf03c2..4425ac52 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActionsService.java +++ 
b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActionsService.java @@ -3,7 +3,7 @@ package nu.marginalia.control.svc; import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.control.actor.ControlActors; -import nu.marginalia.control.model.Actor; +import nu.marginalia.control.actor.Actor; import nu.marginalia.index.client.IndexClient; import nu.marginalia.index.client.IndexMqEndpoints; import nu.marginalia.mq.MessageQueueFactory; diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActorService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActorService.java index 51b3739c..ddfbbe58 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActorService.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActorService.java @@ -6,7 +6,7 @@ import nu.marginalia.control.actor.ControlActors; import nu.marginalia.control.actor.task.CrawlJobExtractorActor; import nu.marginalia.control.actor.task.ReconvertAndLoadActor; import nu.marginalia.control.actor.task.RecrawlActor; -import nu.marginalia.control.model.Actor; +import nu.marginalia.control.actor.Actor; import nu.marginalia.control.model.ActorRunState; import nu.marginalia.control.model.ActorStateGraph; import nu.marginalia.db.storage.model.FileStorageId; diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueViewService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueService.java similarity index 67% rename from code/services-core/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueViewService.java rename to code/services-core/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueService.java index 02031c2a..1d74a5bf 100644 --- 
a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueViewService.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueService.java @@ -3,23 +3,123 @@ package nu.marginalia.control.svc; import com.google.inject.Inject; import com.google.inject.Singleton; import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.control.model.Actor; import nu.marginalia.control.model.MessageQueueEntry; -import nu.marginalia.mqsm.graph.AbstractStateGraph; +import nu.marginalia.mq.MqMessageState; +import nu.marginalia.mq.persistence.MqPersistence; +import spark.Request; +import spark.Response; import java.sql.ResultSet; import java.sql.SQLException; import java.util.ArrayList; import java.util.List; +import java.util.Map; +import java.util.Optional; @Singleton -public class MessageQueueViewService { +public class MessageQueueService { private final HikariDataSource dataSource; + private final MqPersistence persistence; @Inject - public MessageQueueViewService(HikariDataSource dataSource) { + public MessageQueueService(HikariDataSource dataSource, MqPersistence persistence) { this.dataSource = dataSource; + this.persistence = persistence; + } + + + public Object viewMessageModel(Request request, Response response) { + return Map.of("message", getMessage(Long.parseLong(request.params("id"))), + "relatedMessages", getRelatedMessages(Long.parseLong(request.params("id")))); + } + + + public Object listMessageQueueModel(Request request, Response response) { + String inboxParam = request.queryParams("inbox"); + String instanceParam = request.queryParams("instance"); + String afterParam = request.queryParams("after"); + + long afterId = Optional.ofNullable(afterParam).map(Long::parseLong).orElse(Long.MAX_VALUE); + + List entries; + + String mqFilter = "filter=none"; + if (inboxParam != null) { + mqFilter = "inbox=" + inboxParam; + entries = getEntriesForInbox(inboxParam, afterId, 20); + } + else if 
(instanceParam != null) { + mqFilter = "instance=" + instanceParam; + entries = getEntriesForInstance(instanceParam, afterId, 20); + } + else { + entries = getEntries(afterId, 20); + } + + Object next; + + if (entries.size() == 20) + next = entries.stream().mapToLong(MessageQueueEntry::id).min().getAsLong(); + else + next = ""; + + Object prev = afterParam == null ? "" : afterParam; + + return Map.of("messages", entries, + "next", next, + "prev", prev, + "mqFilter", mqFilter); + } + + public Object newMessageModel(Request request, Response response) { + String idParam = request.queryParams("id"); + if (null == idParam) + return Map.of("relatedId", "-1"); + + var message = getMessage(Long.parseLong(idParam)); + if (message != null) + return message; + + return Map.of("relatedId", "-1"); + } + + public Object replyMessageModel(Request request, Response response) { + String idParam = request.params("id"); + + var message = getMessage(Long.parseLong(idParam)); + + return Map.of("relatedId", message.id(), + "recipientInbox", message.senderInbox(), + "function", "REPLY"); + } + + public Object createMessage(Request request, Response response) throws Exception { + String recipient = request.queryParams("recipientInbox"); + String sender = request.queryParams("senderInbox"); + String relatedMessage = request.queryParams("relatedId"); + String function = request.queryParams("function"); + String payload = request.queryParams("payload"); + + persistence.sendNewMessage(recipient, + sender.isBlank() ? null : sender, + relatedMessage == null ? 
null : Long.parseLong(relatedMessage), + function, + payload, + null); + + return ""; + } + + public Object viewMessageForEditStateModel(Request request, Response response) throws SQLException { + return persistence.getMessage(Long.parseLong(request.params("id"))); + } + + public Object editMessageState(Request request, Response response) throws SQLException { + MqMessageState state = MqMessageState.valueOf(request.queryParams("state")); + long id = Long.parseLong(request.params("id")); + persistence.updateMessageState(id, state); + return ""; } public List getLastEntries(int n) { @@ -43,7 +143,6 @@ public class MessageQueueViewService { throw new RuntimeException(ex); } } - public MessageQueueEntry getMessage(long id) { try (var conn = dataSource.getConnection(); var query = conn.prepareStatement(""" @@ -115,6 +214,7 @@ public class MessageQueueViewService { throw new RuntimeException(ex); } } + public List getEntriesForInstance(String instance, long afterId, int n) { try (var conn = dataSource.getConnection(); var query = conn.prepareStatement(""" From 47f3855a4ba03526fc36405fd205339d281376e5 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 9 Aug 2023 12:42:23 +0200 Subject: [PATCH 152/157] (control) More informative readme.md --- code/services-core/control-service/readme.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/code/services-core/control-service/readme.md b/code/services-core/control-service/readme.md index c775d365..82b08093 100644 --- a/code/services-core/control-service/readme.md +++ b/code/services-core/control-service/readme.md @@ -1,7 +1,16 @@ # Control Service -The control service provides an operator's user interface, and is responsible for orchestrating the various processes of the system. +The control service provides an operator's user interface, and is responsible for orchestrating the various +processes of the system using Actors. 
+Actors within the control service will spawn processes when necessary, by +monitoring their message queue inboxes. + +## Central Classes + +* [ControlService](src/main/java/nu/marginalia/control/ControlService.java) +* [ControlActors](src/main/java/nu/marginalia/control/actor/ControlActors.java) - Class responsible for Actors' lifecycle +* [ProcessService](src/main/java/nu/marginalia/control/process/ProcessService.java) - Class responsible for spawning Processes ## See Also From 251fc63b42b8beba48361bfd59973d7664642204 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 9 Aug 2023 13:33:28 +0200 Subject: [PATCH 153/157] (*) Fix merge gore --- .../db/storage/FileStorageService.java | 2 +- .../service/control/ServiceHeartbeat.java | 2 +- .../index/forward/ForwardIndexConverter.java | 2 +- .../forward/ForwardIndexConverterTest.java | 12 ++++++++++-- .../index/full/ReverseIndexFullConverter.java | 2 +- .../priority/ReverseIndexPriorityConverter.java | 2 +- .../reverse/ReverseIndexFullConverterTest.java | 9 ++++++++- .../reverse/ReverseIndexFullConverterTest2.java | 17 +++++++++++++++-- .../ReverseIndexPriorityConverterTest2.java | 17 +++++++++++++++-- .../java/nu/marginalia/crawl/CrawlerMain.java | 9 --------- .../control/svc/ApiKeyServiceTest.java | 15 +++++++++++---- .../marginalia/index/IndexServicesFactory.java | 5 +---- .../index/svc/IndexSearchSetsService.java | 2 +- .../svc/IndexQueryServiceIntegrationTest.java | 4 ++++ .../IndexQueryServiceIntegrationTestModule.java | 11 +++++++++-- 15 files changed, 79 insertions(+), 32 deletions(-) diff --git a/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageService.java b/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageService.java index e78090c9..813d1c57 100644 --- a/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageService.java +++ b/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageService.java @@ -182,7 +182,7 @@ public class FileStorageService { try 
(var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(""" INSERT INTO FILE_STORAGE_BASE(NAME, PATH, TYPE, PERMIT_TEMP) - VALUES (?, ?, ?, ?, ?) + VALUES (?, ?, ?, ?) """)) { stmt.setString(1, name); stmt.setString(2, path.toString()); diff --git a/code/common/service/src/main/java/nu/marginalia/service/control/ServiceHeartbeat.java b/code/common/service/src/main/java/nu/marginalia/service/control/ServiceHeartbeat.java index de146926..d0fdba32 100644 --- a/code/common/service/src/main/java/nu/marginalia/service/control/ServiceHeartbeat.java +++ b/code/common/service/src/main/java/nu/marginalia/service/control/ServiceHeartbeat.java @@ -44,7 +44,7 @@ public class ServiceHeartbeat { Runtime.getRuntime().addShutdownHook(new Thread(this::shutDown)); } - public > ServiceTaskHeartbeat createServiceProcessHeartbeat(Class steps, String processName) { + public > ServiceTaskHeartbeat createServiceTaskHeartbeat(Class steps, String processName) { return new ServiceTaskHeartbeat<>(steps, configuration, processName, dataSource); } diff --git a/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexConverter.java b/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexConverter.java index 1496a653..07a966f8 100644 --- a/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexConverter.java +++ b/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexConverter.java @@ -64,7 +64,7 @@ public class ForwardIndexConverter { logger.info("Domain Rankings size = {}", domainRankings.size()); - try (var progress = heartbeat.createServiceProcessHeartbeat(TaskSteps.class, "forwardIndexConverter")) { + try (var progress = heartbeat.createServiceTaskHeartbeat(TaskSteps.class, "forwardIndexConverter")) { progress.progress(TaskSteps.GET_DOC_IDS); LongArray docsFileId = getDocIds(outputFileDocsId, journalReader); diff --git 
a/code/features-index/index-forward/src/test/java/nu/marginalia/index/forward/ForwardIndexConverterTest.java b/code/features-index/index-forward/src/test/java/nu/marginalia/index/forward/ForwardIndexConverterTest.java index 33acceea..1c6fdf1c 100644 --- a/code/features-index/index-forward/src/test/java/nu/marginalia/index/forward/ForwardIndexConverterTest.java +++ b/code/features-index/index-forward/src/test/java/nu/marginalia/index/forward/ForwardIndexConverterTest.java @@ -1,7 +1,6 @@ package nu.marginalia.index.forward; import lombok.SneakyThrows; -import nu.marginalia.dict.OffHeapDictionaryHashMap; import nu.marginalia.index.journal.model.IndexJournalEntry; import nu.marginalia.index.journal.writer.IndexJournalWriterImpl; import nu.marginalia.index.journal.writer.IndexJournalWriter; @@ -9,10 +8,13 @@ import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode; import nu.marginalia.ranking.DomainRankings; import nu.marginalia.lexicon.KeywordLexicon; import nu.marginalia.lexicon.journal.KeywordLexiconJournal; +import nu.marginalia.service.control.ServiceHeartbeat; +import nu.marginalia.service.control.ServiceTaskHeartbeat; import nu.marginalia.test.TestUtil; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import org.mockito.Mockito; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -22,6 +24,7 @@ import java.nio.file.Path; import java.util.stream.IntStream; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.mockito.Mockito.when; class ForwardIndexConverterTest { @@ -98,7 +101,12 @@ class ForwardIndexConverterTest { @Test void testForwardIndex() throws IOException { - new ForwardIndexConverter(indexFile.toFile(), docsFileId, docsFileData, new DomainRankings()).convert(); + // RIP fairies + var serviceHeartbeat = Mockito.mock(ServiceHeartbeat.class); + when(serviceHeartbeat.createServiceTaskHeartbeat(Mockito.any(), Mockito.any())) + 
.thenReturn(Mockito.mock(ServiceTaskHeartbeat.class)); + + new ForwardIndexConverter(serviceHeartbeat, indexFile.toFile(), docsFileId, docsFileData, new DomainRankings()).convert(); var forwardReader = new ForwardIndexReader(docsFileId, docsFileData); diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullConverter.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullConverter.java index c8cafcde..f2e3f91b 100644 --- a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullConverter.java +++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullConverter.java @@ -76,7 +76,7 @@ public class ReverseIndexFullConverter { final Path intermediateUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat"); - try (var progress = heartbeat.createServiceProcessHeartbeat(TaskSteps.class, "reverseIndexFullConverter")) { + try (var progress = heartbeat.createServiceTaskHeartbeat(TaskSteps.class, "reverseIndexFullConverter")) { progress.progress(TaskSteps.ACCUMULATE_STATISTICS); final IndexJournalStatistics statistics = journalReader.getStatistics(); diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityConverter.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityConverter.java index d5ee0f88..4c9cd0d0 100644 --- a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityConverter.java +++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityConverter.java @@ -75,7 +75,7 @@ public class ReverseIndexPriorityConverter { final Path intermediateUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat"); - try (var progress = heartbeat.createServiceProcessHeartbeat(TaskSteps.class, "reverseIndexPriorityConverter")) { + 
try (var progress = heartbeat.createServiceTaskHeartbeat(TaskSteps.class, "reverseIndexPriorityConverter")) { progress.progress(TaskSteps.ACCUMULATE_STATISTICS); final IndexJournalStatistics statistics = journalReader.getStatistics(); diff --git a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest.java b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest.java index 6212dc8a..7644d019 100644 --- a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest.java +++ b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest.java @@ -14,6 +14,7 @@ import nu.marginalia.lexicon.KeywordLexicon; import nu.marginalia.lexicon.journal.KeywordLexiconJournal; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.service.control.ServiceHeartbeat; +import nu.marginalia.service.control.ServiceTaskHeartbeat; import nu.marginalia.test.TestUtil; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -28,6 +29,7 @@ import java.util.stream.IntStream; import java.util.stream.LongStream; import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.mockito.Mockito.when; class ReverseIndexFullConverterTest { KeywordLexicon keywordLexicon; @@ -86,8 +88,13 @@ class ReverseIndexFullConverterTest { var docsFile = dataDir.resolve("docs.dat"); var journalReader = new IndexJournalReaderSingleCompressedFile(indexFile); + // RIP fairies + var serviceHeartbeat = Mockito.mock(ServiceHeartbeat.class); + when(serviceHeartbeat.createServiceTaskHeartbeat(Mockito.any(), Mockito.any())) + .thenReturn(Mockito.mock(ServiceTaskHeartbeat.class)); + new ReverseIndexFullConverter( - Mockito.mock(ServiceHeartbeat.class), + serviceHeartbeat, tmpDir, journalReader, new DomainRankings(), wordsFile, docsFile) .convert(); diff --git 
a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest2.java b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest2.java index ab8be8ea..e4c7b7e4 100644 --- a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest2.java +++ b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest2.java @@ -15,6 +15,7 @@ import nu.marginalia.ranking.DomainRankings; import nu.marginalia.lexicon.KeywordLexicon; import nu.marginalia.lexicon.journal.KeywordLexiconJournal; import nu.marginalia.service.control.ServiceHeartbeat; +import nu.marginalia.service.control.ServiceTaskHeartbeat; import nu.marginalia.test.TestUtil; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; @@ -30,6 +31,8 @@ import java.util.Arrays; import java.util.stream.IntStream; import java.util.stream.LongStream; +import static org.mockito.Mockito.when; + class ReverseIndexFullConverterTest2 { KeywordLexicon keywordLexicon; @@ -119,7 +122,12 @@ class ReverseIndexFullConverterTest2 { Path tmpDir = Path.of("/tmp"); - new ReverseIndexFullConverter(Mockito.mock(ServiceHeartbeat.class), tmpDir, new IndexJournalReaderSingleCompressedFile(indexFile), new DomainRankings(), wordsFile, docsFile).convert(); + // RIP fairies + var serviceHeartbeat = Mockito.mock(ServiceHeartbeat.class); + when(serviceHeartbeat.createServiceTaskHeartbeat(Mockito.any(), Mockito.any())) + .thenReturn(Mockito.mock(ServiceTaskHeartbeat.class)); + + new ReverseIndexFullConverter(serviceHeartbeat, tmpDir, new IndexJournalReaderSingleCompressedFile(indexFile), new DomainRankings(), wordsFile, docsFile).convert(); var reverseReader = new ReverseIndexFullReader(wordsFile, docsFile); @@ -144,7 +152,12 @@ class ReverseIndexFullConverterTest2 { Path tmpDir = Path.of("/tmp"); - new 
ReverseIndexFullConverter(Mockito.mock(ServiceHeartbeat.class), tmpDir, new IndexJournalReaderSingleCompressedFile(indexFile, null, ReverseIndexPriorityParameters::filterPriorityRecord), new DomainRankings(), wordsFile, docsFile).convert(); + // RIP fairies + var serviceHeartbeat = Mockito.mock(ServiceHeartbeat.class); + when(serviceHeartbeat.createServiceTaskHeartbeat(Mockito.any(), Mockito.any())) + .thenReturn(Mockito.mock(ServiceTaskHeartbeat.class)); + + new ReverseIndexFullConverter(serviceHeartbeat, tmpDir, new IndexJournalReaderSingleCompressedFile(indexFile, null, ReverseIndexPriorityParameters::filterPriorityRecord), new DomainRankings(), wordsFile, docsFile).convert(); var reverseReader = new ReverseIndexFullReader(wordsFile, docsFile); diff --git a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexPriorityConverterTest2.java b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexPriorityConverterTest2.java index 29f9959b..dcd46e22 100644 --- a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexPriorityConverterTest2.java +++ b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexPriorityConverterTest2.java @@ -15,6 +15,7 @@ import nu.marginalia.lexicon.journal.KeywordLexiconJournal; import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode; import nu.marginalia.ranking.DomainRankings; import nu.marginalia.service.control.ServiceHeartbeat; +import nu.marginalia.service.control.ServiceTaskHeartbeat; import nu.marginalia.test.TestUtil; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; @@ -30,6 +31,8 @@ import java.util.Arrays; import java.util.stream.IntStream; import java.util.stream.LongStream; +import static org.mockito.Mockito.when; + class ReverseIndexPriorityConverterTest2 { KeywordLexicon keywordLexicon; @@ -119,7 +122,12 @@ class ReverseIndexPriorityConverterTest2 { Path 
tmpDir = Path.of("/tmp"); - new ReverseIndexPriorityConverter(Mockito.mock(ServiceHeartbeat.class), tmpDir, new IndexJournalReaderSingleCompressedFile(indexFile), new DomainRankings(), wordsFile, docsFile).convert(); + // RIP fairies + var serviceHeartbeat = Mockito.mock(ServiceHeartbeat.class); + when(serviceHeartbeat.createServiceTaskHeartbeat(Mockito.any(), Mockito.any())) + .thenReturn(Mockito.mock(ServiceTaskHeartbeat.class)); + + new ReverseIndexPriorityConverter(serviceHeartbeat, tmpDir, new IndexJournalReaderSingleCompressedFile(indexFile), new DomainRankings(), wordsFile, docsFile).convert(); var reverseReader = new ReverseIndexPriorityReader(wordsFile, docsFile); @@ -144,7 +152,12 @@ class ReverseIndexPriorityConverterTest2 { Path tmpDir = Path.of("/tmp"); - new ReverseIndexPriorityConverter(Mockito.mock(ServiceHeartbeat.class), tmpDir, new IndexJournalReaderSingleCompressedFile(indexFile, null, ReverseIndexPriorityParameters::filterPriorityRecord), new DomainRankings(), wordsFile, docsFile).convert(); + // RIP fairies + var serviceHeartbeat = Mockito.mock(ServiceHeartbeat.class); + when(serviceHeartbeat.createServiceTaskHeartbeat(Mockito.any(), Mockito.any())) + .thenReturn(Mockito.mock(ServiceTaskHeartbeat.class)); + + new ReverseIndexPriorityConverter(serviceHeartbeat, tmpDir, new IndexJournalReaderSingleCompressedFile(indexFile, null, ReverseIndexPriorityParameters::filterPriorityRecord), new DomainRankings(), wordsFile, docsFile).convert(); var reverseReader = new ReverseIndexPriorityReader(wordsFile, docsFile); diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java index 3fad75a9..fd936a7a 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java @@ -166,15 +166,6 @@ public class CrawlerMain { finally 
{ heartbeat.shutDown(); } - - pool.execute(() -> { - try { - fetchDomain(crawlingSpecification); - } - finally { - taskSem.release(); - } - }); } class CrawlTask implements DumbThreadPool.Task { diff --git a/code/services-core/control-service/src/test/java/nu/marginalia/control/svc/ApiKeyServiceTest.java b/code/services-core/control-service/src/test/java/nu/marginalia/control/svc/ApiKeyServiceTest.java index 94877cb6..7bb8536a 100644 --- a/code/services-core/control-service/src/test/java/nu/marginalia/control/svc/ApiKeyServiceTest.java +++ b/code/services-core/control-service/src/test/java/nu/marginalia/control/svc/ApiKeyServiceTest.java @@ -3,15 +3,13 @@ package nu.marginalia.control.svc; import com.zaxxer.hikari.HikariConfig; import com.zaxxer.hikari.HikariDataSource; import nu.marginalia.control.model.ApiKeyModel; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Tag; -import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.*; import org.junit.jupiter.api.parallel.Execution; import org.testcontainers.containers.MariaDBContainer; import org.testcontainers.junit.jupiter.Container; import org.testcontainers.junit.jupiter.Testcontainers; +import java.sql.SQLException; import java.util.List; import static org.junit.jupiter.api.Assertions.*; @@ -46,6 +44,15 @@ public class ApiKeyServiceTest { mariaDBContainer.close(); } + @AfterEach + public void cleanDb() { + try (var conn = dataSource.getConnection(); var stmt = conn.createStatement()) { + stmt.executeUpdate("TRUNCATE TABLE EC_API_KEY"); + } catch (SQLException e) { + e.printStackTrace(); + } + } + @Test void getKeys() { var apiKeyService = new ApiKeyService(dataSource); diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexServicesFactory.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexServicesFactory.java index 031fcd2d..9e0c2a04 100644 --- 
a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexServicesFactory.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexServicesFactory.java @@ -2,14 +2,11 @@ package nu.marginalia.index; import com.google.inject.Inject; import com.google.inject.Singleton; -import com.google.inject.name.Named; import nu.marginalia.db.storage.FileStorageService; import nu.marginalia.db.storage.model.FileStorageType; import nu.marginalia.index.forward.ForwardIndexConverter; import nu.marginalia.index.forward.ForwardIndexReader; import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile; -import nu.marginalia.index.journal.writer.IndexJournalWriter; -import nu.marginalia.index.journal.writer.IndexJournalWriterImpl; import nu.marginalia.index.priority.ReverseIndexPriorityConverter; import nu.marginalia.index.full.ReverseIndexFullConverter; import nu.marginalia.index.priority.ReverseIndexPriorityReader; @@ -110,7 +107,7 @@ public class IndexServicesFactory { FINISHED } public void convertIndex(DomainRankings domainRankings) throws IOException { - try (var hb = heartbeat.createServiceProcessHeartbeat(ConvertSteps.class, "index-conversion")) { + try (var hb = heartbeat.createServiceTaskHeartbeat(ConvertSteps.class, "index-conversion")) { hb.progress(ConvertSteps.FORWARD_INDEX); convertForwardIndex(domainRankings); diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java index 79602c22..3d886158 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java @@ -103,7 +103,7 @@ public class IndexSearchSetsService { FINISHED } public void recalculateAll() { - try (var processHeartbeat = 
heartbeat.createServiceProcessHeartbeat(RepartitionSteps.class, "repartitionAll")) { + try (var processHeartbeat = heartbeat.createServiceTaskHeartbeat(RepartitionSteps.class, "repartitionAll")) { processHeartbeat.progress(RepartitionSteps.UPDATE_ACADEMIA); updateAcademiaDomainsSet(); diff --git a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java index fd92354b..37030b1f 100644 --- a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java +++ b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java @@ -18,6 +18,7 @@ import nu.marginalia.lexicon.KeywordLexicon; import nu.marginalia.model.idx.WordFlags; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.model.idx.WordMetadata; +import nu.marginalia.service.control.ServiceHeartbeat; import nu.marginalia.service.server.Initialization; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Assertions; @@ -49,6 +50,9 @@ public class IndexQueryServiceIntegrationTest { @Inject KeywordLexicon keywordLexicon; + @Inject + ServiceHeartbeat heartbeat; + @Inject IndexJournalWriter indexJournalWriter; diff --git a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTestModule.java b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTestModule.java index 1c4c6986..0801bc77 100644 --- a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTestModule.java +++ b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTestModule.java @@ -17,6 +17,7 @@ import nu.marginalia.index.util.TestUtil; import 
nu.marginalia.index.client.model.query.SearchSetIdentifier; import nu.marginalia.service.control.ServiceEventLog; import nu.marginalia.service.control.ServiceHeartbeat; +import nu.marginalia.service.control.ServiceTaskHeartbeat; import nu.marginalia.service.id.ServiceId; import nu.marginalia.service.module.ServiceConfiguration; import org.mockito.Mockito; @@ -62,8 +63,14 @@ public class IndexQueryServiceIntegrationTestModule extends AbstractModule { when(fileStorageServiceMock.getStorageByType(FileStorageType.INDEX_LIVE)).thenReturn(new FileStorage(null, null, null, fastDir.toString(), null)); when(fileStorageServiceMock.getStorageByType(FileStorageType.INDEX_STAGING)).thenReturn(new FileStorage(null, null, null, slowDir.toString(), null)); + var serviceHeartbeat = Mockito.mock(ServiceHeartbeat.class); + // RIP fairies + when(serviceHeartbeat.createServiceTaskHeartbeat(Mockito.any(), Mockito.any())) + .thenReturn(Mockito.mock(ServiceTaskHeartbeat.class)); + bind(ServiceHeartbeat.class).toInstance(serviceHeartbeat); + var servicesFactory = new IndexServicesFactory( - Mockito.mock(ServiceHeartbeat.class), + serviceHeartbeat, fileStorageServiceMock ); bind(IndexServicesFactory.class).toInstance(servicesFactory); @@ -81,7 +88,7 @@ public class IndexQueryServiceIntegrationTestModule extends AbstractModule { bind(KeywordLexiconReadOnlyView.class).toInstance(new KeywordLexiconReadOnlyView(keywordLexicon)); bind(ServiceEventLog.class).toInstance(Mockito.mock(ServiceEventLog.class)); - bind(ServiceHeartbeat.class).toInstance(Mockito.mock(ServiceHeartbeat.class)); + bind(IndexJournalWriter.class).toInstance(new IndexJournalWriterImpl(keywordLexicon, slowDir.resolve("page-index.dat"))); From b5ed21be216b1843f23a186dabafda740343f061 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 9 Aug 2023 14:23:22 +0200 Subject: [PATCH 154/157] (mq) MqPersistence no longer relies on autoCommit being enabled --- .../mq/persistence/MqPersistence.java | 29 +++++++++++++++++-- 1 file 
changed, 26 insertions(+), 3 deletions(-) diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java index 030fff81..68fb2f83 100644 --- a/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java @@ -67,6 +67,10 @@ public class MqPersistence { else stmt.setLong(6, ttl.toSeconds()); stmt.executeUpdate(); + + if (!conn.getAutoCommit()) + conn.commit(); + var rsp = lastIdQuery.executeQuery(); if (!rsp.next()) { @@ -104,6 +108,9 @@ public class MqPersistence { if (stmt.executeUpdate() != 1) { throw new IllegalArgumentException("No rows updated"); } + + if (!conn.getAutoCommit()) + conn.commit(); } } @@ -124,6 +131,9 @@ public class MqPersistence { if (stmt.executeUpdate() != 1) { throw new IllegalArgumentException("No rows updated"); } + + if (!conn.getAutoCommit()) + conn.commit(); } } @@ -199,7 +209,10 @@ public class MqPersistence { updateStmt.setLong(2, tick); updateStmt.setString(3, inboxName); updateStmt.setInt(4, n); - return updateStmt.executeUpdate(); + var ret = updateStmt.executeUpdate(); + if (!conn.getAutoCommit()) + conn.commit(); + return ret; } } @@ -429,6 +442,10 @@ public class MqPersistence { stmt.setInt(2, tick); stmt.setLong(3, id); stmt.executeUpdate(); + + if (!conn.getAutoCommit()) + conn.commit(); + } catch (SQLException e) { throw new RuntimeException(e); } @@ -445,7 +462,10 @@ public class MqPersistence { AND TTL IS NOT NULL AND TIMESTAMPDIFF(SECOND, UPDATED_TIME, CURRENT_TIMESTAMP(6)) > TTL """)) { - return setToDead.executeUpdate(); + int ret = setToDead.executeUpdate(); + if (!conn.getAutoCommit()) + conn.commit(); + return ret; } } @@ -458,7 +478,10 @@ public class MqPersistence { AND TTL IS NOT NULL AND TIMESTAMPDIFF(SECOND, UPDATED_TIME, CURRENT_TIMESTAMP(6)) > 3600 """)) { - return 
setToDead.executeUpdate(); + int ret = setToDead.executeUpdate(); + if (!conn.getAutoCommit()) + conn.commit(); + return ret; } } } From ce293029c7009ccf62b17187e0f118ee6635019a Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 9 Aug 2023 14:23:53 +0200 Subject: [PATCH 155/157] (converter) Treat adtech tracking as advertisement. --- .../marginalia/model/crawl/HtmlFeature.java | 8 +++-- .../processor/logic/FeatureExtractor.java | 35 ++++++++++--------- .../marginalia/search/model/UrlDetails.java | 5 +-- 3 files changed, 28 insertions(+), 20 deletions(-) diff --git a/code/common/model/src/main/java/nu/marginalia/model/crawl/HtmlFeature.java b/code/common/model/src/main/java/nu/marginalia/model/crawl/HtmlFeature.java index d9adbff6..03e5557c 100644 --- a/code/common/model/src/main/java/nu/marginalia/model/crawl/HtmlFeature.java +++ b/code/common/model/src/main/java/nu/marginalia/model/crawl/HtmlFeature.java @@ -3,11 +3,15 @@ package nu.marginalia.model.crawl; import java.util.Collection; public enum HtmlFeature { + // Note, the first 32 of these features are bit encoded in the database + // so be sure to keep anything that's potentially important toward the top + // of the list + MEDIA( "special:media"), JS("special:scripts"), AFFILIATE_LINK( "special:affiliate"), - TRACKING_INNOCENT("special:tracking"), - TRACKING_EVIL("special:tracking2"), + TRACKING("special:tracking"), + TRACKING_ADTECH("special:ads"), // We'll this as ads for now VIEWPORT("special:viewport"), diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java index c431e94b..040f96dd 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java @@ 
-4,7 +4,6 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.adblock.AdblockSimulator; import nu.marginalia.adblock.GoogleAnwersSpamDetector; -import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.topic.RecipeDetector; @@ -25,9 +24,11 @@ public class FeatureExtractor { "twitter.com", "bing.com", "msn.com"); - private static final List shittyTrackers = List.of("adform.net", + private static final List adtechTrackers = List.of("adform.net", "connect.facebook", "facebook.com/tr", + "absbygoogle.com", + "adnxs.com", "googletagmanager.com", "googlesyndication.com", "smartadserver.com", @@ -203,11 +204,11 @@ public class FeatureExtractor { for (var scriptTag : scriptTags) { if (hasInvasiveTrackingScript(scriptTag)) { - features.add(HtmlFeature.TRACKING_INNOCENT); - features.add(HtmlFeature.TRACKING_EVIL); + features.add(HtmlFeature.TRACKING); + features.add(HtmlFeature.TRACKING_ADTECH); } else if (hasNaiveTrackingScript(scriptTag)) { - features.add(HtmlFeature.TRACKING_INNOCENT); + features.add(HtmlFeature.TRACKING); } if (scriptTag.hasAttr("didomi/javascript")) { @@ -234,42 +235,44 @@ public class FeatureExtractor { features.add(HtmlFeature.COOKIELAW); } if (scriptText.contains("_linkedin_data_partner_id")) { - features.add(HtmlFeature.TRACKING_EVIL); + features.add(HtmlFeature.TRACKING); + features.add(HtmlFeature.TRACKING_ADTECH); } if (scriptText.contains("window.OneSignal")) { features.add(HtmlFeature.ONESIGNAL); } if (scriptText.contains("connect.facebook.net")) { - features.add(HtmlFeature.TRACKING_EVIL); + features.add(HtmlFeature.TRACKING); + features.add(HtmlFeature.TRACKING_ADTECH); } if (scriptText.contains("hotjar.com")) { - features.add(HtmlFeature.TRACKING_INNOCENT); + features.add(HtmlFeature.TRACKING); } } for (var noscript : doc.getElementsByTag("noscript")) { for (var iframe : 
noscript.getElementsByTag("iframe")) { if (hasInvasiveTrackingScript(iframe)) { - features.add(HtmlFeature.TRACKING_INNOCENT); - features.add(HtmlFeature.TRACKING_EVIL); + features.add(HtmlFeature.TRACKING); + features.add(HtmlFeature.TRACKING_ADTECH); } else if (hasNaiveTrackingScript(iframe)) { - features.add(HtmlFeature.TRACKING_INNOCENT); + features.add(HtmlFeature.TRACKING); } } for (var img : noscript.getElementsByTag("img")) { if (hasInvasiveTrackingScript(img)) { - features.add(HtmlFeature.TRACKING_INNOCENT); - features.add(HtmlFeature.TRACKING_EVIL); + features.add(HtmlFeature.TRACKING); + features.add(HtmlFeature.TRACKING_ADTECH); } else if (hasNaiveTrackingScript(img)) { - features.add(HtmlFeature.TRACKING_INNOCENT); + features.add(HtmlFeature.TRACKING); } } } if (scriptTags.html().contains("google-analytics.com")) { - features.add(HtmlFeature.TRACKING_INNOCENT); + features.add(HtmlFeature.TRACKING); } for (var aTag : doc.getElementsByTag("a")) { @@ -296,7 +299,7 @@ public class FeatureExtractor { } private boolean hasInvasiveTrackingScript(String src) { - for (var tracker : shittyTrackers) { + for (var tracker : adtechTrackers) { if (src.contains(tracker)) { return true; } diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java index b44f2551..2de12536 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java @@ -121,7 +121,8 @@ public class UrlDetails { for (var problem :EnumSet.of( HtmlFeature.JS, - HtmlFeature.TRACKING_INNOCENT, + HtmlFeature.TRACKING, + HtmlFeature.TRACKING_ADTECH, HtmlFeature.AFFILIATE_LINK, HtmlFeature.COOKIES, HtmlFeature.ADVERTISEMENT)) { @@ -156,7 +157,7 @@ public class UrlDetails { return HtmlFeature.hasFeature(features, HtmlFeature.JS); } public 
boolean isTracking() { - return HtmlFeature.hasFeature(features, HtmlFeature.TRACKING_INNOCENT); + return HtmlFeature.hasFeature(features, HtmlFeature.TRACKING); } public boolean isAffiliate() { return HtmlFeature.hasFeature(features, HtmlFeature.AFFILIATE_LINK); From 807fb2d052c60a423084adbbf12dc437a794f274 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 9 Aug 2023 14:46:38 +0200 Subject: [PATCH 156/157] (service) Task heartbeat creates event log entries --- .../service/control/ServiceHeartbeat.java | 5 ++++- .../service/control/ServiceTaskHeartbeat.java | 20 +++++++++++++++---- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/code/common/service/src/main/java/nu/marginalia/service/control/ServiceHeartbeat.java b/code/common/service/src/main/java/nu/marginalia/service/control/ServiceHeartbeat.java index d0fdba32..c9c5085c 100644 --- a/code/common/service/src/main/java/nu/marginalia/service/control/ServiceHeartbeat.java +++ b/code/common/service/src/main/java/nu/marginalia/service/control/ServiceHeartbeat.java @@ -20,6 +20,7 @@ public class ServiceHeartbeat { private final String serviceBase; private final String instanceUUID; private final ServiceConfiguration configuration; + private final ServiceEventLog eventLog; private final HikariDataSource dataSource; @@ -30,11 +31,13 @@ public class ServiceHeartbeat { @Inject public ServiceHeartbeat(ServiceConfiguration configuration, + ServiceEventLog eventLog, HikariDataSource dataSource) { this.serviceName = configuration.serviceName() + ":" + configuration.node(); this.serviceBase = configuration.serviceName(); this.configuration = configuration; + this.eventLog = eventLog; this.dataSource = dataSource; this.instanceUUID = configuration.instanceUuid().toString(); @@ -45,7 +48,7 @@ public class ServiceHeartbeat { } public > ServiceTaskHeartbeat createServiceTaskHeartbeat(Class steps, String processName) { - return new ServiceTaskHeartbeat<>(steps, configuration, processName, dataSource); + return 
new ServiceTaskHeartbeat<>(steps, configuration, processName, eventLog, dataSource); } diff --git a/code/common/service/src/main/java/nu/marginalia/service/control/ServiceTaskHeartbeat.java b/code/common/service/src/main/java/nu/marginalia/service/control/ServiceTaskHeartbeat.java index a460bc1c..bf0d6a9f 100644 --- a/code/common/service/src/main/java/nu/marginalia/service/control/ServiceTaskHeartbeat.java +++ b/code/common/service/src/main/java/nu/marginalia/service/control/ServiceTaskHeartbeat.java @@ -27,6 +27,7 @@ public class ServiceTaskHeartbeat> implements AutoCloseable { private final int heartbeatInterval = Integer.getInteger("mcp.heartbeat.interval", 1); private final String serviceInstanceUUID; private final int stepCount; + private final ServiceEventLog eventLog; private volatile boolean running = false; private volatile int stepNum = 0; @@ -35,8 +36,10 @@ public class ServiceTaskHeartbeat> implements AutoCloseable { ServiceTaskHeartbeat(Class stepClass, ServiceConfiguration configuration, String taskName, + ServiceEventLog eventLog, HikariDataSource dataSource) { + this.eventLog = eventLog; this.taskName = configuration.serviceName() + "." + taskName + ":" + configuration.node(); this.taskBase = configuration.serviceName() + "." 
+ taskName; this.dataSource = dataSource; @@ -46,6 +49,8 @@ public class ServiceTaskHeartbeat> implements AutoCloseable { this.stepCount = stepClass.getEnumConstants().length; + heartbeatInit(); + runnerThread = new Thread(this::run); runnerThread.start(); } @@ -58,6 +63,7 @@ public class ServiceTaskHeartbeat> implements AutoCloseable { public void progress(T step) { this.step = step.name(); + // off by one since we calculate the progress based on the number of steps, // and Enum.ordinal() is zero-based (so the 5th step in a 5 step task is 4, not 5; resulting in the // final progress being 80% and not 100%) @@ -65,6 +71,7 @@ public class ServiceTaskHeartbeat> implements AutoCloseable { this.stepNum = 1 + step.ordinal(); logger.info("ServiceTask {} progress: {}", taskBase, step.name()); + eventLog.logEvent("TASK-STEP", taskName + " = " + step.name()); } public void shutDown() { @@ -89,8 +96,6 @@ public class ServiceTaskHeartbeat> implements AutoCloseable { return; try { - heartbeatInit(); - while (running) { try { heartbeatUpdate(); @@ -102,13 +107,13 @@ public class ServiceTaskHeartbeat> implements AutoCloseable { TimeUnit.SECONDS.sleep(heartbeatInterval); } } - catch (InterruptedException|SQLException ex) { + catch (InterruptedException ex) { logger.error("ServiceHeartbeat caught irrecoverable exception, killing service", ex); System.exit(255); } } - private void heartbeatInit() throws SQLException { + private void heartbeatInit() { try (var connection = dataSource.getConnection()) { try (var stmt = connection.prepareStatement( """ @@ -131,6 +136,12 @@ public class ServiceTaskHeartbeat> implements AutoCloseable { stmt.executeUpdate(); } } + catch (SQLException ex) { + logger.error("ServiceHeartbeat failed to initialize", ex); + throw new RuntimeException(ex); + } + + eventLog.logEvent("TASK-STARTED", taskName); } private void heartbeatUpdate() throws SQLException { @@ -173,6 +184,7 @@ public class ServiceTaskHeartbeat> implements AutoCloseable { 
stmt.executeUpdate(); } } + eventLog.logEvent("TASK-TERMINATED", taskName); } @Override From 4f8048be3139254e87997f41b1f78c679cc97c80 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 10 Aug 2023 15:40:07 +0200 Subject: [PATCH 157/157] (blacklist) Blacklist management --- .../db/migration/V23_06_0_001__blacklist.sql | 1 + .../nu/marginalia/control/ControlService.java | 27 +++++++ .../control/model/BlacklistedDomainModel.java | 6 ++ .../control/svc/ControlBlacklistService.java | 79 +++++++++++++++++++ .../control/svc/DomainComplaintService.java | 37 ++------- .../resources/templates/control/blacklist.hdb | 68 ++++++++++++++++ 6 files changed, 189 insertions(+), 29 deletions(-) create mode 100644 code/services-core/control-service/src/main/java/nu/marginalia/control/model/BlacklistedDomainModel.java create mode 100644 code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlBlacklistService.java create mode 100644 code/services-core/control-service/src/main/resources/templates/control/blacklist.hdb diff --git a/code/common/db/src/main/resources/db/migration/V23_06_0_001__blacklist.sql b/code/common/db/src/main/resources/db/migration/V23_06_0_001__blacklist.sql index e46161bc..d05d8e9d 100644 --- a/code/common/db/src/main/resources/db/migration/V23_06_0_001__blacklist.sql +++ b/code/common/db/src/main/resources/db/migration/V23_06_0_001__blacklist.sql @@ -2,6 +2,7 @@ CREATE TABLE IF NOT EXISTS EC_DOMAIN_BLACKLIST ( ID INT PRIMARY KEY AUTO_INCREMENT, URL_DOMAIN VARCHAR(255) UNIQUE NOT NULL + COMMENT VARCHAR(255) DEFAULT NULL ) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci; diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java index 671cd584..2b49b249 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java +++ 
b/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java @@ -24,6 +24,7 @@ import java.sql.SQLException; import java.util.Comparator; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.stream.Collectors; public class ControlService extends Service { @@ -36,6 +37,7 @@ public class ControlService extends Service { private final EventLogService eventLogService; private final ApiKeyService apiKeyService; private final DomainComplaintService domainComplaintService; + private final ControlBlacklistService blacklistService; private final ControlActorService controlActorService; private final StaticResources staticResources; private final MessageQueueService messageQueueService; @@ -54,6 +56,7 @@ public class ControlService extends Service { ControlFileStorageService controlFileStorageService, ApiKeyService apiKeyService, DomainComplaintService domainComplaintService, + ControlBlacklistService blacklistService, ControlActionsService controlActionsService ) throws IOException { @@ -63,6 +66,7 @@ public class ControlService extends Service { this.eventLogService = eventLogService; this.apiKeyService = apiKeyService; this.domainComplaintService = domainComplaintService; + this.blacklistService = blacklistService; var indexRenderer = rendererFactory.renderer("control/index"); var servicesRenderer = rendererFactory.renderer("control/services"); @@ -85,6 +89,7 @@ public class ControlService extends Service { var viewMessageRenderer = rendererFactory.renderer("control/view-message"); var actionsViewRenderer = rendererFactory.renderer("control/actions"); + var blacklistRenderer = rendererFactory.renderer("control/blacklist"); this.controlActorService = controlActorService; @@ -109,6 +114,7 @@ public class ControlService extends Service { final HtmlRedirect redirectToActors = new HtmlRedirect("/actors"); final HtmlRedirect redirectToApiKeys = new HtmlRedirect("/api-keys"); final HtmlRedirect redirectToStorage 
= new HtmlRedirect("/storage"); + final HtmlRedirect redirectToBlacklist = new HtmlRedirect("/blacklist"); final HtmlRedirect redirectToComplaints = new HtmlRedirect("/complaints"); final HtmlRedirect redirectToMessageQueue = new HtmlRedirect("/message-queue"); @@ -145,6 +151,11 @@ public class ControlService extends Service { Spark.post("/public/storage/specs", controlActorService::createCrawlSpecification, redirectToStorage); Spark.post("/public/storage/:fid/delete", controlFileStorageService::flagFileForDeletionRequest, redirectToStorage); + // Blacklist + + Spark.get("/public/blacklist", this::blacklistModel, blacklistRenderer::render); + Spark.post("/public/blacklist", this::updateBlacklist, redirectToBlacklist); + // API Keys Spark.get("/public/api-keys", this::apiKeysModel, apiKeysRenderer::render); @@ -171,6 +182,22 @@ public class ControlService extends Service { monitors.subscribe(this::logMonitorStateChange); } + private Object blacklistModel(Request request, Response response) { + return Map.of("blacklist", blacklistService.lastNAdditions(100)); + } + + private Object updateBlacklist(Request request, Response response) { + var domain = new EdgeDomain(request.queryParams("domain")); + if ("add".equals(request.queryParams("act"))) { + var comment = Objects.requireNonNullElse(request.queryParams("comment"), ""); + blacklistService.addToBlacklist(domain, comment); + } else if ("del".equals(request.queryParams("act"))) { + blacklistService.removeFromBlacklist(domain); + } + + return ""; + } + private Object overviewModel(Request request, Response response) { return Map.of("processes", heartbeatService.getProcessHeartbeats(), diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/model/BlacklistedDomainModel.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/BlacklistedDomainModel.java new file mode 100644 index 00000000..e7db4805 --- /dev/null +++ 
b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/BlacklistedDomainModel.java @@ -0,0 +1,6 @@ +package nu.marginalia.control.model; + +import nu.marginalia.model.EdgeDomain; + +public record BlacklistedDomainModel(EdgeDomain domain, String comment) { +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlBlacklistService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlBlacklistService.java new file mode 100644 index 00000000..d23a06e2 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlBlacklistService.java @@ -0,0 +1,79 @@ +package nu.marginalia.control.svc; + +import com.google.inject.Inject; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.control.model.BlacklistedDomainModel; +import nu.marginalia.model.EdgeDomain; + +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.List; + +public class ControlBlacklistService { + + private final HikariDataSource dataSource; + + @Inject + public ControlBlacklistService(HikariDataSource dataSource) { + this.dataSource = dataSource; + } + + public void addToBlacklist(EdgeDomain domain, String comment) { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + INSERT IGNORE INTO EC_DOMAIN_BLACKLIST (URL_DOMAIN, COMMENT) VALUES (?, ?) + """)) { + stmt.setString(1, domain.toString()); + stmt.setString(2, comment); + stmt.executeUpdate(); + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + } + + public void removeFromBlacklist(EdgeDomain domain) { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + DELETE FROM EC_DOMAIN_BLACKLIST WHERE URL_DOMAIN=? 
+ """)) { + stmt.setString(1, domain.toString()); + stmt.addBatch(); + stmt.setString(1, domain.domain); + stmt.addBatch(); + stmt.executeBatch(); + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + } + + public List lastNAdditions(int n) { + final List ret = new ArrayList<>(n); + + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT URL_DOMAIN, COMMENT + FROM EC_DOMAIN_BLACKLIST + ORDER BY ID DESC + LIMIT ? + """)) { + stmt.setInt(1, n); + + var rs = stmt.executeQuery(); + while (rs.next()) { + ret.add(new BlacklistedDomainModel( + new EdgeDomain(rs.getString("URL_DOMAIN")), + rs.getString("COMMENT") + ) + ); + } + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + + return ret; + + } +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/DomainComplaintService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/DomainComplaintService.java index bf36bfad..758d0313 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/DomainComplaintService.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/DomainComplaintService.java @@ -18,10 +18,14 @@ import java.util.Optional; */ public class DomainComplaintService { private final HikariDataSource dataSource; + private final ControlBlacklistService blacklistService; @Inject - public DomainComplaintService(HikariDataSource dataSource) { + public DomainComplaintService(HikariDataSource dataSource, + ControlBlacklistService blacklistService + ) { this.dataSource = dataSource; + this.blacklistService = blacklistService; } public List getComplaints() { @@ -53,12 +57,13 @@ public class DomainComplaintService { } public void approveAppealBlacklisting(EdgeDomain domain) { - removeFromBlacklist(domain); + blacklistService.removeFromBlacklist(domain); setDecision(domain, "APPROVED"); } public void blacklistDomain(EdgeDomain 
domain) { - addToBlacklist(domain); + blacklistService.addToBlacklist(domain, "Domain complaint"); + setDecision(domain, "BLACKLISTED"); } @@ -66,33 +71,7 @@ public class DomainComplaintService { setDecision(domain, "REJECTED"); } - private void addToBlacklist(EdgeDomain domain) { - try (var conn = dataSource.getConnection(); - var stmt = conn.prepareStatement(""" - INSERT IGNORE INTO EC_DOMAIN_BLACKLIST (URL_DOMAIN) VALUES (?) - """)) { - stmt.setString(1, domain.toString()); - stmt.executeUpdate(); - } - catch (SQLException ex) { - throw new RuntimeException(ex); - } - } - private void removeFromBlacklist(EdgeDomain domain) { - try (var conn = dataSource.getConnection(); - var stmt = conn.prepareStatement(""" - DELETE FROM EC_DOMAIN_BLACKLIST WHERE URL_DOMAIN=? - """)) { - stmt.setString(1, domain.toString()); - stmt.addBatch(); - stmt.setString(1, domain.domain); - stmt.executeBatch(); - } - catch (SQLException ex) { - throw new RuntimeException(ex); - } - } private void setDecision(EdgeDomain domain, String decision) { try (var conn = dataSource.getConnection(); diff --git a/code/services-core/control-service/src/main/resources/templates/control/blacklist.hdb b/code/services-core/control-service/src/main/resources/templates/control/blacklist.hdb new file mode 100644 index 00000000..5622659c --- /dev/null +++ b/code/services-core/control-service/src/main/resources/templates/control/blacklist.hdb @@ -0,0 +1,68 @@ + + + + Control Service + + + + +{{> control/partials/nav}} +
    +

    Blacklist

    + +

    + The blacklist is a list of blocked domains that will not be + crawled, indexed, or returned in search results.

    + + + + + + + + + + + + + +
    DescriptionAction
    Add To Blacklist

    + This will add the given domain to the blacklist. +

    + +  
    +   +
    +
    + + +
    Remove from blacklist

    + Remove the specified domain from the blacklist. This ensures that + the domain is no longer blacklisted; in doing so, it may also remove the root domain + from the blacklist.

    +
    +   +
    +
    + +
    +
    + +

    Recent Additions

    + + + + + + {{#each blacklist}} + + + + + {{/each}} +
    DomainComment
    {{domain}}{{comment}}
    +
    + + \ No newline at end of file