diff --git a/code/common/config/java/nu/marginalia/nodecfg/NodeConfigurationService.java b/code/common/config/java/nu/marginalia/nodecfg/NodeConfigurationService.java index d69d87fc..7098a298 100644 --- a/code/common/config/java/nu/marginalia/nodecfg/NodeConfigurationService.java +++ b/code/common/config/java/nu/marginalia/nodecfg/NodeConfigurationService.java @@ -3,6 +3,7 @@ package nu.marginalia.nodecfg; import com.google.inject.Inject; import com.zaxxer.hikari.HikariDataSource; import nu.marginalia.nodecfg.model.NodeConfiguration; +import nu.marginalia.nodecfg.model.NodeProfile; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -20,10 +21,10 @@ public class NodeConfigurationService { this.dataSource = dataSource; } - public NodeConfiguration create(int id, String description, boolean acceptQueries, boolean keepWarcs) throws SQLException { + public NodeConfiguration create(int id, String description, boolean acceptQueries, boolean keepWarcs, NodeProfile nodeProfile) throws SQLException { try (var conn = dataSource.getConnection(); var is = conn.prepareStatement(""" - INSERT IGNORE INTO NODE_CONFIGURATION(ID, DESCRIPTION, ACCEPT_QUERIES, KEEP_WARCS) VALUES(?, ?, ?, ?) + INSERT IGNORE INTO NODE_CONFIGURATION(ID, DESCRIPTION, ACCEPT_QUERIES, KEEP_WARCS, NODE_PROFILE) VALUES(?, ?, ?, ?, ?) 
""") ) { @@ -31,6 +32,7 @@ public class NodeConfigurationService { is.setString(2, description); is.setBoolean(3, acceptQueries); is.setBoolean(4, keepWarcs); + is.setString(5, nodeProfile.name()); if (is.executeUpdate() <= 0) { throw new IllegalStateException("Failed to insert configuration"); @@ -43,7 +45,7 @@ public class NodeConfigurationService { public List getAll() { try (var conn = dataSource.getConnection(); var qs = conn.prepareStatement(""" - SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, KEEP_WARCS, DISABLED + SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, KEEP_WARCS, NODE_PROFILE, DISABLED FROM NODE_CONFIGURATION """)) { var rs = qs.executeQuery(); @@ -58,6 +60,7 @@ public class NodeConfigurationService { rs.getBoolean("AUTO_CLEAN"), rs.getBoolean("PRECESSION"), rs.getBoolean("KEEP_WARCS"), + NodeProfile.valueOf(rs.getString("NODE_PROFILE")), rs.getBoolean("DISABLED") )); } @@ -72,7 +75,7 @@ public class NodeConfigurationService { public NodeConfiguration get(int nodeId) throws SQLException { try (var conn = dataSource.getConnection(); var qs = conn.prepareStatement(""" - SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, KEEP_WARCS, DISABLED + SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, KEEP_WARCS, NODE_PROFILE, DISABLED FROM NODE_CONFIGURATION WHERE ID=? """)) { @@ -86,6 +89,7 @@ public class NodeConfigurationService { rs.getBoolean("AUTO_CLEAN"), rs.getBoolean("PRECESSION"), rs.getBoolean("KEEP_WARCS"), + NodeProfile.valueOf(rs.getString("NODE_PROFILE")), rs.getBoolean("DISABLED") ); } @@ -98,7 +102,7 @@ public class NodeConfigurationService { try (var conn = dataSource.getConnection(); var us = conn.prepareStatement(""" UPDATE NODE_CONFIGURATION - SET DESCRIPTION=?, ACCEPT_QUERIES=?, AUTO_CLEAN=?, PRECESSION=?, KEEP_WARCS=?, DISABLED=? + SET DESCRIPTION=?, ACCEPT_QUERIES=?, AUTO_CLEAN=?, PRECESSION=?, KEEP_WARCS=?, DISABLED=?, NODE_PROFILE=? WHERE ID=? 
""")) { @@ -108,7 +112,8 @@ public class NodeConfigurationService { us.setBoolean(4, config.includeInPrecession()); us.setBoolean(5, config.keepWarcs()); us.setBoolean(6, config.disabled()); - us.setInt(7, config.node()); + us.setString(7, config.profile().name()); + us.setInt(8, config.node()); if (us.executeUpdate() <= 0) throw new IllegalStateException("Failed to update configuration"); diff --git a/code/common/config/java/nu/marginalia/nodecfg/model/NodeConfiguration.java b/code/common/config/java/nu/marginalia/nodecfg/model/NodeConfiguration.java index fb7c7aba..35a71b98 100644 --- a/code/common/config/java/nu/marginalia/nodecfg/model/NodeConfiguration.java +++ b/code/common/config/java/nu/marginalia/nodecfg/model/NodeConfiguration.java @@ -6,6 +6,7 @@ public record NodeConfiguration(int node, boolean autoClean, boolean includeInPrecession, boolean keepWarcs, + NodeProfile profile, boolean disabled ) { diff --git a/code/common/config/java/nu/marginalia/nodecfg/model/NodeProfile.java b/code/common/config/java/nu/marginalia/nodecfg/model/NodeProfile.java new file mode 100644 index 00000000..96e7c24b --- /dev/null +++ b/code/common/config/java/nu/marginalia/nodecfg/model/NodeProfile.java @@ -0,0 +1,28 @@ +package nu.marginalia.nodecfg.model; + +public enum NodeProfile { + BATCH_CRAWL, + REALTIME, + MIXED, + SIDELOAD; + + public boolean isBatchCrawl() { + return this == BATCH_CRAWL; + } + public boolean isRealtime() { + return this == REALTIME; + } + public boolean isMixed() { + return this == MIXED; + } + public boolean isSideload() { + return this == SIDELOAD; + } + + public boolean permitBatchCrawl() { + return isBatchCrawl() ||isMixed(); + } + public boolean permitSideload() { + return isMixed() || isSideload(); + } +} diff --git a/code/common/config/test/nu/marginalia/nodecfg/NodeConfigurationServiceTest.java b/code/common/config/test/nu/marginalia/nodecfg/NodeConfigurationServiceTest.java index 63a18c93..a7bab69c 100644 --- 
a/code/common/config/test/nu/marginalia/nodecfg/NodeConfigurationServiceTest.java +++ b/code/common/config/test/nu/marginalia/nodecfg/NodeConfigurationServiceTest.java @@ -2,6 +2,7 @@ package nu.marginalia.nodecfg; import com.zaxxer.hikari.HikariConfig; import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.nodecfg.model.NodeProfile; import nu.marginalia.test.TestMigrationLoader; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Tag; @@ -46,8 +47,8 @@ public class NodeConfigurationServiceTest { @Test public void test() throws SQLException { - var a = nodeConfigurationService.create(1, "Test", false, false); - var b = nodeConfigurationService.create(2, "Foo", true, false); + var a = nodeConfigurationService.create(1, "Test", false, false, NodeProfile.MIXED); + var b = nodeConfigurationService.create(2, "Foo", true, false, NodeProfile.MIXED); assertEquals(1, a.node()); assertEquals("Test", a.description()); diff --git a/code/common/db/resources/db/migration/V24_11_0_001__add_node_profile.sql b/code/common/db/resources/db/migration/V24_11_0_001__add_node_profile.sql new file mode 100644 index 00000000..914fcaef --- /dev/null +++ b/code/common/db/resources/db/migration/V24_11_0_001__add_node_profile.sql @@ -0,0 +1 @@ +ALTER TABLE WMSA_prod.NODE_CONFIGURATION ADD COLUMN NODE_PROFILE VARCHAR(255) DEFAULT 'MIXED'; \ No newline at end of file diff --git a/code/common/process/build.gradle b/code/common/process/build.gradle deleted file mode 100644 index 51289ed4..00000000 --- a/code/common/process/build.gradle +++ /dev/null @@ -1,34 +0,0 @@ -plugins { - id 'java' - - id 'jvm-test-suite' -} - -java { - toolchain { - languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) - } -} - -apply from: "$rootProject.projectDir/srcsets.gradle" - -dependencies { - implementation libs.notnull - - implementation libs.bundles.slf4j - testImplementation libs.bundles.slf4j.test - - implementation libs.guava - implementation libs.guava - 
implementation dependencies.create(libs.guice.get()) { - exclude group: 'com.google.guava' - } - implementation libs.bundles.mariadb - implementation libs.commons.lang3 - - implementation libs.snakeyaml - - testImplementation libs.bundles.slf4j.test - testImplementation libs.bundles.junit - testImplementation libs.mockito -} diff --git a/code/common/process/java/nu/marginalia/process/control/ProcessAdHocTaskHeartbeat.java b/code/common/process/java/nu/marginalia/process/control/ProcessAdHocTaskHeartbeat.java deleted file mode 100644 index e4af6bcd..00000000 --- a/code/common/process/java/nu/marginalia/process/control/ProcessAdHocTaskHeartbeat.java +++ /dev/null @@ -1,7 +0,0 @@ -package nu.marginalia.process.control; - -public interface ProcessAdHocTaskHeartbeat extends AutoCloseable { - void progress(String step, int progress, int total); - - void close(); -} diff --git a/code/common/process/readme.md b/code/common/process/readme.md deleted file mode 100644 index 989a6193..00000000 --- a/code/common/process/readme.md +++ /dev/null @@ -1,4 +0,0 @@ -# Process - -Basic functionality for a Process. Processes must include this dependency to ensure -their loggers are configured properly! 
\ No newline at end of file diff --git a/code/common/process/resources/log4j2.properties b/code/common/process/resources/log4j2.properties deleted file mode 100644 index 18eaf147..00000000 --- a/code/common/process/resources/log4j2.properties +++ /dev/null @@ -1,9 +0,0 @@ -log4j2.isThreadContextMapInheritable=true -status = info -appender.console.type = Console -appender.console.name = LogToConsole -appender.console.layout.type = PatternLayout -appender.console.layout.pattern = %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %c{1}- %msg{nolookups}%n -appender.console.filter.http.type = MarkerFilter -rootLogger.level = info -rootLogger.appenderRef.console.ref = LogToConsole diff --git a/code/common/process/java/nu/marginalia/ProcessConfiguration.java b/code/common/service/java/nu/marginalia/process/ProcessConfiguration.java similarity index 78% rename from code/common/process/java/nu/marginalia/ProcessConfiguration.java rename to code/common/service/java/nu/marginalia/process/ProcessConfiguration.java index 35e1433f..45c3cf22 100644 --- a/code/common/process/java/nu/marginalia/ProcessConfiguration.java +++ b/code/common/service/java/nu/marginalia/process/ProcessConfiguration.java @@ -1,4 +1,4 @@ -package nu.marginalia; +package nu.marginalia.process; import java.util.UUID; diff --git a/code/common/process/java/nu/marginalia/ProcessConfigurationModule.java b/code/common/service/java/nu/marginalia/process/ProcessConfigurationModule.java similarity index 97% rename from code/common/process/java/nu/marginalia/ProcessConfigurationModule.java rename to code/common/service/java/nu/marginalia/process/ProcessConfigurationModule.java index 61214074..12784230 100644 --- a/code/common/process/java/nu/marginalia/ProcessConfigurationModule.java +++ b/code/common/service/java/nu/marginalia/process/ProcessConfigurationModule.java @@ -1,4 +1,4 @@ -package nu.marginalia; +package nu.marginalia.process; import com.google.inject.AbstractModule; import 
com.google.inject.name.Names; diff --git a/code/common/service/java/nu/marginalia/process/ProcessMainClass.java b/code/common/service/java/nu/marginalia/process/ProcessMainClass.java new file mode 100644 index 00000000..17758007 --- /dev/null +++ b/code/common/service/java/nu/marginalia/process/ProcessMainClass.java @@ -0,0 +1,102 @@ +package nu.marginalia.process; + +import com.google.gson.Gson; +import nu.marginalia.mq.MessageQueueFactory; +import nu.marginalia.mq.MqMessage; +import nu.marginalia.mq.inbox.MqInboxResponse; +import nu.marginalia.mq.inbox.MqSingleShotInbox; +import nu.marginalia.service.ConfigLoader; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.sql.SQLException; +import java.util.Optional; +import java.util.UUID; +import java.util.concurrent.TimeUnit; + +public abstract class ProcessMainClass { + private static final Logger logger = LoggerFactory.getLogger(ProcessMainClass.class); + + private final MessageQueueFactory messageQueueFactory; + private final int node; + private final String inboxName; + + static { + // Load global config ASAP + ConfigLoader.loadConfig( + ConfigLoader.getConfigPath("system") + ); + } + + private final Gson gson; + + public ProcessMainClass(MessageQueueFactory messageQueueFactory, + ProcessConfiguration config, + Gson gson, + String inboxName + ) { + this.gson = gson; + new org.mariadb.jdbc.Driver(); + this.messageQueueFactory = messageQueueFactory; + this.node = config.node(); + this.inboxName = inboxName; + } + + + protected Instructions fetchInstructions(Class requestType) throws Exception { + + var inbox = messageQueueFactory.createSingleShotInbox(inboxName, node, UUID.randomUUID()); + + logger.info("Waiting for instructions"); + + var msgOpt = getMessage(inbox, requestType.getSimpleName()); + var msg = msgOpt.orElseThrow(() -> new RuntimeException("No message received")); + + // for live crawl, request is empty for now + T request = gson.fromJson(msg.payload(), requestType); + + return 
new Instructions<>(msg, inbox, request); + } + + + private Optional getMessage(MqSingleShotInbox inbox, String expectedFunction) throws InterruptedException, SQLException { + var opt = inbox.waitForMessage(30, TimeUnit.SECONDS); + if (opt.isPresent()) { + if (!opt.get().function().equals(expectedFunction)) { + throw new RuntimeException("Unexpected function: " + opt.get().function()); + } + return opt; + } + else { + var stolenMessage = inbox.stealMessage(msg -> msg.function().equals(expectedFunction)); + stolenMessage.ifPresent(mqMessage -> logger.info("Stole message {}", mqMessage)); + return stolenMessage; + } + } + + + protected static class Instructions { + private final MqMessage message; + private final MqSingleShotInbox inbox; + private final T value; + Instructions(MqMessage message, MqSingleShotInbox inbox, T value) + { + this.message = message; + this.inbox = inbox; + this.value = value; + } + + public T value() { + return value; + } + + public void ok() { + inbox.sendResponse(message, MqInboxResponse.ok()); + } + public void err() { + inbox.sendResponse(message, MqInboxResponse.err()); + } + + } + +} diff --git a/code/common/process/java/nu/marginalia/process/control/FakeProcessHeartbeat.java b/code/common/service/java/nu/marginalia/process/control/FakeProcessHeartbeat.java similarity index 87% rename from code/common/process/java/nu/marginalia/process/control/FakeProcessHeartbeat.java rename to code/common/service/java/nu/marginalia/process/control/FakeProcessHeartbeat.java index 95d4345b..7b12b07c 100644 --- a/code/common/process/java/nu/marginalia/process/control/FakeProcessHeartbeat.java +++ b/code/common/service/java/nu/marginalia/process/control/FakeProcessHeartbeat.java @@ -3,6 +3,8 @@ package nu.marginalia.process.control; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.util.Collection; + /** Dummy implementation of ProcessHeartbeat that does nothing */ public class FakeProcessHeartbeat implements ProcessHeartbeat { private 
static final Logger logger = LoggerFactory.getLogger(FakeProcessHeartbeat.class); @@ -30,6 +32,11 @@ public class FakeProcessHeartbeat implements ProcessHeartbeat { logger.info("Progress: {}, {}/{}", step, progress, total); } + @Override + public Iterable wrap(String step, Collection collection) { + return collection; + } + @Override public void close() {} }; diff --git a/code/common/service/java/nu/marginalia/process/control/ProcessAdHocTaskHeartbeat.java b/code/common/service/java/nu/marginalia/process/control/ProcessAdHocTaskHeartbeat.java new file mode 100644 index 00000000..7ad81a3b --- /dev/null +++ b/code/common/service/java/nu/marginalia/process/control/ProcessAdHocTaskHeartbeat.java @@ -0,0 +1,12 @@ +package nu.marginalia.process.control; + +import java.util.Collection; + +public interface ProcessAdHocTaskHeartbeat extends AutoCloseable { + void progress(String step, int progress, int total); + + /** Wrap a collection to provide heartbeat progress updates as it's iterated through */ + Iterable wrap(String step, Collection collection); + + void close(); +} diff --git a/code/common/process/java/nu/marginalia/process/control/ProcessAdHocTaskHeartbeatImpl.java b/code/common/service/java/nu/marginalia/process/control/ProcessAdHocTaskHeartbeatImpl.java similarity index 84% rename from code/common/process/java/nu/marginalia/process/control/ProcessAdHocTaskHeartbeatImpl.java rename to code/common/service/java/nu/marginalia/process/control/ProcessAdHocTaskHeartbeatImpl.java index 9c0eeddf..1069446c 100644 --- a/code/common/process/java/nu/marginalia/process/control/ProcessAdHocTaskHeartbeatImpl.java +++ b/code/common/service/java/nu/marginalia/process/control/ProcessAdHocTaskHeartbeatImpl.java @@ -2,11 +2,13 @@ package nu.marginalia.process.control; import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.ProcessConfiguration; +import nu.marginalia.process.ProcessConfiguration; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import 
java.sql.SQLException; +import java.util.Collection; +import java.util.Iterator; import java.util.UUID; import java.util.concurrent.TimeUnit; @@ -69,6 +71,35 @@ public class ProcessAdHocTaskHeartbeatImpl implements AutoCloseable, ProcessAdHo logger.info("ProcessTask {} progress: {}%", taskBase, progress); } + /** Wrap a collection to provide heartbeat progress updates as it's iterated through */ + @Override + public Iterable wrap(String step, Collection collection) { + return () -> new Iterator<>() { + private final Iterator base = collection.iterator(); + private final int size = collection.size(); + private final int updateInterval = Math.max(1, size / 100); // update every 1% of the collection, or at least once + private int pos = 0; + + @Override + public boolean hasNext() { + boolean ret = base.hasNext(); + if (!ret) { + progress(step, size, size); + } + return ret; + } + + @Override + public T next() { + // update every 1% of the collection, to avoid hammering the database with updates + if (pos++ % updateInterval == 0) { + progress(step, pos, size); + } + return base.next(); + } + }; + } + public void shutDown() { if (!running) return; @@ -185,6 +216,5 @@ public class ProcessAdHocTaskHeartbeatImpl implements AutoCloseable, ProcessAdHo public void close() { shutDown(); } - } diff --git a/code/common/process/java/nu/marginalia/process/control/ProcessHeartbeat.java b/code/common/service/java/nu/marginalia/process/control/ProcessHeartbeat.java similarity index 100% rename from code/common/process/java/nu/marginalia/process/control/ProcessHeartbeat.java rename to code/common/service/java/nu/marginalia/process/control/ProcessHeartbeat.java diff --git a/code/common/process/java/nu/marginalia/process/control/ProcessHeartbeatImpl.java b/code/common/service/java/nu/marginalia/process/control/ProcessHeartbeatImpl.java similarity index 96% rename from code/common/process/java/nu/marginalia/process/control/ProcessHeartbeatImpl.java rename to 
code/common/service/java/nu/marginalia/process/control/ProcessHeartbeatImpl.java index 6f85d890..3e4bd948 100644 --- a/code/common/process/java/nu/marginalia/process/control/ProcessHeartbeatImpl.java +++ b/code/common/service/java/nu/marginalia/process/control/ProcessHeartbeatImpl.java @@ -4,17 +4,18 @@ package nu.marginalia.process.control; import com.google.inject.Inject; import com.google.inject.Singleton; import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.ProcessConfiguration; +import nu.marginalia.process.ProcessConfiguration; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.Closeable; import java.sql.SQLException; import java.util.concurrent.TimeUnit; /** This service sends a heartbeat to the database every 5 seconds. */ @Singleton -public class ProcessHeartbeatImpl implements ProcessHeartbeat { +public class ProcessHeartbeatImpl implements ProcessHeartbeat, Closeable { private final Logger logger = LoggerFactory.getLogger(ProcessHeartbeatImpl.class); private final String processName; private final String processBase; @@ -169,5 +170,9 @@ public class ProcessHeartbeatImpl implements ProcessHeartbeat { } } } + + public void close() { + shutDown(); + } } diff --git a/code/common/process/java/nu/marginalia/process/control/ProcessTaskHeartbeat.java b/code/common/service/java/nu/marginalia/process/control/ProcessTaskHeartbeat.java similarity index 100% rename from code/common/process/java/nu/marginalia/process/control/ProcessTaskHeartbeat.java rename to code/common/service/java/nu/marginalia/process/control/ProcessTaskHeartbeat.java diff --git a/code/common/process/java/nu/marginalia/process/control/ProcessTaskHeartbeatImpl.java b/code/common/service/java/nu/marginalia/process/control/ProcessTaskHeartbeatImpl.java similarity index 99% rename from code/common/process/java/nu/marginalia/process/control/ProcessTaskHeartbeatImpl.java rename to code/common/service/java/nu/marginalia/process/control/ProcessTaskHeartbeatImpl.java 
index f8d68b7e..2ea3da52 100644 --- a/code/common/process/java/nu/marginalia/process/control/ProcessTaskHeartbeatImpl.java +++ b/code/common/service/java/nu/marginalia/process/control/ProcessTaskHeartbeatImpl.java @@ -2,7 +2,7 @@ package nu.marginalia.process.control; import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.ProcessConfiguration; +import nu.marginalia.process.ProcessConfiguration; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/code/common/process/java/nu/marginalia/process/log/WorkLoadIterable.java b/code/common/service/java/nu/marginalia/process/log/WorkLoadIterable.java similarity index 100% rename from code/common/process/java/nu/marginalia/process/log/WorkLoadIterable.java rename to code/common/service/java/nu/marginalia/process/log/WorkLoadIterable.java diff --git a/code/common/process/java/nu/marginalia/process/log/WorkLog.java b/code/common/service/java/nu/marginalia/process/log/WorkLog.java similarity index 100% rename from code/common/process/java/nu/marginalia/process/log/WorkLog.java rename to code/common/service/java/nu/marginalia/process/log/WorkLog.java diff --git a/code/common/process/java/nu/marginalia/process/log/WorkLogEntry.java b/code/common/service/java/nu/marginalia/process/log/WorkLogEntry.java similarity index 100% rename from code/common/process/java/nu/marginalia/process/log/WorkLogEntry.java rename to code/common/service/java/nu/marginalia/process/log/WorkLogEntry.java diff --git a/code/common/service/java/nu/marginalia/service/ConfigLoader.java b/code/common/service/java/nu/marginalia/service/ConfigLoader.java index d5c9f627..b19fca65 100644 --- a/code/common/service/java/nu/marginalia/service/ConfigLoader.java +++ b/code/common/service/java/nu/marginalia/service/ConfigLoader.java @@ -9,11 +9,11 @@ import java.util.Properties; public class ConfigLoader { - static Path getConfigPath(String configName) { + public static Path getConfigPath(String configName) { return 
WmsaHome.getHomePath().resolve("conf/properties/" + configName + ".properties"); } - static void loadConfig(Path configPath) { + public static void loadConfig(Path configPath) { if (!Files.exists(configPath)) { System.err.println("No config file found at " + configPath); return; diff --git a/code/common/service/java/nu/marginalia/service/ProcessMainClass.java b/code/common/service/java/nu/marginalia/service/ProcessMainClass.java deleted file mode 100644 index 6f66b57d..00000000 --- a/code/common/service/java/nu/marginalia/service/ProcessMainClass.java +++ /dev/null @@ -1,20 +0,0 @@ -package nu.marginalia.service; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public abstract class ProcessMainClass { - private static final Logger logger = LoggerFactory.getLogger(ProcessMainClass.class); - - static { - // Load global config ASAP - ConfigLoader.loadConfig( - ConfigLoader.getConfigPath("system") - ); - } - - public ProcessMainClass() { - new org.mariadb.jdbc.Driver(); - } - -} diff --git a/code/common/service/java/nu/marginalia/service/ServiceId.java b/code/common/service/java/nu/marginalia/service/ServiceId.java index 2c09dff4..664c146a 100644 --- a/code/common/service/java/nu/marginalia/service/ServiceId.java +++ b/code/common/service/java/nu/marginalia/service/ServiceId.java @@ -13,7 +13,10 @@ public enum ServiceId { Dating("dating-service"), Status("setatus-service"), - Explorer("explorer-service"); + Explorer("explorer-service"), + + NOT_A_SERVICE("NOT_A_SERVICE") + ; public final String serviceName; diff --git a/code/common/service/java/nu/marginalia/service/server/NodeStatusWatcher.java b/code/common/service/java/nu/marginalia/service/server/NodeStatusWatcher.java index 920de3db..23c99020 100644 --- a/code/common/service/java/nu/marginalia/service/server/NodeStatusWatcher.java +++ b/code/common/service/java/nu/marginalia/service/server/NodeStatusWatcher.java @@ -4,6 +4,7 @@ import com.google.inject.Inject; import com.google.inject.name.Named; 
import nu.marginalia.mq.persistence.MqPersistence; import nu.marginalia.nodecfg.NodeConfigurationService; +import nu.marginalia.nodecfg.model.NodeProfile; import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.model.FileStorageBaseType; import org.slf4j.Logger; @@ -56,7 +57,9 @@ public class NodeStatusWatcher { private void setupNode() { try { - configurationService.create(nodeId, "Node " + nodeId, true, false); + NodeProfile profile = NodeProfile.MIXED; + + configurationService.create(nodeId, "Node " + nodeId, true, false, profile); fileStorageService.createStorageBase("Index Data", Path.of("/idx"), nodeId, FileStorageBaseType.CURRENT); fileStorageService.createStorageBase("Index Backups", Path.of("/backup"), nodeId, FileStorageBaseType.BACKUP); diff --git a/code/common/process/test/nu/marginalia/process/log/WorkLogTest.java b/code/common/service/test/nu/marginalia/process/log/WorkLogTest.java similarity index 100% rename from code/common/process/test/nu/marginalia/process/log/WorkLogTest.java rename to code/common/service/test/nu/marginalia/process/log/WorkLogTest.java diff --git a/code/execution/api/java/nu/marginalia/executor/client/ExecutorClient.java b/code/execution/api/java/nu/marginalia/executor/client/ExecutorClient.java index 020c29b0..5670c827 100644 --- a/code/execution/api/java/nu/marginalia/executor/client/ExecutorClient.java +++ b/code/execution/api/java/nu/marginalia/executor/client/ExecutorClient.java @@ -182,4 +182,10 @@ public class ExecutorClient { } } + public void restartExecutorService(int node) { + channelPool.call(ExecutorApiBlockingStub::restartExecutorService) + .forNode(node) + .run(Empty.getDefaultInstance()); + } + } diff --git a/code/execution/api/src/main/protobuf/executor-api.proto b/code/execution/api/src/main/protobuf/executor-api.proto index 667fe70d..b3ab1d41 100644 --- a/code/execution/api/src/main/protobuf/executor-api.proto +++ b/code/execution/api/src/main/protobuf/executor-api.proto @@ -17,6 +17,8 @@ 
service ExecutorApi { rpc downloadSampleData(RpcDownloadSampleData) returns (Empty) {} rpc calculateAdjacencies(Empty) returns (Empty) {} rpc restoreBackup(RpcFileStorageId) returns (Empty) {} + + rpc restartExecutorService(Empty) returns (Empty) {} } service ExecutorCrawlApi { diff --git a/code/execution/build.gradle b/code/execution/build.gradle index 4a28f540..641642db 100644 --- a/code/execution/build.gradle +++ b/code/execution/build.gradle @@ -15,15 +15,15 @@ dependencies { // These look weird but they're needed to be able to spawn the processes // from the executor service - implementation project(':code:processes:website-adjacencies-calculator') + implementation project(':code:processes:export-task-process') implementation project(':code:processes:crawling-process') + implementation project(':code:processes:live-crawling-process') implementation project(':code:processes:loading-process') implementation project(':code:processes:converting-process') implementation project(':code:processes:index-constructor-process') implementation project(':code:common:config') implementation project(':code:common:model') - implementation project(':code:common:process') implementation project(':code:common:db') implementation project(':code:common:linkdb') @@ -42,7 +42,6 @@ dependencies { implementation project(':code:processes:crawling-process:model') implementation project(':code:processes:crawling-process:model') implementation project(':code:processes:crawling-process:ft-link-parser') - implementation project(':code:execution:data-extractors') implementation project(':code:index:index-journal') implementation project(':code:index:api') implementation project(':code:processes:process-mq-api') diff --git a/code/execution/data-extractors/readme.md b/code/execution/data-extractors/readme.md deleted file mode 100644 index ea318e9f..00000000 --- a/code/execution/data-extractors/readme.md +++ /dev/null @@ -1,7 +0,0 @@ -Contains converter-*like* extraction jobs that operate on 
crawled data to produce export files. - -## Important classes - -* [AtagExporter](java/nu/marginalia/extractor/AtagExporter.java) - extracts anchor texts from the crawled data. -* [FeedExporter](java/nu/marginalia/extractor/FeedExporter.java) - tries to find RSS/Atom feeds within the crawled data. -* [TermFrequencyExporter](java/nu/marginalia/extractor/TermFrequencyExporter.java) - exports the 'TF' part of TF-IDF. \ No newline at end of file diff --git a/code/execution/java/nu/marginalia/actor/ExecutorActor.java b/code/execution/java/nu/marginalia/actor/ExecutorActor.java index e3b2308b..c4bd43fd 100644 --- a/code/execution/java/nu/marginalia/actor/ExecutorActor.java +++ b/code/execution/java/nu/marginalia/actor/ExecutorActor.java @@ -1,29 +1,38 @@ package nu.marginalia.actor; +import nu.marginalia.nodecfg.model.NodeProfile; + +import java.util.Set; + public enum ExecutorActor { - CRAWL, - RECRAWL, - RECRAWL_SINGLE_DOMAIN, - CONVERT_AND_LOAD, - PROC_CONVERTER_SPAWNER, - PROC_LOADER_SPAWNER, - PROC_CRAWLER_SPAWNER, - MONITOR_PROCESS_LIVENESS, - MONITOR_FILE_STORAGE, - ADJACENCY_CALCULATION, - CRAWL_JOB_EXTRACTOR, - EXPORT_DATA, - EXPORT_SEGMENTATION_MODEL, - EXPORT_ATAGS, - EXPORT_TERM_FREQUENCIES, - EXPORT_FEEDS, - PROC_INDEX_CONSTRUCTOR_SPAWNER, - CONVERT, - RESTORE_BACKUP, - EXPORT_SAMPLE_DATA, - DOWNLOAD_SAMPLE, - SCRAPE_FEEDS, - UPDATE_RSS; + CRAWL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED), + RECRAWL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED), + RECRAWL_SINGLE_DOMAIN(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED), + PROC_CONVERTER_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED), + PROC_CRAWLER_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED), + PROC_EXPORT_TASKS_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED), + ADJACENCY_CALCULATION(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED), + EXPORT_DATA(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED), + EXPORT_SEGMENTATION_MODEL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED), + EXPORT_ATAGS(NodeProfile.BATCH_CRAWL, 
NodeProfile.MIXED), + EXPORT_TERM_FREQUENCIES(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED), + EXPORT_FEEDS(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED), + EXPORT_SAMPLE_DATA(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED), + DOWNLOAD_SAMPLE(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED), + + PROC_LOADER_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.SIDELOAD), + RESTORE_BACKUP(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.SIDELOAD), + CONVERT(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.SIDELOAD), + + CONVERT_AND_LOAD(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.REALTIME, NodeProfile.SIDELOAD), + MONITOR_PROCESS_LIVENESS(NodeProfile.BATCH_CRAWL, NodeProfile.REALTIME, NodeProfile.MIXED, NodeProfile.SIDELOAD), + MONITOR_FILE_STORAGE(NodeProfile.BATCH_CRAWL, NodeProfile.REALTIME, NodeProfile.MIXED, NodeProfile.SIDELOAD), + PROC_INDEX_CONSTRUCTOR_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.REALTIME, NodeProfile.MIXED, NodeProfile.SIDELOAD), + + LIVE_CRAWL(NodeProfile.REALTIME), + PROC_LIVE_CRAWL_SPAWNER(NodeProfile.REALTIME), + SCRAPE_FEEDS(NodeProfile.REALTIME), + UPDATE_RSS(NodeProfile.REALTIME); public String id() { return "fsm:" + name().toLowerCase(); @@ -33,4 +42,9 @@ public enum ExecutorActor { return "fsm:" + name().toLowerCase() + ":" + node; } + ExecutorActor(NodeProfile... 
profileSet) { + this.profileSet = Set.of(profileSet); + } + + public Set profileSet; } diff --git a/code/execution/java/nu/marginalia/actor/ExecutorActorControlService.java b/code/execution/java/nu/marginalia/actor/ExecutorActorControlService.java index 86f99455..7cb3e91c 100644 --- a/code/execution/java/nu/marginalia/actor/ExecutorActorControlService.java +++ b/code/execution/java/nu/marginalia/actor/ExecutorActorControlService.java @@ -10,11 +10,14 @@ import nu.marginalia.actor.state.ActorStateInstance; import nu.marginalia.actor.state.ActorStep; import nu.marginalia.actor.task.*; import nu.marginalia.mq.MessageQueueFactory; +import nu.marginalia.nodecfg.NodeConfigurationService; +import nu.marginalia.nodecfg.model.NodeConfiguration; import nu.marginalia.service.control.ServiceEventLog; import nu.marginalia.service.server.BaseServiceParams; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.sql.SQLException; import java.util.HashMap; import java.util.Map; import java.util.UUID; @@ -28,18 +31,23 @@ public class ExecutorActorControlService { public Map actorDefinitions = new HashMap<>(); private final int node; + private final NodeConfiguration nodeConfiguration; + private final Logger logger = LoggerFactory.getLogger(getClass()); @Inject public ExecutorActorControlService(MessageQueueFactory messageQueueFactory, + NodeConfigurationService configurationService, BaseServiceParams baseServiceParams, ConvertActor convertActor, ConvertAndLoadActor convertAndLoadActor, CrawlActor crawlActor, + LiveCrawlActor liveCrawlActor, RecrawlSingleDomainActor recrawlSingleDomainActor, RestoreBackupActor restoreBackupActor, ConverterMonitorActor converterMonitorFSM, CrawlerMonitorActor crawlerMonitorActor, + LiveCrawlerMonitorActor liveCrawlerMonitorActor, LoaderMonitorActor loaderMonitor, ProcessLivenessMonitorActor processMonitorFSM, FileStorageMonitorActor fileStorageMonitorActor, @@ -51,16 +59,20 @@ public class ExecutorActorControlService { 
ExportSampleDataActor exportSampleDataActor, ExportTermFreqActor exportTermFrequenciesActor, ExportSegmentationModelActor exportSegmentationModelActor, + ExportTaskMonitorActor exportTasksMonitorActor, DownloadSampleActor downloadSampleActor, ScrapeFeedsActor scrapeFeedsActor, ExecutorActorStateMachines stateMachines, - UpdateRssActor updateRssActor) { + UpdateRssActor updateRssActor) throws SQLException { this.messageQueueFactory = messageQueueFactory; this.eventLog = baseServiceParams.eventLog; this.stateMachines = stateMachines; this.node = baseServiceParams.configuration.node(); + this.nodeConfiguration = configurationService.get(node); + register(ExecutorActor.CRAWL, crawlActor); + register(ExecutorActor.LIVE_CRAWL, liveCrawlActor); register(ExecutorActor.RECRAWL_SINGLE_DOMAIN, recrawlSingleDomainActor); register(ExecutorActor.CONVERT, convertActor); @@ -71,6 +83,8 @@ public class ExecutorActorControlService { register(ExecutorActor.PROC_CONVERTER_SPAWNER, converterMonitorFSM); register(ExecutorActor.PROC_LOADER_SPAWNER, loaderMonitor); register(ExecutorActor.PROC_CRAWLER_SPAWNER, crawlerMonitorActor); + register(ExecutorActor.PROC_LIVE_CRAWL_SPAWNER, liveCrawlerMonitorActor); + register(ExecutorActor.PROC_EXPORT_TASKS_SPAWNER, exportTasksMonitorActor); register(ExecutorActor.MONITOR_PROCESS_LIVENESS, processMonitorFSM); register(ExecutorActor.MONITOR_FILE_STORAGE, fileStorageMonitorActor); @@ -91,6 +105,11 @@ public class ExecutorActorControlService { } private void register(ExecutorActor process, RecordActorPrototype graph) { + + if (!process.profileSet.contains(nodeConfiguration.profile())) { + return; + } + var sm = new ActorStateMachine(messageQueueFactory, process.id(), node, UUID.randomUUID(), graph); sm.listen((function, param) -> logStateChange(process, function)); diff --git a/code/execution/java/nu/marginalia/actor/proc/ExportTaskMonitorActor.java b/code/execution/java/nu/marginalia/actor/proc/ExportTaskMonitorActor.java new file mode 100644 index 
00000000..b07f80e8 --- /dev/null +++ b/code/execution/java/nu/marginalia/actor/proc/ExportTaskMonitorActor.java @@ -0,0 +1,29 @@ +package nu.marginalia.actor.proc; + +import com.google.gson.Gson; +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor; +import nu.marginalia.mq.persistence.MqPersistence; +import nu.marginalia.mqapi.ProcessInboxNames; +import nu.marginalia.process.ProcessService; +import nu.marginalia.service.module.ServiceConfiguration; + +@Singleton +public class ExportTaskMonitorActor extends AbstractProcessSpawnerActor { + + @Inject + public ExportTaskMonitorActor(Gson gson, + ServiceConfiguration configuration, + MqPersistence persistence, + ProcessService processService) { + super(gson, + configuration, + persistence, + processService, + ProcessInboxNames.EXPORT_TASK_INBOX, + ProcessService.ProcessId.EXPORT_TASKS); + } + + +} diff --git a/code/execution/java/nu/marginalia/actor/proc/LiveCrawlerMonitorActor.java b/code/execution/java/nu/marginalia/actor/proc/LiveCrawlerMonitorActor.java new file mode 100644 index 00000000..5b0eb69a --- /dev/null +++ b/code/execution/java/nu/marginalia/actor/proc/LiveCrawlerMonitorActor.java @@ -0,0 +1,29 @@ +package nu.marginalia.actor.proc; + +import com.google.gson.Gson; +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor; +import nu.marginalia.mq.persistence.MqPersistence; +import nu.marginalia.mqapi.ProcessInboxNames; +import nu.marginalia.process.ProcessService; +import nu.marginalia.service.module.ServiceConfiguration; + +@Singleton +public class LiveCrawlerMonitorActor extends AbstractProcessSpawnerActor { + + @Inject + public LiveCrawlerMonitorActor(Gson gson, + ServiceConfiguration configuration, + MqPersistence persistence, + ProcessService processService) { + super(gson, + configuration, + persistence, + processService, + 
ProcessInboxNames.LIVE_CRAWLER_INBOX, + ProcessService.ProcessId.LIVE_CRAWLER); + } + + +} diff --git a/code/execution/java/nu/marginalia/actor/proc/ScrapeFeedsActor.java b/code/execution/java/nu/marginalia/actor/proc/ScrapeFeedsActor.java index 0fbd75e2..d6e2029f 100644 --- a/code/execution/java/nu/marginalia/actor/proc/ScrapeFeedsActor.java +++ b/code/execution/java/nu/marginalia/actor/proc/ScrapeFeedsActor.java @@ -10,6 +10,8 @@ import nu.marginalia.actor.state.ActorResumeBehavior; import nu.marginalia.actor.state.ActorStep; import nu.marginalia.actor.state.Resume; import nu.marginalia.model.EdgeDomain; +import nu.marginalia.nodecfg.NodeConfigurationService; +import nu.marginalia.nodecfg.model.NodeProfile; import nu.marginalia.service.control.ServiceEventLog; import nu.marginalia.service.module.ServiceConfiguration; import org.jsoup.Jsoup; @@ -39,6 +41,7 @@ public class ScrapeFeedsActor extends RecordActorPrototype { private final Duration pollInterval = Duration.ofHours(6); private final ServiceEventLog eventLog; + private final NodeConfigurationService nodeConfigurationService; private final HikariDataSource dataSource; private final int nodeId; @@ -54,8 +57,8 @@ public class ScrapeFeedsActor extends RecordActorPrototype { public ActorStep transition(ActorStep self) throws Exception { return switch(self) { case Initial() -> { - if (nodeId > 1) { - yield new End(); + if (nodeConfigurationService.get(nodeId).profile() != NodeProfile.REALTIME) { + yield new Error("Invalid node profile for RSS update"); } else { yield new Wait(LocalDateTime.now().toString()); @@ -177,10 +180,12 @@ public class ScrapeFeedsActor extends RecordActorPrototype { public ScrapeFeedsActor(Gson gson, ServiceEventLog eventLog, ServiceConfiguration configuration, + NodeConfigurationService nodeConfigurationService, HikariDataSource dataSource) { super(gson); this.eventLog = eventLog; + this.nodeConfigurationService = nodeConfigurationService; this.dataSource = dataSource; this.nodeId = 
configuration.node(); } diff --git a/code/execution/java/nu/marginalia/actor/proc/UpdateRssActor.java b/code/execution/java/nu/marginalia/actor/proc/UpdateRssActor.java index 3ccad794..344f2920 100644 --- a/code/execution/java/nu/marginalia/actor/proc/UpdateRssActor.java +++ b/code/execution/java/nu/marginalia/actor/proc/UpdateRssActor.java @@ -11,6 +11,8 @@ import nu.marginalia.api.feeds.RpcFeedUpdateMode; import nu.marginalia.mq.MqMessage; import nu.marginalia.mq.MqMessageState; import nu.marginalia.mq.persistence.MqPersistence; +import nu.marginalia.nodecfg.NodeConfigurationService; +import nu.marginalia.nodecfg.model.NodeProfile; import nu.marginalia.service.module.ServiceConfiguration; import java.time.Duration; @@ -25,13 +27,19 @@ public class UpdateRssActor extends RecordActorPrototype { private final Duration updateInterval = Duration.ofHours(24); private final int cleanInterval = 60; + private final NodeConfigurationService nodeConfigurationService; private final MqPersistence persistence; @Inject - public UpdateRssActor(Gson gson, FeedsClient feedsClient, ServiceConfiguration serviceConfiguration, MqPersistence persistence) { + public UpdateRssActor(Gson gson, + FeedsClient feedsClient, + ServiceConfiguration serviceConfiguration, + NodeConfigurationService nodeConfigurationService, + MqPersistence persistence) { super(gson); this.feedsClient = feedsClient; this.nodeId = serviceConfiguration.node(); + this.nodeConfigurationService = nodeConfigurationService; this.persistence = persistence; } @@ -55,9 +63,8 @@ public class UpdateRssActor extends RecordActorPrototype { public ActorStep transition(ActorStep self) throws Exception { return switch (self) { case Initial() -> { - if (nodeId > 1) { - // Only run on the first node - yield new End(); + if (nodeConfigurationService.get(nodeId).profile() != NodeProfile.REALTIME) { + yield new Error("Invalid node profile for RSS update"); } else { // Wait for 5 minutes before starting the first update, to give the 
system time to start up properly diff --git a/code/execution/java/nu/marginalia/actor/task/ExportAtagsActor.java b/code/execution/java/nu/marginalia/actor/task/ExportAtagsActor.java index 5323302b..e8b4f341 100644 --- a/code/execution/java/nu/marginalia/actor/task/ExportAtagsActor.java +++ b/code/execution/java/nu/marginalia/actor/task/ExportAtagsActor.java @@ -3,50 +3,68 @@ package nu.marginalia.actor.task; import com.google.gson.Gson; import com.google.inject.Inject; import com.google.inject.Singleton; -import nu.marginalia.extractor.AtagExporter; -import nu.marginalia.extractor.ExporterIf; -import nu.marginalia.storage.model.*; import nu.marginalia.actor.prototype.RecordActorPrototype; import nu.marginalia.actor.state.ActorStep; +import nu.marginalia.mq.MqMessageState; +import nu.marginalia.mq.outbox.MqOutbox; +import nu.marginalia.mqapi.tasks.ExportTaskRequest; +import nu.marginalia.process.ProcessOutboxes; +import nu.marginalia.process.ProcessService; import nu.marginalia.storage.FileStorageService; +import nu.marginalia.storage.model.FileStorageId; +import nu.marginalia.storage.model.FileStorageState; +import nu.marginalia.storage.model.FileStorageType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.time.LocalDateTime; @Singleton public class ExportAtagsActor extends RecordActorPrototype { private final FileStorageService storageService; - private final ExporterIf atagExporter; + private final ActorProcessWatcher processWatcher; + private final MqOutbox exportTasksOutbox; + private final Logger logger = LoggerFactory.getLogger(getClass()); public record Export(FileStorageId crawlId) implements ActorStep {} - public record Run(FileStorageId crawlId, FileStorageId destId) implements ActorStep {} + public record Run(FileStorageId crawlId, FileStorageId destId, long msgId) implements ActorStep { + public Run(FileStorageId crawlId, FileStorageId destId) { + this(crawlId, destId, -1); + } + } + @Override public ActorStep transition(ActorStep 
self) throws Exception { return switch(self) { case Export(FileStorageId crawlId) -> { - var storage = storageService.allocateStorage(FileStorageType.EXPORT, "atag-export", "Anchor Tags " + LocalDateTime.now()); + var storage = storageService.allocateStorage(FileStorageType.EXPORT, "atags-export", "Atags " + LocalDateTime.now()); if (storage == null) yield new Error("Bad storage id"); yield new Run(crawlId, storage.id()); } - case Run(FileStorageId crawlId, FileStorageId destId) -> { + case Run(FileStorageId crawlId, FileStorageId destId, long msgId) when msgId < 0 -> { storageService.setFileStorageState(destId, FileStorageState.NEW); - try { - atagExporter.export(crawlId, destId); - storageService.setFileStorageState(destId, FileStorageState.UNSET); - } - catch (Exception ex) { - storageService.setFileStorageState(destId, FileStorageState.DELETE); - yield new Error("Failed to export data"); - } - - yield new End(); + long newMsgId = exportTasksOutbox.sendAsync(ExportTaskRequest.atags(crawlId, destId)); + yield new Run(crawlId, destId, newMsgId); } + case Run(_, FileStorageId destId, long msgId) -> { + var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId); + + if (rsp.state() != MqMessageState.OK) { + storageService.flagFileForDeletion(destId); + yield new Error("Exporter failed"); + } + else { + storageService.setFileStorageState(destId, FileStorageState.UNSET); + yield new End(); + } + } + default -> new Error(); }; } - @Override public String describe() { return "Export anchor tags from crawl data"; @@ -55,11 +73,13 @@ public class ExportAtagsActor extends RecordActorPrototype { @Inject public ExportAtagsActor(Gson gson, FileStorageService storageService, - AtagExporter atagExporter) + ProcessOutboxes processOutboxes, + ActorProcessWatcher processWatcher) { super(gson); + this.exportTasksOutbox = processOutboxes.getExportTasksOutbox(); this.storageService = storageService; - this.atagExporter = atagExporter; + 
this.processWatcher = processWatcher; } } diff --git a/code/execution/java/nu/marginalia/actor/task/ExportFeedsActor.java b/code/execution/java/nu/marginalia/actor/task/ExportFeedsActor.java index faaaf528..7a0d612c 100644 --- a/code/execution/java/nu/marginalia/actor/task/ExportFeedsActor.java +++ b/code/execution/java/nu/marginalia/actor/task/ExportFeedsActor.java @@ -5,8 +5,11 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.actor.prototype.RecordActorPrototype; import nu.marginalia.actor.state.ActorStep; -import nu.marginalia.extractor.ExporterIf; -import nu.marginalia.extractor.FeedExporter; +import nu.marginalia.mq.MqMessageState; +import nu.marginalia.mq.outbox.MqOutbox; +import nu.marginalia.mqapi.tasks.ExportTaskRequest; +import nu.marginalia.process.ProcessOutboxes; +import nu.marginalia.process.ProcessService; import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.model.FileStorageId; import nu.marginalia.storage.model.FileStorageState; @@ -19,11 +22,17 @@ import java.time.LocalDateTime; @Singleton public class ExportFeedsActor extends RecordActorPrototype { private final FileStorageService storageService; + private final ActorProcessWatcher processWatcher; + private final MqOutbox exportTasksOutbox; private final Logger logger = LoggerFactory.getLogger(getClass()); - private final ExporterIf feedExporter; public record Export(FileStorageId crawlId) implements ActorStep {} - public record Run(FileStorageId crawlId, FileStorageId destId) implements ActorStep {} + public record Run(FileStorageId crawlId, FileStorageId destId, long msgId) implements ActorStep { + public Run(FileStorageId crawlId, FileStorageId destId) { + this(crawlId, destId, -1); + } + } + @Override public ActorStep transition(ActorStep self) throws Exception { return switch(self) { @@ -33,20 +42,25 @@ public class ExportFeedsActor extends RecordActorPrototype { if (storage == null) yield new Error("Bad storage id"); yield 
new Run(crawlId, storage.id()); } - case Run(FileStorageId crawlId, FileStorageId destId) -> { + case Run(FileStorageId crawlId, FileStorageId destId, long msgId) when msgId < 0 -> { storageService.setFileStorageState(destId, FileStorageState.NEW); - try { - feedExporter.export(crawlId, destId); - storageService.setFileStorageState(destId, FileStorageState.UNSET); - } - catch (Exception ex) { - storageService.setFileStorageState(destId, FileStorageState.DELETE); - yield new Error("Failed to export data"); - } - - yield new End(); + long newMsgId = exportTasksOutbox.sendAsync(ExportTaskRequest.feeds(crawlId, destId)); + yield new Run(crawlId, destId, newMsgId); } + case Run(_, FileStorageId destId, long msgId) -> { + var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId); + + if (rsp.state() != MqMessageState.OK) { + storageService.flagFileForDeletion(destId); + yield new Error("Exporter failed"); + } + else { + storageService.setFileStorageState(destId, FileStorageState.UNSET); + yield new End(); + } + } + default -> new Error(); }; } @@ -60,11 +74,13 @@ public class ExportFeedsActor extends RecordActorPrototype { @Inject public ExportFeedsActor(Gson gson, FileStorageService storageService, - FeedExporter feedExporter) + ActorProcessWatcher processWatcher, + ProcessOutboxes outboxes) { super(gson); this.storageService = storageService; - this.feedExporter = feedExporter; + this.processWatcher = processWatcher; + this.exportTasksOutbox = outboxes.getExportTasksOutbox(); } } diff --git a/code/execution/java/nu/marginalia/actor/task/ExportSampleDataActor.java b/code/execution/java/nu/marginalia/actor/task/ExportSampleDataActor.java index e47a803e..fa63a523 100644 --- a/code/execution/java/nu/marginalia/actor/task/ExportSampleDataActor.java +++ b/code/execution/java/nu/marginalia/actor/task/ExportSampleDataActor.java @@ -5,7 +5,11 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import 
nu.marginalia.actor.prototype.RecordActorPrototype; import nu.marginalia.actor.state.ActorStep; -import nu.marginalia.extractor.SampleDataExporter; +import nu.marginalia.mq.MqMessageState; +import nu.marginalia.mq.outbox.MqOutbox; +import nu.marginalia.mqapi.tasks.ExportTaskRequest; +import nu.marginalia.process.ProcessOutboxes; +import nu.marginalia.process.ProcessService; import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.model.FileStorageId; import nu.marginalia.storage.model.FileStorageState; @@ -18,11 +22,17 @@ import java.time.LocalDateTime; @Singleton public class ExportSampleDataActor extends RecordActorPrototype { private final FileStorageService storageService; + private final ActorProcessWatcher processWatcher; + private final MqOutbox exportTasksOutbox; private final Logger logger = LoggerFactory.getLogger(getClass()); - private final SampleDataExporter dataExporter; public record Export(FileStorageId crawlId, int size, String name) implements ActorStep {} - public record Run(FileStorageId crawlId, FileStorageId destId, int size, String name) implements ActorStep {} + public record Run(FileStorageId crawlId, FileStorageId destId, int size, String name, long msgId) implements ActorStep { + public Run(FileStorageId crawlId, FileStorageId destId, int size, String name) { + this(crawlId, destId, size, name, -1); + } + } + @Override public ActorStep transition(ActorStep self) throws Exception { return switch(self) { @@ -35,28 +45,29 @@ public class ExportSampleDataActor extends RecordActorPrototype { if (storage == null) yield new Error("Bad storage id"); yield new Run(crawlId, storage.id(), size, name); } - case Run(FileStorageId crawlId, FileStorageId destId, int size, String name) -> { + case Run(FileStorageId crawlId, FileStorageId destId, int size, String name, long msgId) when msgId < 0 -> { storageService.setFileStorageState(destId, FileStorageState.NEW); - try { - dataExporter.export(crawlId, destId, size, name); - 
storageService.setFileStorageState(destId, FileStorageState.UNSET); - } - catch (Exception ex) { - storageService.setFileStorageState(destId, FileStorageState.DELETE); - - logger.error("Failed to export data", ex); - - yield new Error("Failed to export data"); - } - - yield new End(); + long newMsgId = exportTasksOutbox.sendAsync(ExportTaskRequest.sampleData(crawlId, destId, size, name)); + yield new Run(crawlId, destId, size, name, newMsgId); } + case Run(_, FileStorageId destId, _, _, long msgId) -> { + var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId); + + if (rsp.state() != MqMessageState.OK) { + storageService.flagFileForDeletion(destId); + yield new Error("Exporter failed"); + } + else { + storageService.setFileStorageState(destId, FileStorageState.UNSET); + yield new End(); + } + } + default -> new Error(); }; } - @Override public String describe() { return "Export RSS/Atom feeds from crawl data"; @@ -65,11 +76,13 @@ public class ExportSampleDataActor extends RecordActorPrototype { @Inject public ExportSampleDataActor(Gson gson, FileStorageService storageService, - SampleDataExporter dataExporter) + ProcessOutboxes processOutboxes, + ActorProcessWatcher processWatcher) { super(gson); this.storageService = storageService; - this.dataExporter = dataExporter; + this.processWatcher = processWatcher; + this.exportTasksOutbox = processOutboxes.getExportTasksOutbox(); } } diff --git a/code/execution/java/nu/marginalia/actor/task/ExportTermFreqActor.java b/code/execution/java/nu/marginalia/actor/task/ExportTermFreqActor.java index a9191060..d4c50686 100644 --- a/code/execution/java/nu/marginalia/actor/task/ExportTermFreqActor.java +++ b/code/execution/java/nu/marginalia/actor/task/ExportTermFreqActor.java @@ -5,45 +5,62 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.actor.prototype.RecordActorPrototype; import nu.marginalia.actor.state.ActorStep; -import 
nu.marginalia.extractor.ExporterIf; -import nu.marginalia.extractor.TermFrequencyExporter; +import nu.marginalia.mq.MqMessageState; +import nu.marginalia.mq.outbox.MqOutbox; +import nu.marginalia.mqapi.tasks.ExportTaskRequest; +import nu.marginalia.process.ProcessOutboxes; +import nu.marginalia.process.ProcessService; import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.model.FileStorageId; import nu.marginalia.storage.model.FileStorageState; import nu.marginalia.storage.model.FileStorageType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.time.LocalDateTime; @Singleton public class ExportTermFreqActor extends RecordActorPrototype { private final FileStorageService storageService; - private final ExporterIf exporter; + private final ActorProcessWatcher processWatcher; + private final MqOutbox exportTasksOutbox; + private final Logger logger = LoggerFactory.getLogger(getClass()); + public record Export(FileStorageId crawlId) implements ActorStep {} - public record Run(FileStorageId crawlId, FileStorageId destId) implements ActorStep {} + public record Run(FileStorageId crawlId, FileStorageId destId, long msgId) implements ActorStep { + public Run(FileStorageId crawlId, FileStorageId destId) { + this(crawlId, destId, -1); + } + } @Override public ActorStep transition(ActorStep self) throws Exception { return switch(self) { case Export(FileStorageId crawlId) -> { - var storage = storageService.allocateStorage(FileStorageType.EXPORT, "term-freq-export", "Term Frequencies " + LocalDateTime.now()); + var storage = storageService.allocateStorage(FileStorageType.EXPORT, "term-freq", "Term Frequencies " + LocalDateTime.now()); if (storage == null) yield new Error("Bad storage id"); yield new Run(crawlId, storage.id()); } - case Run(FileStorageId crawlId, FileStorageId destId) -> { + case Run(FileStorageId crawlId, FileStorageId destId, long msgId) when msgId < 0 -> { storageService.setFileStorageState(destId, 
FileStorageState.NEW); - try { - exporter.export(crawlId, destId); - storageService.setFileStorageState(destId, FileStorageState.UNSET); - } - catch (Exception ex) { - storageService.setFileStorageState(destId, FileStorageState.DELETE); - yield new Error("Failed to export data"); - } - - yield new End(); + long newMsgId = exportTasksOutbox.sendAsync(ExportTaskRequest.termFreq(crawlId, destId)); + yield new Run(crawlId, destId, newMsgId); } + case Run(_, FileStorageId destId, long msgId) -> { + var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId); + + if (rsp.state() != MqMessageState.OK) { + storageService.flagFileForDeletion(destId); + yield new Error("Exporter failed"); + } + else { + storageService.setFileStorageState(destId, FileStorageState.UNSET); + yield new End(); + } + } + default -> new Error(); }; } @@ -57,11 +74,13 @@ public class ExportTermFreqActor extends RecordActorPrototype { @Inject public ExportTermFreqActor(Gson gson, FileStorageService storageService, - TermFrequencyExporter exporter) + ProcessOutboxes processOutboxes, + ActorProcessWatcher processWatcher) { super(gson); this.storageService = storageService; - this.exporter = exporter; + this.processWatcher = processWatcher; + this.exportTasksOutbox = processOutboxes.getExportTasksOutbox(); } } diff --git a/code/execution/java/nu/marginalia/actor/task/LiveCrawlActor.java b/code/execution/java/nu/marginalia/actor/task/LiveCrawlActor.java new file mode 100644 index 00000000..9c159fc8 --- /dev/null +++ b/code/execution/java/nu/marginalia/actor/task/LiveCrawlActor.java @@ -0,0 +1,108 @@ +package nu.marginalia.actor.task; + +import com.google.gson.Gson; +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.IndexLocations; +import nu.marginalia.actor.ExecutorActor; +import nu.marginalia.actor.ExecutorActorStateMachines; +import nu.marginalia.actor.prototype.RecordActorPrototype; +import 
nu.marginalia.actor.state.ActorStep; +import nu.marginalia.api.feeds.FeedsClient; +import nu.marginalia.mq.MqMessageState; +import nu.marginalia.mq.outbox.MqOutbox; +import nu.marginalia.mqapi.crawling.LiveCrawlRequest; +import nu.marginalia.process.ProcessOutboxes; +import nu.marginalia.process.ProcessService; +import nu.marginalia.storage.FileStorageService; +import org.apache.commons.io.FileUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.time.Duration; +import java.util.Objects; + +@Singleton +public class LiveCrawlActor extends RecordActorPrototype { + + // STATES + private final ActorProcessWatcher processWatcher; + private final MqOutbox mqLiveCrawlerOutbox; + private final ExecutorActorStateMachines executorActorStateMachines; + private final FeedsClient feedsClient; + private final Logger logger = LoggerFactory.getLogger(getClass()); + + private final FileStorageService fileStorageService; + + public record Initial() implements ActorStep {} + public record Monitor(String feedsHash) implements ActorStep {} + public record LiveCrawl(String feedsHash, long msgId) implements ActorStep { + public LiveCrawl(String feedsHash) { this(feedsHash, -1); } + } + + @Override + public ActorStep transition(ActorStep self) throws Exception { + logger.info("{}", self); + return switch (self) { + case Initial() -> { + yield new Monitor("-"); + } + case Monitor(String feedsHash) -> { + for (;;) { + String currentHash = feedsClient.getFeedDataHash(); + if (!Objects.equals(currentHash, feedsHash)) { + yield new LiveCrawl(currentHash); + } + Thread.sleep(Duration.ofMinutes(15)); + } + } + case LiveCrawl(String feedsHash, long msgId) when msgId < 0 -> { + // Clear the index journal before starting the crawl + Path indexJournalLocation = IndexLocations.getIndexConstructionArea(fileStorageService).resolve("index-journal"); + if (Files.isDirectory(indexJournalLocation)) { + 
FileUtils.deleteDirectory(indexJournalLocation.toFile()); + } + + long id = mqLiveCrawlerOutbox.sendAsync(new LiveCrawlRequest()); + yield new LiveCrawl(feedsHash, id); + } + case LiveCrawl(String feedsHash, long msgId) -> { + var rsp = processWatcher.waitResponse(mqLiveCrawlerOutbox, ProcessService.ProcessId.LIVE_CRAWLER, msgId); + + if (rsp.state() != MqMessageState.OK) { + yield new Error("Crawler failed"); + } + + // Build the index + executorActorStateMachines.initFrom(ExecutorActor.CONVERT_AND_LOAD, new ConvertAndLoadActor.Rerank()); + + yield new Monitor(feedsHash); + } + default -> new Error("Unknown state"); + }; + } + + @Override + public String describe() { + return "Actor that polls the feeds database for changes, and triggers the live crawler when needed"; + } + + @Inject + public LiveCrawlActor(ActorProcessWatcher processWatcher, + ProcessOutboxes processOutboxes, + FeedsClient feedsClient, + Gson gson, + ExecutorActorStateMachines executorActorStateMachines, FileStorageService fileStorageService) + { + super(gson); + this.processWatcher = processWatcher; + this.mqLiveCrawlerOutbox = processOutboxes.getLiveCrawlerOutbox(); + this.executorActorStateMachines = executorActorStateMachines; + this.feedsClient = feedsClient; + this.fileStorageService = fileStorageService; + } + + +} diff --git a/code/execution/java/nu/marginalia/actor/task/TriggerAdjacencyCalculationActor.java b/code/execution/java/nu/marginalia/actor/task/TriggerAdjacencyCalculationActor.java index f81aeb79..e4a46d84 100644 --- a/code/execution/java/nu/marginalia/actor/task/TriggerAdjacencyCalculationActor.java +++ b/code/execution/java/nu/marginalia/actor/task/TriggerAdjacencyCalculationActor.java @@ -5,53 +5,59 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.actor.prototype.RecordActorPrototype; import nu.marginalia.actor.state.ActorStep; +import nu.marginalia.mq.MqMessageState; +import nu.marginalia.mq.outbox.MqOutbox; +import 
nu.marginalia.mqapi.tasks.ExportTaskRequest; +import nu.marginalia.process.ProcessOutboxes; import nu.marginalia.process.ProcessService; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.atomic.AtomicBoolean; - @Singleton public class TriggerAdjacencyCalculationActor extends RecordActorPrototype { + private final ActorProcessWatcher processWatcher; + private final MqOutbox exportTasksOutbox; private final Logger logger = LoggerFactory.getLogger(getClass()); - private final ProcessService processService; - private final ExecutorService executor = Executors.newSingleThreadExecutor(); - public record Run() implements ActorStep {} + public record Run(long msgId) implements ActorStep { + public Run() { + this(-1); + } + } @Override public ActorStep transition(ActorStep self) throws Exception { - return switch (self) { - case Run() -> { - AtomicBoolean hasError = new AtomicBoolean(false); - var future = executor.submit(() -> { - try { - processService.trigger(ProcessService.ProcessId.ADJACENCIES_CALCULATOR, "load"); - } - catch (Exception ex) { - logger.warn("Error triggering adjacency calculation", ex); - hasError.set(true); - } - }); - future.get(); - - if (hasError.get()) { - yield new Error("Error triggering adjacency calculation"); - } - yield new End(); + return switch(self) { + case Run(long msgId) when msgId < 0 -> { + long newMsgId = exportTasksOutbox.sendAsync(ExportTaskRequest.adjacencies()); + yield new Run(newMsgId); } + case Run(long msgId) -> { + var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId); + + if (rsp.state() != MqMessageState.OK) { + yield new Error("Exporter failed"); + } + else { + yield new End(); + } + } + default -> new Error(); }; } + @Inject public TriggerAdjacencyCalculationActor(Gson gson, - ProcessService processService) { + ProcessOutboxes processOutboxes, + 
ActorProcessWatcher processWatcher) { super(gson); - this.processService = processService; + + this.exportTasksOutbox = processOutboxes.getExportTasksOutbox(); + this.processWatcher = processWatcher; + } @Override diff --git a/code/execution/java/nu/marginalia/execution/ExecutorGrpcService.java b/code/execution/java/nu/marginalia/execution/ExecutorGrpcService.java index 724edcf7..1b28919f 100644 --- a/code/execution/java/nu/marginalia/execution/ExecutorGrpcService.java +++ b/code/execution/java/nu/marginalia/execution/ExecutorGrpcService.java @@ -15,9 +15,12 @@ import nu.marginalia.service.module.ServiceConfiguration; import nu.marginalia.service.server.DiscoverableService; import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.model.FileStorageId; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.nio.file.Files; import java.nio.file.Path; +import java.time.Duration; import java.time.LocalDateTime; import java.time.ZoneId; import java.time.format.DateTimeFormatter; @@ -32,6 +35,8 @@ public class ExecutorGrpcService private final ServiceConfiguration serviceConfiguration; private final ExecutorActorControlService actorControlService; + private static final Logger logger = LoggerFactory.getLogger(ExecutorGrpcService.class); + @Inject public ExecutorGrpcService(ActorApi actorApi, FileStorageService fileStorageService, @@ -240,5 +245,22 @@ public class ExecutorGrpcService } } + @Override + public void restartExecutorService(Empty request, StreamObserver responseObserver) { + responseObserver.onNext(Empty.getDefaultInstance()); + responseObserver.onCompleted(); + + logger.info("Restarting executor service on node {}", serviceConfiguration.node()); + + try { + // Wait for the response to be sent before restarting + Thread.sleep(Duration.ofSeconds(5)); + } + catch (InterruptedException e) { + logger.warn("Interrupted while waiting for restart", e); + } + + System.exit(0); + } } diff --git 
a/code/execution/java/nu/marginalia/process/ProcessOutboxes.java b/code/execution/java/nu/marginalia/process/ProcessOutboxes.java index 2c7e3fad..4f2e79d5 100644 --- a/code/execution/java/nu/marginalia/process/ProcessOutboxes.java +++ b/code/execution/java/nu/marginalia/process/ProcessOutboxes.java @@ -13,6 +13,8 @@ public class ProcessOutboxes { private final MqOutbox loaderOutbox; private final MqOutbox crawlerOutbox; private final MqOutbox indexConstructorOutbox; + private final MqOutbox liveCrawlerOutbox; + private final MqOutbox exportTasksOutbox; @Inject public ProcessOutboxes(BaseServiceParams params, MqPersistence persistence) { @@ -44,6 +46,22 @@ public class ProcessOutboxes { params.configuration.node(), params.configuration.instanceUuid() ); + + liveCrawlerOutbox = new MqOutbox(persistence, + ProcessInboxNames.LIVE_CRAWLER_INBOX, + params.configuration.node(), + params.configuration.serviceName(), + params.configuration.node(), + params.configuration.instanceUuid() + ); + + exportTasksOutbox = new MqOutbox(persistence, + ProcessInboxNames.EXPORT_TASK_INBOX, + params.configuration.node(), + params.configuration.serviceName(), + params.configuration.node(), + params.configuration.instanceUuid() + ); } @@ -60,4 +78,8 @@ public class ProcessOutboxes { } public MqOutbox getIndexConstructorOutbox() { return indexConstructorOutbox; } + + public MqOutbox getLiveCrawlerOutbox() { return liveCrawlerOutbox; } + + public MqOutbox getExportTasksOutbox() { return exportTasksOutbox; } } diff --git a/code/execution/java/nu/marginalia/process/ProcessService.java b/code/execution/java/nu/marginalia/process/ProcessService.java index 30f15f6e..f38651e1 100644 --- a/code/execution/java/nu/marginalia/process/ProcessService.java +++ b/code/execution/java/nu/marginalia/process/ProcessService.java @@ -3,14 +3,14 @@ package nu.marginalia.process; import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.WmsaHome; -import 
nu.marginalia.adjacencies.WebsiteAdjacenciesCalculator; import nu.marginalia.converting.ConverterMain; import nu.marginalia.crawl.CrawlerMain; import nu.marginalia.index.IndexConstructorMain; +import nu.marginalia.livecrawler.LiveCrawlerMain; import nu.marginalia.loading.LoaderMain; -import nu.marginalia.service.ProcessMainClass; import nu.marginalia.service.control.ServiceEventLog; import nu.marginalia.service.server.BaseServiceParams; +import nu.marginalia.task.ExportTasksMain; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.slf4j.Marker; @@ -37,23 +37,24 @@ public class ProcessService { private final int node; - public static ProcessService.ProcessId translateExternalIdBase(String id) { + public static ProcessId translateExternalIdBase(String id) { return switch (id) { - case "converter" -> ProcessService.ProcessId.CONVERTER; - case "crawler" -> ProcessService.ProcessId.CRAWLER; - case "loader" -> ProcessService.ProcessId.LOADER; - case "website-adjacencies-calculator" -> ProcessService.ProcessId.ADJACENCIES_CALCULATOR; - case "index-constructor" -> ProcessService.ProcessId.INDEX_CONSTRUCTOR; + case "converter" -> ProcessId.CONVERTER; + case "crawler" -> ProcessId.CRAWLER; + case "loader" -> ProcessId.LOADER; + case "export-tasks" -> ProcessId.EXPORT_TASKS; + case "index-constructor" -> ProcessId.INDEX_CONSTRUCTOR; default -> null; }; } public enum ProcessId { CRAWLER(CrawlerMain.class), + LIVE_CRAWLER(LiveCrawlerMain.class), CONVERTER(ConverterMain.class), LOADER(LoaderMain.class), INDEX_CONSTRUCTOR(IndexConstructorMain.class), - ADJACENCIES_CALCULATOR(WebsiteAdjacenciesCalculator.class) + EXPORT_TASKS(ExportTasksMain.class), ; public final String mainClass; @@ -64,10 +65,11 @@ public class ProcessService { List envOpts() { String variable = switch (this) { case CRAWLER -> "CRAWLER_PROCESS_OPTS"; + case LIVE_CRAWLER -> "LIVE_CRAWLER_PROCESS_OPTS"; case CONVERTER -> "CONVERTER_PROCESS_OPTS"; case LOADER -> "LOADER_PROCESS_OPTS"; case 
INDEX_CONSTRUCTOR -> "INDEX_CONSTRUCTION_PROCESS_OPTS"; - case ADJACENCIES_CALCULATOR -> "ADJACENCIES_CALCULATOR_PROCESS_OPTS"; + case EXPORT_TASKS -> "EXPORT_TASKS_PROCESS_OPTS"; }; String value = System.getenv(variable); diff --git a/code/functions/live-capture/api/java/nu/marginalia/api/feeds/FeedsClient.java b/code/functions/live-capture/api/java/nu/marginalia/api/feeds/FeedsClient.java index d69145b2..ad1e4559 100644 --- a/code/functions/live-capture/api/java/nu/marginalia/api/feeds/FeedsClient.java +++ b/code/functions/live-capture/api/java/nu/marginalia/api/feeds/FeedsClient.java @@ -11,10 +11,15 @@ import nu.marginalia.service.discovery.property.ServicePartition; import nu.marginalia.service.module.ServiceConfiguration; import javax.annotation.CheckReturnValue; +import java.time.Instant; +import java.util.ArrayList; +import java.util.List; import java.util.UUID; import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; +import java.util.function.BiConsumer; @Singleton public class FeedsClient { @@ -23,7 +28,9 @@ public class FeedsClient { private final MqOutbox updateFeedsOutbox; @Inject - public FeedsClient(GrpcChannelPoolFactory factory, MqPersistence mqPersistence, ServiceConfiguration serviceConfiguration) { + public FeedsClient(GrpcChannelPoolFactory factory, + MqPersistence mqPersistence, + ServiceConfiguration serviceConfiguration) { // The client is only interested in the primary node var key = ServiceKey.forGrpcApi(FeedApiGrpc.class, ServicePartition.any()); @@ -46,6 +53,25 @@ public class FeedsClient { } } + public void getUpdatedDomains(Instant since, BiConsumer> consumer) throws ExecutionException, InterruptedException { + channelPool.call(FeedApiGrpc.FeedApiBlockingStub::getUpdatedLinks) + .run(RpcUpdatedLinksRequest.newBuilder().setSinceEpochMillis(since.toEpochMilli()).build()) + .forEachRemaining(rsp -> 
consumer.accept(rsp.getDomain(), new ArrayList<>(rsp.getUrlList()))); + } + + public record UpdatedDomain(String domain, List urls) { + public UpdatedDomain(RpcUpdatedLinksResponse rsp) { + this(rsp.getDomain(), new ArrayList<>(rsp.getUrlList())); + } + } + + /** Get the hash of the feed data, for identifying when the data has been updated */ + public String getFeedDataHash() { + return channelPool.call(FeedApiGrpc.FeedApiBlockingStub::getFeedDataHash) + .run(Empty.getDefaultInstance()) + .getHash(); + } + /** Update the feeds, return a message ID for the update */ @CheckReturnValue public long updateFeeds(RpcFeedUpdateMode mode) throws Exception { diff --git a/code/functions/live-capture/api/src/main/protobuf/feeds.proto b/code/functions/live-capture/api/src/main/protobuf/feeds.proto index 738eaf3e..c3512832 100644 --- a/code/functions/live-capture/api/src/main/protobuf/feeds.proto +++ b/code/functions/live-capture/api/src/main/protobuf/feeds.proto @@ -7,7 +7,22 @@ option java_multiple_files=true; service FeedApi { rpc getFeed(RpcDomainId) returns (RpcFeed) {} + rpc getFeedDataHash(Empty) returns (RpcFeedDataHash) {} rpc updateFeeds(RpcUpdateRequest) returns (Empty) {} + rpc getUpdatedLinks(RpcUpdatedLinksRequest) returns (stream RpcUpdatedLinksResponse) {} +} + +message RpcUpdatedLinksRequest { + int64 sinceEpochMillis = 1; +} + +message RpcUpdatedLinksResponse { + string domain = 1; + repeated string url = 2; +} + +message RpcFeedDataHash { + string hash = 1; } message RpcDomainId { diff --git a/code/functions/live-capture/java/nu/marginalia/rss/db/FeedDb.java b/code/functions/live-capture/java/nu/marginalia/rss/db/FeedDb.java index f8c08b45..921b4e9c 100644 --- a/code/functions/live-capture/java/nu/marginalia/rss/db/FeedDb.java +++ b/code/functions/live-capture/java/nu/marginalia/rss/db/FeedDb.java @@ -11,11 +11,16 @@ import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.BufferedInputStream; import 
java.nio.file.Files; import java.nio.file.Path; import java.nio.file.StandardCopyOption; +import java.security.MessageDigest; +import java.time.Instant; +import java.util.Base64; import java.util.List; import java.util.Optional; +import java.util.function.BiConsumer; @Singleton public class FeedDb { @@ -45,6 +50,18 @@ public class FeedDb { } } + /** Constructor for testing */ + public FeedDb(Path dbPath) { + feedDbEnabled = true; + readerDbPath = dbPath; + + try { + reader = new FeedDbReader(readerDbPath); + } catch (Exception e) { + reader = null; + } + } + public boolean isEnabled() { return feedDbEnabled; } @@ -112,7 +129,7 @@ public class FeedDb { } try { - Path dbFile = Files.createTempFile(WmsaHome.getDataPath(), "rss-feeds", ".tmp.db"); + Path dbFile = Files.createTempFile(readerDbPath.getParent(), "rss-feeds", ".tmp.db"); return new FeedDbWriter(dbFile); } catch (Exception e) { logger.error("Error creating new database writer", e); @@ -141,4 +158,34 @@ public class FeedDb { } } + public String getDataHash() throws Exception { + MessageDigest digest = MessageDigest.getInstance("MD5"); + + byte[] buffer = new byte[4096]; + + try (var inputStream = new BufferedInputStream(Files.newInputStream(readerDbPath))) { + int rb; + + while ((rb = inputStream.read(buffer)) >= 0) { + digest.update(buffer, 0, rb); + } + } + + return Base64.getEncoder().encodeToString(digest.digest()); + } + + public void getLinksUpdatedSince(Instant since, BiConsumer> consumer) throws Exception { + if (!feedDbEnabled) { + throw new IllegalStateException("Feed database is disabled on this node"); + } + + // Capture the current reader to avoid concurrency issues + FeedDbReader reader = this.reader; + + if (reader == null) { + throw new NullPointerException("Reader is not available"); + } + + reader.getLinksUpdatedSince(since, consumer); + } } diff --git a/code/functions/live-capture/java/nu/marginalia/rss/db/FeedDbReader.java 
b/code/functions/live-capture/java/nu/marginalia/rss/db/FeedDbReader.java index c810742d..c86ab8f4 100644 --- a/code/functions/live-capture/java/nu/marginalia/rss/db/FeedDbReader.java +++ b/code/functions/live-capture/java/nu/marginalia/rss/db/FeedDbReader.java @@ -12,9 +12,11 @@ import java.nio.file.Path; import java.sql.Connection; import java.sql.DriverManager; import java.sql.SQLException; +import java.time.Instant; import java.util.ArrayList; import java.util.List; import java.util.Optional; +import java.util.function.BiConsumer; public class FeedDbReader implements AutoCloseable { private static final Logger logger = LoggerFactory.getLogger(FeedDbReader.class); @@ -99,4 +101,27 @@ public class FeedDbReader implements AutoCloseable { } + public void getLinksUpdatedSince(Instant since, BiConsumer> consumer) { + try (var stmt = connection.prepareStatement("SELECT FEED FROM feed")) { + var rs = stmt.executeQuery(); + + while (rs.next()) { + FeedItems items = deserialize(rs.getString(1)); + + List urls = new ArrayList<>(); + for (var item : items.items()) { + if (item.getUpdateTimeZD().toInstant().isAfter(since)) { + urls.add(item.url()); + } + } + + if (!urls.isEmpty()) { + consumer.accept(items.domain(), new ArrayList<>(urls)); + urls.clear(); + } + } + } catch (SQLException e) { + logger.error("Error getting updated links", e); + } + } } diff --git a/code/functions/live-capture/java/nu/marginalia/rss/model/FeedItem.java b/code/functions/live-capture/java/nu/marginalia/rss/model/FeedItem.java index 119a32cb..e55ed6b6 100644 --- a/code/functions/live-capture/java/nu/marginalia/rss/model/FeedItem.java +++ b/code/functions/live-capture/java/nu/marginalia/rss/model/FeedItem.java @@ -55,6 +55,11 @@ public record FeedItem(String title, return zonedDateTime.map(date -> date.format(DATE_FORMAT)).orElse(""); } + public ZonedDateTime getUpdateTimeZD() { + return ZonedDateTime.parse(date, DATE_FORMAT); + } + + @Override public int compareTo(@NotNull FeedItem o) { return 
o.date.compareTo(date); diff --git a/code/functions/live-capture/java/nu/marginalia/rss/model/FeedItems.java b/code/functions/live-capture/java/nu/marginalia/rss/model/FeedItems.java index 3a8f0676..23a5d0f9 100644 --- a/code/functions/live-capture/java/nu/marginalia/rss/model/FeedItems.java +++ b/code/functions/live-capture/java/nu/marginalia/rss/model/FeedItems.java @@ -1,7 +1,6 @@ package nu.marginalia.rss.model; import java.util.List; -import java.util.Optional; public record FeedItems(String domain, String feedUrl, @@ -17,17 +16,4 @@ public record FeedItems(String domain, public boolean isEmpty() { return items.isEmpty(); } - - public Optional getLatest() { - if (items.isEmpty()) - return Optional.empty(); - - return Optional.of( - items.getFirst() - ); - } - - public Optional getLatestDate() { - return getLatest().map(FeedItem::date); - } } diff --git a/code/functions/live-capture/java/nu/marginalia/rss/svc/FeedFetcherService.java b/code/functions/live-capture/java/nu/marginalia/rss/svc/FeedFetcherService.java index 3cc0b84d..de44b7be 100644 --- a/code/functions/live-capture/java/nu/marginalia/rss/svc/FeedFetcherService.java +++ b/code/functions/live-capture/java/nu/marginalia/rss/svc/FeedFetcherService.java @@ -70,7 +70,7 @@ public class FeedFetcherService { this.serviceHeartbeat = serviceHeartbeat; this.executorClient = executorClient; - rssReader.addHeader("User-Agent", WmsaHome.getUserAgent().uaIdentifier() + " RSS Feed Fetcher"); + rssReader.addHeader("User-Agent", WmsaHome.getUserAgent().uaIdentifier()); } public enum UpdateMode { diff --git a/code/functions/live-capture/java/nu/marginalia/rss/svc/FeedsGrpcService.java b/code/functions/live-capture/java/nu/marginalia/rss/svc/FeedsGrpcService.java index 46524d16..1c5ca4a7 100644 --- a/code/functions/live-capture/java/nu/marginalia/rss/svc/FeedsGrpcService.java +++ b/code/functions/live-capture/java/nu/marginalia/rss/svc/FeedsGrpcService.java @@ -14,6 +14,8 @@ import 
nu.marginalia.service.server.DiscoverableService; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.time.Instant; +import java.util.List; import java.util.Optional; public class FeedsGrpcService extends FeedApiGrpc.FeedApiImplBase implements DiscoverableService { @@ -64,6 +66,45 @@ public class FeedsGrpcService extends FeedApiGrpc.FeedApiImplBase implements Dis responseObserver.onCompleted(); } + @Override + public void getFeedDataHash(Empty request, StreamObserver responseObserver) { + if (!feedDb.isEnabled()) { + responseObserver.onError(new IllegalStateException("Feed database is disabled on this node")); + return; + } + + try { + String hash = feedDb.getDataHash(); + responseObserver.onNext(RpcFeedDataHash.newBuilder().setHash(hash).build()); + responseObserver.onCompleted(); + } + catch (Exception e) { + logger.error("Error getting feed data hash", e); + responseObserver.onError(e); + } + } + + @Override + public void getUpdatedLinks(RpcUpdatedLinksRequest request, StreamObserver responseObserver) { + Instant since = Instant.ofEpochMilli(request.getSinceEpochMillis()); + + try { + feedDb.getLinksUpdatedSince(since, (String domain, List urls) -> { + RpcUpdatedLinksResponse rsp = RpcUpdatedLinksResponse.newBuilder() + .setDomain(domain) + .addAllUrl(urls) + .build(); + responseObserver.onNext(rsp); + }); + + responseObserver.onCompleted(); + } + catch (Exception e) { + logger.error("Error getting updated links", e); + responseObserver.onError(e); + } + } + @Override public void getFeed(RpcDomainId request, StreamObserver responseObserver) diff --git a/code/functions/live-capture/test/nu/marginalia/rss/db/FeedDbReaderTest.java b/code/functions/live-capture/test/nu/marginalia/rss/db/FeedDbReaderTest.java new file mode 100644 index 00000000..e698c03f --- /dev/null +++ b/code/functions/live-capture/test/nu/marginalia/rss/db/FeedDbReaderTest.java @@ -0,0 +1,34 @@ +package nu.marginalia.rss.db; + +import org.junit.jupiter.api.Tag; +import 
org.junit.jupiter.api.Test; + +import java.nio.file.Path; +import java.sql.SQLException; +import java.time.Instant; +import java.time.temporal.ChronoUnit; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +class FeedDbReaderTest { + + @Tag("flaky") // will only work on ~vlofgren, not on CI; remove test when this feature is stable + @Test + void getLinksUpdatedSince() throws SQLException { + var reader = new FeedDbReader(Path.of("/home/vlofgren/rss-feeds.db")); + Map> links = new HashMap<>(); + + reader.getLinksUpdatedSince(Instant.now().minus(10, ChronoUnit.DAYS), links::put); + + System.out.println(links.size()); + for (var link : links.values()) { + if (link.size() < 2) { + System.out.println(link); + } + } + + reader.close(); + + } +} \ No newline at end of file diff --git a/code/functions/live-capture/test/nu/marginalia/rss/db/FeedDbTest.java b/code/functions/live-capture/test/nu/marginalia/rss/db/FeedDbTest.java new file mode 100644 index 00000000..e2dba9e1 --- /dev/null +++ b/code/functions/live-capture/test/nu/marginalia/rss/db/FeedDbTest.java @@ -0,0 +1,36 @@ +package nu.marginalia.rss.db; + +import nu.marginalia.rss.model.FeedItem; +import nu.marginalia.rss.model.FeedItems; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; + +class FeedDbTest { + + @Test + public void testDbHash() throws Exception{ + Path dbPath = Files.createTempFile("rss-feeds", ".db"); + try { + FeedDb db = new FeedDb(dbPath); + + try (var writer = db.createWriter()) { + writer.saveFeed(new FeedItems("foo", "bar", "baz", List.of( + new FeedItem("title1", "link1", "description1", "content1"), + new FeedItem("title2", "link2", "description2", "content2") + ))); + + db.switchDb(writer); + } + + var hash = db.getDataHash(); + Assertions.assertFalse(hash.isBlank()); + } finally { + Files.delete(dbPath); + } + + } +} \ No newline at end of file diff --git 
a/code/index/build.gradle b/code/index/build.gradle index ad1d1000..126de167 100644 --- a/code/index/build.gradle +++ b/code/index/build.gradle @@ -67,7 +67,6 @@ dependencies { testImplementation libs.bundles.junit testImplementation libs.mockito testImplementation libs.commons.lang3 - testImplementation project(':code:common:process') testImplementation project(':code:libraries:array') testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4') diff --git a/code/index/index-forward/build.gradle b/code/index/index-forward/build.gradle index 946ef74b..53decc21 100644 --- a/code/index/index-forward/build.gradle +++ b/code/index/index-forward/build.gradle @@ -20,7 +20,7 @@ dependencies { implementation project(':code:index:query') implementation project(':code:index:index-journal') implementation project(':code:common:model') - implementation project(':code:common:process') + implementation project(':code:common:service') implementation project(':code:processes:converting-process:model') implementation libs.bundles.slf4j diff --git a/code/index/index-reverse/build.gradle b/code/index/index-reverse/build.gradle index bd0831ba..7e7504a4 100644 --- a/code/index/index-reverse/build.gradle +++ b/code/index/index-reverse/build.gradle @@ -21,8 +21,8 @@ dependencies { implementation project(':code:index:query') implementation project(':code:index:index-journal') implementation project(':code:common:model') + implementation project(':code:common:service') implementation project(':code:processes:converting-process:model') - implementation project(':code:common:process') implementation project(':third-party:parquet-floor') implementation project(':third-party:commons-codec') diff --git a/code/processes/converting-process/build.gradle b/code/processes/converting-process/build.gradle index 48c7a878..75987482 100644 --- a/code/processes/converting-process/build.gradle +++ b/code/processes/converting-process/build.gradle @@ -21,7 +21,6 @@ tasks.distZip.enabled = 
false apply from: "$rootProject.projectDir/srcsets.gradle" dependencies { - implementation project(':code:common:process') implementation project(':third-party:porterstemmer') implementation project(':third-party:count-min-sketch') diff --git a/code/processes/converting-process/ft-anchor-keywords/build.gradle b/code/processes/converting-process/ft-anchor-keywords/build.gradle index 7572cce0..dbf6b2cf 100644 --- a/code/processes/converting-process/ft-anchor-keywords/build.gradle +++ b/code/processes/converting-process/ft-anchor-keywords/build.gradle @@ -14,9 +14,9 @@ apply from: "$rootProject.projectDir/srcsets.gradle" dependencies { implementation project(':code:common:config') + implementation project(':code:common:service') implementation project(':code:common:model') implementation project(':code:common:db') - implementation project(':code:common:process') implementation project(':code:processes:converting-process:ft-keyword-extraction') implementation project(':code:libraries:language-processing') implementation project(':code:libraries:term-frequency-dict') diff --git a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsSourceFactory.java b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsSourceFactory.java index aaed5ace..b0da1148 100644 --- a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsSourceFactory.java +++ b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsSourceFactory.java @@ -2,10 +2,10 @@ package nu.marginalia.atags.source; import com.google.inject.Inject; import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.ProcessConfiguration; import nu.marginalia.WmsaHome; import nu.marginalia.atags.model.DomainLinks; import nu.marginalia.model.EdgeDomain; +import nu.marginalia.process.ProcessConfiguration; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff 
--git a/code/processes/converting-process/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/java/nu/marginalia/converting/ConverterMain.java index 97935d79..b3e3521f 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/ConverterMain.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/ConverterMain.java @@ -4,8 +4,6 @@ import com.google.gson.Gson; import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; -import nu.marginalia.ProcessConfiguration; -import nu.marginalia.ProcessConfigurationModule; import nu.marginalia.converting.model.CrawlPlan; import nu.marginalia.converting.model.WorkDir; import nu.marginalia.converting.processor.DomainProcessor; @@ -17,14 +15,14 @@ import nu.marginalia.converting.writer.ConverterWriter; import nu.marginalia.io.CrawledDomainReader; import nu.marginalia.io.SerializableCrawlDataStream; import nu.marginalia.mq.MessageQueueFactory; -import nu.marginalia.mq.MqMessage; -import nu.marginalia.mq.inbox.MqInboxResponse; -import nu.marginalia.mq.inbox.MqSingleShotInbox; +import nu.marginalia.mqapi.converting.ConvertRequest; +import nu.marginalia.process.ProcessConfiguration; +import nu.marginalia.process.ProcessConfigurationModule; +import nu.marginalia.process.ProcessMainClass; import nu.marginalia.process.control.ProcessHeartbeat; import nu.marginalia.process.control.ProcessHeartbeatImpl; import nu.marginalia.process.log.WorkLog; import nu.marginalia.process.log.WorkLogEntry; -import nu.marginalia.service.ProcessMainClass; import nu.marginalia.service.module.DatabaseModule; import nu.marginalia.storage.FileStorageService; import nu.marginalia.util.SimpleBlockingThreadPool; @@ -34,13 +32,13 @@ import org.apache.logging.log4j.util.Strings; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.sql.SQLException; import 
java.util.Collection; import java.util.List; import java.util.Optional; -import java.util.UUID; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import java.util.function.Function; @@ -50,12 +48,9 @@ import static nu.marginalia.mqapi.ProcessInboxNames.CONVERTER_INBOX; public class ConverterMain extends ProcessMainClass { private static final Logger logger = LoggerFactory.getLogger(ConverterMain.class); private final DomainProcessor processor; - private final Gson gson; private final ProcessHeartbeat heartbeat; - private final MessageQueueFactory messageQueueFactory; private final FileStorageService fileStorageService; private final SideloadSourceFactory sideloadSourceFactory; - private final int node; public static void main(String... args) throws Exception { @@ -70,8 +65,9 @@ public class ConverterMain extends ProcessMainClass { logger.info("Starting pipe"); - converter - .fetchInstructions() + Instructions instructions = converter.fetchInstructions(ConvertRequest.class); + + converter.createAction(instructions) .execute(converter); logger.info("Finished"); @@ -83,6 +79,65 @@ public class ConverterMain extends ProcessMainClass { System.exit(0); } + private Action createAction(Instructions instructions) throws SQLException, IOException { + var request = instructions.value(); + final Path inputPath = request.getInputPath(); + + return switch (request.action) { + case ConvertCrawlData -> { + var crawlData = fileStorageService.getStorage(request.crawlStorage); + var processData = fileStorageService.getStorage(request.processedDataStorage); + + var plan = new CrawlPlan(null, + new WorkDir(crawlData.asPath().toString(), "crawler.log"), + new WorkDir(processData.asPath().toString(), "processor.log") + ); + + yield new ConvertCrawlDataAction(plan, instructions); + } + case SideloadEncyclopedia -> { + var processData = fileStorageService.getStorage(request.processedDataStorage); + + yield new SideloadAction( + 
sideloadSourceFactory.sideloadEncyclopediaMarginaliaNu(inputPath, request.baseUrl), + processData.asPath(), + instructions); + } + case SideloadDirtree -> { + var processData = fileStorageService.getStorage(request.processedDataStorage); + + yield new SideloadAction( + sideloadSourceFactory.sideloadDirtree(inputPath), + processData.asPath(), + instructions); + } + case SideloadWarc -> { + var processData = fileStorageService.getStorage(request.processedDataStorage); + + yield new SideloadAction( + sideloadSourceFactory.sideloadWarc(inputPath), + processData.asPath(), + instructions); + } + case SideloadReddit -> { + var processData = fileStorageService.getStorage(request.processedDataStorage); + + yield new SideloadAction( + sideloadSourceFactory.sideloadReddit(inputPath), + processData.asPath(), + instructions); + } + case SideloadStackexchange -> { + var processData = fileStorageService.getStorage(request.processedDataStorage); + + yield new SideloadAction( + sideloadSourceFactory.sideloadStackexchange(inputPath), + processData.asPath(), + instructions); + } + }; + } + @Inject public ConverterMain( DomainProcessor processor, @@ -94,13 +149,12 @@ public class ConverterMain extends ProcessMainClass { ProcessConfiguration processConfiguration ) { + super(messageQueueFactory, processConfiguration, gson, CONVERTER_INBOX); + this.processor = processor; - this.gson = gson; this.heartbeat = heartbeat; - this.messageQueueFactory = messageQueueFactory; this.fileStorageService = fileStorageService; this.sideloadSourceFactory = sideloadSourceFactory; - this.node = processConfiguration.node(); heartbeat.start(); } @@ -220,45 +274,44 @@ public class ConverterMain extends ProcessMainClass { } } - private abstract static class ConvertRequest { - private final MqMessage message; - private final MqSingleShotInbox inbox; + private abstract static class Action { + final Instructions instructions; - private ConvertRequest(MqMessage message, MqSingleShotInbox inbox) { - this.message = 
message; - this.inbox = inbox; + public Action(Instructions instructions) { + this.instructions = instructions; } public abstract void execute(ConverterMain converterMain) throws Exception; public void ok() { - inbox.sendResponse(message, MqInboxResponse.ok()); + instructions.ok(); } public void err() { - inbox.sendResponse(message, MqInboxResponse.err()); + instructions.err(); } } - private static class SideloadAction extends ConvertRequest { + private static class SideloadAction extends Action { private final Collection sideloadSources; private final Path workDir; SideloadAction(SideloadSource sideloadSource, Path workDir, - MqMessage message, MqSingleShotInbox inbox) { - super(message, inbox); + Instructions instructions) { + super(instructions); this.sideloadSources = List.of(sideloadSource); this.workDir = workDir; } SideloadAction(Collection sideloadSources, Path workDir, - MqMessage message, MqSingleShotInbox inbox) { - super(message, inbox); + Instructions instructions) { + super(instructions); this.sideloadSources = sideloadSources; this.workDir = workDir; } + @Override public void execute(ConverterMain converterMain) throws Exception { try { @@ -272,11 +325,12 @@ public class ConverterMain extends ProcessMainClass { } } - private static class ConvertCrawlDataAction extends ConvertRequest { + private static class ConvertCrawlDataAction extends Action { private final CrawlPlan plan; - private ConvertCrawlDataAction(CrawlPlan plan, MqMessage message, MqSingleShotInbox inbox) { - super(message, inbox); + private ConvertCrawlDataAction(CrawlPlan plan, + Instructions instructions) { + super(instructions); this.plan = plan; } @@ -294,94 +348,4 @@ public class ConverterMain extends ProcessMainClass { } } - - private ConvertRequest fetchInstructions() throws Exception { - - var inbox = messageQueueFactory.createSingleShotInbox(CONVERTER_INBOX, node, UUID.randomUUID()); - - var msgOpt = getMessage(inbox, 
nu.marginalia.mqapi.converting.ConvertRequest.class.getSimpleName()); - var msg = msgOpt.orElseThrow(() -> new RuntimeException("No message received")); - - try { - var request = gson.fromJson(msg.payload(), nu.marginalia.mqapi.converting.ConvertRequest.class); - - // will be null on ConvertCrawlData - final Path inputPath = request.getInputPath(); - - return switch (request.action) { - case ConvertCrawlData -> { - var crawlData = fileStorageService.getStorage(request.crawlStorage); - var processData = fileStorageService.getStorage(request.processedDataStorage); - - var plan = new CrawlPlan(null, - new WorkDir(crawlData.asPath().toString(), "crawler.log"), - new WorkDir(processData.asPath().toString(), "processor.log") - ); - - yield new ConvertCrawlDataAction(plan, msg, inbox); - } - case SideloadEncyclopedia -> { - var processData = fileStorageService.getStorage(request.processedDataStorage); - - yield new SideloadAction( - sideloadSourceFactory.sideloadEncyclopediaMarginaliaNu(inputPath, request.baseUrl), - processData.asPath(), - msg, inbox); - } - case SideloadDirtree -> { - var processData = fileStorageService.getStorage(request.processedDataStorage); - - yield new SideloadAction( - sideloadSourceFactory.sideloadDirtree(inputPath), - processData.asPath(), - msg, inbox); - } - case SideloadWarc -> { - var processData = fileStorageService.getStorage(request.processedDataStorage); - - yield new SideloadAction( - sideloadSourceFactory.sideloadWarc(inputPath), - processData.asPath(), - msg, inbox); - } - case SideloadReddit -> { - var processData = fileStorageService.getStorage(request.processedDataStorage); - - yield new SideloadAction( - sideloadSourceFactory.sideloadReddit(inputPath), - processData.asPath(), - msg, inbox); - } - case SideloadStackexchange -> { - var processData = fileStorageService.getStorage(request.processedDataStorage); - - yield new SideloadAction( - sideloadSourceFactory.sideloadStackexchange(inputPath), - processData.asPath(), - msg, 
inbox); - } - }; - } - catch (Exception ex) { - inbox.sendResponse(msg, MqInboxResponse.err(ex.getClass().getSimpleName() + ": " + ex.getMessage())); - - throw ex; - } - } - - private Optional getMessage(MqSingleShotInbox inbox, String expectedFunction) throws SQLException, InterruptedException { - var opt = inbox.waitForMessage(30, TimeUnit.SECONDS); - if (opt.isPresent()) { - if (!opt.get().function().equals(expectedFunction)) { - throw new RuntimeException("Unexpected function: " + opt.get().function()); - } - return opt; - } - else { - var stolenMessage = inbox.stealMessage(msg -> msg.function().equals(expectedFunction)); - stolenMessage.ifPresent(mqMessage -> logger.info("Stole message {}", mqMessage)); - return stolenMessage; - } - } - } diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java index 53825ecf..c0999c96 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -1,7 +1,6 @@ package nu.marginalia.converting.processor; import com.google.inject.Inject; -import nu.marginalia.atags.AnchorTextKeywords; import nu.marginalia.atags.model.DomainLinks; import nu.marginalia.atags.source.AnchorTagsSource; import nu.marginalia.atags.source.AnchorTagsSourceFactory; @@ -37,7 +36,6 @@ public class DomainProcessor { private final DocumentProcessor documentProcessor; private final SiteWords siteWords; private final AnchorTagsSource anchorTagsSource; - private final AnchorTextKeywords anchorTextKeywords; private final GeoIpDictionary geoIpDictionary; private final Logger logger = LoggerFactory.getLogger(getClass()); @@ -46,12 +44,10 @@ public class DomainProcessor { public DomainProcessor(DocumentProcessor documentProcessor, SiteWords siteWords, AnchorTagsSourceFactory 
anchorTagsSourceFactory, - AnchorTextKeywords anchorTextKeywords, GeoIpDictionary geoIpDictionary) throws SQLException { this.documentProcessor = documentProcessor; this.siteWords = siteWords; - this.anchorTextKeywords = anchorTextKeywords; this.anchorTagsSource = anchorTagsSourceFactory.create(); this.geoIpDictionary = geoIpDictionary; diff --git a/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloaderProcessing.java b/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloaderProcessing.java index b7cf244b..95729851 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloaderProcessing.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloaderProcessing.java @@ -51,10 +51,6 @@ public class SideloaderProcessing { "NP", "", body, - Integer.toHexString(url.hashCode()), - url, - "", - "SIDELOAD", false, null, null diff --git a/code/processes/converting-process/java/nu/marginalia/converting/writer/ConverterBatchWriter.java b/code/processes/converting-process/java/nu/marginalia/converting/writer/ConverterBatchWriter.java index 1a53e539..785318d9 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/writer/ConverterBatchWriter.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/writer/ConverterBatchWriter.java @@ -27,6 +27,8 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter private final SlopDomainLinkRecord.Writer domainLinkWriter; private final SlopDocumentRecord.Writer documentWriter; + private int ordinalOffset = 0; + private static final Logger logger = LoggerFactory.getLogger(ConverterBatchWriter.class); public ConverterBatchWriter(Path basePath, int batchNumber) throws IOException { @@ -46,6 +48,11 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter documentWriter = new 
SlopDocumentRecord.Writer(ProcessedDataFileNames.documentFileName(basePath), batchNumber); } + /** Sets the lowest ordinal value for the documents in this batch */ + public void setOrdinalOffset(int ordinalOffset) { + this.ordinalOffset = ordinalOffset; + } + @Override public void write(ConverterBatchWritableIf writable) throws IOException { writable.write(this); @@ -79,7 +86,7 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter throws IOException { - int ordinal = 0; + int ordinal = ordinalOffset; String domainName = domain.toString(); diff --git a/code/processes/converting-process/test/nu/marginalia/converting/ConvertingIntegrationTest.java b/code/processes/converting-process/test/nu/marginalia/converting/ConvertingIntegrationTest.java index 728d57ca..7cc451b4 100644 --- a/code/processes/converting-process/test/nu/marginalia/converting/ConvertingIntegrationTest.java +++ b/code/processes/converting-process/test/nu/marginalia/converting/ConvertingIntegrationTest.java @@ -147,10 +147,6 @@ public class ConvertingIntegrationTest { "", "", readClassPathFile(p.toString()), - Double.toString(Math.random()), - "https://memex.marginalia.nu/" + file, - null, - "", false, null, null diff --git a/code/processes/converting-process/test/nu/marginalia/converting/ConvertingIntegrationTestModule.java b/code/processes/converting-process/test/nu/marginalia/converting/ConvertingIntegrationTestModule.java index 83f28882..d14750ef 100644 --- a/code/processes/converting-process/test/nu/marginalia/converting/ConvertingIntegrationTestModule.java +++ b/code/processes/converting-process/test/nu/marginalia/converting/ConvertingIntegrationTestModule.java @@ -3,9 +3,9 @@ package nu.marginalia.converting; import com.google.inject.AbstractModule; import com.google.inject.name.Names; import nu.marginalia.LanguageModels; -import nu.marginalia.ProcessConfiguration; import nu.marginalia.WmsaHome; import nu.marginalia.converting.processor.ConverterDomainTypes; +import 
nu.marginalia.process.ProcessConfiguration; import nu.marginalia.service.module.ServiceConfiguration; import org.mockito.Mockito; diff --git a/code/processes/converting-process/test/nu/marginalia/converting/sideload/reddit/RedditSideloaderTest.java b/code/processes/converting-process/test/nu/marginalia/converting/sideload/reddit/RedditSideloaderTest.java index 9c40bcab..4f964db3 100644 --- a/code/processes/converting-process/test/nu/marginalia/converting/sideload/reddit/RedditSideloaderTest.java +++ b/code/processes/converting-process/test/nu/marginalia/converting/sideload/reddit/RedditSideloaderTest.java @@ -2,10 +2,10 @@ package nu.marginalia.converting.sideload.reddit; import com.google.inject.AbstractModule; import com.google.inject.Guice; -import nu.marginalia.ProcessConfiguration; import nu.marginalia.converting.ConverterModule; import nu.marginalia.converting.processor.ConverterDomainTypes; import nu.marginalia.converting.sideload.SideloadSourceFactory; +import nu.marginalia.process.ProcessConfiguration; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; diff --git a/code/processes/crawling-process/build.gradle b/code/processes/crawling-process/build.gradle index 2d34904f..e955f86c 100644 --- a/code/processes/crawling-process/build.gradle +++ b/code/processes/crawling-process/build.gradle @@ -21,7 +21,6 @@ tasks.distZip.enabled = false apply from: "$rootProject.projectDir/srcsets.gradle" dependencies { - implementation project(':code:common:process') implementation project(':code:common:db') implementation project(':code:common:model') diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java index ce002903..e7fbe4f9 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java @@ -5,8 +5,6 @@ 
import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.ProcessConfiguration; -import nu.marginalia.ProcessConfigurationModule; import nu.marginalia.UserAgent; import nu.marginalia.WmsaHome; import nu.marginalia.atags.model.DomainLinks; @@ -25,15 +23,15 @@ import nu.marginalia.io.CrawledDomainReader; import nu.marginalia.io.CrawlerOutputFile; import nu.marginalia.model.EdgeDomain; import nu.marginalia.mq.MessageQueueFactory; -import nu.marginalia.mq.MqMessage; -import nu.marginalia.mq.inbox.MqInboxResponse; -import nu.marginalia.mq.inbox.MqSingleShotInbox; import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter; +import nu.marginalia.process.ProcessConfiguration; +import nu.marginalia.process.ProcessConfigurationModule; +import nu.marginalia.process.ProcessMainClass; import nu.marginalia.process.control.ProcessHeartbeatImpl; import nu.marginalia.process.log.WorkLog; -import nu.marginalia.service.ProcessMainClass; import nu.marginalia.service.module.DatabaseModule; import nu.marginalia.storage.FileStorageService; +import nu.marginalia.storage.model.FileStorageId; import nu.marginalia.util.SimpleBlockingThreadPool; import okhttp3.ConnectionPool; import okhttp3.Dispatcher; @@ -46,8 +44,10 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.StandardCopyOption; import java.security.Security; -import java.sql.SQLException; -import java.util.*; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; @@ -59,14 +59,12 @@ public class CrawlerMain extends ProcessMainClass { private final UserAgent userAgent; private final ProcessHeartbeatImpl heartbeat; - private final MessageQueueFactory messageQueueFactory; private final DomainProber 
domainProber; private final FileStorageService fileStorageService; private final AnchorTagsSourceFactory anchorTagsSourceFactory; private final WarcArchiverFactory warcArchiverFactory; private final HikariDataSource dataSource; private final DomainBlacklist blacklist; - private final Gson gson; private final int node; private final SimpleBlockingThreadPool pool; @@ -96,16 +94,17 @@ public class CrawlerMain extends ProcessMainClass { HikariDataSource dataSource, DomainBlacklist blacklist, Gson gson) throws InterruptedException { + + super(messageQueueFactory, processConfiguration, gson, CRAWLER_INBOX); + this.userAgent = userAgent; this.heartbeat = heartbeat; - this.messageQueueFactory = messageQueueFactory; this.domainProber = domainProber; this.fileStorageService = fileStorageService; this.anchorTagsSourceFactory = anchorTagsSourceFactory; this.warcArchiverFactory = warcArchiverFactory; this.dataSource = dataSource; this.blacklist = blacklist; - this.gson = gson; this.node = processConfiguration.node(); pool = new SimpleBlockingThreadPool("CrawlerPool", @@ -144,13 +143,14 @@ public class CrawlerMain extends ProcessMainClass { ); var crawler = injector.getInstance(CrawlerMain.class); - var instructions = crawler.fetchInstructions(); + var instructions = crawler.fetchInstructions(nu.marginalia.mqapi.crawling.CrawlRequest.class); try { - if (instructions.targetDomainName != null) { - crawler.runForSingleDomain(instructions.targetDomainName, instructions.outputDir); + var req = instructions.value(); + if (req.targetDomainName != null) { + crawler.runForSingleDomain(req.targetDomainName, req.crawlStorage); } else { - crawler.runForDatabaseDomains(instructions.outputDir); + crawler.runForDatabaseDomains(req.crawlStorage); } instructions.ok(); } catch (Exception ex) { @@ -166,6 +166,10 @@ public class CrawlerMain extends ProcessMainClass { System.exit(0); } + public void runForDatabaseDomains(FileStorageId fileStorageId) throws Exception { + 
runForDatabaseDomains(fileStorageService.getStorage(fileStorageId).asPath()); + } + public void runForDatabaseDomains(Path outputDir) throws Exception { heartbeat.start(); @@ -285,6 +289,11 @@ public class CrawlerMain extends ProcessMainClass { } } + + public void runForSingleDomain(String targetDomainName, FileStorageId fileStorageId) throws Exception { + runForSingleDomain(targetDomainName, fileStorageService.getStorage(fileStorageId).asPath()); + } + public void runForSingleDomain(String targetDomainName, Path outputDir) throws Exception { heartbeat.start(); @@ -410,70 +419,6 @@ public class CrawlerMain extends ProcessMainClass { } - - - private static class CrawlRequest { - private final Path outputDir; - private final MqMessage message; - private final MqSingleShotInbox inbox; - - private final String targetDomainName; - - CrawlRequest(String targetDomainName, - Path outputDir, - MqMessage message, - MqSingleShotInbox inbox) - { - this.message = message; - this.inbox = inbox; - this.outputDir = outputDir; - this.targetDomainName = targetDomainName; - } - - - public void ok() { - inbox.sendResponse(message, MqInboxResponse.ok()); - } - public void err() { - inbox.sendResponse(message, MqInboxResponse.err()); - } - - } - - private CrawlRequest fetchInstructions() throws Exception { - - var inbox = messageQueueFactory.createSingleShotInbox(CRAWLER_INBOX, node, UUID.randomUUID()); - - logger.info("Waiting for instructions"); - - var msgOpt = getMessage(inbox, nu.marginalia.mqapi.crawling.CrawlRequest.class.getSimpleName()); - var msg = msgOpt.orElseThrow(() -> new RuntimeException("No message received")); - - var request = gson.fromJson(msg.payload(), nu.marginalia.mqapi.crawling.CrawlRequest.class); - var crawlStorage = fileStorageService.getStorage(request.crawlStorage); - - return new CrawlRequest( - request.targetDomainName, - crawlStorage.asPath(), - msg, - inbox); - } - - private Optional getMessage(MqSingleShotInbox inbox, String expectedFunction) throws 
SQLException, InterruptedException { - var opt = inbox.waitForMessage(30, TimeUnit.SECONDS); - if (opt.isPresent()) { - if (!opt.get().function().equals(expectedFunction)) { - throw new RuntimeException("Unexpected function: " + opt.get().function()); - } - return opt; - } - else { - var stolenMessage = inbox.stealMessage(msg -> msg.function().equals(expectedFunction)); - stolenMessage.ifPresent(mqMessage -> logger.info("Stole message {}", mqMessage)); - return stolenMessage; - } - } - public record CrawlSpecRecord(@NotNull String domain, int crawlDepth, @NotNull List urls) { public CrawlSpecRecord(String domain, int crawlDepth) { diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java index fa9a50b0..31925ce7 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java @@ -19,7 +19,12 @@ public class CrawlDelayTimer { private final long delayTime; public CrawlDelayTimer(long delayTime) { - this.delayTime = delayTime; + if (delayTime <= 0) { + this.delayTime = DEFAULT_CRAWL_DELAY_MIN_MS; + } + else { + this.delayTime = delayTime; + } } /** Call when we've gotten an HTTP 429 response. 
This will wait a moment, and then @@ -41,6 +46,10 @@ public class CrawlDelayTimer { Thread.sleep(delay.toMillis()); } + public void waitFetchDelay() { + waitFetchDelay(0); + } + public void waitFetchDelay(long spentTime) { long sleepTime = delayTime; diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/warc/WarcArchiverFactory.java b/code/processes/crawling-process/java/nu/marginalia/crawl/warc/WarcArchiverFactory.java index c1a53718..33c08165 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/warc/WarcArchiverFactory.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/warc/WarcArchiverFactory.java @@ -1,8 +1,8 @@ package nu.marginalia.crawl.warc; import com.google.inject.Inject; -import nu.marginalia.ProcessConfiguration; import nu.marginalia.nodecfg.NodeConfigurationService; +import nu.marginalia.process.ProcessConfiguration; import org.apache.commons.io.IOUtils; import java.io.IOException; diff --git a/code/processes/crawling-process/model/build.gradle b/code/processes/crawling-process/model/build.gradle index 50103c41..bcac359e 100644 --- a/code/processes/crawling-process/model/build.gradle +++ b/code/processes/crawling-process/model/build.gradle @@ -20,7 +20,6 @@ dependencies { implementation project(':code:common:model') implementation project(':code:common:db') implementation project(':code:common:config') - implementation project(':code:common:process') implementation project(':code:index:api') implementation project(':code:processes:crawling-process:ft-content-type') implementation project(':code:libraries:language-processing') diff --git a/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/ParquetSerializableCrawlDataStream.java b/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/ParquetSerializableCrawlDataStream.java index 703a70af..b8dea5bc 100644 --- 
a/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/ParquetSerializableCrawlDataStream.java +++ b/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/ParquetSerializableCrawlDataStream.java @@ -148,10 +148,7 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial "", nextRecord.headers, bodyString, - Long.toHexString(hash.hashNearlyASCII(bodyString)), // this field isn't actually used, maybe we can skip calculating it? - nextRecord.url, - null, - "", + // this field isn't actually used, maybe we can skip calculating it? nextRecord.cookies, lastModified, etag)); diff --git a/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDocument.java b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDocument.java index 9d01d551..656e4b0f 100644 --- a/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDocument.java +++ b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDocument.java @@ -4,7 +4,7 @@ import nu.marginalia.model.EdgeUrl; import org.apache.commons.lang3.StringUtils; import org.jetbrains.annotations.Nullable; -public class CrawledDocument implements SerializableCrawlData { +public final class CrawledDocument implements SerializableCrawlData { public String crawlId; public String url; @@ -21,16 +21,6 @@ public class CrawledDocument implements SerializableCrawlData { public String documentBody; - @Deprecated - public String documentBodyHash; - - @Deprecated - public String canonicalUrl; - public String redirectUrl; - - @Deprecated - public String recrawlState; - /** * This is not guaranteed to be set in all versions of the format, * information may come in CrawledDomain instead @@ -40,7 +30,7 @@ public class CrawledDocument implements SerializableCrawlData { public String lastModifiedMaybe; public String etagMaybe; - public CrawledDocument(String crawlId, String url, 
String contentType, String timestamp, int httpStatus, String crawlerStatus, String crawlerStatusDesc, @Nullable String headers, String documentBody, String documentBodyHash, String canonicalUrl, String redirectUrl, String recrawlState, Boolean hasCookies, String lastModifiedMaybe, String etagMaybe) { + public CrawledDocument(String crawlId, String url, String contentType, String timestamp, int httpStatus, String crawlerStatus, String crawlerStatusDesc, @Nullable String headers, String documentBody, Boolean hasCookies, String lastModifiedMaybe, String etagMaybe) { this.crawlId = crawlId; this.url = url; this.contentType = contentType; @@ -50,10 +40,6 @@ public class CrawledDocument implements SerializableCrawlData { this.crawlerStatusDesc = crawlerStatusDesc; this.headers = headers; this.documentBody = documentBody; - this.documentBodyHash = documentBodyHash; - this.canonicalUrl = canonicalUrl; - this.redirectUrl = redirectUrl; - this.recrawlState = recrawlState; this.hasCookies = hasCookies; this.lastModifiedMaybe = lastModifiedMaybe; this.etagMaybe = etagMaybe; @@ -120,7 +106,7 @@ public class CrawledDocument implements SerializableCrawlData { } public String toString() { - return "CrawledDocument(crawlId=" + this.crawlId + ", url=" + this.url + ", contentType=" + this.contentType + ", timestamp=" + this.timestamp + ", httpStatus=" + this.httpStatus + ", crawlerStatus=" + this.crawlerStatus + ", crawlerStatusDesc=" + this.crawlerStatusDesc + ", headers=" + this.headers + ", documentBody=" + this.documentBody + ", documentBodyHash=" + this.documentBodyHash + ", canonicalUrl=" + this.canonicalUrl + ", redirectUrl=" + this.redirectUrl + ", recrawlState=" + this.recrawlState + ", hasCookies=" + this.hasCookies + ", lastModifiedMaybe=" + this.lastModifiedMaybe + ", etagMaybe=" + this.etagMaybe + ")"; + return "CrawledDocument(crawlId=" + this.crawlId + ", url=" + this.url + ", contentType=" + this.contentType + ", timestamp=" + this.timestamp + ", httpStatus=" + 
this.httpStatus + ", crawlerStatus=" + this.crawlerStatus + ", crawlerStatusDesc=" + this.crawlerStatusDesc + ", headers=" + this.headers + ", documentBody=" + this.documentBody + ", hasCookies=" + this.hasCookies + ", lastModifiedMaybe=" + this.lastModifiedMaybe + ", etagMaybe=" + this.etagMaybe + ")"; } public static class CrawledDocumentBuilder { @@ -133,9 +119,6 @@ public class CrawledDocument implements SerializableCrawlData { private String crawlerStatusDesc; private @Nullable String headers; private String documentBody; - private String documentBodyHash; - private String canonicalUrl; - private String redirectUrl; private String recrawlState; private Boolean hasCookies; private String lastModifiedMaybe; @@ -189,23 +172,6 @@ public class CrawledDocument implements SerializableCrawlData { return this; } - @Deprecated - public CrawledDocumentBuilder documentBodyHash(String documentBodyHash) { - this.documentBodyHash = documentBodyHash; - return this; - } - - @Deprecated - public CrawledDocumentBuilder canonicalUrl(String canonicalUrl) { - this.canonicalUrl = canonicalUrl; - return this; - } - - public CrawledDocumentBuilder redirectUrl(String redirectUrl) { - this.redirectUrl = redirectUrl; - return this; - } - @Deprecated public CrawledDocumentBuilder recrawlState(String recrawlState) { this.recrawlState = recrawlState; @@ -228,11 +194,11 @@ public class CrawledDocument implements SerializableCrawlData { } public CrawledDocument build() { - return new CrawledDocument(this.crawlId, this.url, this.contentType, this.timestamp, this.httpStatus, this.crawlerStatus, this.crawlerStatusDesc, this.headers, this.documentBody, this.documentBodyHash, this.canonicalUrl, this.redirectUrl, this.recrawlState, this.hasCookies, this.lastModifiedMaybe, this.etagMaybe); + return new CrawledDocument(this.crawlId, this.url, this.contentType, this.timestamp, this.httpStatus, this.crawlerStatus, this.crawlerStatusDesc, this.headers, this.documentBody, this.hasCookies, 
this.lastModifiedMaybe, this.etagMaybe); } public String toString() { - return "CrawledDocument.CrawledDocumentBuilder(crawlId=" + this.crawlId + ", url=" + this.url + ", contentType=" + this.contentType + ", timestamp=" + this.timestamp + ", httpStatus=" + this.httpStatus + ", crawlerStatus=" + this.crawlerStatus + ", crawlerStatusDesc=" + this.crawlerStatusDesc + ", headers=" + this.headers + ", documentBody=" + this.documentBody + ", documentBodyHash=" + this.documentBodyHash + ", canonicalUrl=" + this.canonicalUrl + ", redirectUrl=" + this.redirectUrl + ", recrawlState=" + this.recrawlState + ", hasCookies=" + this.hasCookies + ", lastModifiedMaybe=" + this.lastModifiedMaybe + ", etagMaybe=" + this.etagMaybe + ")"; + return "CrawledDocument.CrawledDocumentBuilder(crawlId=" + this.crawlId + ", url=" + this.url + ", contentType=" + this.contentType + ", timestamp=" + this.timestamp + ", httpStatus=" + this.httpStatus + ", crawlerStatus=" + this.crawlerStatus + ", crawlerStatusDesc=" + this.crawlerStatusDesc + ", headers=" + this.headers + ", documentBody=" + this.documentBody + ", recrawlState=" + this.recrawlState + ", hasCookies=" + this.hasCookies + ", lastModifiedMaybe=" + this.lastModifiedMaybe + ", etagMaybe=" + this.etagMaybe + ")"; } } } diff --git a/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDomain.java b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDomain.java index c3005aee..33addc99 100644 --- a/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDomain.java +++ b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDomain.java @@ -1,8 +1,9 @@ package nu.marginalia.model.crawldata; import java.util.List; +import java.util.Objects; -public class CrawledDomain implements SerializableCrawlData { +public final class CrawledDomain implements SerializableCrawlData { public String domain; public String redirectDomain; @@ -11,6 +12,7 
@@ public class CrawledDomain implements SerializableCrawlData { public String crawlerStatusDesc; public String ip; + @Deprecated // This used to be populated, but is no longer public List doc; /** @@ -29,15 +31,6 @@ public class CrawledDomain implements SerializableCrawlData { this.cookies = cookies; } - public static CrawledDomainBuilder builder() { - return new CrawledDomainBuilder(); - } - - public int size() { - if (doc == null) return 0; - return doc.size(); - } - public String getDomain() { return this.domain; } @@ -94,119 +87,26 @@ public class CrawledDomain implements SerializableCrawlData { this.cookies = cookies; } - public boolean equals(final Object o) { - if (o == this) return true; - if (!(o instanceof CrawledDomain)) return false; - final CrawledDomain other = (CrawledDomain) o; - if (!other.canEqual((Object) this)) return false; - final Object this$domain = this.getDomain(); - final Object other$domain = other.getDomain(); - if (this$domain == null ? other$domain != null : !this$domain.equals(other$domain)) return false; - final Object this$redirectDomain = this.getRedirectDomain(); - final Object other$redirectDomain = other.getRedirectDomain(); - if (this$redirectDomain == null ? other$redirectDomain != null : !this$redirectDomain.equals(other$redirectDomain)) - return false; - final Object this$crawlerStatus = this.getCrawlerStatus(); - final Object other$crawlerStatus = other.getCrawlerStatus(); - if (this$crawlerStatus == null ? other$crawlerStatus != null : !this$crawlerStatus.equals(other$crawlerStatus)) - return false; - final Object this$crawlerStatusDesc = this.getCrawlerStatusDesc(); - final Object other$crawlerStatusDesc = other.getCrawlerStatusDesc(); - if (this$crawlerStatusDesc == null ? other$crawlerStatusDesc != null : !this$crawlerStatusDesc.equals(other$crawlerStatusDesc)) - return false; - final Object this$ip = this.getIp(); - final Object other$ip = other.getIp(); - if (this$ip == null ? 
other$ip != null : !this$ip.equals(other$ip)) return false; - final Object this$doc = this.getDoc(); - final Object other$doc = other.getDoc(); - if (this$doc == null ? other$doc != null : !this$doc.equals(other$doc)) return false; - final Object this$cookies = this.getCookies(); - final Object other$cookies = other.getCookies(); - if (this$cookies == null ? other$cookies != null : !this$cookies.equals(other$cookies)) return false; - return true; - } - - protected boolean canEqual(final Object other) { - return other instanceof CrawledDomain; + @Override + public boolean equals(Object o) { + if (!(o instanceof CrawledDomain that)) return false; + + return Objects.equals(domain, that.domain) && Objects.equals(redirectDomain, that.redirectDomain) && Objects.equals(crawlerStatus, that.crawlerStatus) && Objects.equals(crawlerStatusDesc, that.crawlerStatusDesc) && Objects.equals(ip, that.ip) && Objects.equals(doc, that.doc) && Objects.equals(cookies, that.cookies); } + @Override public int hashCode() { - final int PRIME = 59; - int result = 1; - final Object $domain = this.getDomain(); - result = result * PRIME + ($domain == null ? 43 : $domain.hashCode()); - final Object $redirectDomain = this.getRedirectDomain(); - result = result * PRIME + ($redirectDomain == null ? 43 : $redirectDomain.hashCode()); - final Object $crawlerStatus = this.getCrawlerStatus(); - result = result * PRIME + ($crawlerStatus == null ? 43 : $crawlerStatus.hashCode()); - final Object $crawlerStatusDesc = this.getCrawlerStatusDesc(); - result = result * PRIME + ($crawlerStatusDesc == null ? 43 : $crawlerStatusDesc.hashCode()); - final Object $ip = this.getIp(); - result = result * PRIME + ($ip == null ? 43 : $ip.hashCode()); - final Object $doc = this.getDoc(); - result = result * PRIME + ($doc == null ? 43 : $doc.hashCode()); - final Object $cookies = this.getCookies(); - result = result * PRIME + ($cookies == null ? 
43 : $cookies.hashCode()); + int result = Objects.hashCode(domain); + result = 31 * result + Objects.hashCode(redirectDomain); + result = 31 * result + Objects.hashCode(crawlerStatus); + result = 31 * result + Objects.hashCode(crawlerStatusDesc); + result = 31 * result + Objects.hashCode(ip); + result = 31 * result + Objects.hashCode(doc); + result = 31 * result + Objects.hashCode(cookies); return result; } public String toString() { return "CrawledDomain(domain=" + this.getDomain() + ", redirectDomain=" + this.getRedirectDomain() + ", crawlerStatus=" + this.getCrawlerStatus() + ", crawlerStatusDesc=" + this.getCrawlerStatusDesc() + ", ip=" + this.getIp() + ", doc=" + this.getDoc() + ", cookies=" + this.getCookies() + ")"; } - - public static class CrawledDomainBuilder { - private String domain; - private String redirectDomain; - private String crawlerStatus; - private String crawlerStatusDesc; - private String ip; - private List doc; - private List cookies; - - CrawledDomainBuilder() { - } - - public CrawledDomainBuilder domain(String domain) { - this.domain = domain; - return this; - } - - public CrawledDomainBuilder redirectDomain(String redirectDomain) { - this.redirectDomain = redirectDomain; - return this; - } - - public CrawledDomainBuilder crawlerStatus(String crawlerStatus) { - this.crawlerStatus = crawlerStatus; - return this; - } - - public CrawledDomainBuilder crawlerStatusDesc(String crawlerStatusDesc) { - this.crawlerStatusDesc = crawlerStatusDesc; - return this; - } - - public CrawledDomainBuilder ip(String ip) { - this.ip = ip; - return this; - } - - public CrawledDomainBuilder doc(List doc) { - this.doc = doc; - return this; - } - - public CrawledDomainBuilder cookies(List cookies) { - this.cookies = cookies; - return this; - } - - public CrawledDomain build() { - return new CrawledDomain(this.domain, this.redirectDomain, this.crawlerStatus, this.crawlerStatusDesc, this.ip, this.doc, this.cookies); - } - - public String toString() { - return 
"CrawledDomain.CrawledDomainBuilder(domain=" + this.domain + ", redirectDomain=" + this.redirectDomain + ", crawlerStatus=" + this.crawlerStatus + ", crawlerStatusDesc=" + this.crawlerStatusDesc + ", ip=" + this.ip + ", doc=" + this.doc + ", cookies=" + this.cookies + ")"; - } - } } diff --git a/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/SerializableCrawlData.java b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/SerializableCrawlData.java index 58d25dea..dfb26597 100644 --- a/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/SerializableCrawlData.java +++ b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/SerializableCrawlData.java @@ -1,5 +1,5 @@ package nu.marginalia.model.crawldata; -public interface SerializableCrawlData { +public sealed interface SerializableCrawlData permits CrawledDocument, CrawledDomain { String getDomain(); } diff --git a/code/execution/data-extractors/build.gradle b/code/processes/export-task-process/build.gradle similarity index 57% rename from code/execution/data-extractors/build.gradle rename to code/processes/export-task-process/build.gradle index 2a0c08c6..11df9ee1 100644 --- a/code/execution/data-extractors/build.gradle +++ b/code/processes/export-task-process/build.gradle @@ -1,24 +1,33 @@ plugins { id 'java' - - id "de.undercouch.download" version "5.1.0" - + id 'application' id 'jvm-test-suite' } - java { toolchain { languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } +application { + mainClass = 'nu.marginalia.task.ExportTaskMain' + applicationName = 'export-task-process' +} + +tasks.distZip.enabled = false + apply from: "$rootProject.projectDir/srcsets.gradle" dependencies { - implementation project(':code:common:config') - implementation project(':code:common:process') implementation project(':code:common:model') + implementation project(':code:common:db') + implementation 
project(':code:common:service') + implementation project(':code:common:config') + implementation project(':code:libraries:message-queue') + + implementation project(':code:functions:link-graph:api') + implementation project(':code:processes:process-mq-api') implementation project(':code:libraries:language-processing') implementation project(':code:libraries:term-frequency-dict') implementation project(':code:libraries:blocking-thread-pool') @@ -28,20 +37,32 @@ dependencies { implementation project(':code:processes:converting-process') implementation project(':third-party:commons-codec') - implementation libs.bundles.slf4j + implementation libs.guava implementation dependencies.create(libs.guice.get()) { exclude group: 'com.google.guava' } + implementation libs.roaringbitmap implementation libs.trove + implementation libs.fastutil + implementation libs.bundles.mariadb + implementation libs.gson implementation libs.commons.lang3 + implementation libs.commons.io implementation libs.commons.compress - implementation libs.notnull + implementation libs.commons.codec implementation libs.jsoup + + testImplementation libs.bundles.slf4j.test testImplementation libs.bundles.junit testImplementation libs.mockito -} + testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4') + testImplementation libs.commons.codec + testImplementation 'org.testcontainers:mariadb:1.17.4' + testImplementation 'org.testcontainers:junit-jupiter:1.17.4' + testImplementation project(':code:libraries:test-helpers') +} diff --git a/code/processes/website-adjacencies-calculator/java/nu/marginalia/adjacencies/AdjacenciesData.java b/code/processes/export-task-process/java/nu/marginalia/adjacencies/AdjacenciesData.java similarity index 100% rename from code/processes/website-adjacencies-calculator/java/nu/marginalia/adjacencies/AdjacenciesData.java rename to code/processes/export-task-process/java/nu/marginalia/adjacencies/AdjacenciesData.java diff --git 
a/code/processes/website-adjacencies-calculator/java/nu/marginalia/adjacencies/AdjacenciesLoader.java b/code/processes/export-task-process/java/nu/marginalia/adjacencies/AdjacenciesLoader.java similarity index 100% rename from code/processes/website-adjacencies-calculator/java/nu/marginalia/adjacencies/AdjacenciesLoader.java rename to code/processes/export-task-process/java/nu/marginalia/adjacencies/AdjacenciesLoader.java diff --git a/code/processes/website-adjacencies-calculator/java/nu/marginalia/adjacencies/DomainAliases.java b/code/processes/export-task-process/java/nu/marginalia/adjacencies/DomainAliases.java similarity index 100% rename from code/processes/website-adjacencies-calculator/java/nu/marginalia/adjacencies/DomainAliases.java rename to code/processes/export-task-process/java/nu/marginalia/adjacencies/DomainAliases.java diff --git a/code/processes/website-adjacencies-calculator/java/nu/marginalia/adjacencies/SparseBitVector.java b/code/processes/export-task-process/java/nu/marginalia/adjacencies/SparseBitVector.java similarity index 100% rename from code/processes/website-adjacencies-calculator/java/nu/marginalia/adjacencies/SparseBitVector.java rename to code/processes/export-task-process/java/nu/marginalia/adjacencies/SparseBitVector.java diff --git a/code/processes/export-task-process/java/nu/marginalia/adjacencies/WebsiteAdjacenciesCalculator.java b/code/processes/export-task-process/java/nu/marginalia/adjacencies/WebsiteAdjacenciesCalculator.java new file mode 100644 index 00000000..c6ce4e91 --- /dev/null +++ b/code/processes/export-task-process/java/nu/marginalia/adjacencies/WebsiteAdjacenciesCalculator.java @@ -0,0 +1,127 @@ +package nu.marginalia.adjacencies; + +import com.google.inject.Inject; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.api.linkgraph.AggregateLinkGraphClient; +import nu.marginalia.process.ProcessConfiguration; +import nu.marginalia.process.control.ProcessHeartbeatImpl; +import org.slf4j.Logger; +import 
org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; +import java.util.concurrent.Executors; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.Consumer; +import java.util.stream.IntStream; + +import static nu.marginalia.adjacencies.SparseBitVector.andCardinality; +import static nu.marginalia.adjacencies.SparseBitVector.weightedProduct; + +public class WebsiteAdjacenciesCalculator { + private final AggregateLinkGraphClient domainLinksClient; + private final ProcessConfiguration configuration; + private final HikariDataSource dataSource; + public AdjacenciesData adjacenciesData; + public DomainAliases domainAliases; + private static final Logger logger = LoggerFactory.getLogger(WebsiteAdjacenciesCalculator.class); + + float[] weights; + + @Inject + public WebsiteAdjacenciesCalculator(AggregateLinkGraphClient domainLinksClient, + ProcessConfiguration configuration, + HikariDataSource dataSource) { + this.domainLinksClient = domainLinksClient; + this.configuration = configuration; + this.dataSource = dataSource; + } + + public void export() throws Exception { + try (var processHeartbeat = new ProcessHeartbeatImpl(configuration, dataSource)) { + domainAliases = new DomainAliases(dataSource); + adjacenciesData = new AdjacenciesData(domainLinksClient, domainAliases); + weights = adjacenciesData.getWeights(); + + AdjacenciesLoader loader = new AdjacenciesLoader(dataSource); + var executor = Executors.newFixedThreadPool(16); + + int total = adjacenciesData.getIdsList().size(); + AtomicInteger progress = new AtomicInteger(0); + IntStream.of(adjacenciesData.getIdsList().toArray()).parallel() + .filter(domainAliases::isNotAliased) + .forEach(id -> { + findAdjacent(id, loader::load); + processHeartbeat.setProgress(progress.incrementAndGet() / (double) total); + }); + + executor.shutdown(); + System.out.println("Waiting for wrap-up"); + loader.stop(); + } + } + + public void findAdjacent(int 
domainId, Consumer andThen) { + findAdjacentDtoS(domainId, andThen); + } + + double cosineSimilarity(SparseBitVector a, SparseBitVector b) { + double andCardinality = andCardinality(a, b); + andCardinality /= Math.sqrt(a.getCardinality()); + andCardinality /= Math.sqrt(b.getCardinality()); + return andCardinality; + } + + double expensiveCosineSimilarity(SparseBitVector a, SparseBitVector b) { + return weightedProduct(weights, a, b) / Math.sqrt(a.mulAndSum(weights) * b.mulAndSum(weights)); + } + + public record DomainSimilarities(int domainId, List similarities) {} + + public record DomainSimilarity(int domainId, double value) {} + + private void findAdjacentDtoS(int domainId, Consumer andThen) { + var vector = adjacenciesData.getVector(domainId); + + if (vector == null || !vector.cardinalityExceeds(10)) { + return; + } + + List similarities = new ArrayList<>(1000); + + var items = adjacenciesData.getCandidates(vector); + + + int cardMin = Math.max(2, (int) (0.01 * vector.getCardinality())); + + items.forEach(id -> { + var otherVec = adjacenciesData.getVector(id); + + if (null == otherVec || otherVec == vector) + return true; + + if (otherVec.getCardinality() < cardMin) + return true; + + double similarity = cosineSimilarity(vector, otherVec); + if (similarity > 0.1) { + var recalculated = expensiveCosineSimilarity(vector, otherVec); + if (recalculated > 0.1) { + similarities.add(new DomainSimilarity(id, recalculated)); + } + } + + return true; + }); + + if (similarities.size() > 128) { + similarities.sort(Comparator.comparing(DomainSimilarity::value)); + similarities.subList(0, similarities.size() - 128).clear(); + } + + + andThen.accept(new DomainSimilarities(domainId, similarities)); + } + +} diff --git a/code/execution/data-extractors/java/nu/marginalia/extractor/AtagExporter.java b/code/processes/export-task-process/java/nu/marginalia/extractor/AtagExporter.java similarity index 100% rename from 
code/execution/data-extractors/java/nu/marginalia/extractor/AtagExporter.java rename to code/processes/export-task-process/java/nu/marginalia/extractor/AtagExporter.java diff --git a/code/execution/data-extractors/java/nu/marginalia/extractor/ExporterIf.java b/code/processes/export-task-process/java/nu/marginalia/extractor/ExporterIf.java similarity index 100% rename from code/execution/data-extractors/java/nu/marginalia/extractor/ExporterIf.java rename to code/processes/export-task-process/java/nu/marginalia/extractor/ExporterIf.java diff --git a/code/execution/data-extractors/java/nu/marginalia/extractor/FeedExporter.java b/code/processes/export-task-process/java/nu/marginalia/extractor/FeedExporter.java similarity index 100% rename from code/execution/data-extractors/java/nu/marginalia/extractor/FeedExporter.java rename to code/processes/export-task-process/java/nu/marginalia/extractor/FeedExporter.java diff --git a/code/execution/data-extractors/java/nu/marginalia/extractor/SampleDataExporter.java b/code/processes/export-task-process/java/nu/marginalia/extractor/SampleDataExporter.java similarity index 100% rename from code/execution/data-extractors/java/nu/marginalia/extractor/SampleDataExporter.java rename to code/processes/export-task-process/java/nu/marginalia/extractor/SampleDataExporter.java diff --git a/code/execution/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java b/code/processes/export-task-process/java/nu/marginalia/extractor/TermFrequencyExporter.java similarity index 100% rename from code/execution/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java rename to code/processes/export-task-process/java/nu/marginalia/extractor/TermFrequencyExporter.java diff --git a/code/processes/export-task-process/java/nu/marginalia/task/ExportTasksMain.java b/code/processes/export-task-process/java/nu/marginalia/task/ExportTasksMain.java new file mode 100644 index 00000000..c5d7ed12 --- /dev/null +++ 
b/code/processes/export-task-process/java/nu/marginalia/task/ExportTasksMain.java @@ -0,0 +1,82 @@ +package nu.marginalia.task; + +import com.google.gson.Gson; +import com.google.inject.Guice; +import com.google.inject.Inject; +import nu.marginalia.adjacencies.WebsiteAdjacenciesCalculator; +import nu.marginalia.extractor.AtagExporter; +import nu.marginalia.extractor.FeedExporter; +import nu.marginalia.extractor.SampleDataExporter; +import nu.marginalia.extractor.TermFrequencyExporter; +import nu.marginalia.mq.MessageQueueFactory; +import nu.marginalia.mqapi.ProcessInboxNames; +import nu.marginalia.mqapi.tasks.ExportTaskRequest; +import nu.marginalia.process.ProcessConfiguration; +import nu.marginalia.process.ProcessConfigurationModule; +import nu.marginalia.process.ProcessMainClass; +import nu.marginalia.service.module.DatabaseModule; +import nu.marginalia.service.module.ServiceDiscoveryModule; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class ExportTasksMain extends ProcessMainClass { + + private static final Logger logger = LoggerFactory.getLogger(ExportTasksMain.class); + + private final AtagExporter atagExporter; + private final FeedExporter feedExporter; + private final SampleDataExporter sampleDataExporter; + private final TermFrequencyExporter termFrequencyExporter; + private final WebsiteAdjacenciesCalculator websiteAdjacenciesCalculator; + + public static void main(String[] args) throws Exception { + + var injector = Guice.createInjector( + new ServiceDiscoveryModule(), + new ProcessConfigurationModule("export-tasks"), + new DatabaseModule(false) + ); + + var exportTasks = injector.getInstance(ExportTasksMain.class); + + Instructions instructions = exportTasks.fetchInstructions(ExportTaskRequest.class); + try { + exportTasks.run(instructions.value()); + instructions.ok(); + } + catch (Exception e) { + logger.error("Error running export task", e); + instructions.err(); + } + + } + + @Inject + public 
ExportTasksMain(MessageQueueFactory messageQueueFactory, + ProcessConfiguration config, + AtagExporter atagExporter, + FeedExporter feedExporter, + SampleDataExporter sampleDataExporter, + TermFrequencyExporter termFrequencyExporter, + Gson gson, WebsiteAdjacenciesCalculator websiteAdjacenciesCalculator) + { + super(messageQueueFactory, config, gson, ProcessInboxNames.EXPORT_TASK_INBOX); + this.atagExporter = atagExporter; + this.feedExporter = feedExporter; + this.sampleDataExporter = sampleDataExporter; + this.termFrequencyExporter = termFrequencyExporter; + this.websiteAdjacenciesCalculator = websiteAdjacenciesCalculator; + } + + private void run(ExportTaskRequest request) throws Exception { + switch (request.task) { + case ATAGS: atagExporter.export(request.crawlId, request.destId); break; + case FEEDS: feedExporter.export(request.crawlId, request.destId); break; + case TERM_FREQ: termFrequencyExporter.export(request.crawlId, request.destId); break; + case SAMPLE_DATA: sampleDataExporter.export(request.crawlId, request.destId, request.size, request.name); break; + case ADJACENCIES: websiteAdjacenciesCalculator.export(); break; + } + } + + +} diff --git a/code/processes/website-adjacencies-calculator/test/nu/marginalia/adjacencies/AdjacenciesLoaderTest.java b/code/processes/export-task-process/test/nu/marginalia/adjacencies/AdjacenciesLoaderTest.java similarity index 100% rename from code/processes/website-adjacencies-calculator/test/nu/marginalia/adjacencies/AdjacenciesLoaderTest.java rename to code/processes/export-task-process/test/nu/marginalia/adjacencies/AdjacenciesLoaderTest.java diff --git a/code/processes/website-adjacencies-calculator/test/nu/marginalia/adjacencies/SparseBitVectorTest.java b/code/processes/export-task-process/test/nu/marginalia/adjacencies/SparseBitVectorTest.java similarity index 100% rename from code/processes/website-adjacencies-calculator/test/nu/marginalia/adjacencies/SparseBitVectorTest.java rename to 
code/processes/export-task-process/test/nu/marginalia/adjacencies/SparseBitVectorTest.java diff --git a/code/processes/index-constructor-process/build.gradle b/code/processes/index-constructor-process/build.gradle index 6de7e773..04416bc6 100644 --- a/code/processes/index-constructor-process/build.gradle +++ b/code/processes/index-constructor-process/build.gradle @@ -22,7 +22,6 @@ apply from: "$rootProject.projectDir/srcsets.gradle" dependencies { implementation project(':code:processes:process-mq-api') - implementation project(':code:common:process') implementation project(':code:common:service') implementation project(':code:common:db') implementation project(':code:common:config') diff --git a/code/processes/index-constructor-process/java/nu/marginalia/index/IndexConstructorMain.java b/code/processes/index-constructor-process/java/nu/marginalia/index/IndexConstructorMain.java index ef93b554..470b1694 100644 --- a/code/processes/index-constructor-process/java/nu/marginalia/index/IndexConstructorMain.java +++ b/code/processes/index-constructor-process/java/nu/marginalia/index/IndexConstructorMain.java @@ -1,11 +1,8 @@ package nu.marginalia.index; -import com.google.gson.Gson; import com.google.inject.Guice; import com.google.inject.Inject; import nu.marginalia.IndexLocations; -import nu.marginalia.ProcessConfiguration; -import nu.marginalia.ProcessConfigurationModule; import nu.marginalia.index.construction.full.FullIndexConstructor; import nu.marginalia.index.construction.prio.PrioIndexConstructor; import nu.marginalia.index.domainrankings.DomainRankings; @@ -15,13 +12,11 @@ import nu.marginalia.index.journal.IndexJournal; import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.mq.MessageQueueFactory; -import nu.marginalia.mq.MqMessage; -import nu.marginalia.mq.inbox.MqInboxResponse; -import nu.marginalia.mq.inbox.MqSingleShotInbox; import nu.marginalia.mqapi.index.CreateIndexRequest; -import 
nu.marginalia.mqapi.index.IndexName; +import nu.marginalia.process.ProcessConfiguration; +import nu.marginalia.process.ProcessConfigurationModule; +import nu.marginalia.process.ProcessMainClass; import nu.marginalia.process.control.ProcessHeartbeatImpl; -import nu.marginalia.service.ProcessMainClass; import nu.marginalia.service.module.DatabaseModule; import nu.marginalia.storage.FileStorageService; import org.slf4j.Logger; @@ -31,8 +26,6 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.sql.SQLException; -import java.util.Optional; -import java.util.UUID; import java.util.concurrent.TimeUnit; import static nu.marginalia.mqapi.ProcessInboxNames.INDEX_CONSTRUCTOR_INBOX; @@ -40,15 +33,12 @@ import static nu.marginalia.mqapi.ProcessInboxNames.INDEX_CONSTRUCTOR_INBOX; public class IndexConstructorMain extends ProcessMainClass { private final FileStorageService fileStorageService; private final ProcessHeartbeatImpl heartbeat; - private final MessageQueueFactory messageQueueFactory; private final DomainRankings domainRankings; - private final int node; private static final Logger logger = LoggerFactory.getLogger(IndexConstructorMain.class); - private final Gson gson = GsonFactory.get(); - public static void main(String[] args) throws Exception { - CreateIndexInstructions instructions = null; + public static void main(String[] args) throws Exception { + Instructions instructions = null; try { new org.mariadb.jdbc.Driver(); @@ -58,9 +48,8 @@ public class IndexConstructorMain extends ProcessMainClass { new DatabaseModule(false)) .getInstance(IndexConstructorMain.class); - instructions = main.fetchInstructions(); - - main.run(instructions); + instructions = main.fetchInstructions(CreateIndexRequest.class); + main.run(instructions.value()); instructions.ok(); } catch (Exception ex) { @@ -85,17 +74,17 @@ public class IndexConstructorMain extends ProcessMainClass { ProcessConfiguration processConfiguration, DomainRankings 
domainRankings) { + super(messageQueueFactory, processConfiguration, GsonFactory.get(), INDEX_CONSTRUCTOR_INBOX); + this.fileStorageService = fileStorageService; this.heartbeat = heartbeat; - this.messageQueueFactory = messageQueueFactory; this.domainRankings = domainRankings; - this.node = processConfiguration.node(); } - private void run(CreateIndexInstructions instructions) throws SQLException, IOException { + private void run(CreateIndexRequest instructions) throws SQLException, IOException { heartbeat.start(); - switch (instructions.name) { + switch (instructions.indexName()) { case FORWARD -> createForwardIndex(); case REVERSE_FULL -> createFullReverseIndex(); case REVERSE_PRIO -> createPrioReverseIndex(); @@ -171,52 +160,4 @@ public class IndexConstructorMain extends ProcessMainClass { docId); } - private static class CreateIndexInstructions { - - public final IndexName name; - private final MqSingleShotInbox inbox; - private final MqMessage message; - - private CreateIndexInstructions(IndexName name, MqSingleShotInbox inbox, MqMessage message) { - this.name = name; - this.inbox = inbox; - this.message = message; - } - - public void ok() { - inbox.sendResponse(message, MqInboxResponse.ok()); - } - public void err() { - inbox.sendResponse(message, MqInboxResponse.err()); - } - } - - private CreateIndexInstructions fetchInstructions() throws Exception { - - var inbox = messageQueueFactory.createSingleShotInbox(INDEX_CONSTRUCTOR_INBOX, node, UUID.randomUUID()); - - logger.info("Waiting for instructions"); - var msgOpt = getMessage(inbox, CreateIndexRequest.class.getSimpleName()); - var msg = msgOpt.orElseThrow(() -> new RuntimeException("No message received")); - - var payload = gson.fromJson(msg.payload(), CreateIndexRequest.class); - var name = payload.indexName(); - - return new CreateIndexInstructions(name, inbox, msg); - } - - private Optional getMessage(MqSingleShotInbox inbox, String expectedFunction) throws SQLException, InterruptedException { - var opt 
= inbox.waitForMessage(30, TimeUnit.SECONDS); - if (opt.isPresent()) { - if (!opt.get().function().equals(expectedFunction)) { - throw new RuntimeException("Unexpected function: " + opt.get().function()); - } - return opt; - } - else { - var stolenMessage = inbox.stealMessage(msg -> msg.function().equals(expectedFunction)); - stolenMessage.ifPresent(mqMessage -> logger.info("Stole message {}", mqMessage)); - return stolenMessage; - } - } } diff --git a/code/processes/live-crawling-process/build.gradle b/code/processes/live-crawling-process/build.gradle new file mode 100644 index 00000000..5dad1177 --- /dev/null +++ b/code/processes/live-crawling-process/build.gradle @@ -0,0 +1,77 @@ +plugins { + id 'java' + + id 'application' + id 'jvm-test-suite' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) + } +} + +application { + mainClass = 'nu.marginalia.livecrawler.LiveCrawlerMain' + applicationName = 'live-crawler-process' +} + +tasks.distZip.enabled = false + +apply from: "$rootProject.projectDir/srcsets.gradle" + +dependencies { + + implementation project(':code:common:db') + implementation project(':code:common:model') + implementation project(':code:common:config') + implementation project(':code:common:service') + implementation project(':code:common:linkdb') + implementation project(':code:functions:live-capture:api') + implementation project(':code:libraries:blocking-thread-pool') + implementation project(':code:index:api') + implementation project(':code:processes:process-mq-api') + implementation project(':code:libraries:message-queue') + implementation project(':code:libraries:language-processing') + implementation project(':code:libraries:easy-lsh') + implementation project(':code:processes:crawling-process') + implementation project(':code:processes:crawling-process:model') + implementation project(':code:processes:converting-process') + implementation project(':code:processes:loading-process') + + + 
implementation project(':code:processes:crawling-process:ft-crawl-blocklist') + implementation project(':code:processes:crawling-process:ft-link-parser') + implementation project(':code:processes:crawling-process:ft-content-type') + implementation project(':third-party:commons-codec') + + implementation libs.bundles.slf4j + + implementation libs.notnull + implementation libs.guava + implementation dependencies.create(libs.guice.get()) { + exclude group: 'com.google.guava' + } + implementation libs.gson + implementation libs.zstd + implementation libs.jwarc + implementation libs.crawlercommons + implementation libs.okhttp3 + implementation libs.jsoup + implementation libs.opencsv + implementation libs.fastutil + implementation libs.sqlite + + implementation libs.bundles.mariadb + + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito + testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4') + testImplementation libs.commons.codec + testImplementation 'org.testcontainers:mariadb:1.17.4' + testImplementation 'org.testcontainers:junit-jupiter:1.17.4' + testImplementation project(':code:libraries:test-helpers') +} + diff --git a/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlDataSet.java b/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlDataSet.java new file mode 100644 index 00000000..4324c254 --- /dev/null +++ b/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlDataSet.java @@ -0,0 +1,244 @@ +package nu.marginalia.livecrawler; + +import nu.marginalia.io.SerializableCrawlDataStream; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.crawldata.CrawledDocument; +import nu.marginalia.model.crawldata.CrawledDomain; +import nu.marginalia.model.crawldata.SerializableCrawlData; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import 
java.nio.file.Files; +import java.nio.file.Path; +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.SQLException; +import java.time.Instant; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.zip.GZIPInputStream; +import java.util.zip.GZIPOutputStream; + +/** Data access object for the live crawl database, a simple sqlite file */ +public class LiveCrawlDataSet implements AutoCloseable { + private final Connection connection; + private final Path basePath; + + public LiveCrawlDataSet(Path basePath) throws SQLException { + this.basePath = basePath; + this.connection = DriverManager.getConnection("jdbc:sqlite:" + basePath.resolve("live-crawl-data.db")); + this.connection.setAutoCommit(true); + + try (var stmt = connection.createStatement()) { + stmt.execute("CREATE TABLE IF NOT EXISTS urls (url TEXT PRIMARY KEY, domainId LONG, body BLOB, headers BLOB, ip TEXT, timestamp long)"); + stmt.execute("CREATE INDEX IF NOT EXISTS domainIdIndex ON urls (domainId)"); + stmt.execute("CREATE TABLE IF NOT EXISTS badUrls (url TEXT PRIMARY KEY, timestamp long)"); + } + } + + public Path createWorkDir() throws IOException { + return Files.createTempDirectory(basePath, "work"); + } + + /** Remove entries older than the given timestamp */ + public void prune(Instant cutoff) throws SQLException { + try (var stmt = connection.prepareStatement("DELETE FROM urls WHERE timestamp < ?")) { + stmt.setLong(1, cutoff.toEpochMilli()); + stmt.executeUpdate(); + } + + try (var stmt = connection.prepareStatement("DELETE FROM badUrls WHERE timestamp < ?")) { + stmt.setLong(1, cutoff.toEpochMilli()); + stmt.executeUpdate(); + } + } + + /** Check if the given URL is already in the database */ + public boolean hasUrl(String url) throws SQLException { + try (var stmt = connection.prepareStatement(""" + SELECT 1 FROM urls WHERE urls.url = ? + UNION + SELECT 1 FROM badUrls WHERE badUrls.url = ? 
+ """); + ) { + stmt.setString(1, url); + stmt.setString(2, url); + + return stmt.executeQuery().next(); + } + } + + /** Check if the given URL is already in the database */ + public boolean hasUrl(EdgeUrl url) throws SQLException { + return hasUrl(url.toString()); + } + + /** Save a document to the database */ + public void saveDocument(int domainId, EdgeUrl url, String body, String headers, String ip) throws SQLException, IOException { + try (var stmt = connection.prepareStatement(""" + INSERT OR REPLACE INTO urls (domainId, url, body, headers, ip, timestamp) + VALUES (?, ?, ?, ?, ?, ?) + """)) + { + stmt.setInt(1, domainId); + stmt.setString(2, url.toString()); + stmt.setBytes(3, compress(body)); + stmt.setBytes(4, compress(headers)); + stmt.setString(5, ip); + stmt.setLong(6, Instant.now().toEpochMilli()); + stmt.executeUpdate(); + } + } + + /** Flag a URL as bad, i.e. it should not be revisited */ + public void flagAsBad(EdgeUrl url) { + try (var stmt = connection.prepareStatement(""" + INSERT OR IGNORE INTO badUrls (url, timestamp) + VALUES (?, ?) + """)) + { + stmt.setString(1, url.toString()); + stmt.setLong(2, Instant.now().toEpochMilli()); + stmt.executeUpdate(); + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + } + + private byte[] compress(String data) throws IOException { + // gzip compression + try (var bos = new ByteArrayOutputStream(); + var gzip = new GZIPOutputStream(bos)) + { + gzip.write(data.getBytes()); + gzip.finish(); + return bos.toByteArray(); + } + } + + private String decompress(byte[] data) { + // gzip decompression + try (var bis = new ByteArrayInputStream(data); + var gzip = new GZIPInputStream(bis)) + { + return new String(gzip.readAllBytes()); + } + catch (IOException ex) { + throw new RuntimeException(ex); + } + } + + /** Get the data in the database as a list of SerializableCrawlDataStream's, the + * format expected by the converter code. 
+ */ + public Collection getDataStreams() throws SQLException { + List domainIds = new ArrayList<>(); + + try (var stmt = connection.createStatement()) { + var rs = stmt.executeQuery("SELECT DISTINCT domainId FROM urls"); + while (rs.next()) { + domainIds.add(rs.getInt(1)); + } + } + + List streams = new ArrayList<>(); + for (var domainId : domainIds) { + streams.add(new WrappedDataStream(domainId)); + } + return streams; + } + + /** Wraps the data in the database as a SerializableCrawlDataStream. + *

+ * This is a bit clunky as the interface is built intending the data + * to be a stream of objects being read from Parquet. + * */ + private class WrappedDataStream implements SerializableCrawlDataStream { + private final int domainId; + private ArrayList dataStack; + + WrappedDataStream(int domainId) { + this.domainId = domainId; + this.dataStack = null; + } + + /** Lazy initialization for the data being iterated over */ + private void query() { + try (var stmt = connection.prepareStatement(""" + SELECT url, body, headers, ip, timestamp + FROM urls + WHERE domainId = ? + """)) { + stmt.setInt(1, domainId); + var rs = stmt.executeQuery(); + dataStack = new ArrayList<>(); + while (rs.next()) { + String url = rs.getString("url"); + String body = decompress(rs.getBytes("body")); + String headers = decompress(rs.getBytes("headers")); + + dataStack.add(new CrawledDocument( + "LIVE", + url, + "text/html", + Instant.ofEpochMilli(rs.getLong("timestamp")).toString(), + 200, + "OK", + "", + headers, + body, + false, + "", + "" + )); + } + var last = dataStack.getLast(); + var domain = new CrawledDomain( + last.getDomain(), + null, + "OK", + "", + "0.0.0.0", + List.of(), + List.of() + ); + + // Add the domain as the last element, which will be the first + // element popped from the list + dataStack.addLast(domain); + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + } + + @Override + public SerializableCrawlData next() throws IOException { + if (dataStack == null) + query(); + + return dataStack.removeLast(); + } + + @Override + public boolean hasNext() throws IOException { + if (dataStack == null) { + query(); + } + return !dataStack.isEmpty(); + } + + @Override + public void close() throws Exception { + dataStack.clear(); + } + } + + @Override + public void close() throws Exception { + connection.close(); + } +} diff --git a/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlerMain.java 
b/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlerMain.java new file mode 100644 index 00000000..d05925bb --- /dev/null +++ b/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlerMain.java @@ -0,0 +1,224 @@ +package nu.marginalia.livecrawler; + +import com.google.gson.Gson; +import com.google.inject.Guice; +import com.google.inject.Inject; +import com.google.inject.Injector; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.WmsaHome; +import nu.marginalia.api.feeds.FeedsClient; +import nu.marginalia.converting.ConverterModule; +import nu.marginalia.converting.processor.DomainProcessor; +import nu.marginalia.converting.writer.ConverterBatchWriter; +import nu.marginalia.db.DbDomainQueries; +import nu.marginalia.db.DomainBlacklist; +import nu.marginalia.io.SerializableCrawlDataStream; +import nu.marginalia.loading.LoaderInputData; +import nu.marginalia.loading.documents.DocumentLoaderService; +import nu.marginalia.loading.documents.KeywordLoaderService; +import nu.marginalia.loading.domains.DbDomainIdRegistry; +import nu.marginalia.loading.domains.DomainIdRegistry; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.mq.MessageQueueFactory; +import nu.marginalia.mqapi.crawling.LiveCrawlRequest; +import nu.marginalia.process.ProcessConfiguration; +import nu.marginalia.process.ProcessConfigurationModule; +import nu.marginalia.process.ProcessMainClass; +import nu.marginalia.process.control.ProcessHeartbeat; +import nu.marginalia.service.module.DatabaseModule; +import nu.marginalia.service.module.ServiceDiscoveryModule; +import nu.marginalia.storage.FileStorageService; +import nu.marginalia.storage.model.FileStorageBaseType; +import org.apache.commons.io.FileUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.security.Security; +import java.time.Instant; +import java.time.temporal.ChronoUnit; +import 
java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static nu.marginalia.mqapi.ProcessInboxNames.LIVE_CRAWLER_INBOX; + +public class LiveCrawlerMain extends ProcessMainClass { + + private static final Logger logger = + LoggerFactory.getLogger(LiveCrawlerMain.class); + + private final FeedsClient feedsClient; + private final ProcessHeartbeat heartbeat; + private final DbDomainQueries domainQueries; + private final DomainBlacklist domainBlacklist; + private final DomainProcessor domainProcessor; + private final FileStorageService fileStorageService; + private final KeywordLoaderService keywordLoaderService; + private final DocumentLoaderService documentLoaderService; + private final HikariDataSource dataSource; + + @Inject + public LiveCrawlerMain(FeedsClient feedsClient, + Gson gson, + ProcessConfiguration config, + ProcessHeartbeat heartbeat, + DbDomainQueries domainQueries, + DomainBlacklist domainBlacklist, + MessageQueueFactory messageQueueFactory, + DomainProcessor domainProcessor, + FileStorageService fileStorageService, + KeywordLoaderService keywordLoaderService, + DocumentLoaderService documentLoaderService, HikariDataSource dataSource) + throws Exception + { + super(messageQueueFactory, config, gson, LIVE_CRAWLER_INBOX); + + this.feedsClient = feedsClient; + this.heartbeat = heartbeat; + this.domainQueries = domainQueries; + this.domainBlacklist = domainBlacklist; + this.domainProcessor = domainProcessor; + this.fileStorageService = fileStorageService; + this.keywordLoaderService = keywordLoaderService; + this.documentLoaderService = documentLoaderService; + this.dataSource = dataSource; + + domainBlacklist.waitUntilLoaded(); + } + + public static void main(String... 
args) throws Exception { + + // Prevent Java from caching DNS lookups forever (filling up the system RAM as a result) + Security.setProperty("networkaddress.cache.ttl", "3600"); + + // This must run *early* + System.setProperty("http.agent", WmsaHome.getUserAgent().uaString()); + + // If these aren't set properly, the JVM will hang forever on some requests + System.setProperty("sun.net.client.defaultConnectTimeout", "30000"); + System.setProperty("sun.net.client.defaultReadTimeout", "30000"); + + // We don't want to use too much memory caching sessions for https + System.setProperty("javax.net.ssl.sessionCacheSize", "2048"); + + try { + Injector injector = Guice.createInjector( + new LiveCrawlerModule(), + new ProcessConfigurationModule("crawler"), + new ConverterModule(), + new ServiceDiscoveryModule(), + new DatabaseModule(false) + ); + + var crawler = injector.getInstance(LiveCrawlerMain.class); + Instructions instructions = crawler.fetchInstructions(LiveCrawlRequest.class); + + try{ + crawler.run(); + instructions.ok(); + } catch (Exception e) { + instructions.err(); + throw e; + } + + } catch (Exception e) { + logger.error("LiveCrawler failed", e); + + System.exit(1); + } + System.exit(0); + } + + enum LiveCrawlState { + PRUNE_DB, + FETCH_LINKS, + CRAWLING, + PROCESSING, + LOADING, + CONSTRUCTING, + DONE + } + + private void run() throws Exception { + Path basePath = fileStorageService.getStorageBase(FileStorageBaseType.STORAGE).asPath().resolve("live-crawl-data"); + + if (!Files.isDirectory(basePath)) { + Files.createDirectories(basePath); + } + + run(basePath); + } + + private void run(Path basePath) throws Exception { + try (var processHeartbeat = heartbeat.createProcessTaskHeartbeat(LiveCrawlState.class, "LiveCrawler"); + LiveCrawlDataSet dataSet = new LiveCrawlDataSet(basePath)) + { + final Instant cutoff = Instant.now().minus(60, ChronoUnit.DAYS); + + processHeartbeat.progress(LiveCrawlState.FETCH_LINKS); + + Map> urlsPerDomain = new HashMap<>(10_000); + 
feedsClient.getUpdatedDomains(cutoff, urlsPerDomain::put); + + logger.info("Fetched data for {} domains", urlsPerDomain.size()); + + processHeartbeat.progress(LiveCrawlState.PRUNE_DB); + + // Remove data that is too old + dataSet.prune(cutoff); + + processHeartbeat.progress(LiveCrawlState.CRAWLING); + + try (SimpleLinkScraper fetcher = new SimpleLinkScraper(dataSet, domainQueries, domainBlacklist); + var hb = heartbeat.createAdHocTaskHeartbeat("Live Crawling")) + { + for (Map.Entry> entry : hb.wrap("Fetching", urlsPerDomain.entrySet())) { + EdgeDomain domain = new EdgeDomain(entry.getKey()); + List urls = entry.getValue(); + + fetcher.scheduleRetrieval(domain, urls); + } + } + + Path tempPath = dataSet.createWorkDir(); + + try { + processHeartbeat.progress(LiveCrawlState.PROCESSING); + + try (var hb = heartbeat.createAdHocTaskHeartbeat("Processing"); + var writer = new ConverterBatchWriter(tempPath, 0) + ) { + // Offset the documents' ordinals toward the upper range, to avoid an ID collisions with the + // main indexes (the maximum permissible for doc ordinal is value is 67_108_863, so this + // leaves us with a lot of headroom still) + writer.setOrdinalOffset(67_000_000); + + for (SerializableCrawlDataStream stream : hb.wrap("Processing", dataSet.getDataStreams())) { + writer.write(domainProcessor.sideloadProcessing(stream, 0)); + } + } + + processHeartbeat.progress(LiveCrawlState.LOADING); + + LoaderInputData lid = new LoaderInputData(tempPath, 1); + + DomainIdRegistry domainIdRegistry = new DbDomainIdRegistry(dataSource); + + keywordLoaderService.loadKeywords(domainIdRegistry, heartbeat, lid); + documentLoaderService.loadDocuments(domainIdRegistry, heartbeat, lid); + + keywordLoaderService.close(); + + } finally { + FileUtils.deleteDirectory(tempPath.toFile()); + } + + // Construct the index + + processHeartbeat.progress(LiveCrawlState.DONE); + } + } + +} diff --git a/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlerModule.java 
b/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlerModule.java new file mode 100644 index 00000000..5dc779b0 --- /dev/null +++ b/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlerModule.java @@ -0,0 +1,49 @@ +package nu.marginalia.livecrawler; + +import com.google.inject.AbstractModule; +import com.google.inject.Inject; +import com.google.inject.Provides; +import com.google.inject.Singleton; +import com.google.inject.name.Names; +import nu.marginalia.IndexLocations; +import nu.marginalia.UserAgent; +import nu.marginalia.WmsaHome; +import nu.marginalia.linkdb.docs.DocumentDbWriter; +import nu.marginalia.process.ProcessConfiguration; +import nu.marginalia.service.ServiceId; +import nu.marginalia.service.module.ServiceConfiguration; +import nu.marginalia.storage.FileStorageService; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.sql.SQLException; +import java.util.UUID; + +import static nu.marginalia.linkdb.LinkdbFileNames.DOCDB_FILE_NAME; + +public class LiveCrawlerModule extends AbstractModule { + + public void configure() { + bind(UserAgent.class).toInstance(WmsaHome.getUserAgent()); + bind(Path.class).annotatedWith(Names.named("local-index-path")).toInstance(Path.of(System.getProperty("local-index-path", "/vol"))); + } + + @Inject + @Provides @Singleton + private DocumentDbWriter createLinkdbWriter(FileStorageService service) throws SQLException, IOException { + // Migrate + Path dbPath = IndexLocations.getLinkdbWritePath(service).resolve(DOCDB_FILE_NAME); + + if (Files.exists(dbPath)) { + Files.delete(dbPath); + } + return new DocumentDbWriter(dbPath); + } + + @Singleton + @Provides + public ServiceConfiguration provideServiceConfiguration(ProcessConfiguration processConfiguration) { + return new ServiceConfiguration(ServiceId.NOT_A_SERVICE, processConfiguration.node(), null, null, -1, UUID.randomUUID()); + } +} diff --git 
a/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/SimpleLinkScraper.java b/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/SimpleLinkScraper.java new file mode 100644 index 00000000..b1ad42e5 --- /dev/null +++ b/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/SimpleLinkScraper.java @@ -0,0 +1,217 @@ +package nu.marginalia.livecrawler; + +import crawlercommons.robots.SimpleRobotRules; +import crawlercommons.robots.SimpleRobotRulesParser; +import nu.marginalia.WmsaHome; +import nu.marginalia.crawl.fetcher.HttpFetcherImpl; +import nu.marginalia.crawl.retreival.CrawlDelayTimer; +import nu.marginalia.db.DbDomainQueries; +import nu.marginalia.db.DomainBlacklist; +import nu.marginalia.link_parser.LinkParser; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.util.SimpleBlockingThreadPool; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.Nullable; +import java.io.IOException; +import java.net.URISyntaxException; +import java.net.http.HttpClient; +import java.net.http.HttpHeaders; +import java.net.http.HttpRequest; +import java.net.http.HttpResponse; +import java.time.Duration; +import java.util.List; +import java.util.Optional; +import java.util.concurrent.ThreadLocalRandom; +import java.util.concurrent.TimeUnit; + +/** A simple link scraper that fetches URLs and stores them in a database, + * with no concept of a crawl frontier, WARC output, or other advanced features + */ +public class SimpleLinkScraper implements AutoCloseable { + private static final Logger logger = LoggerFactory.getLogger(SimpleLinkScraper.class); + + private final SimpleBlockingThreadPool pool = new SimpleBlockingThreadPool("LiveCrawler", 32, 10); + private final LinkParser lp = new LinkParser(); + private final LiveCrawlDataSet dataSet; + private final DbDomainQueries domainQueries; + private final DomainBlacklist domainBlacklist; + private final Duration 
connectTimeout = Duration.ofSeconds(10); + private final Duration readTimeout = Duration.ofSeconds(10); + + public SimpleLinkScraper(LiveCrawlDataSet dataSet, + DbDomainQueries domainQueries, + DomainBlacklist domainBlacklist) { + this.dataSet = dataSet; + this.domainQueries = domainQueries; + this.domainBlacklist = domainBlacklist; + } + + public void scheduleRetrieval(EdgeDomain domain, List urls) { + + var id = domainQueries.tryGetDomainId(domain); + if (id.isEmpty() || domainBlacklist.isBlacklisted(id.getAsInt())) { + return; + } + + pool.submitQuietly(() -> retrieveNow(domain, id.getAsInt(), urls)); + } + + public void retrieveNow(EdgeDomain domain, int domainId, List urls) throws Exception { + try (HttpClient client = HttpClient + .newBuilder() + .connectTimeout(connectTimeout) + .followRedirects(HttpClient.Redirect.NEVER) + .version(HttpClient.Version.HTTP_2) + .build()) { + + EdgeUrl rootUrl = domain.toRootUrlHttps(); + + SimpleRobotRules rules = fetchRobotsRules(rootUrl, client); + + if (rules == null) { // I/O error fetching robots.txt + // If we can't fetch the robots.txt, + for (var url : urls) { + lp.parseLink(rootUrl, url).ifPresent(this::maybeFlagAsBad); + } + return; + } + + CrawlDelayTimer timer = new CrawlDelayTimer(rules.getCrawlDelay()); + + for (var url : urls) { + Optional optParsedUrl = lp.parseLink(rootUrl, url); + if (optParsedUrl.isEmpty()) { + continue; + } + if (dataSet.hasUrl(optParsedUrl.get())) { + continue; + } + + EdgeUrl parsedUrl = optParsedUrl.get(); + if (!rules.isAllowed(url)) { + continue; + } + + switch (fetchUrl(domainId, parsedUrl, timer, client)) { + case FetchResult.Success(int id, EdgeUrl docUrl, String body, String headers) + -> dataSet.saveDocument(id, docUrl, body, headers, ""); + case FetchResult.Error(EdgeUrl docUrl) -> maybeFlagAsBad(docUrl); + } + } + } + } + + private void maybeFlagAsBad(EdgeUrl url) { + // To give bad URLs a chance to be re-fetched, we only flag them as bad + // with a 20% probability. 
This will prevent the same bad URL being + // re-fetched over and over again for several months, but still allow + // us to *mostly* re-fetch it if it was just a transient error. + + // There's of course the chance we immediately flag it as bad on an + // unlucky roll, but you know, that's xcom baby + + if (ThreadLocalRandom.current().nextDouble(0, 1) < 0.2) { + dataSet.flagAsBad(url); + } + } + + @Nullable + private SimpleRobotRules fetchRobotsRules(EdgeUrl rootUrl, HttpClient client) throws IOException, InterruptedException, URISyntaxException { + var robotsRequest = HttpRequest.newBuilder(rootUrl.withPathAndParam("/robots.txt", null).asURI()) + .GET() + .header("User-Agent", WmsaHome.getUserAgent().uaString()) + .timeout(readTimeout); + + // Fetch the robots.txt + + try { + SimpleRobotRulesParser parser = new SimpleRobotRulesParser(); + HttpResponse robotsTxt = client.send(robotsRequest.build(), HttpResponse.BodyHandlers.ofByteArray()); + if (robotsTxt.statusCode() == 200) { + return parser.parseContent(rootUrl.toString(), + robotsTxt.body(), + robotsTxt.headers().firstValue("Content-Type").orElse("text/plain"), + WmsaHome.getUserAgent().uaIdentifier()); + } + else if (robotsTxt.statusCode() == 404) { + return new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_ALL); + } + } + catch (IOException ex) { + logger.error("Error fetching robots.txt for {}: {} {}", rootUrl, ex.getClass().getSimpleName(), ex.getMessage()); + } + return null; + } + + /** Fetch a URL and store it in the database + */ + private FetchResult fetchUrl(int domainId, EdgeUrl parsedUrl, CrawlDelayTimer timer, HttpClient client) throws Exception { + + timer.waitFetchDelay(); + + HttpRequest request = HttpRequest.newBuilder(parsedUrl.asURI()) + .GET() + .header("User-Agent", WmsaHome.getUserAgent().uaString()) + .header("Accept", "text/html") + .timeout(readTimeout) + .build(); + + try { + HttpResponse response = client.send(request, HttpResponse.BodyHandlers.ofString()); + + // Handle rate 
limiting by waiting and retrying once + if (response.statusCode() == 429) { + timer.waitRetryDelay(new HttpFetcherImpl.RateLimitException( + response.headers().firstValue("Retry-After").orElse("5") + )); + response = client.send(request, HttpResponse.BodyHandlers.ofString()); + } + + String contentType = response.headers().firstValue("Content-Type").orElse("").toLowerCase(); + + if (response.statusCode() == 200) { + if (!contentType.toLowerCase().startsWith("text/html")) { + return new FetchResult.Error(parsedUrl); + } + + String body = response.body(); + if (body.length() > 1024 * 1024) { + return new FetchResult.Error(parsedUrl); + } + + return new FetchResult.Success(domainId, parsedUrl, body, headersToString(response.headers())); + } + } + catch (IOException ex) { + // We don't want a full stack trace on every error, as it's quite common and very noisy + logger.error("Error fetching URL {}: {} {}", parsedUrl, ex.getClass().getSimpleName(), ex.getMessage()); + } + + return new FetchResult.Error(parsedUrl); + } + + sealed interface FetchResult { + record Success(int domainId, EdgeUrl url, String body, String headers) implements FetchResult {} + record Error(EdgeUrl url) implements FetchResult {} + } + + private String headersToString(HttpHeaders headers) { + StringBuilder headersStr = new StringBuilder(); + headers.map().forEach((k, v) -> { + headersStr.append(k).append(": ").append(v).append("\n"); + }); + return headersStr.toString(); + } + + @Override + public void close() throws Exception { + pool.shutDown(); + for (int i = 0; i < 4; i++) { + pool.awaitTermination(1, TimeUnit.HOURS); + } + pool.shutDownNow(); + } +} diff --git a/code/processes/live-crawling-process/test/nu/marginalia/livecrawler/LiveCrawlDataSetTest.java b/code/processes/live-crawling-process/test/nu/marginalia/livecrawler/LiveCrawlDataSetTest.java new file mode 100644 index 00000000..ee1124e9 --- /dev/null +++ 
b/code/processes/live-crawling-process/test/nu/marginalia/livecrawler/LiveCrawlDataSetTest.java @@ -0,0 +1,92 @@ +package nu.marginalia.livecrawler; + +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.crawldata.CrawledDocument; +import nu.marginalia.model.crawldata.CrawledDomain; +import nu.marginalia.model.crawldata.SerializableCrawlData; +import org.apache.commons.io.FileUtils; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; + +public class LiveCrawlDataSetTest { + + @Test + public void testGetDataSet() throws Exception { + Path tempDir = Files.createTempDirectory("live-crawl-data-set-test"); + try (LiveCrawlDataSet dataSet = new LiveCrawlDataSet(tempDir)) { + + Assertions.assertFalse(dataSet.hasUrl("https://www.example.com/")); + dataSet.saveDocument( + 1, + new EdgeUrl("https://www.example.com/"), + "test", + "test", + "test" + ); + Assertions.assertTrue(dataSet.hasUrl("https://www.example.com/")); + + var streams = dataSet.getDataStreams(); + Assertions.assertEquals(1, streams.size()); + var stream = streams.iterator().next(); + + List data = new ArrayList<>(); + while (stream.hasNext()) { + data.add(stream.next()); + } + + int dataCount = 0; + int domainCount = 0; + + for (var item : data) { + switch (item) { + case CrawledDomain domain -> { + domainCount++; + Assertions.assertEquals("www.example.com", domain.getDomain()); + } + case CrawledDocument document -> { + dataCount++; + Assertions.assertEquals("https://www.example.com/", document.url); + Assertions.assertEquals("test", document.documentBody); + } + } + } + + Assertions.assertEquals(1, dataCount); + Assertions.assertEquals(1, domainCount); + } + finally { + FileUtils.deleteDirectory(tempDir.toFile()); + } + } + + @Test + public void testHasUrl() throws Exception { + Path tempDir = Files.createTempDirectory("live-crawl-data-set-test"); + try 
(LiveCrawlDataSet dataSet = new LiveCrawlDataSet(tempDir)) { + Assertions.assertFalse(dataSet.hasUrl("https://www.example.com/")); + dataSet.saveDocument( + 1, + new EdgeUrl("https://www.example.com/saved"), + "test", + "test", + "test" + ); + Assertions.assertTrue(dataSet.hasUrl("https://www.example.com/saved")); + + dataSet.flagAsBad(new EdgeUrl("https://www.example.com/bad")); + + Assertions.assertTrue(dataSet.hasUrl("https://www.example.com/bad")); + + Assertions.assertFalse(dataSet.hasUrl("https://www.example.com/notPresent")); + } + finally { + FileUtils.deleteDirectory(tempDir.toFile()); + } + } + +} \ No newline at end of file diff --git a/code/processes/loading-process/build.gradle b/code/processes/loading-process/build.gradle index 0d9d51c1..4a97812e 100644 --- a/code/processes/loading-process/build.gradle +++ b/code/processes/loading-process/build.gradle @@ -20,7 +20,6 @@ tasks.distZip.enabled = false apply from: "$rootProject.projectDir/srcsets.gradle" dependencies { - implementation project(':code:common:process') implementation project(':code:processes:process-mq-api') implementation project(':code:index:api') implementation project(':code:common:model') diff --git a/code/processes/loading-process/java/nu/marginalia/loading/LoaderMain.java b/code/processes/loading-process/java/nu/marginalia/loading/LoaderMain.java index 5b59d972..cc639711 100644 --- a/code/processes/loading-process/java/nu/marginalia/loading/LoaderMain.java +++ b/code/processes/loading-process/java/nu/marginalia/loading/LoaderMain.java @@ -4,8 +4,6 @@ import com.google.gson.Gson; import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; -import nu.marginalia.ProcessConfiguration; -import nu.marginalia.ProcessConfigurationModule; import nu.marginalia.linkdb.docs.DocumentDbWriter; import nu.marginalia.loading.documents.DocumentLoaderService; import nu.marginalia.loading.documents.KeywordLoaderService; @@ -13,26 +11,21 @@ import 
nu.marginalia.loading.domains.DomainIdRegistry; import nu.marginalia.loading.domains.DomainLoaderService; import nu.marginalia.loading.links.DomainLinksLoaderService; import nu.marginalia.mq.MessageQueueFactory; -import nu.marginalia.mq.MqMessage; -import nu.marginalia.mq.MqMessageState; -import nu.marginalia.mq.inbox.MqInboxResponse; -import nu.marginalia.mq.inbox.MqSingleShotInbox; +import nu.marginalia.mqapi.loading.LoadRequest; +import nu.marginalia.process.ProcessConfiguration; +import nu.marginalia.process.ProcessConfigurationModule; +import nu.marginalia.process.ProcessMainClass; import nu.marginalia.process.control.ProcessHeartbeatImpl; -import nu.marginalia.service.ProcessMainClass; import nu.marginalia.service.module.DatabaseModule; import nu.marginalia.storage.FileStorageService; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.nio.file.Path; -import java.sql.SQLException; import java.util.ArrayList; import java.util.List; -import java.util.Optional; -import java.util.UUID; import java.util.concurrent.ForkJoinPool; import java.util.concurrent.Future; -import java.util.concurrent.TimeUnit; import static nu.marginalia.mqapi.ProcessInboxNames.LOADER_INBOX; @@ -40,15 +33,12 @@ public class LoaderMain extends ProcessMainClass { private static final Logger logger = LoggerFactory.getLogger(LoaderMain.class); private final ProcessHeartbeatImpl heartbeat; - private final MessageQueueFactory messageQueueFactory; private final FileStorageService fileStorageService; private final DocumentDbWriter documentDbWriter; private final DomainLoaderService domainService; private final DomainLinksLoaderService linksService; private final KeywordLoaderService keywordLoaderService; private final DocumentLoaderService documentLoaderService; - private final int node; - private final Gson gson; public static void main(String... 
args) { try { @@ -62,7 +52,7 @@ public class LoaderMain extends ProcessMainClass { var instance = injector.getInstance(LoaderMain.class); - var instructions = instance.fetchInstructions(); + var instructions = instance.fetchInstructions(LoadRequest.class); logger.info("Instructions received"); instance.run(instructions); } @@ -83,22 +73,27 @@ public class LoaderMain extends ProcessMainClass { ProcessConfiguration processConfiguration, Gson gson ) { - this.node = processConfiguration.node(); + + super(messageQueueFactory, processConfiguration, gson, LOADER_INBOX); + this.heartbeat = heartbeat; - this.messageQueueFactory = messageQueueFactory; this.fileStorageService = fileStorageService; this.documentDbWriter = documentDbWriter; this.domainService = domainService; this.linksService = linksService; this.keywordLoaderService = keywordLoaderService; this.documentLoaderService = documentLoaderService; - this.gson = gson; heartbeat.start(); } - void run(LoadRequest instructions) throws Throwable { - LoaderInputData inputData = instructions.inputData; + void run(Instructions instructions) throws Throwable { + + List inputSources = new ArrayList<>(); + for (var storageId : instructions.value().inputProcessDataStorageIds) { + inputSources.add(fileStorageService.getStorage(storageId).asPath()); + } + var inputData = new LoaderInputData(inputSources); DomainIdRegistry domainIdRegistry = domainService.getOrCreateDomainIds(heartbeat, inputData); @@ -134,67 +129,5 @@ public class LoaderMain extends ProcessMainClass { System.exit(0); } - private static class LoadRequest { - private final LoaderInputData inputData; - private final MqMessage message; - private final MqSingleShotInbox inbox; - - LoadRequest(LoaderInputData inputData, MqMessage message, MqSingleShotInbox inbox) { - this.inputData = inputData; - this.message = message; - this.inbox = inbox; - } - - public void ok() { - inbox.sendResponse(message, MqInboxResponse.ok()); - } - public void err() { - 
inbox.sendResponse(message, MqInboxResponse.err()); - } - } - - private LoadRequest fetchInstructions() throws Exception { - - var inbox = messageQueueFactory.createSingleShotInbox(LOADER_INBOX, node, UUID.randomUUID()); - - var msgOpt = getMessage(inbox, nu.marginalia.mqapi.loading.LoadRequest.class.getSimpleName()); - if (msgOpt.isEmpty()) - throw new RuntimeException("No instruction received in inbox"); - var msg = msgOpt.get(); - - if (!nu.marginalia.mqapi.loading.LoadRequest.class.getSimpleName().equals(msg.function())) { - throw new RuntimeException("Unexpected message in inbox: " + msg); - } - - try { - var request = gson.fromJson(msg.payload(), nu.marginalia.mqapi.loading.LoadRequest.class); - - List inputSources = new ArrayList<>(); - for (var storageId : request.inputProcessDataStorageIds) { - inputSources.add(fileStorageService.getStorage(storageId).asPath()); - } - - return new LoadRequest(new LoaderInputData(inputSources), msg, inbox); - } - catch (Exception ex) { - inbox.sendResponse(msg, new MqInboxResponse("FAILED", MqMessageState.ERR)); - throw ex; - } - } - - private Optional getMessage(MqSingleShotInbox inbox, String expectedFunction) throws SQLException, InterruptedException { - var opt = inbox.waitForMessage(30, TimeUnit.SECONDS); - if (opt.isPresent()) { - if (!opt.get().function().equals(expectedFunction)) { - throw new RuntimeException("Unexpected function: " + opt.get().function()); - } - return opt; - } - else { - var stolenMessage = inbox.stealMessage(msg -> msg.function().equals(expectedFunction)); - stolenMessage.ifPresent(mqMessage -> logger.info("Stole message {}", mqMessage)); - return stolenMessage; - } - } } diff --git a/code/processes/loading-process/java/nu/marginalia/loading/documents/KeywordLoaderService.java b/code/processes/loading-process/java/nu/marginalia/loading/documents/KeywordLoaderService.java index fadbd64c..4fb1e711 100644 --- 
a/code/processes/loading-process/java/nu/marginalia/loading/documents/KeywordLoaderService.java +++ b/code/processes/loading-process/java/nu/marginalia/loading/documents/KeywordLoaderService.java @@ -27,7 +27,8 @@ public class KeywordLoaderService { public boolean loadKeywords(DomainIdRegistry domainIdRegistry, ProcessHeartbeat heartbeat, - LoaderInputData inputData) throws IOException { + LoaderInputData inputData) throws IOException + { try (var task = heartbeat.createAdHocTaskHeartbeat("KEYWORDS")) { Collection> documentFiles = inputData.listDocumentFiles(); diff --git a/code/processes/loading-process/java/nu/marginalia/loading/domains/CachingDomainIdRegistry.java b/code/processes/loading-process/java/nu/marginalia/loading/domains/CachingDomainIdRegistry.java new file mode 100644 index 00000000..b80f4ce1 --- /dev/null +++ b/code/processes/loading-process/java/nu/marginalia/loading/domains/CachingDomainIdRegistry.java @@ -0,0 +1,27 @@ +package nu.marginalia.loading.domains; + +import java.util.HashMap; +import java.util.Map; + +/** Maps domain names to domain ids */ +public class CachingDomainIdRegistry implements DomainIdRegistry { + private final Map domainIds = new HashMap<>(10_000); + + @Override + public int getDomainId(String domainName) { + Integer id = domainIds.get(domainName.toLowerCase()); + + if (id == null) { + // This is a very severe problem + throw new IllegalStateException("Unknown domain id for domain " + domainName); + } + + return id; + } + + @Override + public void add(String domainName, int id) { + domainIds.put(domainName, id); + } + +} diff --git a/code/processes/loading-process/java/nu/marginalia/loading/domains/DbDomainIdRegistry.java b/code/processes/loading-process/java/nu/marginalia/loading/domains/DbDomainIdRegistry.java new file mode 100644 index 00000000..ce71c305 --- /dev/null +++ b/code/processes/loading-process/java/nu/marginalia/loading/domains/DbDomainIdRegistry.java @@ -0,0 +1,59 @@ +package nu.marginalia.loading.domains; + 
+import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.model.EdgeDomain; + +import java.sql.Statement; + +public class DbDomainIdRegistry implements DomainIdRegistry { + private final HikariDataSource dataSource; + + public DbDomainIdRegistry(HikariDataSource dataSource) { + this.dataSource = dataSource; + } + + @Override + public int getDomainId(String domainName) { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) { + + stmt.setString(1, domainName); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return rsp.getInt(1); + } + } + catch (Exception e) { + throw new RuntimeException("Failed to query domain ID", e); + } + + // Insert the domain if it doesn't exist (unlikely) + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement("INSERT IGNORE INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) VALUES (?, ?, ?)", + Statement.RETURN_GENERATED_KEYS)) { + + var domain = new EdgeDomain(domainName); + + stmt.setString(1, domain.toString()); + stmt.setString(2, domain.getTopDomain()); + stmt.setInt(3, 0); // "up for grabs" node affinity + stmt.executeUpdate(); + + var gk = stmt.getGeneratedKeys(); + if (gk.next()) { + return gk.getInt(1); + } + else { + // recurse in the doubly unlikely event that the domain was inserted by another thread + return getDomainId(domainName); + } + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + @Override + public void add(String domainName, int id) { + throw new UnsupportedOperationException("Not implemented"); + } +} diff --git a/code/processes/loading-process/java/nu/marginalia/loading/domains/DomainIdRegistry.java b/code/processes/loading-process/java/nu/marginalia/loading/domains/DomainIdRegistry.java index bccc3ed3..e26314e6 100644 --- a/code/processes/loading-process/java/nu/marginalia/loading/domains/DomainIdRegistry.java +++ 
b/code/processes/loading-process/java/nu/marginalia/loading/domains/DomainIdRegistry.java @@ -1,25 +1,7 @@ package nu.marginalia.loading.domains; -import java.util.HashMap; -import java.util.Map; - -/** Maps domain names to domain ids */ -public class DomainIdRegistry { - private final Map domainIds = new HashMap<>(10_000); - - public int getDomainId(String domainName) { - Integer id = domainIds.get(domainName.toLowerCase()); - - if (id == null) { - // This is a very severe problem - throw new IllegalStateException("Unknown domain id for domain " + domainName); - } - - return id; - } - - void add(String domainName, int id) { - domainIds.put(domainName, id); - } +public interface DomainIdRegistry { + int getDomainId(String domainName); + void add(String domainName, int id); } diff --git a/code/processes/loading-process/java/nu/marginalia/loading/domains/DomainLoaderService.java b/code/processes/loading-process/java/nu/marginalia/loading/domains/DomainLoaderService.java index 66389062..383fac01 100644 --- a/code/processes/loading-process/java/nu/marginalia/loading/domains/DomainLoaderService.java +++ b/code/processes/loading-process/java/nu/marginalia/loading/domains/DomainLoaderService.java @@ -3,11 +3,11 @@ package nu.marginalia.loading.domains; import com.google.inject.Inject; import com.google.inject.Singleton; import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.ProcessConfiguration; import nu.marginalia.loading.LoaderInputData; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.processed.SlopDomainLinkRecord; import nu.marginalia.model.processed.SlopDomainRecord; +import nu.marginalia.process.ProcessConfiguration; import nu.marginalia.process.control.ProcessHeartbeat; import nu.marginalia.process.control.ProcessHeartbeatImpl; import nu.marginalia.slop.SlopTable; @@ -52,7 +52,7 @@ public class DomainLoaderService { throws IOException, SQLException { Set domainNamesAll = new HashSet<>(100_000); - DomainIdRegistry ret = new 
DomainIdRegistry(); + DomainIdRegistry ret = new CachingDomainIdRegistry(); try (var conn = dataSource.getConnection(); var taskHeartbeat = heartbeat.createProcessTaskHeartbeat(Steps.class, "DOMAIN_IDS")) diff --git a/code/processes/loading-process/test/nu/marginalia/loading/domains/DbDomainIdRegistryTest.java b/code/processes/loading-process/test/nu/marginalia/loading/domains/DbDomainIdRegistryTest.java new file mode 100644 index 00000000..2c976878 --- /dev/null +++ b/code/processes/loading-process/test/nu/marginalia/loading/domains/DbDomainIdRegistryTest.java @@ -0,0 +1,44 @@ +package nu.marginalia.loading.domains; + +import com.zaxxer.hikari.HikariConfig; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.test.TestMigrationLoader; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; +import org.testcontainers.containers.MariaDBContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +@Testcontainers +@Tag("slow") +class DbDomainIdRegistryTest { + + @Container + static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") + .withDatabaseName("WMSA_prod") + .withUsername("wmsa") + .withPassword("wmsa") + .withNetworkAliases("mariadb"); + + static HikariDataSource dataSource; + + @BeforeAll + public static void setup() { + HikariConfig config = new HikariConfig(); + config.setJdbcUrl(mariaDBContainer.getJdbcUrl()); + config.setUsername("wmsa"); + config.setPassword("wmsa"); + + dataSource = new HikariDataSource(config); + + TestMigrationLoader.flywayMigration(dataSource); + } + + @Test + void getDomainId() { + Assertions.assertEquals(1, new DbDomainIdRegistry(dataSource).getDomainId("test.com")); + Assertions.assertEquals(1, new DbDomainIdRegistry(dataSource).getDomainId("test.com")); + } +} \ No newline at end of file diff --git 
a/code/processes/process-mq-api/java/nu/marginalia/mqapi/ProcessInboxNames.java b/code/processes/process-mq-api/java/nu/marginalia/mqapi/ProcessInboxNames.java index b7906c22..e5582aed 100644 --- a/code/processes/process-mq-api/java/nu/marginalia/mqapi/ProcessInboxNames.java +++ b/code/processes/process-mq-api/java/nu/marginalia/mqapi/ProcessInboxNames.java @@ -4,6 +4,9 @@ public class ProcessInboxNames { public static final String CONVERTER_INBOX = "converter"; public static final String LOADER_INBOX = "loader"; public static final String CRAWLER_INBOX = "crawler"; + public static final String LIVE_CRAWLER_INBOX = "live-crawler"; + + public static final String EXPORT_TASK_INBOX = "export-task"; public static final String INDEX_CONSTRUCTOR_INBOX = "index_constructor"; } diff --git a/code/processes/process-mq-api/java/nu/marginalia/mqapi/crawling/LiveCrawlRequest.java b/code/processes/process-mq-api/java/nu/marginalia/mqapi/crawling/LiveCrawlRequest.java new file mode 100644 index 00000000..8aea5ab5 --- /dev/null +++ b/code/processes/process-mq-api/java/nu/marginalia/mqapi/crawling/LiveCrawlRequest.java @@ -0,0 +1,4 @@ +package nu.marginalia.mqapi.crawling; + +public class LiveCrawlRequest { +} diff --git a/code/processes/process-mq-api/java/nu/marginalia/mqapi/tasks/ExportTaskRequest.java b/code/processes/process-mq-api/java/nu/marginalia/mqapi/tasks/ExportTaskRequest.java new file mode 100644 index 00000000..036f3a10 --- /dev/null +++ b/code/processes/process-mq-api/java/nu/marginalia/mqapi/tasks/ExportTaskRequest.java @@ -0,0 +1,57 @@ +package nu.marginalia.mqapi.tasks; + +import nu.marginalia.storage.model.FileStorageId; + +public class ExportTaskRequest { + public enum Task { + ATAGS, + FEEDS, + TERM_FREQ, + SAMPLE_DATA, + ADJACENCIES, + } + + public Task task; + public FileStorageId crawlId; + public FileStorageId destId; + public int size; + public String name; + + public ExportTaskRequest(Task task) { + this.task = task; + } + + public static 
ExportTaskRequest atags(FileStorageId crawlId, FileStorageId destId) { + ExportTaskRequest request = new ExportTaskRequest(Task.ATAGS); + request.crawlId = crawlId; + request.destId = destId; + return request; + } + + public static ExportTaskRequest feeds(FileStorageId crawlId, FileStorageId destId) { + ExportTaskRequest request = new ExportTaskRequest(Task.FEEDS); + request.crawlId = crawlId; + request.destId = destId; + return request; + } + + public static ExportTaskRequest termFreq(FileStorageId crawlId, FileStorageId destId) { + ExportTaskRequest request = new ExportTaskRequest(Task.TERM_FREQ); + request.crawlId = crawlId; + request.destId = destId; + return request; + } + + public static ExportTaskRequest sampleData(FileStorageId crawlId, FileStorageId destId, int size, String name) { + ExportTaskRequest request = new ExportTaskRequest(Task.SAMPLE_DATA); + request.crawlId = crawlId; + request.destId = destId; + request.size = size; + request.name = name; + return request; + } + + public static ExportTaskRequest adjacencies() { + return new ExportTaskRequest(Task.ADJACENCIES); + } +} diff --git a/code/processes/website-adjacencies-calculator/build.gradle b/code/processes/website-adjacencies-calculator/build.gradle deleted file mode 100644 index 37787b6a..00000000 --- a/code/processes/website-adjacencies-calculator/build.gradle +++ /dev/null @@ -1,49 +0,0 @@ -plugins { - id 'java' - - id 'application' - id 'jvm-test-suite' -} -java { - toolchain { - languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) - } -} - -application { - mainClass = 'nu.marginalia.adjacencies.WebsiteAdjacenciesCalculator' - applicationName = 'website-adjacencies-calculator' -} - -tasks.distZip.enabled = false - -apply from: "$rootProject.projectDir/srcsets.gradle" - -dependencies { - implementation project(':code:common:model') - implementation project(':code:common:db') - implementation project(':code:common:process') - implementation project(':code:common:service') - 
implementation project(':code:functions:link-graph:api') - - implementation libs.bundles.slf4j - - implementation libs.guava - implementation dependencies.create(libs.guice.get()) { - exclude group: 'com.google.guava' - } - implementation libs.roaringbitmap - implementation libs.trove - implementation libs.fastutil - implementation libs.bundles.mariadb - - testImplementation libs.bundles.slf4j.test - testImplementation libs.bundles.junit - testImplementation libs.mockito - - testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4') - testImplementation libs.commons.codec - testImplementation 'org.testcontainers:mariadb:1.17.4' - testImplementation 'org.testcontainers:junit-jupiter:1.17.4' - testImplementation project(':code:libraries:test-helpers') -} diff --git a/code/processes/website-adjacencies-calculator/java/nu/marginalia/adjacencies/WebsiteAdjacenciesCalculator.java b/code/processes/website-adjacencies-calculator/java/nu/marginalia/adjacencies/WebsiteAdjacenciesCalculator.java deleted file mode 100644 index 1389cd63..00000000 --- a/code/processes/website-adjacencies-calculator/java/nu/marginalia/adjacencies/WebsiteAdjacenciesCalculator.java +++ /dev/null @@ -1,198 +0,0 @@ -package nu.marginalia.adjacencies; - -import com.google.inject.Guice; -import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.ProcessConfiguration; -import nu.marginalia.api.linkgraph.AggregateLinkGraphClient; -import nu.marginalia.db.DbDomainQueries; -import nu.marginalia.model.EdgeDomain; -import nu.marginalia.process.control.ProcessHeartbeat; -import nu.marginalia.process.control.ProcessHeartbeatImpl; -import nu.marginalia.service.ProcessMainClass; -import nu.marginalia.service.module.DatabaseModule; -import nu.marginalia.service.module.ServiceDiscoveryModule; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.sql.SQLException; -import java.time.Duration; -import java.util.*; -import java.util.concurrent.Executors; -import 
java.util.concurrent.atomic.AtomicInteger; -import java.util.function.Consumer; -import java.util.stream.IntStream; - -import static nu.marginalia.adjacencies.SparseBitVector.andCardinality; -import static nu.marginalia.adjacencies.SparseBitVector.weightedProduct; - -public class WebsiteAdjacenciesCalculator extends ProcessMainClass { - private final HikariDataSource dataSource; - public AdjacenciesData adjacenciesData; - public DomainAliases domainAliases; - private static final Logger logger = LoggerFactory.getLogger(WebsiteAdjacenciesCalculator.class); - - float[] weights; - public WebsiteAdjacenciesCalculator(AggregateLinkGraphClient domainLinksClient, HikariDataSource dataSource) throws SQLException { - this.dataSource = dataSource; - - domainAliases = new DomainAliases(dataSource); - adjacenciesData = new AdjacenciesData(domainLinksClient, domainAliases); - weights = adjacenciesData.getWeights(); - } - - public void tryDomains(String... domainName) { - var dataStoreDao = new DbDomainQueries(dataSource); - - System.out.println(Arrays.toString(domainName)); - - int[] domainIds = Arrays.stream(domainName).map(EdgeDomain::new) - .mapToInt(dataStoreDao::getDomainId) - .map(domainAliases::deAlias) - .toArray(); - - for (int domainId : domainIds) { - findAdjacentDtoS(domainId, similarities -> { - for (var similarity : similarities.similarities()) { - System.out.println(dataStoreDao.getDomain(similarity.domainId).map(Object::toString).orElse("") + " " + prettyPercent(similarity.value)); - } - }); - } - } - - private String prettyPercent(double val) { - return String.format("%2.2f%%", 100. 
* val); - } - - public void loadAll(ProcessHeartbeat processHeartbeat) throws InterruptedException { - AdjacenciesLoader loader = new AdjacenciesLoader(dataSource); - var executor = Executors.newFixedThreadPool(16); - - int total = adjacenciesData.getIdsList().size(); - AtomicInteger progress = new AtomicInteger(0); - IntStream.of(adjacenciesData.getIdsList().toArray()).parallel() - .filter(domainAliases::isNotAliased) - .forEach(id -> { - findAdjacent(id, loader::load); - processHeartbeat.setProgress(progress.incrementAndGet() / (double) total); - }); - - executor.shutdown(); - System.out.println("Waiting for wrap-up"); - loader.stop(); - } - - public void findAdjacent(int domainId, Consumer andThen) { - findAdjacentDtoS(domainId, andThen); - } - - double cosineSimilarity(SparseBitVector a, SparseBitVector b) { - double andCardinality = andCardinality(a, b); - andCardinality /= Math.sqrt(a.getCardinality()); - andCardinality /= Math.sqrt(b.getCardinality()); - return andCardinality; - } - - double expensiveCosineSimilarity(SparseBitVector a, SparseBitVector b) { - return weightedProduct(weights, a, b) / Math.sqrt(a.mulAndSum(weights) * b.mulAndSum(weights)); - } - - public record DomainSimilarities(int domainId, List similarities) {} - - public record DomainSimilarity(int domainId, double value) {} - - private void findAdjacentDtoS(int domainId, Consumer andThen) { - var vector = adjacenciesData.getVector(domainId); - - if (vector == null || !vector.cardinalityExceeds(10)) { - return; - } - - List similarities = new ArrayList<>(1000); - - var items = adjacenciesData.getCandidates(vector); - - - int cardMin = Math.max(2, (int) (0.01 * vector.getCardinality())); - - items.forEach(id -> { - var otherVec = adjacenciesData.getVector(id); - - if (null == otherVec || otherVec == vector) - return true; - - if (otherVec.getCardinality() < cardMin) - return true; - - double similarity = cosineSimilarity(vector, otherVec); - if (similarity > 0.1) { - var recalculated = 
expensiveCosineSimilarity(vector, otherVec); - if (recalculated > 0.1) { - similarities.add(new DomainSimilarity(id, recalculated)); - } - } - - return true; - }); - - if (similarities.size() > 128) { - similarities.sort(Comparator.comparing(DomainSimilarity::value)); - similarities.subList(0, similarities.size() - 128).clear(); - } - - - andThen.accept(new DomainSimilarities(domainId, similarities)); - } - - - public static void main(String[] args) throws SQLException, InterruptedException { - var injector = Guice.createInjector( - new DatabaseModule(false), - new ServiceDiscoveryModule()); - - - var dataSource = injector.getInstance(HikariDataSource.class); - var lc = injector.getInstance(AggregateLinkGraphClient.class); - - if (!lc.waitReady(Duration.ofSeconds(30))) { - throw new IllegalStateException("Failed to connect to domain-links"); - } - - var main = new WebsiteAdjacenciesCalculator(lc, dataSource); - - if (args.length == 1 && "load".equals(args[0])) { - var processHeartbeat = new ProcessHeartbeatImpl( - new ProcessConfiguration("website-adjacencies-calculator", 0, UUID.randomUUID()), - dataSource - ); - - try { - processHeartbeat.start(); - main.loadAll(processHeartbeat); - } - catch (Exception ex) { - logger.error("Failed to load", ex); - } - finally { - processHeartbeat.shutDown(); - } - return; - } - - for (;;) { - String domains = System.console().readLine("> "); - - if (domains.isBlank()) - break; - - var parts = domains.split("\\s+,\\s+"); - try { - main.tryDomains(parts); - } - catch (Exception ex) { - ex.printStackTrace(); - } - } - - } - -} diff --git a/code/processes/website-adjacencies-calculator/readme.md b/code/processes/website-adjacencies-calculator/readme.md deleted file mode 100644 index 6a69a10a..00000000 --- a/code/processes/website-adjacencies-calculator/readme.md +++ /dev/null @@ -1,8 +0,0 @@ -# Website Adjacencies Calculator - -This job updates the website similarity table based on the data in the domain and links-tables in the URL 
database. - -It performs a brute force cosine similarity calculation across the entire link graph. - -These adjacencies power the [explorer service](../../services-application/explorer-service) and -[random websites](../../features-search/random-websites)-functionality. \ No newline at end of file diff --git a/code/services-application/explorer-service/readme.md b/code/services-application/explorer-service/readme.md index 567886d6..4f3204b1 100644 --- a/code/services-application/explorer-service/readme.md +++ b/code/services-application/explorer-service/readme.md @@ -8,4 +8,3 @@ Externally the service is available at [https://explore2.marginalia.nu/](https:/ * [features-search/screenshots](../../features-search/screenshots) * [features-search/random-websites](../../features-search/random-websites) -* [processes/website-adjacencies-calculator](../../processes/website-adjacencies-calculator) diff --git a/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlNodeService.java b/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlNodeService.java index 003e950e..62929c83 100644 --- a/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlNodeService.java +++ b/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlNodeService.java @@ -12,6 +12,7 @@ import nu.marginalia.control.sys.svc.HeartbeatService; import nu.marginalia.executor.client.ExecutorClient; import nu.marginalia.nodecfg.NodeConfigurationService; import nu.marginalia.nodecfg.model.NodeConfiguration; +import nu.marginalia.nodecfg.model.NodeProfile; import nu.marginalia.service.ServiceId; import nu.marginalia.service.ServiceMonitors; import nu.marginalia.storage.FileStorageService; @@ -52,7 +53,8 @@ public class ControlNodeService { HikariDataSource dataSource, ServiceMonitors monitors, RedirectControl redirectControl, - NodeConfigurationService nodeConfigurationService, ControlCrawlDataService crawlDataService) + 
NodeConfigurationService nodeConfigurationService, + ControlCrawlDataService crawlDataService) { this.fileStorageService = fileStorageService; this.rendererFactory = rendererFactory; @@ -269,6 +271,8 @@ public class ControlNodeService { String act = request.queryParams("act"); if ("config".equals(act)) { + var oldConfig = nodeConfigurationService.get(nodeId); + var newConfig = new NodeConfiguration( nodeId, request.queryParams("description"), @@ -276,10 +280,19 @@ public class ControlNodeService { "on".equalsIgnoreCase(request.queryParams("autoClean")), "on".equalsIgnoreCase(request.queryParams("includeInPrecession")), "on".equalsIgnoreCase(request.queryParams("keepWarcs")), + NodeProfile.valueOf(request.queryParams("profile")), "on".equalsIgnoreCase(request.queryParams("disabled")) ); nodeConfigurationService.save(newConfig); + + if (!(Objects.equals(oldConfig.profile(), newConfig.profile()))) { + // Restart the executor service if the profile has changed + executorClient.restartExecutorService(nodeId); + } + else if (newConfig.disabled()) { + executorClient.restartExecutorService(nodeId); + } } else if ("storage".equals(act)) { throw new UnsupportedOperationException(); diff --git a/code/services-core/control-service/resources/templates/control/node/node-config.hdb b/code/services-core/control-service/resources/templates/control/node/node-config.hdb index ac15e1bf..e646f113 100644 --- a/code/services-core/control-service/resources/templates/control/node/node-config.hdb +++ b/code/services-core/control-service/resources/templates/control/node/node-config.hdb @@ -21,6 +21,35 @@ + +
+ + + +
The node profile configures which actors are available. +
    +
  • + Batch Crawl - This node is configured for batch crawling. It will not have the sideload actors available. +
  • +
  • + Sideload - This node is configured for sideloading. It will not have the batch crawl actors available. +
  • +
  &nbsp;&nbsp;• + Real Time - This node is configured for real time processing. + It will not have the batch crawl or sideload actors available, but will have actors for real time (daily) crawling.
  • +
  • + Mixed Use - This node is configured for both batch crawling and sideloading. +
  • +
+
+
+
diff --git a/code/services-core/control-service/resources/templates/control/node/partial-node-nav.hdb b/code/services-core/control-service/resources/templates/control/node/partial-node-nav.hdb index 07c7ced3..14c477a3 100644 --- a/code/services-core/control-service/resources/templates/control/node/partial-node-nav.hdb +++ b/code/services-core/control-service/resources/templates/control/node/partial-node-nav.hdb @@ -1,5 +1,5 @@ -

Index Node {{node.id}}

+

Index Node {{node.id}}: {{node.profile}}

{{#if node.disabled}} This index node is disabled! {{/if}} @@ -10,8 +10,10 @@ + {{#unless node.profile.realtime}} + {{/unless}} + {{#unless node.profile.realtime}} + {{/unless}} + + diff --git a/code/services-core/control-service/resources/templates/control/partials/nodes-table.hdb b/code/services-core/control-service/resources/templates/control/partials/nodes-table.hdb index 8a0d4e68..3472a64b 100644 --- a/code/services-core/control-service/resources/templates/control/partials/nodes-table.hdb +++ b/code/services-core/control-service/resources/templates/control/partials/nodes-table.hdb @@ -2,13 +2,16 @@

Nodes

- + {{#each .}} +
NodeQueriesEnabledIndexExecutorNodeProfileQueriesEnabledIndexExecutor
node-{{id}} + {{configuration.profile}} + {{#if configuration.acceptQueries}} ✓ diff --git a/code/services-core/executor-service/build.gradle b/code/services-core/executor-service/build.gradle index 2e7934bc..dc18f2e1 100644 --- a/code/services-core/executor-service/build.gradle +++ b/code/services-core/executor-service/build.gradle @@ -25,7 +25,6 @@ dependencies { // These look weird but they're needed to be able to spawn the processes // from the executor service - implementation project(':code:processes:website-adjacencies-calculator') implementation project(':code:processes:crawling-process') implementation project(':code:processes:loading-process') implementation project(':code:processes:converting-process') @@ -33,7 +32,6 @@ dependencies { implementation project(':code:common:config') implementation project(':code:common:model') - implementation project(':code:common:process') implementation project(':code:common:db') implementation project(':code:common:linkdb') @@ -48,7 +46,6 @@ dependencies { implementation project(':code:processes:crawling-process:model') implementation project(':code:processes:crawling-process:model') implementation project(':code:processes:crawling-process:ft-link-parser') - implementation project(':code:execution:data-extractors') implementation project(':code:index:index-journal') implementation project(':code:index:api') implementation project(':code:processes:process-mq-api') diff --git a/code/services-core/index-service/build.gradle b/code/services-core/index-service/build.gradle index df3773f9..af3a0fdd 100644 --- a/code/services-core/index-service/build.gradle +++ b/code/services-core/index-service/build.gradle @@ -37,7 +37,6 @@ dependencies { implementation project(':code:index:api') testImplementation project(path: ':code:services-core:control-service') - testImplementation project(':code:common:process') implementation libs.bundles.slf4j diff --git a/code/tools/experiment-runner/build.gradle 
b/code/tools/experiment-runner/build.gradle index d011a973..0e26439f 100644 --- a/code/tools/experiment-runner/build.gradle +++ b/code/tools/experiment-runner/build.gradle @@ -27,7 +27,6 @@ dependencies { implementation project(':code:common:db') implementation project(':code:common:model') implementation project(':code:common:config') - implementation project(':code:common:process') implementation project(':code:common:service') implementation project(':code:libraries:language-processing') implementation project(':code:libraries:term-frequency-dict') diff --git a/code/tools/integration-test/build.gradle b/code/tools/integration-test/build.gradle index 81e3cde9..0e5d3cf0 100644 --- a/code/tools/integration-test/build.gradle +++ b/code/tools/integration-test/build.gradle @@ -34,7 +34,6 @@ dependencies { implementation project(':code:common:db') implementation project(':code:common:config') implementation project(':code:common:linkdb') - implementation project(':code:common:process') implementation project(':code:common:service') implementation project(':code:common:model') diff --git a/code/tools/integration-test/test/nu/marginalia/IntegrationTest.java b/code/tools/integration-test/test/nu/marginalia/IntegrationTest.java index a29924a2..498daa79 100644 --- a/code/tools/integration-test/test/nu/marginalia/IntegrationTest.java +++ b/code/tools/integration-test/test/nu/marginalia/IntegrationTest.java @@ -31,6 +31,7 @@ import nu.marginalia.loading.LoaderIndexJournalWriter; import nu.marginalia.loading.LoaderInputData; import nu.marginalia.loading.documents.DocumentLoaderService; import nu.marginalia.loading.documents.KeywordLoaderService; +import nu.marginalia.loading.domains.CachingDomainIdRegistry; import nu.marginalia.loading.domains.DomainIdRegistry; import nu.marginalia.loading.links.DomainLinksLoaderService; import nu.marginalia.model.EdgeDomain; @@ -166,7 +167,7 @@ public class IntegrationTest { LoaderInputData inputData = new 
LoaderInputData(List.of(processedDataDir)); - DomainIdRegistry domainIdRegistry = Mockito.mock(DomainIdRegistry.class); + DomainIdRegistry domainIdRegistry = Mockito.mock(CachingDomainIdRegistry.class); when(domainIdRegistry.getDomainId(any())).thenReturn(1); linksService.loadLinks(domainIdRegistry, new FakeProcessHeartbeat(), inputData); diff --git a/code/tools/integration-test/test/nu/marginalia/test/IntegrationTestModule.java b/code/tools/integration-test/test/nu/marginalia/test/IntegrationTestModule.java index 83f79fbf..2869a229 100644 --- a/code/tools/integration-test/test/nu/marginalia/test/IntegrationTestModule.java +++ b/code/tools/integration-test/test/nu/marginalia/test/IntegrationTestModule.java @@ -8,7 +8,6 @@ import com.google.inject.name.Names; import gnu.trove.list.array.TIntArrayList; import nu.marginalia.IndexLocations; import nu.marginalia.LanguageModels; -import nu.marginalia.ProcessConfiguration; import nu.marginalia.WmsaHome; import nu.marginalia.db.DomainTypes; import nu.marginalia.index.domainrankings.DomainRankings; @@ -18,6 +17,7 @@ import nu.marginalia.index.searchset.SearchSetsService; import nu.marginalia.linkdb.docs.DocumentDbReader; import nu.marginalia.linkdb.docs.DocumentDbWriter; import nu.marginalia.linkgraph.io.DomainLinksWriter; +import nu.marginalia.process.ProcessConfiguration; import nu.marginalia.process.control.FakeProcessHeartbeat; import nu.marginalia.process.control.ProcessHeartbeat; import nu.marginalia.service.ServiceId; diff --git a/settings.gradle b/settings.gradle index 1df48b72..1ed535be 100644 --- a/settings.gradle +++ b/settings.gradle @@ -59,11 +59,11 @@ include 'code:features-search:screenshots' include 'code:features-search:random-websites' include 'code:processes:converting-process:ft-anchor-keywords' -include 'code:execution:data-extractors' include 'code:processes:crawling-process:ft-crawl-blocklist' include 'code:processes:crawling-process:ft-link-parser' include 
'code:processes:crawling-process:ft-content-type' +include 'code:processes:live-crawling-process' include 'code:processes:process-mq-api' @@ -73,7 +73,6 @@ include 'code:common:service' include 'code:common:config' include 'code:common:model' include 'code:common:renderer' -include 'code:common:process' include 'code:processes:converting-process' include 'code:processes:converting-process:model' @@ -85,7 +84,8 @@ include 'code:processes:crawling-process:model' include 'code:processes:loading-process' include 'code:processes:index-constructor-process' include 'code:processes:test-data' -include 'code:processes:website-adjacencies-calculator' + +include 'code:processes:export-task-process' include 'code:tools:experiment-runner' include 'code:tools:screenshot-capture-tool'