Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-23 21:18:58 +00:00

Merge pull request #125 from MarginaliaSearch/live-search

Add near real-time crawling from RSS feeds to supplement the slower batch-based crawls.

Commit df298df852
@@ -3,6 +3,7 @@ package nu.marginalia.nodecfg;
 import com.google.inject.Inject;
 import com.zaxxer.hikari.HikariDataSource;
 import nu.marginalia.nodecfg.model.NodeConfiguration;
+import nu.marginalia.nodecfg.model.NodeProfile;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -20,10 +21,10 @@ public class NodeConfigurationService {
         this.dataSource = dataSource;
     }
 
-    public NodeConfiguration create(int id, String description, boolean acceptQueries, boolean keepWarcs) throws SQLException {
+    public NodeConfiguration create(int id, String description, boolean acceptQueries, boolean keepWarcs, NodeProfile nodeProfile) throws SQLException {
         try (var conn = dataSource.getConnection();
              var is = conn.prepareStatement("""
-                     INSERT IGNORE INTO NODE_CONFIGURATION(ID, DESCRIPTION, ACCEPT_QUERIES, KEEP_WARCS) VALUES(?, ?, ?, ?)
+                     INSERT IGNORE INTO NODE_CONFIGURATION(ID, DESCRIPTION, ACCEPT_QUERIES, KEEP_WARCS, NODE_PROFILE) VALUES(?, ?, ?, ?, ?)
                      """)
        )
        {
@@ -31,6 +32,7 @@ public class NodeConfigurationService {
             is.setString(2, description);
             is.setBoolean(3, acceptQueries);
             is.setBoolean(4, keepWarcs);
+            is.setString(5, nodeProfile.name());
 
             if (is.executeUpdate() <= 0) {
                 throw new IllegalStateException("Failed to insert configuration");
@@ -43,7 +45,7 @@ public class NodeConfigurationService {
     public List<NodeConfiguration> getAll() {
         try (var conn = dataSource.getConnection();
             var qs = conn.prepareStatement("""
-                    SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, KEEP_WARCS, DISABLED
+                    SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, KEEP_WARCS, NODE_PROFILE, DISABLED
                     FROM NODE_CONFIGURATION
                     """)) {
             var rs = qs.executeQuery();
@@ -58,6 +60,7 @@ public class NodeConfigurationService {
                         rs.getBoolean("AUTO_CLEAN"),
                         rs.getBoolean("PRECESSION"),
                         rs.getBoolean("KEEP_WARCS"),
+                        NodeProfile.valueOf(rs.getString("NODE_PROFILE")),
                         rs.getBoolean("DISABLED")
                 ));
             }
@@ -72,7 +75,7 @@ public class NodeConfigurationService {
     public NodeConfiguration get(int nodeId) throws SQLException {
         try (var conn = dataSource.getConnection();
             var qs = conn.prepareStatement("""
-                    SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, KEEP_WARCS, DISABLED
+                    SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, KEEP_WARCS, NODE_PROFILE, DISABLED
                    FROM NODE_CONFIGURATION
                    WHERE ID=?
                    """)) {
@@ -86,6 +89,7 @@ public class NodeConfigurationService {
                     rs.getBoolean("AUTO_CLEAN"),
                     rs.getBoolean("PRECESSION"),
                     rs.getBoolean("KEEP_WARCS"),
+                    NodeProfile.valueOf(rs.getString("NODE_PROFILE")),
                     rs.getBoolean("DISABLED")
             );
         }
@@ -98,7 +102,7 @@ public class NodeConfigurationService {
         try (var conn = dataSource.getConnection();
             var us = conn.prepareStatement("""
                    UPDATE NODE_CONFIGURATION
-                   SET DESCRIPTION=?, ACCEPT_QUERIES=?, AUTO_CLEAN=?, PRECESSION=?, KEEP_WARCS=?, DISABLED=?
+                   SET DESCRIPTION=?, ACCEPT_QUERIES=?, AUTO_CLEAN=?, PRECESSION=?, KEEP_WARCS=?, DISABLED=?, NODE_PROFILE=?
                    WHERE ID=?
                    """))
        {
@@ -108,7 +112,8 @@ public class NodeConfigurationService {
             us.setBoolean(4, config.includeInPrecession());
             us.setBoolean(5, config.keepWarcs());
             us.setBoolean(6, config.disabled());
-            us.setInt(7, config.node());
+            us.setString(7, config.profile().name());
+            us.setInt(8, config.node());
 
             if (us.executeUpdate() <= 0)
                 throw new IllegalStateException("Failed to update configuration");
@@ -6,6 +6,7 @@ public record NodeConfiguration(int node,
                                 boolean autoClean,
                                 boolean includeInPrecession,
                                 boolean keepWarcs,
+                                NodeProfile profile,
                                 boolean disabled
 )
 {
@@ -0,0 +1,28 @@
+package nu.marginalia.nodecfg.model;
+
+public enum NodeProfile {
+    BATCH_CRAWL,
+    REALTIME,
+    MIXED,
+    SIDELOAD;
+
+    public boolean isBatchCrawl() {
+        return this == BATCH_CRAWL;
+    }
+    public boolean isRealtime() {
+        return this == REALTIME;
+    }
+    public boolean isMixed() {
+        return this == MIXED;
+    }
+    public boolean isSideload() {
+        return this == SIDELOAD;
+    }
+
+    public boolean permitBatchCrawl() {
+        return isBatchCrawl() || isMixed();
+    }
+    public boolean permitSideload() {
+        return isMixed() || isSideload();
+    }
+}
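For orientation, a minimal sketch (not part of the patch) of what the permit-helpers evaluate to for each profile:

    import nu.marginalia.nodecfg.model.NodeProfile;

    class NodeProfileDemo {
        public static void main(String[] args) {
            // BATCH_CRAWL: batchCrawl=true  sideload=false
            // REALTIME:    batchCrawl=false sideload=false
            // MIXED:       batchCrawl=true  sideload=true
            // SIDELOAD:    batchCrawl=false sideload=true
            for (NodeProfile profile : NodeProfile.values()) {
                System.out.printf("%s: batchCrawl=%s sideload=%s%n",
                        profile, profile.permitBatchCrawl(), profile.permitSideload());
            }
        }
    }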
@@ -2,6 +2,7 @@ package nu.marginalia.nodecfg;
 
 import com.zaxxer.hikari.HikariConfig;
 import com.zaxxer.hikari.HikariDataSource;
+import nu.marginalia.nodecfg.model.NodeProfile;
 import nu.marginalia.test.TestMigrationLoader;
 import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Tag;
@@ -46,8 +47,8 @@ public class NodeConfigurationServiceTest {
 
     @Test
     public void test() throws SQLException {
-        var a = nodeConfigurationService.create(1, "Test", false, false);
-        var b = nodeConfigurationService.create(2, "Foo", true, false);
+        var a = nodeConfigurationService.create(1, "Test", false, false, NodeProfile.MIXED);
+        var b = nodeConfigurationService.create(2, "Foo", true, false, NodeProfile.MIXED);
 
         assertEquals(1, a.node());
         assertEquals("Test", a.description());
@@ -0,0 +1 @@
+ALTER TABLE WMSA_prod.NODE_CONFIGURATION ADD COLUMN NODE_PROFILE VARCHAR(255) DEFAULT 'MIXED';
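Together with the DEFAULT 'MIXED' above, rows created before this migration keep working unchanged. A round-trip sketch of the new column, assuming NodeConfigurationService is constructed directly from the HikariDataSource as its @Inject constructor suggests:

    import com.zaxxer.hikari.HikariDataSource;
    import nu.marginalia.nodecfg.NodeConfigurationService;
    import nu.marginalia.nodecfg.model.NodeProfile;

    class ProfileRoundTripSketch {
        static void demo(HikariDataSource dataSource) throws Exception {
            var service = new NodeConfigurationService(dataSource);

            // New nodes state their profile explicitly ...
            service.create(1, "Realtime node", true, false, NodeProfile.REALTIME);

            // ... and it survives the INSERT/SELECT round trip as a VARCHAR column.
            assert service.get(1).profile() == NodeProfile.REALTIME;
        }
    }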
@@ -1,34 +0,0 @@
-plugins {
-    id 'java'
-
-    id 'jvm-test-suite'
-}
-
-java {
-    toolchain {
-        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
-    }
-}
-
-apply from: "$rootProject.projectDir/srcsets.gradle"
-
-dependencies {
-    implementation libs.notnull
-
-    implementation libs.bundles.slf4j
-    testImplementation libs.bundles.slf4j.test
-
-    implementation libs.guava
-    implementation libs.guava
-    implementation dependencies.create(libs.guice.get()) {
-        exclude group: 'com.google.guava'
-    }
-    implementation libs.bundles.mariadb
-    implementation libs.commons.lang3
-
-    implementation libs.snakeyaml
-
-    testImplementation libs.bundles.slf4j.test
-    testImplementation libs.bundles.junit
-    testImplementation libs.mockito
-}
@@ -1,7 +0,0 @@
-package nu.marginalia.process.control;
-
-public interface ProcessAdHocTaskHeartbeat extends AutoCloseable {
-    void progress(String step, int progress, int total);
-
-    void close();
-}
@@ -1,4 +0,0 @@
-# Process
-
-Basic functionality for a Process. Processes must include this dependency to ensure
-their loggers are configured properly!
@@ -1,9 +0,0 @@
-log4j2.isThreadContextMapInheritable=true
-status = info
-appender.console.type = Console
-appender.console.name = LogToConsole
-appender.console.layout.type = PatternLayout
-appender.console.layout.pattern = %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %c{1}- %msg{nolookups}%n
-appender.console.filter.http.type = MarkerFilter
-rootLogger.level = info
-rootLogger.appenderRef.console.ref = LogToConsole
@@ -1,4 +1,4 @@
-package nu.marginalia;
+package nu.marginalia.process;
 
 import java.util.UUID;
 
@@ -1,4 +1,4 @@
-package nu.marginalia;
+package nu.marginalia.process;
 
 import com.google.inject.AbstractModule;
 import com.google.inject.name.Names;
@@ -0,0 +1,102 @@
+package nu.marginalia.process;
+
+import com.google.gson.Gson;
+import nu.marginalia.mq.MessageQueueFactory;
+import nu.marginalia.mq.MqMessage;
+import nu.marginalia.mq.inbox.MqInboxResponse;
+import nu.marginalia.mq.inbox.MqSingleShotInbox;
+import nu.marginalia.service.ConfigLoader;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.sql.SQLException;
+import java.util.Optional;
+import java.util.UUID;
+import java.util.concurrent.TimeUnit;
+
+public abstract class ProcessMainClass {
+    private static final Logger logger = LoggerFactory.getLogger(ProcessMainClass.class);
+
+    private final MessageQueueFactory messageQueueFactory;
+    private final int node;
+    private final String inboxName;
+
+    static {
+        // Load global config ASAP
+        ConfigLoader.loadConfig(
+                ConfigLoader.getConfigPath("system")
+        );
+    }
+
+    private final Gson gson;
+
+    public ProcessMainClass(MessageQueueFactory messageQueueFactory,
+                            ProcessConfiguration config,
+                            Gson gson,
+                            String inboxName
+                            ) {
+        this.gson = gson;
+        new org.mariadb.jdbc.Driver();
+        this.messageQueueFactory = messageQueueFactory;
+        this.node = config.node();
+        this.inboxName = inboxName;
+    }
+
+    protected <T> Instructions<T> fetchInstructions(Class<T> requestType) throws Exception {
+
+        var inbox = messageQueueFactory.createSingleShotInbox(inboxName, node, UUID.randomUUID());
+
+        logger.info("Waiting for instructions");
+
+        var msgOpt = getMessage(inbox, requestType.getSimpleName());
+        var msg = msgOpt.orElseThrow(() -> new RuntimeException("No message received"));
+
+        // for live crawl, request is empty for now
+        T request = gson.fromJson(msg.payload(), requestType);
+
+        return new Instructions<>(msg, inbox, request);
+    }
+
+    private Optional<MqMessage> getMessage(MqSingleShotInbox inbox, String expectedFunction) throws InterruptedException, SQLException {
+        var opt = inbox.waitForMessage(30, TimeUnit.SECONDS);
+        if (opt.isPresent()) {
+            if (!opt.get().function().equals(expectedFunction)) {
+                throw new RuntimeException("Unexpected function: " + opt.get().function());
+            }
+            return opt;
+        }
+        else {
+            var stolenMessage = inbox.stealMessage(msg -> msg.function().equals(expectedFunction));
+            stolenMessage.ifPresent(mqMessage -> logger.info("Stole message {}", mqMessage));
+            return stolenMessage;
+        }
+    }
+
+    protected static class Instructions<T> {
+        private final MqMessage message;
+        private final MqSingleShotInbox inbox;
+        private final T value;
+        Instructions(MqMessage message, MqSingleShotInbox inbox, T value)
+        {
+            this.message = message;
+            this.inbox = inbox;
+            this.value = value;
+        }
+
+        public T value() {
+            return value;
+        }
+
+        public void ok() {
+            inbox.sendResponse(message, MqInboxResponse.ok());
+        }
+        public void err() {
+            inbox.sendResponse(message, MqInboxResponse.err());
+        }
+
+    }
+
+}
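A sketch of how a process would use the new base class; ExampleProcessMain and its request record are illustrative, not from this patch:

    import com.google.gson.Gson;
    import nu.marginalia.mq.MessageQueueFactory;
    import nu.marginalia.process.ProcessConfiguration;
    import nu.marginalia.process.ProcessMainClass;

    class ExampleProcessMain extends ProcessMainClass {
        record ExampleRequest(String payload) {}

        ExampleProcessMain(MessageQueueFactory mqFactory, ProcessConfiguration config, Gson gson) {
            super(mqFactory, config, gson, "example-inbox");
        }

        void run() throws Exception {
            // Blocks until a message whose function name matches "ExampleRequest" arrives
            Instructions<ExampleRequest> instructions = fetchInstructions(ExampleRequest.class);
            try {
                doWork(instructions.value());
                instructions.ok();   // acknowledge success over the message queue
            }
            catch (Exception ex) {
                instructions.err();  // flag failure so the actor side can react
            }
        }

        private void doWork(ExampleRequest request) { /* ... */ }
    }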
@@ -3,6 +3,8 @@ package nu.marginalia.process.control;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.util.Collection;
+
 /** Dummy implementation of ProcessHeartbeat that does nothing */
 public class FakeProcessHeartbeat implements ProcessHeartbeat {
     private static final Logger logger = LoggerFactory.getLogger(FakeProcessHeartbeat.class);
@@ -30,6 +32,11 @@ public class FakeProcessHeartbeat implements ProcessHeartbeat {
             logger.info("Progress: {}, {}/{}", step, progress, total);
         }
 
+        @Override
+        public <T> Iterable<T> wrap(String step, Collection<T> collection) {
+            return collection;
+        }
+
         @Override
         public void close() {}
     };
@@ -0,0 +1,12 @@
+package nu.marginalia.process.control;
+
+import java.util.Collection;
+
+public interface ProcessAdHocTaskHeartbeat extends AutoCloseable {
+    void progress(String step, int progress, int total);
+
+    /** Wrap a collection to provide heartbeat progress updates as it's iterated through */
+    <T> Iterable<T> wrap(String step, Collection<T> collection);
+
+    void close();
+}
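A usage sketch for the new wrap method; the surrounding class is illustrative, and the heartbeat instance, which a real caller would get from the process heartbeat machinery, is shown as a parameter for brevity:

    import java.util.List;
    import nu.marginalia.process.control.ProcessAdHocTaskHeartbeat;

    class HeartbeatWrapSketch {
        static void process(ProcessAdHocTaskHeartbeat heartbeat, List<String> domains) {
            // Each iteration may emit a throttled progress update behind the scenes
            for (String domain : heartbeat.wrap("processing", domains)) {
                handle(domain);
            }
        }

        static void handle(String domain) { /* ... */ }
    }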
@@ -2,11 +2,13 @@ package nu.marginalia.process.control;
 
 
 import com.zaxxer.hikari.HikariDataSource;
-import nu.marginalia.ProcessConfiguration;
+import nu.marginalia.process.ProcessConfiguration;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.sql.SQLException;
+import java.util.Collection;
+import java.util.Iterator;
 import java.util.UUID;
 import java.util.concurrent.TimeUnit;
 
@@ -69,6 +71,35 @@ public class ProcessAdHocTaskHeartbeatImpl implements AutoCloseable, ProcessAdHocTaskHeartbeat {
         logger.info("ProcessTask {} progress: {}%", taskBase, progress);
     }
 
+    /** Wrap a collection to provide heartbeat progress updates as it's iterated through */
+    @Override
+    public <T> Iterable<T> wrap(String step, Collection<T> collection) {
+        return () -> new Iterator<>() {
+            private final Iterator<T> base = collection.iterator();
+            private final int size = collection.size();
+            private final int updateInterval = Math.max(1, size / 100); // update every 1% of the collection, or at least once
+            private int pos = 0;
+
+            @Override
+            public boolean hasNext() {
+                boolean ret = base.hasNext();
+                if (!ret) {
+                    progress(step, size, size);
+                }
+                return ret;
+            }
+
+            @Override
+            public T next() {
+                // update every 1% of the collection, to avoid hammering the database with updates
+                if (pos++ % updateInterval == 0) {
+                    progress(step, pos, size);
+                }
+                return base.next();
+            }
+        };
+    }
+
     public void shutDown() {
         if (!running)
             return;
@@ -185,6 +216,5 @@ public class ProcessAdHocTaskHeartbeatImpl implements AutoCloseable, ProcessAdHocTaskHeartbeat {
     public void close() {
         shutDown();
     }
-
 }
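To make the 1% throttle concrete, a standalone arithmetic check (illustrative, not part of the patch):

    class UpdateIntervalDemo {
        public static void main(String[] args) {
            int size = 50_000;
            int updateInterval = Math.max(1, size / 100);
            // progress() fires on elements 0, 500, 1000, ... plus a final
            // 100% update from hasNext(): roughly 101 writes, not 50,000.
            System.out.println(updateInterval); // 500
        }
    }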
@@ -4,17 +4,18 @@ package nu.marginalia.process.control;
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import com.zaxxer.hikari.HikariDataSource;
-import nu.marginalia.ProcessConfiguration;
+import nu.marginalia.process.ProcessConfiguration;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.io.Closeable;
 import java.sql.SQLException;
 import java.util.concurrent.TimeUnit;
 
 /** This service sends a heartbeat to the database every 5 seconds.
 */
 @Singleton
-public class ProcessHeartbeatImpl implements ProcessHeartbeat {
+public class ProcessHeartbeatImpl implements ProcessHeartbeat, Closeable {
     private final Logger logger = LoggerFactory.getLogger(ProcessHeartbeatImpl.class);
     private final String processName;
     private final String processBase;
@@ -169,5 +170,9 @@ public class ProcessHeartbeatImpl implements ProcessHeartbeat {
         }
     }
 
+    public void close() {
+        shutDown();
+    }
+
 }
@@ -2,7 +2,7 @@ package nu.marginalia.process.control;
 
 
 import com.zaxxer.hikari.HikariDataSource;
-import nu.marginalia.ProcessConfiguration;
+import nu.marginalia.process.ProcessConfiguration;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -9,11 +9,11 @@ import java.util.Properties;
 
 public class ConfigLoader {
 
-    static Path getConfigPath(String configName) {
+    public static Path getConfigPath(String configName) {
         return WmsaHome.getHomePath().resolve("conf/properties/" + configName + ".properties");
     }
 
-    static void loadConfig(Path configPath) {
+    public static void loadConfig(Path configPath) {
         if (!Files.exists(configPath)) {
             System.err.println("No config file found at " + configPath);
             return;
@@ -1,20 +0,0 @@
-package nu.marginalia.service;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public abstract class ProcessMainClass {
-    private static final Logger logger = LoggerFactory.getLogger(ProcessMainClass.class);
-
-    static {
-        // Load global config ASAP
-        ConfigLoader.loadConfig(
-                ConfigLoader.getConfigPath("system")
-        );
-    }
-
-    public ProcessMainClass() {
-        new org.mariadb.jdbc.Driver();
-    }
-
-}
@@ -13,7 +13,10 @@ public enum ServiceId {
 
     Dating("dating-service"),
     Status("setatus-service"),
-    Explorer("explorer-service");
+    Explorer("explorer-service"),
+
+    NOT_A_SERVICE("NOT_A_SERVICE")
+    ;
 
     public final String serviceName;
 
@@ -4,6 +4,7 @@ import com.google.inject.Inject;
 import com.google.inject.name.Named;
 import nu.marginalia.mq.persistence.MqPersistence;
 import nu.marginalia.nodecfg.NodeConfigurationService;
+import nu.marginalia.nodecfg.model.NodeProfile;
 import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.storage.model.FileStorageBaseType;
 import org.slf4j.Logger;
@@ -56,7 +57,9 @@ public class NodeStatusWatcher {
 
     private void setupNode() {
         try {
-            configurationService.create(nodeId, "Node " + nodeId, true, false);
+            NodeProfile profile = NodeProfile.MIXED;
+
+            configurationService.create(nodeId, "Node " + nodeId, true, false, profile);
+
             fileStorageService.createStorageBase("Index Data", Path.of("/idx"), nodeId, FileStorageBaseType.CURRENT);
             fileStorageService.createStorageBase("Index Backups", Path.of("/backup"), nodeId, FileStorageBaseType.BACKUP);
@@ -182,4 +182,10 @@ public class ExecutorClient {
         }
     }
 
+    public void restartExecutorService(int node) {
+        channelPool.call(ExecutorApiBlockingStub::restartExecutorService)
+                .forNode(node)
+                .run(Empty.getDefaultInstance());
+    }
+
 }
@@ -17,6 +17,8 @@ service ExecutorApi {
   rpc downloadSampleData(RpcDownloadSampleData) returns (Empty) {}
   rpc calculateAdjacencies(Empty) returns (Empty) {}
   rpc restoreBackup(RpcFileStorageId) returns (Empty) {}
+
+  rpc restartExecutorService(Empty) returns (Empty) {}
 }
 
 service ExecutorCrawlApi {
@@ -15,15 +15,15 @@ dependencies {
     // These look weird but they're needed to be able to spawn the processes
     // from the executor service
 
-    implementation project(':code:processes:website-adjacencies-calculator')
+    implementation project(':code:processes:export-task-process')
     implementation project(':code:processes:crawling-process')
+    implementation project(':code:processes:live-crawling-process')
     implementation project(':code:processes:loading-process')
     implementation project(':code:processes:converting-process')
     implementation project(':code:processes:index-constructor-process')
 
     implementation project(':code:common:config')
     implementation project(':code:common:model')
-    implementation project(':code:common:process')
     implementation project(':code:common:db')
     implementation project(':code:common:linkdb')
 
@@ -42,7 +42,6 @@ dependencies {
     implementation project(':code:processes:crawling-process:model')
     implementation project(':code:processes:crawling-process:model')
     implementation project(':code:processes:crawling-process:ft-link-parser')
-    implementation project(':code:execution:data-extractors')
     implementation project(':code:index:index-journal')
     implementation project(':code:index:api')
     implementation project(':code:processes:process-mq-api')
@@ -1,7 +0,0 @@
-Contains converter-*like* extraction jobs that operate on crawled data to produce export files.
-
-## Important classes
-
-* [AtagExporter](java/nu/marginalia/extractor/AtagExporter.java) - extracts anchor texts from the crawled data.
-* [FeedExporter](java/nu/marginalia/extractor/FeedExporter.java) - tries to find RSS/Atom feeds within the crawled data.
-* [TermFrequencyExporter](java/nu/marginalia/extractor/TermFrequencyExporter.java) - exports the 'TF' part of TF-IDF.
@@ -1,29 +1,38 @@
 package nu.marginalia.actor;
 
+import nu.marginalia.nodecfg.model.NodeProfile;
+
+import java.util.Set;
+
 public enum ExecutorActor {
-    CRAWL,
-    RECRAWL,
-    RECRAWL_SINGLE_DOMAIN,
-    CONVERT_AND_LOAD,
-    PROC_CONVERTER_SPAWNER,
-    PROC_LOADER_SPAWNER,
-    PROC_CRAWLER_SPAWNER,
-    MONITOR_PROCESS_LIVENESS,
-    MONITOR_FILE_STORAGE,
-    ADJACENCY_CALCULATION,
-    CRAWL_JOB_EXTRACTOR,
-    EXPORT_DATA,
-    EXPORT_SEGMENTATION_MODEL,
-    EXPORT_ATAGS,
-    EXPORT_TERM_FREQUENCIES,
-    EXPORT_FEEDS,
-    PROC_INDEX_CONSTRUCTOR_SPAWNER,
-    CONVERT,
-    RESTORE_BACKUP,
-    EXPORT_SAMPLE_DATA,
-    DOWNLOAD_SAMPLE,
-    SCRAPE_FEEDS,
-    UPDATE_RSS;
+    CRAWL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
+    RECRAWL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
+    RECRAWL_SINGLE_DOMAIN(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
+    PROC_CONVERTER_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
+    PROC_CRAWLER_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
+    PROC_EXPORT_TASKS_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
+    ADJACENCY_CALCULATION(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
+    EXPORT_DATA(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
+    EXPORT_SEGMENTATION_MODEL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
+    EXPORT_ATAGS(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
+    EXPORT_TERM_FREQUENCIES(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
+    EXPORT_FEEDS(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
+    EXPORT_SAMPLE_DATA(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
+    DOWNLOAD_SAMPLE(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
+
+    PROC_LOADER_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.SIDELOAD),
+    RESTORE_BACKUP(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.SIDELOAD),
+    CONVERT(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.SIDELOAD),
+
+    CONVERT_AND_LOAD(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.REALTIME, NodeProfile.SIDELOAD),
+    MONITOR_PROCESS_LIVENESS(NodeProfile.BATCH_CRAWL, NodeProfile.REALTIME, NodeProfile.MIXED, NodeProfile.SIDELOAD),
+    MONITOR_FILE_STORAGE(NodeProfile.BATCH_CRAWL, NodeProfile.REALTIME, NodeProfile.MIXED, NodeProfile.SIDELOAD),
+    PROC_INDEX_CONSTRUCTOR_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.REALTIME, NodeProfile.MIXED, NodeProfile.SIDELOAD),
+
+    LIVE_CRAWL(NodeProfile.REALTIME),
+    PROC_LIVE_CRAWL_SPAWNER(NodeProfile.REALTIME),
+    SCRAPE_FEEDS(NodeProfile.REALTIME),
+    UPDATE_RSS(NodeProfile.REALTIME);
 
     public String id() {
         return "fsm:" + name().toLowerCase();
@@ -33,4 +42,9 @@ public enum ExecutorActor {
         return "fsm:" + name().toLowerCase() + ":" + node;
     }
 
+    ExecutorActor(NodeProfile... profileSet) {
+        this.profileSet = Set.of(profileSet);
+    }
+
+    public Set<NodeProfile> profileSet;
 }
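The public profileSet field is what lets the executor decide, per node, which actors to run at all; a minimal sketch mirroring the register() guard added to ExecutorActorControlService below:

    import nu.marginalia.actor.ExecutorActor;
    import nu.marginalia.nodecfg.model.NodeProfile;

    class ActorGatingSketch {
        // e.g. shouldRegister(ExecutorActor.LIVE_CRAWL, NodeProfile.MIXED) == false,
        // while shouldRegister(ExecutorActor.CONVERT_AND_LOAD, NodeProfile.REALTIME) == true
        static boolean shouldRegister(ExecutorActor actor, NodeProfile nodeProfile) {
            return actor.profileSet.contains(nodeProfile);
        }
    }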
@@ -10,11 +10,14 @@ import nu.marginalia.actor.state.ActorStateInstance;
 import nu.marginalia.actor.state.ActorStep;
 import nu.marginalia.actor.task.*;
 import nu.marginalia.mq.MessageQueueFactory;
+import nu.marginalia.nodecfg.NodeConfigurationService;
+import nu.marginalia.nodecfg.model.NodeConfiguration;
 import nu.marginalia.service.control.ServiceEventLog;
 import nu.marginalia.service.server.BaseServiceParams;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.sql.SQLException;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.UUID;
@@ -28,18 +31,23 @@ public class ExecutorActorControlService {
     public Map<ExecutorActor, ActorPrototype> actorDefinitions = new HashMap<>();
     private final int node;
 
+    private final NodeConfiguration nodeConfiguration;
+
     private final Logger logger = LoggerFactory.getLogger(getClass());
 
     @Inject
     public ExecutorActorControlService(MessageQueueFactory messageQueueFactory,
+                                       NodeConfigurationService configurationService,
                                        BaseServiceParams baseServiceParams,
                                        ConvertActor convertActor,
                                        ConvertAndLoadActor convertAndLoadActor,
                                        CrawlActor crawlActor,
+                                       LiveCrawlActor liveCrawlActor,
                                        RecrawlSingleDomainActor recrawlSingleDomainActor,
                                        RestoreBackupActor restoreBackupActor,
                                        ConverterMonitorActor converterMonitorFSM,
                                        CrawlerMonitorActor crawlerMonitorActor,
+                                       LiveCrawlerMonitorActor liveCrawlerMonitorActor,
                                        LoaderMonitorActor loaderMonitor,
                                        ProcessLivenessMonitorActor processMonitorFSM,
                                        FileStorageMonitorActor fileStorageMonitorActor,
@@ -51,16 +59,20 @@ public class ExecutorActorControlService {
                                        ExportSampleDataActor exportSampleDataActor,
                                        ExportTermFreqActor exportTermFrequenciesActor,
                                        ExportSegmentationModelActor exportSegmentationModelActor,
+                                       ExportTaskMonitorActor exportTasksMonitorActor,
                                        DownloadSampleActor downloadSampleActor,
                                        ScrapeFeedsActor scrapeFeedsActor,
                                        ExecutorActorStateMachines stateMachines,
-                                       UpdateRssActor updateRssActor) {
+                                       UpdateRssActor updateRssActor) throws SQLException {
         this.messageQueueFactory = messageQueueFactory;
         this.eventLog = baseServiceParams.eventLog;
         this.stateMachines = stateMachines;
         this.node = baseServiceParams.configuration.node();
 
+        this.nodeConfiguration = configurationService.get(node);
+
         register(ExecutorActor.CRAWL, crawlActor);
+        register(ExecutorActor.LIVE_CRAWL, liveCrawlActor);
         register(ExecutorActor.RECRAWL_SINGLE_DOMAIN, recrawlSingleDomainActor);
 
         register(ExecutorActor.CONVERT, convertActor);
@@ -71,6 +83,8 @@ public class ExecutorActorControlService {
         register(ExecutorActor.PROC_CONVERTER_SPAWNER, converterMonitorFSM);
         register(ExecutorActor.PROC_LOADER_SPAWNER, loaderMonitor);
         register(ExecutorActor.PROC_CRAWLER_SPAWNER, crawlerMonitorActor);
+        register(ExecutorActor.PROC_LIVE_CRAWL_SPAWNER, liveCrawlerMonitorActor);
+        register(ExecutorActor.PROC_EXPORT_TASKS_SPAWNER, exportTasksMonitorActor);
 
         register(ExecutorActor.MONITOR_PROCESS_LIVENESS, processMonitorFSM);
         register(ExecutorActor.MONITOR_FILE_STORAGE, fileStorageMonitorActor);
@@ -91,6 +105,11 @@ public class ExecutorActorControlService {
     }
 
     private void register(ExecutorActor process, RecordActorPrototype graph) {
+
+        if (!process.profileSet.contains(nodeConfiguration.profile())) {
+            return;
+        }
+
         var sm = new ActorStateMachine(messageQueueFactory, process.id(), node, UUID.randomUUID(), graph);
         sm.listen((function, param) -> logStateChange(process, function));
 
@@ -0,0 +1,29 @@
+package nu.marginalia.actor.proc;
+
+import com.google.gson.Gson;
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
+import nu.marginalia.mq.persistence.MqPersistence;
+import nu.marginalia.mqapi.ProcessInboxNames;
+import nu.marginalia.process.ProcessService;
+import nu.marginalia.service.module.ServiceConfiguration;
+
+@Singleton
+public class ExportTaskMonitorActor extends AbstractProcessSpawnerActor {
+
+    @Inject
+    public ExportTaskMonitorActor(Gson gson,
+                                  ServiceConfiguration configuration,
+                                  MqPersistence persistence,
+                                  ProcessService processService) {
+        super(gson,
+                configuration,
+                persistence,
+                processService,
+                ProcessInboxNames.EXPORT_TASK_INBOX,
+                ProcessService.ProcessId.EXPORT_TASKS);
+    }
+
+
+}
@@ -0,0 +1,29 @@
+package nu.marginalia.actor.proc;
+
+import com.google.gson.Gson;
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
+import nu.marginalia.mq.persistence.MqPersistence;
+import nu.marginalia.mqapi.ProcessInboxNames;
+import nu.marginalia.process.ProcessService;
+import nu.marginalia.service.module.ServiceConfiguration;
+
+@Singleton
+public class LiveCrawlerMonitorActor extends AbstractProcessSpawnerActor {
+
+    @Inject
+    public LiveCrawlerMonitorActor(Gson gson,
+                                   ServiceConfiguration configuration,
+                                   MqPersistence persistence,
+                                   ProcessService processService) {
+        super(gson,
+                configuration,
+                persistence,
+                processService,
+                ProcessInboxNames.LIVE_CRAWLER_INBOX,
+                ProcessService.ProcessId.LIVE_CRAWLER);
+    }
+
+
+}
@@ -10,6 +10,8 @@ import nu.marginalia.actor.state.ActorResumeBehavior;
 import nu.marginalia.actor.state.ActorStep;
 import nu.marginalia.actor.state.Resume;
 import nu.marginalia.model.EdgeDomain;
+import nu.marginalia.nodecfg.NodeConfigurationService;
+import nu.marginalia.nodecfg.model.NodeProfile;
 import nu.marginalia.service.control.ServiceEventLog;
 import nu.marginalia.service.module.ServiceConfiguration;
 import org.jsoup.Jsoup;
@@ -39,6 +41,7 @@ public class ScrapeFeedsActor extends RecordActorPrototype {
     private final Duration pollInterval = Duration.ofHours(6);
 
     private final ServiceEventLog eventLog;
+    private final NodeConfigurationService nodeConfigurationService;
     private final HikariDataSource dataSource;
     private final int nodeId;
 
@@ -54,8 +57,8 @@ public class ScrapeFeedsActor extends RecordActorPrototype {
     public ActorStep transition(ActorStep self) throws Exception {
         return switch(self) {
             case Initial() -> {
-                if (nodeId > 1) {
-                    yield new End();
+                if (nodeConfigurationService.get(nodeId).profile() != NodeProfile.REALTIME) {
+                    yield new Error("Invalid node profile for RSS update");
                 }
                 else {
                     yield new Wait(LocalDateTime.now().toString());
@@ -177,10 +180,12 @@ public class ScrapeFeedsActor extends RecordActorPrototype {
     public ScrapeFeedsActor(Gson gson,
                             ServiceEventLog eventLog,
                             ServiceConfiguration configuration,
+                            NodeConfigurationService nodeConfigurationService,
                             HikariDataSource dataSource)
     {
         super(gson);
         this.eventLog = eventLog;
+        this.nodeConfigurationService = nodeConfigurationService;
         this.dataSource = dataSource;
         this.nodeId = configuration.node();
     }
@@ -11,6 +11,8 @@ import nu.marginalia.api.feeds.RpcFeedUpdateMode;
 import nu.marginalia.mq.MqMessage;
 import nu.marginalia.mq.MqMessageState;
 import nu.marginalia.mq.persistence.MqPersistence;
+import nu.marginalia.nodecfg.NodeConfigurationService;
+import nu.marginalia.nodecfg.model.NodeProfile;
 import nu.marginalia.service.module.ServiceConfiguration;
 
 import java.time.Duration;
@@ -25,13 +27,19 @@ public class UpdateRssActor extends RecordActorPrototype {
     private final Duration updateInterval = Duration.ofHours(24);
     private final int cleanInterval = 60;
 
+    private final NodeConfigurationService nodeConfigurationService;
     private final MqPersistence persistence;
 
     @Inject
-    public UpdateRssActor(Gson gson, FeedsClient feedsClient, ServiceConfiguration serviceConfiguration, MqPersistence persistence) {
+    public UpdateRssActor(Gson gson,
+                          FeedsClient feedsClient,
+                          ServiceConfiguration serviceConfiguration,
+                          NodeConfigurationService nodeConfigurationService,
+                          MqPersistence persistence) {
         super(gson);
         this.feedsClient = feedsClient;
         this.nodeId = serviceConfiguration.node();
+        this.nodeConfigurationService = nodeConfigurationService;
         this.persistence = persistence;
     }
 
@@ -55,9 +63,8 @@ public class UpdateRssActor extends RecordActorPrototype {
     public ActorStep transition(ActorStep self) throws Exception {
         return switch (self) {
             case Initial() -> {
-                if (nodeId > 1) {
-                    // Only run on the first node
-                    yield new End();
+                if (nodeConfigurationService.get(nodeId).profile() != NodeProfile.REALTIME) {
+                    yield new Error("Invalid node profile for RSS update");
                 }
                 else {
                     // Wait for 5 minutes before starting the first update, to give the system time to start up properly
@@ -3,50 +3,68 @@ package nu.marginalia.actor.task;
 import com.google.gson.Gson;
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
-import nu.marginalia.extractor.AtagExporter;
-import nu.marginalia.extractor.ExporterIf;
-import nu.marginalia.storage.model.*;
 import nu.marginalia.actor.prototype.RecordActorPrototype;
 import nu.marginalia.actor.state.ActorStep;
+import nu.marginalia.mq.MqMessageState;
+import nu.marginalia.mq.outbox.MqOutbox;
+import nu.marginalia.mqapi.tasks.ExportTaskRequest;
+import nu.marginalia.process.ProcessOutboxes;
+import nu.marginalia.process.ProcessService;
 import nu.marginalia.storage.FileStorageService;
+import nu.marginalia.storage.model.FileStorageId;
+import nu.marginalia.storage.model.FileStorageState;
+import nu.marginalia.storage.model.FileStorageType;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.time.LocalDateTime;
 
 @Singleton
 public class ExportAtagsActor extends RecordActorPrototype {
     private final FileStorageService storageService;
-    private final ExporterIf atagExporter;
+    private final ActorProcessWatcher processWatcher;
+    private final MqOutbox exportTasksOutbox;
+    private final Logger logger = LoggerFactory.getLogger(getClass());
+
     public record Export(FileStorageId crawlId) implements ActorStep {}
-    public record Run(FileStorageId crawlId, FileStorageId destId) implements ActorStep {}
+    public record Run(FileStorageId crawlId, FileStorageId destId, long msgId) implements ActorStep {
+        public Run(FileStorageId crawlId, FileStorageId destId) {
+            this(crawlId, destId, -1);
+        }
+    }
 
     @Override
     public ActorStep transition(ActorStep self) throws Exception {
         return switch(self) {
             case Export(FileStorageId crawlId) -> {
-                var storage = storageService.allocateStorage(FileStorageType.EXPORT, "atag-export", "Anchor Tags " + LocalDateTime.now());
+                var storage = storageService.allocateStorage(FileStorageType.EXPORT, "atags-export", "Atags " + LocalDateTime.now());
+
                 if (storage == null) yield new Error("Bad storage id");
                 yield new Run(crawlId, storage.id());
             }
-            case Run(FileStorageId crawlId, FileStorageId destId) -> {
+            case Run(FileStorageId crawlId, FileStorageId destId, long msgId) when msgId < 0 -> {
                 storageService.setFileStorageState(destId, FileStorageState.NEW);
 
-                try {
-                    atagExporter.export(crawlId, destId);
-                    storageService.setFileStorageState(destId, FileStorageState.UNSET);
-                }
-                catch (Exception ex) {
-                    storageService.setFileStorageState(destId, FileStorageState.DELETE);
-                    yield new Error("Failed to export data");
-                }
+                long newMsgId = exportTasksOutbox.sendAsync(ExportTaskRequest.atags(crawlId, destId));
+                yield new Run(crawlId, destId, newMsgId);
             }
+            case Run(_, FileStorageId destId, long msgId) -> {
+                var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);
+
+                if (rsp.state() != MqMessageState.OK) {
+                    storageService.flagFileForDeletion(destId);
+                    yield new Error("Exporter failed");
+                }
+                else {
+                    storageService.setFileStorageState(destId, FileStorageState.UNSET);
+                    yield new End();
+                }
+            }
 
             default -> new Error();
         };
     }
 
     @Override
     public String describe() {
         return "Export anchor tags from crawl data";
@@ -55,11 +73,13 @@ public class ExportAtagsActor extends RecordActorPrototype {
     @Inject
     public ExportAtagsActor(Gson gson,
                             FileStorageService storageService,
-                            AtagExporter atagExporter)
+                            ProcessOutboxes processOutboxes,
+                            ActorProcessWatcher processWatcher)
     {
         super(gson);
+        this.exportTasksOutbox = processOutboxes.getExportTasksOutbox();
         this.storageService = storageService;
-        this.atagExporter = atagExporter;
+        this.processWatcher = processWatcher;
    }
 
 }
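The same two-phase shape recurs in the export actors that follow: Run's msgId defaults to -1 to mark a step that has not yet dispatched its request, the first transition sends the MQ message and re-enters Run with the real id, and the second transition waits for the spawned process's response, so a restarted actor can resume from either phase. A self-contained analog with stand-in types (the Marginalia classes are replaced by stubs here):

    // Stand-ins: sendAsync() plays MqOutbox.sendAsync, waitResponseOk()
    // plays ActorProcessWatcher.waitResponse.
    class TwoPhaseStepSketch {
        sealed interface Step permits Run, End, Err {}
        record Run(long msgId) implements Step {}
        record End() implements Step {}
        record Err(String why) implements Step {}

        static long sendAsync() { return 42; }
        static boolean waitResponseOk(long msgId) { return true; }

        static Step transition(Step self) {
            return switch (self) {
                case Run(long msgId) when msgId < 0 -> new Run(sendAsync());       // send phase
                case Run(long msgId) -> waitResponseOk(msgId) ? new End()
                                                              : new Err("failed"); // wait phase
                default -> new Err("unexpected");
            };
        }

        public static void main(String[] args) {
            Step s = new Run(-1);
            while (s instanceof Run) s = transition(s);
            System.out.println(s); // End[]
        }
    }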
@@ -5,8 +5,11 @@ import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import nu.marginalia.actor.prototype.RecordActorPrototype;
 import nu.marginalia.actor.state.ActorStep;
-import nu.marginalia.extractor.ExporterIf;
-import nu.marginalia.extractor.FeedExporter;
+import nu.marginalia.mq.MqMessageState;
+import nu.marginalia.mq.outbox.MqOutbox;
+import nu.marginalia.mqapi.tasks.ExportTaskRequest;
+import nu.marginalia.process.ProcessOutboxes;
+import nu.marginalia.process.ProcessService;
 import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.storage.model.FileStorageId;
 import nu.marginalia.storage.model.FileStorageState;
@@ -19,11 +22,17 @@ import java.time.LocalDateTime;
 @Singleton
 public class ExportFeedsActor extends RecordActorPrototype {
     private final FileStorageService storageService;
+    private final ActorProcessWatcher processWatcher;
+    private final MqOutbox exportTasksOutbox;
     private final Logger logger = LoggerFactory.getLogger(getClass());
 
-    private final ExporterIf feedExporter;
     public record Export(FileStorageId crawlId) implements ActorStep {}
-    public record Run(FileStorageId crawlId, FileStorageId destId) implements ActorStep {}
+    public record Run(FileStorageId crawlId, FileStorageId destId, long msgId) implements ActorStep {
+        public Run(FileStorageId crawlId, FileStorageId destId) {
+            this(crawlId, destId, -1);
+        }
+    }
 
     @Override
     public ActorStep transition(ActorStep self) throws Exception {
         return switch(self) {
@@ -33,20 +42,25 @@ public class ExportFeedsActor extends RecordActorPrototype {
                 if (storage == null) yield new Error("Bad storage id");
                 yield new Run(crawlId, storage.id());
             }
-            case Run(FileStorageId crawlId, FileStorageId destId) -> {
+            case Run(FileStorageId crawlId, FileStorageId destId, long msgId) when msgId < 0 -> {
                 storageService.setFileStorageState(destId, FileStorageState.NEW);
 
-                try {
-                    feedExporter.export(crawlId, destId);
-                    storageService.setFileStorageState(destId, FileStorageState.UNSET);
-                }
-                catch (Exception ex) {
-                    storageService.setFileStorageState(destId, FileStorageState.DELETE);
-                    yield new Error("Failed to export data");
-                }
+                long newMsgId = exportTasksOutbox.sendAsync(ExportTaskRequest.feeds(crawlId, destId));
+                yield new Run(crawlId, destId, newMsgId);
             }
+            case Run(_, FileStorageId destId, long msgId) -> {
+                var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);
+
+                if (rsp.state() != MqMessageState.OK) {
+                    storageService.flagFileForDeletion(destId);
+                    yield new Error("Exporter failed");
+                }
+                else {
+                    storageService.setFileStorageState(destId, FileStorageState.UNSET);
+                    yield new End();
+                }
+            }
 
             default -> new Error();
         };
     }
@@ -60,11 +74,13 @@ public class ExportFeedsActor extends RecordActorPrototype {
     @Inject
     public ExportFeedsActor(Gson gson,
                             FileStorageService storageService,
-                            FeedExporter feedExporter)
+                            ActorProcessWatcher processWatcher,
+                            ProcessOutboxes outboxes)
     {
         super(gson);
         this.storageService = storageService;
-        this.feedExporter = feedExporter;
+        this.processWatcher = processWatcher;
+        this.exportTasksOutbox = outboxes.getExportTasksOutbox();
     }
 
 }
@@ -5,7 +5,11 @@ import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import nu.marginalia.actor.prototype.RecordActorPrototype;
 import nu.marginalia.actor.state.ActorStep;
-import nu.marginalia.extractor.SampleDataExporter;
+import nu.marginalia.mq.MqMessageState;
+import nu.marginalia.mq.outbox.MqOutbox;
+import nu.marginalia.mqapi.tasks.ExportTaskRequest;
+import nu.marginalia.process.ProcessOutboxes;
+import nu.marginalia.process.ProcessService;
 import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.storage.model.FileStorageId;
 import nu.marginalia.storage.model.FileStorageState;
@@ -18,11 +22,17 @@ import java.time.LocalDateTime;
 @Singleton
 public class ExportSampleDataActor extends RecordActorPrototype {
     private final FileStorageService storageService;
+    private final ActorProcessWatcher processWatcher;
+    private final MqOutbox exportTasksOutbox;
     private final Logger logger = LoggerFactory.getLogger(getClass());
 
-    private final SampleDataExporter dataExporter;
     public record Export(FileStorageId crawlId, int size, String name) implements ActorStep {}
-    public record Run(FileStorageId crawlId, FileStorageId destId, int size, String name) implements ActorStep {}
+    public record Run(FileStorageId crawlId, FileStorageId destId, int size, String name, long msgId) implements ActorStep {
+        public Run(FileStorageId crawlId, FileStorageId destId, int size, String name) {
+            this(crawlId, destId, size, name, -1);
+        }
+    }
 
     @Override
     public ActorStep transition(ActorStep self) throws Exception {
         return switch(self) {
@@ -35,28 +45,29 @@ public class ExportSampleDataActor extends RecordActorPrototype {
                 if (storage == null) yield new Error("Bad storage id");
                 yield new Run(crawlId, storage.id(), size, name);
             }
-            case Run(FileStorageId crawlId, FileStorageId destId, int size, String name) -> {
+            case Run(FileStorageId crawlId, FileStorageId destId, int size, String name, long msgId) when msgId < 0 -> {
                 storageService.setFileStorageState(destId, FileStorageState.NEW);
 
-                try {
-                    dataExporter.export(crawlId, destId, size, name);
-                    storageService.setFileStorageState(destId, FileStorageState.UNSET);
-                }
-                catch (Exception ex) {
-                    storageService.setFileStorageState(destId, FileStorageState.DELETE);
-
-                    logger.error("Failed to export data", ex);
-
-                    yield new Error("Failed to export data");
-                }
-
+                long newMsgId = exportTasksOutbox.sendAsync(ExportTaskRequest.sampleData(crawlId, destId, size, name));
+                yield new Run(crawlId, destId, size, name, newMsgId);
+            }
+            case Run(_, FileStorageId destId, _, _, long msgId) -> {
+                var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);
+
+                if (rsp.state() != MqMessageState.OK) {
+                    storageService.flagFileForDeletion(destId);
+                    yield new Error("Exporter failed");
+                }
+                else {
+                    storageService.setFileStorageState(destId, FileStorageState.UNSET);
                     yield new End();
                 }
+            }
 
             default -> new Error();
         };
     }
 
 
     @Override
     public String describe() {
         return "Export RSS/Atom feeds from crawl data";
@@ -65,11 +76,13 @@ public class ExportSampleDataActor extends RecordActorPrototype {
     @Inject
     public ExportSampleDataActor(Gson gson,
                                  FileStorageService storageService,
-                                 SampleDataExporter dataExporter)
+                                 ProcessOutboxes processOutboxes,
+                                 ActorProcessWatcher processWatcher)
     {
         super(gson);
         this.storageService = storageService;
-        this.dataExporter = dataExporter;
+        this.processWatcher = processWatcher;
+        this.exportTasksOutbox = processOutboxes.getExportTasksOutbox();
     }
 
 }
@@ -5,45 +5,62 @@ import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import nu.marginalia.actor.prototype.RecordActorPrototype;
 import nu.marginalia.actor.state.ActorStep;
-import nu.marginalia.extractor.ExporterIf;
-import nu.marginalia.extractor.TermFrequencyExporter;
+import nu.marginalia.mq.MqMessageState;
+import nu.marginalia.mq.outbox.MqOutbox;
+import nu.marginalia.mqapi.tasks.ExportTaskRequest;
+import nu.marginalia.process.ProcessOutboxes;
+import nu.marginalia.process.ProcessService;
 import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.storage.model.FileStorageId;
 import nu.marginalia.storage.model.FileStorageState;
 import nu.marginalia.storage.model.FileStorageType;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.time.LocalDateTime;
 
 @Singleton
 public class ExportTermFreqActor extends RecordActorPrototype {
     private final FileStorageService storageService;
-    private final ExporterIf exporter;
+    private final ActorProcessWatcher processWatcher;
+    private final MqOutbox exportTasksOutbox;
+    private final Logger logger = LoggerFactory.getLogger(getClass());
 
     public record Export(FileStorageId crawlId) implements ActorStep {}
-    public record Run(FileStorageId crawlId, FileStorageId destId) implements ActorStep {}
+    public record Run(FileStorageId crawlId, FileStorageId destId, long msgId) implements ActorStep {
+        public Run(FileStorageId crawlId, FileStorageId destId) {
+            this(crawlId, destId, -1);
+        }
+    }
 
     @Override
     public ActorStep transition(ActorStep self) throws Exception {
         return switch(self) {
             case Export(FileStorageId crawlId) -> {
-                var storage = storageService.allocateStorage(FileStorageType.EXPORT, "term-freq-export", "Term Frequencies " + LocalDateTime.now());
+                var storage = storageService.allocateStorage(FileStorageType.EXPORT, "term-freq", "Term Frequencies " + LocalDateTime.now());
 
                 if (storage == null) yield new Error("Bad storage id");
                 yield new Run(crawlId, storage.id());
             }
-            case Run(FileStorageId crawlId, FileStorageId destId) -> {
+            case Run(FileStorageId crawlId, FileStorageId destId, long msgId) when msgId < 0 -> {
                 storageService.setFileStorageState(destId, FileStorageState.NEW);
 
-                try {
-                    exporter.export(crawlId, destId);
-                    storageService.setFileStorageState(destId, FileStorageState.UNSET);
-                }
-                catch (Exception ex) {
-                    storageService.setFileStorageState(destId, FileStorageState.DELETE);
-                    yield new Error("Failed to export data");
-                }
+                long newMsgId = exportTasksOutbox.sendAsync(ExportTaskRequest.termFreq(crawlId, destId));
+                yield new Run(crawlId, destId, newMsgId);
+            }
+            case Run(_, FileStorageId destId, long msgId) -> {
+                var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);
 
+                if (rsp.state() != MqMessageState.OK) {
+                    storageService.flagFileForDeletion(destId);
+                    yield new Error("Exporter failed");
+                }
+                else {
+                    storageService.setFileStorageState(destId, FileStorageState.UNSET);
                     yield new End();
                 }
+            }
 
             default -> new Error();
         };
     }
@@ -57,11 +74,13 @@ public class ExportTermFreqActor extends RecordActorPrototype {
     @Inject
     public ExportTermFreqActor(Gson gson,
                                FileStorageService storageService,
-                               TermFrequencyExporter exporter)
+                               ProcessOutboxes processOutboxes,
+                               ActorProcessWatcher processWatcher)
     {
         super(gson);
         this.storageService = storageService;
-        this.exporter = exporter;
+        this.processWatcher = processWatcher;
+        this.exportTasksOutbox = processOutboxes.getExportTasksOutbox();
     }
 
 }
 code/execution/java/nu/marginalia/actor/task/LiveCrawlActor.java (new file, 108 lines)
@@ -0,0 +1,108 @@
+package nu.marginalia.actor.task;
+
+import com.google.gson.Gson;
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import nu.marginalia.IndexLocations;
+import nu.marginalia.actor.ExecutorActor;
+import nu.marginalia.actor.ExecutorActorStateMachines;
+import nu.marginalia.actor.prototype.RecordActorPrototype;
+import nu.marginalia.actor.state.ActorStep;
+import nu.marginalia.api.feeds.FeedsClient;
+import nu.marginalia.mq.MqMessageState;
+import nu.marginalia.mq.outbox.MqOutbox;
+import nu.marginalia.mqapi.crawling.LiveCrawlRequest;
+import nu.marginalia.process.ProcessOutboxes;
+import nu.marginalia.process.ProcessService;
+import nu.marginalia.storage.FileStorageService;
+import org.apache.commons.io.FileUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.time.Duration;
+import java.util.Objects;
+
+@Singleton
+public class LiveCrawlActor extends RecordActorPrototype {
+
+    // STATES
+    private final ActorProcessWatcher processWatcher;
+    private final MqOutbox mqLiveCrawlerOutbox;
+    private final ExecutorActorStateMachines executorActorStateMachines;
+    private final FeedsClient feedsClient;
+    private final Logger logger = LoggerFactory.getLogger(getClass());
+
+    private final FileStorageService fileStorageService;
+
+    public record Initial() implements ActorStep {}
+    public record Monitor(String feedsHash) implements ActorStep {}
+    public record LiveCrawl(String feedsHash, long msgId) implements ActorStep {
+        public LiveCrawl(String feedsHash) { this(feedsHash, -1); }
+    }
+
+    @Override
+    public ActorStep transition(ActorStep self) throws Exception {
+        logger.info("{}", self);
+        return switch (self) {
+            case Initial() -> {
+                yield new Monitor("-");
+            }
+            case Monitor(String feedsHash) -> {
+                for (;;) {
+                    String currentHash = feedsClient.getFeedDataHash();
+                    if (!Objects.equals(currentHash, feedsHash)) {
+                        yield new LiveCrawl(currentHash);
+                    }
+                    Thread.sleep(Duration.ofMinutes(15));
+                }
+            }
+            case LiveCrawl(String feedsHash, long msgId) when msgId < 0 -> {
+                // Clear the index journal before starting the crawl
+                Path indexJournalLocation = IndexLocations.getIndexConstructionArea(fileStorageService).resolve("index-journal");
+                if (Files.isDirectory(indexJournalLocation)) {
+                    FileUtils.deleteDirectory(indexJournalLocation.toFile());
+                }
+
+                long id = mqLiveCrawlerOutbox.sendAsync(new LiveCrawlRequest());
+                yield new LiveCrawl(feedsHash, id);
+            }
+            case LiveCrawl(String feedsHash, long msgId) -> {
+                var rsp = processWatcher.waitResponse(mqLiveCrawlerOutbox, ProcessService.ProcessId.LIVE_CRAWLER, msgId);
+
+                if (rsp.state() != MqMessageState.OK) {
+                    yield new Error("Crawler failed");
+                }
+
+                // Build the index
+                executorActorStateMachines.initFrom(ExecutorActor.CONVERT_AND_LOAD, new ConvertAndLoadActor.Rerank());
+
+                yield new Monitor(feedsHash);
+            }
+            default -> new Error("Unknown state");
+        };
+    }
+
+    @Override
+    public String describe() {
+        return "Actor that polls the feeds database for changes, and triggers the live crawler when needed";
+    }
+
+    @Inject
+    public LiveCrawlActor(ActorProcessWatcher processWatcher,
+                          ProcessOutboxes processOutboxes,
+                          FeedsClient feedsClient,
+                          Gson gson,
+                          ExecutorActorStateMachines executorActorStateMachines, FileStorageService fileStorageService)
+    {
+        super(gson);
+        this.processWatcher = processWatcher;
+        this.mqLiveCrawlerOutbox = processOutboxes.getLiveCrawlerOutbox();
+        this.executorActorStateMachines = executorActorStateMachines;
+        this.feedsClient = feedsClient;
+        this.fileStorageService = fileStorageService;
+    }
+
+
+}
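LiveCrawlActor is the heart of the feature: Monitor polls the feeds service every 15 minutes and compares the data hash against the one it last acted on, so the very first pass (hash "-") always triggers a crawl, and each database swap thereafter triggers exactly one more. A sketch of how the executor might start it; the LIVE_CRAWL enum constant is an assumption, while the call shape mirrors the CONVERT_AND_LOAD invocation inside the actor itself:

    // Hypothetical bootstrap; ExecutorActor.LIVE_CRAWL is assumed to be the
    // registration added elsewhere in this changeset.
    executorActorStateMachines.initFrom(ExecutorActor.LIVE_CRAWL, new LiveCrawlActor.Initial());

Note also that the msgId < 0 step wipes the index-journal staging directory before dispatching, so each live crawl starts from a clean journal.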
@@ -5,53 +5,59 @@ import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import nu.marginalia.actor.prototype.RecordActorPrototype;
 import nu.marginalia.actor.state.ActorStep;
+import nu.marginalia.mq.MqMessageState;
+import nu.marginalia.mq.outbox.MqOutbox;
+import nu.marginalia.mqapi.tasks.ExportTaskRequest;
+import nu.marginalia.process.ProcessOutboxes;
 import nu.marginalia.process.ProcessService;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.atomic.AtomicBoolean;
-
 @Singleton
 public class TriggerAdjacencyCalculationActor extends RecordActorPrototype {
 
+    private final ActorProcessWatcher processWatcher;
+    private final MqOutbox exportTasksOutbox;
     private final Logger logger = LoggerFactory.getLogger(getClass());
-    private final ProcessService processService;
-    private final ExecutorService executor = Executors.newSingleThreadExecutor();
 
-    public record Run() implements ActorStep {}
+    public record Run(long msgId) implements ActorStep {
+        public Run() {
+            this(-1);
+        }
+    }
 
     @Override
     public ActorStep transition(ActorStep self) throws Exception {
-        return switch (self) {
-            case Run() -> {
-                AtomicBoolean hasError = new AtomicBoolean(false);
-                var future = executor.submit(() -> {
-                    try {
-                        processService.trigger(ProcessService.ProcessId.ADJACENCIES_CALCULATOR, "load");
-                    }
-                    catch (Exception ex) {
-                        logger.warn("Error triggering adjacency calculation", ex);
-                        hasError.set(true);
-                    }
-                });
-                future.get();
+        return switch(self) {
+            case Run(long msgId) when msgId < 0 -> {
+                long newMsgId = exportTasksOutbox.sendAsync(ExportTaskRequest.adjacencies());
+                yield new Run(newMsgId);
+            }
+            case Run(long msgId) -> {
+                var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);
 
-                if (hasError.get()) {
-                    yield new Error("Error triggering adjacency calculation");
+                if (rsp.state() != MqMessageState.OK) {
+                    yield new Error("Exporter failed");
                 }
+                else {
                     yield new End();
                 }
+            }
 
             default -> new Error();
         };
     }
 
 
     @Inject
     public TriggerAdjacencyCalculationActor(Gson gson,
-                                            ProcessService processService) {
+                                            ProcessOutboxes processOutboxes,
+                                            ActorProcessWatcher processWatcher) {
         super(gson);
-        this.processService = processService;
+        this.exportTasksOutbox = processOutboxes.getExportTasksOutbox();
+        this.processWatcher = processWatcher;
 
     }
 
     @Override
@@ -15,9 +15,12 @@ import nu.marginalia.service.module.ServiceConfiguration;
 import nu.marginalia.service.server.DiscoverableService;
 import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.storage.model.FileStorageId;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.time.Duration;
 import java.time.LocalDateTime;
 import java.time.ZoneId;
 import java.time.format.DateTimeFormatter;
@@ -32,6 +35,8 @@ public class ExecutorGrpcService
     private final ServiceConfiguration serviceConfiguration;
     private final ExecutorActorControlService actorControlService;
 
+    private static final Logger logger = LoggerFactory.getLogger(ExecutorGrpcService.class);
+
     @Inject
     public ExecutorGrpcService(ActorApi actorApi,
                                FileStorageService fileStorageService,
@@ -240,5 +245,22 @@ public class ExecutorGrpcService
         }
     }
+
+    @Override
+    public void restartExecutorService(Empty request, StreamObserver<Empty> responseObserver) {
+        responseObserver.onNext(Empty.getDefaultInstance());
+        responseObserver.onCompleted();
+
+        logger.info("Restarting executor service on node {}", serviceConfiguration.node());
+
+        try {
+            // Wait for the response to be sent before restarting
+            Thread.sleep(Duration.ofSeconds(5));
+        }
+        catch (InterruptedException e) {
+            logger.warn("Interrupted while waiting for restart", e);
+        }
+
+        System.exit(0);
+    }
 
 }
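The restart endpoint deliberately acknowledges the RPC before exiting, and relies on the surrounding supervisor (a container or init-system restart policy) to bring the service back up. A sketch of the client side, where the stub name is an assumption based on the project's gRPC conventions:

    // Hypothetical caller; ExecutorApiGrpc is an assumed generated stub name.
    var stub = ExecutorApiGrpc.newBlockingStub(channel);
    stub.restartExecutorService(Empty.getDefaultInstance());
    // Returns immediately; the service sleeps ~5 s so the response can flush,
    // then System.exit(0) hands control back to the process supervisor.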
@@ -13,6 +13,8 @@ public class ProcessOutboxes {
     private final MqOutbox loaderOutbox;
     private final MqOutbox crawlerOutbox;
     private final MqOutbox indexConstructorOutbox;
+    private final MqOutbox liveCrawlerOutbox;
+    private final MqOutbox exportTasksOutbox;
 
     @Inject
     public ProcessOutboxes(BaseServiceParams params, MqPersistence persistence) {
@@ -44,6 +46,22 @@ public class ProcessOutboxes {
                 params.configuration.node(),
                 params.configuration.instanceUuid()
         );
+
+        liveCrawlerOutbox = new MqOutbox(persistence,
+                ProcessInboxNames.LIVE_CRAWLER_INBOX,
+                params.configuration.node(),
+                params.configuration.serviceName(),
+                params.configuration.node(),
+                params.configuration.instanceUuid()
+        );
+
+        exportTasksOutbox = new MqOutbox(persistence,
+                ProcessInboxNames.EXPORT_TASK_INBOX,
+                params.configuration.node(),
+                params.configuration.serviceName(),
+                params.configuration.node(),
+                params.configuration.instanceUuid()
+        );
     }
 
 
@@ -60,4 +78,8 @@ public class ProcessOutboxes {
     }
 
     public MqOutbox getIndexConstructorOutbox() { return indexConstructorOutbox; }
+
+    public MqOutbox getLiveCrawlerOutbox() { return liveCrawlerOutbox; }
+
+    public MqOutbox getExportTasksOutbox() { return exportTasksOutbox; }
 }
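Each outbox pairs with one named process inbox on the local node, which is presumably why the node id appears twice in the constructor arguments (once for the inbox partition, once for the sending service). Dispatching through one of the new outboxes looks roughly like this, reusing only calls visible in this diff:

    // Identifiers as introduced above; LiveCrawlRequest comes from mqapi.
    MqOutbox liveCrawler = processOutboxes.getLiveCrawlerOutbox();
    long msgId = liveCrawler.sendAsync(new LiveCrawlRequest());
    // an ActorProcessWatcher later waits on the reply keyed by msgId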
@@ -3,14 +3,14 @@ package nu.marginalia.process;
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import nu.marginalia.WmsaHome;
-import nu.marginalia.adjacencies.WebsiteAdjacenciesCalculator;
 import nu.marginalia.converting.ConverterMain;
 import nu.marginalia.crawl.CrawlerMain;
 import nu.marginalia.index.IndexConstructorMain;
+import nu.marginalia.livecrawler.LiveCrawlerMain;
 import nu.marginalia.loading.LoaderMain;
-import nu.marginalia.service.ProcessMainClass;
 import nu.marginalia.service.control.ServiceEventLog;
 import nu.marginalia.service.server.BaseServiceParams;
+import nu.marginalia.task.ExportTasksMain;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.slf4j.Marker;
@@ -37,23 +37,24 @@ public class ProcessService {
     private final int node;
 
 
-    public static ProcessService.ProcessId translateExternalIdBase(String id) {
+    public static ProcessId translateExternalIdBase(String id) {
         return switch (id) {
-            case "converter" -> ProcessService.ProcessId.CONVERTER;
-            case "crawler" -> ProcessService.ProcessId.CRAWLER;
-            case "loader" -> ProcessService.ProcessId.LOADER;
-            case "website-adjacencies-calculator" -> ProcessService.ProcessId.ADJACENCIES_CALCULATOR;
-            case "index-constructor" -> ProcessService.ProcessId.INDEX_CONSTRUCTOR;
+            case "converter" -> ProcessId.CONVERTER;
+            case "crawler" -> ProcessId.CRAWLER;
+            case "loader" -> ProcessId.LOADER;
+            case "export-tasks" -> ProcessId.EXPORT_TASKS;
+            case "index-constructor" -> ProcessId.INDEX_CONSTRUCTOR;
             default -> null;
         };
     }
 
     public enum ProcessId {
         CRAWLER(CrawlerMain.class),
+        LIVE_CRAWLER(LiveCrawlerMain.class),
         CONVERTER(ConverterMain.class),
         LOADER(LoaderMain.class),
         INDEX_CONSTRUCTOR(IndexConstructorMain.class),
-        ADJACENCIES_CALCULATOR(WebsiteAdjacenciesCalculator.class)
+        EXPORT_TASKS(ExportTasksMain.class),
         ;
 
         public final String mainClass;
@@ -64,10 +65,11 @@ public class ProcessService {
         List<String> envOpts() {
             String variable = switch (this) {
                 case CRAWLER -> "CRAWLER_PROCESS_OPTS";
+                case LIVE_CRAWLER -> "LIVE_CRAWLER_PROCESS_OPTS";
                 case CONVERTER -> "CONVERTER_PROCESS_OPTS";
                 case LOADER -> "LOADER_PROCESS_OPTS";
                 case INDEX_CONSTRUCTOR -> "INDEX_CONSTRUCTION_PROCESS_OPTS";
-                case ADJACENCIES_CALCULATOR -> "ADJACENCIES_CALCULATOR_PROCESS_OPTS";
+                case EXPORT_TASKS -> "EXPORT_TASKS_PROCESS_OPTS";
             };
             String value = System.getenv(variable);
 
@@ -11,10 +11,15 @@ import nu.marginalia.service.discovery.property.ServicePartition;
 import nu.marginalia.service.module.ServiceConfiguration;
 
 import javax.annotation.CheckReturnValue;
+import java.time.Instant;
+import java.util.ArrayList;
+import java.util.List;
 import java.util.UUID;
 import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutionException;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
+import java.util.function.BiConsumer;
 
 @Singleton
 public class FeedsClient {
@@ -23,7 +28,9 @@ public class FeedsClient {
     private final MqOutbox updateFeedsOutbox;
 
     @Inject
-    public FeedsClient(GrpcChannelPoolFactory factory, MqPersistence mqPersistence, ServiceConfiguration serviceConfiguration) {
+    public FeedsClient(GrpcChannelPoolFactory factory,
+                       MqPersistence mqPersistence,
+                       ServiceConfiguration serviceConfiguration) {
+
         // The client is only interested in the primary node
         var key = ServiceKey.forGrpcApi(FeedApiGrpc.class, ServicePartition.any());
@@ -46,6 +53,25 @@ public class FeedsClient {
         }
     }
 
+    public void getUpdatedDomains(Instant since, BiConsumer<String, List<String>> consumer) throws ExecutionException, InterruptedException {
+        channelPool.call(FeedApiGrpc.FeedApiBlockingStub::getUpdatedLinks)
+                .run(RpcUpdatedLinksRequest.newBuilder().setSinceEpochMillis(since.toEpochMilli()).build())
+                .forEachRemaining(rsp -> consumer.accept(rsp.getDomain(), new ArrayList<>(rsp.getUrlList())));
+    }
+
+    public record UpdatedDomain(String domain, List<String> urls) {
+        public UpdatedDomain(RpcUpdatedLinksResponse rsp) {
+            this(rsp.getDomain(), new ArrayList<>(rsp.getUrlList()));
+        }
+    }
+
+    /** Get the hash of the feed data, for identifying when the data has been updated */
+    public String getFeedDataHash() {
+        return channelPool.call(FeedApiGrpc.FeedApiBlockingStub::getFeedDataHash)
+                .run(Empty.getDefaultInstance())
+                .getHash();
+    }
+
     /** Update the feeds, return a message ID for the update */
     @CheckReturnValue
     public long updateFeeds(RpcFeedUpdateMode mode) throws Exception {
@@ -7,7 +7,22 @@ option java_multiple_files=true;
 
 service FeedApi {
   rpc getFeed(RpcDomainId) returns (RpcFeed) {}
+  rpc getFeedDataHash(Empty) returns (RpcFeedDataHash) {}
   rpc updateFeeds(RpcUpdateRequest) returns (Empty) {}
+  rpc getUpdatedLinks(RpcUpdatedLinksRequest) returns (stream RpcUpdatedLinksResponse) {}
+}
+
+message RpcUpdatedLinksRequest {
+  int64 sinceEpochMillis = 1;
+}
+
+message RpcUpdatedLinksResponse {
+  string domain = 1;
+  repeated string url = 2;
+}
+
+message RpcFeedDataHash {
+  string hash = 1;
 }
 
 message RpcDomainId {
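Client-side, the streamed responses surface through FeedsClient.getUpdatedDomains above. For example, collecting everything published in the last day into a map; the surrounding variables are illustrative:

    // Sketch: gather links per domain discovered since yesterday.
    Map<String, List<String>> updated = new HashMap<>();
    feedsClient.getUpdatedDomains(Instant.now().minus(Duration.ofDays(1)), updated::put);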
@@ -11,11 +11,16 @@ import org.jetbrains.annotations.NotNull;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.io.BufferedInputStream;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.StandardCopyOption;
+import java.security.MessageDigest;
+import java.time.Instant;
+import java.util.Base64;
 import java.util.List;
 import java.util.Optional;
+import java.util.function.BiConsumer;
 
 @Singleton
 public class FeedDb {
@@ -45,6 +50,18 @@ public class FeedDb {
         }
     }
 
+    /** Constructor for testing */
+    public FeedDb(Path dbPath) {
+        feedDbEnabled = true;
+        readerDbPath = dbPath;
+
+        try {
+            reader = new FeedDbReader(readerDbPath);
+        } catch (Exception e) {
+            reader = null;
+        }
+    }
+
     public boolean isEnabled() {
         return feedDbEnabled;
     }
@@ -112,7 +129,7 @@ public class FeedDb {
         }
 
         try {
-            Path dbFile = Files.createTempFile(WmsaHome.getDataPath(), "rss-feeds", ".tmp.db");
+            Path dbFile = Files.createTempFile(readerDbPath.getParent(), "rss-feeds", ".tmp.db");
             return new FeedDbWriter(dbFile);
         } catch (Exception e) {
             logger.error("Error creating new database writer", e);
@@ -141,4 +158,34 @@ public class FeedDb {
         }
     }
 
+    public String getDataHash() throws Exception {
+        MessageDigest digest = MessageDigest.getInstance("MD5");
+
+        byte[] buffer = new byte[4096];
+
+        try (var inputStream = new BufferedInputStream(Files.newInputStream(readerDbPath))) {
+            int rb;
+
+            while ((rb = inputStream.read(buffer)) >= 0) {
+                digest.update(buffer, 0, rb);
+            }
+        }
+
+        return Base64.getEncoder().encodeToString(digest.digest());
+    }
+
+    public void getLinksUpdatedSince(Instant since, BiConsumer<String, List<String>> consumer) throws Exception {
+        if (!feedDbEnabled) {
+            throw new IllegalStateException("Feed database is disabled on this node");
+        }
+
+        // Capture the current reader to avoid concurrency issues
+        FeedDbReader reader = this.reader;
+
+        if (reader == null) {
+            throw new NullPointerException("Reader is not available");
+        }
+
+        reader.getLinksUpdatedSince(since, consumer);
+    }
 }
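getDataHash streams the reader database in 4 KB chunks, so it stays cheap even for a large file, and the Base64-encoded MD5 works purely as a change token rather than a security digest: any byte that differs after a database swap yields a different string. A polling consumer only ever compares tokens, along these lines (lastSeenHash is an illustrative variable):

    String current = feedDb.getDataHash();
    if (!Objects.equals(current, lastSeenHash)) {
        lastSeenHash = current;
        // kick off downstream work, as LiveCrawlActor's Monitor state does
    }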
@@ -12,9 +12,11 @@ import java.nio.file.Path;
 import java.sql.Connection;
 import java.sql.DriverManager;
 import java.sql.SQLException;
+import java.time.Instant;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Optional;
+import java.util.function.BiConsumer;
 
 public class FeedDbReader implements AutoCloseable {
     private static final Logger logger = LoggerFactory.getLogger(FeedDbReader.class);
@@ -99,4 +101,27 @@ public class FeedDbReader implements AutoCloseable {
     }
 
+    public void getLinksUpdatedSince(Instant since, BiConsumer<String, List<String>> consumer) {
+        try (var stmt = connection.prepareStatement("SELECT FEED FROM feed")) {
+            var rs = stmt.executeQuery();
+
+            while (rs.next()) {
+                FeedItems items = deserialize(rs.getString(1));
+
+                List<String> urls = new ArrayList<>();
+                for (var item : items.items()) {
+                    if (item.getUpdateTimeZD().toInstant().isAfter(since)) {
+                        urls.add(item.url());
+                    }
+                }
+
+                if (!urls.isEmpty()) {
+                    consumer.accept(items.domain(), new ArrayList<>(urls));
+                    urls.clear();
+                }
+            }
+        } catch (SQLException e) {
+            logger.error("Error getting updated links", e);
+        }
+    }
 }
@@ -55,6 +55,11 @@ public record FeedItem(String title,
         return zonedDateTime.map(date -> date.format(DATE_FORMAT)).orElse("");
     }
 
+    public ZonedDateTime getUpdateTimeZD() {
+        return ZonedDateTime.parse(date, DATE_FORMAT);
+    }
+
+
     @Override
     public int compareTo(@NotNull FeedItem o) {
         return o.date.compareTo(date);
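One caveat worth noting: the formatter just above maps an absent date to the empty string, but getUpdateTimeZD parses unconditionally and would throw DateTimeParseException for such items. A defensive caller might guard on the raw field first (a sketch, not code from the repository):

    Optional<Instant> updated = item.date().isBlank()
            ? Optional.empty()
            : Optional.of(item.getUpdateTimeZD().toInstant());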
@@ -1,7 +1,6 @@
 package nu.marginalia.rss.model;
 
 import java.util.List;
-import java.util.Optional;
 
 public record FeedItems(String domain,
                         String feedUrl,
@@ -17,17 +16,4 @@ public record FeedItems(String domain,
     public boolean isEmpty() {
         return items.isEmpty();
     }
-
-    public Optional<FeedItem> getLatest() {
-        if (items.isEmpty())
-            return Optional.empty();
-
-        return Optional.of(
-                items.getFirst()
-        );
-    }
-
-    public Optional<String> getLatestDate() {
-        return getLatest().map(FeedItem::date);
-    }
 }
@@ -70,7 +70,7 @@ public class FeedFetcherService {
         this.serviceHeartbeat = serviceHeartbeat;
         this.executorClient = executorClient;
 
-        rssReader.addHeader("User-Agent", WmsaHome.getUserAgent().uaIdentifier() + " RSS Feed Fetcher");
+        rssReader.addHeader("User-Agent", WmsaHome.getUserAgent().uaIdentifier());
     }
 
     public enum UpdateMode {
@@ -14,6 +14,8 @@ import nu.marginalia.service.server.DiscoverableService;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.time.Instant;
+import java.util.List;
 import java.util.Optional;
 
 public class FeedsGrpcService extends FeedApiGrpc.FeedApiImplBase implements DiscoverableService {
@@ -64,6 +66,45 @@ public class FeedsGrpcService extends FeedApiGrpc.FeedApiImplBase implements DiscoverableService {
         responseObserver.onCompleted();
     }
 
+    @Override
+    public void getFeedDataHash(Empty request, StreamObserver<RpcFeedDataHash> responseObserver) {
+        if (!feedDb.isEnabled()) {
+            responseObserver.onError(new IllegalStateException("Feed database is disabled on this node"));
+            return;
+        }
+
+        try {
+            String hash = feedDb.getDataHash();
+            responseObserver.onNext(RpcFeedDataHash.newBuilder().setHash(hash).build());
+            responseObserver.onCompleted();
+        }
+        catch (Exception e) {
+            logger.error("Error getting feed data hash", e);
+            responseObserver.onError(e);
+        }
+    }
+
+    @Override
+    public void getUpdatedLinks(RpcUpdatedLinksRequest request, StreamObserver<RpcUpdatedLinksResponse> responseObserver) {
+        Instant since = Instant.ofEpochMilli(request.getSinceEpochMillis());
+
+        try {
+            feedDb.getLinksUpdatedSince(since, (String domain, List<String> urls) -> {
+                RpcUpdatedLinksResponse rsp = RpcUpdatedLinksResponse.newBuilder()
+                        .setDomain(domain)
+                        .addAllUrl(urls)
+                        .build();
+                responseObserver.onNext(rsp);
+            });
+
+            responseObserver.onCompleted();
+        }
+        catch (Exception e) {
+            logger.error("Error getting updated links", e);
+            responseObserver.onError(e);
+        }
+    }
+
     @Override
     public void getFeed(RpcDomainId request,
                         StreamObserver<RpcFeed> responseObserver)
@@ -0,0 +1,34 @@
+package nu.marginalia.rss.db;
+
+import org.junit.jupiter.api.Tag;
+import org.junit.jupiter.api.Test;
+
+import java.nio.file.Path;
+import java.sql.SQLException;
+import java.time.Instant;
+import java.time.temporal.ChronoUnit;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+class FeedDbReaderTest {
+
+    @Tag("flaky") // will only work on ~vlofgren, not on CI; remove test when this feature is stable
+    @Test
+    void getLinksUpdatedSince() throws SQLException {
+        var reader = new FeedDbReader(Path.of("/home/vlofgren/rss-feeds.db"));
+        Map<String, List<String>> links = new HashMap<>();
+
+        reader.getLinksUpdatedSince(Instant.now().minus(10, ChronoUnit.DAYS), links::put);
+
+        System.out.println(links.size());
+        for (var link : links.values()) {
+            if (link.size() < 2) {
+                System.out.println(link);
+            }
+        }
+
+        reader.close();
+    }
+}
@@ -0,0 +1,36 @@
+package nu.marginalia.rss.db;
+
+import nu.marginalia.rss.model.FeedItem;
+import nu.marginalia.rss.model.FeedItems;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.List;
+
+class FeedDbTest {
+
+    @Test
+    public void testDbHash() throws Exception {
+        Path dbPath = Files.createTempFile("rss-feeds", ".db");
+        try {
+            FeedDb db = new FeedDb(dbPath);
+
+            try (var writer = db.createWriter()) {
+                writer.saveFeed(new FeedItems("foo", "bar", "baz", List.of(
+                        new FeedItem("title1", "link1", "description1", "content1"),
+                        new FeedItem("title2", "link2", "description2", "content2")
+                )));
+
+                db.switchDb(writer);
+            }
+
+            var hash = db.getDataHash();
+            Assertions.assertFalse(hash.isBlank());
+        } finally {
+            Files.delete(dbPath);
+        }
+    }
+}
@@ -67,7 +67,6 @@ dependencies {
     testImplementation libs.bundles.junit
     testImplementation libs.mockito
     testImplementation libs.commons.lang3
-    testImplementation project(':code:common:process')
     testImplementation project(':code:libraries:array')
 
     testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
@@ -20,7 +20,7 @@ dependencies {
     implementation project(':code:index:query')
     implementation project(':code:index:index-journal')
     implementation project(':code:common:model')
-    implementation project(':code:common:process')
+    implementation project(':code:common:service')
     implementation project(':code:processes:converting-process:model')
 
     implementation libs.bundles.slf4j
@@ -21,8 +21,8 @@ dependencies {
     implementation project(':code:index:query')
     implementation project(':code:index:index-journal')
     implementation project(':code:common:model')
+    implementation project(':code:common:service')
     implementation project(':code:processes:converting-process:model')
-    implementation project(':code:common:process')
 
     implementation project(':third-party:parquet-floor')
     implementation project(':third-party:commons-codec')
@@ -21,7 +21,6 @@ tasks.distZip.enabled = false
 apply from: "$rootProject.projectDir/srcsets.gradle"
 
 dependencies {
-    implementation project(':code:common:process')
 
     implementation project(':third-party:porterstemmer')
     implementation project(':third-party:count-min-sketch')
@@ -14,9 +14,9 @@ apply from: "$rootProject.projectDir/srcsets.gradle"
 
 dependencies {
     implementation project(':code:common:config')
+    implementation project(':code:common:service')
     implementation project(':code:common:model')
     implementation project(':code:common:db')
-    implementation project(':code:common:process')
     implementation project(':code:processes:converting-process:ft-keyword-extraction')
     implementation project(':code:libraries:language-processing')
     implementation project(':code:libraries:term-frequency-dict')
@@ -2,10 +2,10 @@ package nu.marginalia.atags.source;
 
 import com.google.inject.Inject;
 import com.zaxxer.hikari.HikariDataSource;
-import nu.marginalia.ProcessConfiguration;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.atags.model.DomainLinks;
 import nu.marginalia.model.EdgeDomain;
+import nu.marginalia.process.ProcessConfiguration;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -4,8 +4,6 @@ import com.google.gson.Gson;
 import com.google.inject.Guice;
 import com.google.inject.Inject;
 import com.google.inject.Injector;
-import nu.marginalia.ProcessConfiguration;
-import nu.marginalia.ProcessConfigurationModule;
 import nu.marginalia.converting.model.CrawlPlan;
 import nu.marginalia.converting.model.WorkDir;
 import nu.marginalia.converting.processor.DomainProcessor;
@@ -17,14 +15,14 @@ import nu.marginalia.converting.writer.ConverterWriter;
 import nu.marginalia.io.CrawledDomainReader;
 import nu.marginalia.io.SerializableCrawlDataStream;
 import nu.marginalia.mq.MessageQueueFactory;
-import nu.marginalia.mq.MqMessage;
-import nu.marginalia.mq.inbox.MqInboxResponse;
-import nu.marginalia.mq.inbox.MqSingleShotInbox;
+import nu.marginalia.mqapi.converting.ConvertRequest;
+import nu.marginalia.process.ProcessConfiguration;
+import nu.marginalia.process.ProcessConfigurationModule;
+import nu.marginalia.process.ProcessMainClass;
 import nu.marginalia.process.control.ProcessHeartbeat;
 import nu.marginalia.process.control.ProcessHeartbeatImpl;
 import nu.marginalia.process.log.WorkLog;
 import nu.marginalia.process.log.WorkLogEntry;
-import nu.marginalia.service.ProcessMainClass;
 import nu.marginalia.service.module.DatabaseModule;
 import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.util.SimpleBlockingThreadPool;
@@ -34,13 +32,13 @@ import org.apache.logging.log4j.util.Strings;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.sql.SQLException;
 import java.util.Collection;
 import java.util.List;
 import java.util.Optional;
-import java.util.UUID;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.function.Function;
@@ -50,12 +48,9 @@ import static nu.marginalia.mqapi.ProcessInboxNames.CONVERTER_INBOX;
 public class ConverterMain extends ProcessMainClass {
     private static final Logger logger = LoggerFactory.getLogger(ConverterMain.class);
     private final DomainProcessor processor;
-    private final Gson gson;
     private final ProcessHeartbeat heartbeat;
-    private final MessageQueueFactory messageQueueFactory;
     private final FileStorageService fileStorageService;
     private final SideloadSourceFactory sideloadSourceFactory;
-    private final int node;
 
     public static void main(String... args) throws Exception {
 
@@ -70,8 +65,9 @@ public class ConverterMain extends ProcessMainClass {
 
         logger.info("Starting pipe");
 
-        converter
-            .fetchInstructions()
+        Instructions<ConvertRequest> instructions = converter.fetchInstructions(ConvertRequest.class);
+
+        converter.createAction(instructions)
             .execute(converter);
 
         logger.info("Finished");
@@ -83,6 +79,65 @@ public class ConverterMain extends ProcessMainClass {
         System.exit(0);
     }
 
+    private Action createAction(Instructions<ConvertRequest> instructions) throws SQLException, IOException {
+        var request = instructions.value();
+        final Path inputPath = request.getInputPath();
+
+        return switch (request.action) {
+            case ConvertCrawlData -> {
+                var crawlData = fileStorageService.getStorage(request.crawlStorage);
+                var processData = fileStorageService.getStorage(request.processedDataStorage);
+
+                var plan = new CrawlPlan(null,
+                        new WorkDir(crawlData.asPath().toString(), "crawler.log"),
+                        new WorkDir(processData.asPath().toString(), "processor.log")
+                );
+
+                yield new ConvertCrawlDataAction(plan, instructions);
+            }
+            case SideloadEncyclopedia -> {
+                var processData = fileStorageService.getStorage(request.processedDataStorage);
+
+                yield new SideloadAction(
+                        sideloadSourceFactory.sideloadEncyclopediaMarginaliaNu(inputPath, request.baseUrl),
+                        processData.asPath(),
+                        instructions);
+            }
+            case SideloadDirtree -> {
+                var processData = fileStorageService.getStorage(request.processedDataStorage);
+
+                yield new SideloadAction(
+                        sideloadSourceFactory.sideloadDirtree(inputPath),
+                        processData.asPath(),
+                        instructions);
+            }
+            case SideloadWarc -> {
+                var processData = fileStorageService.getStorage(request.processedDataStorage);
+
+                yield new SideloadAction(
+                        sideloadSourceFactory.sideloadWarc(inputPath),
+                        processData.asPath(),
+                        instructions);
+            }
+            case SideloadReddit -> {
+                var processData = fileStorageService.getStorage(request.processedDataStorage);
+
+                yield new SideloadAction(
+                        sideloadSourceFactory.sideloadReddit(inputPath),
+                        processData.asPath(),
+                        instructions);
+            }
+            case SideloadStackexchange -> {
+                var processData = fileStorageService.getStorage(request.processedDataStorage);
+
+                yield new SideloadAction(
+                        sideloadSourceFactory.sideloadStackexchange(inputPath),
+                        processData.asPath(),
+                        instructions);
+            }
+        };
+    }
+
     @Inject
     public ConverterMain(
             DomainProcessor processor,
@@ -94,13 +149,12 @@ public class ConverterMain extends ProcessMainClass {
             ProcessConfiguration processConfiguration
     )
     {
+        super(messageQueueFactory, processConfiguration, gson, CONVERTER_INBOX);
+
         this.processor = processor;
-        this.gson = gson;
         this.heartbeat = heartbeat;
-        this.messageQueueFactory = messageQueueFactory;
         this.fileStorageService = fileStorageService;
         this.sideloadSourceFactory = sideloadSourceFactory;
-        this.node = processConfiguration.node();
 
         heartbeat.start();
     }
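The generic Instructions envelope that fetchInstructions(Class<T>) now returns is not part of this diff; judging by its use in createAction and the Action classes below, it bundles the deserialized payload with ok/err acknowledgement of the originating queue message. A purely hypothetical sketch of its shape:

    // HYPOTHETICAL: not shown in this changeset; reconstructed from usage only.
    public class Instructions<T> {
        private final T value;                  // deserialized request payload
        private final MqSingleShotInbox inbox;  // assumed transport details
        private final MqMessage message;

        Instructions(T value, MqSingleShotInbox inbox, MqMessage message) {
            this.value = value;
            this.inbox = inbox;
            this.message = message;
        }

        public T value() { return value; }
        public void ok()  { inbox.sendResponse(message, MqInboxResponse.ok()); }
        public void err() { inbox.sendResponse(message, MqInboxResponse.err()); }
    }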
@@ -220,45 +274,44 @@ public class ConverterMain extends ProcessMainClass {
         }
     }

-    private abstract static class ConvertRequest {
-        private final MqMessage message;
-        private final MqSingleShotInbox inbox;
+    private abstract static class Action {
+        final Instructions<?> instructions;

-        private ConvertRequest(MqMessage message, MqSingleShotInbox inbox) {
-            this.message = message;
-            this.inbox = inbox;
+        public Action(Instructions<?> instructions) {
+            this.instructions = instructions;
         }

         public abstract void execute(ConverterMain converterMain) throws Exception;

         public void ok() {
-            inbox.sendResponse(message, MqInboxResponse.ok());
+            instructions.ok();
         }
         public void err() {
-            inbox.sendResponse(message, MqInboxResponse.err());
+            instructions.err();
         }
     }

-    private static class SideloadAction extends ConvertRequest {
+    private static class SideloadAction extends Action {

         private final Collection<? extends SideloadSource> sideloadSources;
         private final Path workDir;

         SideloadAction(SideloadSource sideloadSource,
                        Path workDir,
-                       MqMessage message, MqSingleShotInbox inbox) {
-            super(message, inbox);
+                       Instructions<?> instructions) {
+            super(instructions);
             this.sideloadSources = List.of(sideloadSource);
             this.workDir = workDir;
         }

         SideloadAction(Collection<? extends SideloadSource> sideloadSources,
                        Path workDir,
-                       MqMessage message, MqSingleShotInbox inbox) {
-            super(message, inbox);
+                       Instructions<?> instructions) {
+            super(instructions);
             this.sideloadSources = sideloadSources;
             this.workDir = workDir;
         }

         @Override
         public void execute(ConverterMain converterMain) throws Exception {
             try {
@@ -272,11 +325,12 @@ public class ConverterMain extends ProcessMainClass {
             }
         }

-    private static class ConvertCrawlDataAction extends ConvertRequest {
+    private static class ConvertCrawlDataAction extends Action {
         private final CrawlPlan plan;

-        private ConvertCrawlDataAction(CrawlPlan plan, MqMessage message, MqSingleShotInbox inbox) {
-            super(message, inbox);
+        private ConvertCrawlDataAction(CrawlPlan plan,
+                                       Instructions<?> instructions) {
+            super(instructions);
             this.plan = plan;
         }

@@ -294,94 +348,4 @@ public class ConverterMain extends ProcessMainClass {
         }
     }

-
-    private ConvertRequest fetchInstructions() throws Exception {
-
-        var inbox = messageQueueFactory.createSingleShotInbox(CONVERTER_INBOX, node, UUID.randomUUID());
-
-        var msgOpt = getMessage(inbox, nu.marginalia.mqapi.converting.ConvertRequest.class.getSimpleName());
-        var msg = msgOpt.orElseThrow(() -> new RuntimeException("No message received"));
-
-        try {
-            var request = gson.fromJson(msg.payload(), nu.marginalia.mqapi.converting.ConvertRequest.class);
-
-            // will be null on ConvertCrawlData
-            final Path inputPath = request.getInputPath();
-
-            return switch (request.action) {
-                case ConvertCrawlData -> {
-                    var crawlData = fileStorageService.getStorage(request.crawlStorage);
-                    var processData = fileStorageService.getStorage(request.processedDataStorage);
-
-                    var plan = new CrawlPlan(null,
-                            new WorkDir(crawlData.asPath().toString(), "crawler.log"),
-                            new WorkDir(processData.asPath().toString(), "processor.log")
-                    );
-
-                    yield new ConvertCrawlDataAction(plan, msg, inbox);
-                }
-                case SideloadEncyclopedia -> {
-                    var processData = fileStorageService.getStorage(request.processedDataStorage);
-
-                    yield new SideloadAction(
-                            sideloadSourceFactory.sideloadEncyclopediaMarginaliaNu(inputPath, request.baseUrl),
-                            processData.asPath(),
-                            msg, inbox);
-                }
-                case SideloadDirtree -> {
-                    var processData = fileStorageService.getStorage(request.processedDataStorage);
-
-                    yield new SideloadAction(
-                            sideloadSourceFactory.sideloadDirtree(inputPath),
-                            processData.asPath(),
-                            msg, inbox);
-                }
-                case SideloadWarc -> {
-                    var processData = fileStorageService.getStorage(request.processedDataStorage);
-
-                    yield new SideloadAction(
-                            sideloadSourceFactory.sideloadWarc(inputPath),
-                            processData.asPath(),
-                            msg, inbox);
-                }
-                case SideloadReddit -> {
-                    var processData = fileStorageService.getStorage(request.processedDataStorage);
-
-                    yield new SideloadAction(
-                            sideloadSourceFactory.sideloadReddit(inputPath),
-                            processData.asPath(),
-                            msg, inbox);
-                }
-                case SideloadStackexchange -> {
-                    var processData = fileStorageService.getStorage(request.processedDataStorage);
-
-                    yield new SideloadAction(
-                            sideloadSourceFactory.sideloadStackexchange(inputPath),
-                            processData.asPath(),
-                            msg, inbox);
-                }
-            };
-        }
-        catch (Exception ex) {
-            inbox.sendResponse(msg, MqInboxResponse.err(ex.getClass().getSimpleName() + ": " + ex.getMessage()));
-
-            throw ex;
-        }
-    }
-
-    private Optional<MqMessage> getMessage(MqSingleShotInbox inbox, String expectedFunction) throws SQLException, InterruptedException {
-        var opt = inbox.waitForMessage(30, TimeUnit.SECONDS);
-        if (opt.isPresent()) {
-            if (!opt.get().function().equals(expectedFunction)) {
-                throw new RuntimeException("Unexpected function: " + opt.get().function());
-            }
-            return opt;
-        }
-        else {
-            var stolenMessage = inbox.stealMessage(msg -> msg.function().equals(expectedFunction));
-            stolenMessage.ifPresent(mqMessage -> logger.info("Stole message {}", mqMessage));
-            return stolenMessage;
-        }
-    }
-
 }
@@ -1,7 +1,6 @@
 package nu.marginalia.converting.processor;

 import com.google.inject.Inject;
-import nu.marginalia.atags.AnchorTextKeywords;
 import nu.marginalia.atags.model.DomainLinks;
 import nu.marginalia.atags.source.AnchorTagsSource;
 import nu.marginalia.atags.source.AnchorTagsSourceFactory;
@@ -37,7 +36,6 @@ public class DomainProcessor {
     private final DocumentProcessor documentProcessor;
     private final SiteWords siteWords;
     private final AnchorTagsSource anchorTagsSource;
-    private final AnchorTextKeywords anchorTextKeywords;
     private final GeoIpDictionary geoIpDictionary;

     private final Logger logger = LoggerFactory.getLogger(getClass());
@@ -46,12 +44,10 @@ public class DomainProcessor {
     public DomainProcessor(DocumentProcessor documentProcessor,
                            SiteWords siteWords,
                            AnchorTagsSourceFactory anchorTagsSourceFactory,
-                           AnchorTextKeywords anchorTextKeywords,
                            GeoIpDictionary geoIpDictionary) throws SQLException
     {
         this.documentProcessor = documentProcessor;
         this.siteWords = siteWords;
-        this.anchorTextKeywords = anchorTextKeywords;
         this.anchorTagsSource = anchorTagsSourceFactory.create();
         this.geoIpDictionary = geoIpDictionary;

@@ -51,10 +51,6 @@ public class SideloaderProcessing {
                 "NP",
                 "",
                 body,
-                Integer.toHexString(url.hashCode()),
-                url,
-                "",
-                "SIDELOAD",
                 false,
                 null,
                 null
@@ -27,6 +27,8 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter
     private final SlopDomainLinkRecord.Writer domainLinkWriter;
     private final SlopDocumentRecord.Writer documentWriter;

+    private int ordinalOffset = 0;
+
     private static final Logger logger = LoggerFactory.getLogger(ConverterBatchWriter.class);

     public ConverterBatchWriter(Path basePath, int batchNumber) throws IOException {
@@ -46,6 +48,11 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter
         documentWriter = new SlopDocumentRecord.Writer(ProcessedDataFileNames.documentFileName(basePath), batchNumber);
     }

+    /** Sets the lowest ordinal value for the documents in this batch */
+    public void setOrdinalOffset(int ordinalOffset) {
+        this.ordinalOffset = ordinalOffset;
+    }
+
     @Override
     public void write(ConverterBatchWritableIf writable) throws IOException {
         writable.write(this);
@@ -79,7 +86,7 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter
             throws IOException
     {

-        int ordinal = 0;
+        int ordinal = ordinalOffset;

         String domainName = domain.toString();

@@ -147,10 +147,6 @@ public class ConvertingIntegrationTest {
                 "",
                 "",
                 readClassPathFile(p.toString()),
-                Double.toString(Math.random()),
-                "https://memex.marginalia.nu/" + file,
-                null,
-                "",
                 false,
                 null,
                 null
@@ -3,9 +3,9 @@ package nu.marginalia.converting;
 import com.google.inject.AbstractModule;
 import com.google.inject.name.Names;
 import nu.marginalia.LanguageModels;
-import nu.marginalia.ProcessConfiguration;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.converting.processor.ConverterDomainTypes;
+import nu.marginalia.process.ProcessConfiguration;
 import nu.marginalia.service.module.ServiceConfiguration;
 import org.mockito.Mockito;

@@ -2,10 +2,10 @@ package nu.marginalia.converting.sideload.reddit;

 import com.google.inject.AbstractModule;
 import com.google.inject.Guice;
-import nu.marginalia.ProcessConfiguration;
 import nu.marginalia.converting.ConverterModule;
 import nu.marginalia.converting.processor.ConverterDomainTypes;
 import nu.marginalia.converting.sideload.SideloadSourceFactory;
+import nu.marginalia.process.ProcessConfiguration;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Tag;
 import org.junit.jupiter.api.Test;
@@ -21,7 +21,6 @@ tasks.distZip.enabled = false
 apply from: "$rootProject.projectDir/srcsets.gradle"

 dependencies {
-    implementation project(':code:common:process')

     implementation project(':code:common:db')
     implementation project(':code:common:model')
@@ -5,8 +5,6 @@ import com.google.inject.Guice;
 import com.google.inject.Inject;
 import com.google.inject.Injector;
 import com.zaxxer.hikari.HikariDataSource;
-import nu.marginalia.ProcessConfiguration;
-import nu.marginalia.ProcessConfigurationModule;
 import nu.marginalia.UserAgent;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.atags.model.DomainLinks;
@@ -25,15 +23,15 @@ import nu.marginalia.io.CrawledDomainReader;
 import nu.marginalia.io.CrawlerOutputFile;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.mq.MessageQueueFactory;
-import nu.marginalia.mq.MqMessage;
-import nu.marginalia.mq.inbox.MqInboxResponse;
-import nu.marginalia.mq.inbox.MqSingleShotInbox;
 import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
+import nu.marginalia.process.ProcessConfiguration;
+import nu.marginalia.process.ProcessConfigurationModule;
+import nu.marginalia.process.ProcessMainClass;
 import nu.marginalia.process.control.ProcessHeartbeatImpl;
 import nu.marginalia.process.log.WorkLog;
-import nu.marginalia.service.ProcessMainClass;
 import nu.marginalia.service.module.DatabaseModule;
 import nu.marginalia.storage.FileStorageService;
+import nu.marginalia.storage.model.FileStorageId;
 import nu.marginalia.util.SimpleBlockingThreadPool;
 import okhttp3.ConnectionPool;
 import okhttp3.Dispatcher;
@@ -46,8 +44,10 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.StandardCopyOption;
 import java.security.Security;
-import java.sql.SQLException;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
@@ -59,14 +59,12 @@ public class CrawlerMain extends ProcessMainClass {

     private final UserAgent userAgent;
     private final ProcessHeartbeatImpl heartbeat;
-    private final MessageQueueFactory messageQueueFactory;
     private final DomainProber domainProber;
     private final FileStorageService fileStorageService;
     private final AnchorTagsSourceFactory anchorTagsSourceFactory;
     private final WarcArchiverFactory warcArchiverFactory;
     private final HikariDataSource dataSource;
     private final DomainBlacklist blacklist;
-    private final Gson gson;
     private final int node;
     private final SimpleBlockingThreadPool pool;

@@ -96,16 +94,17 @@ public class CrawlerMain extends ProcessMainClass {
                        HikariDataSource dataSource,
                        DomainBlacklist blacklist,
                        Gson gson) throws InterruptedException {

+        super(messageQueueFactory, processConfiguration, gson, CRAWLER_INBOX);
+
         this.userAgent = userAgent;
         this.heartbeat = heartbeat;
-        this.messageQueueFactory = messageQueueFactory;
         this.domainProber = domainProber;
         this.fileStorageService = fileStorageService;
         this.anchorTagsSourceFactory = anchorTagsSourceFactory;
         this.warcArchiverFactory = warcArchiverFactory;
         this.dataSource = dataSource;
         this.blacklist = blacklist;
-        this.gson = gson;
         this.node = processConfiguration.node();

         pool = new SimpleBlockingThreadPool("CrawlerPool",
@@ -144,13 +143,14 @@ public class CrawlerMain extends ProcessMainClass {
         );
         var crawler = injector.getInstance(CrawlerMain.class);

-        var instructions = crawler.fetchInstructions();
+        var instructions = crawler.fetchInstructions(nu.marginalia.mqapi.crawling.CrawlRequest.class);
         try {
-            if (instructions.targetDomainName != null) {
-                crawler.runForSingleDomain(instructions.targetDomainName, instructions.outputDir);
+            var req = instructions.value();
+            if (req.targetDomainName != null) {
+                crawler.runForSingleDomain(req.targetDomainName, req.crawlStorage);
             }
             else {
-                crawler.runForDatabaseDomains(instructions.outputDir);
+                crawler.runForDatabaseDomains(req.crawlStorage);
             }
             instructions.ok();
         } catch (Exception ex) {
@@ -166,6 +166,10 @@ public class CrawlerMain extends ProcessMainClass {
         System.exit(0);
     }

+    public void runForDatabaseDomains(FileStorageId fileStorageId) throws Exception {
+        runForDatabaseDomains(fileStorageService.getStorage(fileStorageId).asPath());
+    }
+
     public void runForDatabaseDomains(Path outputDir) throws Exception {

         heartbeat.start();
@@ -285,6 +289,11 @@ public class CrawlerMain extends ProcessMainClass {
         }
     }

+
+    public void runForSingleDomain(String targetDomainName, FileStorageId fileStorageId) throws Exception {
+        runForSingleDomain(targetDomainName, fileStorageService.getStorage(fileStorageId).asPath());
+    }
+
     public void runForSingleDomain(String targetDomainName, Path outputDir) throws Exception {

         heartbeat.start();
@@ -410,70 +419,6 @@ public class CrawlerMain extends ProcessMainClass {

     }

-
-    private static class CrawlRequest {
-        private final Path outputDir;
-        private final MqMessage message;
-        private final MqSingleShotInbox inbox;
-
-        private final String targetDomainName;
-
-        CrawlRequest(String targetDomainName,
-                     Path outputDir,
-                     MqMessage message,
-                     MqSingleShotInbox inbox)
-        {
-            this.message = message;
-            this.inbox = inbox;
-            this.outputDir = outputDir;
-            this.targetDomainName = targetDomainName;
-        }
-
-
-        public void ok() {
-            inbox.sendResponse(message, MqInboxResponse.ok());
-        }
-        public void err() {
-            inbox.sendResponse(message, MqInboxResponse.err());
-        }
-
-    }
-
-    private CrawlRequest fetchInstructions() throws Exception {
-
-        var inbox = messageQueueFactory.createSingleShotInbox(CRAWLER_INBOX, node, UUID.randomUUID());
-
-        logger.info("Waiting for instructions");
-
-        var msgOpt = getMessage(inbox, nu.marginalia.mqapi.crawling.CrawlRequest.class.getSimpleName());
-        var msg = msgOpt.orElseThrow(() -> new RuntimeException("No message received"));
-
-        var request = gson.fromJson(msg.payload(), nu.marginalia.mqapi.crawling.CrawlRequest.class);
-        var crawlStorage = fileStorageService.getStorage(request.crawlStorage);
-
-        return new CrawlRequest(
-                request.targetDomainName,
-                crawlStorage.asPath(),
-                msg,
-                inbox);
-    }
-
-    private Optional<MqMessage> getMessage(MqSingleShotInbox inbox, String expectedFunction) throws SQLException, InterruptedException {
-        var opt = inbox.waitForMessage(30, TimeUnit.SECONDS);
-        if (opt.isPresent()) {
-            if (!opt.get().function().equals(expectedFunction)) {
-                throw new RuntimeException("Unexpected function: " + opt.get().function());
-            }
-            return opt;
-        }
-        else {
-            var stolenMessage = inbox.stealMessage(msg -> msg.function().equals(expectedFunction));
-            stolenMessage.ifPresent(mqMessage -> logger.info("Stole message {}", mqMessage));
-            return stolenMessage;
-        }
-    }
-
     public record CrawlSpecRecord(@NotNull String domain, int crawlDepth, @NotNull List<String> urls) {

         public CrawlSpecRecord(String domain, int crawlDepth) {
@@ -19,8 +19,13 @@ public class CrawlDelayTimer {
     private final long delayTime;

     public CrawlDelayTimer(long delayTime) {
+        if (delayTime <= 0) {
+            this.delayTime = DEFAULT_CRAWL_DELAY_MIN_MS;
+        }
+        else {
             this.delayTime = delayTime;
         }
+    }

     /** Call when we've gotten an HTTP 429 response. This will wait a moment, and then
      * set a flag that slows down the main crawl delay as well. */
@@ -41,6 +46,10 @@ public class CrawlDelayTimer {
         Thread.sleep(delay.toMillis());
     }

+    public void waitFetchDelay() {
+        waitFetchDelay(0);
+    }
+
     public void waitFetchDelay(long spentTime) {
         long sleepTime = delayTime;

@@ -1,8 +1,8 @@
 package nu.marginalia.crawl.warc;

 import com.google.inject.Inject;
-import nu.marginalia.ProcessConfiguration;
 import nu.marginalia.nodecfg.NodeConfigurationService;
+import nu.marginalia.process.ProcessConfiguration;
 import org.apache.commons.io.IOUtils;

 import java.io.IOException;
@@ -20,7 +20,6 @@ dependencies {
     implementation project(':code:common:model')
     implementation project(':code:common:db')
     implementation project(':code:common:config')
-    implementation project(':code:common:process')
     implementation project(':code:index:api')
     implementation project(':code:processes:crawling-process:ft-content-type')
     implementation project(':code:libraries:language-processing')
@@ -148,10 +148,7 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
                 "",
                 nextRecord.headers,
                 bodyString,
-                Long.toHexString(hash.hashNearlyASCII(bodyString)), // this field isn't actually used, maybe we can skip calculating it?
-                nextRecord.url,
-                null,
-                "",
+                // this field isn't actually used, maybe we can skip calculating it?
                 nextRecord.cookies,
                 lastModified,
                 etag));
@@ -4,7 +4,7 @@ import nu.marginalia.model.EdgeUrl;
 import org.apache.commons.lang3.StringUtils;
 import org.jetbrains.annotations.Nullable;

-public class CrawledDocument implements SerializableCrawlData {
+public final class CrawledDocument implements SerializableCrawlData {
     public String crawlId;

     public String url;
@@ -21,16 +21,6 @@ public class CrawledDocument implements SerializableCrawlData {

     public String documentBody;

-    @Deprecated
-    public String documentBodyHash;
-
-    @Deprecated
-    public String canonicalUrl;
-    public String redirectUrl;
-
-    @Deprecated
-    public String recrawlState;

     /**
      * This is not guaranteed to be set in all versions of the format,
      * information may come in CrawledDomain instead
@@ -40,7 +30,7 @@ public class CrawledDocument implements SerializableCrawlData {
     public String lastModifiedMaybe;
     public String etagMaybe;

-    public CrawledDocument(String crawlId, String url, String contentType, String timestamp, int httpStatus, String crawlerStatus, String crawlerStatusDesc, @Nullable String headers, String documentBody, String documentBodyHash, String canonicalUrl, String redirectUrl, String recrawlState, Boolean hasCookies, String lastModifiedMaybe, String etagMaybe) {
+    public CrawledDocument(String crawlId, String url, String contentType, String timestamp, int httpStatus, String crawlerStatus, String crawlerStatusDesc, @Nullable String headers, String documentBody, Boolean hasCookies, String lastModifiedMaybe, String etagMaybe) {
         this.crawlId = crawlId;
         this.url = url;
         this.contentType = contentType;
@@ -50,10 +40,6 @@ public class CrawledDocument implements SerializableCrawlData {
         this.crawlerStatusDesc = crawlerStatusDesc;
         this.headers = headers;
         this.documentBody = documentBody;
-        this.documentBodyHash = documentBodyHash;
-        this.canonicalUrl = canonicalUrl;
-        this.redirectUrl = redirectUrl;
-        this.recrawlState = recrawlState;
         this.hasCookies = hasCookies;
         this.lastModifiedMaybe = lastModifiedMaybe;
         this.etagMaybe = etagMaybe;
@@ -120,7 +106,7 @@ public class CrawledDocument implements SerializableCrawlData {
     }

     public String toString() {
-        return "CrawledDocument(crawlId=" + this.crawlId + ", url=" + this.url + ", contentType=" + this.contentType + ", timestamp=" + this.timestamp + ", httpStatus=" + this.httpStatus + ", crawlerStatus=" + this.crawlerStatus + ", crawlerStatusDesc=" + this.crawlerStatusDesc + ", headers=" + this.headers + ", documentBody=" + this.documentBody + ", documentBodyHash=" + this.documentBodyHash + ", canonicalUrl=" + this.canonicalUrl + ", redirectUrl=" + this.redirectUrl + ", recrawlState=" + this.recrawlState + ", hasCookies=" + this.hasCookies + ", lastModifiedMaybe=" + this.lastModifiedMaybe + ", etagMaybe=" + this.etagMaybe + ")";
+        return "CrawledDocument(crawlId=" + this.crawlId + ", url=" + this.url + ", contentType=" + this.contentType + ", timestamp=" + this.timestamp + ", httpStatus=" + this.httpStatus + ", crawlerStatus=" + this.crawlerStatus + ", crawlerStatusDesc=" + this.crawlerStatusDesc + ", headers=" + this.headers + ", documentBody=" + this.documentBody + ", hasCookies=" + this.hasCookies + ", lastModifiedMaybe=" + this.lastModifiedMaybe + ", etagMaybe=" + this.etagMaybe + ")";
     }

     public static class CrawledDocumentBuilder {
@@ -133,9 +119,6 @@ public class CrawledDocument implements SerializableCrawlData {
         private String crawlerStatusDesc;
         private @Nullable String headers;
         private String documentBody;
-        private String documentBodyHash;
-        private String canonicalUrl;
-        private String redirectUrl;
         private String recrawlState;
         private Boolean hasCookies;
         private String lastModifiedMaybe;
@@ -189,23 +172,6 @@ public class CrawledDocument implements SerializableCrawlData {
             return this;
         }

-        @Deprecated
-        public CrawledDocumentBuilder documentBodyHash(String documentBodyHash) {
-            this.documentBodyHash = documentBodyHash;
-            return this;
-        }
-
-        @Deprecated
-        public CrawledDocumentBuilder canonicalUrl(String canonicalUrl) {
-            this.canonicalUrl = canonicalUrl;
-            return this;
-        }
-
-        public CrawledDocumentBuilder redirectUrl(String redirectUrl) {
-            this.redirectUrl = redirectUrl;
-            return this;
-        }
-
         @Deprecated
         public CrawledDocumentBuilder recrawlState(String recrawlState) {
             this.recrawlState = recrawlState;
@@ -228,11 +194,11 @@ public class CrawledDocument implements SerializableCrawlData {
         }

         public CrawledDocument build() {
-            return new CrawledDocument(this.crawlId, this.url, this.contentType, this.timestamp, this.httpStatus, this.crawlerStatus, this.crawlerStatusDesc, this.headers, this.documentBody, this.documentBodyHash, this.canonicalUrl, this.redirectUrl, this.recrawlState, this.hasCookies, this.lastModifiedMaybe, this.etagMaybe);
+            return new CrawledDocument(this.crawlId, this.url, this.contentType, this.timestamp, this.httpStatus, this.crawlerStatus, this.crawlerStatusDesc, this.headers, this.documentBody, this.hasCookies, this.lastModifiedMaybe, this.etagMaybe);
         }

         public String toString() {
-            return "CrawledDocument.CrawledDocumentBuilder(crawlId=" + this.crawlId + ", url=" + this.url + ", contentType=" + this.contentType + ", timestamp=" + this.timestamp + ", httpStatus=" + this.httpStatus + ", crawlerStatus=" + this.crawlerStatus + ", crawlerStatusDesc=" + this.crawlerStatusDesc + ", headers=" + this.headers + ", documentBody=" + this.documentBody + ", documentBodyHash=" + this.documentBodyHash + ", canonicalUrl=" + this.canonicalUrl + ", redirectUrl=" + this.redirectUrl + ", recrawlState=" + this.recrawlState + ", hasCookies=" + this.hasCookies + ", lastModifiedMaybe=" + this.lastModifiedMaybe + ", etagMaybe=" + this.etagMaybe + ")";
+            return "CrawledDocument.CrawledDocumentBuilder(crawlId=" + this.crawlId + ", url=" + this.url + ", contentType=" + this.contentType + ", timestamp=" + this.timestamp + ", httpStatus=" + this.httpStatus + ", crawlerStatus=" + this.crawlerStatus + ", crawlerStatusDesc=" + this.crawlerStatusDesc + ", headers=" + this.headers + ", documentBody=" + this.documentBody + ", recrawlState=" + this.recrawlState + ", hasCookies=" + this.hasCookies + ", lastModifiedMaybe=" + this.lastModifiedMaybe + ", etagMaybe=" + this.etagMaybe + ")";
         }
     }
 }
@@ -1,8 +1,9 @@
 package nu.marginalia.model.crawldata;

 import java.util.List;
+import java.util.Objects;

-public class CrawledDomain implements SerializableCrawlData {
+public final class CrawledDomain implements SerializableCrawlData {
     public String domain;

     public String redirectDomain;
@@ -11,6 +12,7 @@ public class CrawledDomain implements SerializableCrawlData {
     public String crawlerStatusDesc;
     public String ip;

+    @Deprecated // This used to be populated, but is no longer
     public List<CrawledDocument> doc;

     /**
@@ -29,15 +31,6 @@ public class CrawledDomain implements SerializableCrawlData {
         this.cookies = cookies;
     }

-    public static CrawledDomainBuilder builder() {
-        return new CrawledDomainBuilder();
-    }
-
-    public int size() {
-        if (doc == null) return 0;
-        return doc.size();
-    }
-
     public String getDomain() {
         return this.domain;
     }
@@ -94,119 +87,26 @@ public class CrawledDomain implements SerializableCrawlData {
         this.cookies = cookies;
     }

-    public boolean equals(final Object o) {
-        if (o == this) return true;
-        if (!(o instanceof CrawledDomain)) return false;
-        final CrawledDomain other = (CrawledDomain) o;
-        if (!other.canEqual((Object) this)) return false;
-        final Object this$domain = this.getDomain();
-        final Object other$domain = other.getDomain();
-        if (this$domain == null ? other$domain != null : !this$domain.equals(other$domain)) return false;
-        final Object this$redirectDomain = this.getRedirectDomain();
-        final Object other$redirectDomain = other.getRedirectDomain();
-        if (this$redirectDomain == null ? other$redirectDomain != null : !this$redirectDomain.equals(other$redirectDomain))
-            return false;
-        final Object this$crawlerStatus = this.getCrawlerStatus();
-        final Object other$crawlerStatus = other.getCrawlerStatus();
-        if (this$crawlerStatus == null ? other$crawlerStatus != null : !this$crawlerStatus.equals(other$crawlerStatus))
-            return false;
-        final Object this$crawlerStatusDesc = this.getCrawlerStatusDesc();
-        final Object other$crawlerStatusDesc = other.getCrawlerStatusDesc();
-        if (this$crawlerStatusDesc == null ? other$crawlerStatusDesc != null : !this$crawlerStatusDesc.equals(other$crawlerStatusDesc))
-            return false;
-        final Object this$ip = this.getIp();
-        final Object other$ip = other.getIp();
-        if (this$ip == null ? other$ip != null : !this$ip.equals(other$ip)) return false;
-        final Object this$doc = this.getDoc();
-        final Object other$doc = other.getDoc();
-        if (this$doc == null ? other$doc != null : !this$doc.equals(other$doc)) return false;
-        final Object this$cookies = this.getCookies();
-        final Object other$cookies = other.getCookies();
-        if (this$cookies == null ? other$cookies != null : !this$cookies.equals(other$cookies)) return false;
-        return true;
-    }
-
-    protected boolean canEqual(final Object other) {
-        return other instanceof CrawledDomain;
+    @Override
+    public boolean equals(Object o) {
+        if (!(o instanceof CrawledDomain that)) return false;
+        return Objects.equals(domain, that.domain) && Objects.equals(redirectDomain, that.redirectDomain) && Objects.equals(crawlerStatus, that.crawlerStatus) && Objects.equals(crawlerStatusDesc, that.crawlerStatusDesc) && Objects.equals(ip, that.ip) && Objects.equals(doc, that.doc) && Objects.equals(cookies, that.cookies);
     }

+    @Override
     public int hashCode() {
-        final int PRIME = 59;
-        int result = 1;
-        final Object $domain = this.getDomain();
-        result = result * PRIME + ($domain == null ? 43 : $domain.hashCode());
-        final Object $redirectDomain = this.getRedirectDomain();
-        result = result * PRIME + ($redirectDomain == null ? 43 : $redirectDomain.hashCode());
-        final Object $crawlerStatus = this.getCrawlerStatus();
-        result = result * PRIME + ($crawlerStatus == null ? 43 : $crawlerStatus.hashCode());
-        final Object $crawlerStatusDesc = this.getCrawlerStatusDesc();
-        result = result * PRIME + ($crawlerStatusDesc == null ? 43 : $crawlerStatusDesc.hashCode());
-        final Object $ip = this.getIp();
-        result = result * PRIME + ($ip == null ? 43 : $ip.hashCode());
-        final Object $doc = this.getDoc();
-        result = result * PRIME + ($doc == null ? 43 : $doc.hashCode());
-        final Object $cookies = this.getCookies();
-        result = result * PRIME + ($cookies == null ? 43 : $cookies.hashCode());
+        int result = Objects.hashCode(domain);
+        result = 31 * result + Objects.hashCode(redirectDomain);
+        result = 31 * result + Objects.hashCode(crawlerStatus);
+        result = 31 * result + Objects.hashCode(crawlerStatusDesc);
+        result = 31 * result + Objects.hashCode(ip);
+        result = 31 * result + Objects.hashCode(doc);
+        result = 31 * result + Objects.hashCode(cookies);
         return result;
     }

     public String toString() {
         return "CrawledDomain(domain=" + this.getDomain() + ", redirectDomain=" + this.getRedirectDomain() + ", crawlerStatus=" + this.getCrawlerStatus() + ", crawlerStatusDesc=" + this.getCrawlerStatusDesc() + ", ip=" + this.getIp() + ", doc=" + this.getDoc() + ", cookies=" + this.getCookies() + ")";
     }

-    public static class CrawledDomainBuilder {
-        private String domain;
-        private String redirectDomain;
-        private String crawlerStatus;
-        private String crawlerStatusDesc;
-        private String ip;
-        private List<CrawledDocument> doc;
-        private List<String> cookies;
-
-        CrawledDomainBuilder() {
-        }
-
-        public CrawledDomainBuilder domain(String domain) {
-            this.domain = domain;
-            return this;
-        }
-
-        public CrawledDomainBuilder redirectDomain(String redirectDomain) {
-            this.redirectDomain = redirectDomain;
-            return this;
-        }
-
-        public CrawledDomainBuilder crawlerStatus(String crawlerStatus) {
-            this.crawlerStatus = crawlerStatus;
-            return this;
-        }
-
-        public CrawledDomainBuilder crawlerStatusDesc(String crawlerStatusDesc) {
-            this.crawlerStatusDesc = crawlerStatusDesc;
-            return this;
-        }
-
-        public CrawledDomainBuilder ip(String ip) {
-            this.ip = ip;
-            return this;
-        }
-
-        public CrawledDomainBuilder doc(List<CrawledDocument> doc) {
-            this.doc = doc;
-            return this;
-        }
-
-        public CrawledDomainBuilder cookies(List<String> cookies) {
-            this.cookies = cookies;
-            return this;
-        }
-
-        public CrawledDomain build() {
-            return new CrawledDomain(this.domain, this.redirectDomain, this.crawlerStatus, this.crawlerStatusDesc, this.ip, this.doc, this.cookies);
-        }
-
-        public String toString() {
-            return "CrawledDomain.CrawledDomainBuilder(domain=" + this.domain + ", redirectDomain=" + this.redirectDomain + ", crawlerStatus=" + this.crawlerStatus + ", crawlerStatusDesc=" + this.crawlerStatusDesc + ", ip=" + this.ip + ", doc=" + this.doc + ", cookies=" + this.cookies + ")";
-        }
-    }
 }
@@ -1,5 +1,5 @@
 package nu.marginalia.model.crawldata;

-public interface SerializableCrawlData {
+public sealed interface SerializableCrawlData permits CrawledDocument, CrawledDomain {
     String getDomain();
 }
@@ -1,24 +1,33 @@
 plugins {
     id 'java'

-    id "de.undercouch.download" version "5.1.0"
+    id 'application'

     id 'jvm-test-suite'
 }

 java {
     toolchain {
         languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
     }
 }

+application {
+    mainClass = 'nu.marginalia.task.ExportTaskMain'
+    applicationName = 'export-task-process'
+}
+
+tasks.distZip.enabled = false
+
 apply from: "$rootProject.projectDir/srcsets.gradle"

 dependencies {
-    implementation project(':code:common:config')
-    implementation project(':code:common:process')
     implementation project(':code:common:model')
+    implementation project(':code:common:db')
+    implementation project(':code:common:service')
+    implementation project(':code:common:config')
+    implementation project(':code:libraries:message-queue')
+
+    implementation project(':code:functions:link-graph:api')
+    implementation project(':code:processes:process-mq-api')
     implementation project(':code:libraries:language-processing')
     implementation project(':code:libraries:term-frequency-dict')
     implementation project(':code:libraries:blocking-thread-pool')
@@ -28,20 +37,32 @@ dependencies {
     implementation project(':code:processes:converting-process')
     implementation project(':third-party:commons-codec')


     implementation libs.bundles.slf4j

     implementation libs.guava
     implementation dependencies.create(libs.guice.get()) {
         exclude group: 'com.google.guava'
     }
+    implementation libs.roaringbitmap
     implementation libs.trove
+    implementation libs.fastutil
+    implementation libs.bundles.mariadb
+    implementation libs.gson
     implementation libs.commons.lang3
+    implementation libs.commons.io
     implementation libs.commons.compress
-    implementation libs.notnull
+    implementation libs.commons.codec
     implementation libs.jsoup



     testImplementation libs.bundles.slf4j.test
     testImplementation libs.bundles.junit
     testImplementation libs.mockito
-}
+
+    testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
+    testImplementation libs.commons.codec
+    testImplementation 'org.testcontainers:mariadb:1.17.4'
+    testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
+    testImplementation project(':code:libraries:test-helpers')
+}
@@ -0,0 +1,127 @@
+package nu.marginalia.adjacencies;
+
+import com.google.inject.Inject;
+import com.zaxxer.hikari.HikariDataSource;
+import nu.marginalia.api.linkgraph.AggregateLinkGraphClient;
+import nu.marginalia.process.ProcessConfiguration;
+import nu.marginalia.process.control.ProcessHeartbeatImpl;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+import java.util.concurrent.Executors;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.function.Consumer;
+import java.util.stream.IntStream;
+
+import static nu.marginalia.adjacencies.SparseBitVector.andCardinality;
+import static nu.marginalia.adjacencies.SparseBitVector.weightedProduct;
+
+public class WebsiteAdjacenciesCalculator {
+    private final AggregateLinkGraphClient domainLinksClient;
+    private final ProcessConfiguration configuration;
+    private final HikariDataSource dataSource;
+    public AdjacenciesData adjacenciesData;
+    public DomainAliases domainAliases;
+    private static final Logger logger = LoggerFactory.getLogger(WebsiteAdjacenciesCalculator.class);
+
+    float[] weights;
+
+    @Inject
+    public WebsiteAdjacenciesCalculator(AggregateLinkGraphClient domainLinksClient,
+                                        ProcessConfiguration configuration,
+                                        HikariDataSource dataSource) {
+        this.domainLinksClient = domainLinksClient;
+        this.configuration = configuration;
+        this.dataSource = dataSource;
+    }
+
+    public void export() throws Exception {
+        try (var processHeartbeat = new ProcessHeartbeatImpl(configuration, dataSource)) {
+            domainAliases = new DomainAliases(dataSource);
+            adjacenciesData = new AdjacenciesData(domainLinksClient, domainAliases);
+            weights = adjacenciesData.getWeights();
+
+            AdjacenciesLoader loader = new AdjacenciesLoader(dataSource);
+            var executor = Executors.newFixedThreadPool(16);
+
+            int total = adjacenciesData.getIdsList().size();
+            AtomicInteger progress = new AtomicInteger(0);
+            IntStream.of(adjacenciesData.getIdsList().toArray()).parallel()
+                    .filter(domainAliases::isNotAliased)
+                    .forEach(id -> {
+                        findAdjacent(id, loader::load);
+                        processHeartbeat.setProgress(progress.incrementAndGet() / (double) total);
+                    });
+
+            executor.shutdown();
+            System.out.println("Waiting for wrap-up");
+            loader.stop();
+        }
+    }
+
+    public void findAdjacent(int domainId, Consumer<DomainSimilarities> andThen) {
+        findAdjacentDtoS(domainId, andThen);
+    }
+
+    double cosineSimilarity(SparseBitVector a, SparseBitVector b) {
+        double andCardinality = andCardinality(a, b);
+        andCardinality /= Math.sqrt(a.getCardinality());
+        andCardinality /= Math.sqrt(b.getCardinality());
+        return andCardinality;
+    }
+
+    double expensiveCosineSimilarity(SparseBitVector a, SparseBitVector b) {
+        return weightedProduct(weights, a, b) / Math.sqrt(a.mulAndSum(weights) * b.mulAndSum(weights));
+    }
+
+    public record DomainSimilarities(int domainId, List<DomainSimilarity> similarities) {}
+
+    public record DomainSimilarity(int domainId, double value) {}
+
+    private void findAdjacentDtoS(int domainId, Consumer<DomainSimilarities> andThen) {
+        var vector = adjacenciesData.getVector(domainId);
+
+        if (vector == null || !vector.cardinalityExceeds(10)) {
+            return;
+        }
+
+        List<DomainSimilarity> similarities = new ArrayList<>(1000);
+
+        var items = adjacenciesData.getCandidates(vector);
+
+
+        int cardMin = Math.max(2, (int) (0.01 * vector.getCardinality()));
+
+        items.forEach(id -> {
+            var otherVec = adjacenciesData.getVector(id);
+
+            if (null == otherVec || otherVec == vector)
+                return true;
+
+            if (otherVec.getCardinality() < cardMin)
+                return true;
+
+            double similarity = cosineSimilarity(vector, otherVec);
+            if (similarity > 0.1) {
+                var recalculated = expensiveCosineSimilarity(vector, otherVec);
+                if (recalculated > 0.1) {
+                    similarities.add(new DomainSimilarity(id, recalculated));
+                }
+            }
+
+            return true;
+        });
+
+        if (similarities.size() > 128) {
+            similarities.sort(Comparator.comparing(DomainSimilarity::value));
+            similarities.subList(0, similarities.size() - 128).clear();
+        }
+
+
+        andThen.accept(new DomainSimilarities(domainId, similarities));
+    }
+
+}
@@ -0,0 +1,82 @@
+package nu.marginalia.task;
+
+import com.google.gson.Gson;
+import com.google.inject.Guice;
+import com.google.inject.Inject;
+import nu.marginalia.adjacencies.WebsiteAdjacenciesCalculator;
+import nu.marginalia.extractor.AtagExporter;
+import nu.marginalia.extractor.FeedExporter;
+import nu.marginalia.extractor.SampleDataExporter;
+import nu.marginalia.extractor.TermFrequencyExporter;
+import nu.marginalia.mq.MessageQueueFactory;
+import nu.marginalia.mqapi.ProcessInboxNames;
+import nu.marginalia.mqapi.tasks.ExportTaskRequest;
+import nu.marginalia.process.ProcessConfiguration;
+import nu.marginalia.process.ProcessConfigurationModule;
+import nu.marginalia.process.ProcessMainClass;
+import nu.marginalia.service.module.DatabaseModule;
+import nu.marginalia.service.module.ServiceDiscoveryModule;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class ExportTasksMain extends ProcessMainClass {
+
+    private static final Logger logger = LoggerFactory.getLogger(ExportTasksMain.class);
+
+    private final AtagExporter atagExporter;
+    private final FeedExporter feedExporter;
+    private final SampleDataExporter sampleDataExporter;
+    private final TermFrequencyExporter termFrequencyExporter;
+    private final WebsiteAdjacenciesCalculator websiteAdjacenciesCalculator;
+
+    public static void main(String[] args) throws Exception {
+
+        var injector = Guice.createInjector(
+                new ServiceDiscoveryModule(),
+                new ProcessConfigurationModule("export-tasks"),
+                new DatabaseModule(false)
+        );
+
+        var exportTasks = injector.getInstance(ExportTasksMain.class);
+
+        Instructions<ExportTaskRequest> instructions = exportTasks.fetchInstructions(ExportTaskRequest.class);
+        try {
+            exportTasks.run(instructions.value());
+            instructions.ok();
+        }
+        catch (Exception e) {
+            logger.error("Error running export task", e);
+            instructions.err();
+        }
+
+    }
+
+    @Inject
+    public ExportTasksMain(MessageQueueFactory messageQueueFactory,
+                           ProcessConfiguration config,
+                           AtagExporter atagExporter,
+                           FeedExporter feedExporter,
+                           SampleDataExporter sampleDataExporter,
+                           TermFrequencyExporter termFrequencyExporter,
+                           Gson gson, WebsiteAdjacenciesCalculator websiteAdjacenciesCalculator)
+    {
+        super(messageQueueFactory, config, gson, ProcessInboxNames.EXPORT_TASK_INBOX);
+        this.atagExporter = atagExporter;
+        this.feedExporter = feedExporter;
+        this.sampleDataExporter = sampleDataExporter;
+        this.termFrequencyExporter = termFrequencyExporter;
+        this.websiteAdjacenciesCalculator = websiteAdjacenciesCalculator;
+    }
+
+    private void run(ExportTaskRequest request) throws Exception {
+        switch (request.task) {
+            case ATAGS: atagExporter.export(request.crawlId, request.destId); break;
+            case FEEDS: feedExporter.export(request.crawlId, request.destId); break;
+            case TERM_FREQ: termFrequencyExporter.export(request.crawlId, request.destId); break;
+            case SAMPLE_DATA: sampleDataExporter.export(request.crawlId, request.destId, request.size, request.name); break;
+            case ADJACENCIES: websiteAdjacenciesCalculator.export(); break;
+        }
+    }
+
+
+}
@ -22,7 +22,6 @@ apply from: "$rootProject.projectDir/srcsets.gradle"

 dependencies {
     implementation project(':code:processes:process-mq-api')
-    implementation project(':code:common:process')
     implementation project(':code:common:service')
     implementation project(':code:common:db')
     implementation project(':code:common:config')
@ -1,11 +1,8 @@
 package nu.marginalia.index;

-import com.google.gson.Gson;
 import com.google.inject.Guice;
 import com.google.inject.Inject;
 import nu.marginalia.IndexLocations;
-import nu.marginalia.ProcessConfiguration;
-import nu.marginalia.ProcessConfigurationModule;
 import nu.marginalia.index.construction.full.FullIndexConstructor;
 import nu.marginalia.index.construction.prio.PrioIndexConstructor;
 import nu.marginalia.index.domainrankings.DomainRankings;
@ -15,13 +12,11 @@ import nu.marginalia.index.journal.IndexJournal;
 import nu.marginalia.model.gson.GsonFactory;
 import nu.marginalia.model.id.UrlIdCodec;
 import nu.marginalia.mq.MessageQueueFactory;
-import nu.marginalia.mq.MqMessage;
-import nu.marginalia.mq.inbox.MqInboxResponse;
-import nu.marginalia.mq.inbox.MqSingleShotInbox;
 import nu.marginalia.mqapi.index.CreateIndexRequest;
-import nu.marginalia.mqapi.index.IndexName;
+import nu.marginalia.process.ProcessConfiguration;
+import nu.marginalia.process.ProcessConfigurationModule;
+import nu.marginalia.process.ProcessMainClass;
 import nu.marginalia.process.control.ProcessHeartbeatImpl;
-import nu.marginalia.service.ProcessMainClass;
 import nu.marginalia.service.module.DatabaseModule;
 import nu.marginalia.storage.FileStorageService;
 import org.slf4j.Logger;
@ -31,8 +26,6 @@ import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.sql.SQLException;
-import java.util.Optional;
-import java.util.UUID;
 import java.util.concurrent.TimeUnit;

 import static nu.marginalia.mqapi.ProcessInboxNames.INDEX_CONSTRUCTOR_INBOX;
@ -40,15 +33,12 @@ import static nu.marginalia.mqapi.ProcessInboxNames.INDEX_CONSTRUCTOR_INBOX;
 public class IndexConstructorMain extends ProcessMainClass {
     private final FileStorageService fileStorageService;
     private final ProcessHeartbeatImpl heartbeat;
-    private final MessageQueueFactory messageQueueFactory;
     private final DomainRankings domainRankings;
-    private final int node;

     private static final Logger logger = LoggerFactory.getLogger(IndexConstructorMain.class);
-    private final Gson gson = GsonFactory.get();
-    public static void main(String[] args) throws Exception {
-        CreateIndexInstructions instructions = null;
+
+    public static void main(String[] args) throws Exception {
+        Instructions<CreateIndexRequest> instructions = null;
         try {
             new org.mariadb.jdbc.Driver();

@ -58,9 +48,8 @@ public class IndexConstructorMain extends ProcessMainClass {
                     new DatabaseModule(false))
                     .getInstance(IndexConstructorMain.class);

-            instructions = main.fetchInstructions();
-            main.run(instructions);
+            instructions = main.fetchInstructions(CreateIndexRequest.class);
+            main.run(instructions.value());
             instructions.ok();
         }
         catch (Exception ex) {
@ -85,17 +74,17 @@ public class IndexConstructorMain extends ProcessMainClass {
                               ProcessConfiguration processConfiguration,
                               DomainRankings domainRankings) {

+        super(messageQueueFactory, processConfiguration, GsonFactory.get(), INDEX_CONSTRUCTOR_INBOX);
+
         this.fileStorageService = fileStorageService;
         this.heartbeat = heartbeat;
-        this.messageQueueFactory = messageQueueFactory;
         this.domainRankings = domainRankings;
-        this.node = processConfiguration.node();
     }

-    private void run(CreateIndexInstructions instructions) throws SQLException, IOException {
+    private void run(CreateIndexRequest instructions) throws SQLException, IOException {
         heartbeat.start();

-        switch (instructions.name) {
+        switch (instructions.indexName()) {
             case FORWARD -> createForwardIndex();
             case REVERSE_FULL -> createFullReverseIndex();
             case REVERSE_PRIO -> createPrioReverseIndex();
@ -171,52 +160,4 @@ public class IndexConstructorMain extends ProcessMainClass {
                     docId);
         }

-    private static class CreateIndexInstructions {
-
-        public final IndexName name;
-        private final MqSingleShotInbox inbox;
-        private final MqMessage message;
-
-        private CreateIndexInstructions(IndexName name, MqSingleShotInbox inbox, MqMessage message) {
-            this.name = name;
-            this.inbox = inbox;
-            this.message = message;
-        }
-
-        public void ok() {
-            inbox.sendResponse(message, MqInboxResponse.ok());
-        }
-        public void err() {
-            inbox.sendResponse(message, MqInboxResponse.err());
-        }
-    }
-
-    private CreateIndexInstructions fetchInstructions() throws Exception {
-
-        var inbox = messageQueueFactory.createSingleShotInbox(INDEX_CONSTRUCTOR_INBOX, node, UUID.randomUUID());
-
-        logger.info("Waiting for instructions");
-        var msgOpt = getMessage(inbox, CreateIndexRequest.class.getSimpleName());
-        var msg = msgOpt.orElseThrow(() -> new RuntimeException("No message received"));
-
-        var payload = gson.fromJson(msg.payload(), CreateIndexRequest.class);
-        var name = payload.indexName();
-
-        return new CreateIndexInstructions(name, inbox, msg);
-    }
-
-    private Optional<MqMessage> getMessage(MqSingleShotInbox inbox, String expectedFunction) throws SQLException, InterruptedException {
-        var opt = inbox.waitForMessage(30, TimeUnit.SECONDS);
-        if (opt.isPresent()) {
-            if (!opt.get().function().equals(expectedFunction)) {
-                throw new RuntimeException("Unexpected function: " + opt.get().function());
-            }
-            return opt;
-        }
-        else {
-            var stolenMessage = inbox.stealMessage(msg -> msg.function().equals(expectedFunction));
-            stolenMessage.ifPresent(mqMessage -> logger.info("Stole message {}", mqMessage));
-            return stolenMessage;
-        }
-    }
 }
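The hunk above deletes the hand-rolled CreateIndexInstructions wrapper in favor of a generic Instructions<T> helper inherited from ProcessMainClass. A sketch of what such a wrapper looks like, inferred from the deleted class and the call sites (the field layout and method bodies here are assumptions; the real implementation lives in ProcessMainClass):

import nu.marginalia.mq.MqMessage;
import nu.marginalia.mq.inbox.MqInboxResponse;
import nu.marginalia.mq.inbox.MqSingleShotInbox;

// Sketch only: a generic version of the removed CreateIndexInstructions,
// parameterized over the deserialized payload instead of hard-coding IndexName.
class Instructions<T> {
    private final T value;
    private final MqSingleShotInbox inbox;
    private final MqMessage message;

    Instructions(T value, MqSingleShotInbox inbox, MqMessage message) {
        this.value = value;
        this.inbox = inbox;
        this.message = message;
    }

    public T value() { return value; }

    public void ok()  { inbox.sendResponse(message, MqInboxResponse.ok()); }
    public void err() { inbox.sendResponse(message, MqInboxResponse.err()); }
}

This is why every process main in this change follows the same shape: fetchInstructions(Request.class), run(instructions.value()), then ok() or err().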
code/processes/live-crawling-process/build.gradle (new file, 77 lines)
@ -0,0 +1,77 @@
plugins {
    id 'java'

    id 'application'
    id 'jvm-test-suite'
}

java {
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
    }
}

application {
    mainClass = 'nu.marginalia.livecrawler.LiveCrawlerMain'
    applicationName = 'live-crawler-process'
}

tasks.distZip.enabled = false

apply from: "$rootProject.projectDir/srcsets.gradle"

dependencies {

    implementation project(':code:common:db')
    implementation project(':code:common:model')
    implementation project(':code:common:config')
    implementation project(':code:common:service')
    implementation project(':code:common:linkdb')
    implementation project(':code:functions:live-capture:api')
    implementation project(':code:libraries:blocking-thread-pool')
    implementation project(':code:index:api')
    implementation project(':code:processes:process-mq-api')
    implementation project(':code:libraries:message-queue')
    implementation project(':code:libraries:language-processing')
    implementation project(':code:libraries:easy-lsh')
    implementation project(':code:processes:crawling-process')
    implementation project(':code:processes:crawling-process:model')
    implementation project(':code:processes:converting-process')
    implementation project(':code:processes:loading-process')

    implementation project(':code:processes:crawling-process:ft-crawl-blocklist')
    implementation project(':code:processes:crawling-process:ft-link-parser')
    implementation project(':code:processes:crawling-process:ft-content-type')
    implementation project(':third-party:commons-codec')

    implementation libs.bundles.slf4j

    implementation libs.notnull
    implementation libs.guava
    implementation dependencies.create(libs.guice.get()) {
        exclude group: 'com.google.guava'
    }
    implementation libs.gson
    implementation libs.zstd
    implementation libs.jwarc
    implementation libs.crawlercommons
    implementation libs.okhttp3
    implementation libs.jsoup
    implementation libs.opencsv
    implementation libs.fastutil
    implementation libs.sqlite

    implementation libs.bundles.mariadb

    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito
    testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
    testImplementation libs.commons.codec
    testImplementation 'org.testcontainers:mariadb:1.17.4'
    testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
    testImplementation project(':code:libraries:test-helpers')
}
@ -0,0 +1,244 @@
package nu.marginalia.livecrawler;

import nu.marginalia.io.SerializableCrawlDataStream;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.model.crawldata.CrawledDomain;
import nu.marginalia.model.crawldata.SerializableCrawlData;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

/** Data access object for the live crawl database, a simple sqlite file */
public class LiveCrawlDataSet implements AutoCloseable {
    private final Connection connection;
    private final Path basePath;

    public LiveCrawlDataSet(Path basePath) throws SQLException {
        this.basePath = basePath;
        this.connection = DriverManager.getConnection("jdbc:sqlite:" + basePath.resolve("live-crawl-data.db"));
        this.connection.setAutoCommit(true);

        try (var stmt = connection.createStatement()) {
            stmt.execute("CREATE TABLE IF NOT EXISTS urls (url TEXT PRIMARY KEY, domainId LONG, body BLOB, headers BLOB, ip TEXT, timestamp long)");
            stmt.execute("CREATE INDEX IF NOT EXISTS domainIdIndex ON urls (domainId)");
            stmt.execute("CREATE TABLE IF NOT EXISTS badUrls (url TEXT PRIMARY KEY, timestamp long)");
        }
    }

    public Path createWorkDir() throws IOException {
        return Files.createTempDirectory(basePath, "work");
    }

    /** Remove entries older than the given timestamp */
    public void prune(Instant cutoff) throws SQLException {
        try (var stmt = connection.prepareStatement("DELETE FROM urls WHERE timestamp < ?")) {
            stmt.setLong(1, cutoff.toEpochMilli());
            stmt.executeUpdate();
        }

        try (var stmt = connection.prepareStatement("DELETE FROM badUrls WHERE timestamp < ?")) {
            stmt.setLong(1, cutoff.toEpochMilli());
            stmt.executeUpdate();
        }
    }

    /** Check if the given URL is already in the database */
    public boolean hasUrl(String url) throws SQLException {
        try (var stmt = connection.prepareStatement("""
                SELECT 1 FROM urls WHERE urls.url = ?
                UNION
                SELECT 1 FROM badUrls WHERE badUrls.url = ?
                """);
        ) {
            stmt.setString(1, url);
            stmt.setString(2, url);

            return stmt.executeQuery().next();
        }
    }

    /** Check if the given URL is already in the database */
    public boolean hasUrl(EdgeUrl url) throws SQLException {
        return hasUrl(url.toString());
    }

    /** Save a document to the database */
    public void saveDocument(int domainId, EdgeUrl url, String body, String headers, String ip) throws SQLException, IOException {
        try (var stmt = connection.prepareStatement("""
                INSERT OR REPLACE INTO urls (domainId, url, body, headers, ip, timestamp)
                VALUES (?, ?, ?, ?, ?, ?)
                """))
        {
            stmt.setInt(1, domainId);
            stmt.setString(2, url.toString());
            stmt.setBytes(3, compress(body));
            stmt.setBytes(4, compress(headers));
            stmt.setString(5, ip);
            stmt.setLong(6, Instant.now().toEpochMilli());
            stmt.executeUpdate();
        }
    }

    /** Flag a URL as bad, i.e. it should not be revisited */
    public void flagAsBad(EdgeUrl url) {
        try (var stmt = connection.prepareStatement("""
                INSERT OR IGNORE INTO badUrls (url, timestamp)
                VALUES (?, ?)
                """))
        {
            stmt.setString(1, url.toString());
            stmt.setLong(2, Instant.now().toEpochMilli());
            stmt.executeUpdate();
        }
        catch (SQLException ex) {
            throw new RuntimeException(ex);
        }
    }

    private byte[] compress(String data) throws IOException {
        // gzip compression
        try (var bos = new ByteArrayOutputStream();
             var gzip = new GZIPOutputStream(bos))
        {
            gzip.write(data.getBytes());
            gzip.finish();
            return bos.toByteArray();
        }
    }

    private String decompress(byte[] data) {
        // gzip decompression
        try (var bis = new ByteArrayInputStream(data);
             var gzip = new GZIPInputStream(bis))
        {
            return new String(gzip.readAllBytes());
        }
        catch (IOException ex) {
            throw new RuntimeException(ex);
        }
    }

    /** Get the data in the database as a collection of SerializableCrawlDataStreams,
     * the format expected by the converter code.
     */
    public Collection<SerializableCrawlDataStream> getDataStreams() throws SQLException {
        List<Integer> domainIds = new ArrayList<>();

        try (var stmt = connection.createStatement()) {
            var rs = stmt.executeQuery("SELECT DISTINCT domainId FROM urls");
            while (rs.next()) {
                domainIds.add(rs.getInt(1));
            }
        }

        List<SerializableCrawlDataStream> streams = new ArrayList<>();
        for (var domainId : domainIds) {
            streams.add(new WrappedDataStream(domainId));
        }
        return streams;
    }

    /** Wraps the data in the database as a SerializableCrawlDataStream.
     * <p></p>
     * This is a bit clunky as the interface is built intending the data
     * to be a stream of objects being read from Parquet.
     * */
    private class WrappedDataStream implements SerializableCrawlDataStream {
        private final int domainId;
        private ArrayList<SerializableCrawlData> dataStack;

        WrappedDataStream(int domainId) {
            this.domainId = domainId;
            this.dataStack = null;
        }

        /** Lazy initialization for the data being iterated over */
        private void query() {
            try (var stmt = connection.prepareStatement("""
                    SELECT url, body, headers, ip, timestamp
                    FROM urls
                    WHERE domainId = ?
                    """)) {
                stmt.setInt(1, domainId);
                var rs = stmt.executeQuery();
                dataStack = new ArrayList<>();
                while (rs.next()) {
                    String url = rs.getString("url");
                    String body = decompress(rs.getBytes("body"));
                    String headers = decompress(rs.getBytes("headers"));

                    dataStack.add(new CrawledDocument(
                            "LIVE",
                            url,
                            "text/html",
                            Instant.ofEpochMilli(rs.getLong("timestamp")).toString(),
                            200,
                            "OK",
                            "",
                            headers,
                            body,
                            false,
                            "",
                            ""
                    ));
                }
                var last = dataStack.getLast();
                var domain = new CrawledDomain(
                        last.getDomain(),
                        null,
                        "OK",
                        "",
                        "0.0.0.0",
                        List.of(),
                        List.of()
                );

                // Add the domain as the last element, which will be the first
                // element popped from the list
                dataStack.addLast(domain);
            }
            catch (SQLException ex) {
                throw new RuntimeException(ex);
            }
        }

        @Override
        public SerializableCrawlData next() throws IOException {
            if (dataStack == null)
                query();

            return dataStack.removeLast();
        }

        @Override
        public boolean hasNext() throws IOException {
            if (dataStack == null) {
                query();
            }
            return !dataStack.isEmpty();
        }

        @Override
        public void close() throws Exception {
            dataStack.clear();
        }
    }

    @Override
    public void close() throws Exception {
        connection.close();
    }
}
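A note on the compress/decompress pair above: they go through String.getBytes() and new String(byte[]) with the JVM default charset. A self-contained round trip with the charset pinned to UTF-8 (an adjustment for this sketch, not what the class above does) looks like this:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

class GzipRoundTrip {
    static byte[] compress(String data) throws IOException {
        var bos = new ByteArrayOutputStream();
        try (var gzip = new GZIPOutputStream(bos)) {
            gzip.write(data.getBytes(StandardCharsets.UTF_8)); // pin the charset explicitly
        } // closing the stream also finishes the gzip trailer
        return bos.toByteArray();
    }

    static String decompress(byte[] data) throws IOException {
        try (var gzip = new GZIPInputStream(new ByteArrayInputStream(data))) {
            return new String(gzip.readAllBytes(), StandardCharsets.UTF_8);
        }
    }

    public static void main(String[] args) throws IOException {
        String original = "<html>räksmörgås</html>"; // non-ASCII survives with a pinned charset
        System.out.println(decompress(compress(original)).equals(original)); // true
    }
}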
@ -0,0 +1,224 @@
package nu.marginalia.livecrawler;

import com.google.gson.Gson;
import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.WmsaHome;
import nu.marginalia.api.feeds.FeedsClient;
import nu.marginalia.converting.ConverterModule;
import nu.marginalia.converting.processor.DomainProcessor;
import nu.marginalia.converting.writer.ConverterBatchWriter;
import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.db.DomainBlacklist;
import nu.marginalia.io.SerializableCrawlDataStream;
import nu.marginalia.loading.LoaderInputData;
import nu.marginalia.loading.documents.DocumentLoaderService;
import nu.marginalia.loading.documents.KeywordLoaderService;
import nu.marginalia.loading.domains.DbDomainIdRegistry;
import nu.marginalia.loading.domains.DomainIdRegistry;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.mq.MessageQueueFactory;
import nu.marginalia.mqapi.crawling.LiveCrawlRequest;
import nu.marginalia.process.ProcessConfiguration;
import nu.marginalia.process.ProcessConfigurationModule;
import nu.marginalia.process.ProcessMainClass;
import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.service.module.DatabaseModule;
import nu.marginalia.service.module.ServiceDiscoveryModule;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageBaseType;
import org.apache.commons.io.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.nio.file.Files;
import java.nio.file.Path;
import java.security.Security;
import java.time.Instant;
import java.time.temporal.ChronoUnit;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import static nu.marginalia.mqapi.ProcessInboxNames.LIVE_CRAWLER_INBOX;

public class LiveCrawlerMain extends ProcessMainClass {

    private static final Logger logger =
            LoggerFactory.getLogger(LiveCrawlerMain.class);

    private final FeedsClient feedsClient;
    private final ProcessHeartbeat heartbeat;
    private final DbDomainQueries domainQueries;
    private final DomainBlacklist domainBlacklist;
    private final DomainProcessor domainProcessor;
    private final FileStorageService fileStorageService;
    private final KeywordLoaderService keywordLoaderService;
    private final DocumentLoaderService documentLoaderService;
    private final HikariDataSource dataSource;

    @Inject
    public LiveCrawlerMain(FeedsClient feedsClient,
                           Gson gson,
                           ProcessConfiguration config,
                           ProcessHeartbeat heartbeat,
                           DbDomainQueries domainQueries,
                           DomainBlacklist domainBlacklist,
                           MessageQueueFactory messageQueueFactory,
                           DomainProcessor domainProcessor,
                           FileStorageService fileStorageService,
                           KeywordLoaderService keywordLoaderService,
                           DocumentLoaderService documentLoaderService, HikariDataSource dataSource)
            throws Exception
    {
        super(messageQueueFactory, config, gson, LIVE_CRAWLER_INBOX);

        this.feedsClient = feedsClient;
        this.heartbeat = heartbeat;
        this.domainQueries = domainQueries;
        this.domainBlacklist = domainBlacklist;
        this.domainProcessor = domainProcessor;
        this.fileStorageService = fileStorageService;
        this.keywordLoaderService = keywordLoaderService;
        this.documentLoaderService = documentLoaderService;
        this.dataSource = dataSource;

        domainBlacklist.waitUntilLoaded();
    }

    public static void main(String... args) throws Exception {

        // Prevent Java from caching DNS lookups forever (filling up the system RAM as a result)
        Security.setProperty("networkaddress.cache.ttl", "3600");

        // This must run *early*
        System.setProperty("http.agent", WmsaHome.getUserAgent().uaString());

        // If these aren't set properly, the JVM will hang forever on some requests
        System.setProperty("sun.net.client.defaultConnectTimeout", "30000");
        System.setProperty("sun.net.client.defaultReadTimeout", "30000");

        // We don't want to use too much memory caching sessions for https
        System.setProperty("javax.net.ssl.sessionCacheSize", "2048");

        try {
            Injector injector = Guice.createInjector(
                    new LiveCrawlerModule(),
                    new ProcessConfigurationModule("crawler"),
                    new ConverterModule(),
                    new ServiceDiscoveryModule(),
                    new DatabaseModule(false)
            );

            var crawler = injector.getInstance(LiveCrawlerMain.class);
            Instructions<LiveCrawlRequest> instructions = crawler.fetchInstructions(LiveCrawlRequest.class);

            try {
                crawler.run();
                instructions.ok();
            } catch (Exception e) {
                instructions.err();
                throw e;
            }

        } catch (Exception e) {
            logger.error("LiveCrawler failed", e);

            System.exit(1);
        }
        System.exit(0);
    }

    enum LiveCrawlState {
        PRUNE_DB,
        FETCH_LINKS,
        CRAWLING,
        PROCESSING,
        LOADING,
        CONSTRUCTING,
        DONE
    }

    private void run() throws Exception {
        Path basePath = fileStorageService.getStorageBase(FileStorageBaseType.STORAGE).asPath().resolve("live-crawl-data");

        if (!Files.isDirectory(basePath)) {
            Files.createDirectories(basePath);
        }

        run(basePath);
    }

    private void run(Path basePath) throws Exception {
        try (var processHeartbeat = heartbeat.createProcessTaskHeartbeat(LiveCrawlState.class, "LiveCrawler");
             LiveCrawlDataSet dataSet = new LiveCrawlDataSet(basePath))
        {
            final Instant cutoff = Instant.now().minus(60, ChronoUnit.DAYS);

            processHeartbeat.progress(LiveCrawlState.FETCH_LINKS);

            Map<String, List<String>> urlsPerDomain = new HashMap<>(10_000);
            feedsClient.getUpdatedDomains(cutoff, urlsPerDomain::put);

            logger.info("Fetched data for {} domains", urlsPerDomain.size());

            processHeartbeat.progress(LiveCrawlState.PRUNE_DB);

            // Remove data that is too old
            dataSet.prune(cutoff);

            processHeartbeat.progress(LiveCrawlState.CRAWLING);

            try (SimpleLinkScraper fetcher = new SimpleLinkScraper(dataSet, domainQueries, domainBlacklist);
                 var hb = heartbeat.createAdHocTaskHeartbeat("Live Crawling"))
            {
                for (Map.Entry<String, List<String>> entry : hb.wrap("Fetching", urlsPerDomain.entrySet())) {
                    EdgeDomain domain = new EdgeDomain(entry.getKey());
                    List<String> urls = entry.getValue();

                    fetcher.scheduleRetrieval(domain, urls);
                }
            }

            Path tempPath = dataSet.createWorkDir();

            try {
                processHeartbeat.progress(LiveCrawlState.PROCESSING);

                try (var hb = heartbeat.createAdHocTaskHeartbeat("Processing");
                     var writer = new ConverterBatchWriter(tempPath, 0)
                ) {
                    // Offset the documents' ordinals toward the upper range, to avoid ID collisions with
                    // the main indexes (the maximum permissible doc ordinal value is 67_108_863, so this
                    // leaves us with a lot of headroom still)
                    writer.setOrdinalOffset(67_000_000);

                    for (SerializableCrawlDataStream stream : hb.wrap("Processing", dataSet.getDataStreams())) {
                        writer.write(domainProcessor.sideloadProcessing(stream, 0));
                    }
                }

                processHeartbeat.progress(LiveCrawlState.LOADING);

                LoaderInputData lid = new LoaderInputData(tempPath, 1);

                DomainIdRegistry domainIdRegistry = new DbDomainIdRegistry(dataSource);

                keywordLoaderService.loadKeywords(domainIdRegistry, heartbeat, lid);
                documentLoaderService.loadDocuments(domainIdRegistry, heartbeat, lid);

                keywordLoaderService.close();

            } finally {
                FileUtils.deleteDirectory(tempPath.toFile());
            }

            // Construct the index

            processHeartbeat.progress(LiveCrawlState.DONE);
        }
    }

}
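The ordinal offset in the processing step above is worth a quick sanity check: the comment caps the document ordinal at 67_108_863, which is 2^26 − 1, i.e. a 26-bit field (inferred here from that constant alone, not from UrlIdCodec itself). The arithmetic:

class OrdinalHeadroom {
    public static void main(String[] args) {
        int maxDocOrdinal = (1 << 26) - 1; // 67_108_863, matching the comment above
        int liveCrawlOffset = 67_000_000;  // the value passed to setOrdinalOffset(...)

        // Live-crawled documents occupy the range [67_000_000, 67_108_863],
        // well clear of ordinals assigned by the batch-built main indexes.
        System.out.println(maxDocOrdinal - liveCrawlOffset); // 108_863 slots of headroom
    }
}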
@ -0,0 +1,49 @@
package nu.marginalia.livecrawler;

import com.google.inject.AbstractModule;
import com.google.inject.Inject;
import com.google.inject.Provides;
import com.google.inject.Singleton;
import com.google.inject.name.Names;
import nu.marginalia.IndexLocations;
import nu.marginalia.UserAgent;
import nu.marginalia.WmsaHome;
import nu.marginalia.linkdb.docs.DocumentDbWriter;
import nu.marginalia.process.ProcessConfiguration;
import nu.marginalia.service.ServiceId;
import nu.marginalia.service.module.ServiceConfiguration;
import nu.marginalia.storage.FileStorageService;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.UUID;

import static nu.marginalia.linkdb.LinkdbFileNames.DOCDB_FILE_NAME;

public class LiveCrawlerModule extends AbstractModule {

    public void configure() {
        bind(UserAgent.class).toInstance(WmsaHome.getUserAgent());
        bind(Path.class).annotatedWith(Names.named("local-index-path")).toInstance(Path.of(System.getProperty("local-index-path", "/vol")));
    }

    @Inject
    @Provides @Singleton
    private DocumentDbWriter createLinkdbWriter(FileStorageService service) throws SQLException, IOException {
        // Migrate
        Path dbPath = IndexLocations.getLinkdbWritePath(service).resolve(DOCDB_FILE_NAME);

        if (Files.exists(dbPath)) {
            Files.delete(dbPath);
        }
        return new DocumentDbWriter(dbPath);
    }

    @Singleton
    @Provides
    public ServiceConfiguration provideServiceConfiguration(ProcessConfiguration processConfiguration) {
        return new ServiceConfiguration(ServiceId.NOT_A_SERVICE, processConfiguration.node(), null, null, -1, UUID.randomUUID());
    }
}
@ -0,0 +1,217 @@
package nu.marginalia.livecrawler;

import crawlercommons.robots.SimpleRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;
import nu.marginalia.WmsaHome;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.db.DomainBlacklist;
import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.util.SimpleBlockingThreadPool;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;
import java.io.IOException;
import java.net.URISyntaxException;
import java.net.http.HttpClient;
import java.net.http.HttpHeaders;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.time.Duration;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.TimeUnit;

/** A simple link scraper that fetches URLs and stores them in a database,
 * with no concept of a crawl frontier, WARC output, or other advanced features
 */
public class SimpleLinkScraper implements AutoCloseable {
    private static final Logger logger = LoggerFactory.getLogger(SimpleLinkScraper.class);

    private final SimpleBlockingThreadPool pool = new SimpleBlockingThreadPool("LiveCrawler", 32, 10);
    private final LinkParser lp = new LinkParser();
    private final LiveCrawlDataSet dataSet;
    private final DbDomainQueries domainQueries;
    private final DomainBlacklist domainBlacklist;
    private final Duration connectTimeout = Duration.ofSeconds(10);
    private final Duration readTimeout = Duration.ofSeconds(10);

    public SimpleLinkScraper(LiveCrawlDataSet dataSet,
                             DbDomainQueries domainQueries,
                             DomainBlacklist domainBlacklist) {
        this.dataSet = dataSet;
        this.domainQueries = domainQueries;
        this.domainBlacklist = domainBlacklist;
    }

    public void scheduleRetrieval(EdgeDomain domain, List<String> urls) {

        var id = domainQueries.tryGetDomainId(domain);
        if (id.isEmpty() || domainBlacklist.isBlacklisted(id.getAsInt())) {
            return;
        }

        pool.submitQuietly(() -> retrieveNow(domain, id.getAsInt(), urls));
    }

    public void retrieveNow(EdgeDomain domain, int domainId, List<String> urls) throws Exception {
        try (HttpClient client = HttpClient
                .newBuilder()
                .connectTimeout(connectTimeout)
                .followRedirects(HttpClient.Redirect.NEVER)
                .version(HttpClient.Version.HTTP_2)
                .build()) {

            EdgeUrl rootUrl = domain.toRootUrlHttps();

            SimpleRobotRules rules = fetchRobotsRules(rootUrl, client);

            if (rules == null) { // I/O error fetching robots.txt
                // If we can't fetch the robots.txt, skip the domain and
                // probabilistically flag the URLs as bad
                for (var url : urls) {
                    lp.parseLink(rootUrl, url).ifPresent(this::maybeFlagAsBad);
                }
                return;
            }

            CrawlDelayTimer timer = new CrawlDelayTimer(rules.getCrawlDelay());

            for (var url : urls) {
                Optional<EdgeUrl> optParsedUrl = lp.parseLink(rootUrl, url);
                if (optParsedUrl.isEmpty()) {
                    continue;
                }
                if (dataSet.hasUrl(optParsedUrl.get())) {
                    continue;
                }

                EdgeUrl parsedUrl = optParsedUrl.get();
                if (!rules.isAllowed(url)) {
                    continue;
                }

                switch (fetchUrl(domainId, parsedUrl, timer, client)) {
                    case FetchResult.Success(int id, EdgeUrl docUrl, String body, String headers)
                            -> dataSet.saveDocument(id, docUrl, body, headers, "");
                    case FetchResult.Error(EdgeUrl docUrl) -> maybeFlagAsBad(docUrl);
                }
            }
        }
    }

    private void maybeFlagAsBad(EdgeUrl url) {
        // To give bad URLs a chance to be re-fetched, we only flag them as bad
        // with a 20% probability. This will prevent the same bad URL being
        // re-fetched over and over again for several months, but still allow
        // us to *mostly* re-fetch it if it was just a transient error.

        // There's of course the chance we immediately flag it as bad on an
        // unlucky roll, but you know, that's xcom baby

        if (ThreadLocalRandom.current().nextDouble(0, 1) < 0.2) {
            dataSet.flagAsBad(url);
        }
    }

    @Nullable
    private SimpleRobotRules fetchRobotsRules(EdgeUrl rootUrl, HttpClient client) throws IOException, InterruptedException, URISyntaxException {
        var robotsRequest = HttpRequest.newBuilder(rootUrl.withPathAndParam("/robots.txt", null).asURI())
                .GET()
                .header("User-Agent", WmsaHome.getUserAgent().uaString())
                .timeout(readTimeout);

        // Fetch the robots.txt

        try {
            SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
            HttpResponse<byte[]> robotsTxt = client.send(robotsRequest.build(), HttpResponse.BodyHandlers.ofByteArray());
            if (robotsTxt.statusCode() == 200) {
                return parser.parseContent(rootUrl.toString(),
                        robotsTxt.body(),
                        robotsTxt.headers().firstValue("Content-Type").orElse("text/plain"),
                        WmsaHome.getUserAgent().uaIdentifier());
            }
            else if (robotsTxt.statusCode() == 404) {
                return new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_ALL);
            }
        }
        catch (IOException ex) {
            logger.error("Error fetching robots.txt for {}: {} {}", rootUrl, ex.getClass().getSimpleName(), ex.getMessage());
        }
        return null;
    }

    /** Fetch a URL and store it in the database
     */
    private FetchResult fetchUrl(int domainId, EdgeUrl parsedUrl, CrawlDelayTimer timer, HttpClient client) throws Exception {

        timer.waitFetchDelay();

        HttpRequest request = HttpRequest.newBuilder(parsedUrl.asURI())
                .GET()
                .header("User-Agent", WmsaHome.getUserAgent().uaString())
                .header("Accept", "text/html")
                .timeout(readTimeout)
                .build();

        try {
            HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());

            // Handle rate limiting by waiting and retrying once
            if (response.statusCode() == 429) {
                timer.waitRetryDelay(new HttpFetcherImpl.RateLimitException(
                        response.headers().firstValue("Retry-After").orElse("5")
                ));
                response = client.send(request, HttpResponse.BodyHandlers.ofString());
            }

            String contentType = response.headers().firstValue("Content-Type").orElse("").toLowerCase();

            if (response.statusCode() == 200) {
                if (!contentType.startsWith("text/html")) {
                    return new FetchResult.Error(parsedUrl);
                }

                String body = response.body();
                if (body.length() > 1024 * 1024) {
                    return new FetchResult.Error(parsedUrl);
                }

                return new FetchResult.Success(domainId, parsedUrl, body, headersToString(response.headers()));
            }
        }
        catch (IOException ex) {
            // We don't want a full stack trace on every error, as it's quite common and very noisy
            logger.error("Error fetching URL {}: {} {}", parsedUrl, ex.getClass().getSimpleName(), ex.getMessage());
        }

        return new FetchResult.Error(parsedUrl);
    }

    sealed interface FetchResult {
        record Success(int domainId, EdgeUrl url, String body, String headers) implements FetchResult {}
        record Error(EdgeUrl url) implements FetchResult {}
    }

    private String headersToString(HttpHeaders headers) {
        StringBuilder headersStr = new StringBuilder();
        headers.map().forEach((k, v) -> {
            headersStr.append(k).append(": ").append(v).append("\n");
        });
        return headersStr.toString();
    }

    @Override
    public void close() throws Exception {
        pool.shutDown();
        for (int i = 0; i < 4; i++) {
            pool.awaitTermination(1, TimeUnit.HOURS);
        }
        pool.shutDownNow();
    }
}
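The robots.txt handling above leans on crawler-commons, and the same calls can be exercised standalone. A minimal sketch (the bot token and URLs are made up for illustration; the delay comment reflects my understanding that crawler-commons reports Crawl-delay in milliseconds, which is consistent with it being fed directly to CrawlDelayTimer above):

import crawlercommons.robots.SimpleRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

import java.nio.charset.StandardCharsets;

class RobotsCheck {
    public static void main(String[] args) {
        byte[] robotsTxt = "User-agent: *\nDisallow: /private/\nCrawl-delay: 2"
                .getBytes(StandardCharsets.UTF_8);

        SimpleRobotRules rules = new SimpleRobotRulesParser().parseContent(
                "https://www.example.com/robots.txt", // URL the file was fetched from
                robotsTxt,
                "text/plain",                         // content type, as in fetchRobotsRules above
                "examplebot");                        // user-agent token (hypothetical)

        System.out.println(rules.isAllowed("https://www.example.com/index.html")); // true
        System.out.println(rules.isAllowed("https://www.example.com/private/x"));  // false
        System.out.println(rules.getCrawlDelay());                                 // the parsed delay
    }
}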
@ -0,0 +1,92 @@
package nu.marginalia.livecrawler;

import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.model.crawldata.CrawledDomain;
import nu.marginalia.model.crawldata.SerializableCrawlData;
import org.apache.commons.io.FileUtils;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;

import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;

public class LiveCrawlDataSetTest {

    @Test
    public void testGetDataSet() throws Exception {
        Path tempDir = Files.createTempDirectory("live-crawl-data-set-test");
        try (LiveCrawlDataSet dataSet = new LiveCrawlDataSet(tempDir)) {

            Assertions.assertFalse(dataSet.hasUrl("https://www.example.com/"));
            dataSet.saveDocument(
                    1,
                    new EdgeUrl("https://www.example.com/"),
                    "test",
                    "test",
                    "test"
            );
            Assertions.assertTrue(dataSet.hasUrl("https://www.example.com/"));

            var streams = dataSet.getDataStreams();
            Assertions.assertEquals(1, streams.size());
            var stream = streams.iterator().next();

            List<SerializableCrawlData> data = new ArrayList<>();
            while (stream.hasNext()) {
                data.add(stream.next());
            }

            int dataCount = 0;
            int domainCount = 0;

            for (var item : data) {
                switch (item) {
                    case CrawledDomain domain -> {
                        domainCount++;
                        Assertions.assertEquals("www.example.com", domain.getDomain());
                    }
                    case CrawledDocument document -> {
                        dataCount++;
                        Assertions.assertEquals("https://www.example.com/", document.url);
                        Assertions.assertEquals("test", document.documentBody);
                    }
                }
            }

            Assertions.assertEquals(1, dataCount);
            Assertions.assertEquals(1, domainCount);
        }
        finally {
            FileUtils.deleteDirectory(tempDir.toFile());
        }
    }

    @Test
    public void testHasUrl() throws Exception {
        Path tempDir = Files.createTempDirectory("live-crawl-data-set-test");
        try (LiveCrawlDataSet dataSet = new LiveCrawlDataSet(tempDir)) {
            Assertions.assertFalse(dataSet.hasUrl("https://www.example.com/"));
            dataSet.saveDocument(
                    1,
                    new EdgeUrl("https://www.example.com/saved"),
                    "test",
                    "test",
                    "test"
            );
            Assertions.assertTrue(dataSet.hasUrl("https://www.example.com/saved"));

            dataSet.flagAsBad(new EdgeUrl("https://www.example.com/bad"));

            Assertions.assertTrue(dataSet.hasUrl("https://www.example.com/bad"));

            Assertions.assertFalse(dataSet.hasUrl("https://www.example.com/notPresent"));
        }
        finally {
            FileUtils.deleteDirectory(tempDir.toFile());
        }
    }

}
Some files were not shown because too many files have changed in this diff.