// Collects the Tar archives produced by every subproject into build/dist and
// then unpacks the listed process archives into run/dist.
tasks.register('dist', Copy) {
    from subprojects.collect { it.tasks.withType(Tar) }
    into "$buildDir/dist"

    doLast {
        // Archives to unpack after the copy has finished; the order matches the
        // original hand-written sequence of copy blocks.
        [
            'converter-process',
            'crawler-process',
            'loader-process',
            'website-adjacencies-calculator',
            'crawl-job-extractor-process'
        ].each { archiveName ->
            copy {
                from tarTree("$buildDir/dist/${archiveName}.tar")
                into "$projectDir/run/dist/"
            }
        }
    }
}
b/code/api/index-api/src/main/java/nu/marginalia/index/client/IndexClient.java @@ -8,27 +8,41 @@ import nu.marginalia.WmsaHome; import nu.marginalia.client.AbstractDynamicClient; import nu.marginalia.client.Context; import nu.marginalia.index.client.model.query.SearchSpecification; -import nu.marginalia.index.client.model.results.SearchResultItem; import nu.marginalia.index.client.model.results.SearchResultSet; import nu.marginalia.model.gson.GsonFactory; +import nu.marginalia.mq.MessageQueueFactory; +import nu.marginalia.mq.outbox.MqOutbox; import nu.marginalia.service.descriptor.ServiceDescriptors; import nu.marginalia.service.id.ServiceId; import javax.annotation.CheckReturnValue; -import java.util.List; +import java.util.UUID; @Singleton public class IndexClient extends AbstractDynamicClient { private static final Summary wmsa_search_index_api_time = Summary.build().name("wmsa_search_index_api_time").help("-").register(); + private final MqOutbox outbox; + @Inject - public IndexClient(ServiceDescriptors descriptors) { + public IndexClient(ServiceDescriptors descriptors, + MessageQueueFactory messageQueueFactory) { super(descriptors.forId(ServiceId.Index), WmsaHome.getHostsFile(), GsonFactory::get); + String inboxName = ServiceId.Index.name + ":" + "0"; + String outboxName = System.getProperty("service-name", UUID.randomUUID().toString()); + + outbox = messageQueueFactory.createOutbox(inboxName, outboxName, UUID.randomUUID()); + setTimeout(30); } + + public MqOutbox outbox() { + return outbox; + } + @CheckReturnValue public SearchResultSet query(Context ctx, SearchSpecification specs) { return wmsa_search_index_api_time.time( diff --git a/code/api/index-api/src/main/java/nu/marginalia/index/client/IndexMqEndpoints.java b/code/api/index-api/src/main/java/nu/marginalia/index/client/IndexMqEndpoints.java new file mode 100644 index 00000000..f8349eb7 --- /dev/null +++ b/code/api/index-api/src/main/java/nu/marginalia/index/client/IndexMqEndpoints.java @@ -0,0 +1,10 
/**
 * Names of the message queue functions understood by the index service.
 * <p>
 * This is a pure constants holder and cannot be instantiated.
 */
public final class IndexMqEndpoints {
    public static final String INDEX_IS_BLOCKED = "INDEX-IS-BLOCKED";
    public static final String INDEX_REPARTITION = "INDEX-REPARTITION";

    public static final String INDEX_RELOAD_LEXICON = "INDEX-RELOAD-LEXICON";
    public static final String INDEX_REINDEX = "INDEX-REINDEX";

    private IndexMqEndpoints() {
        // constants only
    }
}
/**
 * Names of the message queue inboxes used by the converter, loader and crawler.
 * <p>
 * This is a pure constants holder and cannot be instantiated.
 */
public final class ProcessInboxNames {
    public static final String CONVERTER_INBOX = "converter";
    public static final String LOADER_INBOX = "loader";
    public static final String CRAWLER_INBOX = "crawler";

    private ProcessInboxNames() {
        // constants only
    }
}
+public class CrawlRequest { + public FileStorageId specStorage; + public FileStorageId crawlStorage; +} diff --git a/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/loading/LoadRequest.java b/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/loading/LoadRequest.java new file mode 100644 index 00000000..eff92c9c --- /dev/null +++ b/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/loading/LoadRequest.java @@ -0,0 +1,9 @@ +package nu.marginalia.mqapi.loading; + +import lombok.AllArgsConstructor; +import nu.marginalia.db.storage.model.FileStorageId; + +@AllArgsConstructor +public class LoadRequest { + public FileStorageId processedDataStorage; +} diff --git a/code/api/readme.md b/code/api/readme.md index 4b19381f..f98f326a 100644 --- a/code/api/readme.md +++ b/code/api/readme.md @@ -1,4 +1,10 @@ -# Core Service Clients +# Clients + +## Core Services + +* [assistant-api](assistant-api/) +* [search-api](search-api/) +* [index-api](index-api/) These are clients for the [core services](../services-core/), along with what models are necessary for speaking to them. They each implement the abstract client classes from @@ -8,3 +14,10 @@ All that is necessary is to `@Inject` them into the constructor and then requests can be sent. **Note:** If you are looking for the public API, it's handled by the api service in [services-satellite/api-service](../services-satellite/api-service). + +## MQ-API Process API + +[process-mqapi](process-mqapi/) defines requests and inboxes for the message queue based API used +for interacting with processes. + +See [common/message-queue](../common/message-queue) and [services-satellite/control-service](../services-satellite/control-service). 
\ No newline at end of file diff --git a/code/api/search-api/build.gradle b/code/api/search-api/build.gradle index 8c38b5f3..ba00a702 100644 --- a/code/api/search-api/build.gradle +++ b/code/api/search-api/build.gradle @@ -14,6 +14,7 @@ java { dependencies { implementation project(':code:common:model') implementation project(':code:common:config') + implementation project(':code:common:message-queue') implementation project(':code:common:service-discovery') implementation project(':code:common:service-client') diff --git a/code/api/search-api/src/main/java/nu/marginalia/search/client/SearchClient.java b/code/api/search-api/src/main/java/nu/marginalia/search/client/SearchClient.java index 393fa285..8faef5be 100644 --- a/code/api/search-api/src/main/java/nu/marginalia/search/client/SearchClient.java +++ b/code/api/search-api/src/main/java/nu/marginalia/search/client/SearchClient.java @@ -5,6 +5,8 @@ import com.google.inject.Singleton; import io.reactivex.rxjava3.core.Observable; import nu.marginalia.client.AbstractDynamicClient; import nu.marginalia.model.gson.GsonFactory; +import nu.marginalia.mq.MessageQueueFactory; +import nu.marginalia.mq.outbox.MqOutbox; import nu.marginalia.search.client.model.ApiSearchResults; import nu.marginalia.service.descriptor.ServiceDescriptors; import nu.marginalia.service.id.ServiceId; @@ -16,14 +18,30 @@ import org.slf4j.LoggerFactory; import javax.annotation.CheckReturnValue; import java.net.URLEncoder; import java.nio.charset.StandardCharsets; +import java.util.UUID; @Singleton public class SearchClient extends AbstractDynamicClient { private final Logger logger = LoggerFactory.getLogger(getClass()); + private final MqOutbox outbox; + @Inject - public SearchClient(ServiceDescriptors descriptors) { + public SearchClient(ServiceDescriptors descriptors, + MessageQueueFactory messageQueueFactory) { + super(descriptors.forId(ServiceId.Search), WmsaHome.getHostsFile(), GsonFactory::get); + + String inboxName = ServiceId.Search.name + ":" 
/**
 * Names of the message queue functions understood by the search service.
 * <p>
 * This is a pure constants holder and cannot be instantiated.
 */
public final class SearchMqEndpoints {
    /** Flushes the URL caches, run if significant changes have occurred in the URLs database */
    public static final String FLUSH_CACHES = "FLUSH_CACHES";

    private SearchMqEndpoints() {
        // constants only
    }
}
(!Files.isRegularFile(settingsFile)) { - throw new RuntimeException("Could not find disk settings " + settingsFile); - } - - try (var is = Files.newInputStream(settingsFile)) { - var props = new Properties(); - props.load(is); - return props; - } - catch (IOException ex) { - throw new RuntimeException(ex); - } - } - public static LanguageModels getLanguageModels() { final Path home = getHomePath(); diff --git a/code/common/db/build.gradle b/code/common/db/build.gradle index b7e3f0ef..62ede7ac 100644 --- a/code/common/db/build.gradle +++ b/code/common/db/build.gradle @@ -2,7 +2,7 @@ plugins { id 'java' id "io.freefair.lombok" version "5.3.3.3" id 'jvm-test-suite' - + id "org.flywaydb.flyway" version "8.2.0" } java { @@ -11,6 +11,10 @@ java { } } +configurations { + flywayMigration.extendsFrom(implementation) +} + dependencies { implementation project(':code:common:model') @@ -29,6 +33,7 @@ dependencies { implementation libs.rxjava implementation libs.bundles.mariadb + flywayMigration 'org.flywaydb:flyway-mysql:9.8.1' testImplementation libs.bundles.slf4j.test testImplementation libs.bundles.junit @@ -40,6 +45,15 @@ dependencies { testImplementation 'org.testcontainers:junit-jupiter:1.17.4' } +flyway { + url = 'jdbc:mariadb://localhost:3306/WMSA_prod' + user = 'wmsa' + password = 'wmsa' + schemas = ['WMSA_prod'] + configurations = [ 'compileClasspath', 'flywayMigration' ] + locations = ['filesystem:src/main/resources/db/migration'] +} + test { maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1 diff --git a/code/common/db/readme.md b/code/common/db/readme.md index 886153b6..ad62169d 100644 --- a/code/common/db/readme.md +++ b/code/common/db/readme.md @@ -2,10 +2,29 @@ This module primarily contains SQL files for the URLs database. The most central tables are `EC_DOMAIN`, `EC_URL` and `EC_PAGE_DATA`. +## Flyway + +The system uses flyway to track database changes and allow easy migrations, this is accessible via gradle tasks. 
+ +* `flywayMigrate` +* `flywayBaseline` +* `flywayRepair` +* `flywayClean` (dangerous: it wipes your entire database) + +Refer to the [Flyway documentation](https://documentation.red-gate.com/fd/flyway-documentation-138346877.html) for guidance. +It's well documented and these are probably the only four tasks you'll ever need. + +If you are not running the system via docker, you need to provide connection details other than +the defaults (TODO: how?). + +The migration files are in [resources/db/migration](src/main/resources/db/migration). The file name convention +incorporates the project's cal-ver versioning, and the files are applied in lexicographical order. + + VYY_MM_v_nnn__description.sql + ## Central Paths -* [current](src/main/resources/sql/current) - The current database model -* [migrations](src/main/resources/sql/migrations) +* [migrations](src/main/resources/db/migration) - Flyway migrations ## See Also diff --git a/code/common/db/src/main/java/nu/marginalia/db/DomainBlacklistImpl.java b/code/common/db/src/main/java/nu/marginalia/db/DomainBlacklistImpl.java index 87be3942..8bfbca7e 100644 --- a/code/common/db/src/main/java/nu/marginalia/db/DomainBlacklistImpl.java +++ b/code/common/db/src/main/java/nu/marginalia/db/DomainBlacklistImpl.java @@ -52,7 +52,7 @@ public class DomainBlacklistImpl implements DomainBlacklist { } try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement("SELECT EC_DOMAIN.ID FROM EC_DOMAIN INNER JOIN EC_DOMAIN_BLACKLIST ON EC_DOMAIN_BLACKLIST.URL_DOMAIN = EC_DOMAIN.DOMAIN_TOP")) { + try (var stmt = connection.prepareStatement("SELECT EC_DOMAIN.ID FROM EC_DOMAIN INNER JOIN EC_DOMAIN_BLACKLIST ON (EC_DOMAIN_BLACKLIST.URL_DOMAIN = EC_DOMAIN.DOMAIN_TOP OR EC_DOMAIN_BLACKLIST.URL_DOMAIN = EC_DOMAIN.DOMAIN_NAME)")) { stmt.setFetchSize(1000); var rsp = stmt.executeQuery(); while (rsp.next()) { diff --git a/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageManifest.java 
b/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageManifest.java new file mode 100644 index 00000000..f002a47d --- /dev/null +++ b/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageManifest.java @@ -0,0 +1,51 @@ +package nu.marginalia.db.storage; + +import com.google.gson.Gson; +import nu.marginalia.db.storage.model.FileStorage; +import nu.marginalia.db.storage.model.FileStorageType; +import nu.marginalia.model.gson.GsonFactory; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; +import java.util.Optional; + +record FileStorageManifest(FileStorageType type, String description) { + private static final Gson gson = GsonFactory.get(); + private static final String fileName = "marginalia-manifest.json"; + private static final Logger logger = LoggerFactory.getLogger(FileStorageManifest.class); + + public static Optional find(Path directory) { + Path expectedFileName = directory.resolve(fileName); + + if (!Files.isRegularFile(expectedFileName) || + !Files.isReadable(expectedFileName)) { + return Optional.empty(); + } + + try (var reader = Files.newBufferedReader(expectedFileName)) { + return Optional.of(gson.fromJson(reader, FileStorageManifest.class)); + } + catch (Exception e) { + logger.warn("Failed to read manifest " + expectedFileName, e); + return Optional.empty(); + } + } + + public void write(FileStorage dir) { + Path expectedFileName = dir.asPath().resolve(fileName); + + try (var writer = Files.newBufferedWriter(expectedFileName, + StandardOpenOption.CREATE, + StandardOpenOption.TRUNCATE_EXISTING)) + { + gson.toJson(this, writer); + } + catch (Exception e) { + logger.warn("Failed to write manifest " + expectedFileName, e); + } + } + +} diff --git a/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageService.java b/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageService.java new file mode 100644 
index 00000000..813d1c57 --- /dev/null +++ b/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageService.java @@ -0,0 +1,432 @@ +package nu.marginalia.db.storage; + +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.db.storage.model.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.inject.Inject; +import javax.inject.Singleton; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.nio.file.*; +import java.nio.file.attribute.PosixFilePermissions; +import java.sql.SQLException; +import java.util.*; + +/** Manages file storage for processes and services + */ +@Singleton +public class FileStorageService { + private final HikariDataSource dataSource; + private final Logger logger = LoggerFactory.getLogger(FileStorageService.class); + @Inject + public FileStorageService(HikariDataSource dataSource) { + this.dataSource = dataSource; + } + + public Optional findFileStorageToDelete() { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT ID FROM FILE_STORAGE WHERE DO_PURGE LIMIT 1 + """)) { + var rs = stmt.executeQuery(); + if (rs.next()) { + return Optional.of(getStorage(new FileStorageId(rs.getLong(1)))); + } + } catch (SQLException e) { + return Optional.empty(); + } + return Optional.empty(); + } + + /** @return the storage base with the given id, or null if it does not exist */ + public FileStorageBase getStorageBase(FileStorageBaseId type) throws SQLException { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT ID, NAME, PATH, TYPE, PERMIT_TEMP + FROM FILE_STORAGE_BASE WHERE ID = ? 
+ """)) { + stmt.setLong(1, type.id()); + try (var rs = stmt.executeQuery()) { + if (rs.next()) { + return new FileStorageBase( + new FileStorageBaseId(rs.getLong(1)), + FileStorageBaseType.valueOf(rs.getString(4)), + rs.getString(2), + rs.getString(3), + rs.getBoolean(5) + ); + } + } + } + return null; + } + + public void synchronizeStorageManifests(FileStorageBase base) { + Set ignoredPaths = new HashSet<>(); + + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT PATH FROM FILE_STORAGE WHERE BASE_ID = ? + """)) { + stmt.setLong(1, base.id().id()); + var rs = stmt.executeQuery(); + while (rs.next()) { + ignoredPaths.add(rs.getString(1)); + } + } catch (SQLException e) { + throw new RuntimeException(e); + } + + File basePathFile = Path.of(base.path()).toFile(); + File[] files = basePathFile.listFiles(pathname -> pathname.isDirectory() && !ignoredPaths.contains(pathname.getName())); + if (files == null) return; + for (File file : files) { + var maybeManifest = FileStorageManifest.find(file.toPath()); + if (maybeManifest.isEmpty()) continue; + var manifest = maybeManifest.get(); + + logger.info("Discovered new file storage: " + file.getName() + " (" + manifest.type() + ")"); + + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + INSERT INTO FILE_STORAGE(BASE_ID, PATH, TYPE, DESCRIPTION) + VALUES (?, ?, ?, ?) + """)) { + stmt.setLong(1, base.id().id()); + stmt.setString(2, file.getName()); + stmt.setString(3, manifest.type().name()); + stmt.setString(4, manifest.description()); + stmt.execute(); + conn.commit(); + + } catch (SQLException e) { + throw new RuntimeException(e); + } + } + } + public void relateFileStorages(FileStorageId source, FileStorageId target) { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + INSERT INTO FILE_STORAGE_RELATION(SOURCE_ID, TARGET_ID) VALUES (?, ?) 
+ """)) { + stmt.setLong(1, source.id()); + stmt.setLong(2, target.id()); + stmt.executeUpdate(); + } catch (SQLException e) { + throw new RuntimeException(e); + } + } + + public List getSourceFromStorage(FileStorage storage) throws SQLException { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT SOURCE_ID FROM FILE_STORAGE_RELATION WHERE TARGET_ID = ? + """)) { + stmt.setLong(1, storage.id().id()); + var rs = stmt.executeQuery(); + List ret = new ArrayList<>(); + while (rs.next()) { + ret.add(getStorage(new FileStorageId(rs.getLong(1)))); + } + return ret; + } + } + + public List getTargetFromStorage(FileStorage storage) throws SQLException { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT TARGET_ID FROM FILE_STORAGE_RELATION WHERE SOURCE_ID = ? + """)) { + stmt.setLong(1, storage.id().id()); + var rs = stmt.executeQuery(); + List ret = new ArrayList<>(); + while (rs.next()) { + ret.add(getStorage(new FileStorageId(rs.getLong(1)))); + } + return ret; + } + } + + /** @return the storage base with the given type, or null if it does not exist */ + public FileStorageBase getStorageBase(FileStorageBaseType type) throws SQLException { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT ID, NAME, PATH, TYPE, PERMIT_TEMP + FROM FILE_STORAGE_BASE WHERE TYPE = ? 
+ """)) { + stmt.setString(1, type.name()); + try (var rs = stmt.executeQuery()) { + if (rs.next()) { + return new FileStorageBase( + new FileStorageBaseId(rs.getLong(1)), + FileStorageBaseType.valueOf(rs.getString(4)), + rs.getString(2), + rs.getString(3), + rs.getBoolean(5) + ); + } + } + } + return null; + } + + public FileStorageBase createStorageBase(String name, Path path, FileStorageBaseType type, boolean permitTemp) throws SQLException, FileNotFoundException { + + if (!Files.exists(path)) { + throw new FileNotFoundException("Storage base path does not exist: " + path); + } + + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + INSERT INTO FILE_STORAGE_BASE(NAME, PATH, TYPE, PERMIT_TEMP) + VALUES (?, ?, ?, ?) + """)) { + stmt.setString(1, name); + stmt.setString(2, path.toString()); + stmt.setString(3, type.name()); + stmt.setBoolean(4, permitTemp); + + int update = stmt.executeUpdate(); + if (update < 0) { + throw new SQLException("Failed to create storage base"); + } + } + + return getStorageBase(type); + } + + /** Allocate a temporary storage of the given type if temporary allocation is permitted */ + public FileStorage allocateTemporaryStorage(FileStorageBase base, + FileStorageType type, + String prefix, + String description) throws IOException, SQLException + { + if (!base.permitTemp()) { + throw new IllegalArgumentException("Temporary storage not permitted in base " + base.name()); + } + + Path tempDir = Files.createTempDirectory(base.asPath(), prefix, + PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rwxr-xr-x")) + ); + + String relDir = base.asPath().relativize(tempDir).normalize().toString(); + + try (var conn = dataSource.getConnection(); + var insert = conn.prepareStatement(""" + INSERT INTO FILE_STORAGE(PATH, TYPE, DESCRIPTION, BASE_ID) + VALUES (?, ?, ?, ?) + """); + var query = conn.prepareStatement(""" + SELECT ID FROM FILE_STORAGE WHERE PATH = ? AND BASE_ID = ? 
+ """) + ) { + insert.setString(1, relDir); + insert.setString(2, type.name()); + insert.setString(3, description); + insert.setLong(4, base.id().id()); + + if (insert.executeUpdate() < 1) { + throw new SQLException("Failed to insert storage"); + } + + + query.setString(1, relDir); + query.setLong(2, base.id().id()); + var rs = query.executeQuery(); + + if (rs.next()) { + var storage = getStorage(new FileStorageId(rs.getLong("ID"))); + + // Write a manifest file so we can pick this up later without needing to insert it into DB + // (e.g. when loading from outside the system) + var manifest = new FileStorageManifest(type, description); + manifest.write(storage); + + return storage; + } + + } + + throw new SQLException("Failed to insert storage"); + } + + + /** Allocate permanent storage in base */ + public FileStorage allocatePermanentStorage(FileStorageBase base, String relativePath, FileStorageType type, String description) throws IOException, SQLException { + + Path newDir = base.asPath().resolve(relativePath); + + if (Files.exists(newDir)) { + throw new IllegalArgumentException("Storage already exists: " + newDir); + } + + Files.createDirectory(newDir, PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rwxr-xr-x"))); + + try (var conn = dataSource.getConnection(); + var update = conn.prepareStatement(""" + INSERT INTO FILE_STORAGE(PATH, TYPE, DESCRIPTION, BASE_ID) + VALUES (?, ?, ?, ?) + """); + var query = conn.prepareStatement(""" + SELECT ID + FROM FILE_STORAGE WHERE PATH = ? AND BASE_ID = ? 
+ """) + ) { + update.setString(1, relativePath); + update.setString(2, type.name()); + update.setString(3, description); + update.setLong(4, base.id().id()); + + if (update.executeUpdate() < 1) + throw new SQLException("Failed to insert storage"); + + query.setString(1, relativePath); + query.setLong(2, base.id().id()); + var rs = query.executeQuery(); + + if (rs.next()) { + return new FileStorage( + new FileStorageId(rs.getLong("ID")), + base, + type, + newDir.toString(), + description + ); + } + + } + + throw new SQLException("Failed to insert storage"); + } + + public FileStorage getStorageByType(FileStorageType type) throws SQLException { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT PATH, DESCRIPTION, ID, BASE_ID + FROM FILE_STORAGE_VIEW WHERE TYPE = ? + """)) { + stmt.setString(1, type.name()); + + long storageId; + long baseId; + String path; + String description; + + try (var rs = stmt.executeQuery()) { + if (rs.next()) { + baseId = rs.getLong("BASE_ID"); + storageId = rs.getLong("ID"); + path = rs.getString("PATH"); + description = rs.getString("DESCRIPTION"); + } + else { + return null; + } + + var base = getStorageBase(new FileStorageBaseId(baseId)); + + return new FileStorage( + new FileStorageId(storageId), + base, + type, + path, + description + ); + } + } + } + + /** @return the storage with the given id, or null if it does not exist */ + public FileStorage getStorage(FileStorageId id) throws SQLException { + + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT PATH, TYPE, DESCRIPTION, ID, BASE_ID + FROM FILE_STORAGE_VIEW WHERE ID = ? 
+ """)) { + stmt.setLong(1, id.id()); + + long storageId; + long baseId; + String path; + String description; + FileStorageType type; + + try (var rs = stmt.executeQuery()) { + if (rs.next()) { + baseId = rs.getLong("BASE_ID"); + storageId = rs.getLong("ID"); + type = FileStorageType.valueOf(rs.getString("TYPE")); + path = rs.getString("PATH"); + description = rs.getString("DESCRIPTION"); + } + else { + return null; + } + + var base = getStorageBase(new FileStorageBaseId(baseId)); + + return new FileStorage( + new FileStorageId(storageId), + base, + type, + path, + description + ); + } + } + } + + public void removeFileStorage(FileStorageId id) throws SQLException { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + DELETE FROM FILE_STORAGE WHERE ID = ? + """)) { + stmt.setLong(1, id.id()); + stmt.executeUpdate(); + } + } + + public List getEachFileStorage() { + List ret = new ArrayList<>(); + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT PATH, TYPE, DESCRIPTION, ID, BASE_ID + FROM FILE_STORAGE_VIEW + """)) { + + long storageId; + long baseId; + String path; + String description; + FileStorageType type; + + try (var rs = stmt.executeQuery()) { + while (rs.next()) { + baseId = rs.getLong("BASE_ID"); + storageId = rs.getLong("ID"); + path = rs.getString("PATH"); + type = FileStorageType.valueOf(rs.getString("TYPE")); + description = rs.getString("DESCRIPTION"); + + var base = getStorageBase(new FileStorageBaseId(baseId)); + + ret.add(new FileStorage( + new FileStorageId(storageId), + base, + type, + path, + description + )); + } + } + } catch (SQLException e) { + e.printStackTrace(); + } + + return ret; + } +} diff --git a/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorage.java b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorage.java new file mode 100644 index 00000000..3a619809 --- /dev/null +++ 
b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorage.java @@ -0,0 +1,24 @@ +package nu.marginalia.db.storage.model; + +import java.nio.file.Path; + +/** + * Represents a file storage area + * + * @param id the id of the storage in the database + * @param base the base of the storage + * @param type the type of data expected + * @param path the full path of the storage on disk + * @param description a description of the storage + */ +public record FileStorage( + FileStorageId id, + FileStorageBase base, + FileStorageType type, + String path, + String description) +{ + public Path asPath() { + return Path.of(path); + } +} diff --git a/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageBase.java b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageBase.java new file mode 100644 index 00000000..1e8245ad --- /dev/null +++ b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageBase.java @@ -0,0 +1,23 @@ +package nu.marginalia.db.storage.model; + +import java.nio.file.Path; + +/** + * Represents a file storage base directory + * + * @param id the id of the storage base in the database + * @param type the type of the storage base + * @param name the name of the storage base + * @param path the path of the storage base + * @param permitTemp if true, the storage may be used for temporary files + */ +public record FileStorageBase(FileStorageBaseId id, + FileStorageBaseType type, + String name, + String path, + boolean permitTemp + ) { + public Path asPath() { + return Path.of(path); + } +} diff --git a/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageBaseId.java b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageBaseId.java new file mode 100644 index 00000000..1c7ededd --- /dev/null +++ b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageBaseId.java @@ -0,0 +1,8 @@ +package nu.marginalia.db.storage.model; + +public record 
FileStorageBaseId(long id) { + + public String toString() { + return Long.toString(id); + } +} diff --git a/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageBaseType.java b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageBaseType.java new file mode 100644 index 00000000..08d67069 --- /dev/null +++ b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageBaseType.java @@ -0,0 +1,8 @@ +package nu.marginalia.db.storage.model; + +public enum FileStorageBaseType { + SSD_INDEX, + SSD_WORK, + SLOW, + BACKUP +} diff --git a/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageId.java b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageId.java new file mode 100644 index 00000000..a89ad9f8 --- /dev/null +++ b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageId.java @@ -0,0 +1,14 @@ +package nu.marginalia.db.storage.model; + +public record FileStorageId(long id) { + public static FileStorageId parse(String str) { + return new FileStorageId(Long.parseLong(str)); + } + public static FileStorageId of(int storageId) { + return new FileStorageId(storageId); + } + + public String toString() { + return Long.toString(id); + } +} diff --git a/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageType.java b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageType.java new file mode 100644 index 00000000..9f512d06 --- /dev/null +++ b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageType.java @@ -0,0 +1,14 @@ +package nu.marginalia.db.storage.model; + +public enum FileStorageType { + CRAWL_SPEC, + CRAWL_DATA, + PROCESSED_DATA, + INDEX_STAGING, + LEXICON_STAGING, + INDEX_LIVE, + LEXICON_LIVE, + BACKUP, + EXPORT, + SEARCH_SETS +} diff --git a/code/common/db/src/main/resources/sql/current/00-base.sql b/code/common/db/src/main/resources/db/migration/V23_06_0_000__base.sql similarity index 100% rename from 
code/common/db/src/main/resources/sql/current/00-base.sql rename to code/common/db/src/main/resources/db/migration/V23_06_0_000__base.sql diff --git a/code/common/db/src/main/resources/sql/current/01-blacklist.sql b/code/common/db/src/main/resources/db/migration/V23_06_0_001__blacklist.sql similarity index 82% rename from code/common/db/src/main/resources/sql/current/01-blacklist.sql rename to code/common/db/src/main/resources/db/migration/V23_06_0_001__blacklist.sql index e46161bc..d05d8e9d 100644 --- a/code/common/db/src/main/resources/sql/current/01-blacklist.sql +++ b/code/common/db/src/main/resources/db/migration/V23_06_0_001__blacklist.sql @@ -2,6 +2,7 @@ CREATE TABLE IF NOT EXISTS EC_DOMAIN_BLACKLIST ( ID INT PRIMARY KEY AUTO_INCREMENT, - URL_DOMAIN VARCHAR(255) UNIQUE NOT NULL + URL_DOMAIN VARCHAR(255) UNIQUE NOT NULL, + COMMENT VARCHAR(255) DEFAULT NULL ) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci; diff --git a/code/common/db/src/main/resources/sql/current/02-dictionary.sql b/code/common/db/src/main/resources/db/migration/V23_06_0_002__dictionary.sql similarity index 100% rename from code/common/db/src/main/resources/sql/current/02-dictionary.sql rename to code/common/db/src/main/resources/db/migration/V23_06_0_002__dictionary.sql diff --git a/code/common/db/src/main/resources/sql/current/03-crawl-queue.sql b/code/common/db/src/main/resources/db/migration/V23_06_0_003__crawl-queue.sql similarity index 100% rename from code/common/db/src/main/resources/sql/current/03-crawl-queue.sql rename to code/common/db/src/main/resources/db/migration/V23_06_0_003__crawl-queue.sql diff --git a/code/common/db/src/main/resources/sql/current/04-screenshot.sql b/code/common/db/src/main/resources/db/migration/V23_06_0_004__screenshot.sql similarity index 100% rename from code/common/db/src/main/resources/sql/current/04-screenshot.sql rename to code/common/db/src/main/resources/db/migration/V23_06_0_004__screenshot.sql diff --git a/code/common/db/src/main/resources/sql/current/05-domain-complaint.sql
b/code/common/db/src/main/resources/db/migration/V23_06_0_005__domain_complaint.sql similarity index 100% rename from code/common/db/src/main/resources/sql/current/05-domain-complaint.sql rename to code/common/db/src/main/resources/db/migration/V23_06_0_005__domain_complaint.sql diff --git a/code/common/db/src/main/resources/sql/current/06-api-key.sql b/code/common/db/src/main/resources/db/migration/V23_06_0_006__api_key.sql similarity index 100% rename from code/common/db/src/main/resources/sql/current/06-api-key.sql rename to code/common/db/src/main/resources/db/migration/V23_06_0_006__api_key.sql diff --git a/code/common/db/src/main/resources/sql/current/07-neighbors.sql b/code/common/db/src/main/resources/db/migration/V23_06_0_007__neighbors.sql similarity index 100% rename from code/common/db/src/main/resources/sql/current/07-neighbors.sql rename to code/common/db/src/main/resources/db/migration/V23_06_0_007__neighbors.sql diff --git a/code/common/db/src/main/resources/sql/current/08-random-domains.sql b/code/common/db/src/main/resources/db/migration/V23_06_0_008__random_domains.sql similarity index 100% rename from code/common/db/src/main/resources/sql/current/08-random-domains.sql rename to code/common/db/src/main/resources/db/migration/V23_06_0_008__random_domains.sql diff --git a/code/common/db/src/main/resources/sql/current/09-news-feed.sql b/code/common/db/src/main/resources/db/migration/V23_06_0_009__news_feed.sql similarity index 100% rename from code/common/db/src/main/resources/sql/current/09-news-feed.sql rename to code/common/db/src/main/resources/db/migration/V23_06_0_009__news_feed.sql diff --git a/code/common/db/src/main/resources/sql/current/10-domain-type.sql b/code/common/db/src/main/resources/db/migration/V23_07_0_001__domain_type.sql similarity index 100% rename from code/common/db/src/main/resources/sql/current/10-domain-type.sql rename to code/common/db/src/main/resources/db/migration/V23_07_0_001__domain_type.sql diff --git 
a/code/common/db/src/main/resources/db/migration/V23_07_0_002__service_status.sql b/code/common/db/src/main/resources/db/migration/V23_07_0_002__service_status.sql new file mode 100644 index 00000000..a5d392c5 --- /dev/null +++ b/code/common/db/src/main/resources/db/migration/V23_07_0_002__service_status.sql @@ -0,0 +1,27 @@ +CREATE TABLE IF NOT EXISTS SERVICE_HEARTBEAT ( + SERVICE_NAME VARCHAR(255) PRIMARY KEY COMMENT "Full name of the service, including node id if applicable, e.g. search-service:0", + SERVICE_BASE VARCHAR(255) NOT NULL COMMENT "Base name of the service, e.g. search-service", + INSTANCE VARCHAR(255) NOT NULL COMMENT "UUID of the service instance", + ALIVE BOOLEAN NOT NULL DEFAULT TRUE COMMENT "Set to false when the service is doing an orderly shutdown", + HEARTBEAT_TIME TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6) COMMENT "Service was last seen at this point" +); + +CREATE TABLE IF NOT EXISTS PROCESS_HEARTBEAT ( + PROCESS_NAME VARCHAR(255) PRIMARY KEY COMMENT "Full name of the process, including node id if applicable, e.g. converter:0", + PROCESS_BASE VARCHAR(255) NOT NULL COMMENT "Base name of the process, e.g. converter", + INSTANCE VARCHAR(255) NOT NULL COMMENT "UUID of the process instance", + STATUS ENUM ('STARTING', 'RUNNING', 'STOPPED') NOT NULL DEFAULT 'STARTING' COMMENT "Status of the process", + PROGRESS INT NOT NULL DEFAULT 0 COMMENT "Progress of the process", + HEARTBEAT_TIME TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6) COMMENT "Process was last seen at this point" +); + +CREATE TABLE IF NOT EXISTS SERVICE_EVENTLOG( + ID BIGINT AUTO_INCREMENT PRIMARY KEY COMMENT "Unique id", + SERVICE_NAME VARCHAR(255) NOT NULL COMMENT "Full name of the service, including node id if applicable, e.g. search-service:0", + SERVICE_BASE VARCHAR(255) NOT NULL COMMENT "Base name of the service, e.g. 
search-service", + INSTANCE VARCHAR(255) NOT NULL COMMENT "UUID of the service instance", + EVENT_TIME TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6) COMMENT "Event time", + EVENT_TYPE VARCHAR(255) NOT NULL COMMENT "Event type", + EVENT_MESSAGE VARCHAR(255) NOT NULL COMMENT "Event message" +); + diff --git a/code/common/db/src/main/resources/db/migration/V23_07_0_003__message_queue.sql b/code/common/db/src/main/resources/db/migration/V23_07_0_003__message_queue.sql new file mode 100644 index 00000000..6e628e80 --- /dev/null +++ b/code/common/db/src/main/resources/db/migration/V23_07_0_003__message_queue.sql @@ -0,0 +1,21 @@ +CREATE TABLE IF NOT EXISTS MESSAGE_QUEUE ( + ID BIGINT AUTO_INCREMENT PRIMARY KEY COMMENT 'Unique id', + RELATED_ID BIGINT NOT NULL DEFAULT -1 COMMENT 'Unique id a related message', + SENDER_INBOX VARCHAR(255) COMMENT 'Name of the sender inbox', + RECIPIENT_INBOX VARCHAR(255) NOT NULL COMMENT 'Name of the recipient inbox', + FUNCTION VARCHAR(255) NOT NULL COMMENT 'Which function to run', + PAYLOAD TEXT COMMENT 'Message to recipient', + -- These fields are used to avoid double processing of messages + -- instance marks the unique instance of the party, and the tick marks + -- the current polling iteration. Both are necessary. 
+ OWNER_INSTANCE VARCHAR(255) COMMENT 'Instance UUID corresponding to the party that has claimed the message', + OWNER_TICK BIGINT DEFAULT -1 COMMENT 'Used by recipient to determine which messages it has processed', + STATE ENUM('NEW', 'ACK', 'OK', 'ERR', 'DEAD') + NOT NULL DEFAULT 'NEW' COMMENT 'Processing state', + CREATED_TIME TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6) COMMENT 'Time of creation', + UPDATED_TIME TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6) COMMENT 'Time of last update', + TTL INT COMMENT 'Time to live in seconds' +); + +CREATE INDEX MESSAGE_QUEUE_STATE_IDX ON MESSAGE_QUEUE(STATE); +CREATE INDEX MESSAGE_QUEUE_OI_TICK_IDX ON MESSAGE_QUEUE(OWNER_INSTANCE, OWNER_TICK); diff --git a/code/common/db/src/main/resources/db/migration/V23_07_0_004__file_storage.sql b/code/common/db/src/main/resources/db/migration/V23_07_0_004__file_storage.sql new file mode 100644 index 00000000..641d0e03 --- /dev/null +++ b/code/common/db/src/main/resources/db/migration/V23_07_0_004__file_storage.sql @@ -0,0 +1,42 @@ +CREATE TABLE IF NOT EXISTS FILE_STORAGE_BASE ( + ID BIGINT PRIMARY KEY AUTO_INCREMENT, + NAME VARCHAR(255) NOT NULL UNIQUE, + PATH VARCHAR(255) NOT NULL UNIQUE COMMENT 'The path to the storage base', + TYPE ENUM ('SSD_INDEX', 'SSD_WORK', 'SLOW', 'BACKUP') NOT NULL, + PERMIT_TEMP BOOLEAN NOT NULL DEFAULT FALSE COMMENT 'If true, the storage can be used for temporary files' +) +CHARACTER SET utf8mb4 +COLLATE utf8mb4_bin; + +CREATE TABLE IF NOT EXISTS FILE_STORAGE ( + ID BIGINT PRIMARY KEY AUTO_INCREMENT, + BASE_ID BIGINT NOT NULL, + PATH VARCHAR(255) NOT NULL COMMENT 'The path to the storage relative to the base', + DESCRIPTION VARCHAR(255) NOT NULL, + TYPE ENUM ('CRAWL_SPEC', 'CRAWL_DATA', 'PROCESSED_DATA', 'INDEX_STAGING', 'LEXICON_STAGING', 'INDEX_LIVE', 'LEXICON_LIVE', 'SEARCH_SETS', 'BACKUP', 'EXPORT') NOT NULL, + DO_PURGE BOOLEAN NOT NULL DEFAULT FALSE COMMENT 'If true, the storage may be cleaned', + CREATE_DATE TIMESTAMP(6) NOT NULL DEFAULT 
CURRENT_TIMESTAMP(6), + CONSTRAINT CONS UNIQUE (BASE_ID, PATH), + FOREIGN KEY (BASE_ID) REFERENCES FILE_STORAGE_BASE(ID) ON DELETE CASCADE +) +CHARACTER SET utf8mb4 +COLLATE utf8mb4_bin; + +CREATE TABLE IF NOT EXISTS FILE_STORAGE_RELATION ( + SOURCE_ID BIGINT NOT NULL, + TARGET_ID BIGINT NOT NULL, + CONSTRAINT CONS UNIQUE (SOURCE_ID, TARGET_ID), + FOREIGN KEY (SOURCE_ID) REFERENCES FILE_STORAGE(ID) ON DELETE CASCADE, + FOREIGN KEY (TARGET_ID) REFERENCES FILE_STORAGE(ID) ON DELETE CASCADE +); + +CREATE VIEW FILE_STORAGE_VIEW +AS SELECT + CONCAT(BASE.PATH, '/', STORAGE.PATH) AS PATH, + STORAGE.TYPE AS TYPE, + DESCRIPTION AS DESCRIPTION, + CREATE_DATE AS CREATE_DATE, + STORAGE.ID AS ID, + BASE.ID AS BASE_ID +FROM FILE_STORAGE STORAGE +INNER JOIN FILE_STORAGE_BASE BASE ON STORAGE.BASE_ID=BASE.ID; diff --git a/code/common/db/src/main/resources/db/migration/V23_07_0_005__file_storage_default_values.sql b/code/common/db/src/main/resources/db/migration/V23_07_0_005__file_storage_default_values.sql new file mode 100644 index 00000000..74434055 --- /dev/null +++ b/code/common/db/src/main/resources/db/migration/V23_07_0_005__file_storage_default_values.sql @@ -0,0 +1,28 @@ +INSERT IGNORE INTO FILE_STORAGE_BASE(NAME, PATH, TYPE, PERMIT_TEMP) +VALUES +('Index Storage', '/vol', 'SSD_INDEX', false), +('Data Storage', '/samples', 'SLOW', true); + +INSERT IGNORE INTO FILE_STORAGE(BASE_ID, PATH, DESCRIPTION, TYPE) +SELECT ID, 'iw', "Index Staging Area", 'INDEX_STAGING' +FROM FILE_STORAGE_BASE WHERE NAME='Index Storage'; + +INSERT IGNORE INTO FILE_STORAGE(BASE_ID, PATH, DESCRIPTION, TYPE) +SELECT ID, 'ir', "Index Live Area", 'INDEX_LIVE' +FROM FILE_STORAGE_BASE WHERE NAME='Index Storage'; + +INSERT IGNORE INTO FILE_STORAGE(BASE_ID, PATH, DESCRIPTION, TYPE) +SELECT ID, 'lw', "Lexicon Staging Area", 'LEXICON_STAGING' +FROM FILE_STORAGE_BASE WHERE NAME='Index Storage'; + +INSERT IGNORE INTO FILE_STORAGE(BASE_ID, PATH, DESCRIPTION, TYPE) +SELECT ID, 'lr', "Lexicon Live Area", 
'LEXICON_LIVE' +FROM FILE_STORAGE_BASE WHERE NAME='Index Storage'; + +INSERT IGNORE INTO FILE_STORAGE(BASE_ID, PATH, DESCRIPTION, TYPE) +SELECT ID, 'ss', "Search Sets", 'SEARCH_SETS' +FROM FILE_STORAGE_BASE WHERE NAME='Index Storage'; + +INSERT IGNORE INTO FILE_STORAGE(BASE_ID, PATH, DESCRIPTION, TYPE) +SELECT ID, 'export', "Exported Data", 'EXPORT' +FROM FILE_STORAGE_BASE WHERE TYPE='EXPORT'; \ No newline at end of file diff --git a/code/common/db/src/main/resources/db/migration/V23_07_0_006__message_queue_default_jobs.sql b/code/common/db/src/main/resources/db/migration/V23_07_0_006__message_queue_default_jobs.sql new file mode 100644 index 00000000..9b78e3b4 --- /dev/null +++ b/code/common/db/src/main/resources/db/migration/V23_07_0_006__message_queue_default_jobs.sql @@ -0,0 +1,7 @@ +INSERT INTO MESSAGE_QUEUE(RECIPIENT_INBOX,FUNCTION,PAYLOAD) VALUES + ('fsm:converter_monitor','INITIAL',''), + ('fsm:loader_monitor','INITIAL',''), + ('fsm:crawler_monitor','INITIAL',''), + ('fsm:message_queue_monitor','INITIAL',''), + ('fsm:process_liveness_monitor','INITIAL',''), + ('fsm:file_storage_monitor','INITIAL',''); diff --git a/code/common/db/src/main/resources/db/migration/V23_07_0_007__task_status.sql b/code/common/db/src/main/resources/db/migration/V23_07_0_007__task_status.sql new file mode 100644 index 00000000..7c7ec175 --- /dev/null +++ b/code/common/db/src/main/resources/db/migration/V23_07_0_007__task_status.sql @@ -0,0 +1,10 @@ +CREATE TABLE IF NOT EXISTS TASK_HEARTBEAT ( + TASK_NAME VARCHAR(255) PRIMARY KEY COMMENT "Full name of the task, including node id if applicable, e.g. reconvert:0", + TASK_BASE VARCHAR(255) NOT NULL COMMENT "Base name of the task, e.g. 
reconvert", + INSTANCE VARCHAR(255) NOT NULL COMMENT "UUID of the task instance", + SERVICE_INSTANCE VARCHAR(255) NOT NULL COMMENT "UUID of the parent service", + STATUS ENUM ('STARTING', 'RUNNING', 'STOPPED') NOT NULL DEFAULT 'STARTING' COMMENT "Status of the task", + PROGRESS INT NOT NULL DEFAULT 0 COMMENT "Progress of the task", + STAGE_NAME VARCHAR(255) DEFAULT "", + HEARTBEAT_TIME TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6) COMMENT "Task was last seen at this point" +); diff --git a/code/common/db/src/main/resources/sql/migrations/00-news-items.sql b/code/common/db/src/main/resources/sql/migrations/00-news-items.sql deleted file mode 100644 index 4f237b67..00000000 --- a/code/common/db/src/main/resources/sql/migrations/00-news-items.sql +++ /dev/null @@ -1,76 +0,0 @@ - -INSERT IGNORE INTO SEARCH_NEWS_FEED(TITLE, LINK, SOURCE, LIST_DATE) VALUES ( -'A search engine that favors text-heavy sites and punishes modern web design', -'https://news.ycombinator.com/item?id=28550764', -'Hacker News', -'2021-09-16' -); - -INSERT IGNORE INTO SEARCH_NEWS_FEED(TITLE, LINK, SOURCE, LIST_DATE) VALUES ( -'A Search Engine Designed To Surprise You', -'https://onezero.medium.com/a-search-engine-designed-to-surprise-you-b81944ed5c06', -'Clive Thompson OneZero', -'2021-09-16' -); - -INSERT IGNORE INTO SEARCH_NEWS_FEED(TITLE, LINK, SOURCE, LIST_DATE) VALUES ( -'🎂 First anniversary! 
🎊', -'https://memex.marginalia.nu/log/49-marginalia-1-year.gmi', -null, -'2022-02-26'); - -INSERT IGNORE INTO SEARCH_NEWS_FEED(TITLE, LINK, SOURCE, LIST_DATE) VALUES ( -'Marginalia Search - Serendipity Engineering', -'https://www.metafilter.com/194653/Marginalia-Search-Serendipity-Engineering', -'MetaFilter', -'2022-03-09'); - -INSERT IGNORE INTO SEARCH_NEWS_FEED(TITLE, LINK, SOURCE, LIST_DATE) VALUES ( -'What Google Search Isn\'t Showing You', -'https://www.newyorker.com/culture/infinite-scroll/what-google-search-isnt-showing-you', -'The New Yorker 🎩', -'2022-03-10' -); - -INSERT IGNORE INTO SEARCH_NEWS_FEED(TITLE, LINK, SOURCE, LIST_DATE) VALUES ( -'You Should Check Out the Indie Web 🎞️', -'https://www.youtube.com/watch?v=rTSEr0cRJY8', -'YouTube, You\'ve Got Kat', -'2022-03-15' -); - -INSERT IGNORE INTO SEARCH_NEWS_FEED(TITLE, LINK, SOURCE, LIST_DATE) VALUES ( -'Marginalia Goes Open Source', -'https://news.ycombinator.com/item?id=31536626', -'Hacker News', -'2022-05-28' -); - -INSERT IGNORE INTO SEARCH_NEWS_FEED(TITLE, LINK, SOURCE, LIST_DATE) VALUES ( -'Kritik an Googles Suche - Platzhirsch auf dem Nebenschauplatz', -'https://www.deutschlandfunkkultur.de/google-suche-100.html', -'Deutschlandfunk Kultur 🇩🇪', -'2022-08-18' -); - -INSERT IGNORE INTO SEARCH_NEWS_FEED(TITLE, LINK, SOURCE, LIST_DATE) VALUES ( -'Google ei enää tideä', -'https://www.hs.fi/visio/art-2000009139237.html', -'Helsing Sanomat 🇫🇮', -'2022-10-19' -); - -INSERT IGNORE INTO SEARCH_NEWS_FEED(TITLE, LINK, SOURCE, LIST_DATE) VALUES ( -'Marginalia\'s Index Reaches 100,000,000 Documents 🎊', -'https://memex.marginalia.nu/log/64-hundred-million.gmi', -null, -'2022-10-21' -); - -INSERT IGNORE INTO SEARCH_NEWS_FEED(TITLE, LINK, SOURCE, LIST_DATE) VALUES ( -'Marginalia Receives NLnet grant', -'https://memex.marginalia.nu/log/74-marginalia-2-years.gmi', -null, -'2023-02-26' -); - diff --git a/code/common/db/src/main/resources/sql/migrations/01-domain.sql 
b/code/common/db/src/main/resources/sql/migrations/01-domain.sql deleted file mode 100644 index 0402fecb..00000000 --- a/code/common/db/src/main/resources/sql/migrations/01-domain.sql +++ /dev/null @@ -1 +0,0 @@ -ALTER TABLE EC_DOMAIN MODIFY COLUMN IP VARCHAR(48); \ No newline at end of file diff --git a/code/common/db/src/test/java/nu/marginalia/db/DomainTypesTest.java b/code/common/db/src/test/java/nu/marginalia/db/DomainTypesTest.java index 0829f6f5..387c880e 100644 --- a/code/common/db/src/test/java/nu/marginalia/db/DomainTypesTest.java +++ b/code/common/db/src/test/java/nu/marginalia/db/DomainTypesTest.java @@ -24,7 +24,7 @@ public class DomainTypesTest { .withDatabaseName("WMSA_prod") .withUsername("wmsa") .withPassword("wmsa") - .withInitScript("sql/current/10-domain-type.sql") + .withInitScript("db/migration/V23_07_0_001__domain_type.sql") .withNetworkAliases("mariadb"); static HikariDataSource dataSource; diff --git a/code/common/db/src/test/java/nu/marginalia/db/storage/FileStorageServiceTest.java b/code/common/db/src/test/java/nu/marginalia/db/storage/FileStorageServiceTest.java new file mode 100644 index 00000000..92020f32 --- /dev/null +++ b/code/common/db/src/test/java/nu/marginalia/db/storage/FileStorageServiceTest.java @@ -0,0 +1,154 @@ +package nu.marginalia.db.storage; + +import com.google.common.collect.Lists; +import com.zaxxer.hikari.HikariConfig; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.db.storage.model.FileStorageBaseType; +import nu.marginalia.db.storage.model.FileStorageType; +import org.junit.jupiter.api.*; +import org.junit.jupiter.api.parallel.Execution; +import org.testcontainers.containers.MariaDBContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.sql.SQLException; +import java.util.ArrayList; +import 
java.util.List; +import java.util.UUID; + +import static org.junit.Assert.*; +import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD; + +@Testcontainers +@Execution(SAME_THREAD) +@Tag("slow") +public class FileStorageServiceTest { + @Container + static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") + .withDatabaseName("WMSA_prod") + .withUsername("wmsa") + .withPassword("wmsa") + .withInitScript("db/migration/V23_07_0_004__file_storage.sql") + .withNetworkAliases("mariadb"); + + static HikariDataSource dataSource; + static FileStorageService fileStorageService; + + static List tempDirs = new ArrayList<>(); + + @BeforeAll + public static void setup() { + HikariConfig config = new HikariConfig(); + config.setJdbcUrl(mariaDBContainer.getJdbcUrl()); + config.setUsername("wmsa"); + config.setPassword("wmsa"); + + dataSource = new HikariDataSource(config); + } + + + @BeforeEach + public void setupEach() { + fileStorageService = new FileStorageService(dataSource); + } + + @AfterEach + public void tearDownEach() { + try (var conn = dataSource.getConnection(); + var stmt = conn.createStatement()) { + stmt.execute("DELETE FROM FILE_STORAGE"); + stmt.execute("DELETE FROM FILE_STORAGE_BASE"); + } catch (SQLException e) { + throw new RuntimeException(e); + } + } + + @AfterAll + public static void teardown() { + dataSource.close(); + + Lists.reverse(tempDirs).forEach(path -> { + try { + System.out.println("Deleting " + path); + Files.delete(path); + } catch (IOException e) { + e.printStackTrace(); + } + }); + } + + private Path createTempDir() { + try { + Path dir = Files.createTempDirectory("file-storage-test"); + tempDirs.add(dir); + return dir; + } catch (IOException e) { + throw new RuntimeException(e); + } + + } + + @Test + public void testCreateBase() throws SQLException, FileNotFoundException { + String name = "test-" + UUID.randomUUID(); + + var storage = new FileStorageService(dataSource); + var base = 
storage.createStorageBase(name, createTempDir(), FileStorageBaseType.SLOW, false); + + Assertions.assertEquals(name, base.name()); + Assertions.assertEquals(FileStorageBaseType.SLOW, base.type()); + Assertions.assertFalse(base.permitTemp()); + } + @Test + public void testAllocateTempInNonPermitted() throws SQLException, FileNotFoundException { + String name = "test-" + UUID.randomUUID(); + + var storage = new FileStorageService(dataSource); + + var base = storage.createStorageBase(name, createTempDir(), FileStorageBaseType.SLOW, false); + + try { + storage.allocateTemporaryStorage(base, FileStorageType.CRAWL_DATA, "xyz", "thisShouldFail"); + fail(); + } + catch (IllegalArgumentException ex) {} // ok + catch (Exception ex) { + ex.printStackTrace(); + fail(); + } + } + + @Test + public void testAllocatePermanentInNonPermitted() throws SQLException, IOException { + String name = "test-" + UUID.randomUUID(); + + var storage = new FileStorageService(dataSource); + + var base = storage.createStorageBase(name, createTempDir(), FileStorageBaseType.SLOW, false); + + var created = storage.allocatePermanentStorage(base, "xyz", FileStorageType.CRAWL_DATA, "thisShouldSucceed"); + tempDirs.add(created.asPath()); + + var actual = storage.getStorage(created.id()); + Assertions.assertEquals(created, actual); + } + + @Test + public void testAllocateTempInPermitted() throws IOException, SQLException { + String name = "test-" + UUID.randomUUID(); + + var storage = new FileStorageService(dataSource); + + var base = storage.createStorageBase(name, createTempDir(), FileStorageBaseType.SLOW, true); + var fileStorage = storage.allocateTemporaryStorage(base, FileStorageType.CRAWL_DATA, "xyz", "thisShouldSucceed"); + + Assertions.assertTrue(Files.exists(fileStorage.asPath())); + tempDirs.add(fileStorage.asPath()); + } + + +} \ No newline at end of file diff --git a/code/common/message-queue/build.gradle b/code/common/message-queue/build.gradle new file mode 100644 index 00000000..d71ca1d4 
--- /dev/null +++ b/code/common/message-queue/build.gradle @@ -0,0 +1,49 @@ +plugins { + id 'java' +} + + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +dependencies { + implementation project(':code:common:service-client') + implementation project(':code:common:service-discovery') + implementation project(':code:common:db') + + implementation libs.lombok + annotationProcessor libs.lombok + + implementation libs.spark + implementation libs.guice + implementation libs.gson + implementation libs.rxjava + + implementation libs.bundles.prometheus + implementation libs.bundles.slf4j + implementation libs.bucket4j + + testImplementation libs.bundles.slf4j.test + implementation libs.bundles.mariadb + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito + + testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4') + testImplementation 'org.testcontainers:mariadb:1.17.4' + testImplementation 'org.testcontainers:junit-jupiter:1.17.4' +} + +test { + useJUnitPlatform() +} + +task fastTests(type: Test) { + useJUnitPlatform { + excludeTags "slow" + } +} diff --git a/code/common/message-queue/msgstate.svg b/code/common/message-queue/msgstate.svg new file mode 100644 index 00000000..22691893 --- /dev/null +++ b/code/common/message-queue/msgstate.svg @@ -0,0 +1,4 @@ + + + +
If the message is not
acknowledged, it may
be declared dead after
TTL
If the message is not...
Inbox acknowledges the message
Inbox acknowledges the message
New
New
Message processing
failed
Message processing...
If the message doesn't
finish within TTL it will
be marked as dead
If the message doesn't...
Message processed
OK, sender may
receive a reply in their
inbox
Message processed...
Ack
Ack
Ok
Ok
Err
Err
Dead
Dead
Terminal States
Terminal S...
Intermediate States
Intermedia...
Initial State
Initial St...

Message States

Messages pass through several states during their lifecycle

Message States...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/code/common/message-queue/readme.md b/code/common/message-queue/readme.md new file mode 100644 index 00000000..d71459dd --- /dev/null +++ b/code/common/message-queue/readme.md @@ -0,0 +1,100 @@ +# Message Queue + +Implements resilient message queueing for the application, +as well as a finite state machine library backed by the +message queue that enables long-running tasks that outlive +the execution lifespan of the involved processes. + +![Message States](msgstate.svg) + +The message queue is interacted with via the Inbox and Outbox classes. + +There are three types of inboxes: + +Name|Description +---|--- +MqSingleShotInbox|A single message is received and then the inbox is closed. +MqAsynchronousInbox|Messages are received asynchronously and can be processed in parallel. +MqSynchronousInbox|Messages are received synchronously and will be processed in order; message processing can be aborted. + +A single outbox implementation exists, the `MqOutbox`, which implements multiple message sending strategies, +including blocking and asynchronous paradigms. Lower level access to the message queue itself is provided by the `MqPersistence` class. + +The inbox implementations as well as the outbox can be constructed via the `MessageQueueFactory` class. + +## Message Queue State Machine (MQSM) + +The MQSM is a finite state machine backed by the message queue; it is used to implement an Actor-style paradigm. + +The machine itself is defined through a class that extends `AbstractStateGraph`, with the states, their names +and their transitions declared on annotated methods. 
+ +Example: + +```java +class ExampleStateMachine extends AbstractStateGraph { + + @GraphState(name = "INITIAL", next="GREET") + public String initial() { + return "World"; // passed to the next state + } + + @GraphState(name = "GREET", next="COUNT-TO-FIVE") + public void greet(String name) { + System.out.println("Hello " + name); + } + + @GraphState(name = "COUNT-TO-FIVE", next="END") + public void countToFive(Integer value) { + // value is passed from the previous state, since greet didn't pass a value, + // null will be the default. + + if (null == value) { + // jumps to the current state with a value of 0 + transition("COUNT-TO-FIVE", 0); + } + + + System.out.println(++value); + if (value < 5) { + // Loops the current state until value = 5 + transition("COUNT-TO-FIVE", value); + } + + if (value > 5) { + // demonstrates an error condition + error("Illegal value"); + } + + // Default transition is to END + } + + @GraphState(name="END") + public void end() { + System.out.println("Done"); + } +} +``` + +Each method should ideally be idempotent, or at least be able to handle being called multiple times. +It cannot be assumed that the states are invoked within the same process, or even on the same machine, +on the same day, etc. + +The usual guidelines for writing deterministic Java code apply wherever possible: +keep all state local, don't iterate over hash maps, etc. + +### Create a state machine +To create an ActorStateMachine from the above class, the following code can be used: + +```java +ActorStateMachine actorStateMachine = new ActorStateMachine( + messageQueueFactory, + actorInboxName, + actorInstanceUUID, + new ExampleStateMachine()); + +actorStateMachine.start(); +``` + +The state machine will now run until it reaches the end state +and listen to messages on the inbox for state transitions. 
diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/MessageQueueFactory.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/MessageQueueFactory.java new file mode 100644 index 00000000..bc664d38 --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/MessageQueueFactory.java @@ -0,0 +1,44 @@ +package nu.marginalia.mq; + +import nu.marginalia.mq.inbox.MqAsynchronousInbox; +import nu.marginalia.mq.inbox.MqInboxIf; +import nu.marginalia.mq.inbox.MqSingleShotInbox; +import nu.marginalia.mq.inbox.MqSynchronousInbox; +import nu.marginalia.mq.outbox.MqOutbox; +import nu.marginalia.mq.persistence.MqPersistence; + +import javax.inject.Inject; +import javax.inject.Singleton; +import java.util.UUID; + +@Singleton +public class MessageQueueFactory { + private final MqPersistence persistence; + + @Inject + public MessageQueueFactory(MqPersistence persistence) { + this.persistence = persistence; + } + + public MqSingleShotInbox createSingleShotInbox(String inboxName, UUID instanceUUID) + { + return new MqSingleShotInbox(persistence, inboxName, instanceUUID); + } + + + public MqAsynchronousInbox createAsynchronousInbox(String inboxName, UUID instanceUUID) + { + return new MqAsynchronousInbox(persistence, inboxName, instanceUUID); + } + + public MqSynchronousInbox createSynchronousInbox(String inboxName, UUID instanceUUID) + { + return new MqSynchronousInbox(persistence, inboxName, instanceUUID); + } + + + public MqOutbox createOutbox(String inboxName, String outboxName, UUID instanceUUID) + { + return new MqOutbox(persistence, inboxName, outboxName, instanceUUID); + } +} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/MqException.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/MqException.java new file mode 100644 index 00000000..351f60d7 --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/MqException.java @@ -0,0 +1,11 @@ +package nu.marginalia.mq; + +public class 
MqException extends Exception { + public MqException(String message) { + super(message); + } + + public MqException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/MqMessage.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/MqMessage.java new file mode 100644 index 00000000..df0c4839 --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/MqMessage.java @@ -0,0 +1,11 @@ +package nu.marginalia.mq; + +public record MqMessage( + long msgId, + long relatedId, + String function, + String payload, + MqMessageState state, + boolean expectsResponse +) { +} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/MqMessageState.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/MqMessageState.java new file mode 100644 index 00000000..94f7411b --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/MqMessageState.java @@ -0,0 +1,14 @@ +package nu.marginalia.mq; + +public enum MqMessageState { + /** The message is new and has not yet been acknowledged by the recipient */ + NEW, + /** The message has been acknowledged by the recipient */ + ACK, + /** The message has been processed successfully by the recipient */ + OK, + /** The message processing has failed */ + ERR, + /** The message did not reach a terminal state within the TTL */ + DEAD +} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqAsynchronousInbox.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqAsynchronousInbox.java new file mode 100644 index 00000000..94fa82f6 --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqAsynchronousInbox.java @@ -0,0 +1,226 @@ +package nu.marginalia.mq.inbox; + +import nu.marginalia.mq.MqMessage; +import nu.marginalia.mq.MqMessageState; +import nu.marginalia.mq.persistence.MqPersistence; +import org.slf4j.Logger; +import 
org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.List; +import java.sql.SQLException; +import java.util.Collection; +import java.util.UUID; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.TimeUnit; + +/** Message queue inbox that spawns news threads for each message */ +public class MqAsynchronousInbox implements MqInboxIf { + private final Logger logger = LoggerFactory.getLogger(MqAsynchronousInbox.class); + + private final String inboxName; + private final String instanceUUID; + private final ExecutorService threadPool; + private final MqPersistence persistence; + + private volatile boolean run = true; + + private final int pollIntervalMs = Integer.getInteger("mq.inbox.poll-interval-ms", 100); + private final int maxPollCount = Integer.getInteger("mq.inbox.max-poll-count", 10); + private final List eventSubscribers = new ArrayList<>(); + private final LinkedBlockingQueue queue = new LinkedBlockingQueue<>(32); + + private Thread pollDbThread; + private Thread notifyThread; + + public MqAsynchronousInbox(MqPersistence persistence, + String inboxName, + UUID instanceUUID) + { + this(persistence, inboxName, instanceUUID, Executors.newCachedThreadPool()); + } + + public MqAsynchronousInbox(MqPersistence persistence, + String inboxName, + UUID instanceUUID, + ExecutorService executorService) + { + this.threadPool = executorService; + this.persistence = persistence; + this.inboxName = inboxName; + this.instanceUUID = instanceUUID.toString(); + } + + /** Subscribe to messages on this inbox. Must be run before start()! */ + @Override + public void subscribe(MqSubscription subscription) { + eventSubscribers.add(subscription); + } + + /** Start receiving messages.

+ * Note: Subscribe to messages before calling this method. + *

*/ + @Override + public void start() { + run = true; + + if (eventSubscribers.isEmpty()) { + logger.error("No subscribers for inbox {}, registering shredder", inboxName); + } + + // Add a final handler that fails any message that is not handled + eventSubscribers.add(new MqInboxShredder()); + + pollDbThread = new Thread(this::pollDb, "mq-inbox-update-thread:"+inboxName); + pollDbThread.setDaemon(true); + pollDbThread.start(); + + notifyThread = new Thread(this::notifySubscribers, "mq-inbox-notify-thread:"+inboxName); + notifyThread.setDaemon(true); + notifyThread.start(); + } + + /** Stop receiving messages and shut down all threads */ + @Override + public void stop() throws InterruptedException { + if (!run) + return; + + logger.info("Shutting down inbox {}", inboxName); + + run = false; + pollDbThread.join(); + notifyThread.join(); + + threadPool.shutdownNow(); + + while (!threadPool.awaitTermination(5, TimeUnit.SECONDS)); + } + + private void notifySubscribers() { + try { + while (run) { + + MqMessage msg = queue.poll(pollIntervalMs, TimeUnit.MILLISECONDS); + + if (msg == null) + continue; + + logger.info("Notifying subscribers of message {}", msg.msgId()); + + boolean handled = false; + + for (var eventSubscriber : eventSubscribers) { + if (eventSubscriber.filter(msg)) { + handleMessageWithSubscriber(eventSubscriber, msg); + handled = true; + break; + } + } + + if (!handled) { + logger.error("No subscriber wanted to handle message {}", msg.msgId()); + } + } + } + catch (InterruptedException ex) { + logger.error("MQ inbox notify thread interrupted", ex); + } + } + + private void handleMessageWithSubscriber(MqSubscription subscriber, MqMessage msg) { + + if (msg.expectsResponse()) { + threadPool.execute(() -> respondToMessage(subscriber, msg)); + } + else { + threadPool.execute(() -> acknowledgeNotification(subscriber, msg)); + } + } + + private void respondToMessage(MqSubscription subscriber, MqMessage msg) { + try { + final var rsp = subscriber.onRequest(msg); 
+ sendResponse(msg, rsp.state(), rsp.message()); + } catch (Exception ex) { + logger.error("Message Queue subscriber threw exception", ex); + sendResponse(msg, MqMessageState.ERR); + } + } + + private void acknowledgeNotification(MqSubscription subscriber, MqMessage msg) { + try { + subscriber.onNotification(msg); + updateMessageState(msg, MqMessageState.OK); + } catch (Exception ex) { + logger.error("Message Queue subscriber threw exception", ex); + updateMessageState(msg, MqMessageState.ERR); + } + } + + private void sendResponse(MqMessage msg, MqMessageState state) { + try { + persistence.updateMessageState(msg.msgId(), state); + } + catch (SQLException ex) { + logger.error("Failed to update message state", ex); + } + } + + private void updateMessageState(MqMessage msg, MqMessageState state) { + try { + persistence.updateMessageState(msg.msgId(), state); + } + catch (SQLException ex2) { + logger.error("Failed to update message state", ex2); + } + } + + private void sendResponse(MqMessage msg, MqMessageState mqMessageState, String response) { + try { + persistence.sendResponse(msg.msgId(), mqMessageState, response); + } + catch (SQLException ex) { + logger.error("Failed to update message state", ex); + } + } + + private void pollDb() { + try { + for (long tick = 1; run; tick++) { + + queue.addAll(pollInbox(tick)); + + TimeUnit.MILLISECONDS.sleep(pollIntervalMs); + } + } + catch (InterruptedException ex) { + logger.error("MQ inbox update thread interrupted", ex); + } + } + + private Collection pollInbox(long tick) { + try { + return persistence.pollInbox(inboxName, instanceUUID, tick, maxPollCount); + } + catch (SQLException ex) { + logger.error("Failed to poll inbox", ex); + return List.of(); + } + } + + /** Retrieve the last N messages from the inbox. 
*/ + @Override + public List replay(int lastN) { + try { + return persistence.lastNMessages(inboxName, lastN); + } + catch (SQLException ex) { + logger.error("Failed to replay inbox", ex); + return List.of(); + } + } + +} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInboxIf.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInboxIf.java new file mode 100644 index 00000000..b317a1c5 --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInboxIf.java @@ -0,0 +1,15 @@ +package nu.marginalia.mq.inbox; + +import nu.marginalia.mq.MqMessage; + +import java.util.List; + +public interface MqInboxIf { + void subscribe(MqSubscription subscription); + + void start(); + + void stop() throws InterruptedException; + + List replay(int lastN); +} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInboxResponse.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInboxResponse.java new file mode 100644 index 00000000..ba4eb6f2 --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInboxResponse.java @@ -0,0 +1,22 @@ +package nu.marginalia.mq.inbox; + +import nu.marginalia.mq.MqMessageState; + +public record MqInboxResponse(String message, MqMessageState state) { + + public static MqInboxResponse ok(String message) { + return new MqInboxResponse(message, MqMessageState.OK); + } + + public static MqInboxResponse ok() { + return new MqInboxResponse("", MqMessageState.OK); + } + + public static MqInboxResponse err(String message) { + return new MqInboxResponse(message, MqMessageState.ERR); + } + + public static MqInboxResponse err() { + return new MqInboxResponse("", MqMessageState.ERR); + } +} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInboxShredder.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInboxShredder.java new file mode 100644 index 00000000..18c346f2 --- 
/dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqInboxShredder.java @@ -0,0 +1,29 @@ +package nu.marginalia.mq.inbox; + +import nu.marginalia.mq.MqMessage; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +class MqInboxShredder implements MqSubscription { + + private final Logger logger = LoggerFactory.getLogger(getClass()); + + public MqInboxShredder() { + } + + @Override + public boolean filter(MqMessage rawMessage) { + return true; + } + + @Override + public MqInboxResponse onRequest(MqMessage msg) { + logger.warn("Unhandled message {}", msg.msgId()); + return MqInboxResponse.err(); + } + + @Override + public void onNotification(MqMessage msg) { + logger.warn("Unhandled message {}", msg.msgId()); + } +} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSingleShotInbox.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSingleShotInbox.java new file mode 100644 index 00000000..19645c64 --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSingleShotInbox.java @@ -0,0 +1,100 @@ +package nu.marginalia.mq.inbox; + +import lombok.SneakyThrows; +import nu.marginalia.mq.MqMessage; +import nu.marginalia.mq.MqMessageState; +import nu.marginalia.mq.persistence.MqPersistence; + +import java.sql.SQLException; +import java.util.Optional; +import java.util.UUID; +import java.util.concurrent.TimeUnit; +import java.util.function.Predicate; + +/** A single-shot inbox that can be used to wait for a single message + * to arrive in an inbox, and then reply to that message + */ +public class MqSingleShotInbox { + + private final String inboxName; + private final String instanceUUID; + private final MqPersistence persistence; + + public MqSingleShotInbox(MqPersistence persistence, + String inboxName, + UUID instanceUUID + ) { + this.inboxName = inboxName; + this.instanceUUID = instanceUUID.toString(); + this.persistence = persistence; + } + + /** Wait for a 
message to arrive in the specified inbox, up to the specified timeout. + * + * @param timeout The timeout + * @param unit The time unit + * @return The message, or empty if no message arrived before the timeout + */ + public Optional waitForMessage(long timeout, TimeUnit unit) throws InterruptedException, SQLException { + final long deadline = System.currentTimeMillis() + unit.toMillis(timeout); + + for (int i = 0;; i++) { + if (System.currentTimeMillis() >= deadline) { + return Optional.empty(); + } + + var messages = persistence.pollInbox(inboxName, instanceUUID, i, 1); + + if (messages.size() > 0) { + return Optional.of(messages.iterator().next()); + } + + TimeUnit.SECONDS.sleep(1); + } + } + + + /** Steal a message from the inbox, and change the owner to this instance. This is useful + * for resuming an aborted process. This should be done judiciously, only in cases we're certain + * that the original owner is no longer running as it may cause duplicate processing, race + * conditions, etc. + *

+ * @param predicate A predicate that must be true for the message to be stolen + * @return The stolen message, or empty if no message was stolen + */ + @SneakyThrows + public Optional stealMessage(Predicate predicate) { + for (var message : persistence.eavesdrop(inboxName, 5)) { + if (predicate.test(message)) { + persistence.changeOwner(message.msgId(), instanceUUID, -1); + return Optional.of(message); + } + } + + return Optional.empty(); + } + + /** Send a response to the specified message. If the original message has no response inbox, + * the original message will be marked as OK instead. + * + * @param originalMessage The original message + * @param response The response + */ + public void sendResponse(MqMessage originalMessage, MqInboxResponse response) { + try { + if (!originalMessage.expectsResponse()) { + // If the original message doesn't expect a response, we can just mark it as OK, + // since the sendResponse method will fail explosively since it can't insert a response + // to a non-existent inbox. + + persistence.updateMessageState(originalMessage.msgId(), MqMessageState.OK); + } + else { + persistence.sendResponse(originalMessage.msgId(), response.state(), response.message()); + } + } catch (SQLException e) { + throw new RuntimeException(e); + } + } + +} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSubscription.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSubscription.java new file mode 100644 index 00000000..417b7b35 --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSubscription.java @@ -0,0 +1,14 @@ +package nu.marginalia.mq.inbox; + +import nu.marginalia.mq.MqMessage; + +public interface MqSubscription { + /** Return true if this subscription should handle the message. */ + boolean filter(MqMessage rawMessage); + + /** Handle the message and return a response. 
*/ + MqInboxResponse onRequest(MqMessage msg); + + /** Handle a message with no reply address */ + void onNotification(MqMessage msg); +} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSynchronousInbox.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSynchronousInbox.java new file mode 100644 index 00000000..09749209 --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/inbox/MqSynchronousInbox.java @@ -0,0 +1,222 @@ +package nu.marginalia.mq.inbox; + +import nu.marginalia.mq.MqMessage; +import nu.marginalia.mq.MqMessageState; +import nu.marginalia.mq.persistence.MqPersistence; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.sql.SQLException; +import java.util.*; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; + +/** Message queue inbox that responds to a single message at a time + * within the polling thread + */ +public class MqSynchronousInbox implements MqInboxIf { + private final Logger logger = LoggerFactory.getLogger(MqSynchronousInbox.class); + + private final String inboxName; + private final String instanceUUID; + private final MqPersistence persistence; + + private volatile boolean run = true; + + private final int pollIntervalMs = Integer.getInteger("mq.inbox.poll-interval-ms", 100); + private final List eventSubscribers = new ArrayList<>(); + + private Thread pollDbThread; + private ExecutorService executorService = Executors.newSingleThreadExecutor(); + + public MqSynchronousInbox(MqPersistence persistence, + String inboxName, + UUID instanceUUID) + { + this.persistence = persistence; + this.inboxName = inboxName; + this.instanceUUID = instanceUUID.toString(); + } + + /** Subscribe to messages on this inbox. Must be run before start()! 
*/ + @Override + public void subscribe(MqSubscription subscription) { + eventSubscribers.add(subscription); + } + + /** Start receiving messages.

+ * Note: Subscribe to messages before calling this method. + *

*/ + @Override + public void start() { + run = true; + + if (eventSubscribers.isEmpty()) { + logger.error("No subscribers for inbox {}, registering shredder", inboxName); + } + + // Add a final handler that fails any message that is not handled + eventSubscribers.add(new MqInboxShredder()); + + pollDbThread = new Thread(this::pollDb, "mq-inbox-update-thread:"+inboxName); + pollDbThread.setDaemon(true); + pollDbThread.start(); + } + + /** Stop receiving messages and shut down all threads */ + @Override + public void stop() throws InterruptedException { + if (!run) + return; + + logger.info("Shutting down inbox {}", inboxName); + + run = false; + pollDbThread.join(); + executorService.shutdown(); + executorService.awaitTermination(10, TimeUnit.SECONDS); + + } + + private void handleMessageWithSubscriber(MqSubscription subscriber, MqMessage msg) { + + if (msg.expectsResponse()) { + respondToMessage(subscriber, msg); + } + else { + acknowledgeNotification(subscriber, msg); + } + } + + private void respondToMessage(MqSubscription subscriber, MqMessage msg) { + try { + final var rsp = subscriber.onRequest(msg); + sendResponse(msg, rsp.state(), rsp.message()); + } catch (Exception ex) { + logger.error("Message Queue subscriber threw exception", ex); + sendResponse(msg, MqMessageState.ERR); + } + } + + private void acknowledgeNotification(MqSubscription subscriber, MqMessage msg) { + try { + subscriber.onNotification(msg); + updateMessageState(msg, MqMessageState.OK); + } + catch (Exception ex) { + logger.error("Message Queue subscriber threw exception", ex); + updateMessageState(msg, MqMessageState.ERR); + } + } + + private void sendResponse(MqMessage msg, MqMessageState state) { + try { + persistence.updateMessageState(msg.msgId(), state); + } + catch (SQLException ex) { + logger.error("Failed to update message state", ex); + } + } + + private void updateMessageState(MqMessage msg, MqMessageState state) { + try { + persistence.updateMessageState(msg.msgId(), state); + } 
+ catch (SQLException ex2) { + logger.error("Failed to update message state", ex2); + } + } + + private void sendResponse(MqMessage msg, MqMessageState mqMessageState, String response) { + try { + persistence.sendResponse(msg.msgId(), mqMessageState, response); + } + catch (SQLException ex) { + logger.error("Failed to update message state", ex); + } + } + + private volatile java.util.concurrent.Future currentTask = null; + public void pollDb() { + try { + for (long tick = 1; run; tick++) { + + var messages = pollInbox(tick); + + for (var msg : messages) { + // Handle message in a separate thread but wait for that thread, so we can interrupt that thread + // without interrupting the polling thread and shutting down the inbox completely + try { + currentTask = executorService.submit(() -> handleMessage(msg)); + currentTask.get(); + } + catch (Exception ex) { + logger.error("Inbox task was aborted"); + } + finally { + currentTask = null; + } + } + + if (messages.isEmpty()) { + TimeUnit.MILLISECONDS.sleep(pollIntervalMs); + } + } + } + catch (InterruptedException ex) { + logger.error("MQ inbox update thread interrupted", ex); + } + } + + /** Attempt to abort the current task using an interrupt */ + public void abortCurrentTask() { + var task = currentTask; // capture the value to avoid race conditions with the + // polling thread between the check and the interrupt + if (task != null) { + task.cancel(true); + } + } + + + private void handleMessage(MqMessage msg) { + logger.info("Notifying subscribers of msg {}", msg.msgId()); + + boolean handled = false; + + for (var eventSubscriber : eventSubscribers) { + if (eventSubscriber.filter(msg)) { + handleMessageWithSubscriber(eventSubscriber, msg); + handled = true; + break; + } + } + + if (!handled) { + logger.error("No subscriber wanted to handle msg {}", msg.msgId()); + } + } + + private Collection pollInbox(long tick) { + try { + return persistence.pollInbox(inboxName, instanceUUID, tick, 1); + } + catch (SQLException 
ex) { + logger.error("Failed to poll inbox", ex); + return List.of(); + } + } + + /** Retrieve the last N messages from the inbox. */ + @Override + public List replay(int lastN) { + try { + return persistence.lastNMessages(inboxName, lastN); + } + catch (SQLException ex) { + logger.error("Failed to replay inbox", ex); + return List.of(); + } + } + +} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java b/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java new file mode 100644 index 00000000..40022c11 --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/outbox/MqOutbox.java @@ -0,0 +1,172 @@ +package nu.marginalia.mq.outbox; + +import nu.marginalia.mq.MqMessage; +import nu.marginalia.mq.MqMessageState; +import nu.marginalia.mq.persistence.MqPersistence; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.sql.SQLException; +import java.util.Optional; +import java.util.UUID; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; + +public class MqOutbox { + private final Logger logger = LoggerFactory.getLogger(MqOutbox.class); + private final MqPersistence persistence; + private final String inboxName; + private final String replyInboxName; + private final String instanceUUID; + + private final ConcurrentHashMap pendingResponses = new ConcurrentHashMap<>(); + + private final int pollIntervalMs = Integer.getInteger("mq.outbox.poll-interval-ms", 100); + private final int maxPollCount = Integer.getInteger("mq.outbox.max-poll-count", 10); + private final Thread pollThread; + + private volatile boolean run = true; + + public MqOutbox(MqPersistence persistence, + String inboxName, + String outboxName, + UUID instanceUUID) { + this.persistence = persistence; + + this.inboxName = inboxName; + this.replyInboxName = outboxName + "//" + inboxName; + this.instanceUUID = instanceUUID.toString(); 
+ + pollThread = new Thread(this::poll, "mq-outbox-poll-thread:" + inboxName); + pollThread.setDaemon(true); + pollThread.start(); + } + + public void stop() throws InterruptedException { + if (!run) + return; + + logger.info("Shutting down outbox {}", inboxName); + + run = false; + pollThread.join(); + } + + private void poll() { + try { + for (long id = 1; run; id++) { + pollDb(id); + + TimeUnit.MILLISECONDS.sleep(pollIntervalMs); + } + } catch (InterruptedException ex) { + logger.error("Outbox poll thread interrupted", ex); + } + } + + private void pollDb(long tick) { + try { + var updates = persistence.pollReplyInbox(replyInboxName, instanceUUID, tick, maxPollCount); + + for (var message : updates) { + pendingResponses.put(message.relatedId(), message); + } + + if (updates.isEmpty()) + return; + + logger.info("Notifying {} pending responses", pendingResponses.size()); + + synchronized (pendingResponses) { + pendingResponses.notifyAll(); + } + } + catch (SQLException ex) { + logger.error("Failed to poll inbox", ex); + } + + } + + /** Send a message and wait for a response. */ + public MqMessage send(String function, String payload) throws Exception { + final long id = sendAsync(function, payload); + + return waitResponse(id); + } + + /** Send a message asynchronously, without waiting for a response. + *
+ * Use waitResponse(id) or pollResponse(id) to fetch the response. */ + public long sendAsync(String function, String payload) throws Exception { + return persistence.sendNewMessage(inboxName, replyInboxName, null, function, payload, null); + } + + /** Blocks until a response arrives for the given message id (possibly forever) */ + public MqMessage waitResponse(long id) throws Exception { + synchronized (pendingResponses) { + while (!pendingResponses.containsKey(id)) { + pendingResponses.wait(100); + } + + var msg = pendingResponses.remove(id); + // Mark the response as OK so it can be cleaned up + persistence.updateMessageState(msg.msgId(), MqMessageState.OK); + + return msg; + } + } + + + /** Blocks until a response arrives for the given message id or the timeout passes. + *

+ * @throws TimeoutException if the timeout passes before a response arrives. + * @throws InterruptedException if the thread is interrupted while waiting. + */ + public MqMessage waitResponse(long id, int timeout, TimeUnit unit) throws TimeoutException, SQLException, InterruptedException { + long deadline = System.currentTimeMillis() + unit.toMillis(timeout); + + synchronized (pendingResponses) { + while (!pendingResponses.containsKey(id)) { + if (System.currentTimeMillis() > deadline) + throw new TimeoutException("Timeout waiting for response"); + + pendingResponses.wait(100); + } + + var msg = pendingResponses.remove(id); + // Mark the response as OK so it can be cleaned up + persistence.updateMessageState(msg.msgId(), MqMessageState.OK); + + return msg; + } + } + + /** Polls for a response for the given message id. */ + public Optional pollResponse(long id) throws SQLException { + // no need to sync here if we aren't going to wait() + var response = pendingResponses.get(id); + + if (response != null) { + // Mark the response as OK so it can be cleaned up + persistence.updateMessageState(response.msgId(), MqMessageState.OK); + } + return Optional.ofNullable(response); + } + + public long sendNotice(String function, String payload) throws Exception { + return persistence.sendNewMessage(inboxName, null, null, function, payload, null); + } + public long sendNotice(long relatedId, String function, String payload) throws Exception { + return persistence.sendNewMessage(inboxName, null, relatedId, function, payload, null); + } + + public void flagAsBad(long id) throws SQLException { + persistence.updateMessageState(id, MqMessageState.ERR); + } + + public void flagAsDead(long id) throws SQLException { + persistence.updateMessageState(id, MqMessageState.DEAD); + } + +} \ No newline at end of file diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java 
b/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java new file mode 100644 index 00000000..68fb2f83 --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mq/persistence/MqPersistence.java @@ -0,0 +1,487 @@ +package nu.marginalia.mq.persistence; + +import com.google.common.collect.Lists; +import com.google.inject.Inject; +import com.google.inject.Singleton; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.mq.MqMessageState; +import nu.marginalia.mq.MqMessage; + +import javax.annotation.Nullable; +import java.sql.SQLException; +import java.time.Duration; +import java.util.*; + +import static nu.marginalia.mq.MqMessageState.NEW; + +/** A persistence layer for the message queue. + *

+ * All storage operations must be done through this class. + */ +@Singleton +public class MqPersistence { + private final HikariDataSource dataSource; + + @Inject + public MqPersistence(HikariDataSource dataSource) { + this.dataSource = dataSource; + } + + /** + * Adds a new message to the message queue. + * + * @param recipientInboxName The recipient's inbox name + * @param senderInboxName (nullable) The sender's inbox name. Only needed if a reply is expected. If null, the message is not expected to be replied to. + * @param relatedMessageId (nullable) The id of the message this message is related to. If null, the message is not related to any other message. + * @param function The function to call + * @param payload The payload to send, typically JSON. + * @param ttl (nullable) The time to live of the message, in seconds. If null, the message will never set to DEAD. + * @return The id of the message + */ + public long sendNewMessage(String recipientInboxName, + @Nullable + String senderInboxName, + Long relatedMessageId, + String function, + String payload, + @Nullable Duration ttl + ) throws Exception { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + INSERT INTO MESSAGE_QUEUE(RECIPIENT_INBOX, SENDER_INBOX, RELATED_ID, FUNCTION, PAYLOAD, TTL) + VALUES(?, ?, ?, ?, ?, ?) 
+ """); + var lastIdQuery = conn.prepareStatement("SELECT LAST_INSERT_ID()")) { + + stmt.setString(1, recipientInboxName); + + if (senderInboxName == null) stmt.setNull(2, java.sql.Types.VARCHAR); + else stmt.setString(2, senderInboxName); + + // Translate null to -1, as 0 is a valid id + stmt.setLong(3, Objects.requireNonNullElse(relatedMessageId, -1L)); + + stmt.setString(4, function); + stmt.setString(5, payload); + if (ttl == null) stmt.setNull(6, java.sql.Types.BIGINT); + else stmt.setLong(6, ttl.toSeconds()); + + stmt.executeUpdate(); + + if (!conn.getAutoCommit()) + conn.commit(); + + var rsp = lastIdQuery.executeQuery(); + + if (!rsp.next()) { + throw new IllegalStateException("No last insert id"); + } + + return rsp.getLong(1); + } + } + + /** Modifies the state of a message by id. + *

+ * If the state is 'NEW', ownership information will be stripped to avoid creating + * a broken message that can't be dequeued because it has an owner. + * + * @param id The id of the message + * @param mqMessageState The new state + * */ + public void updateMessageState(long id, MqMessageState mqMessageState) throws SQLException { + if (NEW == mqMessageState) { + reinitializeMessage(id); + return; + } + + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + UPDATE MESSAGE_QUEUE + SET STATE=?, UPDATED_TIME=CURRENT_TIMESTAMP(6) + WHERE ID=? + """)) { + + stmt.setString(1, mqMessageState.name()); + stmt.setLong(2, id); + + if (stmt.executeUpdate() != 1) { + throw new IllegalArgumentException("No rows updated"); + } + + if (!conn.getAutoCommit()) + conn.commit(); + } + } + + /** Sets the message to 'NEW' state and removes any owner */ + private void reinitializeMessage(long id) throws SQLException { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + UPDATE MESSAGE_QUEUE + SET STATE='NEW', + OWNER_INSTANCE=NULL, + OWNER_TICK=NULL, + UPDATED_TIME=CURRENT_TIMESTAMP(6) + WHERE ID=? + """)) { + + stmt.setLong(1, id); + + if (stmt.executeUpdate() != 1) { + throw new IllegalArgumentException("No rows updated"); + } + + if (!conn.getAutoCommit()) + conn.commit(); + } + } + + /** Creates a new message in the queue referencing as a reply to an existing message + * This message will have it's RELATED_ID set to the original message's ID. + */ + public long sendResponse(long id, MqMessageState mqMessageState, String message) throws SQLException { + try (var conn = dataSource.getConnection()) { + conn.setAutoCommit(false); + + try (var updateState = conn.prepareStatement(""" + UPDATE MESSAGE_QUEUE + SET STATE=?, UPDATED_TIME=CURRENT_TIMESTAMP(6) + WHERE ID=? 
+ """); + var addResponse = conn.prepareStatement(""" + INSERT INTO MESSAGE_QUEUE(RECIPIENT_INBOX, RELATED_ID, FUNCTION, PAYLOAD) + SELECT SENDER_INBOX, ID, ?, ? + FROM MESSAGE_QUEUE + WHERE ID=? AND SENDER_INBOX IS NOT NULL + """); + var lastIdQuery = conn.prepareStatement("SELECT LAST_INSERT_ID()") + ) { + + updateState.setString(1, mqMessageState.name()); + updateState.setLong(2, id); + if (updateState.executeUpdate() != 1) { + throw new IllegalArgumentException("No rows updated"); + } + + addResponse.setString(1, "REPLY"); + addResponse.setString(2, message); + addResponse.setLong(3, id); + if (addResponse.executeUpdate() != 1) { + throw new IllegalArgumentException("No rows updated"); + } + + var rsp = lastIdQuery.executeQuery(); + if (!rsp.next()) { + throw new IllegalStateException("No last insert id"); + } + long newId = rsp.getLong(1); + + conn.commit(); + + return newId; + } catch (SQLException|IllegalStateException|IllegalArgumentException ex) { + conn.rollback(); + throw ex; + } finally { + conn.setAutoCommit(true); + } + } + } + + + /** Marks unclaimed messages addressed to this inbox with instanceUUID and tick, + * then returns the number of messages marked. This is an atomic operation that + * ensures that messages aren't double processed. + */ + private int markInboxMessages(String inboxName, String instanceUUID, long tick, int n) throws SQLException { + try (var conn = dataSource.getConnection(); + var updateStmt = conn.prepareStatement(""" + UPDATE MESSAGE_QUEUE + SET OWNER_INSTANCE=?, OWNER_TICK=?, UPDATED_TIME=CURRENT_TIMESTAMP(6), STATE='ACK' + WHERE RECIPIENT_INBOX=? + AND OWNER_INSTANCE IS NULL AND STATE='NEW' + ORDER BY ID ASC + LIMIT ? 
+ """); + ) { + updateStmt.setString(1, instanceUUID); + updateStmt.setLong(2, tick); + updateStmt.setString(3, inboxName); + updateStmt.setInt(4, n); + var ret = updateStmt.executeUpdate(); + if (!conn.getAutoCommit()) + conn.commit(); + return ret; + } + } + + /** Return up to n unprocessed messages from the specified inbox that are in states 'NEW' or 'ACK' + * without updating their ownership information + */ + public Collection eavesdrop(String inboxName, int n) throws SQLException { + try (var conn = dataSource.getConnection(); + var queryStmt = conn.prepareStatement(""" + SELECT + ID, + RELATED_ID, + FUNCTION, + PAYLOAD, + STATE, + SENDER_INBOX IS NOT NULL AS EXPECTS_RESPONSE + FROM MESSAGE_QUEUE + WHERE STATE IN ('NEW', 'ACK') + AND RECIPIENT_INBOX=? + LIMIT ? + """) + ) + { + queryStmt.setString(1, inboxName); + queryStmt.setInt(2, n); + var rs = queryStmt.executeQuery(); + + List messages = new ArrayList<>(n); + + while (rs.next()) { + long msgId = rs.getLong("ID"); + long relatedId = rs.getLong("RELATED_ID"); + + String function = rs.getString("FUNCTION"); + String payload = rs.getString("PAYLOAD"); + + MqMessageState state = MqMessageState.valueOf(rs.getString("STATE")); + boolean expectsResponse = rs.getBoolean("EXPECTS_RESPONSE"); + + var msg = new MqMessage(msgId, relatedId, function, payload, state, expectsResponse); + + messages.add(msg); + } + + return messages; + } + + } + + /** Returns the message with the specified ID + * + * @throws SQLException if there is a problem with the database + * @throws IllegalArgumentException if the message doesn't exist + */ + public MqMessage getMessage(long id) throws SQLException { + try (var conn = dataSource.getConnection(); + var queryStmt = conn.prepareStatement(""" + SELECT + ID, + RELATED_ID, + FUNCTION, + PAYLOAD, + STATE, + SENDER_INBOX IS NOT NULL AS EXPECTS_RESPONSE + FROM MESSAGE_QUEUE + WHERE ID=? 
+ """) + ) + { + queryStmt.setLong(1, id); + var rs = queryStmt.executeQuery(); + + if (rs.next()) { + long msgId = rs.getLong("ID"); + long relatedId = rs.getLong("RELATED_ID"); + + String function = rs.getString("FUNCTION"); + String payload = rs.getString("PAYLOAD"); + + MqMessageState state = MqMessageState.valueOf(rs.getString("STATE")); + boolean expectsResponse = rs.getBoolean("EXPECTS_RESPONSE"); + + return new MqMessage(msgId, relatedId, function, payload, state, expectsResponse); + } + } + + throw new IllegalArgumentException("No message with id " + id); + } + /** Marks unclaimed messages addressed to this inbox with instanceUUID and tick, + * then returns these messages. + */ + public Collection pollInbox(String inboxName, String instanceUUID, long tick, int n) throws SQLException { + + // Mark new messages as claimed + int expected = markInboxMessages(inboxName, instanceUUID, tick, n); + if (expected == 0) { + return Collections.emptyList(); + } + + // Then fetch the messages that were marked + try (var conn = dataSource.getConnection(); + var queryStmt = conn.prepareStatement(""" + SELECT + ID, + RELATED_ID, + FUNCTION, + PAYLOAD, + STATE, + SENDER_INBOX IS NOT NULL AS EXPECTS_RESPONSE + FROM MESSAGE_QUEUE + WHERE OWNER_INSTANCE=? AND OWNER_TICK=? 
+ """) + ) { + queryStmt.setString(1, instanceUUID); + queryStmt.setLong(2, tick); + var rs = queryStmt.executeQuery(); + + List messages = new ArrayList<>(expected); + + while (rs.next()) { + long msgId = rs.getLong("ID"); + long relatedId = rs.getLong("RELATED_ID"); + + String function = rs.getString("FUNCTION"); + String payload = rs.getString("PAYLOAD"); + + MqMessageState state = MqMessageState.valueOf(rs.getString("STATE")); + boolean expectsResponse = rs.getBoolean("EXPECTS_RESPONSE"); + + var msg = new MqMessage(msgId, relatedId, function, payload, state, expectsResponse); + + messages.add(msg); + } + + return messages; + } + + } + + + /** Marks unclaimed messages addressed to this inbox with instanceUUID and tick, + * then returns these messages. + */ + public Collection pollReplyInbox(String inboxName, String instanceUUID, long tick, int n) throws SQLException { + + // Mark new messages as claimed + int expected = markInboxMessages(inboxName, instanceUUID, tick, n); + if (expected == 0) { + return Collections.emptyList(); + } + + // Then fetch the messages that were marked + try (var conn = dataSource.getConnection(); + var queryStmt = conn.prepareStatement(""" + SELECT SELF.ID, SELF.RELATED_ID, SELF.FUNCTION, SELF.PAYLOAD, PARENT.STATE FROM MESSAGE_QUEUE SELF + LEFT JOIN MESSAGE_QUEUE PARENT ON SELF.RELATED_ID=PARENT.ID + WHERE SELF.OWNER_INSTANCE=? AND SELF.OWNER_TICK=? 
+ """) + ) { + queryStmt.setString(1, instanceUUID); + queryStmt.setLong(2, tick); + var rs = queryStmt.executeQuery(); + + List messages = new ArrayList<>(expected); + + while (rs.next()) { + long msgId = rs.getLong(1); + long relatedId = rs.getLong(2); + + String function = rs.getString(3); + String payload = rs.getString(4); + + MqMessageState state = MqMessageState.valueOf(rs.getString(5)); + + var msg = new MqMessage(msgId, relatedId, function, payload, state, false); + + messages.add(msg); + } + + return messages; + } + } + + /** Returns the last N messages sent to this inbox */ + public List lastNMessages(String inboxName, int lastN) throws SQLException { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT ID, RELATED_ID, FUNCTION, PAYLOAD, STATE, SENDER_INBOX FROM MESSAGE_QUEUE + WHERE RECIPIENT_INBOX = ? + ORDER BY ID DESC LIMIT ? + """)) { + + stmt.setString(1, inboxName); + stmt.setInt(2, lastN); + List messages = new ArrayList<>(lastN); + + var rs = stmt.executeQuery(); + while (rs.next()) { + long msgId = rs.getLong(1); + long relatedId = rs.getLong(2); + + String function = rs.getString(3); + String payload = rs.getString(4); + + MqMessageState state = MqMessageState.valueOf(rs.getString(5)); + boolean expectsResponse = rs.getBoolean(6); + + var msg = new MqMessage(msgId, relatedId, function, payload, state, expectsResponse); + + messages.add(msg); + } + + // We want the last N messages in ascending order + return Lists.reverse(messages); + } + + } + + /** Modify the message indicated by id to have the given owner information */ + public void changeOwner(long id, String instanceUUID, int tick) { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + UPDATE MESSAGE_QUEUE SET OWNER_INSTANCE=?, OWNER_TICK=? + WHERE ID=? 
+ """)) { + stmt.setString(1, instanceUUID); + stmt.setInt(2, tick); + stmt.setLong(3, id); + stmt.executeUpdate(); + + if (!conn.getAutoCommit()) + conn.commit(); + + } catch (SQLException e) { + throw new RuntimeException(e); + } + } + + + /** Flags messages as dead if they have not been set to a terminal state within a TTL after the last update. */ + public int reapDeadMessages() throws SQLException { + try (var conn = dataSource.getConnection(); + var setToDead = conn.prepareStatement(""" + UPDATE MESSAGE_QUEUE + SET STATE='DEAD', UPDATED_TIME=CURRENT_TIMESTAMP(6) + WHERE STATE IN ('NEW', 'ACK') + AND TTL IS NOT NULL + AND TIMESTAMPDIFF(SECOND, UPDATED_TIME, CURRENT_TIMESTAMP(6)) > TTL + """)) { + int ret = setToDead.executeUpdate(); + if (!conn.getAutoCommit()) + conn.commit(); + return ret; + } + } + + /** Removes messages that have been set to a terminal state a while after their last update timestamp */ + public int cleanOldMessages() throws SQLException { + try (var conn = dataSource.getConnection(); + var setToDead = conn.prepareStatement(""" + DELETE FROM MESSAGE_QUEUE + WHERE STATE = 'OK' + AND TTL IS NOT NULL + AND TIMESTAMPDIFF(SECOND, UPDATED_TIME, CURRENT_TIMESTAMP(6)) > 3600 + """)) { + int ret = setToDead.executeUpdate(); + if (!conn.getAutoCommit()) + conn.commit(); + return ret; + } + } +} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/ActorStateMachine.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/ActorStateMachine.java new file mode 100644 index 00000000..a3f7edbe --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/ActorStateMachine.java @@ -0,0 +1,408 @@ +package nu.marginalia.mqsm; + +import nu.marginalia.mq.MessageQueueFactory; +import nu.marginalia.mq.MqMessage; +import nu.marginalia.mq.MqMessageState; +import nu.marginalia.mq.inbox.MqInboxResponse; +import nu.marginalia.mq.inbox.MqSubscription; +import nu.marginalia.mq.inbox.MqSynchronousInbox; +import 
nu.marginalia.mq.outbox.MqOutbox; +import nu.marginalia.mqsm.graph.ResumeBehavior; +import nu.marginalia.mqsm.graph.AbstractStateGraph; +import nu.marginalia.mqsm.state.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.*; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import java.util.function.BiConsumer; + +/** A state machine that can be used to implement an actor + * using a message queue as the persistence layer. The state machine is + * resilient to crashes and can be resumed from the last state. + */ +public class ActorStateMachine { + private final Logger logger = LoggerFactory.getLogger(ActorStateMachine.class); + + private final MqSynchronousInbox smInbox; + private final MqOutbox smOutbox; + private final String queueName; + + + private volatile MachineState state; + private volatile ExpectedMessage expectedMessage = ExpectedMessage.anyUnrelated(); + + + private final MachineState errorState = new StateFactory.ErrorState(); + private final MachineState finalState = new StateFactory.FinalState(); + private final MachineState resumingState = new StateFactory.ResumingState(); + + private final List> stateChangeListeners = new ArrayList<>(); + private final Map allStates = new HashMap<>(); + + private final boolean isDirectlyInitializable; + + public ActorStateMachine(MessageQueueFactory messageQueueFactory, + String queueName, + UUID instanceUUID, + AbstractStateGraph stateGraph) + { + this.queueName = queueName; + + smInbox = messageQueueFactory.createSynchronousInbox(queueName, instanceUUID); + smOutbox = messageQueueFactory.createOutbox(queueName, queueName+"//out", instanceUUID); + + smInbox.subscribe(new StateEventSubscription()); + + registerStates(List.of(errorState, finalState, resumingState)); + registerStates(stateGraph); + isDirectlyInitializable = stateGraph.isDirectlyInitializable(); + + for (var declaredState : stateGraph.declaredStates()) { + if 
(!allStates.containsKey(declaredState.name())) { + throw new IllegalArgumentException("State " + declaredState.name() + " is not defined in the state graph"); + } + if (!allStates.containsKey(declaredState.next())) { + throw new IllegalArgumentException("State " + declaredState.next() + " is not defined in the state graph"); + } + for (var state : declaredState.transitions()) { + if (!allStates.containsKey(state)) { + throw new IllegalArgumentException("State " + state + " is not defined in the state graph"); + } + } + } + + resume(); + + smInbox.start(); + } + + /** Listen to state changes */ + public void listen(BiConsumer listener) { + stateChangeListeners.add(listener); + } + + /** Register the state graph */ + void registerStates(List states) { + for (var state : states) { + allStates.put(state.name(), state); + } + } + + /** Register the state graph */ + void registerStates(AbstractStateGraph states) { + registerStates(states.asStateList()); + } + + /** Wait for the state machine to reach a final state. + * (possibly forever, halting problem and so on) + */ + public void join() throws InterruptedException { + synchronized (this) { + if (null == state) + return; + + while (!state.isFinal()) { + wait(); + } + } + } + + /** Wait for the state machine to reach a final state up to a given timeout. + */ + public void join(long timeout, TimeUnit timeUnit) throws InterruptedException, TimeoutException { + long deadline = System.currentTimeMillis() + timeUnit.toMillis(timeout); + + synchronized (this) { + if (null == state) + return; + + while (!state.isFinal()) { + if (deadline <= System.currentTimeMillis()) + throw new TimeoutException("Timeout waiting for state machine to reach final state"); + wait(100); + } + } + } + + /** Initialize the state machine. 
*/ + public void init() throws Exception { + var transition = StateTransition.to("INITIAL"); + + synchronized (this) { + this.state = allStates.get(transition.state()); + notifyAll(); + } + + smOutbox.sendNotice(transition.state(), transition.message()); + } + + /** Initialize the state machine. */ + public void initFrom(String firstState) throws Exception { + var transition = StateTransition.to(firstState); + + synchronized (this) { + this.state = allStates.get(transition.state()); + notifyAll(); + } + + smOutbox.sendNotice(transition.state(), transition.message()); + } + + /** Initialize the state machine. */ + public void init(String jsonEncodedArgument) throws Exception { + var transition = StateTransition.to("INITIAL", jsonEncodedArgument); + + synchronized (this) { + this.state = allStates.get(transition.state()); + notifyAll(); + } + + smOutbox.sendNotice(transition.state(), transition.message()); + } + + /** Initialize the state machine. */ + public void initFrom(String state, String jsonEncodedArgument) throws Exception { + var transition = StateTransition.to(state, jsonEncodedArgument); + + synchronized (this) { + this.state = allStates.get(transition.state()); + notifyAll(); + } + + smOutbox.sendNotice(transition.state(), transition.message()); + } + + /** Resume the state machine from the last known state. 
*/ + private void resume() { + + // We only permit resuming from the unitialized state + if (state != null) { + return; + } + + // Fetch the last messages from the inbox + var message = smInbox.replay(5) + .stream() + .filter(m -> (m.state() == MqMessageState.NEW) || (m.state() == MqMessageState.ACK)) + .findFirst(); + + if (message.isEmpty()) { + // No messages in the inbox, so start in a terminal state + expectedMessage = ExpectedMessage.anyUnrelated(); + state = finalState; + return; + } + + var firstMessage = message.get(); + var resumeState = allStates.get(firstMessage.function()); + + logger.info("Resuming state machine from {}({})/{}", firstMessage.function(), firstMessage.payload(), firstMessage.state()); + expectedMessage = ExpectedMessage.expectThis(firstMessage); + + if (firstMessage.state() == MqMessageState.NEW) { + // The message is not acknowledged, so starting the inbox will trigger a state transition + // We still need to set a state here so that the join() method works + + state = resumingState; + } + else if (firstMessage.state() == MqMessageState.ACK) { + resumeFromAck(resumeState, firstMessage); + } + } + + private void resumeFromAck(MachineState resumeState, + MqMessage message) + { + try { + if (resumeState.resumeBehavior().equals(ResumeBehavior.ERROR)) { + // The message is acknowledged, but the state does not support resuming + smOutbox.sendNotice(expectedMessage.id, "ERROR", "Illegal resumption from ACK'ed state " + message.function()); + } + else if (resumeState.resumeBehavior().equals(ResumeBehavior.RESTART)) { + this.state = resumeState; + + // The message is already acknowledged, we flag it as dead and then send an identical message + smOutbox.flagAsDead(message.msgId()); + expectedMessage = ExpectedMessage.responseTo(message); + smOutbox.sendNotice(message.msgId(), "INITIAL", ""); + } + else { + this.state = resumeState; + + // The message is already acknowledged, we flag it as dead and then send an identical message + 
smOutbox.flagAsDead(message.msgId()); + expectedMessage = ExpectedMessage.responseTo(message); + smOutbox.sendNotice(message.msgId(), message.function(), message.payload()); + } + } + catch (Exception e) { + logger.error("Failed to replay state", e); + } + } + + public void stop() throws InterruptedException { + smInbox.stop(); + smOutbox.stop(); + } + + private void onStateTransition(MqMessage msg) { + final String nextState = msg.function(); + final String data = msg.payload(); + + final long relatedId = msg.relatedId(); + + if (!expectedMessage.isExpected(msg)) { + // We've received a message that we didn't expect, throwing an exception will cause it to be flagged + // as an error in the message queue; the message queue will proceed + + throw new IllegalStateException("Unexpected message id " + relatedId + ", expected " + expectedMessage.id); + } + + try { + logger.info("FSM State change in {}: {}->{}({})", + queueName, + state == null ? "[null]" : state.name(), + nextState, + data); + + if (!allStates.containsKey(nextState)) { + logger.error("Unknown state {}", nextState); + setErrorState(); + return; + } + + synchronized (this) { + this.state = allStates.get(nextState); + notifyAll(); + } + + if (!state.isFinal()) { + logger.info("Transitining from state {}", state.name()); + var transition = state.next(msg.payload()); + + if (!expectedMessage.isExpected(msg)) { + logger.warn("Expected message changed during execution, skipping state transition to {}", transition.state()); + } + else { + expectedMessage = ExpectedMessage.responseTo(msg); + smOutbox.sendNotice(expectedMessage.id, transition.state(), transition.message()); + } + } + else { + // On terminal transition, we expect any message + expectedMessage = ExpectedMessage.anyUnrelated(); + } + } + catch (Exception e) { + logger.error("Error in state machine transition", e); + setErrorState(); + } + } + + private void setErrorState() { + synchronized (this) { + state = errorState; + notifyAll(); + } + } + + 
public MachineState getState() { + return state; + } + + public void abortExecution() throws Exception { + // Create a fake message to abort the execution + // This helps make sense of the queue when debugging + // and also permits the real termination message to have an + // unique expected ID + + long abortMsgId = smOutbox.sendNotice(expectedMessage.id, "ABORT", "Aborting execution"); + + // Set it as dead to clean up the queue from mystery ACK messages + smOutbox.flagAsDead(abortMsgId); + + // Set the expected message to the abort message, + // technically there's a slight chance of a race condition here, + // which will cause this message to be ERR'd and the process to + // continue, but it's very unlikely and the worst that can happen + // is you have to abort twice. + + expectedMessage = ExpectedMessage.expectId(abortMsgId); + + // Add a state transition to the final state + smOutbox.sendNotice(abortMsgId, finalState.name(), ""); + + // Dislodge the current task with an interrupt. + // It's actually fine if we accidentally interrupt the wrong thread + // (i.e. 
the abort task), since it shouldn't be doing anything interruptable + smInbox.abortCurrentTask(); + } + + /** Returns true if there is an INITIAL state that requires no parameters */ + public boolean isDirectlyInitializable() { + return isDirectlyInitializable; + } + + private class StateEventSubscription implements MqSubscription { + + @Override + public boolean filter(MqMessage rawMessage) { + return true; + } + + @Override + public MqInboxResponse onRequest(MqMessage msg) { + return null; + } + + @Override + public void onNotification(MqMessage msg) { + onStateTransition(msg); + try { + stateChangeListeners.forEach(l -> l.accept(msg.function(), msg.payload())); + } + catch (Exception ex) { + // Rethrowing this will flag the message as an error in the message queue + throw new RuntimeException("Error in state change listener", ex); + } + } + } +} + +/** ExpectedMessage guards against spurious state changes being triggered by old messages in the queue + * + * It contains the message id of the last message that was processed, and the messages sent by the state machine to + * itself via the message queue all have relatedId set to expectedMessageId. If the state machine is unitialized or + * in a terminal state, it will accept messages with relatedIds that are equal to -1. 
+ * */ +class ExpectedMessage { + public final long id; + public ExpectedMessage(long id) { + this.id = id; + } + + public static ExpectedMessage expectThis(MqMessage message) { + return new ExpectedMessage(message.relatedId()); + } + + public static ExpectedMessage responseTo(MqMessage message) { + return new ExpectedMessage(message.msgId()); + } + + public static ExpectedMessage anyUnrelated() { + return new ExpectedMessage(-1); + } + + public static ExpectedMessage expectId(long id) { + return new ExpectedMessage(id); + } + + public boolean isExpected(MqMessage message) { + if (id < 0) + return true; + + return id == message.relatedId(); + } +} \ No newline at end of file diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateFactory.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateFactory.java new file mode 100644 index 00000000..6df583b3 --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/StateFactory.java @@ -0,0 +1,143 @@ +package nu.marginalia.mqsm; + +import com.google.gson.Gson; +import com.google.gson.JsonSyntaxException; +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.mqsm.graph.ResumeBehavior; +import nu.marginalia.mqsm.state.MachineState; +import nu.marginalia.mqsm.state.StateTransition; + +import java.util.function.Function; +import java.util.function.Supplier; + +@Singleton +public class StateFactory { + private final Gson gson; + + @Inject + public StateFactory(Gson gson) { + this.gson = gson; + } + + public MachineState create(String name, ResumeBehavior resumeBehavior, Class param, Function logic) { + return new MachineState() { + @Override + public String name() { + return name; + } + + @Override + public StateTransition next(String message) { + + if (message.isEmpty()) { + return logic.apply(null); + } + + try { + var paramObj = gson.fromJson(message, param); + return logic.apply(paramObj); + } + catch (JsonSyntaxException ex) { + 
throw new IllegalArgumentException("Failed to parse '" + message + + "' into a '" + param.getSimpleName() + "'", ex); + } + } + + @Override + public ResumeBehavior resumeBehavior() { + return resumeBehavior; + } + + @Override + public boolean isFinal() { + return false; + } + }; + } + + public MachineState create(String name, ResumeBehavior resumeBehavior, Supplier logic) { + return new MachineState() { + @Override + public String name() { + return name; + } + + @Override + public StateTransition next(String message) { + return logic.get(); + } + + @Override + public ResumeBehavior resumeBehavior() { + return resumeBehavior; + } + + @Override + public boolean isFinal() { + return false; + } + }; + } + + public StateTransition transition(String state) { + return StateTransition.to(state); + } + + public StateTransition transition(String state, Object message) { + + if (null == message) { + return StateTransition.to(state); + } + + return StateTransition.to(state, gson.toJson(message)); + } + + public static class ErrorState implements MachineState { + @Override + public String name() { return "ERROR"; } + + @Override + public StateTransition next(String message) { + throw new UnsupportedOperationException(); + } + + @Override + public ResumeBehavior resumeBehavior() { return ResumeBehavior.RETRY; } + + @Override + public boolean isFinal() { return true; } + } + + public static class FinalState implements MachineState { + @Override + public String name() { return "END"; } + + @Override + public StateTransition next(String message) { + throw new UnsupportedOperationException(); + } + + @Override + public ResumeBehavior resumeBehavior() { return ResumeBehavior.RETRY; } + + @Override + public boolean isFinal() { return true; } + } + + public static class ResumingState implements MachineState { + @Override + public String name() { return "RESUMING"; } + + @Override + public StateTransition next(String message) { + throw new UnsupportedOperationException(); + } + + 
@Override + public ResumeBehavior resumeBehavior() { return ResumeBehavior.RETRY; } + + @Override + public boolean isFinal() { return false; } + } +} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/AbstractStateGraph.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/AbstractStateGraph.java new file mode 100644 index 00000000..977f2ce4 --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/AbstractStateGraph.java @@ -0,0 +1,176 @@ +package nu.marginalia.mqsm.graph; + +import nu.marginalia.mqsm.state.MachineState; +import nu.marginalia.mqsm.StateFactory; +import nu.marginalia.mqsm.state.StateTransition; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; +import java.util.*; + +public abstract class AbstractStateGraph { + private final StateFactory stateFactory; + private static final Logger logger = LoggerFactory.getLogger(AbstractStateGraph.class); + + public AbstractStateGraph(StateFactory stateFactory) { + this.stateFactory = stateFactory; + } + + public void transition(String state) { + throw new ControlFlowException(state, null); + } + + public void transition(String state, T payload) { + throw new ControlFlowException(state, payload); + } + + public void error() { + throw new ControlFlowException("ERROR", ""); + } + + + public void error(T payload) { + throw new ControlFlowException("ERROR", payload); + } + + public void error(Exception ex) { + throw new ControlFlowException("ERROR", ex.getClass().getSimpleName() + ":" + ex.getMessage()); + } + + /** Check whether there is an INITIAL state that can be directly initialized + * without declared parameters. 
*/ + public boolean isDirectlyInitializable() { + for (var method : getClass().getMethods()) { + var gs = method.getAnnotation(GraphState.class); + if (gs == null) { + continue; + } + if ("INITIAL".equals(gs.name()) && method.getParameterCount() == 0) { + return true; + } + } + return false; + } + + public Set declaredStates() { + Set ret = new HashSet<>(); + + for (var method : getClass().getMethods()) { + var gs = method.getAnnotation(GraphState.class); + if (gs != null) { + ret.add(gs); + } + } + + return ret; + } + + + public Set terminalStates() { + Set ret = new HashSet<>(); + + for (var method : getClass().getMethods()) { + var gs = method.getAnnotation(TerminalGraphState.class); + if (gs != null) { + ret.add(gs); + } + } + + return ret; + } + + public List asStateList() { + List ret = new ArrayList<>(); + + for (var method : getClass().getMethods()) { + var gs = method.getAnnotation(GraphState.class); + if (gs != null) { + ret.add(graphState(method, gs)); + } + + var ts = method.getAnnotation(TerminalGraphState.class); + if (ts != null) { + ret.add(stateFactory.create(ts.name(), ResumeBehavior.ERROR, () -> { + throw new ControlFlowException(ts.name(), null); + })); + } + } + + return ret; + } + + private MachineState graphState(Method method, GraphState gs) { + + var parameters = method.getParameterTypes(); + boolean returnsVoid = method.getGenericReturnType().equals(Void.TYPE); + + if (parameters.length == 0) { + return stateFactory.create(gs.name(), gs.resume(), () -> { + try { + if (returnsVoid) { + method.invoke(this); + return StateTransition.to(gs.next()); + } else { + Object ret = method.invoke(this); + return stateFactory.transition(gs.next(), ret); + } + } + catch (Exception e) { + return invocationExceptionToStateTransition(gs.name(), e); + } + }); + } + else if (parameters.length == 1) { + return stateFactory.create(gs.name(), gs.resume(), parameters[0], (param) -> { + try { + if (returnsVoid) { + method.invoke(this, param); + return 
StateTransition.to(gs.next()); + } else { + Object ret = method.invoke(this, param); + return stateFactory.transition(gs.next(), ret); + } + } catch (Exception e) { + return invocationExceptionToStateTransition(gs.name(), e); + } + }); + } + else { + // We permit only @GraphState-annotated methods like this: + // + // void foo(); + // void foo(Object bar); + // Object foo(); + // Object foo(Object bar); + + throw new IllegalStateException("StateGraph " + + getClass().getSimpleName() + + " has invalid method signature for method " + + method.getName() + + ": Expected 0 or 1 parameter(s) but found " + + Arrays.toString(parameters)); + } + } + + private StateTransition invocationExceptionToStateTransition(String state, Throwable ex) { + while (ex instanceof InvocationTargetException e) { + if (e.getCause() != null) ex = ex.getCause(); + } + + if (ex instanceof ControlFlowException cfe) { + return stateFactory.transition(cfe.getState(), cfe.getPayload()); + } + else if (ex instanceof InterruptedException intE) { + logger.error("State execution was interrupted " + state); + return StateTransition.to("ERR", "Execution interrupted"); + } + else { + logger.error("Error in state invocation " + state, ex); + return StateTransition.to("ERROR", + "Exception: " + ex.getClass().getSimpleName() + "/" + ex.getMessage()); + } + } + +} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/ControlFlowException.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/ControlFlowException.java new file mode 100644 index 00000000..12e5b569 --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/ControlFlowException.java @@ -0,0 +1,22 @@ +package nu.marginalia.mqsm.graph; + +/** Exception thrown by a state to indicate that the state machine should jump to a different state. 
/**
 * Exception thrown by a state to indicate that the state machine should jump to a
 * different state.
 *
 * <p>It is used purely as a control-flow signal, so no stack trace is captured.
 */
public class ControlFlowException extends RuntimeException {
    private final String state;
    private final Object payload;

    /**
     * @param state   name of the state to jump to
     * @param payload value to hand to that state, may be null
     */
    public ControlFlowException(String state, Object payload) {
        // No message and no cause, as before. Suppression stays enabled; the stack trace is
        // made non-writable so it is neither collected nor printed by printStackTrace().
        super(null, null, true, false);
        this.state = state;
        this.payload = payload;
    }

    public String getState() {
        return state;
    }

    public Object getPayload() {
        return payload;
    }

    /** Always returns an empty array. */
    @Override
    public StackTraceElement[] getStackTrace() {
        return new StackTraceElement[0];
    }
}
b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/TerminalGraphState.java new file mode 100644 index 00000000..c7b11730 --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/graph/TerminalGraphState.java @@ -0,0 +1,10 @@ +package nu.marginalia.mqsm.graph; + +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; + +@Retention(RetentionPolicy.RUNTIME) +public @interface TerminalGraphState { + String name(); + String description() default ""; +} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/MachineState.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/MachineState.java new file mode 100644 index 00000000..84a0b11c --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/MachineState.java @@ -0,0 +1,14 @@ +package nu.marginalia.mqsm.state; + +import nu.marginalia.mqsm.graph.ResumeBehavior; + +public interface MachineState { + String name(); + + StateTransition next(String message); + + ResumeBehavior resumeBehavior(); + + boolean isFinal(); + +} diff --git a/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/StateTransition.java b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/StateTransition.java new file mode 100644 index 00000000..6ca5d387 --- /dev/null +++ b/code/common/message-queue/src/main/java/nu/marginalia/mqsm/state/StateTransition.java @@ -0,0 +1,11 @@ +package nu.marginalia.mqsm.state; + +public record StateTransition(String state, String message) { + public static StateTransition to(String state) { + return new StateTransition(state, ""); + } + + public static StateTransition to(String state, String message) { + return new StateTransition(state, message); + } +} diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mq/MqMessageRow.java b/code/common/message-queue/src/test/java/nu/marginalia/mq/MqMessageRow.java new file mode 100644 index 00000000..ef12105a 
--- /dev/null +++ b/code/common/message-queue/src/test/java/nu/marginalia/mq/MqMessageRow.java @@ -0,0 +1,21 @@ +package nu.marginalia.mq; + +import nu.marginalia.mq.MqMessageState; + +import javax.annotation.Nullable; + +public record MqMessageRow ( + long id, + long relatedId, + @Nullable + String senderInbox, + String recipientInbox, + String function, + String payload, + MqMessageState state, + String ownerInstance, + long ownerTick, + long createdTime, + long updatedTime, + long ttl +) {} \ No newline at end of file diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mq/MqTestUtil.java b/code/common/message-queue/src/test/java/nu/marginalia/mq/MqTestUtil.java new file mode 100644 index 00000000..b3ba62cf --- /dev/null +++ b/code/common/message-queue/src/test/java/nu/marginalia/mq/MqTestUtil.java @@ -0,0 +1,51 @@ +package nu.marginalia.mq; + +import com.zaxxer.hikari.HikariDataSource; +import org.junit.jupiter.api.Assertions; + +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.List; + +public class MqTestUtil { + public static List getMessages(HikariDataSource dataSource, String inbox) { + List messages = new ArrayList<>(); + + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT ID, RELATED_ID, + SENDER_INBOX, RECIPIENT_INBOX, + FUNCTION, PAYLOAD, + STATE, + OWNER_INSTANCE, OWNER_TICK, + CREATED_TIME, UPDATED_TIME, + TTL + FROM MESSAGE_QUEUE + WHERE RECIPIENT_INBOX = ? 
+ """)) + { + stmt.setString(1, inbox); + var rsp = stmt.executeQuery(); + while (rsp.next()) { + messages.add(new MqMessageRow( + rsp.getLong("ID"), + rsp.getLong("RELATED_ID"), + rsp.getString("SENDER_INBOX"), + rsp.getString("RECIPIENT_INBOX"), + rsp.getString("FUNCTION"), + rsp.getString("PAYLOAD"), + MqMessageState.valueOf(rsp.getString("STATE")), + rsp.getString("OWNER_INSTANCE"), + rsp.getLong("OWNER_TICK"), + rsp.getTimestamp("CREATED_TIME").getTime(), + rsp.getTimestamp("UPDATED_TIME").getTime(), + rsp.getLong("TTL") + )); + } + } + catch (SQLException ex) { + Assertions.fail(ex); + } + return messages; + } +} diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqOutboxTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqOutboxTest.java new file mode 100644 index 00000000..ea2105bd --- /dev/null +++ b/code/common/message-queue/src/test/java/nu/marginalia/mq/outbox/MqOutboxTest.java @@ -0,0 +1,308 @@ +package nu.marginalia.mq.outbox; + +import com.zaxxer.hikari.HikariConfig; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.mq.MqMessage; +import nu.marginalia.mq.MqMessageState; +import nu.marginalia.mq.MqTestUtil; +import nu.marginalia.mq.inbox.*; +import nu.marginalia.mq.persistence.MqPersistence; +import org.junit.jupiter.api.*; +import org.testcontainers.containers.MariaDBContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +import java.util.UUID; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; + +import static org.junit.jupiter.api.Assertions.*; + +@Tag("slow") +@Testcontainers +public class MqOutboxTest { + @Container + static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") + .withDatabaseName("WMSA_prod") + .withUsername("wmsa") + .withPassword("wmsa") + .withInitScript("db/migration/V23_07_0_003__message_queue.sql") + 
.withNetworkAliases("mariadb"); + + static HikariDataSource dataSource; + private String inboxId; + + @BeforeEach + public void setUp() { + inboxId = UUID.randomUUID().toString(); + } + @BeforeAll + public static void setUpAll() { + HikariConfig config = new HikariConfig(); + config.setJdbcUrl(mariaDBContainer.getJdbcUrl()); + config.setUsername("wmsa"); + config.setPassword("wmsa"); + + dataSource = new HikariDataSource(config); + } + + @AfterAll + public static void tearDownAll() { + dataSource.close(); + } + + @Test + public void testOpenClose() throws InterruptedException { + var outbox = new MqOutbox(new MqPersistence(dataSource), inboxId, inboxId+"/reply", UUID.randomUUID()); + outbox.stop(); + } + + @Test + public void testSingleShotInboxTimeout() throws Exception { + var inbox = new MqSingleShotInbox(new MqPersistence(dataSource), inboxId, UUID.randomUUID()); + var message = inbox.waitForMessage(100, TimeUnit.MILLISECONDS); + assertTrue(message.isEmpty()); + } + + @Test + public void testOutboxTimeout() throws Exception { + var outbox = new MqOutbox(new MqPersistence(dataSource), inboxId, inboxId+"/reply", UUID.randomUUID()); + long id = outbox.sendAsync("test", "Hello World"); + try { + outbox.waitResponse(id, 100, TimeUnit.MILLISECONDS); + } + catch (TimeoutException ex) { + return; // ok + } + catch (Exception ex) { + ex.printStackTrace(); + } + fail(); + } + + @Test + public void testSingleShotInbox() throws Exception { + // Send a message to the inbox + var outbox = new MqOutbox(new MqPersistence(dataSource), inboxId,inboxId+"/reply", UUID.randomUUID()); + long id = outbox.sendAsync("test", "Hello World"); + + // Create a single-shot inbox + var inbox = new MqSingleShotInbox(new MqPersistence(dataSource), inboxId, UUID.randomUUID()); + + // Wait for the message to arrive + var message = inbox.waitForMessage(1, TimeUnit.SECONDS); + + // Check that the message arrived + assertTrue(message.isPresent()); + assertEquals("Hello World", 
message.get().payload()); + + // Send a response + inbox.sendResponse(message.get(), new MqInboxResponse("Alright then", MqMessageState.OK)); + + // Wait for the response to arrive + var response = outbox.waitResponse(id, 1, TimeUnit.SECONDS); + + // Check that the response arrived + assertEquals(MqMessageState.OK, response.state()); + assertEquals("Alright then", response.payload()); + } + + @Test + public void testSend() throws Exception { + var outbox = new MqOutbox(new MqPersistence(dataSource), inboxId,inboxId+"/reply", UUID.randomUUID()); + Executors.newSingleThreadExecutor().submit(() -> outbox.send("test", "Hello World")); + + TimeUnit.MILLISECONDS.sleep(100); + + var messages = MqTestUtil.getMessages(dataSource, inboxId); + assertEquals(1, messages.size()); + System.out.println(messages.get(0)); + + outbox.stop(); + } + + + @Test + public void testSendAndRespondAsyncInbox() throws Exception { + var outbox = new MqOutbox(new MqPersistence(dataSource), inboxId,inboxId+"/reply", UUID.randomUUID()); + + var inbox = new MqAsynchronousInbox(new MqPersistence(dataSource), inboxId, UUID.randomUUID()); + inbox.subscribe(justRespond("Alright then")); + inbox.start(); + + var rsp = outbox.send("test", "Hello World"); + + assertEquals(MqMessageState.OK, rsp.state()); + assertEquals("Alright then", rsp.payload()); + + var messages = MqTestUtil.getMessages(dataSource, inboxId); + assertEquals(1, messages.size()); + assertEquals(MqMessageState.OK, messages.get(0).state()); + + outbox.stop(); + inbox.stop(); + } + + @Test + public void testSendAndRespondSyncInbox() throws Exception { + var outbox = new MqOutbox(new MqPersistence(dataSource), inboxId,inboxId+"/reply", UUID.randomUUID()); + + var inbox = new MqSynchronousInbox(new MqPersistence(dataSource), inboxId, UUID.randomUUID()); + inbox.subscribe(justRespond("Alright then")); + inbox.start(); + + var rsp = outbox.send("test", "Hello World"); + + assertEquals(MqMessageState.OK, rsp.state()); + assertEquals("Alright 
then", rsp.payload()); + + var messages = MqTestUtil.getMessages(dataSource, inboxId); + assertEquals(1, messages.size()); + assertEquals(MqMessageState.OK, messages.get(0).state()); + + outbox.stop(); + inbox.stop(); + } + + @Test + public void testSendMultipleAsyncInbox() throws Exception { + var outbox = new MqOutbox(new MqPersistence(dataSource), inboxId,inboxId+"/reply", UUID.randomUUID()); + + var inbox = new MqAsynchronousInbox(new MqPersistence(dataSource), inboxId, UUID.randomUUID()); + inbox.subscribe(echo()); + inbox.start(); + + var rsp1 = outbox.send("test", "one"); + var rsp2 = outbox.send("test", "two"); + var rsp3 = outbox.send("test", "three"); + var rsp4 = outbox.send("test", "four"); + + Thread.sleep(500); + + assertEquals(MqMessageState.OK, rsp1.state()); + assertEquals("one", rsp1.payload()); + assertEquals(MqMessageState.OK, rsp2.state()); + assertEquals("two", rsp2.payload()); + assertEquals(MqMessageState.OK, rsp3.state()); + assertEquals("three", rsp3.payload()); + assertEquals(MqMessageState.OK, rsp4.state()); + assertEquals("four", rsp4.payload()); + + var messages = MqTestUtil.getMessages(dataSource, inboxId); + assertEquals(4, messages.size()); + for (var message : messages) { + assertEquals(MqMessageState.OK, message.state()); + } + + outbox.stop(); + inbox.stop(); + } + + @Test + public void testSendMultipleSyncInbox() throws Exception { + var outbox = new MqOutbox(new MqPersistence(dataSource), inboxId,inboxId+"/reply", UUID.randomUUID()); + + var inbox = new MqSynchronousInbox(new MqPersistence(dataSource), inboxId, UUID.randomUUID()); + inbox.subscribe(echo()); + inbox.start(); + + var rsp1 = outbox.send("test", "one"); + var rsp2 = outbox.send("test", "two"); + var rsp3 = outbox.send("test", "three"); + var rsp4 = outbox.send("test", "four"); + + Thread.sleep(500); + + assertEquals(MqMessageState.OK, rsp1.state()); + assertEquals("one", rsp1.payload()); + assertEquals(MqMessageState.OK, rsp2.state()); + assertEquals("two", 
rsp2.payload()); + assertEquals(MqMessageState.OK, rsp3.state()); + assertEquals("three", rsp3.payload()); + assertEquals(MqMessageState.OK, rsp4.state()); + assertEquals("four", rsp4.payload()); + + var messages = MqTestUtil.getMessages(dataSource, inboxId); + assertEquals(4, messages.size()); + for (var message : messages) { + assertEquals(MqMessageState.OK, message.state()); + } + + outbox.stop(); + inbox.stop(); + } + + @Test + public void testSendAndRespondWithErrorHandlerAsyncInbox() throws Exception { + var outbox = new MqOutbox(new MqPersistence(dataSource), inboxId,inboxId+"/reply", UUID.randomUUID()); + var inbox = new MqAsynchronousInbox(new MqPersistence(dataSource), inboxId, UUID.randomUUID()); + + inbox.start(); + + var rsp = outbox.send("test", "Hello World"); + + assertEquals(MqMessageState.ERR, rsp.state()); + + var messages = MqTestUtil.getMessages(dataSource, inboxId); + assertEquals(1, messages.size()); + assertEquals(MqMessageState.ERR, messages.get(0).state()); + + outbox.stop(); + inbox.stop(); + } + + @Test + public void testSendAndRespondWithErrorHandlerSyncInbox() throws Exception { + var outbox = new MqOutbox(new MqPersistence(dataSource), inboxId,inboxId+"/reply", UUID.randomUUID()); + var inbox = new MqSynchronousInbox(new MqPersistence(dataSource), inboxId, UUID.randomUUID()); + + inbox.start(); + + var rsp = outbox.send("test", "Hello World"); + + assertEquals(MqMessageState.ERR, rsp.state()); + + var messages = MqTestUtil.getMessages(dataSource, inboxId); + assertEquals(1, messages.size()); + assertEquals(MqMessageState.ERR, messages.get(0).state()); + + outbox.stop(); + inbox.stop(); + } + + public MqSubscription justRespond(String response) { + return new MqSubscription() { + @Override + public boolean filter(MqMessage rawMessage) { + return true; + } + + @Override + public MqInboxResponse onRequest(MqMessage msg) { + return MqInboxResponse.ok(response); + } + + @Override + public void onNotification(MqMessage msg) { } + }; + } + + 
public MqSubscription echo() { + return new MqSubscription() { + @Override + public boolean filter(MqMessage rawMessage) { + return true; + } + + @Override + public MqInboxResponse onRequest(MqMessage msg) { + return MqInboxResponse.ok(msg.payload()); + } + + @Override + public void onNotification(MqMessage msg) {} + }; + } + +} \ No newline at end of file diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mq/persistence/MqPersistenceTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mq/persistence/MqPersistenceTest.java new file mode 100644 index 00000000..bab700c0 --- /dev/null +++ b/code/common/message-queue/src/test/java/nu/marginalia/mq/persistence/MqPersistenceTest.java @@ -0,0 +1,190 @@ +package nu.marginalia.mq.persistence; + +import com.zaxxer.hikari.HikariConfig; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.mq.MqMessageState; +import nu.marginalia.mq.MqTestUtil; +import org.junit.jupiter.api.*; +import org.testcontainers.containers.MariaDBContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +import java.time.Duration; +import java.util.UUID; +import java.util.concurrent.TimeUnit; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNull; + +@Tag("slow") +@Testcontainers +public class MqPersistenceTest { + @Container + static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") + .withDatabaseName("WMSA_prod") + .withUsername("wmsa") + .withPassword("wmsa") + .withInitScript("db/migration/V23_07_0_003__message_queue.sql") + .withNetworkAliases("mariadb"); + + static HikariDataSource dataSource; + static MqPersistence persistence; + String recipientId; + String senderId; + + @BeforeEach + public void setUp() { + senderId = UUID.randomUUID().toString(); + recipientId = UUID.randomUUID().toString(); + } + + @BeforeAll + public static void setUpAll() { + 
HikariConfig config = new HikariConfig(); + config.setJdbcUrl(mariaDBContainer.getJdbcUrl()); + config.setUsername("wmsa"); + config.setPassword("wmsa"); + + dataSource = new HikariDataSource(config); + persistence = new MqPersistence(dataSource); + } + + @AfterAll + public static void tearDownAll() { + dataSource.close(); + } + + @Test + public void testReaper() throws Exception { + + long id = persistence.sendNewMessage(recipientId, senderId, null, "function", "payload", Duration.ofSeconds(2)); + persistence.reapDeadMessages(); + + var messages = MqTestUtil.getMessages(dataSource, recipientId); + assertEquals(1, messages.size()); + assertEquals(MqMessageState.NEW, messages.get(0).state()); + System.out.println(messages); + + TimeUnit.SECONDS.sleep(5); + + persistence.reapDeadMessages(); + + messages = MqTestUtil.getMessages(dataSource, recipientId); + assertEquals(1, messages.size()); + assertEquals(MqMessageState.DEAD, messages.get(0).state()); + } + + @Test + public void sendWithReplyAddress() throws Exception { + + long id = persistence.sendNewMessage(recipientId, senderId, null, "function", "payload", Duration.ofSeconds(30)); + + var messages = MqTestUtil.getMessages(dataSource, recipientId); + assertEquals(1, messages.size()); + + var message = messages.get(0); + + assertEquals(id, message.id()); + assertEquals("function", message.function()); + assertEquals("payload", message.payload()); + assertEquals(MqMessageState.NEW, message.state()); + + System.out.println(message); + } + + @Test + public void sendNoReplyAddress() throws Exception { + + long id = persistence.sendNewMessage(recipientId, null, null, "function", "payload", Duration.ofSeconds(30)); + + var messages = MqTestUtil.getMessages(dataSource, recipientId); + assertEquals(1, messages.size()); + + var message = messages.get(0); + + assertEquals(id, message.id()); + assertNull(message.senderInbox()); + assertEquals("function", message.function()); + assertEquals("payload", message.payload()); + 
assertEquals(MqMessageState.NEW, message.state()); + + System.out.println(message); + } + + @Test + public void updateState() throws Exception { + + long id = persistence.sendNewMessage(recipientId, senderId, null, "function", "payload", Duration.ofSeconds(30)); + persistence.updateMessageState(id, MqMessageState.OK); + System.out.println(id); + + var messages = MqTestUtil.getMessages(dataSource, recipientId); + assertEquals(1, messages.size()); + + var message = messages.get(0); + + assertEquals(id, message.id()); + assertEquals(MqMessageState.OK, message.state()); + + System.out.println(message); + } + + @Test + public void testReply() throws Exception { + long request = persistence.sendNewMessage(recipientId, senderId, null, "function", "payload", Duration.ofSeconds(30)); + long response = persistence.sendResponse(request, MqMessageState.OK, "response"); + + var sentMessages = MqTestUtil.getMessages(dataSource, recipientId); + System.out.println(sentMessages); + assertEquals(1, sentMessages.size()); + + var requestMessage = sentMessages.get(0); + assertEquals(request, requestMessage.id()); + assertEquals(MqMessageState.OK, requestMessage.state()); + + + var replies = MqTestUtil.getMessages(dataSource, senderId); + System.out.println(replies); + assertEquals(1, replies.size()); + + var responseMessage = replies.get(0); + assertEquals(response, responseMessage.id()); + assertEquals(request, responseMessage.relatedId()); + assertEquals(MqMessageState.NEW, responseMessage.state()); + } + + @Test + public void testPollInbox() throws Exception { + + String instanceId = "BATMAN"; + long tick = 1234L; + + long id = persistence.sendNewMessage(recipientId, null, null, "function", "payload", Duration.ofSeconds(30)); + + var messagesPollFirstTime = persistence.pollInbox(recipientId, instanceId , tick, 10); + + /** CHECK POLL RESULT */ + assertEquals(1, messagesPollFirstTime.size()); + var firstPollMessage = messagesPollFirstTime.iterator().next(); + assertEquals(id, 
firstPollMessage.msgId()); + assertEquals("function", firstPollMessage.function()); + assertEquals("payload", firstPollMessage.payload()); + + /** CHECK DB TABLE */ + var messages = MqTestUtil.getMessages(dataSource, recipientId); + assertEquals(1, messages.size()); + + var message = messages.get(0); + + assertEquals(id, message.id()); + assertEquals("function", message.function()); + assertEquals("payload", message.payload()); + assertEquals(MqMessageState.ACK, message.state()); + assertEquals(instanceId, message.ownerInstance()); + assertEquals(tick, message.ownerTick()); + + /** VERIFY SECOND POLL IS EMPTY */ + var messagePollSecondTime = persistence.pollInbox(recipientId, instanceId , 1, 10); + assertEquals(0, messagePollSecondTime.size()); + } +} diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/ActorStateMachineErrorTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/ActorStateMachineErrorTest.java new file mode 100644 index 00000000..3ca46e83 --- /dev/null +++ b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/ActorStateMachineErrorTest.java @@ -0,0 +1,104 @@ +package nu.marginalia.mqsm; + +import com.google.gson.GsonBuilder; +import com.zaxxer.hikari.HikariConfig; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.mq.MessageQueueFactory; +import nu.marginalia.mq.MqMessageRow; +import nu.marginalia.mq.MqTestUtil; +import nu.marginalia.mq.persistence.MqPersistence; +import nu.marginalia.mqsm.graph.GraphState; +import nu.marginalia.mqsm.graph.AbstractStateGraph; +import nu.marginalia.mqsm.graph.ResumeBehavior; +import org.junit.jupiter.api.*; +import org.junit.jupiter.api.parallel.Execution; +import org.testcontainers.containers.MariaDBContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +import java.util.List; +import java.util.UUID; +import java.util.concurrent.TimeUnit; + +import static 
org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD; + +@Tag("slow") +@Testcontainers +@Execution(SAME_THREAD) +public class ActorStateMachineErrorTest { + @Container + static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") + .withDatabaseName("WMSA_prod") + .withUsername("wmsa") + .withPassword("wmsa") + .withInitScript("db/migration/V23_07_0_003__message_queue.sql") + .withNetworkAliases("mariadb"); + + static HikariDataSource dataSource; + static MqPersistence persistence; + static MessageQueueFactory messageQueueFactory; + private String inboxId; + + @BeforeEach + public void setUp() { + inboxId = UUID.randomUUID().toString(); + } + @BeforeAll + public static void setUpAll() { + HikariConfig config = new HikariConfig(); + config.setJdbcUrl(mariaDBContainer.getJdbcUrl()); + config.setUsername("wmsa"); + config.setPassword("wmsa"); + + dataSource = new HikariDataSource(config); + persistence = new MqPersistence(dataSource); + messageQueueFactory = new MessageQueueFactory(persistence); + } + + @AfterAll + public static void tearDownAll() { + dataSource.close(); + } + + public static class ErrorHurdles extends AbstractStateGraph { + + public ErrorHurdles(StateFactory stateFactory) { + super(stateFactory); + } + + @GraphState(name = "INITIAL", next = "FAILING") + public void initial() { + + } + @GraphState(name = "FAILING", next = "OK", resume = ResumeBehavior.RETRY) + public void resumable() { + throw new RuntimeException("Boom!"); + } + @GraphState(name = "OK", next = "END") + public void ok() { + + } + + } + + @Test + public void smResumeResumableFromNew() throws Exception { + var stateFactory = new StateFactory(new GsonBuilder().create()); + var sm = new ActorStateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new ErrorHurdles(stateFactory)); + + sm.init(); + + sm.join(2, TimeUnit.SECONDS); + sm.stop(); + + List states = MqTestUtil.getMessages(dataSource, inboxId) 
+ .stream() + .peek(System.out::println) + .map(MqMessageRow::function) + .toList(); + + assertEquals(List.of("INITIAL", "FAILING", "ERROR"), states); + } + +} diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/ActorStateMachineNullTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/ActorStateMachineNullTest.java new file mode 100644 index 00000000..a20c75f0 --- /dev/null +++ b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/ActorStateMachineNullTest.java @@ -0,0 +1,98 @@ +package nu.marginalia.mqsm; + +import com.google.gson.GsonBuilder; +import com.zaxxer.hikari.HikariConfig; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.mq.MessageQueueFactory; +import nu.marginalia.mq.MqTestUtil; +import nu.marginalia.mq.persistence.MqPersistence; +import nu.marginalia.mqsm.graph.AbstractStateGraph; +import nu.marginalia.mqsm.graph.GraphState; +import org.junit.jupiter.api.*; +import org.junit.jupiter.api.parallel.Execution; +import org.testcontainers.containers.MariaDBContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +import java.util.UUID; +import java.util.concurrent.TimeUnit; + +import static org.junit.Assert.fail; +import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD; + +@Tag("slow") +@Testcontainers +@Execution(SAME_THREAD) +public class ActorStateMachineNullTest { + @Container + static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") + .withDatabaseName("WMSA_prod") + .withUsername("wmsa") + .withPassword("wmsa") + .withInitScript("db/migration/V23_07_0_003__message_queue.sql") + .withNetworkAliases("mariadb"); + + static HikariDataSource dataSource; + static MqPersistence persistence; + static MessageQueueFactory messageQueueFactory; + private String inboxId; + + @BeforeEach + public void setUp() { + inboxId = UUID.randomUUID().toString(); + } + @BeforeAll + public static void setUpAll() { + 
HikariConfig config = new HikariConfig(); + config.setJdbcUrl(mariaDBContainer.getJdbcUrl()); + config.setUsername("wmsa"); + config.setPassword("wmsa"); + + dataSource = new HikariDataSource(config); + persistence = new MqPersistence(dataSource); + messageQueueFactory = new MessageQueueFactory(persistence); + } + + @AfterAll + public static void tearDownAll() { + dataSource.close(); + } + + public static class TestGraph extends AbstractStateGraph { + public TestGraph(StateFactory stateFactory) { + super(stateFactory); + } + + @GraphState(name = "INITIAL", next = "GREET") + public void initial() {} + + @GraphState(name = "GREET", next = "END") + public void greet(String message) { + if (null == message) { + System.out.println("Hello, null!"); + return; + } + Assertions.fail("Should not be called"); + } + + } + + @Test + public void testStateGraphNullSerialization() throws Exception { + var stateFactory = new StateFactory(new GsonBuilder().create()); + var graph = new TestGraph(stateFactory); + + + var sm = new ActorStateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), graph); + sm.registerStates(graph); + + sm.init(); + + sm.join(2, TimeUnit.SECONDS); + sm.stop(); + + MqTestUtil.getMessages(dataSource, inboxId).forEach(System.out::println); + + } + +} diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/ActorStateMachineResumeTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/ActorStateMachineResumeTest.java new file mode 100644 index 00000000..825a4c43 --- /dev/null +++ b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/ActorStateMachineResumeTest.java @@ -0,0 +1,186 @@ +package nu.marginalia.mqsm; + +import com.google.gson.GsonBuilder; +import com.zaxxer.hikari.HikariConfig; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.mq.MessageQueueFactory; +import nu.marginalia.mq.MqMessageRow; +import nu.marginalia.mq.MqMessageState; +import nu.marginalia.mq.MqTestUtil; +import 
nu.marginalia.mq.persistence.MqPersistence; +import nu.marginalia.mqsm.graph.GraphState; +import nu.marginalia.mqsm.graph.AbstractStateGraph; +import nu.marginalia.mqsm.graph.ResumeBehavior; +import org.junit.jupiter.api.*; +import org.junit.jupiter.api.parallel.Execution; +import org.testcontainers.containers.MariaDBContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +import java.util.List; +import java.util.UUID; +import java.util.concurrent.TimeUnit; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD; + +@Tag("slow") +@Testcontainers +@Execution(SAME_THREAD) +public class ActorStateMachineResumeTest { + @Container + static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") + .withDatabaseName("WMSA_prod") + .withUsername("wmsa") + .withPassword("wmsa") + .withInitScript("db/migration/V23_07_0_003__message_queue.sql") + .withNetworkAliases("mariadb"); + + static HikariDataSource dataSource; + static MqPersistence persistence; + static MessageQueueFactory messageQueueFactory; + private String inboxId; + + @BeforeEach + public void setUp() { + inboxId = UUID.randomUUID().toString(); + } + @BeforeAll + public static void setUpAll() { + HikariConfig config = new HikariConfig(); + config.setJdbcUrl(mariaDBContainer.getJdbcUrl()); + config.setUsername("wmsa"); + config.setPassword("wmsa"); + + dataSource = new HikariDataSource(config); + persistence = new MqPersistence(dataSource); + messageQueueFactory = new MessageQueueFactory(persistence); + } + + @AfterAll + public static void tearDownAll() { + dataSource.close(); + } + + public static class ResumeTrialsGraph extends AbstractStateGraph { + + public ResumeTrialsGraph(StateFactory stateFactory) { + super(stateFactory); + } + + @GraphState(name = "INITIAL", next = "RESUMABLE") + public void initial() {} + @GraphState(name = "RESUMABLE", next = 
"NON-RESUMABLE", resume = ResumeBehavior.RETRY) + public void resumable() {} + @GraphState(name = "NON-RESUMABLE", next = "OK", resume = ResumeBehavior.ERROR) + public void nonResumable() {} + + @GraphState(name = "OK", next = "END") + public void ok() {} + + } + + @Test + public void smResumeResumableFromNew() throws Exception { + var stateFactory = new StateFactory(new GsonBuilder().create()); + + + persistence.sendNewMessage(inboxId, null, -1L, "RESUMABLE", "", null); + var sm = new ActorStateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); + + sm.join(2, TimeUnit.SECONDS); + sm.stop(); + + List states = MqTestUtil.getMessages(dataSource, inboxId) + .stream() + .peek(System.out::println) + .map(MqMessageRow::function) + .toList(); + + assertEquals(List.of("RESUMABLE", "NON-RESUMABLE", "OK", "END"), states); + } + + @Test + public void smResumeFromAck() throws Exception { + var stateFactory = new StateFactory(new GsonBuilder().create()); + + long id = persistence.sendNewMessage(inboxId, null, -1L, "RESUMABLE", "", null); + persistence.updateMessageState(id, MqMessageState.ACK); + + var sm = new ActorStateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); + + sm.join(4, TimeUnit.SECONDS); + sm.stop(); + + List states = MqTestUtil.getMessages(dataSource, inboxId) + .stream() + .peek(System.out::println) + .map(MqMessageRow::function) + .toList(); + + assertEquals(List.of("RESUMABLE", "RESUMABLE", "NON-RESUMABLE", "OK", "END"), states); + } + + + @Test + public void smResumeNonResumableFromNew() throws Exception { + var stateFactory = new StateFactory(new GsonBuilder().create()); + + + persistence.sendNewMessage(inboxId, null, -1L, "NON-RESUMABLE", "", null); + + var sm = new ActorStateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); + + sm.join(2, TimeUnit.SECONDS); + sm.stop(); + + List states = MqTestUtil.getMessages(dataSource, 
inboxId) + .stream() + .peek(System.out::println) + .map(MqMessageRow::function) + .toList(); + + assertEquals(List.of("NON-RESUMABLE", "OK", "END"), states); + } + + @Test + public void smResumeNonResumableFromAck() throws Exception { + var stateFactory = new StateFactory(new GsonBuilder().create()); + + + long id = persistence.sendNewMessage(inboxId, null, null, "NON-RESUMABLE", "", null); + persistence.updateMessageState(id, MqMessageState.ACK); + + var sm = new ActorStateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); + + sm.join(2, TimeUnit.SECONDS); + sm.stop(); + + List states = MqTestUtil.getMessages(dataSource, inboxId) + .stream() + .peek(System.out::println) + .map(MqMessageRow::function) + .toList(); + + assertEquals(List.of("NON-RESUMABLE", "ERROR"), states); + } + + @Test + public void smResumeEmptyQueue() throws Exception { + var stateFactory = new StateFactory(new GsonBuilder().create()); + + + var sm = new ActorStateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new ResumeTrialsGraph(stateFactory)); + + sm.join(2, TimeUnit.SECONDS); + sm.stop(); + + List states = MqTestUtil.getMessages(dataSource, inboxId) + .stream() + .peek(System.out::println) + .map(MqMessageRow::function) + .toList(); + + assertEquals(List.of(), states); + } +} diff --git a/code/common/message-queue/src/test/java/nu/marginalia/mqsm/ActorStateMachineTest.java b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/ActorStateMachineTest.java new file mode 100644 index 00000000..5574c771 --- /dev/null +++ b/code/common/message-queue/src/test/java/nu/marginalia/mqsm/ActorStateMachineTest.java @@ -0,0 +1,144 @@ +package nu.marginalia.mqsm; + +import com.google.gson.GsonBuilder; +import com.zaxxer.hikari.HikariConfig; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.mq.MessageQueueFactory; +import nu.marginalia.mq.MqTestUtil; +import nu.marginalia.mq.persistence.MqPersistence; +import 
nu.marginalia.mqsm.graph.GraphState; +import nu.marginalia.mqsm.graph.AbstractStateGraph; +import org.junit.jupiter.api.*; +import org.junit.jupiter.api.parallel.Execution; +import org.testcontainers.containers.MariaDBContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +import java.util.UUID; +import java.util.concurrent.TimeUnit; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD; + +@Tag("slow") +@Testcontainers +@Execution(SAME_THREAD) +public class ActorStateMachineTest { + @Container + static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") + .withDatabaseName("WMSA_prod") + .withUsername("wmsa") + .withPassword("wmsa") + .withInitScript("db/migration/V23_07_0_003__message_queue.sql") + .withNetworkAliases("mariadb"); + + static HikariDataSource dataSource; + static MqPersistence persistence; + static MessageQueueFactory messageQueueFactory; + private String inboxId; + + @BeforeEach + public void setUp() { + inboxId = UUID.randomUUID().toString(); + } + @BeforeAll + public static void setUpAll() { + HikariConfig config = new HikariConfig(); + config.setJdbcUrl(mariaDBContainer.getJdbcUrl()); + config.setUsername("wmsa"); + config.setPassword("wmsa"); + + dataSource = new HikariDataSource(config); + persistence = new MqPersistence(dataSource); + messageQueueFactory = new MessageQueueFactory(persistence); + } + + @AfterAll + public static void tearDownAll() { + dataSource.close(); + } + + public static class TestGraph extends AbstractStateGraph { + public TestGraph(StateFactory stateFactory) { + super(stateFactory); + } + + @GraphState(name = "INITIAL", next = "GREET") + public String initial() { + return "World"; + } + + @GraphState(name = "GREET") + public void greet(String message) { + System.out.println("Hello, " + message + "!"); + + transition("COUNT-DOWN", 5); + } + + 
@GraphState(name = "COUNT-DOWN", next = "END") + public void countDown(Integer from) { + if (from > 0) { + System.out.println(from); + transition("COUNT-DOWN", from - 1); + } + } + } + + @Test + public void testAnnotatedStateGraph() throws Exception { + var stateFactory = new StateFactory(new GsonBuilder().create()); + var graph = new TestGraph(stateFactory); + + + var sm = new ActorStateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), graph); + sm.registerStates(graph); + + sm.init(); + + sm.join(2, TimeUnit.SECONDS); + sm.stop(); + + MqTestUtil.getMessages(dataSource, inboxId).forEach(System.out::println); + + } + + @Test + public void testStartStopStartStop() throws Exception { + var stateFactory = new StateFactory(new GsonBuilder().create()); + var sm = new ActorStateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new TestGraph(stateFactory)); + + sm.init(); + + Thread.sleep(150); + sm.stop(); + + System.out.println("-------------------- "); + + var sm2 = new ActorStateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new TestGraph(stateFactory)); + sm2.join(2, TimeUnit.SECONDS); + sm2.stop(); + + MqTestUtil.getMessages(dataSource, inboxId).forEach(System.out::println); + } + + @Test + public void testFalseTransition() throws Exception { + var stateFactory = new StateFactory(new GsonBuilder().create()); + + // Prep the queue with a message to set the state to initial, + // and an additional message to trigger the false transition back to initial + + persistence.sendNewMessage(inboxId, null, null, "INITIAL", "", null); + persistence.sendNewMessage(inboxId, null, null, "INITIAL", "", null); + + var sm = new ActorStateMachine(messageQueueFactory, inboxId, UUID.randomUUID(), new TestGraph(stateFactory)); + + Thread.sleep(50); + + sm.join(2, TimeUnit.SECONDS); + sm.stop(); + + MqTestUtil.getMessages(dataSource, inboxId).forEach(System.out::println); + } + +} diff --git a/code/common/model/src/main/java/nu/marginalia/model/EdgeDomain.java 
b/code/common/model/src/main/java/nu/marginalia/model/EdgeDomain.java index 96a44718..88dea9c7 100644 --- a/code/common/model/src/main/java/nu/marginalia/model/EdgeDomain.java +++ b/code/common/model/src/main/java/nu/marginalia/model/EdgeDomain.java @@ -3,13 +3,15 @@ package nu.marginalia.model; import lombok.*; import javax.annotation.Nonnull; +import java.io.Serializable; import java.util.Objects; import java.util.function.Predicate; import java.util.regex.Pattern; @AllArgsConstructor @Getter @Setter @Builder -public class EdgeDomain { +public class EdgeDomain implements Serializable { + @Nonnull public final String subDomain; @Nonnull @@ -160,22 +162,16 @@ public class EdgeDomain { public boolean equals(final Object o) { if (o == this) return true; - if (!(o instanceof EdgeDomain)) return false; - final EdgeDomain other = (EdgeDomain) o; - if (!other.canEqual((Object) this)) return false; + if (!(o instanceof EdgeDomain other)) return false; final String this$subDomain = this.getSubDomain(); final String other$subDomain = other.getSubDomain(); - if (!this$subDomain.equalsIgnoreCase(other$subDomain)) return false; + if (!Objects.equals(this$subDomain,other$subDomain)) return false; final String this$domain = this.getDomain(); final String other$domain = other.getDomain(); - if (!this$domain.equalsIgnoreCase(other$domain)) return false; + if (!Objects.equals(this$domain,other$domain)) return false; return true; } - protected boolean canEqual(final Object other) { - return other instanceof EdgeDomain; - } - public int hashCode() { final int PRIME = 59; int result = 1; diff --git a/code/common/model/src/main/java/nu/marginalia/model/EdgeUrl.java b/code/common/model/src/main/java/nu/marginalia/model/EdgeUrl.java index 19a9eb1b..9def0480 100644 --- a/code/common/model/src/main/java/nu/marginalia/model/EdgeUrl.java +++ b/code/common/model/src/main/java/nu/marginalia/model/EdgeUrl.java @@ -5,6 +5,8 @@ import lombok.Getter; import lombok.Setter; import 
nu.marginalia.util.QueryParams; +import javax.annotation.Nullable; +import java.io.Serializable; import java.net.MalformedURLException; import java.net.URI; import java.net.URISyntaxException; @@ -14,7 +16,7 @@ import java.util.Optional; import java.util.regex.Pattern; @Getter @Setter @Builder -public class EdgeUrl { +public class EdgeUrl implements Serializable { public final String proto; public final EdgeDomain domain; public final Integer port; @@ -33,8 +35,12 @@ public class EdgeUrl { this(new URI(urlencodeFixer(url))); } - public static Optional parse(String url) { + public static Optional parse(@Nullable String url) { try { + if (null == url) { + return Optional.empty(); + } + return Optional.of(new EdgeUrl(url)); } catch (URISyntaxException e) { return Optional.empty(); diff --git a/code/common/model/src/main/java/nu/marginalia/model/crawl/HtmlFeature.java b/code/common/model/src/main/java/nu/marginalia/model/crawl/HtmlFeature.java index d9adbff6..03e5557c 100644 --- a/code/common/model/src/main/java/nu/marginalia/model/crawl/HtmlFeature.java +++ b/code/common/model/src/main/java/nu/marginalia/model/crawl/HtmlFeature.java @@ -3,11 +3,15 @@ package nu.marginalia.model.crawl; import java.util.Collection; public enum HtmlFeature { + // Note, the first 32 of these features are bit encoded in the database + // so be sure to keep anything that's potentially important toward the top + // of the list + MEDIA( "special:media"), JS("special:scripts"), AFFILIATE_LINK( "special:affiliate"), - TRACKING_INNOCENT("special:tracking"), - TRACKING_EVIL("special:tracking2"), + TRACKING("special:tracking"), + TRACKING_ADTECH("special:ads"), // We'll treat this as ads for now VIEWPORT("special:viewport"), diff --git a/code/common/model/src/main/java/nu/marginalia/model/idx/DocumentMetadata.java b/code/common/model/src/main/java/nu/marginalia/model/idx/DocumentMetadata.java index fc49b300..0b1fe480 100644 ---
a/code/common/model/src/main/java/nu/marginalia/model/idx/DocumentMetadata.java +++ b/code/common/model/src/main/java/nu/marginalia/model/idx/DocumentMetadata.java @@ -2,6 +2,7 @@ package nu.marginalia.model.idx; import nu.marginalia.model.crawl.PubDate; +import java.io.Serializable; import java.util.EnumSet; import java.util.Set; @@ -15,7 +16,9 @@ public record DocumentMetadata(int avgSentLength, int year, int sets, int quality, - byte flags) { + byte flags) + implements Serializable +{ public String toString() { StringBuilder sb = new StringBuilder(getClass().getSimpleName()); diff --git a/code/common/process/build.gradle b/code/common/process/build.gradle index a762887b..d1b9ae5b 100644 --- a/code/common/process/build.gradle +++ b/code/common/process/build.gradle @@ -20,6 +20,7 @@ dependencies { implementation libs.guava implementation libs.guice + implementation libs.bundles.mariadb implementation libs.commons.lang3 implementation libs.snakeyaml @@ -29,4 +30,16 @@ dependencies { testImplementation libs.mockito } +test { + maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1 + maxHeapSize = "8G" + useJUnitPlatform() +} +task fastTests(type: Test) { + maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1 + maxHeapSize = "8G" + useJUnitPlatform { + excludeTags "slow" + } +} \ No newline at end of file diff --git a/code/common/process/src/main/java/nu/marginalia/ProcessConfiguration.java b/code/common/process/src/main/java/nu/marginalia/ProcessConfiguration.java new file mode 100644 index 00000000..35e1433f --- /dev/null +++ b/code/common/process/src/main/java/nu/marginalia/ProcessConfiguration.java @@ -0,0 +1,7 @@ +package nu.marginalia; + +import java.util.UUID; + +public record ProcessConfiguration(String processName, int node, UUID instanceUuid) { + +} diff --git a/code/common/process/src/main/java/nu/marginalia/process/control/ProcessHeartbeat.java 
b/code/common/process/src/main/java/nu/marginalia/process/control/ProcessHeartbeat.java new file mode 100644 index 00000000..82b2c95e --- /dev/null +++ b/code/common/process/src/main/java/nu/marginalia/process/control/ProcessHeartbeat.java @@ -0,0 +1,155 @@ +package nu.marginalia.process.control; + + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.ProcessConfiguration; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.sql.SQLException; +import java.util.concurrent.TimeUnit; + +/** This service sends a heartbeat to the database every mcp.heartbeat.interval seconds (system property; default 1). + */ +@Singleton +public class ProcessHeartbeat { + private final Logger logger = LoggerFactory.getLogger(ProcessHeartbeat.class); + private final String processName; + private final String processBase; + private final String instanceUUID; + private final HikariDataSource dataSource; + + + private final Thread runnerThread; + private final int heartbeatInterval = Integer.getInteger("mcp.heartbeat.interval", 1); + + private volatile boolean running = false; + + private volatile int progress = -1; + + @Inject + public ProcessHeartbeat(ProcessConfiguration configuration, + HikariDataSource dataSource) + { + this.processName = configuration.processName() + ":" + configuration.node(); + this.processBase = configuration.processName(); + this.dataSource = dataSource; + + this.instanceUUID = configuration.instanceUuid().toString(); + + runnerThread = new Thread(this::run); + + Runtime.getRuntime().addShutdownHook(new Thread(this::shutDown)); + } + + public void setProgress(double progress) { + this.progress = (int) (progress * 100); + } + + public void start() { + if (!running) { + runnerThread.start(); + } + } + + public void shutDown() { + if (!running) + return; + + running = false; + + try { + runnerThread.join(); + heartbeatStop(); + } + catch (InterruptedException|SQLException ex) { + logger.warn("ServiceHeartbeat
shutdown failed", ex); + } + } + + private void run() { + if (!running) + running = true; + else + return; + + try { + heartbeatInit(); + + while (running) { + + try { + heartbeatUpdate(); + } + catch (SQLException ex) { + logger.warn("ServiceHeartbeat failed to update", ex); + } + + TimeUnit.SECONDS.sleep(heartbeatInterval); + } + } + catch (InterruptedException|SQLException ex) { + logger.error("ServiceHeartbeat caught irrecoverable exception, killing service", ex); + System.exit(255); + } + } + + private void heartbeatInit() throws SQLException { + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement( + """ + INSERT INTO PROCESS_HEARTBEAT (PROCESS_NAME, PROCESS_BASE, INSTANCE, HEARTBEAT_TIME, STATUS) + VALUES (?, ?, ?, CURRENT_TIMESTAMP(6), 'STARTING') + ON DUPLICATE KEY UPDATE + INSTANCE = ?, + HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), + STATUS = 'STARTING' + """ + )) + { + stmt.setString(1, processName); + stmt.setString(2, processBase); + stmt.setString(3, instanceUUID); + stmt.setString(4, instanceUUID); + stmt.executeUpdate(); + } + } + } + + private void heartbeatUpdate() throws SQLException { + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement( + """ + UPDATE PROCESS_HEARTBEAT + SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), STATUS = 'RUNNING', PROGRESS = ? + WHERE INSTANCE = ? + """) + ) + { + stmt.setInt(1, progress); + stmt.setString(2, instanceUUID); + stmt.executeUpdate(); + } + } + } + + private void heartbeatStop() throws SQLException { + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement( + """ + UPDATE PROCESS_HEARTBEAT + SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), STATUS='STOPPED', PROGRESS=? + WHERE INSTANCE = ? 
+ """) + ) + { + stmt.setInt(1, progress); + stmt.setString( 2, instanceUUID); + stmt.executeUpdate(); + } + } + } +} + diff --git a/code/common/process/src/main/java/nu/marginalia/process/log/WorkLoadIterable.java b/code/common/process/src/main/java/nu/marginalia/process/log/WorkLoadIterable.java new file mode 100644 index 00000000..992c1991 --- /dev/null +++ b/code/common/process/src/main/java/nu/marginalia/process/log/WorkLoadIterable.java @@ -0,0 +1,52 @@ +package nu.marginalia.process.log; + +import lombok.SneakyThrows; +import org.jetbrains.annotations.NotNull; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Iterator; +import java.util.Optional; +import java.util.function.Function; + +class WorkLoadIterable implements Iterable { + + private final Path logFile; + private final Function> mapper; + + WorkLoadIterable(Path logFile, Function> mapper) { + this.logFile = logFile; + this.mapper = mapper; + } + + @NotNull + @Override + @SneakyThrows + public Iterator iterator() { + var stream = Files.lines(logFile); + return new Iterator<>() { + final Iterator iter = stream + .filter(WorkLogEntry::isJobId) + .map(WorkLogEntry::parse) + .map(mapper) + .filter(Optional::isPresent) + .map(Optional::get) + .iterator(); + + @Override + public boolean hasNext() { + if (iter.hasNext()) { + return true; + } else { + stream.close(); + return false; + } + } + + @Override + public T next() { + return iter.next(); + } + }; + } +} diff --git a/code/common/process/src/main/java/nu/marginalia/process/log/WorkLog.java b/code/common/process/src/main/java/nu/marginalia/process/log/WorkLog.java index db5b22a8..9be31d17 100644 --- a/code/common/process/src/main/java/nu/marginalia/process/log/WorkLog.java +++ b/code/common/process/src/main/java/nu/marginalia/process/log/WorkLog.java @@ -1,95 +1,105 @@ package nu.marginalia.process.log; -import com.google.errorprone.annotations.MustBeClosed; -import org.apache.logging.log4j.util.Strings; +import 
org.slf4j.Logger; +import org.slf4j.LoggerFactory; -import java.io.FileNotFoundException; +import java.io.Closeable; import java.io.FileOutputStream; import java.io.IOException; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.time.LocalDateTime; -import java.util.HashSet; -import java.util.Set; -import java.util.function.Consumer; -import java.util.regex.Pattern; -import java.util.stream.Stream; +import java.util.*; +import java.util.function.Function; -public class WorkLog implements AutoCloseable { +/** WorkLog is a journal of work done by a process, + * so that it can be resumed after a crash or termination. + *

<p>
+ * The log file itself is a tab-separated file with the following columns:
+ * <ul>
+ * <li>Job ID</li>
+ * <li>Timestamp</li>
+ * <li>Location (e.g. path on disk)</li>
+ * <li>Size</li>
+ * </ul>
    + * + */ +public class WorkLog implements AutoCloseable, Closeable { private final Set finishedJobs = new HashSet<>(); private final FileOutputStream logWriter; + private final Logger logger = LoggerFactory.getLogger(getClass()); public WorkLog(Path logFile) throws IOException { - loadLog(logFile); + if (Files.exists(logFile)) { + try (var lines = Files.lines(logFile)) { + lines.filter(WorkLogEntry::isJobId) + .map(WorkLogEntry::parseJobIdFromLogLine) + .forEach(finishedJobs::add); + } + } logWriter = new FileOutputStream(logFile.toFile(), true); - writeLogEntry("# Starting WorkLog @ " + LocalDateTime.now()); + writeLogEntry("# Starting WorkLog @ " + LocalDateTime.now() + "\n"); } - public static void readLog(Path logFile, Consumer entryConsumer) throws FileNotFoundException { - if (!Files.exists(logFile)) { - throw new FileNotFoundException("Log file not found " + logFile); - } - - try (var entries = streamLog(logFile)) { - entries.forEach(entryConsumer); - } catch (IOException e) { - e.printStackTrace(); - } + /** Create an iterable over the work log + *
    + * Caveat: If the iterator is not iterated to the end, + * it will leak a file descriptor. + */ + public static Iterable iterable(Path logFile) { + return new WorkLoadIterable<>(logFile, Optional::of); } - @MustBeClosed - public static Stream streamLog(Path logFile) throws IOException { - return Files.lines(logFile).filter(WorkLog::isJobId).map(line -> { - String[] parts = line.split("\\s+"); - return new WorkLogEntry(parts[0], parts[1], parts[2], Integer.parseInt(parts[3])); - }); - } - - private void loadLog(Path logFile) throws IOException { - if (!Files.exists(logFile)) { - return; - } - - try (var lines = Files.lines(logFile)) { - lines.filter(WorkLog::isJobId).map(this::getJobIdFromWrittenString).forEach(finishedJobs::add); - } - } - - private static boolean isJobId(String s) { - return Strings.isNotBlank(s) && !s.startsWith("#"); - } - - private static final Pattern splitPattern = Pattern.compile("\\s+"); - - private String getJobIdFromWrittenString(String s) { - return splitPattern.split(s, 2)[0]; - } - - public synchronized boolean isJobFinished(String id) { - return finishedJobs.contains(id); + /** Create an iterable over the work log, applying a mapping function to each item + *
    + * Caveat: If the iterator is not iterated to the end, + * it will leak a file descriptor. + */ + public static Iterable iterableMap(Path logFile, Function> mapper) { + return new WorkLoadIterable<>(logFile, mapper); } // Use synchro over concurrent set to avoid competing writes // - correct is better than fast here, it's sketchy enough to use // a PrintWriter + /** Mark the job as finished in the work log + * + * @param id job identifier + * @param where free form field, e.g. location on disk + * @param size free form field, e.g. how many items were processed + */ public synchronized void setJobToFinished(String id, String where, int size) throws IOException { - finishedJobs.add(id); + if (!finishedJobs.add(id)) { + logger.warn("Setting job {} to finished, but it was already finished", id); + } - writeLogEntry(String.format("%s\t%s\t%s\t%d",id, LocalDateTime.now(), where, size)); + writeLogEntry(String.format("%s\t%s\t%s\t%d\n",id, LocalDateTime.now(), where, size)); + } + + public synchronized boolean isJobFinished(String id) { + return finishedJobs.contains(id); } private void writeLogEntry(String entry) throws IOException { logWriter.write(entry.getBytes(StandardCharsets.UTF_8)); - logWriter.write("\n".getBytes(StandardCharsets.UTF_8)); logWriter.flush(); } @Override - public void close() throws Exception { - logWriter.flush(); - logWriter.close(); + public void close() { + try { + logWriter.flush(); + logWriter.close(); + } + catch (IOException e) { + logger.error("Error closing work log", e); + } + } + + public int countFinishedJobs() { + return finishedJobs.size(); } } diff --git a/code/common/process/src/main/java/nu/marginalia/process/log/WorkLogEntry.java b/code/common/process/src/main/java/nu/marginalia/process/log/WorkLogEntry.java index 9f9579f3..d0cf0ef8 100644 --- a/code/common/process/src/main/java/nu/marginalia/process/log/WorkLogEntry.java +++ b/code/common/process/src/main/java/nu/marginalia/process/log/WorkLogEntry.java @@ -1,4 +1,23 @@ 
package nu.marginalia.process.log; +import org.apache.logging.log4j.util.Strings; + +import java.util.regex.Pattern; + public record WorkLogEntry(String id, String ts, String path, int cnt) { + private static final Pattern splitPattern = Pattern.compile("\\s+"); + + static WorkLogEntry parse(String line) { + String[] parts = splitPattern.split(line); + return new WorkLogEntry(parts[0], parts[1], parts[2], Integer.parseInt(parts[3])); + } + + static boolean isJobId(String line) { + return Strings.isNotBlank(line) && !line.startsWith("#"); + } + + static String parseJobIdFromLogLine(String s) { + return splitPattern.split(s, 2)[0]; + } + } diff --git a/code/common/process/src/main/java/nu/marginalia/util/ParallelPipe.java b/code/common/process/src/main/java/nu/marginalia/util/ParallelPipe.java deleted file mode 100644 index fc95debe..00000000 --- a/code/common/process/src/main/java/nu/marginalia/util/ParallelPipe.java +++ /dev/null @@ -1,112 +0,0 @@ -package nu.marginalia.util; - -import lombok.SneakyThrows; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.ArrayList; -import java.util.List; -import java.util.concurrent.LinkedBlockingQueue; -import java.util.concurrent.TimeUnit; - -/** Generalization of the workflow
<br>
- * -- single provider thread reading sequentially from disk <br>
- * -> multiple independent CPU-bound processing tasks <br>
- * -> single consumer thread writing to network/disk <br>
- * <p>
    - */ -public abstract class ParallelPipe { - private final LinkedBlockingQueue inputs; - private final LinkedBlockingQueue intermediates; - - private final Logger logger = LoggerFactory.getLogger(getClass()); - - private final List processThreads = new ArrayList<>(); - private final Thread receiverThread; - - private volatile boolean expectingInput = true; - private volatile boolean expectingOutput = true; - - public ParallelPipe(String name, int numberOfThreads, int inputQueueSize, int intermediateQueueSize) { - inputs = new LinkedBlockingQueue<>(inputQueueSize); - intermediates = new LinkedBlockingQueue<>(intermediateQueueSize); - - for (int i = 0; i < numberOfThreads; i++) { - processThreads.add(new Thread(this::runProcessThread, name + "-process["+i+"]")); - } - receiverThread = new Thread(this::runReceiverThread, name + "-receiver"); - - processThreads.forEach(Thread::start); - receiverThread.start(); - } - - public void clearQueues() { - inputs.clear(); - intermediates.clear(); - } - - @SneakyThrows - private void runProcessThread() { - while (expectingInput || !inputs.isEmpty()) { - var in = inputs.poll(10, TimeUnit.SECONDS); - - if (in != null) { - try { - var ret = onProcess(in); - if (ret != null) { - intermediates.put(ret); - } - } - catch (InterruptedException ex) { - throw ex; - } - catch (Exception ex) { - logger.error("Exception", ex); - } - - } - } - - logger.info("Terminating {}", Thread.currentThread().getName()); - } - - @SneakyThrows - private void runReceiverThread() { - while (expectingOutput || !inputs.isEmpty() || !intermediates.isEmpty()) { - var intermediate = intermediates.poll(997, TimeUnit.MILLISECONDS); - if (intermediate != null) { - try { - onReceive(intermediate); - } - catch (Exception ex) { - logger.error("Exception", ex); - } - } - } - - logger.info("Terminating {}", Thread.currentThread().getName()); - } - - /** Begin processing an item */ - @SneakyThrows - public void accept(INPUT input) { - inputs.put(input); - } - - /** 
The meat of the processor thread runtime */ - protected abstract INTERMEDIATE onProcess(INPUT input) throws Exception; - - /** The meat of the consumer thread runtime */ - protected abstract void onReceive(INTERMEDIATE intermediate) throws Exception; - - public void join() throws InterruptedException { - expectingInput = false; - - for (var thread : processThreads) { - thread.join(); - } - - expectingOutput = false; - receiverThread.join(); - } -} diff --git a/code/common/process/src/test/java/nu/marginalia/process/log/WorkLogTest.java b/code/common/process/src/test/java/nu/marginalia/process/log/WorkLogTest.java new file mode 100644 index 00000000..8ed7e68f --- /dev/null +++ b/code/common/process/src/test/java/nu/marginalia/process/log/WorkLogTest.java @@ -0,0 +1,96 @@ +package nu.marginalia.process.log; + +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.HashMap; +import java.util.Map; + +import static org.junit.jupiter.api.Assertions.*; + +class WorkLogTest { + + private Path logFile; + @BeforeEach + public void setUp() throws IOException { + logFile = Files.createTempFile("worklog", ".log"); + } + + @AfterEach + public void tearDown() throws IOException { + Files.deleteIfExists(logFile); + } + + + @Test + public void testLog() throws IOException { + var log = new WorkLog(logFile); + log.setJobToFinished("A", "a.txt",1); + log.setJobToFinished("B", "b.txt",2); + log.setJobToFinished("C", "c.txt",3); + assertTrue(log.isJobFinished("A")); + assertTrue(log.isJobFinished("B")); + assertTrue(log.isJobFinished("C")); + assertFalse(log.isJobFinished("E")); + } + + @Test + public void testLogResume() throws Exception { + WorkLog log = new WorkLog(logFile); + log.setJobToFinished("A", "a.txt",1); + log.setJobToFinished("B", "b.txt",2); + log.setJobToFinished("C", "c.txt",3); + log.close(); + log = 
new WorkLog(logFile); + log.setJobToFinished("E", "e.txt",4); + assertTrue(log.isJobFinished("A")); + assertTrue(log.isJobFinished("B")); + assertTrue(log.isJobFinished("C")); + assertTrue(log.isJobFinished("E")); + log.close(); + + Files.readAllLines(logFile).forEach(System.out::println); + } + + @Test + public void test() { + try (var workLog = new WorkLog(logFile)) { + workLog.setJobToFinished("test", "loc1", 4); + workLog.setJobToFinished("test2", "loc2", 5); + workLog.setJobToFinished("test3", "loc3", 1); + } catch (Exception e) { + e.printStackTrace(); + fail(); + } + + try (var workLog = new WorkLog(logFile)) { + workLog.setJobToFinished("test4", "loc4", 0); + + assertTrue(workLog.isJobFinished("test")); + assertTrue(workLog.isJobFinished("test2")); + assertTrue(workLog.isJobFinished("test3")); + assertTrue(workLog.isJobFinished("test4")); + assertFalse(workLog.isJobFinished("test5")); + } + catch (Exception e) { + e.printStackTrace(); + fail(); + } + + + Map entriesById = new HashMap<>(); + WorkLog.iterable(logFile).forEach(e -> entriesById.put(e.id(), e)); + + assertEquals(4, entriesById.size()); + + assertEquals("loc1", entriesById.get("test").path()); + assertEquals("loc2", entriesById.get("test2").path()); + assertEquals("loc3", entriesById.get("test3").path()); + assertEquals("loc4", entriesById.get("test4").path()); + + } +} \ No newline at end of file diff --git a/code/common/service-client/src/main/java/nu/marginalia/client/AbortingScheduler.java b/code/common/service-client/src/main/java/nu/marginalia/client/AbortingScheduler.java index 2ec196e6..f190bfe4 100644 --- a/code/common/service-client/src/main/java/nu/marginalia/client/AbortingScheduler.java +++ b/code/common/service-client/src/main/java/nu/marginalia/client/AbortingScheduler.java @@ -13,7 +13,6 @@ import java.util.concurrent.Executors; import java.util.concurrent.ThreadFactory; public class AbortingScheduler { - private final String name; private final ThreadFactory threadFactory; 
private final Logger logger = LoggerFactory.getLogger(getClass()); @@ -22,7 +21,6 @@ public class AbortingScheduler { private ExecutorService executorService; public AbortingScheduler(String name) { - this.name = name; threadFactory = new ThreadFactoryBuilder() .setNameFormat(name+"client--%d") .setUncaughtExceptionHandler(this::handleException) diff --git a/code/common/service-client/src/main/java/nu/marginalia/client/ServiceMonitors.java b/code/common/service-client/src/main/java/nu/marginalia/client/ServiceMonitors.java new file mode 100644 index 00000000..a77768be --- /dev/null +++ b/code/common/service-client/src/main/java/nu/marginalia/client/ServiceMonitors.java @@ -0,0 +1,133 @@ +package nu.marginalia.client; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.service.id.ServiceId; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.sql.SQLException; +import java.util.*; +import java.util.concurrent.TimeUnit; + +@Singleton +public class ServiceMonitors { + private final HikariDataSource dataSource; + private final Logger logger = LoggerFactory.getLogger(getClass()); + + private final Set runningServices = new HashSet<>(); + private final Set callbacks = new HashSet<>(); + + + private final int heartbeatInterval = Integer.getInteger("mcp.heartbeat.interval", 5); + + private volatile boolean running; + + @Inject + public ServiceMonitors(HikariDataSource dataSource) { + this.dataSource = dataSource; + + var runThread = new Thread(this::run, "service monitor"); + runThread.setDaemon(true); + runThread.start(); + } + + public void subscribe(Runnable callback) { + synchronized (callbacks) { + callbacks.add(callback); + } + } + public void unsubscribe(Runnable callback) { + synchronized (callbacks) { + callbacks.remove(callback); + } + } + + public void run() { + if (running) { + return; + } + else { + running = true; + } + + while (running) { + if 
(updateRunningServices()) { + runCallbacks(); + } + + try { + TimeUnit.SECONDS.sleep(heartbeatInterval); + } + catch (InterruptedException ex) { + logger.warn("ServiceMonitors interrupted", ex); + running = false; + } + } + } + + private void runCallbacks() { + synchronized (callbacks) { + for (var callback : callbacks) { + synchronized (runningServices) { + callback.run(); + } + } + } + } + + private boolean updateRunningServices() { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT SERVICE_BASE, TIMESTAMPDIFF(SECOND, HEARTBEAT_TIME, CURRENT_TIMESTAMP(6)) + FROM SERVICE_HEARTBEAT + WHERE ALIVE=1 + """)) { + try (var rs = stmt.executeQuery()) { + Set newRunningServices = new HashSet<>(10); + while (rs.next()) { + String svc = rs.getString(1); + int dtime = rs.getInt(2); + if (dtime < 2.5 * heartbeatInterval) { + newRunningServices.add(svc); + } + } + + boolean changed; + + synchronized (runningServices) { + changed = !Objects.equals(runningServices, newRunningServices); + + runningServices.clear(); + runningServices.addAll(newRunningServices); + } + + return changed; + } + } + catch (SQLException ex) { + logger.warn("Failed to update running services", ex); + } + + return false; + } + + public boolean isServiceUp(ServiceId serviceId) { + synchronized (runningServices) { + return runningServices.contains(serviceId.name); + } + } + /** Returns the ids of the services currently recorded as running; names that ServiceId.byName cannot resolve are omitted. */ + public List getRunningServices() { + List ret = new ArrayList<>(ServiceId.values().length); + + synchronized (runningServices) { + for (var runningService : runningServices) { + Optional.ofNullable(ServiceId.byName(runningService)).ifPresent(ret::add); // byName returns null for unknown names; never hand null elements to callers + } + } + + return ret; + } +} diff --git a/code/common/service-discovery/src/main/java/nu/marginalia/service/SearchServiceDescriptors.java b/code/common/service-discovery/src/main/java/nu/marginalia/service/SearchServiceDescriptors.java index a1f2bf13..c082bedb 100644 --- a/code/common/service-discovery/src/main/java/nu/marginalia/service/SearchServiceDescriptors.java +++ 
b/code/common/service-discovery/src/main/java/nu/marginalia/service/SearchServiceDescriptors.java @@ -13,5 +13,7 @@ public class SearchServiceDescriptors { new ServiceDescriptor(ServiceId.Search, 5023), new ServiceDescriptor(ServiceId.Assistant, 5025), new ServiceDescriptor(ServiceId.Dating, 5070), - new ServiceDescriptor(ServiceId.Explorer, 5071))); + new ServiceDescriptor(ServiceId.Explorer, 5071), + new ServiceDescriptor(ServiceId.Control, 5090) + )); } diff --git a/code/common/service-discovery/src/main/java/nu/marginalia/service/id/ServiceId.java b/code/common/service-discovery/src/main/java/nu/marginalia/service/id/ServiceId.java index 92ffb4a7..ad459d36 100644 --- a/code/common/service-discovery/src/main/java/nu/marginalia/service/id/ServiceId.java +++ b/code/common/service-discovery/src/main/java/nu/marginalia/service/id/ServiceId.java @@ -7,19 +7,22 @@ public enum ServiceId { Search("search-service"), Index("index-service"), + Control("control-service"), + Dating("dating-service"), - Explorer("explorer-service"), - - Other_Auth("auth"), - Other_Memex("memex"), - - - Other_ResourceStore("resource-store"), - Other_Renderer("renderer"), - Other_PodcastScraper("podcast-scraper"); + Explorer("explorer-service"); public final String name; ServiceId(String name) { this.name = name; } + + public static ServiceId byName(String name) { + for (ServiceId id : values()) { + if (id.name.equals(name)) { + return id; + } + } + return null; + } } diff --git a/code/common/service/build.gradle b/code/common/service/build.gradle index bad65877..156b826f 100644 --- a/code/common/service/build.gradle +++ b/code/common/service/build.gradle @@ -12,6 +12,8 @@ java { dependencies { implementation project(':code:common:service-client') implementation project(':code:common:service-discovery') + implementation project(':code:common:message-queue') + implementation project(':code:common:db') implementation libs.lombok annotationProcessor libs.lombok diff --git 
a/code/common/service/readme.md b/code/common/service/readme.md index 9077e8d4..91741dc0 100644 --- a/code/common/service/readme.md +++ b/code/common/service/readme.md @@ -3,6 +3,52 @@ Contains the base classes for the services. This is where port configuration, and common endpoints are set up. +## Creating a new Service + +The minimal service needs a `MainClass` and a `Service` class. + +For proper initialization, the main class should look like this: + +```java +public class FoobarMain extends MainClass { + + @Inject + public FoobarMain(FoobarService service) {} + + public static void main(String... args) { + init(ServiceId.Foobar, args); + + Injector injector = Guice.createInjector( + new FoobarModule(), /* optional custom bindings go here */ + new DatabaseModule(), + new ConfigurationModule(SearchServiceDescriptors.descriptors, + ServiceId.Foobar)); + + injector.getInstance(FoobarMain.class); + + // set the service as ready so that delayed tasks can be started + injector.getInstance(Initialization.class).setReady(); + } +} +``` + +A service class has a boilerplate set-up that looks like this: + +```java +@Singleton +public class FoobarService extends Service { + + @Inject + public FoobarService(BaseServiceParams params) { + super(params); + + // set up Spark endpoints here + } +} +``` + +Furthermore, the new service needs to be added to the `ServiceId` enum in [service-discovery](../service-discovery), and given a `ServiceDescriptor` with its port in `SearchServiceDescriptors`, since `ConfigurationModule` looks the port up by service id. 
+ ## Central Classes * [MainClass](src/main/java/nu/marginalia/service/MainClass.java) bootstraps all executables diff --git a/code/common/service/src/main/java/nu/marginalia/service/MainClass.java b/code/common/service/src/main/java/nu/marginalia/service/MainClass.java index 26343581..c935e282 100644 --- a/code/common/service/src/main/java/nu/marginalia/service/MainClass.java +++ b/code/common/service/src/main/java/nu/marginalia/service/MainClass.java @@ -11,6 +11,9 @@ import org.slf4j.LoggerFactory; import java.net.SocketTimeoutException; import java.net.UnknownHostException; +/** Each main class of a service should extend this class. + * They must also invoke init() in their main method. + */ public abstract class MainClass { private final Logger logger = LoggerFactory.getLogger(getClass()); diff --git a/code/common/service/src/main/java/nu/marginalia/service/control/ServiceEventLog.java b/code/common/service/src/main/java/nu/marginalia/service/control/ServiceEventLog.java new file mode 100644 index 00000000..f5f6e90b --- /dev/null +++ b/code/common/service/src/main/java/nu/marginalia/service/control/ServiceEventLog.java @@ -0,0 +1,58 @@ +package nu.marginalia.service.control; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.service.module.ServiceConfiguration; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.sql.SQLException; +import java.util.Objects; +import java.util.UUID; + +@Singleton +public class ServiceEventLog { + private final HikariDataSource dataSource; + + private final Logger logger = LoggerFactory.getLogger(ServiceEventLog.class); + + private final String serviceName; + private final UUID instanceUuid; + private final String serviceBase; + + @Inject + public ServiceEventLog(HikariDataSource dataSource, + ServiceConfiguration configuration + ) { + this.dataSource = dataSource; + + this.serviceName = configuration.serviceName() + ":" + 
configuration.node(); + this.instanceUuid = configuration.instanceUuid(); + this.serviceBase = configuration.serviceName(); + + logger.info("Starting service {} instance {}", serviceName, instanceUuid); + + logEvent("START", "Service starting"); + } + + public void logEvent(String type, String message) { + + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + INSERT INTO SERVICE_EVENTLOG(SERVICE_NAME, SERVICE_BASE, INSTANCE, EVENT_TYPE, EVENT_MESSAGE) + VALUES (?, ?, ?, ?, ?) + """)) { + stmt.setString(1, serviceName); + stmt.setString(2, serviceBase); + stmt.setString(3, instanceUuid.toString()); + stmt.setString(4, type); + stmt.setString(5, Objects.requireNonNull(message, "")); + + stmt.executeUpdate(); + } + catch (SQLException ex) { + logger.error("Failed to log event {}:{}", type, message); + } + } +} diff --git a/code/common/service/src/main/java/nu/marginalia/service/control/ServiceHeartbeat.java b/code/common/service/src/main/java/nu/marginalia/service/control/ServiceHeartbeat.java new file mode 100644 index 00000000..c9c5085c --- /dev/null +++ b/code/common/service/src/main/java/nu/marginalia/service/control/ServiceHeartbeat.java @@ -0,0 +1,157 @@ +package nu.marginalia.service.control; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.service.module.ServiceConfiguration; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.sql.SQLException; +import java.util.concurrent.TimeUnit; + +/** This service sends a heartbeat to the database every 5 seconds, + * updating the control service with the liveness information for the service. 
+ */ +@Singleton +public class ServiceHeartbeat { + private final Logger logger = LoggerFactory.getLogger(ServiceHeartbeat.class); + private final String serviceName; + private final String serviceBase; + private final String instanceUUID; + private final ServiceConfiguration configuration; + private final ServiceEventLog eventLog; + private final HikariDataSource dataSource; + + + private final Thread runnerThread; + private final int heartbeatInterval = Integer.getInteger("mcp.heartbeat.interval", 5); + + private volatile boolean running = false; + + @Inject + public ServiceHeartbeat(ServiceConfiguration configuration, + ServiceEventLog eventLog, + HikariDataSource dataSource) + { + this.serviceName = configuration.serviceName() + ":" + configuration.node(); + this.serviceBase = configuration.serviceName(); + this.configuration = configuration; + this.eventLog = eventLog; + this.dataSource = dataSource; + + this.instanceUUID = configuration.instanceUuid().toString(); + + runnerThread = new Thread(this::run); + + Runtime.getRuntime().addShutdownHook(new Thread(this::shutDown)); + } + + public > ServiceTaskHeartbeat createServiceTaskHeartbeat(Class steps, String processName) { + return new ServiceTaskHeartbeat<>(steps, configuration, processName, eventLog, dataSource); + } + + + public void start() { + if (!running) { + runnerThread.start(); + } + } + + public void shutDown() { + if (!running) + return; + + running = false; + + try { + runnerThread.join(); + heartbeatStop(); + } + catch (InterruptedException|SQLException ex) { + logger.warn("ServiceHeartbeat shutdown failed", ex); + } + } + + private void run() { + if (!running) + running = true; + else + return; + + try { + heartbeatInit(); + + while (running) { + + try { + heartbeatUpdate(); + } + catch (SQLException ex) { + logger.warn("ServiceHeartbeat failed to update", ex); + } + + TimeUnit.SECONDS.sleep(heartbeatInterval); + } + } + catch (InterruptedException|SQLException ex) { + 
logger.error("ServiceHeartbeat caught irrecoverable exception, killing service", ex); + System.exit(255); + } + } + + private void heartbeatInit() throws SQLException { + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement( + """ + INSERT INTO SERVICE_HEARTBEAT (SERVICE_NAME, SERVICE_BASE, INSTANCE, HEARTBEAT_TIME, ALIVE) + VALUES (?, ?, ?, CURRENT_TIMESTAMP(6), 1) + ON DUPLICATE KEY UPDATE + INSTANCE = ?, + HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), + ALIVE = 1 + """ + )) + { + stmt.setString(1, serviceName); + stmt.setString(2, serviceBase); + stmt.setString(3, instanceUUID); + stmt.setString(4, instanceUUID); + stmt.executeUpdate(); + } + } + } + + private void heartbeatUpdate() throws SQLException { + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement( + """ + UPDATE SERVICE_HEARTBEAT + SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6) + WHERE INSTANCE = ? AND ALIVE = 1 + """) + ) + { + stmt.setString(1, instanceUUID); + stmt.executeUpdate(); + } + } + } + + private void heartbeatStop() throws SQLException { + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement( + """ + UPDATE SERVICE_HEARTBEAT + SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), ALIVE = 0 + WHERE INSTANCE = ? 
+ """) + ) + { + stmt.setString(1, instanceUUID); + stmt.executeUpdate(); + } + } + } + +} diff --git a/code/common/service/src/main/java/nu/marginalia/service/control/ServiceTaskHeartbeat.java b/code/common/service/src/main/java/nu/marginalia/service/control/ServiceTaskHeartbeat.java new file mode 100644 index 00000000..bf0d6a9f --- /dev/null +++ b/code/common/service/src/main/java/nu/marginalia/service/control/ServiceTaskHeartbeat.java @@ -0,0 +1,196 @@ +package nu.marginalia.service.control; + + +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.service.module.ServiceConfiguration; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.sql.SQLException; +import java.util.UUID; +import java.util.concurrent.TimeUnit; + +/** This object sends a heartbeat to the database every few seconds, + * updating with the progress of a task within a service. Progress is tracked by providing + * enumerations corresponding to the steps in the task. It's important they're arranged in the same + * order as the steps in the task in order to get an accurate progress tracking. 
+ */ +public class ServiceTaskHeartbeat> implements AutoCloseable { + private final Logger logger = LoggerFactory.getLogger(ServiceTaskHeartbeat.class); + private final String taskName; + private final String taskBase; + private final String instanceUUID; + private final HikariDataSource dataSource; + + + private final Thread runnerThread; + private final int heartbeatInterval = Integer.getInteger("mcp.heartbeat.interval", 1); + private final String serviceInstanceUUID; + private final int stepCount; + private final ServiceEventLog eventLog; + + private volatile boolean running = false; + private volatile int stepNum = 0; + private volatile String step = "-"; + + ServiceTaskHeartbeat(Class stepClass, + ServiceConfiguration configuration, + String taskName, + ServiceEventLog eventLog, + HikariDataSource dataSource) + { + this.eventLog = eventLog; + this.taskName = configuration.serviceName() + "." + taskName + ":" + configuration.node(); + this.taskBase = configuration.serviceName() + "." + taskName; + this.dataSource = dataSource; + + this.instanceUUID = UUID.randomUUID().toString(); + this.serviceInstanceUUID = configuration.instanceUuid().toString(); + + this.stepCount = stepClass.getEnumConstants().length; + + heartbeatInit(); + + runnerThread = new Thread(this::run); + runnerThread.start(); + } + + /** Update the progress of the task. This is a fast function that doesn't block; + * the actual update is done in a separate thread. + * + * @param step The current step in the task. 
+ */ + public void progress(T step) { + this.step = step.name(); + + + // off by one since we calculate the progress based on the number of steps, + // and Enum.ordinal() is zero-based (so the 5th step in a 5 step task is 4, not 5; resulting in the + // final progress being 80% and not 100%) + + this.stepNum = 1 + step.ordinal(); + + logger.info("ServiceTask {} progress: {}", taskBase, step.name()); + eventLog.logEvent("TASK-STEP", taskName + " = " + step.name()); + } + + public void shutDown() { + if (!running) + return; + + running = false; + + try { + runnerThread.join(); + heartbeatStop(); + } + catch (InterruptedException|SQLException ex) { + logger.warn("ServiceHeartbeat shutdown failed", ex); + } + } + + private void run() { + if (!running) + running = true; + else + return; + + try { + while (running) { + try { + heartbeatUpdate(); + } + catch (SQLException ex) { + logger.warn("ServiceHeartbeat failed to update", ex); + } + + TimeUnit.SECONDS.sleep(heartbeatInterval); + } + } + catch (InterruptedException ex) { + logger.error("ServiceHeartbeat caught irrecoverable exception, killing service", ex); + System.exit(255); + } + } + + private void heartbeatInit() { + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement( + """ + INSERT INTO TASK_HEARTBEAT (TASK_NAME, TASK_BASE, INSTANCE, SERVICE_INSTANCE, HEARTBEAT_TIME, STATUS) + VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP(6), 'STARTING') + ON DUPLICATE KEY UPDATE + INSTANCE = ?, + SERVICE_INSTANCE = ?, + HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), + STATUS = 'STARTING' + """ + )) + { + stmt.setString(1, taskName); + stmt.setString(2, taskBase); + stmt.setString(3, instanceUUID); + stmt.setString(4, serviceInstanceUUID); + stmt.setString(5, instanceUUID); + stmt.setString(6, serviceInstanceUUID); + stmt.executeUpdate(); + } + } + catch (SQLException ex) { + logger.error("ServiceHeartbeat failed to initialize", ex); + throw new RuntimeException(ex); + } + + 
eventLog.logEvent("TASK-STARTED", taskName); + } + + private void heartbeatUpdate() throws SQLException { + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement( + """ + UPDATE TASK_HEARTBEAT + SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), + STATUS = 'RUNNING', + PROGRESS = ?, + STAGE_NAME = ? + WHERE INSTANCE = ? + """) + ) + { + stmt.setInt(1, (int) Math.round(100 * stepNum / (double) stepCount)); + stmt.setString(2, step); + stmt.setString(3, instanceUUID); + stmt.executeUpdate(); + } + } + } + + private void heartbeatStop() throws SQLException { + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement( + """ + UPDATE TASK_HEARTBEAT + SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), + STATUS='STOPPED', + PROGRESS = ?, + STAGE_NAME = ? + WHERE INSTANCE = ? + """) + ) + { + stmt.setInt(1, (int) Math.round(100 * stepNum / (double) stepCount)); + stmt.setString( 2, step); + stmt.setString( 3, instanceUUID); + stmt.executeUpdate(); + } + } + eventLog.logEvent("TASK-TERMINATED", taskName); + } + + @Override + public void close() { + shutDown(); + } + +} + diff --git a/code/common/service/src/main/java/nu/marginalia/service/module/ConfigurationModule.java b/code/common/service/src/main/java/nu/marginalia/service/module/ConfigurationModule.java index a0d763d0..62d1f9ce 100644 --- a/code/common/service/src/main/java/nu/marginalia/service/module/ConfigurationModule.java +++ b/code/common/service/src/main/java/nu/marginalia/service/module/ConfigurationModule.java @@ -8,9 +8,9 @@ import nu.marginalia.service.descriptor.ServiceDescriptors; import nu.marginalia.service.id.ServiceId; import java.util.Objects; +import java.util.UUID; public class ConfigurationModule extends AbstractModule { - private static final String SERVICE_NAME = System.getProperty("service-name"); private final ServiceDescriptors descriptors; private final ServiceId id; @@ -21,15 +21,13 @@ public class ConfigurationModule 
extends AbstractModule { public void configure() { bind(ServiceDescriptors.class).toInstance(descriptors); - bind(String.class).annotatedWith(Names.named("service-name")).toInstance(Objects.requireNonNull(SERVICE_NAME)); - bind(String.class).annotatedWith(Names.named("service-host")).toInstance(System.getProperty("service-host", "127.0.0.1")); - bind(Integer.class).annotatedWith(Names.named("service-port")).toInstance(descriptors.forId(id).port); - } - @Provides - @Named("metrics-server-port") - public Integer provideMetricsServerPort(@Named("service-port") Integer servicePort) { - return servicePort + 1000; + int basePort = descriptors.forId(id).port; + int prometheusPort = basePort + 1000; + String host = Objects.requireNonNull(System.getProperty("service-host", "127.0.0.1")); + var configObject = new ServiceConfiguration(id, 0, host, basePort, prometheusPort, UUID.randomUUID()); + + bind(ServiceConfiguration.class).toInstance(configObject); } } diff --git a/code/common/service/src/main/java/nu/marginalia/service/module/DatabaseModule.java b/code/common/service/src/main/java/nu/marginalia/service/module/DatabaseModule.java index e3d660ad..70af3ed4 100644 --- a/code/common/service/src/main/java/nu/marginalia/service/module/DatabaseModule.java +++ b/code/common/service/src/main/java/nu/marginalia/service/module/DatabaseModule.java @@ -90,8 +90,8 @@ public class DatabaseModule extends AbstractModule { config.addDataSourceProperty("prepStmtCacheSize", "250"); config.addDataSourceProperty("prepStmtCacheSqlLimit", "2048"); - config.setMaximumPoolSize(100); - config.setMinimumIdle(10); + config.setMaximumPoolSize(20); + config.setMinimumIdle(2); config.setMaxLifetime(Duration.ofMinutes(9).toMillis()); diff --git a/code/common/service/src/main/java/nu/marginalia/service/module/ServiceConfiguration.java b/code/common/service/src/main/java/nu/marginalia/service/module/ServiceConfiguration.java new file mode 100644 index 00000000..df97b7b0 --- /dev/null +++ 
b/code/common/service/src/main/java/nu/marginalia/service/module/ServiceConfiguration.java @@ -0,0 +1,27 @@ +package nu.marginalia.service.module; + +import nu.marginalia.service.id.ServiceId; + +import java.util.UUID; + +/** + * Configuration object for a service. This is a guice-injectable object + * intended to keep down the amount of named bindings. + * + * @param serviceId - service descriptor + * @param node - always 0 for now, for future service partitioning + * @param host - the bind address of the service + * @param port - main port of the service + * @param metricsPort - prometheus metrics server port + * @param instanceUuid - unique identifier for this instance of the service + */ +public record ServiceConfiguration(ServiceId serviceId, + int node, + String host, + int port, + int metricsPort, + UUID instanceUuid) { + public String serviceName() { + return serviceId.name; + } +} diff --git a/code/common/service/src/main/java/nu/marginalia/service/server/BaseServiceParams.java b/code/common/service/src/main/java/nu/marginalia/service/server/BaseServiceParams.java new file mode 100644 index 00000000..73706dc8 --- /dev/null +++ b/code/common/service/src/main/java/nu/marginalia/service/server/BaseServiceParams.java @@ -0,0 +1,33 @@ +package nu.marginalia.service.server; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.mq.MessageQueueFactory; +import nu.marginalia.service.control.ServiceEventLog; +import nu.marginalia.service.control.ServiceHeartbeat; +import nu.marginalia.service.module.ServiceConfiguration; + +/** This class exists to reduce Service boilerplate */ +@Singleton +public class BaseServiceParams { + public final ServiceConfiguration configuration; + public final Initialization initialization; + public final MetricsServer metricsServer; + public final ServiceHeartbeat heartbeat; + public final ServiceEventLog eventLog; + public final MessageQueueFactory messageQueueInboxFactory; + @Inject + public 
BaseServiceParams(ServiceConfiguration configuration, + Initialization initialization, + MetricsServer metricsServer, + ServiceHeartbeat heartbeat, + ServiceEventLog eventLog, + MessageQueueFactory messageQueueInboxFactory) { + this.configuration = configuration; + this.initialization = initialization; + this.metricsServer = metricsServer; + this.heartbeat = heartbeat; + this.eventLog = eventLog; + this.messageQueueInboxFactory = messageQueueInboxFactory; + } +} diff --git a/code/common/service/src/main/java/nu/marginalia/service/server/Initialization.java b/code/common/service/src/main/java/nu/marginalia/service/server/Initialization.java index c7a857ea..e75db6fe 100644 --- a/code/common/service/src/main/java/nu/marginalia/service/server/Initialization.java +++ b/code/common/service/src/main/java/nu/marginalia/service/server/Initialization.java @@ -5,10 +5,14 @@ import lombok.SneakyThrows; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.util.ArrayList; +import java.util.List; + @Singleton public class Initialization { boolean initialized; private final Logger logger = LoggerFactory.getLogger(getClass()); + private final List callbacks = new ArrayList<>(); public static Initialization already() { Initialization init = new Initialization(); @@ -21,6 +25,27 @@ public class Initialization { logger.info("Initialized"); initialized = true; notifyAll(); + + } + + callbacks.forEach(Runnable::run); + callbacks.clear(); + } + + public void addCallback(Runnable callback) { + boolean runNow; + + synchronized (this) { + if (!initialized) { + callbacks.add(callback); + runNow = false; + } else { + runNow = true; + } + } + + if (runNow) { + callback.run(); } } diff --git a/code/common/service/src/main/java/nu/marginalia/service/server/MetricsServer.java b/code/common/service/src/main/java/nu/marginalia/service/server/MetricsServer.java index 1822b465..7dc52d9e 100644 --- a/code/common/service/src/main/java/nu/marginalia/service/server/MetricsServer.java +++ 
b/code/common/service/src/main/java/nu/marginalia/service/server/MetricsServer.java @@ -1,9 +1,9 @@ package nu.marginalia.service.server; import com.google.inject.Inject; -import com.google.inject.name.Named; import io.prometheus.client.exporter.MetricsServlet; import lombok.SneakyThrows; +import nu.marginalia.service.module.ServiceConfiguration; import org.eclipse.jetty.server.Server; import org.eclipse.jetty.servlet.ServletContextHandler; import org.eclipse.jetty.servlet.ServletHolder; @@ -12,8 +12,8 @@ public class MetricsServer { @SneakyThrows @Inject - public MetricsServer(@Named("metrics-server-port") int port) { - Server server = new Server(port); + public MetricsServer(ServiceConfiguration configuration) { + Server server = new Server(configuration.metricsPort()); ServletContextHandler context = new ServletContextHandler(); context.setContextPath("/"); server.setHandler(context); diff --git a/code/common/service/src/main/java/nu/marginalia/service/server/Service.java b/code/common/service/src/main/java/nu/marginalia/service/server/Service.java index e5c04877..4185aad6 100644 --- a/code/common/service/src/main/java/nu/marginalia/service/server/Service.java +++ b/code/common/service/src/main/java/nu/marginalia/service/server/Service.java @@ -1,9 +1,11 @@ package nu.marginalia.service.server; -import com.google.common.base.Strings; import io.prometheus.client.Counter; import nu.marginalia.client.Context; import nu.marginalia.client.exception.MessagingException; +import nu.marginalia.mq.inbox.*; +import nu.marginalia.service.server.mq.MqRequest; +import nu.marginalia.service.server.mq.ServiceMqSubscription; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.slf4j.Marker; @@ -35,22 +37,39 @@ public class Service { .labelNames("service") .register(); private final String serviceName; - private static volatile boolean initialized = false; - public Service(String ip, int port, Initialization initialization, MetricsServer metricsServer, Runnable 
configureStaticFiles) { - this.initialization = initialization; + protected final MqInboxIf messageQueueInbox; + + public Service(BaseServiceParams params, + Runnable configureStaticFiles + ) { + this.initialization = params.initialization; + var config = params.configuration; + + String inboxName = config.serviceName() + ":" + config.node(); + logger.info("Inbox name: {}", inboxName); + + var mqInboxFactory = params.messageQueueInboxFactory; + messageQueueInbox = mqInboxFactory.createAsynchronousInbox(inboxName, config.instanceUuid()); + messageQueueInbox.subscribe(new ServiceMqSubscription(this)); serviceName = System.getProperty("service-name"); + initialization.addCallback(params.heartbeat::start); + initialization.addCallback(messageQueueInbox::start); + initialization.addCallback(() -> params.eventLog.logEvent("SVC-INIT", "")); + if (!initialization.isReady() && ! initialized ) { initialized = true; Spark.threadPool(32, 4, 60_000); - Spark.ipAddress(ip); - Spark.port(port); + Spark.ipAddress(params.configuration.host()); + Spark.port(params.configuration.port()); - logger.info("{} Listening to {}:{}", getClass().getSimpleName(), ip == null ? 
"" : ip, port); + logger.info("{} Listening to {}:{}", getClass().getSimpleName(), + params.configuration.host(), + params.configuration.port()); configureStaticFiles.run(); @@ -66,8 +85,8 @@ public class Service { } } - public Service(String ip, int port, Initialization initialization, MetricsServer metricsServer) { - this(ip, port, initialization, metricsServer, () -> { + public Service(BaseServiceParams params) { + this(params, () -> { // configureStaticFiles can't be an overridable method in Service because it may // need to depend on parameters to the constructor, and super-constructors // must run first @@ -76,6 +95,16 @@ public class Service { }); } + @MqRequest(endpoint = "SVC-READY") + public boolean mqIsReady() { + return initialization.isReady(); + } + + @MqRequest(endpoint = "SVC-PING") + public String mqPing() { + return "pong"; + } + private void filterPublicRequests(Request request, Response response) { if (null == request.headers("X-Public")) { return; @@ -90,11 +119,7 @@ public class Service { Spark.halt(403); } - String url = request.pathInfo(); - if (request.queryString() != null) { - url = url + "?" 
+ request.queryString(); - } - logger.info(httpMarker, "PUBLIC {}: {} {}", Context.fromRequest(request).getContextId(), request.requestMethod(), url); + logRequest(request); } private Object isInitialized(Request request, Response response) { @@ -139,9 +164,8 @@ public class Service { request_counter_bad.labels(serviceName).inc(); } - if (null != request.headers("X-Public")) { - logger.info(httpMarker, "RSP {}", response.status()); - } + logResponse(request, response); + } private void paintThreadName(Request request, String prefix) { @@ -149,7 +173,7 @@ public class Service { Thread.currentThread().setName(prefix + ctx.getContextId()); } - private void handleException(Exception ex, Request request, Response response) { + protected void handleException(Exception ex, Request request, Response response) { request_counter_err.labels(serviceName).inc(); if (ex instanceof MessagingException) { logger.error("{} {}", ex.getClass().getSimpleName(), ex.getMessage()); @@ -159,4 +183,21 @@ public class Service { } } + /** Log the request on the HTTP log */ + protected void logRequest(Request request) { + String url = request.pathInfo(); + if (request.queryString() != null) { + url = url + "?" 
+ request.queryString(); + } + + logger.info(httpMarker, "PUBLIC {}: {} {}", Context.fromRequest(request).getContextId(), request.requestMethod(), url); + } + + /** Log the response on the HTTP log */ + protected void logResponse(Request request, Response response) { + if (null != request.headers("X-Public")) { + logger.info(httpMarker, "RSP {}", response.status()); + } + } + } diff --git a/code/common/service/src/main/java/nu/marginalia/service/server/mq/MqNotification.java b/code/common/service/src/main/java/nu/marginalia/service/server/mq/MqNotification.java new file mode 100644 index 00000000..20586f3e --- /dev/null +++ b/code/common/service/src/main/java/nu/marginalia/service/server/mq/MqNotification.java @@ -0,0 +1,9 @@ +package nu.marginalia.service.server.mq; + +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; + +@Retention(RetentionPolicy.RUNTIME) +public @interface MqNotification { + String endpoint(); +} diff --git a/code/common/service/src/main/java/nu/marginalia/service/server/mq/MqRequest.java b/code/common/service/src/main/java/nu/marginalia/service/server/mq/MqRequest.java new file mode 100644 index 00000000..60b7ebd8 --- /dev/null +++ b/code/common/service/src/main/java/nu/marginalia/service/server/mq/MqRequest.java @@ -0,0 +1,9 @@ +package nu.marginalia.service.server.mq; + +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; + +@Retention(RetentionPolicy.RUNTIME) +public @interface MqRequest { + String endpoint(); +} diff --git a/code/common/service/src/main/java/nu/marginalia/service/server/mq/ServiceMqSubscription.java b/code/common/service/src/main/java/nu/marginalia/service/server/mq/ServiceMqSubscription.java new file mode 100644 index 00000000..61a024f5 --- /dev/null +++ b/code/common/service/src/main/java/nu/marginalia/service/server/mq/ServiceMqSubscription.java @@ -0,0 +1,85 @@ +package nu.marginalia.service.server.mq; + +import nu.marginalia.mq.MqMessage; +import 
nu.marginalia.mq.inbox.MqInboxResponse; +import nu.marginalia.mq.inbox.MqSubscription; +import nu.marginalia.service.server.Service; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; +import java.util.HashMap; +import java.util.Map; + +public class ServiceMqSubscription implements MqSubscription { + private static final Logger logger = LoggerFactory.getLogger(ServiceMqSubscription.class); + private final Map requests = new HashMap<>(); + private final Map notifications = new HashMap<>(); + private final Service service; + + + public ServiceMqSubscription(Service service) { + this.service = service; + + /* Wire up all methods annotated with @MqRequest and @MqNotification + * to receive corresponding messages from this subscription */ + + for (var method : service.getClass().getMethods()) { + var annotation = method.getAnnotation(MqRequest.class); + if (annotation != null) { + requests.put(annotation.endpoint(), method); + } + } + + for (var method : service.getClass().getMethods()) { + var annotation = method.getAnnotation(MqNotification.class); + if (annotation != null) { + notifications.put(annotation.endpoint(), method); + } + } + } + + @Override + public boolean filter(MqMessage rawMessage) { + if (requests.containsKey(rawMessage.function())) { + return true; + } + if (notifications.containsKey(rawMessage.function())) { + return true; + } + + logger.warn("Received message for unknown function " + rawMessage.function()); + + return false; + } + + @Override + public MqInboxResponse onRequest(MqMessage msg) { + var method = requests.get(msg.function()); + + try { + return MqInboxResponse.ok(method.invoke(service, msg.payload()).toString()); + } + catch (InvocationTargetException ex) { + logger.error("Error invoking method " + method, ex); + return MqInboxResponse.err(ex.getCause().getMessage()); + } + catch (Exception ex) { + logger.error("Error invoking method " + 
method, ex); + return MqInboxResponse.err(ex.getMessage()); + } + } + + @Override + public void onNotification(MqMessage msg) { + var method = notifications.get(msg.function()); + + try { + method.invoke(service, msg.payload()); + } + catch (Exception ex) { + logger.error("Error invoking method " + method, ex); + } + } +} diff --git a/code/common/service/src/main/resources/log4j2.properties b/code/common/service/src/main/resources/log4j2.properties index 66d688b0..96c73ea0 100644 --- a/code/common/service/src/main/resources/log4j2.properties +++ b/code/common/service/src/main/resources/log4j2.properties @@ -4,6 +4,22 @@ appender.console.type = Console appender.console.name = LogToConsole appender.console.layout.type = PatternLayout appender.console.layout.pattern = %d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1} -- %msg{nolookups}%n +appender.console.filter.process.type = MarkerFilter +appender.console.filter.process.onMismatch=ACCEPT +appender.console.filter.process.onMatch=DENY +appender.console.filter.process.marker=PROCESS +appender.console.filter.http.type = MarkerFilter +appender.console.filter.http.onMismatch=ACCEPT +appender.console.filter.http.onMatch=DENY +appender.console.filter.http.marker=HTTP +appender.processconsole.type = Console +appender.processconsole.name = ProcessLogToConsole +appender.processconsole.layout.type = PatternLayout +appender.processconsole.layout.pattern = %msg{nolookups}%n +appender.processconsole.filter.process.type = MarkerFilter +appender.processconsole.filter.process.onMismatch=DENY +appender.processconsole.filter.process.onMatch=ACCEPT +appender.processconsole.filter.process.marker=PROCESS appender.rolling.type = RollingFile appender.rolling.name = RollingFile appender.rolling.fileName = /var/log/wmsa/wmsa-${sys:service-name}.log @@ -23,6 +39,27 @@ appender.rolling.filter.http.type = MarkerFilter appender.rolling.filter.http.onMismatch=ACCEPT 
appender.rolling.filter.http.onMatch=DENY appender.rolling.filter.http.marker=HTTP +appender.rolling.filter.process.type = MarkerFilter +appender.rolling.filter.process.onMismatch=ACCEPT +appender.rolling.filter.process.onMatch=DENY +appender.rolling.filter.process.marker=PROCESS +appender.process.type = RollingFile +appender.process.name = ProcessFile +appender.process.fileName = /var/log/wmsa/process.log +appender.process.filePattern = /var/log/wmsa/process-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz +appender.process.layout.pattern = %msg{nolookups}%n +appender.process.layout.type = PatternLayout +appender.process.policies.type = Policies +appender.process.policies.size.type = SizeBasedTriggeringPolicy +appender.process.policies.size.size=10MB +appender.process.strategy.type = DefaultRolloverStrategy +appender.process.strategy.max = 10 +appender.process.filter.process.type = MarkerFilter +appender.process.filter.process.onMismatch=DENY +appender.process.filter.process.onMatch=ACCEPT +appender.process.filter.process.marker=PROCESS rootLogger.level = info rootLogger.appenderRef.console.ref = LogToConsole -rootLogger.appenderRef.rolling.ref = RollingFile \ No newline at end of file +rootLogger.appenderRef.processconsole.ref = ProcessLogToConsole +rootLogger.appenderRef.rolling.ref = RollingFile +rootLogger.appenderRef.process.ref = ProcessFile \ No newline at end of file diff --git a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/model/DocumentKeywords.java b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/model/DocumentKeywords.java index 55622cb8..5d611cc9 100644 --- a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/model/DocumentKeywords.java +++ b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/model/DocumentKeywords.java @@ -3,11 +3,13 @@ package nu.marginalia.keyword.model; import nu.marginalia.model.idx.WordMetadata; +import java.io.Serializable; 
import java.util.Arrays; -public record DocumentKeywords( - String[] keywords, - long[] metadata) { +public record DocumentKeywords(String[] keywords, + long[] metadata) +implements Serializable +{ @Override public String toString() { diff --git a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java index 0912d459..6ce80372 100644 --- a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java +++ b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java @@ -62,7 +62,7 @@ public class DocumentKeywordsBuilder { words.putIfAbsent(word, 0); } - public void setFlagOnMetadataForWords(WordFlags flag, Set flagWords) { + public void setFlagOnMetadataForWords(WordFlags flag, Collection flagWords) { flagWords.forEach(word -> words.mergeLong(word, flag.asBit(), (a, b) -> a|b) ); diff --git a/code/features-index/index-forward/build.gradle b/code/features-index/index-forward/build.gradle index e7a34566..299c6496 100644 --- a/code/features-index/index-forward/build.gradle +++ b/code/features-index/index-forward/build.gradle @@ -18,6 +18,7 @@ dependencies { implementation project(':code:features-index:index-journal') implementation project(':code:features-index:lexicon') implementation project(':code:common:model') + implementation project(':code:common:service') implementation project(':third-party:uppend') diff --git a/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexConverter.java b/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexConverter.java index a82d2ea6..07a966f8 100644 --- a/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexConverter.java +++ 
b/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexConverter.java @@ -7,6 +7,7 @@ import nu.marginalia.array.LongArray; import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.ranking.DomainRankings; +import nu.marginalia.service.control.ServiceHeartbeat; import org.roaringbitmap.IntConsumer; import org.roaringbitmap.RoaringBitmap; import org.slf4j.Logger; @@ -19,6 +20,7 @@ import java.nio.file.Path; public class ForwardIndexConverter { + private final ServiceHeartbeat heartbeat; private final File inputFile; private final Logger logger = LoggerFactory.getLogger(getClass()); @@ -28,18 +30,27 @@ public class ForwardIndexConverter { private final DomainRankings domainRankings; - public ForwardIndexConverter( + public ForwardIndexConverter(ServiceHeartbeat heartbeat, File inputFile, Path outputFileDocsId, Path outputFileDocsData, DomainRankings domainRankings ) { + this.heartbeat = heartbeat; this.inputFile = inputFile; this.outputFileDocsId = outputFileDocsId; this.outputFileDocsData = outputFileDocsData; this.domainRankings = domainRankings; } + public enum TaskSteps { + GET_DOC_IDS, + GATHER_OFFSETS, + SUPPLEMENTAL_INDEXES, + FORCE, + FINISHED + } + public void convert() throws IOException { deleteOldFiles(); @@ -53,18 +64,21 @@ public class ForwardIndexConverter { logger.info("Domain Rankings size = {}", domainRankings.size()); - try { + try (var progress = heartbeat.createServiceTaskHeartbeat(TaskSteps.class, "forwardIndexConverter")) { + progress.progress(TaskSteps.GET_DOC_IDS); + LongArray docsFileId = getDocIds(outputFileDocsId, journalReader); + progress.progress(TaskSteps.GATHER_OFFSETS); + // doc ids -> sorted list of ids - logger.info("Gathering Offsets"); Long2IntOpenHashMap docIdToIdx = new Long2IntOpenHashMap((int) docsFileId.size()); docsFileId.forEach(0, docsFileId.size(), (pos, val) -> docIdToIdx.put(val, (int) pos)); 
- // docIdToIdx -> file offset for id + progress.progress(TaskSteps.SUPPLEMENTAL_INDEXES); - logger.info("Creating Supplementary Indexes"); + // docIdToIdx -> file offset for id LongArray docFileData = LongArray.mmapForWriting(outputFileDocsData, ForwardIndexParameters.ENTRY_SIZE * docsFileId.size()); @@ -78,11 +92,15 @@ public class ForwardIndexConverter { docFileData.set(entryOffset + ForwardIndexParameters.DOMAIN_OFFSET, entry.domainId()); }); + progress.progress(TaskSteps.FORCE); + docFileData.force(); docsFileId.force(); docFileData.advice(NativeIO.Advice.DontNeed); docsFileId.advice(NativeIO.Advice.DontNeed); + + progress.progress(TaskSteps.FINISHED); } catch (IOException ex) { logger.error("Failed to convert", ex); throw ex; diff --git a/code/features-index/index-forward/src/test/java/nu/marginalia/index/forward/ForwardIndexConverterTest.java b/code/features-index/index-forward/src/test/java/nu/marginalia/index/forward/ForwardIndexConverterTest.java index 8e8bc252..1c6fdf1c 100644 --- a/code/features-index/index-forward/src/test/java/nu/marginalia/index/forward/ForwardIndexConverterTest.java +++ b/code/features-index/index-forward/src/test/java/nu/marginalia/index/forward/ForwardIndexConverterTest.java @@ -1,17 +1,20 @@ package nu.marginalia.index.forward; import lombok.SneakyThrows; -import nu.marginalia.dict.OffHeapDictionaryHashMap; import nu.marginalia.index.journal.model.IndexJournalEntry; import nu.marginalia.index.journal.writer.IndexJournalWriterImpl; import nu.marginalia.index.journal.writer.IndexJournalWriter; +import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode; import nu.marginalia.ranking.DomainRankings; import nu.marginalia.lexicon.KeywordLexicon; import nu.marginalia.lexicon.journal.KeywordLexiconJournal; +import nu.marginalia.service.control.ServiceHeartbeat; +import nu.marginalia.service.control.ServiceTaskHeartbeat; import nu.marginalia.test.TestUtil; import org.junit.jupiter.api.AfterEach; import 
org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import org.mockito.Mockito; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -21,6 +24,7 @@ import java.nio.file.Path; import java.util.stream.IntStream; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.mockito.Mockito.when; class ForwardIndexConverterTest { @@ -45,7 +49,7 @@ class ForwardIndexConverterTest { dictionaryFile = Files.createTempFile("tmp", ".dict"); dictionaryFile.toFile().deleteOnExit(); - keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile())); + keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile(), KeywordLexiconJournalMode.READ_WRITE)); keywordLexicon.getOrInsert("0"); indexFile = Files.createTempFile("tmp", ".idx"); @@ -63,7 +67,6 @@ class ForwardIndexConverterTest { keywordLexicon.commitToDisk(); - writer.forceWrite(); writer.close(); @@ -98,7 +101,12 @@ class ForwardIndexConverterTest { @Test void testForwardIndex() throws IOException { - new ForwardIndexConverter(indexFile.toFile(), docsFileId, docsFileData, new DomainRankings()).convert(); + // RIP fairies + var serviceHeartbeat = Mockito.mock(ServiceHeartbeat.class); + when(serviceHeartbeat.createServiceTaskHeartbeat(Mockito.any(), Mockito.any())) + .thenReturn(Mockito.mock(ServiceTaskHeartbeat.class)); + + new ForwardIndexConverter(serviceHeartbeat, indexFile.toFile(), docsFileId, docsFileData, new DomainRankings()).convert(); var forwardReader = new ForwardIndexReader(docsFileId, docsFileData); diff --git a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryData.java b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryData.java index 423626ce..f24be823 100644 --- a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryData.java +++ 
b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryData.java @@ -8,7 +8,7 @@ import java.util.Iterator; public class IndexJournalEntryData implements Iterable { private final int size; - private final long[] underlyingArray; + public final long[] underlyingArray; public static final int MAX_LENGTH = 1000; public static final int ENTRY_SIZE = 2; @@ -23,11 +23,6 @@ public class IndexJournalEntryData implements Iterable= size) throw new ArrayIndexOutOfBoundsException(); @@ -37,7 +32,6 @@ public class IndexJournalEntryData implements Iterable 0 && i < entry.size()) { + dataBuffer.putLong(entry.underlyingArray[i++]); + } + } numEntries++; } - @Override - public void forceWrite() throws IOException { - outputStream.flush(); - - try (var raf = new RandomAccessFile(outputFile.toFile(), "rws")) { - raf.writeLong(numEntries); - raf.writeLong(lexicon.size()); - } - } - - @Override - public void flushWords() { - lexicon.commitToDisk(); - } - public void close() throws IOException { - forceWrite(); + dataBuffer.flip(); + compressingStream.compress(dataBuffer); + dataBuffer.clear(); + compressingStream.flush(); + compressingStream.close(); - outputStream.close(); + + // Finalize the file by writing a header + + ByteBuffer header = ByteBuffer.allocate(16); + header.putLong(numEntries); + header.putLong(lexicon.size()); + header.flip(); + + while (header.position() < header.limit()) { + fileChannel.write(header, header.position()); + } + + fileChannel.close(); } } diff --git a/code/features-index/index-journal/src/test/java/nu/marginalia/index/journal/IndexJournalTest.java b/code/features-index/index-journal/src/test/java/nu/marginalia/index/journal/IndexJournalTest.java index 67b23dee..9cb96781 100644 --- a/code/features-index/index-journal/src/test/java/nu/marginalia/index/journal/IndexJournalTest.java +++ b/code/features-index/index-journal/src/test/java/nu/marginalia/index/journal/IndexJournalTest.java @@ -41,7 +41,6 @@ public 
class IndexJournalTest { .add(5, 5) .add(6, 6) .build()); - journalWriter.forceWrite(); journalWriter.close(); reader = new IndexJournalReaderSingleCompressedFile(tempFile); diff --git a/code/features-index/index-reverse/build.gradle b/code/features-index/index-reverse/build.gradle index 3ef67762..d2e3b233 100644 --- a/code/features-index/index-reverse/build.gradle +++ b/code/features-index/index-reverse/build.gradle @@ -20,6 +20,7 @@ dependencies { implementation project(':code:features-index:index-journal') implementation project(':code:features-index:lexicon') implementation project(':code:common:model') + implementation project(':code:common:service') implementation libs.lombok annotationProcessor libs.lombok diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullConverter.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullConverter.java index 339e1c39..f2e3f91b 100644 --- a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullConverter.java +++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullConverter.java @@ -21,11 +21,14 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.StandardOpenOption; +import nu.marginalia.service.control.ServiceHeartbeat; + import static nu.marginalia.index.full.ReverseIndexFullParameters.bTreeContext; public class ReverseIndexFullConverter { private static final int RWF_BIN_SIZE = 10_000_000; + private final ServiceHeartbeat heartbeat; private final Path tmpFileDir; private final Logger logger = LoggerFactory.getLogger(getClass()); @@ -36,11 +39,13 @@ public class ReverseIndexFullConverter { private final Path outputFileDocs; private final SortingContext sortingContext; - public ReverseIndexFullConverter(Path tmpFileDir, + public ReverseIndexFullConverter(ServiceHeartbeat heartbeat, + Path tmpFileDir, IndexJournalReader journalReader, 
DomainRankings domainRankings, Path outputFileWords, Path outputFileDocs) { + this.heartbeat = heartbeat; this.tmpFileDir = tmpFileDir; this.journalReader = journalReader; this.domainRankings = domainRankings; @@ -49,6 +54,18 @@ public class ReverseIndexFullConverter { this.sortingContext = new SortingContext(tmpFileDir, 64_000); } + public enum TaskSteps { + ACCUMULATE_STATISTICS, + INCREMENT_OFFSETS, + COUNT_OFFSETS, + CREATE_INTERMEDIATE_DOCS, + SORT_INTERMEDIATE_DOCS, + SIZING, + FINALIZING_DOCS, + FORCE, + FINISHED, + } + public void convert() throws IOException { deleteOldFiles(); @@ -57,28 +74,32 @@ public class ReverseIndexFullConverter { return; } - final IndexJournalStatistics statistics = journalReader.getStatistics(); - final Path intermediateUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat"); + try (var progress = heartbeat.createServiceTaskHeartbeat(TaskSteps.class, "reverseIndexFullConverter")) { + progress.progress(TaskSteps.ACCUMULATE_STATISTICS); - try { + final IndexJournalStatistics statistics = journalReader.getStatistics(); final long wordsFileSize = statistics.highestWord() + 1; + progress.progress(TaskSteps.INCREMENT_OFFSETS); + logger.debug("Words file size: {}", wordsFileSize); // Create a count of how many documents has contains each word final LongArray wordsOffsets = LongArray.allocate(wordsFileSize); - logger.info("Gathering Offsets"); journalReader.forEachWordId(wordsOffsets::increment); + progress.progress(TaskSteps.COUNT_OFFSETS); + wordsOffsets.transformEach(0, wordsFileSize, new CountToOffsetTransformer(ReverseIndexFullParameters.ENTRY_SIZE)); + progress.progress(TaskSteps.CREATE_INTERMEDIATE_DOCS); + // Construct an intermediate representation of the reverse documents index try (FileChannel intermediateDocChannel = (FileChannel) Files.newByteChannel(intermediateUrlsFile, StandardOpenOption.CREATE, StandardOpenOption.READ, StandardOpenOption.WRITE)) { - logger.info("Creating Intermediate Docs File"); // Construct 
intermediate index try (RandomWriteFunnel intermediateDocumentWriteFunnel = new RandomWriteFunnel(tmpFileDir, RWF_BIN_SIZE); @@ -89,8 +110,7 @@ public class ReverseIndexFullConverter { intermediateDocumentWriteFunnel.write(intermediateDocChannel); } intermediateDocChannel.force(false); - - logger.info("Sorting Intermediate Docs File"); + progress.progress(TaskSteps.SORT_INTERMEDIATE_DOCS); // Sort each segment of the intermediate file { @@ -102,28 +122,29 @@ public class ReverseIndexFullConverter { intermediateDocs.force(); } - - logger.info("Sizing"); + progress.progress(TaskSteps.SIZING); IndexSizeEstimator sizeEstimator = new IndexSizeEstimator( ReverseIndexFullParameters.bTreeContext, ReverseIndexFullParameters.ENTRY_SIZE); wordsOffsets.fold(0, 0, wordsOffsets.size(), sizeEstimator); - - logger.info("Finalizing Docs File"); + progress.progress(TaskSteps.FINALIZING_DOCS); LongArray finalDocs = LongArray.mmapForWriting(outputFileDocs, sizeEstimator.size); // Construct the proper reverse index wordsOffsets.transformEachIO(0, wordsOffsets.size(), new ReverseIndexBTreeTransformer(finalDocs, ReverseIndexFullParameters.ENTRY_SIZE, bTreeContext, intermediateDocChannel)); wordsOffsets.write(outputFileWords); + progress.progress(TaskSteps.FORCE); + // Attempt to clean up before forcing (important disk space preservation) Files.deleteIfExists(intermediateUrlsFile); wordsOffsets.force(); finalDocs.force(); - logger.info("Done"); + + progress.progress(TaskSteps.FINISHED); } } catch (IOException ex) { diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityConverter.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityConverter.java index fbd49405..4c9cd0d0 100644 --- a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityConverter.java +++ 
b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityConverter.java @@ -12,6 +12,7 @@ import nu.marginalia.index.journal.model.IndexJournalStatistics; import nu.marginalia.index.journal.reader.IndexJournalReader; import nu.marginalia.ranking.DomainRankings; import nu.marginalia.rwf.RandomWriteFunnel; +import nu.marginalia.service.control.ServiceHeartbeat; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -21,9 +22,12 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.StandardOpenOption; +import static nu.marginalia.index.priority.ReverseIndexPriorityParameters.bTreeContext; + public class ReverseIndexPriorityConverter { private static final int RWF_BIN_SIZE = 10_000_000; + private final ServiceHeartbeat heartbeat; private final Path tmpFileDir; private final Logger logger = LoggerFactory.getLogger(getClass()); @@ -34,11 +38,13 @@ public class ReverseIndexPriorityConverter { private final Path outputFileDocs; private final SortingContext sortingContext; - public ReverseIndexPriorityConverter(Path tmpFileDir, + public ReverseIndexPriorityConverter(ServiceHeartbeat heartbeat, + Path tmpFileDir, IndexJournalReader journalReader, DomainRankings domainRankings, Path outputFileWords, Path outputFileDocs) { + this.heartbeat = heartbeat; this.tmpFileDir = tmpFileDir; this.journalReader = journalReader; this.domainRankings = domainRankings; @@ -47,6 +53,18 @@ public class ReverseIndexPriorityConverter { this.sortingContext = new SortingContext(tmpFileDir, 64_000); } + public enum TaskSteps { + ACCUMULATE_STATISTICS, + INCREMENT_OFFSETS, + COUNT_OFFSETS, + CREATE_INTERMEDIATE_DOCS, + SORT_INTERMEDIATE_DOCS, + SIZING, + FINALIZING_DOCS, + FORCE, + FINISHED, + } + public void convert() throws IOException { deleteOldFiles(); @@ -55,28 +73,32 @@ public class ReverseIndexPriorityConverter { return; } - final IndexJournalStatistics statistics = journalReader.getStatistics(); - final Path 
intermediateUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat"); + try (var progress = heartbeat.createServiceTaskHeartbeat(TaskSteps.class, "reverseIndexPriorityConverter")) { + progress.progress(TaskSteps.ACCUMULATE_STATISTICS); - try { + final IndexJournalStatistics statistics = journalReader.getStatistics(); final long wordsFileSize = statistics.highestWord() + 1; + progress.progress(TaskSteps.INCREMENT_OFFSETS); + logger.debug("Words file size: {}", wordsFileSize); // Create a count of how many documents has contains each word final LongArray wordsOffsets = LongArray.allocate(wordsFileSize); - logger.info("Gathering Offsets"); journalReader.forEachWordId(wordsOffsets::increment); + progress.progress(TaskSteps.COUNT_OFFSETS); + wordsOffsets.transformEach(0, wordsFileSize, new CountToOffsetTransformer(ReverseIndexPriorityParameters.ENTRY_SIZE)); + progress.progress(TaskSteps.CREATE_INTERMEDIATE_DOCS); + // Construct an intermediate representation of the reverse documents index try (FileChannel intermediateDocChannel = (FileChannel) Files.newByteChannel(intermediateUrlsFile, StandardOpenOption.CREATE, StandardOpenOption.READ, StandardOpenOption.WRITE)) { - logger.info("Creating Intermediate Docs File"); // Construct intermediate index try (RandomWriteFunnel intermediateDocumentWriteFunnel = new RandomWriteFunnel(tmpFileDir, RWF_BIN_SIZE); @@ -87,8 +109,7 @@ public class ReverseIndexPriorityConverter { intermediateDocumentWriteFunnel.write(intermediateDocChannel); } intermediateDocChannel.force(false); - - logger.info("Sorting Intermediate Docs File"); + progress.progress(TaskSteps.SORT_INTERMEDIATE_DOCS); // Sort each segment of the intermediate file { @@ -100,32 +121,29 @@ public class ReverseIndexPriorityConverter { intermediateDocs.force(); } + progress.progress(TaskSteps.SIZING); - logger.info("Sizing"); - - IndexSizeEstimator indexSizeEstimator = new IndexSizeEstimator( - ReverseIndexPriorityParameters.bTreeContext, + IndexSizeEstimator 
sizeEstimator = new IndexSizeEstimator( + bTreeContext, ReverseIndexPriorityParameters.ENTRY_SIZE); - wordsOffsets.fold(0, 0, wordsOffsets.size(), indexSizeEstimator); + wordsOffsets.fold(0, 0, wordsOffsets.size(), sizeEstimator); + progress.progress(TaskSteps.FINALIZING_DOCS); - logger.info("Finalizing Docs File"); - - LongArray finalDocs = LongArray.mmapForWriting(outputFileDocs, indexSizeEstimator.size); + LongArray finalDocs = LongArray.mmapForWriting(outputFileDocs, sizeEstimator.size); // Construct the proper reverse index - wordsOffsets.transformEachIO(0, wordsOffsets.size(), - new ReverseIndexBTreeTransformer(finalDocs, - ReverseIndexPriorityParameters.ENTRY_SIZE, - ReverseIndexPriorityParameters.bTreeContext, - intermediateDocChannel)); + wordsOffsets.transformEachIO(0, wordsOffsets.size(), new ReverseIndexBTreeTransformer(finalDocs, ReverseIndexPriorityParameters.ENTRY_SIZE, bTreeContext, intermediateDocChannel)); wordsOffsets.write(outputFileWords); + progress.progress(TaskSteps.FORCE); + // Attempt to clean up before forcing (important disk space preservation) Files.deleteIfExists(intermediateUrlsFile); wordsOffsets.force(); finalDocs.force(); - logger.info("Done"); + + progress.progress(TaskSteps.FINISHED); } } catch (IOException ex) { diff --git a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest.java b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest.java index a61f2a91..7644d019 100644 --- a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest.java +++ b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest.java @@ -8,13 +8,17 @@ import nu.marginalia.index.journal.model.IndexJournalEntry; import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile; import nu.marginalia.index.journal.writer.IndexJournalWriterImpl; 
import nu.marginalia.index.journal.writer.IndexJournalWriter; +import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode; import nu.marginalia.ranking.DomainRankings; import nu.marginalia.lexicon.KeywordLexicon; import nu.marginalia.lexicon.journal.KeywordLexiconJournal; import nu.marginalia.model.idx.DocumentMetadata; +import nu.marginalia.service.control.ServiceHeartbeat; +import nu.marginalia.service.control.ServiceTaskHeartbeat; import nu.marginalia.test.TestUtil; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import org.mockito.Mockito; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -25,6 +29,7 @@ import java.util.stream.IntStream; import java.util.stream.LongStream; import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.mockito.Mockito.when; class ReverseIndexFullConverterTest { KeywordLexicon keywordLexicon; @@ -42,7 +47,7 @@ class ReverseIndexFullConverterTest { dictionaryFile = Files.createTempFile("tmp", ".dict"); dictionaryFile.toFile().deleteOnExit(); - keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile())); + keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile(), KeywordLexiconJournalMode.READ_WRITE)); keywordLexicon.getOrInsert("0"); indexFile = Files.createTempFile("tmp", ".idx"); @@ -83,7 +88,14 @@ class ReverseIndexFullConverterTest { var docsFile = dataDir.resolve("docs.dat"); var journalReader = new IndexJournalReaderSingleCompressedFile(indexFile); - new ReverseIndexFullConverter(tmpDir, journalReader, new DomainRankings(), wordsFile, docsFile) + // RIP fairies + var serviceHeartbeat = Mockito.mock(ServiceHeartbeat.class); + when(serviceHeartbeat.createServiceTaskHeartbeat(Mockito.any(), Mockito.any())) + .thenReturn(Mockito.mock(ServiceTaskHeartbeat.class)); + + new ReverseIndexFullConverter( + serviceHeartbeat, + tmpDir, journalReader, new DomainRankings(), wordsFile, docsFile) .convert(); var 
reverseIndexReader = new ReverseIndexFullReader(wordsFile, docsFile); diff --git a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest2.java b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest2.java index 5ce603c1..e4c7b7e4 100644 --- a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest2.java +++ b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest2.java @@ -10,13 +10,17 @@ import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile import nu.marginalia.index.journal.writer.IndexJournalWriterImpl; import nu.marginalia.index.journal.writer.IndexJournalWriter; import nu.marginalia.index.priority.ReverseIndexPriorityParameters; +import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode; import nu.marginalia.ranking.DomainRankings; import nu.marginalia.lexicon.KeywordLexicon; import nu.marginalia.lexicon.journal.KeywordLexiconJournal; +import nu.marginalia.service.control.ServiceHeartbeat; +import nu.marginalia.service.control.ServiceTaskHeartbeat; import nu.marginalia.test.TestUtil; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import org.mockito.Mockito; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -27,6 +31,8 @@ import java.util.Arrays; import java.util.stream.IntStream; import java.util.stream.LongStream; +import static org.mockito.Mockito.when; + class ReverseIndexFullConverterTest2 { KeywordLexicon keywordLexicon; @@ -52,7 +58,7 @@ class ReverseIndexFullConverterTest2 { dictionaryFile = Files.createTempFile("tmp", ".dict"); dictionaryFile.toFile().deleteOnExit(); - keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile())); + keywordLexicon = new KeywordLexicon(new 
KeywordLexiconJournal(dictionaryFile.toFile(), KeywordLexiconJournalMode.READ_WRITE)); keywordLexicon.getOrInsert("0"); indexFile = Files.createTempFile("tmp", ".idx"); @@ -75,7 +81,7 @@ class ReverseIndexFullConverterTest2 { keywordLexicon.commitToDisk(); Thread.sleep(1000); - writer.forceWrite(); + writer.close(); var reader = new IndexJournalReaderSingleCompressedFile(indexFile); @@ -116,7 +122,12 @@ class ReverseIndexFullConverterTest2 { Path tmpDir = Path.of("/tmp"); - new ReverseIndexFullConverter(tmpDir, new IndexJournalReaderSingleCompressedFile(indexFile), new DomainRankings(), wordsFile, docsFile).convert(); + // RIP fairies + var serviceHeartbeat = Mockito.mock(ServiceHeartbeat.class); + when(serviceHeartbeat.createServiceTaskHeartbeat(Mockito.any(), Mockito.any())) + .thenReturn(Mockito.mock(ServiceTaskHeartbeat.class)); + + new ReverseIndexFullConverter(serviceHeartbeat, tmpDir, new IndexJournalReaderSingleCompressedFile(indexFile), new DomainRankings(), wordsFile, docsFile).convert(); var reverseReader = new ReverseIndexFullReader(wordsFile, docsFile); @@ -141,7 +152,12 @@ class ReverseIndexFullConverterTest2 { Path tmpDir = Path.of("/tmp"); - new ReverseIndexFullConverter(tmpDir, new IndexJournalReaderSingleCompressedFile(indexFile, null, ReverseIndexPriorityParameters::filterPriorityRecord), new DomainRankings(), wordsFile, docsFile).convert(); + // RIP fairies + var serviceHeartbeat = Mockito.mock(ServiceHeartbeat.class); + when(serviceHeartbeat.createServiceTaskHeartbeat(Mockito.any(), Mockito.any())) + .thenReturn(Mockito.mock(ServiceTaskHeartbeat.class)); + + new ReverseIndexFullConverter(serviceHeartbeat, tmpDir, new IndexJournalReaderSingleCompressedFile(indexFile, null, ReverseIndexPriorityParameters::filterPriorityRecord), new DomainRankings(), wordsFile, docsFile).convert(); var reverseReader = new ReverseIndexFullReader(wordsFile, docsFile); diff --git 
a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexPriorityConverterTest2.java b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexPriorityConverterTest2.java index 21d6198b..dcd46e22 100644 --- a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexPriorityConverterTest2.java +++ b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexPriorityConverterTest2.java @@ -12,11 +12,15 @@ import nu.marginalia.index.priority.ReverseIndexPriorityConverter; import nu.marginalia.index.priority.ReverseIndexPriorityParameters; import nu.marginalia.lexicon.KeywordLexicon; import nu.marginalia.lexicon.journal.KeywordLexiconJournal; +import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode; import nu.marginalia.ranking.DomainRankings; +import nu.marginalia.service.control.ServiceHeartbeat; +import nu.marginalia.service.control.ServiceTaskHeartbeat; import nu.marginalia.test.TestUtil; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import org.mockito.Mockito; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -27,6 +31,8 @@ import java.util.Arrays; import java.util.stream.IntStream; import java.util.stream.LongStream; +import static org.mockito.Mockito.when; + class ReverseIndexPriorityConverterTest2 { KeywordLexicon keywordLexicon; @@ -52,7 +58,7 @@ class ReverseIndexPriorityConverterTest2 { dictionaryFile = Files.createTempFile("tmp", ".dict"); dictionaryFile.toFile().deleteOnExit(); - keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile())); + keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile(), KeywordLexiconJournalMode.READ_WRITE)); keywordLexicon.getOrInsert("0"); indexFile = Files.createTempFile("tmp", ".idx"); @@ -75,7 +81,7 @@ class ReverseIndexPriorityConverterTest2 { 
keywordLexicon.commitToDisk(); Thread.sleep(1000); - writer.forceWrite(); + writer.close(); var reader = new IndexJournalReaderSingleCompressedFile(indexFile); @@ -116,7 +122,12 @@ class ReverseIndexPriorityConverterTest2 { Path tmpDir = Path.of("/tmp"); - new ReverseIndexPriorityConverter(tmpDir, new IndexJournalReaderSingleCompressedFile(indexFile), new DomainRankings(), wordsFile, docsFile).convert(); + // RIP fairies + var serviceHeartbeat = Mockito.mock(ServiceHeartbeat.class); + when(serviceHeartbeat.createServiceTaskHeartbeat(Mockito.any(), Mockito.any())) + .thenReturn(Mockito.mock(ServiceTaskHeartbeat.class)); + + new ReverseIndexPriorityConverter(serviceHeartbeat, tmpDir, new IndexJournalReaderSingleCompressedFile(indexFile), new DomainRankings(), wordsFile, docsFile).convert(); var reverseReader = new ReverseIndexPriorityReader(wordsFile, docsFile); @@ -141,7 +152,12 @@ class ReverseIndexPriorityConverterTest2 { Path tmpDir = Path.of("/tmp"); - new ReverseIndexPriorityConverter(tmpDir, new IndexJournalReaderSingleCompressedFile(indexFile, null, ReverseIndexPriorityParameters::filterPriorityRecord), new DomainRankings(), wordsFile, docsFile).convert(); + // RIP fairies + var serviceHeartbeat = Mockito.mock(ServiceHeartbeat.class); + when(serviceHeartbeat.createServiceTaskHeartbeat(Mockito.any(), Mockito.any())) + .thenReturn(Mockito.mock(ServiceTaskHeartbeat.class)); + + new ReverseIndexPriorityConverter(serviceHeartbeat, tmpDir, new IndexJournalReaderSingleCompressedFile(indexFile, null, ReverseIndexPriorityParameters::filterPriorityRecord), new DomainRankings(), wordsFile, docsFile).convert(); var reverseReader = new ReverseIndexPriorityReader(wordsFile, docsFile); diff --git a/code/features-index/lexicon/build.gradle b/code/features-index/lexicon/build.gradle index 18da060e..131b0bf6 100644 --- a/code/features-index/lexicon/build.gradle +++ b/code/features-index/lexicon/build.gradle @@ -22,6 +22,7 @@ dependencies { implementation libs.prometheus 
implementation libs.guava implementation libs.fastutil + implementation project(':third-party:commons-codec') testImplementation libs.bundles.slf4j.test testImplementation libs.bundles.junit diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/dict/DictionaryData.java b/code/features-index/lexicon/src/main/java/nu/marginalia/dict/DictionaryData.java index 830ed4a7..ea291052 100644 --- a/code/features-index/lexicon/src/main/java/nu/marginalia/dict/DictionaryData.java +++ b/code/features-index/lexicon/src/main/java/nu/marginalia/dict/DictionaryData.java @@ -9,7 +9,6 @@ public class DictionaryData { public DictionaryData(int bankSize) { this.bankSize = bankSize; - banks.add(new DictionaryDataBank(0, bankSize)); } @@ -36,4 +35,8 @@ public class DictionaryData { return banks.get(offset/ bankSize).keyEquals(offset, otherKey); } + public void clear() { + banks.clear(); + banks.add(new DictionaryDataBank(0, bankSize)); + } } diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/dict/DictionaryMap.java b/code/features-index/lexicon/src/main/java/nu/marginalia/dict/DictionaryMap.java index dc904441..260015be 100644 --- a/code/features-index/lexicon/src/main/java/nu/marginalia/dict/DictionaryMap.java +++ b/code/features-index/lexicon/src/main/java/nu/marginalia/dict/DictionaryMap.java @@ -1,12 +1,24 @@ package nu.marginalia.dict; +/** Backing store for the KeywordLexicon, available in on and off-heap versions. + *

    + * The off-heap version is necessary when loading a lexicon that is too large to fit in RAM, due + * to Java's 2GB limit on the size of a single array. It is slower and less optimized than the on-heap version. + *

    + * The off-heap version is on the precipice of being deprecated and its use is discouraged. + */ public interface DictionaryMap { int NO_VALUE = Integer.MIN_VALUE; static DictionaryMap create() { + // Default to on-heap version + // TODO: Make this configurable + return new OnHeapDictionaryMap(); } + void clear(); + int size(); int put(long key); diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/dict/OffHeapDictionaryHashMap.java b/code/features-index/lexicon/src/main/java/nu/marginalia/dict/OffHeapDictionaryHashMap.java index e17c9c19..6a7aa07f 100644 --- a/code/features-index/lexicon/src/main/java/nu/marginalia/dict/OffHeapDictionaryHashMap.java +++ b/code/features-index/lexicon/src/main/java/nu/marginalia/dict/OffHeapDictionaryHashMap.java @@ -58,6 +58,13 @@ public class OffHeapDictionaryHashMap implements DictionaryMap { } } + @Override + public void clear() { + dictionaryData.clear(); + initializeBuffers(); + sz.set(0); + } + @Override public int size() { return sz.get(); diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/dict/OnHeapDictionaryMap.java b/code/features-index/lexicon/src/main/java/nu/marginalia/dict/OnHeapDictionaryMap.java index 3b70e7e4..96dd5d13 100644 --- a/code/features-index/lexicon/src/main/java/nu/marginalia/dict/OnHeapDictionaryMap.java +++ b/code/features-index/lexicon/src/main/java/nu/marginalia/dict/OnHeapDictionaryMap.java @@ -6,6 +6,11 @@ public class OnHeapDictionaryMap implements DictionaryMap { private static final int DEFAULT_SIZE = Integer.getInteger("lexiconSizeHint", 100_000); private final Long2IntOpenHashMap entries = new Long2IntOpenHashMap(DEFAULT_SIZE, 0.75f); + @Override + public void clear() { + entries.clear(); + } + @Override public int size() { return entries.size(); diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexicon.java b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexicon.java index 40f9d73b..84507511 
100644 --- a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexicon.java +++ b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexicon.java @@ -1,20 +1,34 @@ package nu.marginalia.lexicon; -import com.google.common.hash.HashFunction; -import com.google.common.hash.Hashing; import io.prometheus.client.Gauge; import lombok.SneakyThrows; import nu.marginalia.dict.DictionaryMap; +import nu.marginalia.hash.MurmurHash3_128; import nu.marginalia.lexicon.journal.KeywordLexiconJournal; +import nu.marginalia.lexicon.journal.KeywordLexiconJournalFingerprint; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; import java.nio.charset.StandardCharsets; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; +/** The keyword lexicon is used to map keywords to unique numeric IDs. + * This class is used to both construct the lexicon, and to read from it. + *

    + * Readers will want to use the KeywordLexiconReadOnlyView wrapper, as it + * only exposes read-only methods and hides the mutating methods. + *

    + * Between instances, the lexicon is stored in a journal file, exactly in the + * order they were received by the writer. The journal file is then replayed + * on startup to reconstruct the lexicon, giving each term an ID according to + * the order they are loaded. It is therefore important that the journal file + * is not tampered with, as this will cause the lexicon to be corrupted. + * */ + public class KeywordLexicon implements AutoCloseable { private final DictionaryMap reverseIndex; @@ -22,13 +36,16 @@ public class KeywordLexicon implements AutoCloseable { private final Logger logger = LoggerFactory.getLogger(getClass()); private static final AtomicInteger instances = new AtomicInteger(); - private final HashFunction hashFunction = Hashing.murmur3_128(); private static final Gauge request_time_metrics = Gauge.build("wmsa_edge_index_dictionary_size", "Dictionary Size") .register(); private final KeywordLexiconJournal journal; + private volatile KeywordLexiconJournalFingerprint fingerprint = null; + + private final MurmurHash3_128 hasher = new MurmurHash3_128(); + @SneakyThrows public KeywordLexicon(KeywordLexiconJournal keywordLexiconJournal) { @@ -41,15 +58,36 @@ public class KeywordLexicon implements AutoCloseable { logger.error("MULTIPLE LEXICON INSTANCES!"); } - journal.loadFile(bytes -> reverseIndex.put(hashFunction.hashBytes(bytes).padToLong())); + reload(); logger.info("Done creating dictionary writer"); } + public boolean needsReload() throws IOException { + var newFingerprint = journal.journalFingerprint(); + return !newFingerprint.equals(fingerprint); + } + + /** Reload the lexicon from the journal */ + public void reload() throws IOException { + var lock = memoryLock.writeLock(); + lock.lock(); + try { + reverseIndex.clear(); + journal.loadFile(bytes -> reverseIndex.put(hasher.hash(bytes))); + fingerprint = journal.journalFingerprint(); + } + finally { + lock.unlock(); + } + } + + /** Get method that inserts the word into the lexicon if it is 
not present */ public int getOrInsert(String macroWord) { return getOrInsert(macroWord.getBytes(StandardCharsets.UTF_8)); } + /** Get method that inserts the word into the lexicon if it is not present */ @SneakyThrows private int getOrInsert(byte[] bytes) { if (bytes.length >= Byte.MAX_VALUE) { @@ -57,7 +95,7 @@ public class KeywordLexicon implements AutoCloseable { return DictionaryMap.NO_VALUE; } - final long key = hashFunction.hashBytes(bytes).padToLong(); + final long key = hasher.hash(bytes); int idx = getReadOnly(key); @@ -89,11 +127,13 @@ public class KeywordLexicon implements AutoCloseable { } } + /** Get method that does not modify the lexicon if the word is not present */ public int getReadOnly(String word) { final byte[] bytes = word.getBytes(StandardCharsets.UTF_8); - return getReadOnly(hashFunction.hashBytes(bytes).padToLong()); + return getReadOnly(hasher.hash(bytes)); } + /** Get method that does not modify the lexicon if the word is not present */ public int getReadOnly(long hashedKey) { Lock lock = memoryLock.readLock(); try { diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexiconReadOnlyView.java b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexiconReadOnlyView.java index 9cdef151..076cc84d 100644 --- a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexiconReadOnlyView.java +++ b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexiconReadOnlyView.java @@ -3,12 +3,19 @@ package nu.marginalia.lexicon; import com.google.common.cache.Cache; import com.google.common.cache.CacheBuilder; import lombok.SneakyThrows; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.io.IOException; import java.util.concurrent.TimeUnit; +/** A read-only view of a keyword lexicon. 
+ * + * @see KeywordLexicon + * */ public class KeywordLexiconReadOnlyView { private final KeywordLexicon writer; - + private final Logger logger = LoggerFactory.getLogger(getClass()); private final Cache cache = CacheBuilder.newBuilder().maximumSize(10_000).expireAfterAccess(60, TimeUnit.SECONDS).build(); @SneakyThrows @@ -21,4 +28,15 @@ public class KeywordLexiconReadOnlyView { return cache.get(word, () -> writer.getReadOnly(word)); } + public boolean suggestReload() throws IOException { + if (writer.needsReload()) { + logger.info("Reloading lexicon"); + writer.reload(); + cache.invalidateAll(); + } + else { + logger.info("Foregoing lexicon reload"); + } + return true; + } } diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournal.java b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournal.java index 84a23247..01ba412b 100644 --- a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournal.java +++ b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournal.java @@ -5,35 +5,70 @@ import org.slf4j.LoggerFactory; import java.io.File; import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.attribute.BasicFileAttributes; import java.util.List; import java.util.function.Consumer; +/** The journal for the keyword lexicon. + * It's used both for writing the lexicon, but also for reconstructing it for reading later. 
+ */ public class KeywordLexiconJournal { private static final boolean noCommit = Boolean.getBoolean("DictionaryJournal.noCommit"); private final KeywordLexiconJournalCommitQueue commitQueue; - private final KeywordLexiconJournalFile journalFile; + private KeywordLexiconJournalFile journalFile; private final Logger logger = LoggerFactory.getLogger(getClass()); private final Thread commitToDiskThread; private volatile boolean running = true; + private final Path journalFilePath; - public KeywordLexiconJournal(File file) throws IOException { - commitQueue = new KeywordLexiconJournalCommitQueue(); - journalFile = new KeywordLexiconJournalFile(file); + /** Create a new journal. + * + * @param file The file to use for the journal. + * @param mode The mode to use for the journal. If READ_ONLY, the journal will be read-only and refuse + * to accept new entries. + */ + public KeywordLexiconJournal(File file, KeywordLexiconJournalMode mode) throws IOException { + journalFilePath = file.toPath(); - commitToDiskThread = new Thread(this::commitToDiskRunner, "CommitToDiskThread"); - commitToDiskThread.start(); + if (mode == KeywordLexiconJournalMode.READ_WRITE) { + commitQueue = new KeywordLexiconJournalCommitQueue(); + journalFile = new KeywordLexiconJournalFile(file); - Runtime.getRuntime().addShutdownHook(new Thread(this::commitToDisk)); + commitToDiskThread = new Thread(this::commitToDiskRunner, "CommitToDiskThread"); + commitToDiskThread.start(); + + Runtime.getRuntime().addShutdownHook(new Thread(this::commitToDisk)); + } + else { + journalFile = new KeywordLexiconJournalFile(file); + + commitQueue = null; + commitToDiskThread = null; + } } public void enqueue(byte[] word) throws InterruptedException { + if (null == commitQueue) + throw new UnsupportedOperationException("Lexicon journal is read-only"); + commitQueue.enqueue(word); } + public KeywordLexiconJournalFingerprint journalFingerprint() throws IOException { + var attributes = Files.readAttributes(journalFilePath, 
BasicFileAttributes.class); + + long cTime = attributes.creationTime().toMillis(); + long mTime = attributes.lastModifiedTime().toMillis(); + long size = attributes.size(); + + return new KeywordLexiconJournalFingerprint(cTime, mTime, size); + } public void commitToDiskRunner() { if (noCommit) return; @@ -57,13 +92,23 @@ public class KeywordLexiconJournal { public void close() throws Exception { logger.info("Closing Journal"); running = false; - commitToDiskThread.join(); - commitToDisk(); - journalFile.close(); + if (commitToDiskThread != null) { + commitToDiskThread.join(); + commitToDisk(); + } + + if (journalFile != null) { + journalFile.close(); + } } public void loadFile(Consumer loadJournalEntry) throws IOException { + if (journalFile != null) { + journalFile.close(); + } + + journalFile = new KeywordLexiconJournalFile(journalFilePath.toFile()); journalFile.loadFile(loadJournalEntry); } } diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalCommitQueue.java b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalCommitQueue.java index 7c6a460f..8ff12d6d 100644 --- a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalCommitQueue.java +++ b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalCommitQueue.java @@ -7,6 +7,9 @@ import java.util.ArrayList; import java.util.Collections; import java.util.List; +/** An in-memory queue for lexicon journal entries used to improve the performance of + * large bursts of insert-operations. 
+ */ class KeywordLexiconJournalCommitQueue { private final ArrayList commitQueue = new ArrayList<>(10_000); private final Logger logger = LoggerFactory.getLogger(getClass()); diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalFile.java b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalFile.java index 7473e4df..81789891 100644 --- a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalFile.java +++ b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalFile.java @@ -1,6 +1,5 @@ package nu.marginalia.lexicon.journal; -import lombok.SneakyThrows; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -27,6 +26,10 @@ public class KeywordLexiconJournalFile implements AutoCloseable { this.journalFile = journalFile; } + public void rewind() throws IOException { + journalFileRAF.seek(0); + } + public void loadFile(Consumer acceptEntry) throws IOException { if (!journalFile.exists()) { logger.info("File {} does not exist, can't load", journalFile); diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalFingerprint.java b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalFingerprint.java new file mode 100644 index 00000000..a08d7124 --- /dev/null +++ b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalFingerprint.java @@ -0,0 +1,10 @@ +package nu.marginalia.lexicon.journal; + +/** Contains values used to assess whether the lexicon is in sync with the journal + * or if it has been replaced with a newer version and should be reloaded + * */ +public record KeywordLexiconJournalFingerprint(long createdTime, + long mTime, + long sizeBytes) +{ +} diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalMode.java 
b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalMode.java new file mode 100644 index 00000000..6208fc47 --- /dev/null +++ b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalMode.java @@ -0,0 +1,6 @@ +package nu.marginalia.lexicon.journal; + +public enum KeywordLexiconJournalMode { + READ_ONLY, + READ_WRITE +} diff --git a/code/features-index/lexicon/src/test/java/nu/marginalia/lexicon/KeywordLexiconTest.java b/code/features-index/lexicon/src/test/java/nu/marginalia/lexicon/KeywordLexiconTest.java index ca044e5e..98249c27 100644 --- a/code/features-index/lexicon/src/test/java/nu/marginalia/lexicon/KeywordLexiconTest.java +++ b/code/features-index/lexicon/src/test/java/nu/marginalia/lexicon/KeywordLexiconTest.java @@ -2,6 +2,7 @@ package nu.marginalia.lexicon; import nu.marginalia.dict.OnHeapDictionaryMap; import nu.marginalia.lexicon.journal.KeywordLexiconJournal; +import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeEach; @@ -23,7 +24,7 @@ public class KeywordLexiconTest { public void setUp() throws IOException { journalFile = Files.createTempFile(getClass().getSimpleName(), ".dat"); - var lexiconJournal = new KeywordLexiconJournal(journalFile.toFile()); + var lexiconJournal = new KeywordLexiconJournal(journalFile.toFile(), KeywordLexiconJournalMode.READ_WRITE); lexicon = new KeywordLexicon(lexiconJournal); } @@ -64,7 +65,7 @@ public class KeywordLexiconTest { int c = lexicon.getOrInsert("ccc"); lexicon.commitToDisk(); - var lexiconJournal = new KeywordLexiconJournal(journalFile.toFile()); + var lexiconJournal = new KeywordLexiconJournal(journalFile.toFile(), KeywordLexiconJournalMode.READ_WRITE); try (var anotherLexicon = new KeywordLexicon(lexiconJournal)) { assertEquals(a, anotherLexicon.getReadOnly("aaa")); assertEquals(b, 
anotherLexicon.getReadOnly("bbb")); diff --git a/code/libraries/big-string/readme.md b/code/libraries/big-string/readme.md index 84fab2a2..f03c64ad 100644 --- a/code/libraries/big-string/readme.md +++ b/code/libraries/big-string/readme.md @@ -4,6 +4,10 @@ Microlibrary that offers string compression. This is useful when having to load of HTML documents in memory during conversion. XML has been described as the opposite of a compression scheme, and as a result, HTML compresses ridiculously well. +## Configuration + +If the Java property 'bigstring.disabled' is set to true, the BigString class will not compress strings. + ## Demo ```java diff --git a/code/libraries/big-string/src/main/java/nu/marginalia/bigstring/BigString.java b/code/libraries/big-string/src/main/java/nu/marginalia/bigstring/BigString.java index f1533977..55a26cd7 100644 --- a/code/libraries/big-string/src/main/java/nu/marginalia/bigstring/BigString.java +++ b/code/libraries/big-string/src/main/java/nu/marginalia/bigstring/BigString.java @@ -1,8 +1,11 @@ package nu.marginalia.bigstring; public interface BigString { + + boolean disableBigString = Boolean.getBoolean("bigstring.disabled"); + static BigString encode(String stringValue) { - if (stringValue.length() > 64) { + if (!disableBigString && stringValue.length() > 64) { return new CompressedBigString(stringValue); } else { diff --git a/code/process-models/converting-model/build.gradle b/code/process-models/converting-model/build.gradle index 1c2ef076..11426794 100644 --- a/code/process-models/converting-model/build.gradle +++ b/code/process-models/converting-model/build.gradle @@ -12,8 +12,9 @@ java { } dependencies { - implementation project(':third-party:monkey-patch-gson') + //implementation project(':third-party:monkey-patch-gson') + implementation project(':code:common:db') implementation project(':code:common:model') implementation project(':code:api:index-api') implementation project(':code:common:service-discovery') diff --git 
a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/Instruction.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/Instruction.java index 4964c9b1..b36ef217 100644 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/Instruction.java +++ b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/Instruction.java @@ -1,6 +1,8 @@ package nu.marginalia.converting.instruction; -public interface Instruction { +import java.io.Serializable; + +public interface Instruction extends Serializable { void apply(Interpreter interpreter); boolean isNoOp(); diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/Interpreter.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/Interpreter.java index 4583f31d..248ea38d 100644 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/Interpreter.java +++ b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/Interpreter.java @@ -10,18 +10,18 @@ import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument; import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError; public interface Interpreter { - void loadUrl(EdgeUrl[] url); - void loadDomain(EdgeDomain[] domain); - void loadRssFeed(EdgeUrl[] rssFeed); - void loadDomainLink(DomainLink[] links); + default void loadUrl(EdgeUrl[] url) {} + default void loadDomain(EdgeDomain[] domain) {} + default void loadRssFeed(EdgeUrl[] rssFeed) {} + default void loadDomainLink(DomainLink[] links) {} - void loadProcessedDomain(EdgeDomain domain, DomainIndexingState state, String ip); - void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument); - void loadProcessedDocumentWithError(LoadProcessedDocumentWithError 
loadProcessedDocumentWithError); + default void loadProcessedDomain(EdgeDomain domain, DomainIndexingState state, String ip) {} + default void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument) {} + default void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError) {} - void loadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words); + default void loadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) {} - void loadDomainRedirect(DomainLink link); + default void loadDomainRedirect(DomainLink link) {} - void loadDomainMetadata(EdgeDomain domain, int knownUrls, int goodUrls, int visitedUrls); + default void loadDomainMetadata(EdgeDomain domain, int knownUrls, int goodUrls, int visitedUrls) {} } diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/DomainLink.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/DomainLink.java index c33f9892..22230a37 100644 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/DomainLink.java +++ b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/DomainLink.java @@ -2,5 +2,7 @@ package nu.marginalia.converting.instruction.instructions; import nu.marginalia.model.EdgeDomain; -public record DomainLink(EdgeDomain from, EdgeDomain to) { +import java.io.Serializable; + +public record DomainLink(EdgeDomain from, EdgeDomain to) implements Serializable { } diff --git a/code/process-models/crawling-model/build.gradle b/code/process-models/crawling-model/build.gradle index 7a5e5fab..6f8d26e5 100644 --- a/code/process-models/crawling-model/build.gradle +++ b/code/process-models/crawling-model/build.gradle @@ -14,6 +14,7 @@ java { dependencies { implementation project(':code:common:model') + implementation project(':code:common:db') 
implementation project(':code:common:process') implementation project(':code:libraries:big-string') implementation project(':code:api:index-api') diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java index 9c293af7..82a8823b 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java @@ -1,17 +1,17 @@ package nu.marginalia.crawling.io; +import com.github.luben.zstd.RecyclingBufferPool; import com.github.luben.zstd.ZstdInputStream; import com.google.gson.Gson; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDomain; +import nu.marginalia.crawling.model.SerializableCrawlData; +import nu.marginalia.crawling.model.spec.CrawlingSpecification; import nu.marginalia.model.gson.GsonFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.BufferedReader; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStreamReader; +import java.io.*; import java.nio.file.Path; import java.util.ArrayList; import java.util.List; @@ -27,10 +27,21 @@ public class CrawledDomainReader { public CrawledDomainReader() { } + /** An iterator-like access to domain data This must be closed otherwise it will leak off-heap memory! */ + public SerializableCrawlDataStream createDataStream(Path fullPath) throws IOException { + return new FileReadingSerializableCrawlDataStream(gson, fullPath.toFile()); + } + + /** An iterator-like access to domain data. This must be closed otherwise it will leak off-heap memory! 
*/ + public SerializableCrawlDataStream createDataStream(Path basePath, CrawlingSpecification spec) throws IOException { + return createDataStream(CrawlerOutputFile.getOutputFile(basePath, spec.id, spec.domain)); + } + + /** Read the entirety of the domain data into memory. This uses a lot of RAM */ public CrawledDomain read(Path path) throws IOException { DomainDataAssembler domainData = new DomainDataAssembler(); - try (var br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(path.toFile()))))) { + try (var br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(path.toFile()), RecyclingBufferPool.INSTANCE)))) { String line; while ((line = br.readLine()) != null) { if (line.startsWith("//")) { @@ -64,7 +75,6 @@ public class CrawledDomainReader { return Optional.of(read(path)); } catch (Exception ex) { - logger.warn("Failed to read domain " + path, ex); return Optional.empty(); } } @@ -91,4 +101,57 @@ public class CrawledDomainReader { return domainPrototype; } } + + private static class FileReadingSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream { + private final Gson gson; + private final BufferedReader bufferedReader; + private SerializableCrawlData next = null; + + public FileReadingSerializableCrawlDataStream(Gson gson, File file) throws IOException { + this.gson = gson; + bufferedReader = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(file), RecyclingBufferPool.INSTANCE))); + } + + @Override + public SerializableCrawlData next() throws IOException { + if (hasNext()) { + var ret = next; + next = null; + return ret; + } + throw new IllegalStateException("No more data"); + } + + @Override + public boolean hasNext() throws IOException { + if (next != null) + return true; + + String identifier = bufferedReader.readLine(); + if (identifier == null) { + bufferedReader.close(); + return false; + } + String data = 
bufferedReader.readLine(); + if (data == null) { + bufferedReader.close(); + return false; + } + + if (identifier.equals(CrawledDomain.SERIAL_IDENTIFIER)) { + next = gson.fromJson(data, CrawledDomain.class); + } else if (identifier.equals(CrawledDocument.SERIAL_IDENTIFIER)) { + next = gson.fromJson(data, CrawledDocument.class); + } + else { + throw new IllegalStateException("Unknown identifier: " + identifier); + } + return true; + } + + @Override + public void close() throws Exception { + bufferedReader.close(); + } + } } diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainWriter.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainWriter.java index 51ffab18..bc83c10b 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainWriter.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainWriter.java @@ -1,12 +1,12 @@ package nu.marginalia.crawling.io; +import com.github.luben.zstd.RecyclingBufferPool; import com.github.luben.zstd.ZstdOutputStream; import com.google.gson.Gson; import lombok.SneakyThrows; import nu.marginalia.crawling.model.SerializableCrawlData; +import nu.marginalia.crawling.model.spec.CrawlingSpecification; import nu.marginalia.model.gson.GsonFactory; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import java.io.BufferedOutputStream; import java.io.IOException; @@ -14,27 +14,37 @@ import java.io.OutputStreamWriter; import java.io.Writer; import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.StandardCopyOption; +import java.nio.file.StandardOpenOption; public class CrawledDomainWriter implements AutoCloseable { private final Path outputDir; private final Gson gson = GsonFactory.get(); - private static final Logger logger = LoggerFactory.getLogger(CrawledDomainWriter.class); private final Writer writer; - private final Path outputFile; + 
private final Path tmpFile; + private final Path actualFile; - public CrawledDomainWriter(Path outputDir, String name, String id) throws IOException { + public CrawledDomainWriter(Path outputDir, CrawlingSpecification spec) throws IOException { this.outputDir = outputDir; if (!Files.isDirectory(outputDir)) { throw new IllegalArgumentException("Output dir " + outputDir + " does not exist"); } - outputFile = getOutputFile(id, name); - writer = new OutputStreamWriter(new ZstdOutputStream(new BufferedOutputStream(Files.newOutputStream(outputFile)))); + + // Do the actual writing to a temporary file first, then move it to the actual file when close() is invoked + // this lets us read the old file and compare its contents while writing the new file. It also guards against + // half-written files if the process is killed. + + tmpFile = getOutputFile(spec.id, spec.domain + "_tmp"); + actualFile = getOutputFile(spec.id, spec.domain); + writer = new OutputStreamWriter(new ZstdOutputStream(new BufferedOutputStream(Files.newOutputStream(tmpFile, + StandardOpenOption.CREATE, StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING)), + RecyclingBufferPool.INSTANCE)); } public Path getOutputFile() { - return outputFile; + return actualFile; } @SneakyThrows @@ -46,32 +56,12 @@ public class CrawledDomainWriter implements AutoCloseable { } private Path getOutputFile(String id, String name) throws IOException { - String first = id.substring(0, 2); - String second = id.substring(2, 4); - - Path destDir = outputDir.resolve(first).resolve(second); - if (!Files.exists(destDir)) { - Files.createDirectories(destDir); - } - return destDir.resolve(id + "-" + filesystemSafeName(name) + ".zstd"); - } - - private String filesystemSafeName(String name) { - StringBuilder nameSaneBuilder = new StringBuilder(); - - name.chars() - .map(Character::toLowerCase) - .map(c -> (c & ~0x7F) == 0 ? c : 'X') - .map(c -> (Character.isDigit(c) || Character.isAlphabetic(c) || c == '.') ? 
c : 'X') - .limit(128) - .forEach(c -> nameSaneBuilder.append((char) c)); - - return nameSaneBuilder.toString(); - + return CrawlerOutputFile.createOutputPath(outputDir, id, name); } @Override public void close() throws IOException { + Files.move(tmpFile, actualFile, StandardCopyOption.REPLACE_EXISTING); writer.close(); } } diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java new file mode 100644 index 00000000..6cf5857f --- /dev/null +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java @@ -0,0 +1,53 @@ +package nu.marginalia.crawling.io; + +import nu.marginalia.crawling.model.spec.CrawlingSpecification; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +public class CrawlerOutputFile { + + public static Path getOutputFile(Path base, CrawlingSpecification spec) { + return getOutputFile(base, spec.id, spec.domain); + } + + + /** Return the Path to a file for the given id and name */ + public static Path getOutputFile(Path base, String id, String name) { + String first = id.substring(0, 2); + String second = id.substring(2, 4); + + Path destDir = base.resolve(first).resolve(second); + return destDir.resolve(id + "-" + filesystemSafeName(name) + ".zstd"); + } + + /** Return the Path to a file for the given id and name, creating the prerequisite + * directory structure as necessary. 
*/ + public static Path createOutputPath(Path base, String id, String name) throws IOException { + String first = id.substring(0, 2); + String second = id.substring(2, 4); + + Path destDir = base.resolve(first).resolve(second); + if (!Files.exists(destDir)) { + Files.createDirectories(destDir); + } + return destDir.resolve(id + "-" + filesystemSafeName(name) + ".zstd"); + } + + + private static String filesystemSafeName(String name) { + StringBuilder nameSaneBuilder = new StringBuilder(); + + name.chars() + .map(Character::toLowerCase) + .map(c -> (c & ~0x7F) == 0 ? c : 'X') + .map(c -> (Character.isDigit(c) || Character.isAlphabetic(c) || c == '.') ? c : 'X') + .limit(128) + .forEach(c -> nameSaneBuilder.append((char) c)); + + return nameSaneBuilder.toString(); + + } + +} diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/SerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/SerializableCrawlDataStream.java new file mode 100644 index 00000000..3aecc0fc --- /dev/null +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/SerializableCrawlDataStream.java @@ -0,0 +1,44 @@ +package nu.marginalia.crawling.io; + +import nu.marginalia.crawling.model.SerializableCrawlData; + +import java.io.IOException; +import java.util.Iterator; + +/** Closable iterator over serialized crawl data + * The data may appear in any order, and the iterator must be closed. 
+ * + * @see CrawledDomainReader + * */ +public interface SerializableCrawlDataStream extends AutoCloseable { + + + SerializableCrawlData next() throws IOException; + + boolean hasNext() throws IOException; + + + // Dummy iterator over nothing + static SerializableCrawlDataStream empty() { + return new SerializableCrawlDataStream() { + @Override + public SerializableCrawlData next() throws IOException { throw new IllegalStateException("No more data"); } + @Override + public boolean hasNext() throws IOException { return false;} + public void close() {} + }; + } + + // for testing + static SerializableCrawlDataStream fromIterator(Iterator iterator) { + return new SerializableCrawlDataStream() { + @Override + public SerializableCrawlData next() { return iterator.next(); } + @Override + public boolean hasNext() { return iterator.hasNext(); } + public void close() {} + }; + + } + +} diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java index 004408eb..94d13235 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java @@ -21,21 +21,18 @@ public class CrawledDocument implements SerializableCrawlData { public String crawlerStatusDesc; public String headers; - public BigString documentBody; + public String documentBody; public String documentBodyHash; public String canonicalUrl; public String redirectUrl; + public String recrawlState; + public static final String SERIAL_IDENTIFIER = "// DOCUMENT"; @Override public String getSerialIdentifier() { return SERIAL_IDENTIFIER; } - /** Remove all large data from this object to save memory */ - public void dispose() { - documentBody = null; - headers = null; - } } diff --git 
a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/spec/CrawlerSpecificationLoader.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/spec/CrawlerSpecificationLoader.java index cf6fb1fb..d5d4e482 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/spec/CrawlerSpecificationLoader.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/spec/CrawlerSpecificationLoader.java @@ -1,28 +1,42 @@ package nu.marginalia.crawling.model.spec; +import com.github.luben.zstd.RecyclingBufferPool; import com.github.luben.zstd.ZstdInputStream; import com.google.gson.Gson; import com.google.gson.JsonStreamParser; +import lombok.SneakyThrows; import nu.marginalia.model.gson.GsonFactory; import java.io.BufferedReader; import java.io.FileInputStream; -import java.io.IOException; import java.io.InputStreamReader; import java.nio.file.Path; -import java.util.function.Consumer; +import java.util.Iterator; public class CrawlerSpecificationLoader { private final static Gson gson = GsonFactory.get(); - public static void readInputSpec(Path inputSpec, Consumer consumer) { - try (var inputStream = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(inputSpec.toFile()))))) { - var parser = new JsonStreamParser(inputStream); - while (parser.hasNext()) { - consumer.accept(gson.fromJson(parser.next(), CrawlingSpecification.class)); + @SneakyThrows + public static Iterable asIterable(Path inputSpec) { + var inputStream = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(inputSpec.toFile()), + RecyclingBufferPool.INSTANCE))); + var parser = new JsonStreamParser(inputStream); + + return () -> new Iterator<>() { + @Override + @SneakyThrows + public boolean hasNext() { + if (!parser.hasNext()) { + inputStream.close(); + return false; + } + return true; } - } catch (IOException e) { - e.printStackTrace(); - } + + 
@Override + public CrawlingSpecification next() { + return gson.fromJson(parser.next(), CrawlingSpecification.class); + } + }; } } diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/spec/CrawlingSpecification.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/spec/CrawlingSpecification.java index 47ecf921..718e2d7f 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/spec/CrawlingSpecification.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/spec/CrawlingSpecification.java @@ -3,10 +3,12 @@ package nu.marginalia.crawling.model.spec; import lombok.AllArgsConstructor; import lombok.Builder; import lombok.NoArgsConstructor; +import lombok.With; +import nu.marginalia.crawling.model.CrawledDomain; import java.util.List; -@AllArgsConstructor @NoArgsConstructor @Builder +@AllArgsConstructor @NoArgsConstructor @Builder @With public class CrawlingSpecification { public String id; diff --git a/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java b/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java index ff299d68..71307140 100644 --- a/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java +++ b/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java @@ -4,21 +4,17 @@ import lombok.AllArgsConstructor; import lombok.NoArgsConstructor; import lombok.ToString; import nu.marginalia.crawling.io.CrawledDomainReader; +import nu.marginalia.crawling.io.SerializableCrawlDataStream; import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.crawling.model.spec.CrawlerSpecificationLoader; import nu.marginalia.crawling.model.spec.CrawlingSpecification; import nu.marginalia.process.log.WorkLog; -import nu.marginalia.process.log.WorkLogEntry; -import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import 
java.io.FileNotFoundException; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import java.util.Iterator; -import java.util.function.Consumer; import java.util.function.Predicate; import java.util.stream.Stream; import java.util.Optional; @@ -78,88 +74,74 @@ public class CrawlPlan { return new WorkLog(process.getLogFile()); } - public void forEachCrawlingSpecification(Consumer consumer) { - CrawlerSpecificationLoader.readInputSpec(getJobSpec(), consumer); + public Iterable crawlingSpecificationIterable() { + return CrawlerSpecificationLoader.asIterable(getJobSpec()); } - public void forEachCrawlingLogEntry(Consumer consumer) throws FileNotFoundException { - WorkLog.readLog(this.crawl.getLogFile(), consumer); - } - public void forEachProcessingLogEntry(Consumer consumer) throws FileNotFoundException { - WorkLog.readLog(this.process.getLogFile(), consumer); + public int countCrawledDomains() { + int count = 0; + for (var ignored : WorkLog.iterable(crawl.getLogFile())) { + count++; + } + return count; } - public void forEachCrawledDomain(Consumer consumer) { + public Iterable domainsIterable() { final CrawledDomainReader reader = new CrawledDomainReader(); - try (Stream entryStream = WorkLog.streamLog(crawl.getLogFile())) { - entryStream - .map(WorkLogEntry::path) - .map(this::getCrawledFilePath) - .map(reader::readOptionally) - .filter(Optional::isPresent) - .map(Optional::get) - .forEach(consumer); - } - catch (IOException ex) { - logger.warn("Failed to read domains", ex); - - throw new RuntimeException(ex); - } + return WorkLog.iterableMap(crawl.getLogFile(), + entry -> { + var path = getCrawledFilePath(entry.path()); + if (!Files.exists(path)) { + logger.warn("File not found: {}", path); + return Optional.empty(); + } + return reader.readOptionally(path); + }); } - public void forEachCrawledDomain(Predicate idReadPredicate, Consumer consumer) { + + public Iterable domainsIterable(Predicate idPredicate) { final 
CrawledDomainReader reader = new CrawledDomainReader(); - try (Stream entryStream = WorkLog.streamLog(crawl.getLogFile())) { - entryStream - .filter(entry -> idReadPredicate.test(entry.id())) - .map(WorkLogEntry::path) - .map(this::getCrawledFilePath) - .filter(path -> { - if (!Files.exists(path)) { - logger.warn("File not found: {}", path); - return false; - } - return true; - }) - .map(reader::readOptionally) - .filter(Optional::isPresent) - .map(Optional::get) - .forEach(consumer); - } - catch (IOException ex) { - logger.error("Failed to read domains", ex); + return WorkLog.iterableMap(crawl.getLogFile(), + entry -> { + if (!idPredicate.test(entry.id())) { + return Optional.empty(); + } - throw new RuntimeException(ex); - } - } - public DomainsIterable domainsIterable() throws IOException { - return new DomainsIterable(); + var path = getCrawledFilePath(entry.path()); + + if (!Files.exists(path)) { + logger.warn("File not found: {}", path); + return Optional.empty(); + } + return reader.readOptionally(path); + }); } - public class DomainsIterable implements Iterable, AutoCloseable { - private final Stream stream; - DomainsIterable() throws IOException { - final CrawledDomainReader reader = new CrawledDomainReader(); + public Iterable crawlDataIterable(Predicate idPredicate) { + final CrawledDomainReader reader = new CrawledDomainReader(); - stream = WorkLog.streamLog(crawl.getLogFile()) - .map(WorkLogEntry::path) - .map(CrawlPlan.this::getCrawledFilePath) - .map(reader::readOptionally) - .filter(Optional::isPresent) - .map(Optional::get); - } + return WorkLog.iterableMap(crawl.getLogFile(), + entry -> { + if (!idPredicate.test(entry.id())) { + return Optional.empty(); + } - @Override - public void close() { - stream.close(); - } + var path = getCrawledFilePath(entry.path()); - @NotNull - @Override - public Iterator iterator() { - return stream.iterator(); - } + if (!Files.exists(path)) { + logger.warn("File not found: {}", path); + return Optional.empty(); + } + 
+ try { + return Optional.of(reader.createDataStream(path)); + } + catch (IOException ex) { + return Optional.empty(); + } + }); } } diff --git a/code/processes/converting-process/build.gradle b/code/processes/converting-process/build.gradle index 4cc4c63b..6d5ce58c 100644 --- a/code/processes/converting-process/build.gradle +++ b/code/processes/converting-process/build.gradle @@ -26,12 +26,15 @@ dependencies { implementation project(':third-party:porterstemmer') implementation project(':third-party:count-min-sketch') + implementation project(':code:api:index-api') + implementation project(':code:api:process-mqapi') implementation project(':code:common:model') implementation project(':code:common:db') implementation project(':code:common:service') implementation project(':code:common:config') + implementation project(':code:common:message-queue') implementation project(':code:common:service-discovery') implementation project(':code:common:service-client') @@ -76,11 +79,15 @@ dependencies { implementation libs.crawlercommons implementation libs.commons.lang3 + implementation libs.commons.compress + implementation libs.sqlite testImplementation libs.bundles.slf4j.test testImplementation libs.bundles.junit testImplementation libs.mockito + implementation 'org.tukaani:xz:1.8' + testImplementation project(':code:processes:test-data') testImplementation project(':code:processes:crawling-process') } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConversionLog.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConversionLog.java index 58aa8b04..10c11e21 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConversionLog.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConversionLog.java @@ -1,5 +1,6 @@ package nu.marginalia.converting; +import com.github.luben.zstd.RecyclingBufferPool; import com.github.luben.zstd.ZstdOutputStream; import 
nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.model.idx.DocumentMetadata; @@ -27,8 +28,7 @@ public class ConversionLog implements AutoCloseable, Interpreter { String fileName = String.format("conversion-log-%s.zstd", LocalDateTime.now().toEpochSecond(ZoneOffset.UTC)); Path logFile = rootDir.resolve(fileName); - writer = new PrintWriter(new ZstdOutputStream( - new BufferedOutputStream(Files.newOutputStream(logFile, StandardOpenOption.WRITE, StandardOpenOption.CREATE)))); + writer = new PrintWriter(new ZstdOutputStream(new BufferedOutputStream(Files.newOutputStream(logFile, StandardOpenOption.WRITE, StandardOpenOption.CREATE)), RecyclingBufferPool.INSTANCE)); } @Override @@ -36,35 +36,9 @@ public class ConversionLog implements AutoCloseable, Interpreter { writer.close(); } - @Override - public void loadUrl(EdgeUrl[] url) {} - - @Override - public void loadDomain(EdgeDomain[] domain) {} - - @Override - public void loadRssFeed(EdgeUrl[] rssFeed) {} - - @Override - public void loadDomainLink(DomainLink[] links) {} - - @Override - public void loadProcessedDomain(EdgeDomain domain, DomainIndexingState state, String ip) {} - - @Override - public void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument) {} - @Override public synchronized void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError) { writer.printf("%s\t%s\n", loadProcessedDocumentWithError.url(), loadProcessedDocumentWithError.reason()); } - @Override - public void loadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) {} - - @Override - public void loadDomainRedirect(DomainLink link) {} - - @Override - public void loadDomainMetadata(EdgeDomain domain, int knownUrls, int goodUrls, int visitedUrls) {} } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java index 
3ecebb80..0dfd816c 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java @@ -4,98 +4,279 @@ import com.google.gson.Gson; import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; +import nu.marginalia.converting.model.ProcessedDomain; +import nu.marginalia.converting.sideload.SideloadSource; +import nu.marginalia.converting.sideload.SideloadSourceFactory; +import nu.marginalia.db.storage.FileStorageService; +import nu.marginalia.mq.MessageQueueFactory; +import nu.marginalia.mq.MqMessage; +import nu.marginalia.mq.inbox.MqInboxResponse; +import nu.marginalia.mq.inbox.MqSingleShotInbox; +import nu.marginalia.mqapi.converting.ConvertAction; +import nu.marginalia.process.control.ProcessHeartbeat; import nu.marginalia.process.log.WorkLog; import nu.marginalia.service.module.DatabaseModule; -import plan.CrawlPlanLoader; import plan.CrawlPlan; import nu.marginalia.converting.compiler.InstructionsCompiler; -import nu.marginalia.converting.instruction.Instruction; import nu.marginalia.converting.processor.DomainProcessor; -import nu.marginalia.crawling.model.CrawledDomain; -import nu.marginalia.util.ParallelPipe; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.nio.file.Path; -import java.util.List; +import java.sql.SQLException; +import java.util.Optional; +import java.util.UUID; +import java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.Semaphore; +import java.util.concurrent.ThreadPoolExecutor; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; + +import static nu.marginalia.mqapi.ProcessInboxNames.CONVERTER_INBOX; public class ConverterMain { + private static final Logger logger = LoggerFactory.getLogger(ConverterMain.class); + private final DomainProcessor processor; + private 
final InstructionsCompiler compiler; + private final Gson gson; + private final ProcessHeartbeat heartbeat; + private final MessageQueueFactory messageQueueFactory; + private final FileStorageService fileStorageService; + private final SideloadSourceFactory sideloadSourceFactory; - private final Logger logger = LoggerFactory.getLogger(getClass()); - private final InstructionWriter instructionWriter; - - public static void main(String... args) throws IOException { - - if (args.length != 1) { - System.err.println("Arguments: crawl-plan.yaml"); - System.exit(0); - } - var plan = new CrawlPlanLoader().load(Path.of(args[0])); - + public static void main(String... args) throws Exception { Injector injector = Guice.createInjector( - new ConverterModule(plan), + new ConverterModule(), new DatabaseModule() ); - injector.getInstance(ConverterMain.class); - } + var converter = injector.getInstance(ConverterMain.class); - @Inject - public ConverterMain( - CrawlPlan plan, - DomainProcessor processor, - InstructionsCompiler compiler, - Gson gson - ) throws Exception { logger.info("Starting pipe"); - try (WorkLog processLog = plan.createProcessWorkLog(); - ConversionLog log = new ConversionLog(plan.process.getDir())) { - instructionWriter = new InstructionWriter(log, plan.process.getDir(), gson); - var pipe = new ParallelPipe("Converter", 16, 4, 2) { - - @Override - protected ProcessingInstructions onProcess(CrawledDomain domainData) { - Thread.currentThread().setName("Converter:Processor["+domainData.domain+"] - " + domainData.size()); - try { - var processed = processor.process(domainData); - var compiled = compiler.compile(processed); - - return new ProcessingInstructions(domainData.id, compiled); - } - finally { - Thread.currentThread().setName("Converter:Processor[IDLE]"); - } - } - - @Override - protected void onReceive(ProcessingInstructions processedInstructions) throws IOException { - Thread.currentThread().setName("Converter:Receiver["+processedInstructions.id+"]"); - 
try { - var instructions = processedInstructions.instructions; - instructions.removeIf(Instruction::isNoOp); - - String where = instructionWriter.accept(processedInstructions.id, instructions); - processLog.setJobToFinished(processedInstructions.id, where, instructions.size()); - } - finally { - Thread.currentThread().setName("Converter:Receiver[IDLE]"); - } - } - - }; - - plan.forEachCrawledDomain(id -> !processLog.isJobFinished(id), pipe::accept); - - pipe.join(); - } + converter + .fetchInstructions() + .execute(converter); logger.info("Finished"); System.exit(0); } - record ProcessingInstructions(String id, List instructions) {} + @Inject + public ConverterMain( + DomainProcessor processor, + InstructionsCompiler compiler, + Gson gson, + ProcessHeartbeat heartbeat, + MessageQueueFactory messageQueueFactory, + FileStorageService fileStorageService, + SideloadSourceFactory sideloadSourceFactory + ) + { + this.processor = processor; + this.compiler = compiler; + this.gson = gson; + this.heartbeat = heartbeat; + this.messageQueueFactory = messageQueueFactory; + this.fileStorageService = fileStorageService; + this.sideloadSourceFactory = sideloadSourceFactory; + + heartbeat.start(); + } + + public void convert(SideloadSource sideloadSource, Path writeDir) throws Exception { + int maxPoolSize = 16; + + try (WorkLog workLog = new WorkLog(writeDir.resolve("processor.log")); + ConversionLog conversionLog = new ConversionLog(writeDir)) { + var instructionWriter = new InstructionWriterFactory(conversionLog, writeDir, gson); + + final String where; + final int size; + + try (var writer = instructionWriter.createInstructionsForDomainWriter(sideloadSource.getId())) { + compiler.compileStreaming(sideloadSource, writer::accept); + where = writer.getFileName(); + size = writer.getSize(); + } + + workLog.setJobToFinished(sideloadSource.getId(), where, size); + } + } + + public void convert(CrawlPlan plan) throws Exception { + + final int maxPoolSize = 
Runtime.getRuntime().availableProcessors(); + + try (WorkLog processLog = plan.createProcessWorkLog(); + ConversionLog log = new ConversionLog(plan.process.getDir())) { + var instructionWriter = new InstructionWriterFactory(log, plan.process.getDir(), gson); + + var pool = new DumbThreadPool(maxPoolSize, 2); + + int totalDomains = plan.countCrawledDomains(); + AtomicInteger processedDomains = new AtomicInteger(0); + + // Advance the progress bar to the current position if this is a resumption + processedDomains.set(processLog.countFinishedJobs()); + heartbeat.setProgress(processedDomains.get() / (double) totalDomains); + + for (var domain : plan.crawlDataIterable(id -> !processLog.isJobFinished(id))) + { + pool.submit(() -> { + try { + ProcessedDomain processed = processor.process(domain); + + final String where; + final int size; + + try (var writer = instructionWriter.createInstructionsForDomainWriter(processed.id)) { + compiler.compile(processed, writer::accept); + where = writer.getFileName(); + size = writer.getSize(); + } + + processLog.setJobToFinished(processed.id, where, size); + heartbeat.setProgress(processedDomains.incrementAndGet() / (double) totalDomains); + } + catch (IOException ex) { + logger.warn("IO exception in converter", ex); + } + }); + } + + pool.shutDown(); + do { + System.out.println("Waiting for pool to terminate... 
" + pool.getActiveCount() + " remaining"); + } while (!pool.awaitTermination(60, TimeUnit.SECONDS)); + } + } + + private abstract static class ConvertRequest { + private final MqMessage message; + private final MqSingleShotInbox inbox; + + private ConvertRequest(MqMessage message, MqSingleShotInbox inbox) { + this.message = message; + this.inbox = inbox; + } + + public abstract void execute(ConverterMain converterMain) throws Exception; + + public void ok() { + inbox.sendResponse(message, MqInboxResponse.ok()); + } + public void err() { + inbox.sendResponse(message, MqInboxResponse.err()); + } + } + + private static class SideloadAction extends ConvertRequest { + + private final SideloadSource sideloadSource; + private final Path workDir; + + SideloadAction(SideloadSource sideloadSource, + Path workDir, + MqMessage message, MqSingleShotInbox inbox) { + super(message, inbox); + this.sideloadSource = sideloadSource; + this.workDir = workDir; + } + + @Override + public void execute(ConverterMain converterMain) throws Exception { + try { + converterMain.convert(sideloadSource, workDir); + ok(); + } + catch (Exception ex) { + logger.error("Error sideloading", ex); + err(); + } + } + } + + private static class ConvertCrawlDataAction extends ConvertRequest { + private final CrawlPlan plan; + + private ConvertCrawlDataAction(CrawlPlan plan, MqMessage message, MqSingleShotInbox inbox) { + super(message, inbox); + this.plan = plan; + } + + @Override + public void execute(ConverterMain converterMain) throws Exception { + try { + converterMain.convert(plan); + ok(); + } + catch (Exception ex) { + err(); + } + } + } + + + private ConvertRequest fetchInstructions() throws Exception { + + var inbox = messageQueueFactory.createSingleShotInbox(CONVERTER_INBOX, UUID.randomUUID()); + + var msgOpt = getMessage(inbox, nu.marginalia.mqapi.converting.ConvertRequest.class.getSimpleName()); + var msg = msgOpt.orElseThrow(() -> new RuntimeException("No message received")); + + var request = 
gson.fromJson(msg.payload(), nu.marginalia.mqapi.converting.ConvertRequest.class); + + if (request.action == ConvertAction.ConvertCrawlData) { + + var crawlData = fileStorageService.getStorage(request.crawlStorage); + var processData = fileStorageService.getStorage(request.processedDataStorage); + + var plan = new CrawlPlan(null, + new CrawlPlan.WorkDir(crawlData.path(), "crawler.log"), + new CrawlPlan.WorkDir(processData.path(), "processor.log")); + + return new ConvertCrawlDataAction(plan, msg, inbox); + } + + if (request.action == ConvertAction.SideloadEncyclopedia) { + var processData = fileStorageService.getStorage(request.processedDataStorage); + var filePath = Path.of(request.inputSource); + + return new SideloadAction(sideloadSourceFactory.sideloadEncyclopediaMarginaliaNu(filePath), + processData.asPath(), + msg, inbox); + } + + if (request.action == ConvertAction.SideloadStackexchange) { + var processData = fileStorageService.getStorage(request.processedDataStorage); + var filePath = Path.of(request.inputSource); + var domainName = filePath.toFile().getName().substring(0, filePath.toFile().getName().lastIndexOf('.')); + return new SideloadAction(sideloadSourceFactory.sideloadStackexchange(filePath, domainName), + processData.asPath(), + msg, inbox); + } + + else { + throw new RuntimeException("Unknown action: " + request.action); + } + } + + private Optional getMessage(MqSingleShotInbox inbox, String expectedFunction) throws SQLException, InterruptedException { + var opt = inbox.waitForMessage(30, TimeUnit.SECONDS); + if (opt.isPresent()) { + if (!opt.get().function().equals(expectedFunction)) { + throw new RuntimeException("Unexpected function: " + opt.get().function()); + } + return opt; + } + else { + var stolenMessage = inbox.stealMessage(msg -> msg.function().equals(expectedFunction)); + stolenMessage.ifPresent(mqMessage -> logger.info("Stole message {}", mqMessage)); + return stolenMessage; + } + } } diff --git 
a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterModule.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterModule.java index e7a70aeb..90d4e3ad 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterModule.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterModule.java @@ -4,23 +4,22 @@ import com.google.gson.Gson; import com.google.inject.AbstractModule; import com.google.inject.name.Names; import nu.marginalia.LanguageModels; +import nu.marginalia.ProcessConfiguration; import nu.marginalia.WmsaHome; -import plan.CrawlPlan; import nu.marginalia.model.gson.GsonFactory; +import java.util.UUID; + public class ConverterModule extends AbstractModule { - private final CrawlPlan plan; - - public ConverterModule(CrawlPlan plan) { - this.plan = plan; + public ConverterModule() { } public void configure() { - bind(CrawlPlan.class).toInstance(plan); - bind(Gson.class).toInstance(createGson()); + bind(ProcessConfiguration.class).toInstance(new ProcessConfiguration("converter", 0, UUID.randomUUID())); + bind(Double.class).annotatedWith(Names.named("min-document-quality")).toInstance(-15.); bind(Integer.class).annotatedWith(Names.named("min-document-length")).toInstance(250); bind(Integer.class).annotatedWith(Names.named("max-title-length")).toInstance(128); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/DumbThreadPool.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/DumbThreadPool.java new file mode 100644 index 00000000..95cbf14a --- /dev/null +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/DumbThreadPool.java @@ -0,0 +1,119 @@ +package nu.marginalia.converting; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.LinkedBlockingQueue; 
+import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; + +/** A simple thread pool implementation that will never invoke + * a task in the calling thread like {@link java.util.concurrent.ThreadPoolExecutor} + * does when the queue is full. Instead, it will block until a thread + * becomes available to run the task. This is useful for coarse grained + * tasks where the calling thread might otherwise block for hours. + */ +// TODO: This class exists in crawler as well, should probably be broken out into a common library; use the one from crawler instead +public class DumbThreadPool { + private final List workers = new ArrayList<>(); + private final LinkedBlockingQueue tasks; + private volatile boolean shutDown = false; + private final AtomicInteger taskCount = new AtomicInteger(0); + private final Logger logger = LoggerFactory.getLogger(DumbThreadPool.class); + + public DumbThreadPool(int poolSize, int queueSize) { + tasks = new LinkedBlockingQueue<>(queueSize); + + for (int i = 0; i < poolSize; i++) { + Thread worker = new Thread(this::worker, "Converter Thread " + i); + worker.setDaemon(true); + worker.start(); + workers.add(worker); + } + + } + + public void submit(Runnable runnable) throws InterruptedException { + tasks.put(runnable); + } + + public void shutDown() { + this.shutDown = true; + } + + public void shutDownNow() { + this.shutDown = true; + for (Thread worker : workers) { + worker.interrupt(); + } + } + + private void worker() { + while (!shutDown) { + try { + Runnable task = tasks.poll(1, TimeUnit.SECONDS); + if (task == null) { + continue; + } + + try { + taskCount.incrementAndGet(); + task.run(); + } + catch (Exception ex) { + logger.warn("Error executing task", ex); + } + finally { + taskCount.decrementAndGet(); + } + } + + catch (InterruptedException ex) { + logger.warn("Thread pool worker interrupted", ex); + return; + } + } + } + + + /** Wait for all tasks to complete up to the specified timeout, + * then 
return true if all tasks completed, false otherwise. + */ + public boolean awaitTermination(int i, TimeUnit timeUnit) throws InterruptedException { + final long start = System.currentTimeMillis(); + final long deadline = start + timeUnit.toMillis(i); + + for (var thread : workers) { + if (!thread.isAlive()) + continue; + + long timeRemaining = deadline - System.currentTimeMillis(); + if (timeRemaining <= 0) + return false; + + thread.join(timeRemaining); + if (thread.isAlive()) + return false; + } + + // Doublecheck the bookkeeping so we didn't mess up. This may mean you have to Ctrl+C the process + // if you see this warning forever, but for the crawler this is preferable to terminating early + // and missing tasks. (maybe some cosmic ray or OOM condition or X-Files baddie of the week killed a + // thread so hard and it didn't invoke finally and didn't decrement the task count) + + int activeCount = getActiveCount(); + if (activeCount != 0) { + logger.warn("Thread pool terminated with {} active threads(?!) 
-- check what's going on with jstack and kill manually", activeCount); + return false; + } + + return true; + } + + public int getActiveCount() { + return taskCount.get(); + } + +} diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/InstructionWriter.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/InstructionWriterFactory.java similarity index 56% rename from code/processes/converting-process/src/main/java/nu/marginalia/converting/InstructionWriter.java rename to code/processes/converting-process/src/main/java/nu/marginalia/converting/InstructionWriterFactory.java index 826c41cd..08f842c6 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/InstructionWriter.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/InstructionWriterFactory.java @@ -15,22 +15,18 @@ import nu.marginalia.model.EdgeUrl; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.BufferedOutputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.OutputStreamWriter; +import java.io.*; import java.nio.file.Files; import java.nio.file.Path; -import java.util.List; -public class InstructionWriter { +public class InstructionWriterFactory { - private ConversionLog log; + private final ConversionLog log; private final Path outputDir; private final Gson gson; - private static final Logger logger = LoggerFactory.getLogger(InstructionWriter.class); + private static final Logger logger = LoggerFactory.getLogger(InstructionWriterFactory.class); - public InstructionWriter(ConversionLog log, Path outputDir, Gson gson) { + public InstructionWriterFactory(ConversionLog log, Path outputDir, Gson gson) { this.log = log; this.outputDir = outputDir; this.gson = gson; @@ -40,29 +36,59 @@ public class InstructionWriter { } } - public String accept(String id, List instructionList) throws IOException { + public InstructionWriter 
createInstructionsForDomainWriter(String id) throws IOException { Path outputFile = getOutputFile(id); + return new InstructionWriter(outputFile); + } - if (Files.exists(outputFile)) { - Files.delete(outputFile); + public class InstructionWriter implements AutoCloseable { + private final ObjectOutputStream outputStream; + private final String where; + private final SummarizingInterpreter summary = new SummarizingInterpreter(); + + private int size = 0; + + + InstructionWriter(Path filename) throws IOException { + where = filename.getFileName().toString(); + Files.deleteIfExists(filename); + outputStream = new ObjectOutputStream(new ZstdOutputStream(new FileOutputStream(filename.toFile()))); } - try (var outputStream = new OutputStreamWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outputFile.toFile()))))) { + public void accept(Instruction instruction) { + if (instruction.isNoOp()) return; - SummarizingInterpreter summary = new SummarizingInterpreter(instructionList); - logger.info("Writing {} - {} - {}", id, instructionList.size(), summary); + instruction.apply(summary); + instruction.apply(log); - for (var instr : instructionList) { - instr.apply(log); + size++; - outputStream.append(instr.tag().name()); - outputStream.append(' '); - gson.toJson(instr, outputStream); - outputStream.append('\n'); + try { + outputStream.writeObject(instruction); + + // Reset the stream to avoid keeping references to the objects + // (as this will cause the memory usage to grow indefinitely when + // writing huge amounts of data) + outputStream.reset(); + } + catch (IOException ex) { + logger.warn("IO exception writing instruction", ex); } } - return outputFile.getFileName().toString(); + @Override + public void close() throws IOException { + logger.info("Wrote {} - {} - {}", where, size, summary); + outputStream.close(); + } + + public String getFileName() { + return where; + } + + public int getSize() { + return size; + } } private Path 
getOutputFile(String id) throws IOException { @@ -79,32 +105,20 @@ public class InstructionWriter { private static class SummarizingInterpreter implements Interpreter { - private SummarizingInterpreter(List instructions) { - for (var i : instructions) { - i.apply(this); - } - } - private String domainName; private int ok = 0; private int error = 0; + int keywords = 0; + int documents = 0; + public String toString() { + // This shouldn't happen (TM) + assert keywords == documents : "keywords != documents"; + return String.format("%s - %d %d", domainName, ok, error); } - @Override - public void loadUrl(EdgeUrl[] url) {} - - @Override - public void loadDomain(EdgeDomain[] domain) {} - - @Override - public void loadRssFeed(EdgeUrl[] rssFeed) {} - - @Override - public void loadDomainLink(DomainLink[] links) {} - @Override public void loadProcessedDomain(EdgeDomain domain, DomainIndexingState state, String ip) { this.domainName = domain.toString(); @@ -112,20 +126,14 @@ public class InstructionWriter { @Override public void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument) { - - } - - @Override - public void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError) { + documents++; } @Override public void loadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) { + keywords++; } - @Override - public void loadDomainRedirect(DomainLink link) {} - @Override public void loadDomainMetadata(EdgeDomain domain, int knownUrls, int goodUrls, int visitedUrls) { ok += goodUrls; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java index 36b112fa..9bc3f6b3 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java +++ 
b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java @@ -3,39 +3,39 @@ package nu.marginalia.converting.compiler; import nu.marginalia.converting.instruction.Instruction; import nu.marginalia.converting.instruction.instructions.LoadKeywords; import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument; -import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.model.crawl.HtmlFeature; import java.util.List; +import java.util.function.Consumer; public class DocumentsCompiler { - public void compile(List ret, List documents) { + public void compile(Consumer instructionConsumer, List documents) { for (var doc : documents) { - compileDocumentDetails(ret, doc); + compileDocumentDetails(instructionConsumer, doc); } for (var doc : documents) { - compileWords(ret, doc); + compileWords(instructionConsumer, doc); } } - private void compileDocumentDetails(List ret, ProcessedDocument doc) { + public void compileDocumentDetails(Consumer instructionConsumer, ProcessedDocument doc) { var details = doc.details; if (details != null) { - ret.add(new LoadProcessedDocument(doc.url, doc.state, details.title, details.description, HtmlFeature.encode(details.features), details.standard.name(), details.length, details.hashCode, details.quality, details.pubYear)); + instructionConsumer.accept(new LoadProcessedDocument(doc.url, doc.state, details.title, details.description, HtmlFeature.encode(details.features), details.standard.name(), details.length, details.hashCode, details.quality, details.pubYear)); } } - private void compileWords(List ret, ProcessedDocument doc) { + public void compileWords(Consumer instructionConsumer, ProcessedDocument doc) { var words = doc.words; if (words != null) { - ret.add(new LoadKeywords(doc.url, doc.details.metadata, words.build())); + instructionConsumer.accept(new 
LoadKeywords(doc.url, doc.details.metadata, words.build())); } } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DomainMetadataCompiler.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DomainMetadataCompiler.java index e80f42eb..3909edb1 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DomainMetadataCompiler.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DomainMetadataCompiler.java @@ -11,11 +11,12 @@ import java.util.HashSet; import java.util.List; import java.util.Optional; import java.util.Set; +import java.util.function.Consumer; public class DomainMetadataCompiler { - public void compile(List ret, EdgeDomain domain, @NotNull List documents) { + public void compile(Consumer instructionConsumer, EdgeDomain domain, @NotNull List documents) { int visitedUrls = 0; int goodUrls = 0; @@ -36,7 +37,11 @@ public class DomainMetadataCompiler { .ifPresent(knownUrls::addAll); } - ret.add(new LoadDomainMetadata(domain, knownUrls.size(), goodUrls, visitedUrls)); + instructionConsumer.accept(new LoadDomainMetadata(domain, knownUrls.size(), goodUrls, visitedUrls)); + } + + public void compileFake(Consumer instructionConsumer, EdgeDomain domain, int countAll, int countGood) { + instructionConsumer.accept(new LoadDomainMetadata(domain, countAll, countGood, countAll)); } } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/FeedsCompiler.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/FeedsCompiler.java index 64779a0f..2c111ea2 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/FeedsCompiler.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/FeedsCompiler.java @@ -7,10 +7,11 @@ import nu.marginalia.model.EdgeUrl; import java.util.List; 
import java.util.Objects; +import java.util.function.Consumer; public class FeedsCompiler { - public void compile(List ret, List documents) { + public void compile(Consumer instructionConsumer, List documents) { EdgeUrl[] feeds = documents.stream().map(doc -> doc.details) .filter(Objects::nonNull) @@ -18,6 +19,6 @@ public class FeedsCompiler { .distinct() .toArray(EdgeUrl[]::new); - ret.add(new LoadRssFeed(feeds)); + instructionConsumer.accept(new LoadRssFeed(feeds)); } } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/InstructionsCompiler.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/InstructionsCompiler.java index a7076334..87f28e3c 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/InstructionsCompiler.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/InstructionsCompiler.java @@ -3,12 +3,16 @@ package nu.marginalia.converting.compiler; import com.google.inject.Inject; import nu.marginalia.converting.instruction.Instruction; import nu.marginalia.converting.instruction.instructions.LoadProcessedDomain; +import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.model.ProcessedDomain; +import nu.marginalia.converting.sideload.SideloadSource; +import nu.marginalia.model.EdgeUrl; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; -import java.util.ArrayList; import java.util.Collections; -import java.util.List; -import java.util.Objects; +import java.util.Iterator; +import java.util.function.Consumer; import static java.util.Objects.requireNonNullElse; @@ -20,6 +24,8 @@ public class InstructionsCompiler { private final LinksCompiler linksCompiler; private final RedirectCompiler redirectCompiler; + private final Logger logger = LoggerFactory.getLogger(InstructionsCompiler.class); + @Inject public InstructionsCompiler(UrlsCompiler urlsCompiler, 
DocumentsCompiler documentsCompiler, @@ -36,31 +42,52 @@ public class InstructionsCompiler { this.redirectCompiler = redirectCompiler; } - public List compile(ProcessedDomain domain) { - List ret = new ArrayList<>(domain.size()*4); - - ret.add(new LoadProcessedDomain(domain.domain, domain.state, domain.ip)); + public void compile(ProcessedDomain domain, Consumer instructionConsumer) { + // Guaranteed to always be first + instructionConsumer.accept(new LoadProcessedDomain(domain.domain, domain.state, domain.ip)); if (domain.documents != null) { - urlsCompiler.compile(ret, domain.documents); - documentsCompiler.compile(ret, domain.documents); + urlsCompiler.compile(instructionConsumer, domain.documents); + documentsCompiler.compile(instructionConsumer, domain.documents); - feedsCompiler.compile(ret, domain.documents); - - linksCompiler.compile(ret, domain.domain, domain.documents); + feedsCompiler.compile(instructionConsumer, domain.documents); + linksCompiler.compile(instructionConsumer, domain.domain, domain.documents); } if (domain.redirect != null) { - redirectCompiler.compile(ret, domain.domain, domain.redirect); + redirectCompiler.compile(instructionConsumer, domain.domain, domain.redirect); } - domainMetadataCompiler.compile(ret, domain.domain, requireNonNullElse(domain.documents, Collections.emptyList())); - - return ret; + domainMetadataCompiler.compile(instructionConsumer, domain.domain, requireNonNullElse(domain.documents, Collections.emptyList())); } + public void compileStreaming(SideloadSource sideloadSource, + Consumer instructionConsumer) { + ProcessedDomain domain = sideloadSource.getDomain(); + Iterator urlsIterator = sideloadSource.getUrlsIterator(); + Iterator documentsIterator = sideloadSource.getDocumentsStream(); + // Guaranteed to always be first + instructionConsumer.accept(new LoadProcessedDomain(domain.domain, domain.state, domain.ip)); + int countAll = 0; + int countGood = 0; + logger.info("Writing domains"); + 
urlsCompiler.compileJustDomain(instructionConsumer, domain.domain); + logger.info("Writing urls"); + urlsCompiler.compileJustUrls(instructionConsumer, urlsIterator); + logger.info("Writing docs"); + while (documentsIterator.hasNext()) { + var doc = documentsIterator.next(); + countAll++; + if (doc.isOk()) countGood++; + + documentsCompiler.compileDocumentDetails(instructionConsumer, doc); + documentsCompiler.compileWords(instructionConsumer, doc); + } + + domainMetadataCompiler.compileFake(instructionConsumer, domain.domain, countAll, countGood); + } } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/LinksCompiler.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/LinksCompiler.java index a578602d..e100cb86 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/LinksCompiler.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/LinksCompiler.java @@ -8,10 +8,11 @@ import nu.marginalia.model.EdgeDomain; import java.util.List; import java.util.Objects; +import java.util.function.Consumer; public class LinksCompiler { - public void compile(List ret, EdgeDomain from, List documents) { + public void compile(Consumer instructionConsumer, EdgeDomain from, List documents) { DomainLink[] links = documents.stream().map(doc -> doc.details) .filter(Objects::nonNull) @@ -21,6 +22,6 @@ public class LinksCompiler { .map(domain -> new DomainLink(from, domain)) .toArray(DomainLink[]::new); - ret.add(new LoadDomainLink(links)); + instructionConsumer.accept(new LoadDomainLink(links)); } } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/RedirectCompiler.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/RedirectCompiler.java index b14dedca..dcd0201f 100644 --- 
a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/RedirectCompiler.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/RedirectCompiler.java @@ -8,12 +8,13 @@ import nu.marginalia.converting.instruction.instructions.LoadDomainRedirect; import nu.marginalia.model.EdgeDomain; import java.util.List; +import java.util.function.Consumer; public class RedirectCompiler { - public void compile(List ret, EdgeDomain from, EdgeDomain to) { - ret.add(new LoadDomain(to)); - ret.add(new LoadDomainLink(new DomainLink(from, to))); - ret.add(new LoadDomainRedirect(new DomainLink(from, to))); + public void compile(Consumer instructionConsumer, EdgeDomain from, EdgeDomain to) { + instructionConsumer.accept(new LoadDomain(to)); + instructionConsumer.accept(new LoadDomainLink(new DomainLink(from, to))); + instructionConsumer.accept(new LoadDomainRedirect(new DomainLink(from, to))); } } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/UrlsCompiler.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/UrlsCompiler.java index 4d05a35d..ee4f3cbe 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/UrlsCompiler.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/UrlsCompiler.java @@ -6,44 +6,72 @@ import nu.marginalia.converting.instruction.instructions.LoadUrl; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; -import java.util.ArrayList; -import java.util.HashSet; -import java.util.List; -import java.util.Set; +import java.util.*; +import java.util.function.Consumer; public class UrlsCompiler { private static final int MAX_INTERNAL_LINKS = 25; + private final Logger logger = LoggerFactory.getLogger(getClass()); - 
public void compile(List ret, List documents) { + public void compile(Consumer instructionConsumer, List documents) { Set seenUrls = new HashSet<>(documents.size()*4); Set seenDomains = new HashSet<>(documents.size()); for (var doc : documents) { + if (doc.url == null) { + logger.warn("Discovered document with null URL"); + continue; + } + seenUrls.add(doc.url); - if (doc.details != null) { + if (doc.details == null) { + continue; + } - for (var url : doc.details.linksExternal) { - if (seenDomains.add(url.domain)) { - seenUrls.add(url); - } + // Add *some* external links; to avoid loading too many and gunking up the database with nonsense, + // only permit this once per external domain per crawled domain + for (var url : doc.details.linksExternal) { + if (seenDomains.add(url.domain)) { + seenUrls.add(url); } + } - if (doc.isOk()) { - // Don't load more than a few from linksInternal, grows too big for no reason - var linksToAdd = new ArrayList<>(doc.details.linksInternal); - if (linksToAdd.size() > MAX_INTERNAL_LINKS) { - linksToAdd.subList(MAX_INTERNAL_LINKS, linksToAdd.size()).clear(); - } - seenUrls.addAll(linksToAdd); + if (doc.isOk()) { + // Don't load more than a few from linksInternal, grows too big for no reason + var linksToAdd = new ArrayList<>(doc.details.linksInternal); + if (linksToAdd.size() > MAX_INTERNAL_LINKS) { + linksToAdd.subList(MAX_INTERNAL_LINKS, linksToAdd.size()).clear(); } + seenUrls.addAll(linksToAdd); } } - ret.add(new LoadDomain(seenDomains.toArray(EdgeDomain[]::new))); - ret.add(new LoadUrl(seenUrls.toArray(EdgeUrl[]::new))); + instructionConsumer.accept(new LoadDomain(seenDomains.toArray(EdgeDomain[]::new))); + instructionConsumer.accept(new LoadUrl(seenUrls.toArray(EdgeUrl[]::new))); } + public void compileJustUrls(Consumer instructionConsumer, Iterator urlsIterator) { + var urls = new ArrayList(1000); + + while (urlsIterator.hasNext()) { + if (urls.size() >= 1000) { + instructionConsumer.accept(new 
LoadUrl(urls.toArray(EdgeUrl[]::new))); + urls.clear(); + } + + urls.add(urlsIterator.next()); + } + if (!urls.isEmpty()) { + instructionConsumer.accept(new LoadUrl(urls.toArray(EdgeUrl[]::new))); + } + } + + public void compileJustDomain(Consumer instructionConsumer, EdgeDomain domain) { + instructionConsumer.accept(new LoadDomain(domain)); + } } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDomain.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDomain.java index 95b66a02..e445d5b2 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDomain.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDomain.java @@ -6,7 +6,6 @@ import nu.marginalia.model.crawl.DomainIndexingState; import java.util.List; import java.util.Optional; -import java.util.OptionalDouble; @ToString public class ProcessedDomain { @@ -16,17 +15,7 @@ public class ProcessedDomain { public DomainIndexingState state; public EdgeDomain redirect; public String ip; - - public OptionalDouble averageQuality() { - if (documents == null) { - return OptionalDouble.empty(); - } - return documents.stream() - .map(ProcessedDocument::quality) - .filter(OptionalDouble::isPresent) - .mapToDouble(OptionalDouble::getAsDouble) - .average(); - } + public String id; public int size() { return Optional.ofNullable(documents).map(List::size).orElse(1); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/ConverterDomainTypes.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/ConverterDomainTypes.java index 95a1b5fd..83f3ad22 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/ConverterDomainTypes.java +++ 
b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/ConverterDomainTypes.java @@ -9,8 +9,8 @@ import org.slf4j.LoggerFactory; import java.io.IOException; import java.sql.SQLException; -import java.util.HashMap; -import java.util.Map; +import java.util.HashSet; +import java.util.Set; /** Converter-side wrapper for of common:db's DomainTypes, * which is a list of domains of a known type (e.g. blog) @@ -18,11 +18,7 @@ import java.util.Map; @Singleton public class ConverterDomainTypes { private final Logger logger = LoggerFactory.getLogger(ConverterDomainTypes.class); - private final Map domainTypes = new HashMap<>(); - - private enum DomainType { - BLOG - } + private final Set blogs = new HashSet<>(10000, 0.5f); @Inject public ConverterDomainTypes(DomainTypes types) throws SQLException { @@ -40,14 +36,14 @@ public class ConverterDomainTypes { } for (var item : allBlogs) { - domainTypes.put(new EdgeDomain(item), DomainType.BLOG); + + blogs.add(new EdgeDomain(item)); } - logger.info("Loaded {} domain types", domainTypes.size()); - + logger.info("Loaded {} domain types", blogs.size()); } public boolean isBlog(EdgeDomain domain) { - return domainTypes.get(domain) == DomainType.BLOG; + return blogs.contains(domain); } } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java index b7ac1767..82e9c5d7 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java @@ -2,7 +2,6 @@ package nu.marginalia.converting.processor; import com.google.inject.Inject; import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.crawling.model.CrawledDomain; import 
nu.marginalia.crawling.model.CrawlerDocumentStatus; import nu.marginalia.model.crawl.UrlIndexingState; import nu.marginalia.converting.model.DisqualifiedException; @@ -38,11 +37,14 @@ public class DocumentProcessor { processorPlugins.add(plainTextDocumentProcessorPlugin); } - public ProcessedDocument process(CrawledDocument crawledDocument, CrawledDomain crawledDomain) { + public ProcessedDocument process(CrawledDocument crawledDocument) { ProcessedDocument ret = new ProcessedDocument(); try { - processDocument(crawledDocument, crawledDomain, ret); + // We must always provide the URL, even if we don't process the document + ret.url = getDocumentUrl(crawledDocument); + + processDocument(crawledDocument, ret); } catch (DisqualifiedException ex) { ret.state = UrlIndexingState.DISQUALIFIED; @@ -53,13 +55,12 @@ public class DocumentProcessor { ret.state = UrlIndexingState.DISQUALIFIED; ret.stateReason = DisqualifiedException.DisqualificationReason.PROCESSING_EXCEPTION.toString(); logger.info("Failed to convert " + crawledDocument.url, ex); - ex.printStackTrace(); } return ret; } - private void processDocument(CrawledDocument crawledDocument, CrawledDomain crawledDomain, ProcessedDocument ret) throws URISyntaxException, DisqualifiedException { + private void processDocument(CrawledDocument crawledDocument, ProcessedDocument ret) throws URISyntaxException, DisqualifiedException { var crawlerStatus = CrawlerDocumentStatus.valueOf(crawledDocument.crawlerStatus); if (crawlerStatus != CrawlerDocumentStatus.OK) { @@ -74,15 +75,11 @@ public class DocumentProcessor { throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.CONTENT_TYPE); } - - ret.url = getDocumentUrl(crawledDocument); ret.state = crawlerStatusToUrlState(crawledDocument.crawlerStatus, crawledDocument.httpStatus); final var plugin = findPlugin(crawledDocument); - AbstractDocumentProcessorPlugin.DetailsWithWords detailsWithWords = plugin.createDetails(crawledDomain, crawledDocument); - - 
crawledDocument.dispose(); + AbstractDocumentProcessorPlugin.DetailsWithWords detailsWithWords = plugin.createDetails(crawledDocument); ret.details = detailsWithWords.details(); ret.words = detailsWithWords.words(); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index dcdda943..e313bcdf 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -1,18 +1,20 @@ package nu.marginalia.converting.processor; -import com.google.common.base.Strings; import com.google.inject.Inject; +import lombok.SneakyThrows; +import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.processor.logic.links.LinkGraph; -import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.crawling.model.CrawledDomain; -import nu.marginalia.crawling.model.CrawlerDocumentStatus; -import nu.marginalia.crawling.model.CrawlerDomainStatus; +import nu.marginalia.crawling.io.SerializableCrawlDataStream; +import nu.marginalia.crawling.model.*; import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import nu.marginalia.converting.processor.logic.links.TopKeywords; import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator; +import nu.marginalia.model.crawl.HtmlFeature; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.util.*; @@ -21,53 +23,98 @@ public class DomainProcessor { private final SiteWords siteWords; private final LshDocumentDeduplicator documentDeduplicator; + private final Logger logger = LoggerFactory.getLogger(getClass()); + @Inject public 
DomainProcessor(DocumentProcessor documentProcessor, SiteWords siteWords, - LshDocumentDeduplicator documentDeduplicator) { + LshDocumentDeduplicator documentDeduplicator) + { this.documentProcessor = documentProcessor; this.siteWords = siteWords; this.documentDeduplicator = documentDeduplicator; } - public ProcessedDomain process(CrawledDomain crawledDomain) { + @SneakyThrows + public ProcessedDomain process(SerializableCrawlDataStream dataStream) { var ret = new ProcessedDomain(); + List docs = new ArrayList<>(); + boolean cookies = false; + String ip = ""; + while (dataStream.hasNext()) { + var data = dataStream.next(); - ret.domain = new EdgeDomain(crawledDomain.domain); - ret.ip = crawledDomain.ip; + if (data instanceof CrawledDomain crawledDomain) { + ret.domain = new EdgeDomain(crawledDomain.domain); + ret.ip = crawledDomain.ip; + ret.id = crawledDomain.id; - if (crawledDomain.redirectDomain != null) { - ret.redirect = new EdgeDomain(crawledDomain.redirectDomain); - } + cookies = Objects.requireNonNullElse(crawledDomain.cookies, Collections.emptyList()).size() > 0; + ip = crawledDomain.ip; - if (crawledDomain.doc != null) { - ret.documents = new ArrayList<>(crawledDomain.doc.size()); - - fixBadCanonicalTags(crawledDomain.doc); - - for (var doc : crawledDomain.doc) { - var processedDoc = documentProcessor.process(doc, crawledDomain); - - if (processedDoc.url != null) { - ret.documents.add(processedDoc); + if (crawledDomain.redirectDomain != null) { + ret.redirect = new EdgeDomain(crawledDomain.redirectDomain); } - + ret.documents = docs; + ret.state = getState(crawledDomain.crawlerStatus); } + else if (data instanceof CrawledDocument doc) { + try { + if (doc.url == null) + continue; + fixBadCanonicalTag(doc); - documentDeduplicator.deduplicate(ret.documents); - - calculateStatistics(ret); - } - else { - ret.documents = Collections.emptyList(); + docs.add(documentProcessor.process(doc)); + } + catch (Exception ex) { + logger.warn("Failed to process " + 
doc.url, ex); + } + } } - ret.state = getState(crawledDomain.crawlerStatus); + // Add late keywords and features from domain-level information + + List terms = new ArrayList<>(); + terms.add("ip:"+ip); + if (cookies) + terms.add(HtmlFeature.COOKIES.getKeyword()); + + for (var document : ret.documents) { + if (document.details == null) + continue; + + if (cookies) + document.details.features.add(HtmlFeature.COOKIES); + + document.words.addAllSyntheticTerms(terms); + } + + documentDeduplicator.deduplicate(ret.documents); + calculateStatistics(ret); return ret; } + private void fixBadCanonicalTag(CrawledDocument doc) { + // Some sites have a canonical tag that points to a different domain, + // but our loader can not support this, so we point these back to the + // original url. + + var canonicalOpt = EdgeUrl.parse(doc.canonicalUrl); + if (canonicalOpt.isEmpty()) return; + + var urlOpt = EdgeUrl.parse(doc.url); + if (urlOpt.isEmpty()) return; + + var urlActual = urlOpt.get(); + var canonicalActual = canonicalOpt.get(); + + if (!Objects.equals(urlActual.domain, canonicalActual.domain)) { + doc.canonicalUrl = doc.url; + } + } + private void calculateStatistics(ProcessedDomain ret) { LinkGraph linkGraph = new LinkGraph(); TopKeywords topKeywords = new TopKeywords(); @@ -91,61 +138,6 @@ public class DomainProcessor { siteWords.flagAdjacentWords(topKeywords, invertedLinkGraph, ret); } - - private void fixBadCanonicalTags(List docs) { - Map> seenCanonicals = new HashMap<>(); - Set seenUrls = new HashSet<>(); - - // Sometimes sites set a blanket canonical link to their root page - // this removes such links from consideration - - for (var document : docs) { - if (!Strings.isNullOrEmpty(document.canonicalUrl) - && !Objects.equals(document.canonicalUrl, document.url)) { - seenCanonicals.computeIfAbsent(document.canonicalUrl, url -> new HashSet<>()).add(document.documentBodyHash); - } - seenUrls.add(document.url); - } - - for (var document : docs) { - if 
(!Strings.isNullOrEmpty(document.canonicalUrl) - && !Objects.equals(document.canonicalUrl, document.url) - && seenCanonicals.getOrDefault(document.canonicalUrl, Collections.emptySet()).size() > 1) { - - if (seenUrls.add(document.canonicalUrl)) { - document.canonicalUrl = document.url; - } - else { - document.crawlerStatus = CrawlerDocumentStatus.BAD_CANONICAL.name(); - } - } - } - - for (var document : docs) { - if (!Strings.isNullOrEmpty(document.canonicalUrl) - && !Objects.equals(document.canonicalUrl, document.url) - && seenCanonicals.getOrDefault(document.canonicalUrl, Collections.emptySet()).size() > 1) { - document.canonicalUrl = document.url; - } - } - - // Ignore canonical URL if it points to a different domain - // ... this confuses the hell out of the loader - for (var document : docs) { - if (Strings.isNullOrEmpty(document.canonicalUrl)) - continue; - - Optional cUrl = EdgeUrl.parse(document.canonicalUrl); - Optional dUrl = EdgeUrl.parse(document.url); - - if (cUrl.isPresent() && dUrl.isPresent() - && !Objects.equals(cUrl.get().domain, dUrl.get().domain)) - { - document.canonicalUrl = document.url; - } - } - } - private DomainIndexingState getState(String crawlerStatus) { return switch (CrawlerDomainStatus.valueOf(crawlerStatus)) { case OK -> DomainIndexingState.ACTIVE; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java index 57a98879..040f96dd 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java @@ -4,7 +4,6 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.adblock.AdblockSimulator; import nu.marginalia.adblock.GoogleAnwersSpamDetector; -import 
nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.topic.RecipeDetector; @@ -25,9 +24,11 @@ public class FeatureExtractor { "twitter.com", "bing.com", "msn.com"); - private static final List shittyTrackers = List.of("adform.net", + private static final List adtechTrackers = List.of("adform.net", "connect.facebook", "facebook.com/tr", + "absbygoogle.com", + "adnxs.com", "googletagmanager.com", "googlesyndication.com", "smartadserver.com", @@ -65,7 +66,7 @@ public class FeatureExtractor { this.googleAnwersSpamDetector = googleAnwersSpamDetector; } - public Set getFeatures(CrawledDomain domain, Document doc, DocumentLanguageData dld) { + public Set getFeatures(Document doc, DocumentLanguageData dld) { final Set features = new HashSet<>(); final Elements scriptTags = doc.getElementsByTag("script"); @@ -203,11 +204,11 @@ public class FeatureExtractor { for (var scriptTag : scriptTags) { if (hasInvasiveTrackingScript(scriptTag)) { - features.add(HtmlFeature.TRACKING_INNOCENT); - features.add(HtmlFeature.TRACKING_EVIL); + features.add(HtmlFeature.TRACKING); + features.add(HtmlFeature.TRACKING_ADTECH); } else if (hasNaiveTrackingScript(scriptTag)) { - features.add(HtmlFeature.TRACKING_INNOCENT); + features.add(HtmlFeature.TRACKING); } if (scriptTag.hasAttr("didomi/javascript")) { @@ -234,42 +235,44 @@ public class FeatureExtractor { features.add(HtmlFeature.COOKIELAW); } if (scriptText.contains("_linkedin_data_partner_id")) { - features.add(HtmlFeature.TRACKING_EVIL); + features.add(HtmlFeature.TRACKING); + features.add(HtmlFeature.TRACKING_ADTECH); } if (scriptText.contains("window.OneSignal")) { features.add(HtmlFeature.ONESIGNAL); } if (scriptText.contains("connect.facebook.net")) { - features.add(HtmlFeature.TRACKING_EVIL); + features.add(HtmlFeature.TRACKING); + features.add(HtmlFeature.TRACKING_ADTECH); } if (scriptText.contains("hotjar.com")) { - 
features.add(HtmlFeature.TRACKING_INNOCENT); + features.add(HtmlFeature.TRACKING); } } for (var noscript : doc.getElementsByTag("noscript")) { for (var iframe : noscript.getElementsByTag("iframe")) { if (hasInvasiveTrackingScript(iframe)) { - features.add(HtmlFeature.TRACKING_INNOCENT); - features.add(HtmlFeature.TRACKING_EVIL); + features.add(HtmlFeature.TRACKING); + features.add(HtmlFeature.TRACKING_ADTECH); } else if (hasNaiveTrackingScript(iframe)) { - features.add(HtmlFeature.TRACKING_INNOCENT); + features.add(HtmlFeature.TRACKING); } } for (var img : noscript.getElementsByTag("img")) { if (hasInvasiveTrackingScript(img)) { - features.add(HtmlFeature.TRACKING_INNOCENT); - features.add(HtmlFeature.TRACKING_EVIL); + features.add(HtmlFeature.TRACKING); + features.add(HtmlFeature.TRACKING_ADTECH); } else if (hasNaiveTrackingScript(img)) { - features.add(HtmlFeature.TRACKING_INNOCENT); + features.add(HtmlFeature.TRACKING); } } } if (scriptTags.html().contains("google-analytics.com")) { - features.add(HtmlFeature.TRACKING_INNOCENT); + features.add(HtmlFeature.TRACKING); } for (var aTag : doc.getElementsByTag("a")) { @@ -279,9 +282,6 @@ public class FeatureExtractor { } } - if (!domain.cookies.isEmpty()) - features.add(HtmlFeature.COOKIES); - if (recipeDetector.testP(dld) > 0.5) features.add(HtmlFeature.CATEGORY_FOOD); // these should be mutually exclusive @@ -299,7 +299,7 @@ public class FeatureExtractor { } private boolean hasInvasiveTrackingScript(String src) { - for (var tracker : shittyTrackers) { + for (var tracker : adtechTrackers) { if (src.contains(tracker)) { return true; } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java index c49d365f..14fd12ad 100644 --- 
a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java @@ -19,7 +19,7 @@ import java.util.*; public abstract class AbstractDocumentProcessorPlugin { protected LanguageFilter languageFilter = new LanguageFilter(); - public abstract DetailsWithWords createDetails(CrawledDomain crawledDomain, CrawledDocument crawledDocument) throws DisqualifiedException, URISyntaxException; + public abstract DetailsWithWords createDetails(CrawledDocument crawledDocument) throws DisqualifiedException, URISyntaxException; public abstract boolean isApplicable(CrawledDocument doc); protected void checkDocumentLanguage(DocumentLanguageData dld) throws DisqualifiedException { @@ -44,12 +44,6 @@ public abstract class AbstractDocumentProcessorPlugin { tagWords.add(key + ":" + value.toString().toLowerCase()); } - public MetaTagsBuilder addDomainCrawlData(CrawledDomain domain) { - add("ip", domain.ip); - - return this; - } - public MetaTagsBuilder addUrl(EdgeUrl url) { add("proto", url.proto); add("site", url.domain); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java index c2119688..8fb2b801 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java @@ -94,10 +94,10 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin } @Override - public DetailsWithWords createDetails(CrawledDomain crawledDomain, CrawledDocument crawledDocument) + public 
DetailsWithWords createDetails(CrawledDocument crawledDocument) throws DisqualifiedException, URISyntaxException { - String documentBody = crawledDocument.documentBody.decode(); + String documentBody = crawledDocument.documentBody; if (languageFilter.isBlockedUnicodeRange(documentBody)) { throw new DisqualifiedException(DisqualificationReason.LANGUAGE); @@ -141,7 +141,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin throw new DisqualifiedException(DisqualificationReason.QUALITY); } - final Set features = featureExtractor.getFeatures(crawledDomain, doc, dld); + final Set features = featureExtractor.getFeatures(doc, dld); ret.features = features; ret.hashCode = dld.localitySensitiveHashCode(); @@ -159,7 +159,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin ret.generator = generatorParts.type(); var tagWords = new MetaTagsBuilder() - .addDomainCrawlData(crawledDomain) .addPubDate(pubDate) .addUrl(url) .addFeatures(features) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java index e7d0a9a1..1dac05f1 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java @@ -55,10 +55,10 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP } @Override - public DetailsWithWords createDetails(CrawledDomain crawledDomain, CrawledDocument crawledDocument) + public DetailsWithWords createDetails(CrawledDocument crawledDocument) throws DisqualifiedException, URISyntaxException { - String documentBody = crawledDocument.documentBody.decode(); + String 
documentBody = crawledDocument.documentBody; if (languageFilter.isBlockedUnicodeRange(documentBody)) { throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LANGUAGE); @@ -97,7 +97,6 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url); var tagWords = new MetaTagsBuilder() - .addDomainCrawlData(crawledDomain) .addPubDate(pubDate) .addUrl(url) .addFeatures(ret.features) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/EncyclopediaMarginaliaNuSideloader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/EncyclopediaMarginaliaNuSideloader.java new file mode 100644 index 00000000..ae07b6c3 --- /dev/null +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/EncyclopediaMarginaliaNuSideloader.java @@ -0,0 +1,249 @@ +package nu.marginalia.converting.sideload; + +import com.github.luben.zstd.ZstdInputStream; +import com.google.gson.Gson; +import lombok.SneakyThrows; +import nu.marginalia.converting.model.DisqualifiedException; +import nu.marginalia.converting.model.ProcessedDocument; +import nu.marginalia.converting.model.ProcessedDomain; +import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin; +import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.crawl.DomainIndexingState; +import nu.marginalia.model.crawl.UrlIndexingState; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.URISyntaxException; +import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; +import java.nio.file.Path; +import java.sql.*; +import java.time.LocalDateTime; +import java.util.Iterator; +import java.util.List; +import 
java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.Semaphore; +import java.util.concurrent.atomic.AtomicBoolean; + +/** This is an experimental sideloader for encyclopedia.marginalia.nu's database; + * (which serves as a way of loading wikipedia's zim files without binding to GPL2'd code) + * + * See https://github.com/MarginaliaSearch/encyclopedia.marginalia.nu for extracting the data + */ +public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoCloseable { + + private final Connection connection; + private final Gson gson; + private final HtmlDocumentProcessorPlugin htmlProcessorPlugin; + + public EncyclopediaMarginaliaNuSideloader(Path pathToDbFile, + Gson gson, + HtmlDocumentProcessorPlugin htmlProcessorPlugin) throws SQLException { + this.gson = gson; + this.htmlProcessorPlugin = htmlProcessorPlugin; + String sqliteDbString = "jdbc:sqlite:" + pathToDbFile.toString(); + + connection = DriverManager.getConnection(sqliteDbString); + + } + + @Override + public ProcessedDomain getDomain() { + var ret = new ProcessedDomain(); + + ret.domain = new EdgeDomain("encyclopedia.marginalia.nu"); + ret.id = "encyclopedia.marginalia.nu"; + ret.ip = "127.0.0.1"; + ret.state = DomainIndexingState.ACTIVE; + + return ret; + } + + @Override + @SneakyThrows + public Iterator getUrlsIterator() { + EdgeUrl base = new EdgeUrl("https://encyclopedia.marginalia.nu/"); + + return new SqlQueryIterator<>(connection.prepareStatement(""" + SELECT url, html FROM articles + """)) + { + @Override + public EdgeUrl convert(ResultSet rs) throws Exception { + var path = URLEncoder.encode(rs.getString("url"), StandardCharsets.UTF_8); + + return base.withPathAndParam("/article/"+path, null); + } + }; + } + + + @SneakyThrows + @Override + public Iterator getDocumentsStream() { + LinkedBlockingQueue docs = new LinkedBlockingQueue<>(32); + AtomicBoolean isFinished = new 
AtomicBoolean(false); + + ExecutorService executorService = Executors.newFixedThreadPool(16); + Semaphore sem = new Semaphore(16); + + executorService.submit(() -> { + try { + var stmt = connection.prepareStatement(""" + SELECT url,title,html FROM articles + """); + stmt.setFetchSize(100); + + var rs = stmt.executeQuery(); + while (rs.next()) { + var articleParts = fromCompressedJson(rs.getBytes("html"), ArticleParts.class); + String title = rs.getString("title"); + String url = URLEncoder.encode(rs.getString("url"), StandardCharsets.UTF_8); + + sem.acquire(); + + executorService.submit(() -> { + try { + docs.add(convertDocument(articleParts.parts, title, url)); + } catch (URISyntaxException | DisqualifiedException e) { + e.printStackTrace(); + } finally { + sem.release(); + } + }); + } + + stmt.close(); + } + catch (Exception e) { + e.printStackTrace(); + } + finally { + isFinished.set(true); + } + }); + + return new Iterator<>() { + @Override + public boolean hasNext() { + return !isFinished.get() || !docs.isEmpty() || sem.availablePermits() < 16; + } + + @SneakyThrows + @Override + public ProcessedDocument next() { + return docs.take(); + } + }; + } + + private ProcessedDocument convertDocument(List parts, String title, String url) throws URISyntaxException, DisqualifiedException { + String fullUrl = "https://encyclopedia.marginalia.nu/article/"+url; + + StringBuilder fullHtml = new StringBuilder(); + fullHtml.append("").append(title).append(""); + for (String part : parts) { + fullHtml.append("

    "); + fullHtml.append(part); + fullHtml.append("

    "); + } + fullHtml.append(""); + + var crawledDoc = new CrawledDocument( + "encyclopedia.marginalia.nu", + fullUrl, + "text/html", + LocalDateTime.now().toString(), + 200, + "OK", + "NP", + "", + fullHtml.toString(), + Integer.toHexString(fullHtml.hashCode()), + fullUrl, + "", + "SIDELOAD" + ); + + var ret = new ProcessedDocument(); + try { + var details = htmlProcessorPlugin.createDetails(crawledDoc); + + ret.words = details.words(); + ret.details = details.details(); + ret.details.metadata = ret.details.metadata + .withSize(10_000_000, Math.max(0, 255 - url.length())); + ret.url = new EdgeUrl(fullUrl); + ret.state = UrlIndexingState.OK; + ret.stateReason = "SIDELOAD"; + } + catch (Exception e) { + ret.url = new EdgeUrl(fullUrl); + ret.state = UrlIndexingState.DISQUALIFIED; + ret.stateReason = "SIDELOAD"; + } + + return ret; + + } + + private T fromCompressedJson(byte[] stream, Class type) throws IOException { + return gson.fromJson(new InputStreamReader(new ZstdInputStream(new ByteArrayInputStream(stream))), type); + } + + private record ArticleParts(List parts) {} + + @Override + public String getId() { + return "encyclopedia.marginalia.nu"; + } + + @Override + public void close() throws Exception { + connection.close(); + } + + private abstract static class SqlQueryIterator implements Iterator { + PreparedStatement stmt; + ResultSet rs; + T next = null; + + public SqlQueryIterator(PreparedStatement stmt) throws SQLException { + this.stmt = stmt; + stmt.setFetchSize(1000); + rs = stmt.executeQuery(); + } + + @SneakyThrows + @Override + public boolean hasNext() { + if (next != null) { + return true; + } + if (!rs.next()) { + stmt.close(); + return false; + } + + next = convert(rs); + + return true; + } + + public abstract T convert(ResultSet rs) throws Exception; + + @Override + public T next () { + if (!hasNext()) + throw new IllegalStateException("No next element"); + var ret = next; + next = null; + return ret; + } + } +} diff --git 
a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSource.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSource.java new file mode 100644 index 00000000..d23a81ae --- /dev/null +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSource.java @@ -0,0 +1,15 @@ +package nu.marginalia.converting.sideload; + +import nu.marginalia.converting.model.ProcessedDocument; +import nu.marginalia.converting.model.ProcessedDomain; +import nu.marginalia.model.EdgeUrl; + +import java.util.Iterator; + +public interface SideloadSource { + ProcessedDomain getDomain(); + Iterator getUrlsIterator(); + Iterator getDocumentsStream(); + + String getId(); +} diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java new file mode 100644 index 00000000..fd709951 --- /dev/null +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java @@ -0,0 +1,35 @@ +package nu.marginalia.converting.sideload; + +import com.google.gson.Gson; +import com.google.inject.Inject; +import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin; +import nu.marginalia.keyword.DocumentKeywordExtractor; +import nu.marginalia.language.sentence.SentenceExtractor; + +import java.nio.file.Path; +import java.sql.SQLException; + +public class SideloadSourceFactory { + private final Gson gson; + private final HtmlDocumentProcessorPlugin htmlProcessorPlugin; + private final SentenceExtractor sentenceExtractor; + private final DocumentKeywordExtractor documentKeywordExtractor; + + @Inject + public SideloadSourceFactory(Gson gson, HtmlDocumentProcessorPlugin htmlProcessorPlugin, SentenceExtractor sentenceExtractor, DocumentKeywordExtractor 
documentKeywordExtractor) { + this.gson = gson; + this.htmlProcessorPlugin = htmlProcessorPlugin; + this.sentenceExtractor = sentenceExtractor; + this.documentKeywordExtractor = documentKeywordExtractor; + } + + public SideloadSource sideloadEncyclopediaMarginaliaNu(Path pathToDbFile) throws SQLException { + return new EncyclopediaMarginaliaNuSideloader(pathToDbFile, gson, htmlProcessorPlugin); + } + + /** Do not use, this code isn't finished */ + @Deprecated() + public SideloadSource sideloadStackexchange(Path pathTo7zFile, String domainName) { + return new StackexchangeSideloader(pathTo7zFile, domainName, sentenceExtractor, documentKeywordExtractor); + } +} diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/StackExchange7zReader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/StackExchange7zReader.java new file mode 100644 index 00000000..a3e42e65 --- /dev/null +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/StackExchange7zReader.java @@ -0,0 +1,229 @@ +package nu.marginalia.converting.sideload; + +import lombok.SneakyThrows; +import org.apache.commons.compress.archivers.sevenz.SevenZArchiveEntry; +import org.apache.commons.compress.archivers.sevenz.SevenZFile; + +import javax.xml.namespace.QName; +import javax.xml.stream.XMLInputFactory; +import javax.xml.stream.XMLStreamException; +import java.io.IOException; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; +import java.util.stream.Collectors; + +@Deprecated +public class StackExchange7zReader { + private final Path pathTo7zFile; + + public StackExchange7zReader(Path pathTo7zFile) { + this.pathTo7zFile = pathTo7zFile; + } + + public List getIds() throws IOException, XMLStreamException { + try (SevenZFile file = new SevenZFile(pathTo7zFile.toFile())) { + for (SevenZArchiveEntry entry : file.getEntries()) 
{ + if ("Posts.xml".equals(entry.getName())) { + return getIds(file, entry); + } + } + } + return List.of(); + } + + + private List getIds(SevenZFile file, SevenZArchiveEntry entry) throws IOException, XMLStreamException { + List ids = new ArrayList<>(10000); + + XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance(); + var idField = new QName("Id"); + + try (var inputStream = file.getInputStream(entry)) { + + var xmlReader = xmlInputFactory.createXMLEventReader(inputStream); + + while (xmlReader.hasNext()) { + var event = xmlReader.nextEvent(); + if (!event.isStartElement()) continue; + + var startEvent = event.asStartElement(); + if (!"row".equals(startEvent.getName().getLocalPart())) continue; + + var fieldValue = startEvent.getAttributeByName(idField); + if (fieldValue != null) { + ids.add(fieldValue.getValue()); + } + } + } + + return ids; + } + + public Iterator postIterator() throws IOException, XMLStreamException { + SevenZFile postsFile = new SevenZFile(pathTo7zFile.toFile()); + SevenZFile commentsFile = new SevenZFile(pathTo7zFile.toFile()); + + SevenZArchiveEntry postsEntry = null; + SevenZArchiveEntry commentsEntry = null; + + for (SevenZArchiveEntry entry : postsFile.getEntries()) { + if ("Posts.xml".equals(entry.getName())) { + postsEntry = entry; + break; + } + } + + for (SevenZArchiveEntry entry : commentsFile.getEntries()) { + if ("Comments.xml".equals(entry.getName())) { + commentsEntry = entry; + break; + } + } + + if (postsEntry == null || commentsEntry == null) { + postsFile.close(); + commentsFile.close(); + + throw new IOException("Posts.xml or Comments.xml not found in 7z file"); + } + + var postsInputStream = postsFile.getInputStream(postsEntry); + var commentsInputStream = commentsFile.getInputStream(commentsEntry); + + XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance(); + + var postsXmlReader = xmlInputFactory.createXMLEventReader(postsInputStream); + var commentsXmlReader = 
xmlInputFactory.createXMLEventReader(commentsInputStream); + + QName titleName = new QName("Title"); + QName idName = new QName("Id"); + QName bodyName = new QName("Body"); + QName tagsName = new QName("Tags"); + QName creationDateName = new QName("CreationDate"); + QName score = new QName("Score"); + + QName postIdName = new QName("PostId"); + QName textName = new QName("Text"); + + return new Iterator<>() { + Post next = null; + Comment nextComment = null; + + @SneakyThrows + @Override + public boolean hasNext() { + if (next != null) + return true; + + while (postsXmlReader.hasNext()) { + var event = postsXmlReader.nextEvent(); + if (!event.isStartElement()) continue; + + var startEvent = event.asStartElement(); + if (!"row".equals(startEvent.getName().getLocalPart())) continue; + + var scoreAttribute = startEvent.getAttributeByName(score); + if (scoreAttribute == null) continue; + int score = Integer.parseInt(scoreAttribute.getValue()); + if (score < 1) continue; + + var titleAttribute = startEvent.getAttributeByName(titleName); + if (titleAttribute == null) continue; + String title = titleAttribute.getValue(); + + var idAttribute = startEvent.getAttributeByName(idName); + if (idAttribute == null) continue; + int id = Integer.parseInt(idAttribute.getValue()); + + var bodyAttribute = startEvent.getAttributeByName(bodyName); + if (bodyAttribute == null) continue; + String body = bodyAttribute.getValue(); + + var tagsAttribute = startEvent.getAttributeByName(tagsName); + if (tagsAttribute == null) continue; + String tags = tagsAttribute.getValue(); + List tagsParsed = parseTags(tags); + var creationDateAttribute = startEvent.getAttributeByName(creationDateName); + if (creationDateAttribute == null) continue; + String creationDate = creationDateAttribute.getValue(); + int year = Integer.parseInt(creationDate.substring(0, 4)); + + List comments = new ArrayList<>(); + do { + if (nextComment == null) continue; + + if (nextComment.postId > id) { + break; + } + if 
(nextComment.postId == id) { + comments.add(nextComment); + nextComment = null; + } + } + while (readNextComment()); + + next = new Post(title, tagsParsed, year, id, body, comments); + return true; + } + + postsInputStream.close(); + commentsInputStream.close(); + postsFile.close(); + commentsFile.close(); + + return false; + } + + private boolean readNextComment() throws XMLStreamException { + while (commentsXmlReader.hasNext()) { + var event = commentsXmlReader.nextEvent(); + if (!event.isStartElement()) continue; + + var startEvent = event.asStartElement(); + if (!"row".equals(startEvent.getName().getLocalPart())) continue; + + var postIdAttribute = startEvent.getAttributeByName(postIdName); + if (postIdAttribute == null) continue; + int postId = Integer.parseInt(postIdAttribute.getValue()); + + var textAttribute = startEvent.getAttributeByName(textName); + if (textAttribute == null) continue; + String text = textAttribute.getValue(); + + nextComment = new Comment(postId, text); + return true; + } + return false; + } + + @Override + public Post next() { + if (hasNext()) { + var ret = next; + next = null; + return ret; + } + + throw new IllegalStateException("No more posts"); + } + }; + } + + private List parseTags(String tags) { + return Arrays.stream(tags.split("<|>")) + .filter(s -> !s.isBlank()) + .collect(Collectors.toList()); + } + + + public record Post(String title, List tags, int year, int id, String body, List comments) { + + } + + public record Comment(int postId, String text) { + + } +} diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/StackexchangeSideloader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/StackexchangeSideloader.java new file mode 100644 index 00000000..97a37ac9 --- /dev/null +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/StackexchangeSideloader.java @@ -0,0 +1,149 @@ +package 
nu.marginalia.converting.sideload; + +import lombok.SneakyThrows; +import nu.marginalia.converting.model.*; +import nu.marginalia.keyword.DocumentKeywordExtractor; +import nu.marginalia.language.sentence.SentenceExtractor; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.crawl.DomainIndexingState; +import nu.marginalia.model.crawl.HtmlFeature; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.model.crawl.UrlIndexingState; +import nu.marginalia.model.idx.DocumentFlags; +import nu.marginalia.model.idx.DocumentMetadata; +import nu.marginalia.model.idx.WordFlags; +import org.apache.commons.lang3.StringUtils; +import org.jsoup.Jsoup; + +import javax.xml.stream.XMLStreamException; +import java.io.IOException; +import java.nio.file.Path; +import java.util.EnumSet; +import java.util.Iterator; +import java.util.List; +import java.util.Optional; + +/** This code is broken */ +@Deprecated() +public class StackexchangeSideloader implements SideloadSource { + private final StackExchange7zReader reader; + private final SentenceExtractor sentenceExtractor; + private final DocumentKeywordExtractor keywordExtractor; + private final String domainName; + + public StackexchangeSideloader(Path pathTo7zFile, + String domainName, + SentenceExtractor sentenceExtractor, + DocumentKeywordExtractor keywordExtractor + ) { + this.domainName = domainName; + reader = new StackExchange7zReader(pathTo7zFile); + this.sentenceExtractor = sentenceExtractor; + this.keywordExtractor = keywordExtractor; + } + + @Override + public ProcessedDomain getDomain() { + var ret = new ProcessedDomain(); + + ret.domain = new EdgeDomain(domainName); + ret.id = domainName; + ret.ip = "127.0.0.1"; + ret.state = DomainIndexingState.ACTIVE; + + return ret; + } + + @SneakyThrows + @Override + public Iterator getUrlsIterator() { + var ids = reader.getIds(); + return ids.stream() + .map(id -> EdgeUrl.parse("https://" + domainName + "/questions/" + id)) 
+ .filter(Optional::isPresent) + .map(Optional::get) + .iterator(); + } + + @Override + public Iterator getDocumentsStream() { + try { + var baseIter = reader.postIterator(); + return new Iterator<>() { + + @Override + public boolean hasNext() { + return baseIter.hasNext(); + } + + @Override + public ProcessedDocument next() { + return convert(baseIter.next()); + } + }; + } catch (IOException e) { + throw new RuntimeException(e); + } catch (XMLStreamException e) { + throw new RuntimeException(e); + } + } + + @SneakyThrows + private ProcessedDocument convert(StackExchange7zReader.Post post) { + String fullUrl = "https://" + domainName + "/questions/" + post.id(); + + StringBuilder fullHtml = new StringBuilder(); + fullHtml.append("").append(post.title()).append(""); + fullHtml.append("

    ").append(post.title()).append("

    "); + for (var comment : post.comments()) { + fullHtml.append("

    ").append(comment.text()).append("

    "); + } + fullHtml.append(""); + + var ret = new ProcessedDocument(); + try { + + var url = new EdgeUrl(fullUrl); + var doc = Jsoup.parse(fullHtml.toString()); + var dld = sentenceExtractor.extractSentences(doc); + + ret.url = url; + ret.words = keywordExtractor.extractKeywords(dld, url); + ret.words.addJustNoMeta("site:"+domainName); + ret.words.addJustNoMeta("site:"+url.domain.domain); + ret.words.addJustNoMeta(url.domain.domain); + ret.words.setFlagOnMetadataForWords(WordFlags.Subjects, post.tags()); + ret.details = new ProcessedDocumentDetails(); + ret.details.pubYear = post.year(); + ret.details.quality = 5; + ret.details.metadata = new DocumentMetadata(4, + PubDate.toYearByte(ret.details.pubYear), (int) -ret.details.quality, EnumSet.noneOf(DocumentFlags.class)); + ret.details.features = EnumSet.noneOf(HtmlFeature.class); + ret.details.generator = GeneratorType.DOCS; + ret.details.title = StringUtils.truncate(post.title(), 128); + ret.details.description = StringUtils.truncate(doc.body().text(), 512); + ret.details.length = 128; + + ret.details.standard = HtmlStandard.HTML5; + ret.details.feedLinks = List.of(); + ret.details.linksExternal = List.of(); + ret.details.linksInternal = List.of(); + ret.state = UrlIndexingState.OK; + ret.stateReason = "SIDELOAD"; + } + catch (Exception e) { + ret.url = new EdgeUrl(fullUrl); + ret.state = UrlIndexingState.DISQUALIFIED; + ret.stateReason = "SIDELOAD"; + } + + return ret; + } + + + @Override + public String getId() { + return domainName; + } +} diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java index 5b78ac9e..850b6ec2 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java @@ 
-3,12 +3,13 @@ package nu.marginalia.converting; import com.google.inject.Guice; import com.google.inject.Injector; -import nu.marginalia.bigstring.BigString; import nu.marginalia.converting.model.HtmlStandard; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.processor.DomainProcessor; +import nu.marginalia.crawling.io.SerializableCrawlDataStream; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDomain; +import nu.marginalia.crawling.model.SerializableCrawlData; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.model.crawl.PubDate; @@ -40,18 +41,17 @@ public class ConvertingIntegrationTest { public void testEmptyDomain() { var docs = new ArrayList(); - var ret = domainProcessor.process( - new CrawledDomain("123", "memex.marginalia.nu", null, "OK", "-", "127.0.0.1", - docs, Collections.emptyList())); + var domain = new CrawledDomain("123", "memex.marginalia.nu", null, "OK", "-", "127.0.0.1", + docs, Collections.emptyList()); + var ret = domainProcessor.process(asSerializableCrawlData(domain)); assertEquals(ret.state, DomainIndexingState.ACTIVE); assertEquals(ret.domain, new EdgeDomain("memex.marginalia.nu")); assertTrue(ret.documents.isEmpty()); } - @Test public void testMemexMarginaliaNuDateInternalConsistency() throws IOException { - var ret = domainProcessor.process(readMarginaliaWorkingSet()); + var ret = domainProcessor.process(asSerializableCrawlData(readMarginaliaWorkingSet())); ret.documents.stream().filter(ProcessedDocument::isProcessedFully).forEach(doc -> { int year = PubDate.fromYearByte(doc.details.metadata.year()); Integer yearMeta = doc.details.pubYear; @@ -64,7 +64,7 @@ public class ConvertingIntegrationTest { @Test public void testMemexMarginaliaNu() throws IOException { - var ret = domainProcessor.process(readMarginaliaWorkingSet()); + var ret = 
domainProcessor.process(asSerializableCrawlData(readMarginaliaWorkingSet())); assertEquals(ret.state, DomainIndexingState.ACTIVE); assertEquals(ret.domain, new EdgeDomain("memex.marginalia.nu")); @@ -110,10 +110,11 @@ public class ConvertingIntegrationTest { "OK", "", "", - BigString.encode(readClassPathFile(p.toString())), + readClassPathFile(p.toString()), Double.toString(Math.random()), "https://memex.marginalia.nu/" + file, - null + null, + "" ); docs.add(doc); } @@ -132,4 +133,14 @@ public class ConvertingIntegrationTest { return new String(Objects.requireNonNull(ClassLoader.getSystemResourceAsStream(s)).readAllBytes()); } + + private SerializableCrawlDataStream asSerializableCrawlData(CrawledDomain domain) { + List data = new ArrayList<>(); + if (domain.doc != null) { + data.addAll(domain.doc); + } + data.add(domain); + + return SerializableCrawlDataStream.fromIterator(data.iterator()); + } } diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java index 890a1081..f4aaf351 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java @@ -8,6 +8,7 @@ import nu.marginalia.converting.processor.DomainProcessor; import nu.marginalia.crawl.retreival.CrawlerRetreiver; import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; +import nu.marginalia.crawling.io.SerializableCrawlDataStream; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.crawling.model.SerializableCrawlData; @@ -55,7 +56,11 @@ public class CrawlingThenConvertingIntegrationTest { CrawledDomain 
domain = crawl(specs); - var output = domainProcessor.process(domain); + List data = new ArrayList<>(); + data.add(domain); + data.addAll(domain.doc); + + var output = domainProcessor.process(SerializableCrawlDataStream.fromIterator(data.iterator())); for (var doc : output.documents) { if (doc.isOk()) { diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/StackexchangeSideloaderTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/StackexchangeSideloaderTest.java new file mode 100644 index 00000000..ee48ccc9 --- /dev/null +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/StackexchangeSideloaderTest.java @@ -0,0 +1,21 @@ +package nu.marginalia.converting.sideload; + +import org.junit.jupiter.api.Test; + +import javax.xml.stream.XMLStreamException; +import java.io.IOException; +import java.nio.file.Path; + +class StackexchangeSideloaderTest { + @Test + public void test7zFile() throws IOException, XMLStreamException { + var stackExchangeReader = new StackExchange7zReader(Path.of("/mnt/storage/stackexchange/scifi.meta.stackexchange.com.7z")); + + System.out.println(stackExchangeReader.getIds()); + + var iter = stackExchangeReader.postIterator(); + while (iter.hasNext()) { + System.out.println(iter.next()); + } + } +} \ No newline at end of file diff --git a/code/processes/crawling-process/build.gradle b/code/processes/crawling-process/build.gradle index de504915..fcc7862d 100644 --- a/code/processes/crawling-process/build.gradle +++ b/code/processes/crawling-process/build.gradle @@ -21,14 +21,18 @@ tasks.distZip.enabled = false dependencies { implementation project(':code:common:process') + implementation project(':code:common:db') implementation project(':code:common:model') implementation project(':code:common:config') implementation project(':code:common:service') implementation project(':code:libraries:big-string') implementation 
project(':code:api:index-api') + implementation project(':code:api:process-mqapi') implementation project(':code:common:service-discovery') implementation project(':code:common:service-client') + implementation project(':code:common:message-queue') implementation project(':code:libraries:language-processing') + implementation project(':code:libraries:easy-lsh') implementation project(':code:process-models:crawling-model') implementation project(':code:process-models:converting-model') @@ -48,6 +52,8 @@ dependencies { implementation libs.jsoup implementation libs.opencsv implementation libs.rxjava + implementation libs.fastutil + implementation libs.bundles.mariadb testImplementation libs.bundles.slf4j.test diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java new file mode 100644 index 00000000..1b61cb0d --- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java @@ -0,0 +1,83 @@ +package nu.marginalia.crawl; + +import lombok.SneakyThrows; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.concurrent.Semaphore; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; + +public class CrawlLimiter { + public static final int maxPoolSize = Integer.getInteger("crawler.pool-size", 256); + + // Thresholds for throttling task-spawning. 
Note there's a bit of hysteresis to this + private final long THROTTLE_TRIGGER_FREE_RAM = Runtime.getRuntime().maxMemory() / 4; + private final long THROTTLE_RELEASE_FREE_RAM = Runtime.getRuntime().maxMemory() / 2; + + private final Semaphore taskSemCount = new Semaphore(maxPoolSize); + + // When set to true, the crawler will wait before starting additional tasks + private final AtomicBoolean throttle = new AtomicBoolean(false); + private static final Logger logger = LoggerFactory.getLogger(CrawlLimiter.class); + + public CrawlLimiter() { + Thread monitorThread = new Thread(this::monitor, "Memory Monitor"); + monitorThread.setDaemon(true); + monitorThread.start(); + } + + + @SneakyThrows + public void monitor() { + for (;;) { + synchronized (throttle) { + boolean oldThrottle = throttle.get(); + boolean newThrottle = oldThrottle; + + if (Runtime.getRuntime().maxMemory() == Long.MAX_VALUE) { + // According to the spec this may happen, although it seems to rarely + // be the case in practice + logger.warn("Memory based throttling disabled (set Xmx)"); + return; + } + + final long freeMemory = Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory(); + + if (oldThrottle && freeMemory > THROTTLE_RELEASE_FREE_RAM) { + newThrottle = false; + logger.warn("Memory based throttling released"); + } + else if (!oldThrottle && freeMemory < THROTTLE_TRIGGER_FREE_RAM) { + newThrottle = true; + logger.warn("Memory based throttling triggered"); + + // Try to GC + System.gc(); + } + + + throttle.set(newThrottle); + + if (!newThrottle) { + throttle.notifyAll(); + } + if (newThrottle != oldThrottle) { + logger.warn("Memory based throttling set to {}", newThrottle); + } + } + + TimeUnit.SECONDS.sleep(1); + } + } + + @SneakyThrows + public void waitForEnoughRAM() { + while (throttle.get()) { + synchronized (throttle) { + throttle.wait(30000); + } + } + } + +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java 
b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java index cbd9513a..fd936a7a 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java @@ -1,11 +1,22 @@ package nu.marginalia.crawl; +import com.google.gson.Gson; +import com.google.inject.Guice; +import com.google.inject.Inject; +import com.google.inject.Injector; import nu.marginalia.UserAgent; import nu.marginalia.WmsaHome; +import nu.marginalia.crawl.retreival.CrawlDataReference; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; -import nu.marginalia.crawl.retreival.fetcher.SitemapRetriever; +import nu.marginalia.crawling.io.CrawledDomainReader; +import nu.marginalia.db.storage.FileStorageService; +import nu.marginalia.mq.MessageQueueFactory; +import nu.marginalia.mq.MqMessage; +import nu.marginalia.mq.inbox.MqInboxResponse; +import nu.marginalia.mq.inbox.MqSingleShotInbox; +import nu.marginalia.process.control.ProcessHeartbeat; import nu.marginalia.process.log.WorkLog; -import plan.CrawlPlanLoader; +import nu.marginalia.service.module.DatabaseModule; import plan.CrawlPlan; import nu.marginalia.crawling.io.CrawledDomainWriter; import nu.marginalia.crawling.model.spec.CrawlingSpecification; @@ -17,45 +28,55 @@ import okhttp3.internal.Util; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; import java.nio.file.Path; -import java.util.HashSet; -import java.util.Set; +import java.sql.SQLException; +import java.util.*; import java.util.concurrent.*; +import java.util.concurrent.atomic.AtomicInteger; -public class CrawlerMain implements AutoCloseable { +import static nu.marginalia.mqapi.ProcessInboxNames.CRAWLER_INBOX; + +public class CrawlerMain { private final Logger logger = LoggerFactory.getLogger(getClass()); - private final CrawlPlan plan; - private final Path crawlDataDir; - - private final WorkLog workLog; 
+ private Path crawlDataDir; + private final ProcessHeartbeat heartbeat; private final ConnectionPool connectionPool = new ConnectionPool(5, 10, TimeUnit.SECONDS); private final Dispatcher dispatcher = new Dispatcher(new ThreadPoolExecutor(0, Integer.MAX_VALUE, 5, TimeUnit.SECONDS, new SynchronousQueue<>(), Util.threadFactory("OkHttp Dispatcher", true))); private final UserAgent userAgent; - private final ThreadPoolExecutor pool; - final int poolSize = Integer.getInteger("crawler.pool-size", 512); - final int poolQueueSize = 32; + private final MessageQueueFactory messageQueueFactory; + private final FileStorageService fileStorageService; + private final Gson gson; + private final DumbThreadPool pool; - private final Set processedIds = new HashSet<>(); + private final Map processingIds = new ConcurrentHashMap<>(); + private final CrawledDomainReader reader = new CrawledDomainReader(); - AbortMonitor abortMonitor = AbortMonitor.getInstance(); - Semaphore taskSem = new Semaphore(poolSize); + final AbortMonitor abortMonitor = AbortMonitor.getInstance(); - public CrawlerMain(CrawlPlan plan) throws Exception { - this.plan = plan; - this.userAgent = WmsaHome.getUserAgent(); + volatile int totalTasks; + final AtomicInteger tasksDone = new AtomicInteger(0); + private final CrawlLimiter limiter = new CrawlLimiter(); - // Ensure that the user agent is set for Java's HTTP requests + @Inject + public CrawlerMain(UserAgent userAgent, + ProcessHeartbeat heartbeat, + MessageQueueFactory messageQueueFactory, + FileStorageService fileStorageService, + Gson gson) { + this.heartbeat = heartbeat; + this.userAgent = userAgent; + this.messageQueueFactory = messageQueueFactory; + this.fileStorageService = fileStorageService; + this.gson = gson; - BlockingQueue queue = new LinkedBlockingQueue<>(poolQueueSize); - pool = new ThreadPoolExecutor(poolSize/128, poolSize, 5, TimeUnit.MINUTES, queue); // maybe need to set -Xss for JVM to deal with this? 
- - workLog = plan.createCrawlWorkLog(); - crawlDataDir = plan.crawl.getDir(); + // maybe need to set -Xss for JVM to deal with this? + pool = new DumbThreadPool(CrawlLimiter.maxPoolSize, 1); } public static void main(String... args) throws Exception { @@ -71,93 +92,193 @@ public class CrawlerMain implements AutoCloseable { System.setProperty("sun.net.client.defaultConnectTimeout", "30000"); System.setProperty("sun.net.client.defaultReadTimeout", "30000"); - if (args.length != 1) { - System.err.println("Arguments: crawl-plan.yaml"); - System.exit(0); - } - var plan = new CrawlPlanLoader().load(Path.of(args[0])); + // We don't want to use too much memory caching sessions for https + System.setProperty("javax.net.ssl.sessionCacheSize", "2048"); - try (var crawler = new CrawlerMain(plan)) { - crawler.run(); + Injector injector = Guice.createInjector( + new CrawlerModule(), + new DatabaseModule() + ); + var crawler = injector.getInstance(CrawlerMain.class); + + var instructions = crawler.fetchInstructions(); + try { + crawler.run(instructions.getPlan()); + instructions.ok(); } + catch (Exception ex) { + System.err.println("Crawler failed"); + ex.printStackTrace(); + instructions.err(); + } + + TimeUnit.SECONDS.sleep(5); System.exit(0); } - public void run() throws InterruptedException { - // First a validation run to ensure the file is all good to parse - logger.info("Validating JSON"); - plan.forEachCrawlingSpecification(unused -> {}); + public void run(CrawlPlan plan) throws InterruptedException, IOException { - logger.info("Let's go"); + heartbeat.start(); + try (WorkLog workLog = plan.createCrawlWorkLog()) { + // First a validation run to ensure the file is all good to parse + logger.info("Validating JSON"); - // TODO: Make this into an iterable instead so we can abort it - plan.forEachCrawlingSpecification(this::startCrawlTask); + crawlDataDir = plan.crawl.getDir(); + + int countTotal = 0; + for (var unused : plan.crawlingSpecificationIterable()) { + countTotal++; 
+ } + totalTasks = countTotal; + + logger.info("Let's go"); + + for (var crawlingSpecification : plan.crawlingSpecificationIterable()) { + + if (!abortMonitor.isAlive()) + break; + + // Check #1: Have we already crawled this site? Check is necessary for resuming a craw after a crash or something + if (workLog.isJobFinished(crawlingSpecification.id)) { + continue; + } + + // Check #2: Have we already started this crawl (but not finished it)? + // This shouldn't realistically happen, but if it does, we need to ignore it, otherwise + // we'd end crawling the same site twice and might end up writing to the same output + // file from multiple threads with complete bit salad as a result. + if (processingIds.put(crawlingSpecification.id, "") != null) { + logger.error("Ignoring duplicate id: {}", crawlingSpecification.id); + continue; + } + + pool.submit(new CrawlTask(crawlingSpecification, workLog)); + } + + logger.info("Shutting down the pool, waiting for tasks to complete..."); + + pool.shutDown(); + do { + System.out.println("Waiting for pool to terminate... " + pool.getActiveCount() + " remaining"); + } while (!pool.awaitTermination(60, TimeUnit.SECONDS)); + } + finally { + heartbeat.shutDown(); + } } + class CrawlTask implements DumbThreadPool.Task { - private void startCrawlTask(CrawlingSpecification crawlingSpecification) { + private final CrawlingSpecification specification; + private final WorkLog workLog; - if (!processedIds.add(crawlingSpecification.id)) { - - // This is a duplicate id, so we ignore it. Otherwise we'd end crawling the same site twice, - // and if we're really unlucky, we might end up writing to the same output file from multiple - // threads with complete bit salad as a result. 
- - logger.error("Ignoring duplicate id: {}", crawlingSpecification.id); - return; + CrawlTask(CrawlingSpecification specification, WorkLog workLog) { + this.specification = specification; + this.workLog = workLog; } - if (!abortMonitor.isAlive()) { - return; - } + @Override + public void run() throws Exception { - try { - taskSem.acquire(); - } catch (InterruptedException e) { - throw new RuntimeException(e); - } + limiter.waitForEnoughRAM(); - pool.execute(() -> { - try { - fetchDomain(crawlingSpecification); + HttpFetcher fetcher = new HttpFetcherImpl(userAgent.uaString(), dispatcher, connectionPool); + + try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification); + CrawlDataReference reference = getReference(specification)) + { + Thread.currentThread().setName("crawling:" + specification.domain); + + var retreiver = new CrawlerRetreiver(fetcher, specification, writer::accept); + int size = retreiver.fetch(reference); + + workLog.setJobToFinished(specification.id, writer.getOutputFile().toString(), size); + heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks); + + logger.info("Fetched {}", specification.domain); + + } catch (Exception e) { + logger.error("Error fetching domain " + specification.domain, e); } finally { - taskSem.release(); + // We don't need to double-count these; it's also kept int he workLog + processingIds.remove(specification.id); + Thread.currentThread().setName("[idle]"); } - }); - } - - private void fetchDomain(CrawlingSpecification specification) { - if (workLog.isJobFinished(specification.id)) - return; - - HttpFetcher fetcher = new HttpFetcherImpl(userAgent.uaString(), dispatcher, connectionPool); - - try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id)) { - var retreiver = new CrawlerRetreiver(fetcher, specification, writer::accept); - - int size = retreiver.fetch(); - - workLog.setJobToFinished(specification.id, 
writer.getOutputFile().toString(), size); - - logger.info("Fetched {}", specification.domain); - } catch (Exception e) { - logger.error("Error fetching domain", e); } + + private CrawlDataReference getReference(CrawlingSpecification specification) { + try { + var dataStream = reader.createDataStream(crawlDataDir, specification); + return new CrawlDataReference(dataStream); + } catch (IOException e) { + logger.warn("Failed to read previous crawl data for {}", specification.domain); + return new CrawlDataReference(); + } + } + } - public void close() throws Exception { - logger.info("Awaiting termination"); - pool.shutdown(); - - while (!pool.awaitTermination(1, TimeUnit.SECONDS)); - logger.info("All finished"); - - workLog.close(); - dispatcher.executorService().shutdownNow(); + private static class CrawlRequest { + private final CrawlPlan plan; + private final MqMessage message; + private final MqSingleShotInbox inbox; + + CrawlRequest(CrawlPlan plan, MqMessage message, MqSingleShotInbox inbox) { + this.plan = plan; + this.message = message; + this.inbox = inbox; + } + + public CrawlPlan getPlan() { + return plan; + } + + public void ok() { + inbox.sendResponse(message, MqInboxResponse.ok()); + } + public void err() { + inbox.sendResponse(message, MqInboxResponse.err()); + } + + } + + private CrawlRequest fetchInstructions() throws Exception { + + var inbox = messageQueueFactory.createSingleShotInbox(CRAWLER_INBOX, UUID.randomUUID()); + + logger.info("Waiting for instructions"); + var msgOpt = getMessage(inbox, nu.marginalia.mqapi.crawling.CrawlRequest.class.getSimpleName()); + var msg = msgOpt.orElseThrow(() -> new RuntimeException("No message received")); + + var request = gson.fromJson(msg.payload(), nu.marginalia.mqapi.crawling.CrawlRequest.class); + + var specData = fileStorageService.getStorage(request.specStorage); + var crawlData = fileStorageService.getStorage(request.crawlStorage); + + var plan = new 
CrawlPlan(specData.asPath().resolve("crawler.spec").toString(), + new CrawlPlan.WorkDir(crawlData.path(), "crawler.log"), + null); + + return new CrawlRequest(plan, msg, inbox); + } + + private Optional getMessage(MqSingleShotInbox inbox, String expectedFunction) throws SQLException, InterruptedException { + var opt = inbox.waitForMessage(30, TimeUnit.SECONDS); + if (opt.isPresent()) { + if (!opt.get().function().equals(expectedFunction)) { + throw new RuntimeException("Unexpected function: " + opt.get().function()); + } + return opt; + } + else { + var stolenMessage = inbox.stealMessage(msg -> msg.function().equals(expectedFunction)); + stolenMessage.ifPresent(mqMessage -> logger.info("Stole message {}", mqMessage)); + return stolenMessage; + } } } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerModule.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerModule.java new file mode 100644 index 00000000..ebf6d33f --- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerModule.java @@ -0,0 +1,24 @@ +package nu.marginalia.crawl; + +import com.google.gson.Gson; +import com.google.inject.AbstractModule; +import lombok.SneakyThrows; +import nu.marginalia.ProcessConfiguration; +import nu.marginalia.UserAgent; +import nu.marginalia.WmsaHome; +import nu.marginalia.model.gson.GsonFactory; + +import java.util.UUID; + +public class CrawlerModule extends AbstractModule { + @SneakyThrows + public void configure() { + bind(Gson.class).toInstance(createGson()); + bind(UserAgent.class).toInstance(WmsaHome.getUserAgent()); + bind(ProcessConfiguration.class).toInstance(new ProcessConfiguration("crawler", 0, UUID.randomUUID())); + } + + private Gson createGson() { + return GsonFactory.get(); + } +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/DumbThreadPool.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/DumbThreadPool.java new 
file mode 100644 index 00000000..076eb9e5 --- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/DumbThreadPool.java @@ -0,0 +1,122 @@ +package nu.marginalia.crawl; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; + +/** A simple thread pool implementation that will never invoke + * a task in the calling thread like {@link java.util.concurrent.ThreadPoolExecutor} + * does when the queue is full. Instead, it will block until a thread + * becomes available to run the task. This is useful for coarse grained + * tasks where the calling thread might otherwise block for hours. + */ +// TODO: This class exists in converter as well, should probably be broken out into a common library; use this version +public class DumbThreadPool { + private final List workers = new ArrayList<>(); + private final LinkedBlockingQueue tasks; + private volatile boolean shutDown = false; + private final AtomicInteger taskCount = new AtomicInteger(0); + private final Logger logger = LoggerFactory.getLogger(DumbThreadPool.class); + + public DumbThreadPool(int poolSize, int queueSize) { + tasks = new LinkedBlockingQueue<>(queueSize); + + for (int i = 0; i < poolSize; i++) { + Thread worker = new Thread(this::worker, "Crawler Thread " + i); + worker.setDaemon(true); + worker.start(); + workers.add(worker); + } + + } + + public void submit(Task task) throws InterruptedException { + tasks.put(task); + } + + public void shutDown() { + this.shutDown = true; + } + + public void shutDownNow() { + this.shutDown = true; + for (Thread worker : workers) { + worker.interrupt(); + } + } + + private void worker() { + while (!shutDown) { + try { + Task task = tasks.poll(1, TimeUnit.SECONDS); + if (task == null) { + continue; + } + + try { + taskCount.incrementAndGet(); + 
task.run(); + } + catch (Exception ex) { + logger.warn("Error executing task", ex); + } + finally { + taskCount.decrementAndGet(); + } + } + + catch (InterruptedException ex) { + logger.warn("Thread pool worker interrupted", ex); + return; + } + } + } + + + /** Wait for all tasks to complete up to the specified timeout, + * then return true if all tasks completed, false otherwise. + */ + public boolean awaitTermination(int i, TimeUnit timeUnit) throws InterruptedException { + final long start = System.currentTimeMillis(); + final long deadline = start + timeUnit.toMillis(i); + + for (var thread : workers) { + if (!thread.isAlive()) + continue; + + long timeRemaining = deadline - System.currentTimeMillis(); + if (timeRemaining <= 0) + return false; + + thread.join(timeRemaining); + if (thread.isAlive()) + return false; + } + + // Doublecheck the bookkeeping so we didn't mess up. This may mean you have to Ctrl+C the process + // if you see this warning forever, but for the crawler this is preferable to terminating early + // and missing tasks. (maybe some cosmic ray or OOM condition or X-Files baddie of the week killed a + // thread so hard and it didn't invoke finally and didn't decrement the task count) + + int activeCount = getActiveCount(); + if (activeCount != 0) { + logger.warn("Thread pool terminated with {} active threads(?!) 
-- check what's going on with jstack and kill manually", activeCount); + return false; + } + + return true; + } + + public int getActiveCount() { + return taskCount.get(); + } + + public interface Task { + void run() throws Exception; + } +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java new file mode 100644 index 00000000..985bfc39 --- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java @@ -0,0 +1,82 @@ +package nu.marginalia.crawl.retreival; + +import com.google.common.hash.HashFunction; +import com.google.common.hash.Hashing; +import nu.marginalia.crawling.io.SerializableCrawlDataStream; +import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.lsh.EasyLSH; + +import javax.annotation.Nullable; +import java.io.IOException; + +/** A reference to a domain that has been crawled before. 
*/ +public class CrawlDataReference implements AutoCloseable { + + private final SerializableCrawlDataStream data; + + public CrawlDataReference(SerializableCrawlDataStream data) { + this.data = data; + } + + public CrawlDataReference() { + this(SerializableCrawlDataStream.empty()); + } + + @Nullable + public CrawledDocument nextDocument() { + try { + while (data.hasNext()) { + if (data.next() instanceof CrawledDocument doc) { + return doc; + } + } + } + catch (IOException ex) { + ex.printStackTrace(); + } + return null; + } + + public boolean isContentBodySame(CrawledDocument one, CrawledDocument other) { + assert one.documentBody != null; + assert other.documentBody != null; + + final long contentHashOne = contentHash(one.documentBody); + final long contentHashOther = contentHash(other.documentBody); + + return EasyLSH.hammingDistance(contentHashOne, contentHashOther) < 4; + } + + private long contentHash(String content) { + EasyLSH hash = new EasyLSH(); + int next = 0; + + boolean isInTag = false; + + // In a naive best-effort fashion, extract the text + // content of the document and feed it into the LSH + for (int i = 0; i < content.length(); i++) { + char c = content.charAt(i); + if (c == '<') { + isInTag = true; + } else if (c == '>') { + isInTag = false; + } else if (!isInTag) { + next = (next << 8) | (c & 0xff); + hash.addHashUnordered(hashInt(next)); + } + } + + return hash.get(); + } + + private final HashFunction hashFunction = Hashing.murmur3_128(); + private int hashInt(int v) { + return hashFunction.hashInt(v).asInt(); + } + + @Override + public void close() throws Exception { + data.close(); + } +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java new file mode 100644 index 00000000..ca2494dc --- /dev/null +++ 
b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java @@ -0,0 +1,57 @@ +package nu.marginalia.crawl.retreival; + +import lombok.SneakyThrows; + +import static java.lang.Math.max; +import static java.lang.Math.min; + +public class CrawlDelayTimer { + + // When no crawl delay is specified, lean toward twice the fetch+process time, within these limits: + private static final long DEFAULT_CRAWL_DELAY_MIN_MS = Long.getLong("defaultCrawlDelay", 1000); + private static final long DEFAULT_CRAWL_DELAY_MAX_MS = Long.getLong("defaultCrawlDelaySlow", 2500); + + /** Flag to indicate that the crawler should slow down, e.g. from 429s */ + private boolean slowDown = false; + + private final long delayTime; + + public CrawlDelayTimer(long delayTime) { + this.delayTime = delayTime; + } + + @SneakyThrows + public void delay(long spentTime) { + long sleepTime = delayTime; + + if (sleepTime >= 1) { + if (spentTime > sleepTime) + return; + + Thread.sleep(min(sleepTime - spentTime, 5000)); + } + else if (slowDown) { + // Additional delay when the server is signalling it wants slower requests + Thread.sleep( DEFAULT_CRAWL_DELAY_MIN_MS); + } + else { + // When no crawl delay is specified, lean toward twice the fetch+process time, + // within sane limits. This means slower servers get slower crawling, and faster + // servers get faster crawling. 
+ + sleepTime = spentTime * 2; + sleepTime = min(sleepTime, DEFAULT_CRAWL_DELAY_MAX_MS); + sleepTime = max(sleepTime, DEFAULT_CRAWL_DELAY_MIN_MS); + + if (spentTime > sleepTime) + return; + + Thread.sleep(sleepTime - spentTime); + } + } + + /** Increase the delay between requests if the server is signalling it wants slower requests with HTTP 429 */ + public void slowDown() { + slowDown = true; + } +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index 3af0110a..3549b25b 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -4,6 +4,7 @@ import com.google.common.hash.HashFunction; import com.google.common.hash.Hashing; import crawlercommons.robots.SimpleRobotRules; import lombok.SneakyThrows; +import nu.marginalia.crawl.retreival.fetcher.ContentTags; import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawl.retreival.fetcher.SitemapRetriever; import nu.marginalia.crawling.model.spec.CrawlingSpecification; @@ -17,31 +18,19 @@ import org.jsoup.nodes.Document; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import javax.annotation.Nullable; import java.net.InetAddress; import java.net.UnknownHostException; import java.time.LocalDateTime; import java.util.*; import java.util.function.Consumer; -import static java.lang.Math.max; -import static java.lang.Math.min; - public class CrawlerRetreiver { - private static final long DEFAULT_CRAWL_DELAY_MIN_MS = Long.getLong("defaultCrawlDelay", 1000); - private static final long DEFAULT_CRAWL_DELAY_MAX_MS = Long.getLong("defaultCrawlDelaySlow", 2500); private static final int MAX_ERRORS = 20; private final HttpFetcher fetcher; - - /** Flag to indicate that the crawler 
should slow down, e.g. from 429s */ - private boolean slowDown = false; - - - /** Testing flag to disable crawl delay (otherwise crawler tests take several minutes) */ - private boolean testFlagIgnoreDelay = false; - private final String id; private final String domain; private final Consumer crawledDomainWriter; @@ -57,9 +46,14 @@ public class CrawlerRetreiver { private final SitemapRetriever sitemapRetriever; private final DomainCrawlFrontier crawlFrontier; - int errorCount = 0; + /** recrawlState tag for documents that had a HTTP status 304 */ + private static final String documentWasRetainedTag = "RETAINED/304"; + + /** recrawlState tag for documents that had a 200 status but were identical to a previous version */ + private static final String documentWasSameTag = "SAME-BY-COMPARISON"; + public CrawlerRetreiver(HttpFetcher fetcher, CrawlingSpecification specs, Consumer writer) { @@ -73,9 +67,9 @@ public class CrawlerRetreiver { this.crawlFrontier = new DomainCrawlFrontier(new EdgeDomain(domain), specs.urls, specs.crawlDepth); sitemapRetriever = fetcher.createSitemapRetriever(); + // We must always crawl the index page first, this is assumed when fingerprinting the server var fst = crawlFrontier.peek(); if (fst != null) { - // Ensure the index page is always crawled var root = fst.withPathAndParam("/", null); @@ -88,16 +82,15 @@ public class CrawlerRetreiver { } } - public CrawlerRetreiver withNoDelay() { - testFlagIgnoreDelay = true; - return this; + public int fetch() { + return fetch(new CrawlDataReference()); } - public int fetch() { + public int fetch(CrawlDataReference oldCrawlData) { final DomainProber.ProbeResult probeResult = domainProber.probeDomain(fetcher, domain, crawlFrontier.peek()); if (probeResult instanceof DomainProber.ProbeResultOk) { - return crawlDomain(); + return crawlDomain(oldCrawlData); } // handle error cases for probe @@ -134,21 +127,29 @@ public class CrawlerRetreiver { throw new IllegalStateException("Unknown probe result: " + 
probeResult); }; - private int crawlDomain() { + private int crawlDomain(CrawlDataReference oldCrawlData) { String ip = findIp(domain); assert !crawlFrontier.isEmpty(); - var robotsRules = fetcher.fetchRobotRules(crawlFrontier.peek().domain); + final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(crawlFrontier.peek().domain); + final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay()); + + sniffRootDocument(delayTimer); + + // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified + int recrawled = recrawl(oldCrawlData, robotsRules, delayTimer); + + if (recrawled > 0) { + // If we have reference data, we will always grow the crawl depth a bit + crawlFrontier.increaseDepth(1.5); + } downloadSitemaps(robotsRules); - sniffRootDocument(); - - long crawlDelay = robotsRules.getCrawlDelay(); CrawledDomain ret = new CrawledDomain(id, domain, null, CrawlerDomainStatus.OK.name(), null, ip, new ArrayList<>(), null); - int fetchedCount = 0; + int fetchedCount = recrawled; while (!crawlFrontier.isEmpty() && !crawlFrontier.isCrawlDepthReached() @@ -161,18 +162,26 @@ public class CrawlerRetreiver { continue; } + // Check the link filter if the endpoint should be fetched based on site-type if (!crawlFrontier.filterLink(top)) continue; + + // Check vs blocklist if (urlBlocklist.isUrlBlocked(top)) continue; + if (!isAllowedProtocol(top.proto)) continue; + + // Check if the URL is too long to insert into the DB if (top.toString().length() > 255) continue; + if (!crawlFrontier.addVisited(top)) continue; - if (fetchDocument(top, crawlDelay)) { + + if (fetchWriteAndSleep(top, delayTimer, DocumentWithReference.empty()).isPresent()) { fetchedCount++; } } @@ -184,6 +193,74 @@ public class CrawlerRetreiver { return fetchedCount; } + /** Performs a re-crawl of old documents, comparing etags and last-modified */ + private int recrawl(CrawlDataReference oldCrawlData, + SimpleRobotRules robotsRules, + CrawlDelayTimer 
delayTimer) { + int recrawled = 0; + int retained = 0; + + for (;;) { + CrawledDocument doc = oldCrawlData.nextDocument(); + + if (doc == null) { + break; + } + + // This Shouldn't Happen (TM) + var urlMaybe = EdgeUrl.parse(doc.url); + if (urlMaybe.isEmpty()) continue; + var url = urlMaybe.get(); + + // If we've previously 404:d on this URL, we'll refrain from trying to fetch it again + if (doc.httpStatus == 404) { + crawlFrontier.addVisited(url); + continue; + } + + if (doc.httpStatus != 200) continue; + + if (!robotsRules.isAllowed(url.toString())) { + crawledDomainWriter.accept(createRobotsError(url)); + continue; + } + if (!crawlFrontier.filterLink(url)) + continue; + if (!crawlFrontier.addVisited(url)) + continue; + + + if (recrawled > 10 + && retained > 0.9 * recrawled + && Math.random() < 0.75) + { + // Since it looks like most of these documents haven't changed, + // we'll load the documents directly; but we do this in a random + // fashion to make sure we eventually catch changes over time + + crawledDomainWriter.accept(doc); + crawlFrontier.addVisited(url); + continue; + } + + + // GET the document with the stored document as a reference + // providing etag and last-modified headers, so we can recycle the + // document if it hasn't changed without actually downloading it + + var fetchedDocOpt = fetchWriteAndSleep(url, + delayTimer, + new DocumentWithReference(doc, oldCrawlData)); + if (fetchedDocOpt.isEmpty()) continue; + + if (documentWasRetainedTag.equals(fetchedDocOpt.get().recrawlState)) retained ++; + else if (documentWasSameTag.equals(fetchedDocOpt.get().recrawlState)) retained ++; + + recrawled ++; + } + + return recrawled; + } private void downloadSitemaps(SimpleRobotRules robotsRules) { List sitemaps = robotsRules.getSitemaps(); @@ -231,13 +308,13 @@ public class CrawlerRetreiver { logger.debug("Queue is now {}", crawlFrontier.queueSize()); } - private void sniffRootDocument() { + private void sniffRootDocument(CrawlDelayTimer delayTimer) { try { 
logger.debug("Configuring link filter"); - var url = crawlFrontier.peek(); + var url = crawlFrontier.peek().withPathAndParam("/", null); - var maybeSample = fetchUrl(url).filter(sample -> sample.httpStatus == 200); + var maybeSample = fetchUrl(url, delayTimer, DocumentWithReference.empty()).filter(sample -> sample.httpStatus == 200); if (maybeSample.isEmpty()) return; var sample = maybeSample.get(); @@ -246,7 +323,7 @@ public class CrawlerRetreiver { return; // Sniff the software based on the sample document - var doc = Jsoup.parse(sample.documentBody.decode()); + var doc = Jsoup.parse(sample.documentBody); crawlFrontier.setLinkFilter(linkFilterSelector.selectFilter(doc)); for (var link : doc.getElementsByTag("link")) { @@ -273,30 +350,41 @@ public class CrawlerRetreiver { } } - private boolean fetchDocument(EdgeUrl top, long crawlDelay) { + private Optional fetchWriteAndSleep(EdgeUrl top, + CrawlDelayTimer timer, + DocumentWithReference reference) { logger.debug("Fetching {}", top); long startTime = System.currentTimeMillis(); - var doc = fetchUrl(top); - if (doc.isPresent()) { - var d = doc.get(); - crawledDomainWriter.accept(d); + var docOpt = fetchUrl(top, timer, reference); - if (d.url != null) { - EdgeUrl.parse(d.url).ifPresent(crawlFrontier::addVisited); + if (docOpt.isPresent()) { + var doc = docOpt.get(); + + if (!Objects.equals(doc.recrawlState, documentWasRetainedTag) + && reference.isContentBodySame(doc)) + { + // The document didn't change since the last time + doc.recrawlState = documentWasSameTag; } - if ("ERROR".equals(d.crawlerStatus) && d.httpStatus != 404) { + crawledDomainWriter.accept(doc); + + if (doc.url != null) { + // We may have redirected to a different path + EdgeUrl.parse(doc.url).ifPresent(crawlFrontier::addVisited); + } + + if ("ERROR".equals(doc.crawlerStatus) && doc.httpStatus != 404) { errorCount++; } } - long crawledTime = System.currentTimeMillis() - startTime; - delay(crawlDelay, crawledTime); + 
timer.delay(System.currentTimeMillis() - startTime); - return doc.isPresent(); + return docOpt; } private boolean isAllowedProtocol(String proto) { @@ -304,18 +392,21 @@ public class CrawlerRetreiver { || proto.equalsIgnoreCase("https"); } - private Optional fetchUrl(EdgeUrl top) { + private Optional fetchUrl(EdgeUrl top, CrawlDelayTimer timer, DocumentWithReference reference) { try { - var doc = fetchContent(top); + var contentTags = reference.getContentTags(); + var fetchedDoc = tryDownload(top, timer, contentTags); + + CrawledDocument doc = reference.replaceOn304(fetchedDoc); if (doc.documentBody != null) { - doc.documentBodyHash = createHash(doc.documentBody.decode()); + doc.documentBodyHash = createHash(doc.documentBody); - Optional parsedDoc = parseDoc(doc); + var parsedDoc = Jsoup.parse(doc.documentBody); EdgeUrl url = new EdgeUrl(doc.url); - parsedDoc.ifPresent(parsed -> findLinks(url, parsed)); - parsedDoc.flatMap(parsed -> findCanonicalUrl(url, parsed)) + findLinks(url, parsedDoc); + findCanonicalUrl(url, parsedDoc) .ifPresent(canonicalLink -> doc.canonicalUrl = canonicalLink.toString()); } @@ -329,14 +420,18 @@ public class CrawlerRetreiver { } + @SneakyThrows - private CrawledDocument fetchContent(EdgeUrl top) { + private CrawledDocument tryDownload(EdgeUrl top, CrawlDelayTimer timer, ContentTags tags) { for (int i = 0; i < 2; i++) { try { - return fetcher.fetchContent(top); + var doc = fetcher.fetchContent(top, tags); + doc.recrawlState = "NEW"; + return doc; } catch (RateLimitException ex) { - slowDown = true; + timer.slowDown(); + int delay = ex.retryAfter(); if (delay > 0 && delay < 5000) { Thread.sleep(delay); @@ -351,12 +446,6 @@ public class CrawlerRetreiver { return hashMethod.hashUnencodedChars(documentBodyHash).toString(); } - private Optional parseDoc(CrawledDocument doc) { - if (doc.documentBody == null) - return Optional.empty(); - return Optional.of(Jsoup.parse(doc.documentBody.decode())); - } - private void findLinks(EdgeUrl baseUrl, 
Document parsed) { baseUrl = linkParser.getBaseLink(parsed, baseUrl); @@ -396,36 +485,6 @@ public class CrawlerRetreiver { } } - @SneakyThrows - private void delay(long sleepTime, long spentTime) { - if (testFlagIgnoreDelay) - return; - - if (sleepTime >= 1) { - if (spentTime > sleepTime) - return; - - Thread.sleep(min(sleepTime - spentTime, 5000)); - } - else if (slowDown) { - Thread.sleep( 1000); - } - else { - // When no crawl delay is specified, lean toward twice the fetch+process time, - // within sane limits. This means slower servers get slower crawling, and faster - // servers get faster crawling. - - sleepTime = spentTime * 2; - sleepTime = min(sleepTime, DEFAULT_CRAWL_DELAY_MAX_MS); - sleepTime = max(sleepTime, DEFAULT_CRAWL_DELAY_MIN_MS); - - if (spentTime > sleepTime) - return; - - Thread.sleep(sleepTime - spentTime); - } - } - private CrawledDocument createRobotsError(EdgeUrl url) { return CrawledDocument.builder() .url(url.toString()) @@ -443,4 +502,75 @@ public class CrawlerRetreiver { .build(); } + private record DocumentWithReference( + @Nullable CrawledDocument doc, + @Nullable CrawlDataReference reference) { + + private static final DocumentWithReference emptyInstance = new DocumentWithReference(null, null); + public static DocumentWithReference empty() { + return emptyInstance; + } + + public boolean isContentBodySame(CrawledDocument newDoc) { + if (reference == null) + return false; + if (doc == null) + return false; + if (doc.documentBody == null) + return false; + if (newDoc.documentBody == null) + return false; + + return reference.isContentBodySame(doc, newDoc); + } + + private ContentTags getContentTags() { + if (null == doc) + return ContentTags.empty(); + + String headers = doc.headers; + if (headers == null) + return ContentTags.empty(); + + String[] headersLines = headers.split("\n"); + + String lastmod = null; + String etag = null; + + for (String line : headersLines) { + if (line.toLowerCase().startsWith("etag:")) { + etag = 
line.substring(5).trim(); + } + if (line.toLowerCase().startsWith("last-modified:")) { + lastmod = line.substring(14).trim(); + } + } + + return new ContentTags(etag, lastmod); + } + + public boolean isEmpty() { + return doc == null || reference == null; + } + + /** If the provided document has HTTP status 304, and the reference document is provided, + * return the reference document; otherwise return the provided document. + */ + public CrawledDocument replaceOn304(CrawledDocument fetchedDoc) { + + if (doc == null) + return fetchedDoc; + + // HTTP status 304 is NOT MODIFIED, which means the document is the same as it was when + // we fetched it last time. We can recycle the reference document. + if (fetchedDoc.httpStatus != 304) + return fetchedDoc; + + var ret = doc; + ret.recrawlState = documentWasRetainedTag; + ret.timestamp = LocalDateTime.now().toString(); + return ret; + } + } + } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java index b6e23f0c..30902a8e 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java @@ -1,37 +1,53 @@ package nu.marginalia.crawl.retreival; +import com.google.common.hash.HashFunction; +import it.unimi.dsi.fastutil.longs.LongOpenHashSet; import nu.marginalia.ip_blocklist.UrlBlocklist; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; +import java.net.URISyntaxException; import java.util.*; import java.util.function.Predicate; public class DomainCrawlFrontier { - private final LinkedList queue = new LinkedList<>(); - private final HashSet visited; - private final HashSet known; + private final ArrayDeque queue; + + // To save the number of strings kept in memory, + // do an approximate 
check using 64 bit hashes instead + // .. + // This isn't perfect, and may lead to false positives, + // but this is relatively unlikely, since the cardinality of these + // need to be in the billions to approach Birthday Paradox + // territory + private final LongOpenHashSet visited; + private final LongOpenHashSet known; + private final HashFunction hasher = com.google.common.hash.Hashing.murmur3_128(); private final EdgeDomain thisDomain; private final UrlBlocklist urlBlocklist; private Predicate linkFilter = url -> true; - final int depth; + private int depth; public DomainCrawlFrontier(EdgeDomain thisDomain, Collection urls, int depth) { this.thisDomain = thisDomain; this.urlBlocklist = new UrlBlocklist(); this.depth = depth; - visited = new HashSet<>((int)(urls.size() * 1.5)); - known = new HashSet<>(urls.size() * 10); + queue = new ArrayDeque<>(10 + (int) (urls.size()*1.2)); + visited = new LongOpenHashSet(10 + (int)(urls.size() * 1.5)); + known = new LongOpenHashSet(10 + urls.size() * 2); for (String urlStr : urls) { EdgeUrl.parse(urlStr).ifPresent(this::addToQueue); } } + public void increaseDepth(double depthIncreaseFactor) { + depth = (int)(depth * depthIncreaseFactor); + } public void setLinkFilter(Predicate linkFilter) { this.linkFilter = linkFilter; } @@ -45,21 +61,45 @@ public class DomainCrawlFrontier { } public void addFirst(EdgeUrl url) { - if (known.add(url.toString())) { - queue.addFirst(url); + if (addKnown(url)) { + queue.addFirst(url.toString()); } } public EdgeUrl takeNextUrl() { - return queue.removeFirst(); + try { + return new EdgeUrl(queue.removeFirst()); + } catch (URISyntaxException e) { + // This should never happen since we only add urls via EdgeUrl.toString() + throw new RuntimeException(e); + } } public EdgeUrl peek() { - return queue.peek(); + try { + if (queue.peek() == null) { + return null; + } + return new EdgeUrl(queue.peek()); + } catch (URISyntaxException e) { + // This should never happen since we only add urls via 
EdgeUrl.toString() + throw new RuntimeException(e); + } } public boolean addVisited(EdgeUrl url) { - return visited.add(url.toString()); + long hashCode = hasher.hashUnencodedChars(url.toString()).padToLong(); + + return visited.add(hashCode); + } + public boolean addKnown(EdgeUrl url) { + long hashCode = hasher.hashUnencodedChars(url.toString()).padToLong(); + return known.add(hashCode); + } + + boolean isVisited(EdgeUrl url) { + long hashCode = hasher.hashUnencodedChars(url.toString()).padToLong(); + return visited.contains(hashCode); } public boolean filterLink(EdgeUrl url) { @@ -77,11 +117,14 @@ public class DomainCrawlFrontier { return; // reduce memory usage by not growing queue huge when crawling large sites - if (queue.size() + visited.size() >= depth + 100) + if (queue.size() + visited.size() >= depth + 200) return; - if (known.add(url.toString())) { - queue.addLast(url); + if (isVisited(url)) + return; + + if (addKnown(url)) { + queue.addLast(url.toString()); } } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/ContentTags.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/ContentTags.java new file mode 100644 index 00000000..e1df86c8 --- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/ContentTags.java @@ -0,0 +1,24 @@ +package nu.marginalia.crawl.retreival.fetcher; + +import okhttp3.Request; + +/** Encapsulates request modifiers; the ETag and Last-Modified tags for a resource */ +public record ContentTags(String etag, String lastMod) { + public static ContentTags empty() { + return new ContentTags(null, null); + } + + public boolean isPresent() { + return etag != null || lastMod != null; + } + + public boolean isEmpty() { + return etag == null && lastMod == null; + } + + /** Paints the tags onto the request builder. 
*/ + public void paint(Request.Builder getBuilder) { + if (etag != null) getBuilder.addHeader("If-None-Match", etag); + if (lastMod != null) getBuilder.addHeader("If-Modified-Since", lastMod); + } +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java index 1f630ac5..11ad272e 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java @@ -18,7 +18,7 @@ public interface HttpFetcher { FetchResult probeDomain(EdgeUrl url); - CrawledDocument fetchContent(EdgeUrl url) throws RateLimitException; + CrawledDocument fetchContent(EdgeUrl url, ContentTags tags) throws RateLimitException; SimpleRobotRules fetchRobotRules(EdgeDomain domain); diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java index 55a6d296..b0b0fd9d 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java @@ -12,19 +12,19 @@ import nu.marginalia.crawling.model.CrawlerDocumentStatus; import nu.marginalia.crawling.model.ContentType; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.bigstring.BigString; import nu.marginalia.crawl.retreival.logic.ContentTypeLogic; import nu.marginalia.crawl.retreival.logic.ContentTypeParser; import okhttp3.*; import org.apache.commons.io.input.BOMInputStream; +import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import 
javax.net.ssl.SSLException; import javax.net.ssl.X509TrustManager; +import java.io.EOFException; import java.io.IOException; -import java.net.SocketTimeoutException; -import java.net.URISyntaxException; -import java.net.URL; +import java.net.*; import java.nio.charset.Charset; import java.nio.charset.IllegalCharsetNameException; import java.nio.charset.StandardCharsets; @@ -120,34 +120,31 @@ public class HttpFetcherImpl implements HttpFetcher { return probeDomain(new EdgeUrl("https", url.domain, url.port, url.path, url.param)); } - logger.info("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage()); + logger.info("Error during fetching", ex); return new FetchResult(FetchResultState.ERROR, url.domain); } } - private Request createHeadRequest(EdgeUrl url) { - return new Request.Builder().head().addHeader("User-agent", userAgent) - .url(url.toString()) - .addHeader("Accept-Encoding", "gzip") - .build(); - } - - private Request createGetRequest(EdgeUrl url) { - return new Request.Builder().get().addHeader("User-agent", userAgent) - .url(url.toString()) - .addHeader("Accept-Encoding", "gzip") - .build(); - - } @Override @SneakyThrows - public CrawledDocument fetchContent(EdgeUrl url) throws RateLimitException { + public CrawledDocument fetchContent(EdgeUrl url, + ContentTags contentTags) + throws RateLimitException + { - if (contentTypeLogic.isUrlLikeBinary(url)) { + // We don't want to waste time and resources on URLs that are not HTML, so if the file ending + // looks like it might be something else, we perform a HEAD first to check the content type + if (contentTags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url)) + { logger.debug("Probing suspected binary {}", url); - var head = createHeadRequest(url); + var headBuilder = new Request.Builder().head() + .addHeader("User-agent", userAgent) + .url(url.toString()) + .addHeader("Accept-Encoding", "gzip"); + + var head = headBuilder.build(); var call = client.newCall(head); try (var rsp = 
call.execute()) { @@ -155,6 +152,21 @@ public class HttpFetcherImpl implements HttpFetcher { if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) { return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, "Early probe failed"); } + + // Update the URL to the final URL of the HEAD request, otherwise we might end up doing + + // HEAD 301 url1 -> url2 + // HEAD 200 url2 + // GET 301 url1 -> url2 + // GET 200 url2 + + // which is not what we want. Overall we want to do as few requests as possible to not raise + // too many eyebrows when looking at the logs on the target server. Overall it's probably desirable + // that it looks like the traffic makes sense, as opposed to looking like a broken bot. + + var redirectUrl = new EdgeUrl(rsp.request().url().toString()); + if (Objects.equals(redirectUrl.domain, url.domain)) + url = redirectUrl; } catch (SocketTimeoutException ex) { return createTimeoutErrorRsp(url, ex); @@ -165,7 +177,15 @@ public class HttpFetcherImpl implements HttpFetcher { } } - var get = createGetRequest(url); + var getBuilder = new Request.Builder().get(); + + getBuilder.addHeader("User-agent", userAgent) + .url(url.toString()) + .addHeader("Accept-Encoding", "gzip"); + + contentTags.paint(getBuilder); + + var get = getBuilder.build(); var call = client.newCall(get); try (var rsp = call.execute()) { @@ -177,11 +197,18 @@ public class HttpFetcherImpl implements HttpFetcher { catch (SocketTimeoutException ex) { return createTimeoutErrorRsp(url, ex); } - catch (IllegalCharsetNameException ex) { + catch (UnknownHostException ex) { + return createUnknownHostError(url, ex); + } + catch (SocketException | ProtocolException | IllegalCharsetNameException | SSLException | EOFException ex) { + // This is a bit of a grab-bag of errors that crop up + // IllegalCharsetName is egg on our face, + // but SSLException and EOFException are probably the server's fault + return createHardErrorRsp(url, ex); } catch 
(Exception ex) { - logger.error("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage()); + logger.error("Error during fetching", ex); return createHardErrorRsp(url, ex); } } @@ -194,6 +221,16 @@ public class HttpFetcherImpl implements HttpFetcher { .url(url.toString()) .build(); } + + private CrawledDocument createUnknownHostError(EdgeUrl url, Exception why) { + return CrawledDocument.builder() + .crawlerStatus(CrawlerDocumentStatus.ERROR.toString()) + .crawlerStatusDesc("Unknown Host") + .timestamp(LocalDateTime.now().toString()) + .url(url.toString()) + .build(); + } + private CrawledDocument createTimeoutErrorRsp(EdgeUrl url, Exception why) { return CrawledDocument.builder() .crawlerStatus("Timeout") @@ -253,6 +290,17 @@ public class HttpFetcherImpl implements HttpFetcher { return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CHARSET, ""); } + if (!isXRobotsTagsPermitted(rsp.headers("X-Robots-Tag"), userAgent)) { + return CrawledDocument.builder() + .crawlerStatus(CrawlerDocumentStatus.ROBOTS_TXT.name()) + .crawlerStatusDesc("X-Robots-Tag") + .url(responseUrl.toString()) + .httpStatus(-1) + .timestamp(LocalDateTime.now().toString()) + .headers(rsp.headers().toString()) + .build(); + } + var strData = getStringData(data, contentType); var canonical = rsp.header("rel=canonical", ""); @@ -264,10 +312,57 @@ public class HttpFetcherImpl implements HttpFetcher { .canonicalUrl(canonical) .httpStatus(rsp.code()) .url(responseUrl.toString()) - .documentBody(BigString.encode(strData)) + .documentBody(strData) .build(); } + /** Check X-Robots-Tag header tag to see if we are allowed to index this page. + *

    + * Reference: https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag + * + * @param xRobotsHeaderTags List of X-Robots-Tag values + * @param userAgent User agent string + * @return true if we are allowed to index this page + */ + // Visible for tests + public static boolean isXRobotsTagsPermitted(List xRobotsHeaderTags, String userAgent) { + boolean isPermittedGeneral = true; + boolean isPermittedMarginalia = false; + boolean isForbiddenMarginalia = false; + + for (String header : xRobotsHeaderTags) { + if (header.indexOf(':') >= 0) { + String[] parts = StringUtils.split(header, ":", 2); + + if (parts.length < 2) + continue; + + // Is this relevant to us? + if (!Objects.equals(parts[0].trim(), userAgent)) + continue; + + if (parts[1].contains("noindex")) + isForbiddenMarginalia = true; + else if (parts[1].contains("none")) + isForbiddenMarginalia = true; + else if (parts[1].contains("all")) + isPermittedMarginalia = true; + } + else { + if (header.contains("noindex")) + isPermittedGeneral = false; + if (header.contains("none")) + isPermittedGeneral = false; + } + } + + if (isPermittedMarginalia) + return true; + if (isForbiddenMarginalia) + return false; + return isPermittedGeneral; + } + private String getStringData(byte[] data, ContentType contentType) { Charset charset; try { @@ -315,7 +410,7 @@ public class HttpFetcherImpl implements HttpFetcher { private Optional fetchRobotsForProto(String proto, EdgeDomain domain) { try { var url = new EdgeUrl(proto, domain, null, "/robots.txt", null); - return Optional.of(parseRobotsTxt(fetchContent(url))); + return Optional.of(parseRobotsTxt(fetchContent(url, ContentTags.empty()))); } catch (Exception ex) { return Optional.empty(); @@ -324,7 +419,7 @@ public class HttpFetcherImpl implements HttpFetcher { private SimpleRobotRules parseRobotsTxt(CrawledDocument doc) { return robotsParser.parseContent(doc.url, - doc.documentBody.decode().getBytes(), + doc.documentBody.getBytes(), doc.contentType, 
userAgent); } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/NoSecuritySSL.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/NoSecuritySSL.java index a52251bc..f86d2c48 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/NoSecuritySSL.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/NoSecuritySSL.java @@ -33,7 +33,14 @@ public class NoSecuritySSL { // Install the all-trusting trust manager final SSLContext sslContext = SSLContext.getInstance("SSL"); sslContext.init(null, trustAllCerts, new java.security.SecureRandom()); - // Create an ssl socket factory with our all-trusting manager + + var clientSessionContext = sslContext.getClientSessionContext(); + + // The default value for this is very high and will use a crapload of memory + // since the crawler will be making a lot of requests to various hosts + clientSessionContext.setSessionCacheSize(2048); + + // Create a ssl socket factory with our all-trusting manager return sslContext.getSocketFactory(); } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/SitemapRetriever.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/SitemapRetriever.java index bb2d2898..90b26a88 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/SitemapRetriever.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/SitemapRetriever.java @@ -5,7 +5,6 @@ import nu.marginalia.model.EdgeUrl; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import javax.inject.Singleton; import java.io.FileNotFoundException; import java.io.IOException; import java.util.*; @@ -44,14 +43,19 @@ public class SitemapRetriever { final List urlsList = new ArrayList<>(10000); final Set seenUrls = new HashSet<>(); - 
final LinkedList maps = new LinkedList<>(); + final ArrayDeque maps = new ArrayDeque<>(); maps.add(map); - while (!maps.isEmpty()) { + while (!maps.isEmpty() && seenSiteMapUrls.size() > 2) { if (urlsList.size() >= 10000) break; + // This is some weird site that too many sitemaps + // ... it's causing us to run out of memory + if (seenSiteMapUrls.size() > 25) + break; + var firstMap = maps.removeFirst(); if (!seenSiteMapUrls.add(firstMap.getUrl().toString())) { @@ -74,7 +78,12 @@ public class SitemapRetriever { } else if (map instanceof SiteMapIndex index) { var sitemaps = index.getSitemaps(false); - maps.addAll(sitemaps); + for (var sitemap : sitemaps) { + // Limit how many sitemaps we can add to the queue + if (maps.size() < 25) { + maps.add(sitemap); + } + } } else { logger.warn("Unknown sitemap type: {}", map.getClass()); diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/DomainCrawlFrontierTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/DomainCrawlFrontierTest.java new file mode 100644 index 00000000..1396444b --- /dev/null +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/DomainCrawlFrontierTest.java @@ -0,0 +1,32 @@ +package nu.marginalia.crawl.retreival; + +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; +import org.junit.jupiter.api.Test; + +import java.net.URISyntaxException; +import java.util.Set; + +import static org.junit.jupiter.api.Assertions.*; + +class DomainCrawlFrontierTest { + + @Test + public void testVisited() throws URISyntaxException { + var dcf = new DomainCrawlFrontier(new EdgeDomain("example.com"), Set.of(), 100); + + assertTrue(dcf.addVisited(new EdgeUrl("https://example.com"))); + assertTrue(dcf.isVisited(new EdgeUrl("https://example.com"))); + assertFalse(dcf.addVisited(new EdgeUrl("https://example.com"))); + } + + @Test + public void testKnown() throws URISyntaxException { + var dcf = new 
DomainCrawlFrontier(new EdgeDomain("example.com"), Set.of(), 100); + + assertTrue(dcf.addKnown(new EdgeUrl("https://example.com"))); + assertFalse(dcf.addKnown(new EdgeUrl("https://example.com/"))); + assertTrue(dcf.addKnown(new EdgeUrl("https://example.com/index.html"))); + assertFalse(dcf.addKnown(new EdgeUrl("https://example.com"))); + } +} \ No newline at end of file diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImplTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImplTest.java new file mode 100644 index 00000000..27b55760 --- /dev/null +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImplTest.java @@ -0,0 +1,36 @@ +package nu.marginalia.crawl.retreival.fetcher; + +import org.junit.jupiter.api.Test; + +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +class HttpFetcherImplTest { + + @Test + public void testXRobotsTag() { + assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("foo:"), "search.marginalia.nu")); + assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of(":bar"), "search.marginalia.nu")); + assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of(":"), "search.marginalia.nu")); + assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of(""), "search.marginalia.nu")); + + assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("doindex"), "search.marginalia.nu")); + assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("noindex"), "search.marginalia.nu")); + assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("none"), "search.marginalia.nu")); + assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: noindex"), "search.marginalia.nu")); + assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none"), 
"search.marginalia.nu")); + assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("googlebot: noindex"), "search.marginalia.nu")); + + assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("noindex", "search.marginalia.nu: all"), "search.marginalia.nu")); + assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("none", "search.marginalia.nu: all"), "search.marginalia.nu")); + assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("none", "search.marginalia.nu: none"), "search.marginalia.nu")); + assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("all", "search.marginalia.nu: none"), "search.marginalia.nu")); + assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: all", "noindex"), "search.marginalia.nu")); + assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: all", "none"), "search.marginalia.nu")); + assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none", "none"), "search.marginalia.nu")); + assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none", "all"), "search.marginalia.nu")); + } + +} \ No newline at end of file diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java index f6c2f3a4..5893910f 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java @@ -2,6 +2,7 @@ package nu.marginalia.crawling; import lombok.SneakyThrows; import nu.marginalia.crawl.retreival.RateLimitException; +import nu.marginalia.crawl.retreival.fetcher.ContentTags; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; import nu.marginalia.crawl.retreival.logic.ContentTypeLogic; import nu.marginalia.model.EdgeUrl; @@ -29,14 +30,14 @@ class HttpFetcherTest { @Test 
void fetchUTF8() throws URISyntaxException, RateLimitException { var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler"); - var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu")); + var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), ContentTags.empty()); System.out.println(str.contentType); } @Test void fetchText() throws URISyntaxException, RateLimitException { var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler"); - var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt")); + var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), ContentTags.empty()); System.out.println(str); } } \ No newline at end of file diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/RssCrawlerTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/RssCrawlerTest.java index 86caa3c7..05de76dc 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/RssCrawlerTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/RssCrawlerTest.java @@ -59,7 +59,4 @@ class RssCrawlerTest { return urls; } - - - } \ No newline at end of file diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/WorkLogTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/WorkLogTest.java deleted file mode 100644 index 34046445..00000000 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/WorkLogTest.java +++ /dev/null @@ -1,56 +0,0 @@ -package nu.marginalia.crawling; - -import nu.marginalia.process.log.WorkLog; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; - -import static org.junit.jupiter.api.Assertions.assertFalse; -import static 
org.junit.jupiter.api.Assertions.assertTrue; - -class WorkLogTest { - Path outFile; - @BeforeEach - public void setUp() throws IOException { - outFile = Files.createTempFile(getClass().getSimpleName(), ".log"); - } - @AfterEach - public void tearDown() throws IOException { - Files.delete(outFile); - } - - @Test - public void testLog() throws IOException { - var log = new WorkLog(outFile); - log.setJobToFinished("A", "a.txt",1); - log.setJobToFinished("B", "b.txt",2); - log.setJobToFinished("C", "c.txt",3); - assertTrue(log.isJobFinished("A")); - assertTrue(log.isJobFinished("B")); - assertTrue(log.isJobFinished("C")); - assertFalse(log.isJobFinished("E")); - } - - @Test - public void testLogResume() throws Exception { - WorkLog log = new WorkLog(outFile); - log.setJobToFinished("A", "a.txt",1); - log.setJobToFinished("B", "b.txt",2); - log.setJobToFinished("C", "c.txt",3); - log.close(); - log = new WorkLog(outFile); - log.setJobToFinished("E", "e.txt",4); - assertTrue(log.isJobFinished("A")); - assertTrue(log.isJobFinished("B")); - assertTrue(log.isJobFinished("C")); - assertTrue(log.isJobFinished("E")); - log.close(); - - Files.readAllLines(outFile).forEach(System.out::println); - } - -} diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java index 7462b62c..fee1d44a 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java @@ -4,10 +4,7 @@ import crawlercommons.robots.SimpleRobotRules; import lombok.SneakyThrows; import nu.marginalia.bigstring.BigString; import nu.marginalia.crawl.retreival.CrawlerRetreiver; -import nu.marginalia.crawl.retreival.fetcher.FetchResult; -import 
nu.marginalia.crawl.retreival.fetcher.FetchResultState; -import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; -import nu.marginalia.crawl.retreival.fetcher.SitemapRetriever; +import nu.marginalia.crawl.retreival.fetcher.*; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawlerDocumentStatus; import nu.marginalia.crawling.model.SerializableCrawlData; @@ -33,7 +30,6 @@ public class CrawlerMockFetcherTest { Map mockData = new HashMap<>(); HttpFetcher fetcherMock = new MockFetcher(); - SitemapRetriever sitemapRetriever = new SitemapRetriever(); @AfterEach public void tearDown() { @@ -47,13 +43,12 @@ public class CrawlerMockFetcherTest { .contentType("text/html") .httpStatus(200) .crawlerStatus(CrawlerDocumentStatus.OK.name()) - .documentBody(BigString.encode(documentData)) + .documentBody(documentData) .build()); } @SneakyThrows private void registerUrlClasspathData(EdgeUrl url, String path) { - var data = BigString.encode(CommonTestData.loadTestData(path)); mockData.put(url, CrawledDocument.builder() .crawlId("1") @@ -61,7 +56,7 @@ public class CrawlerMockFetcherTest { .contentType("text/html") .httpStatus(200) .crawlerStatus(CrawlerDocumentStatus.OK.name()) - .documentBody(data) + .documentBody(CommonTestData.loadTestData(path)) .build()); } @@ -75,7 +70,6 @@ public class CrawlerMockFetcherTest { registerUrlClasspathData(new EdgeUrl("https://startrek.website/post/108995"), "mock-crawl-data/lemmy/108995.html"); new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 10, "startrek.website", new ArrayList<>()), out::add) - .withNoDelay() .fetch(); out.forEach(System.out::println); @@ -88,7 +82,6 @@ public class CrawlerMockFetcherTest { registerUrlClasspathData(new EdgeUrl("https://en.wikipedia.org/"), "mock-crawl-data/mediawiki/index.html"); new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 10, "en.wikipedia.org", new ArrayList<>()), out::add) - .withNoDelay() .fetch(); 
out.forEach(System.out::println); @@ -103,7 +96,6 @@ public class CrawlerMockFetcherTest { registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/combined-mode-but-grid/4489"), "mock-crawl-data/discourse/grid.html"); new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 100, "community.tt-rss.org", new ArrayList<>()), out::add) - .withNoDelay() .fetch(); out.forEach(System.out::println); @@ -127,7 +119,7 @@ public class CrawlerMockFetcherTest { } @Override - public CrawledDocument fetchContent(EdgeUrl url) { + public CrawledDocument fetchContent(EdgeUrl url, ContentTags tags) { logger.info("Fetching {}", url); if (mockData.containsKey(url)) { return mockData.get(url); diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java index 64c7e890..48aa39c9 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java @@ -2,16 +2,25 @@ package nu.marginalia.crawling.retreival; import lombok.SneakyThrows; import nu.marginalia.WmsaHome; +import nu.marginalia.crawl.retreival.CrawlDataReference; import nu.marginalia.crawl.retreival.CrawlerRetreiver; import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; +import nu.marginalia.crawling.io.CrawledDomainReader; +import nu.marginalia.crawling.io.CrawledDomainWriter; import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.crawling.model.spec.CrawlingSpecification; import nu.marginalia.crawling.model.SerializableCrawlData; import org.junit.jupiter.api.*; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; import 
java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.stream.Collectors; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -95,4 +104,45 @@ class CrawlerRetreiverTest { ); } + @Test + public void testRecrawl() throws IOException { + + var specs = CrawlingSpecification + .builder() + .id("123456") + .crawlDepth(12) + .domain("www.marginalia.nu") + .urls(List.of("https://www.marginalia.nu/some-dead-link")) + .build(); + + + Path out = Files.createTempDirectory("crawling-process"); + var writer = new CrawledDomainWriter(out, specs); + Map, List> data = new HashMap<>(); + + new CrawlerRetreiver(httpFetcher, specs, d -> { + data.computeIfAbsent(d.getClass(), k->new ArrayList<>()).add(d); + if (d instanceof CrawledDocument doc) { + System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus); + if (Math.random() > 0.5) { + doc.headers = ""; + } + } + writer.accept(d); + }).fetch(); + writer.close(); + + var reader = new CrawledDomainReader(); + var stream = reader.createDataStream(out, specs); + + CrawledDomain domain = (CrawledDomain) data.get(CrawledDomain.class).get(0); + domain.doc = data.get(CrawledDocument.class).stream().map(CrawledDocument.class::cast).collect(Collectors.toList()); + + new CrawlerRetreiver(httpFetcher, specs, d -> { + if (d instanceof CrawledDocument doc) { + System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus); + } + }).fetch(new CrawlDataReference(stream)); + + } } \ No newline at end of file diff --git a/code/processes/loading-process/build.gradle b/code/processes/loading-process/build.gradle index 47ec6f59..0a89c350 100644 --- a/code/processes/loading-process/build.gradle +++ b/code/processes/loading-process/build.gradle @@ -19,9 +19,11 @@ tasks.distZip.enabled = false dependencies { implementation project(':code:common:process') - + implementation project(':code:api:process-mqapi') implementation project(':code:api:index-api') 
implementation project(':code:common:model') + implementation project(':code:common:db') + implementation project(':code:common:message-queue') implementation project(':code:common:config') implementation project(':code:common:service') implementation project(':code:common:service-discovery') @@ -29,7 +31,7 @@ dependencies { implementation project(':code:features-index:lexicon') implementation project(':code:features-index:index-journal') implementation project(':code:libraries:language-processing') - + implementation project(':third-party:commons-codec') testImplementation project(':code:services-core:search-service') implementation project(':code:process-models:crawling-model') diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/ConvertedDomainReader.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/ConvertedDomainReader.java index 6b9dfbbd..86b9db1f 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/ConvertedDomainReader.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/ConvertedDomainReader.java @@ -1,56 +1,96 @@ package nu.marginalia.loading; +import com.github.luben.zstd.RecyclingBufferPool; import com.github.luben.zstd.ZstdInputStream; -import com.google.gson.Gson; -import com.google.gson.JsonParseException; +import lombok.SneakyThrows; import nu.marginalia.converting.instruction.Instruction; -import nu.marginalia.converting.instruction.InstructionTag; -import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import javax.inject.Inject; import java.io.*; +import java.lang.ref.Cleaner; import java.nio.file.Path; -import java.util.ArrayList; -import java.util.List; +import java.util.Iterator; +import java.util.concurrent.*; +import java.util.concurrent.atomic.AtomicBoolean; public class ConvertedDomainReader { + private final ExecutorService executorService = Executors.newSingleThreadExecutor(); private 
static final Logger logger = LoggerFactory.getLogger(ConvertedDomainReader.class); - private final Gson gson; - @Inject - public ConvertedDomainReader(Gson gson) { - this.gson = gson; + /** Creates a new iterator over Path. The implementation will try to read the file in a separate thread, and + * will block until the first instruction is available. Iterator$hasNext may block. + */ + public Iterator createIterator(Path path) { + return new PrefetchingInstructionIterator(path); } - public List read(Path path, int cntHint) throws IOException { - List ret = new ArrayList<>(cntHint); + class PrefetchingInstructionIterator implements Iterator { - try (var br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new BufferedInputStream(new FileInputStream(path.toFile())))))) { - String line; - for (;;) { - line = br.readLine(); + private final LinkedBlockingQueue queue = new LinkedBlockingQueue<>(16); + private final AtomicBoolean finished = new AtomicBoolean(false); - if (line == null) { - break; - } - if (line.isBlank()) { - continue; - } - var parts= line.split(" ", 2); - var type = InstructionTag.valueOf(parts[0]).clazz; + private Instruction next = null; - try { - ret.add(gson.fromJson(parts[1], type)); - } - catch (NullPointerException|JsonParseException ex) { - logger.warn("Failed to deserialize {} {}", type.getSimpleName(), StringUtils.abbreviate(parts[1], 255)); - logger.warn("Json error", ex); + public PrefetchingInstructionIterator(Path path) { + Future future = executorService.submit(() -> readerThread(path)); + + // Cancel the future if the iterator is garbage collected + // to reduce the risk of leaking resources; as the worker thread + // will spin forever on put if the queue is full. 
+ Cleaner.create().register(this, () -> { + future.cancel(true); + }); + } + + private Object readerThread(Path path) { + try (var or = new ObjectInputStream(new ZstdInputStream(new BufferedInputStream(new FileInputStream(path.toFile())), RecyclingBufferPool.INSTANCE))) { + for (; ; ) { + var nextObject = or.readObject(); + if (nextObject instanceof Instruction is) { + queue.put(is); + } else { + logger.warn("Spurious object in file: {}", nextObject.getClass().getSimpleName()); + } } + } catch (EOFException ex) { + // Expected + return null; + } catch (ClassNotFoundException | IOException | InterruptedException e) { + logger.warn("Error reading file " + path, e); + throw new RuntimeException(e); + } finally { + finished.set(true); } } - return ret; + @SneakyThrows + @Override + public boolean hasNext() { + if (next != null) + return true; + + // As long as the worker is still running, we'll do a blocking poll to wait for the next instruction + // (but we wake up every second to check if the worker is still running) + while (!finished.get()) { + if (null != (next = queue.poll(1, TimeUnit.SECONDS))) { + return true; + } + } + + // If the worker is not running, we just drain the queue without waiting + return null != (next = queue.poll()); + } + + @Override + public Instruction next() { + if (next != null || hasNext()) { + try { return next; } + finally { next = null; } + } + throw new IllegalStateException(); + } + } + } diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java index eb04a06b..ea643d71 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java @@ -1,167 +1,226 @@ package nu.marginalia.loading; +import com.google.common.collect.Sets; +import com.google.gson.Gson; import com.google.inject.Guice; import 
com.google.inject.Inject; import com.google.inject.Injector; -import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; -import nu.marginalia.process.log.WorkLog; -import plan.CrawlPlanLoader; -import plan.CrawlPlan; +import nu.marginalia.converting.instruction.Interpreter; +import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument; +import nu.marginalia.db.storage.FileStorageService; +import nu.marginalia.keyword.model.DocumentKeywords; import nu.marginalia.loading.loader.IndexLoadKeywords; -import nu.marginalia.loading.loader.Loader; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.idx.DocumentMetadata; +import nu.marginalia.mq.MessageQueueFactory; +import nu.marginalia.mq.MqMessage; +import nu.marginalia.mq.inbox.MqInboxResponse; +import nu.marginalia.mq.inbox.MqSingleShotInbox; +import nu.marginalia.process.control.ProcessHeartbeat; +import nu.marginalia.process.log.WorkLog; +import plan.CrawlPlan; import nu.marginalia.loading.loader.LoaderFactory; -import nu.marginalia.converting.instruction.Instruction; import nu.marginalia.service.module.DatabaseModule; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; import java.nio.file.Path; import java.sql.SQLException; -import java.util.List; -import java.util.concurrent.LinkedBlockingQueue; +import java.util.HashSet; +import java.util.Optional; +import java.util.Set; +import java.util.UUID; import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicInteger; + +import static nu.marginalia.mqapi.ProcessInboxNames.LOADER_INBOX; public class LoaderMain { private static final Logger logger = LoggerFactory.getLogger(LoaderMain.class); - private final CrawlPlan plan; private final ConvertedDomainReader instructionsReader; private final LoaderFactory loaderFactory; + private final ProcessHeartbeat heartbeat; + private final MessageQueueFactory messageQueueFactory; + private final FileStorageService fileStorageService; 
private final IndexLoadKeywords indexLoadKeywords; - private volatile boolean running = true; - - final Thread processorThread = new Thread(this::processor, "Processor Thread"); - - public static void main(String... args) throws IOException { - if (args.length != 1) { - System.err.println("Arguments: crawl-plan.yaml"); - System.exit(0); - } + private final Gson gson; + public static void main(String... args) throws Exception { new org.mariadb.jdbc.Driver(); - var plan = new CrawlPlanLoader().load(Path.of(args[0])); - Injector injector = Guice.createInjector( - new LoaderModule(plan), + new LoaderModule(), new DatabaseModule() ); var instance = injector.getInstance(LoaderMain.class); - instance.run(); + try { + var instructions = instance.fetchInstructions(); + logger.info("Instructions received"); + instance.run(instructions); + } + catch (Exception ex) { + logger.error("Error running loader", ex); + } } @Inject - public LoaderMain(CrawlPlan plan, - ConvertedDomainReader instructionsReader, - HikariDataSource dataSource, - LoaderFactory loaderFactory, IndexLoadKeywords indexLoadKeywords) { + public LoaderMain(ConvertedDomainReader instructionsReader, + LoaderFactory loaderFactory, + ProcessHeartbeat heartbeat, + MessageQueueFactory messageQueueFactory, + FileStorageService fileStorageService, + IndexLoadKeywords indexLoadKeywords, + Gson gson + ) { - this.plan = plan; this.instructionsReader = instructionsReader; this.loaderFactory = loaderFactory; + this.heartbeat = heartbeat; + this.messageQueueFactory = messageQueueFactory; + this.fileStorageService = fileStorageService; this.indexLoadKeywords = indexLoadKeywords; + this.gson = gson; - nukeTables(dataSource); - - Runtime.getRuntime().addShutdownHook(new Thread(this::shutDownIndex)); - processorThread.start(); - } - - private void nukeTables(HikariDataSource dataSource) { - try (var conn = dataSource.getConnection(); - var stmt = conn.createStatement()) { - stmt.execute("SET FOREIGN_KEY_CHECKS = 0"); - 
stmt.execute("TRUNCATE TABLE EC_PAGE_DATA"); - stmt.execute("TRUNCATE TABLE EC_URL"); - stmt.execute("TRUNCATE TABLE EC_DOMAIN_LINK"); - stmt.execute("TRUNCATE TABLE DOMAIN_METADATA"); - stmt.execute("SET FOREIGN_KEY_CHECKS = 1"); - } - catch (SQLException ex) { - throw new RuntimeException(ex); - } + heartbeat.start(); } @SneakyThrows - private void shutDownIndex() { - // This must run otherwise the journal doesn't get a proper header - indexLoadKeywords.close(); - } - - @SneakyThrows - public void run() { + public void run(LoadRequest instructions) { + var plan = instructions.getPlan(); var logFile = plan.process.getLogFile(); - AtomicInteger loadTotal = new AtomicInteger(); - WorkLog.readLog(logFile, entry -> { loadTotal.incrementAndGet(); }); - LoaderMain.loadTotal = loadTotal.get(); + TaskStats taskStats = new TaskStats(100); + try { + int loadTotal = 0; + int loaded = 0; - WorkLog.readLog(logFile, entry -> { - load(plan, entry.path(), entry.cnt()); - }); + for (var unused : WorkLog.iterable(logFile)) { + loadTotal++; + } - running = false; - processorThread.join(); + logger.info("Loading {} files", loadTotal); + for (var entry : WorkLog.iterable(logFile)) { + InstructionCounter instructionCounter = new InstructionCounter(); + + heartbeat.setProgress(loaded++ / (double) loadTotal); + long startTime = System.currentTimeMillis(); + + Path destDir = plan.getProcessedFilePath(entry.path()); + + try (var loader = loaderFactory.create(entry.cnt())) { + var instructionsIter = instructionsReader.createIterator(destDir); + + while (instructionsIter.hasNext()) { + var next = instructionsIter.next(); + try { + next.apply(instructionCounter); + next.apply(loader); + } catch (Exception ex) { + logger.error("Failed to load instruction " + next.getClass().getSimpleName(), ex); + } + } + } + + long endTime = System.currentTimeMillis(); + long loadTime = endTime - startTime; + taskStats.observe(endTime - startTime); + + logger.info("Loaded {}/{} : {} ({}) {}ms {} l/s", 
taskStats.getCount(), + loadTotal, destDir, instructionCounter.getCount(), loadTime, taskStats.avgTime()); + } + + instructions.ok(); + + // This needs to be done in order to have a readable index journal + indexLoadKeywords.close(); + logger.info("Loading finished"); + } + catch (Exception ex) { + ex.printStackTrace(); + logger.error("Failed to load", ex); + instructions.err(); + throw ex; + } + finally { + heartbeat.shutDown(); + } System.exit(0); } - private volatile static int loadTotal; + private static class LoadRequest { + private final CrawlPlan plan; + private final MqMessage message; + private final MqSingleShotInbox inbox; - private void load(CrawlPlan plan, String path, int cnt) { - Path destDir = plan.getProcessedFilePath(path); - try { - var loader = loaderFactory.create(cnt); - var instructions = instructionsReader.read(destDir, cnt); - processQueue.put(new LoadJob(path, loader, instructions)); - } catch (Exception e) { - logger.error("Failed to load " + destDir, e); + LoadRequest(CrawlPlan plan, MqMessage message, MqSingleShotInbox inbox) { + this.plan = plan; + this.message = message; + this.inbox = inbox; } + + public CrawlPlan getPlan() { + return plan; + } + + public void ok() { + inbox.sendResponse(message, MqInboxResponse.ok()); + } + public void err() { + inbox.sendResponse(message, MqInboxResponse.err()); + } + } - static final TaskStats taskStats = new TaskStats(100); + private LoadRequest fetchInstructions() throws Exception { - private record LoadJob(String path, Loader loader, List instructionList) { - public void run() { - long startTime = System.currentTimeMillis(); - for (var i : instructionList) { - try { - i.apply(loader); - } - catch (Exception ex) { - logger.error("Failed to load instruction {}", i); - } + var inbox = messageQueueFactory.createSingleShotInbox(LOADER_INBOX, UUID.randomUUID()); + + var msgOpt = getMessage(inbox, nu.marginalia.mqapi.loading.LoadRequest.class.getSimpleName()); + if (msgOpt.isEmpty()) + throw new 
RuntimeException("No instruction received in inbox"); + var msg = msgOpt.get(); + + if (!nu.marginalia.mqapi.loading.LoadRequest.class.getSimpleName().equals(msg.function())) { + throw new RuntimeException("Unexpected message in inbox: " + msg); + } + + var request = gson.fromJson(msg.payload(), nu.marginalia.mqapi.loading.LoadRequest.class); + + var processData = fileStorageService.getStorage(request.processedDataStorage); + + var plan = new CrawlPlan(null, null, new CrawlPlan.WorkDir(processData.path(), "processor.log")); + + return new LoadRequest(plan, msg, inbox); + } + + private Optional getMessage(MqSingleShotInbox inbox, String expectedFunction) throws SQLException, InterruptedException { + var opt = inbox.waitForMessage(30, TimeUnit.SECONDS); + if (opt.isPresent()) { + if (!opt.get().function().equals(expectedFunction)) { + throw new RuntimeException("Unexpected function: " + opt.get().function()); } - - loader.finish(); - long loadTime = System.currentTimeMillis() - startTime; - taskStats.observe(loadTime); - logger.info("Loaded {}/{} : {} ({}) {}ms {} l/s", taskStats.getCount(), - loadTotal, path, loader.data.sizeHint, loadTime, taskStats.avgTime()); + return opt; + } + else { + var stolenMessage = inbox.stealMessage(msg -> msg.function().equals(expectedFunction)); + stolenMessage.ifPresent(mqMessage -> logger.info("Stole message {}", mqMessage)); + return stolenMessage; } - } - private static final LinkedBlockingQueue processQueue = new LinkedBlockingQueue<>(2); + public class InstructionCounter implements Interpreter { + private int count = 0; - private void processor() { - try { - while (running || !processQueue.isEmpty()) { - LoadJob job = processQueue.poll(1, TimeUnit.SECONDS); - - if (job != null) { - job.run(); - } - } - } catch (InterruptedException e) { - throw new RuntimeException(e); + public void loadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) { + count++; } + public int getCount() { + return count; + } } - } diff 
--git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderModule.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderModule.java index fe8c022e..a2df0ea9 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderModule.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderModule.java @@ -4,6 +4,7 @@ import com.google.gson.Gson; import com.google.inject.AbstractModule; import com.google.inject.name.Names; import nu.marginalia.LanguageModels; +import nu.marginalia.ProcessConfiguration; import nu.marginalia.WmsaHome; import plan.CrawlPlan; import nu.marginalia.model.gson.GsonFactory; @@ -11,21 +12,19 @@ import nu.marginalia.service.SearchServiceDescriptors; import nu.marginalia.service.descriptor.ServiceDescriptors; import java.nio.file.Path; +import java.util.UUID; public class LoaderModule extends AbstractModule { - private final CrawlPlan plan; - public LoaderModule(CrawlPlan plan) { - this.plan = plan; + public LoaderModule() { } public void configure() { - bind(CrawlPlan.class).toInstance(plan); - bind(ServiceDescriptors.class).toInstance(SearchServiceDescriptors.descriptors); + bind(ProcessConfiguration.class).toInstance(new ProcessConfiguration("loader", 0, UUID.randomUUID())); - bind(Gson.class).toInstance(createGson()); + bind(Gson.class).toProvider(this::createGson); bind(Path.class).annotatedWith(Names.named("local-index-path")).toInstance(Path.of(System.getProperty("local-index-path", "/vol"))); bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels()); diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/IndexLoadKeywords.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/IndexLoadKeywords.java index dd627f85..7374c0a3 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/IndexLoadKeywords.java +++ 
b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/IndexLoadKeywords.java @@ -16,7 +16,7 @@ public class IndexLoadKeywords implements Runnable { private static final Logger logger = LoggerFactory.getLogger(IndexLoadKeywords.class); private final LinkedBlockingQueue insertQueue = new LinkedBlockingQueue<>(32); - private final LoaderIndexJournalWriter client; + private final LoaderIndexJournalWriter journalWriter; private record InsertTask(int urlId, int domainId, DocumentMetadata metadata, DocumentKeywords wordSet) {} @@ -25,8 +25,8 @@ public class IndexLoadKeywords implements Runnable { private volatile boolean canceled = false; @Inject - public IndexLoadKeywords(LoaderIndexJournalWriter client) { - this.client = client; + public IndexLoadKeywords(LoaderIndexJournalWriter journalWriter) { + this.journalWriter = journalWriter; runThread = new Thread(this, getClass().getSimpleName()); runThread.start(); } @@ -36,7 +36,7 @@ public class IndexLoadKeywords implements Runnable { while (!canceled) { var data = insertQueue.poll(1, TimeUnit.SECONDS); if (data != null) { - client.putWords(new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), data.metadata(), data.wordSet); + journalWriter.putWords(new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), data.metadata(), data.wordSet); } } } @@ -45,7 +45,7 @@ public class IndexLoadKeywords implements Runnable { if (!canceled) { canceled = true; runThread.join(); - client.close(); + journalWriter.close(); } } diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java index 66eea626..d6f97076 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java @@ -15,7 +15,7 @@ import org.slf4j.LoggerFactory; import java.util.ArrayList; import java.util.List; 
-public class Loader implements Interpreter { +public class Loader implements Interpreter, AutoCloseable { private final SqlLoadUrls sqlLoadUrls; private final SqlLoadDomains sqlLoadDomains; private final SqlLoadDomainLinks sqlLoadDomainLinks; @@ -30,8 +30,6 @@ public class Loader implements Interpreter { private final List processedDocumentList; private final List processedDocumentWithErrorList; - private final List deferredDomains = new ArrayList<>(); - private final List deferredUrls = new ArrayList<>(); public final LoaderData data; @@ -86,40 +84,26 @@ public class Loader implements Interpreter { @Override public void loadProcessedDocument(LoadProcessedDocument document) { - deferralCheck(document.url()); - processedDocumentList.add(document); + + if (processedDocumentList.size() > 100) { + sqlLoadProcessedDocument.load(data, processedDocumentList); + processedDocumentList.clear(); + } } @Override public void loadProcessedDocumentWithError(LoadProcessedDocumentWithError document) { - deferralCheck(document.url()); - processedDocumentWithErrorList.add(document); - } - private void deferralCheck(EdgeUrl url) { - if (data.getDomainId(url.domain) <= 0) - deferredDomains.add(url.domain); - - if (data.getUrlId(url) <= 0) - deferredUrls.add(url); + if (processedDocumentWithErrorList.size() > 100) { + sqlLoadProcessedDocument.loadWithError(data, processedDocumentWithErrorList); + processedDocumentWithErrorList.clear(); + } } @Override public void loadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) { - // This is a bit of a bandaid safeguard against a bug in - // in the converter, shouldn't be necessary in the future - if (!deferredDomains.isEmpty()) { - loadDomain(deferredDomains.toArray(EdgeDomain[]::new)); - deferredDomains.clear(); - } - - if (!deferredUrls.isEmpty()) { - loadUrl(deferredUrls.toArray(EdgeUrl[]::new)); - deferredUrls.clear(); - } - try { indexLoadKeywords.load(data, url, metadata, words); } catch (InterruptedException e) { @@ 
-137,19 +121,13 @@ public class Loader implements Interpreter { sqlLoadDomainMetadata.load(data, domain, knownUrls, goodUrls, visitedUrls); } - public void finish() { - // Some work needs to be processed out of order for the database relations to work out - - sqlLoadProcessedDocument.load(data, processedDocumentList); - sqlLoadProcessedDocument.loadWithError(data, processedDocumentWithErrorList); - } - public void close() { - try { - indexLoadKeywords.close(); + if (processedDocumentList.size() > 0) { + sqlLoadProcessedDocument.load(data, processedDocumentList); } - catch (Exception ex) { - logger.error("Error when closing the index loader", ex); + if (processedDocumentWithErrorList.size() > 0) { + sqlLoadProcessedDocument.loadWithError(data, processedDocumentWithErrorList); } } + } diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java index 49cbd402..073b5c94 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java @@ -2,7 +2,9 @@ package nu.marginalia.loading.loader; import com.google.inject.Inject; import com.google.inject.Singleton; -import com.google.inject.name.Named; +import lombok.SneakyThrows; +import nu.marginalia.db.storage.FileStorageService; +import nu.marginalia.db.storage.model.FileStorageType; import nu.marginalia.dict.OffHeapDictionaryHashMap; import nu.marginalia.index.journal.model.IndexJournalEntryData; import nu.marginalia.index.journal.model.IndexJournalEntryHeader; @@ -11,6 +13,7 @@ import nu.marginalia.index.journal.writer.IndexJournalWriter; import nu.marginalia.keyword.model.DocumentKeywords; import nu.marginalia.lexicon.KeywordLexicon; import nu.marginalia.lexicon.journal.KeywordLexiconJournal; +import 
nu.marginalia.lexicon.journal.KeywordLexiconJournalMode; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; @@ -19,8 +22,11 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; -import java.nio.file.Path; +import java.nio.file.Files; +import java.nio.file.attribute.PosixFilePermissions; +import java.sql.SQLException; import java.util.Arrays; +import java.util.concurrent.*; @Singleton public class LoaderIndexJournalWriter { @@ -30,34 +36,70 @@ public class LoaderIndexJournalWriter { private static final Logger logger = LoggerFactory.getLogger(LoaderIndexJournalWriter.class); @Inject - public LoaderIndexJournalWriter(@Named("local-index-path") Path path) throws IOException { + public LoaderIndexJournalWriter(FileStorageService fileStorageService) throws IOException, SQLException { + var lexiconArea = fileStorageService.getStorageByType(FileStorageType.LEXICON_STAGING); + var indexArea = fileStorageService.getStorageByType(FileStorageType.INDEX_STAGING); - var lexiconJournal = new KeywordLexiconJournal(path.resolve("dictionary.dat").toFile()); - lexicon = new KeywordLexicon(lexiconJournal); - indexWriter = new IndexJournalWriterImpl(lexicon, path.resolve("index.dat")); + var lexiconPath = lexiconArea.asPath().resolve("dictionary.dat"); + var indexPath = indexArea.asPath().resolve("page-index.dat"); + + Files.deleteIfExists(indexPath); + Files.deleteIfExists(lexiconPath); + + Files.createFile(indexPath, PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--"))); + Files.createFile(lexiconPath, PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--"))); + + lexicon = new KeywordLexicon(new KeywordLexiconJournal(lexiconPath.toFile(), KeywordLexiconJournalMode.READ_WRITE)); + indexWriter = new IndexJournalWriterImpl(lexicon, indexPath); } + private final LinkedBlockingQueue keywordInsertTaskQueue = + new 
LinkedBlockingQueue<>(65536); + private final ExecutorService keywordInsertionExecutor = + new ThreadPoolExecutor(8, 16, 1, TimeUnit.MINUTES, keywordInsertTaskQueue); + + @SneakyThrows public void putWords(EdgeId domain, EdgeId url, DocumentMetadata metadata, DocumentKeywords wordSet) { - if (wordSet.keywords().length == 0) + if (wordSet.keywords().length == 0) { + logger.info("Skipping zero-length word set for {}:{}", domain, url); return; + } if (domain.id() <= 0 || url.id() <= 0) { logger.warn("Bad ID: {}:{}", domain, url); return; } + // Due to the very bursty access patterns of this method, doing the actual insertions in separate threads + // with a chonky work queue is a fairly decent improvement for (var chunk : KeywordListChunker.chopList(wordSet, IndexJournalEntryData.MAX_LENGTH)) { - - var entry = new IndexJournalEntryData(getOrInsertWordIds(chunk.keywords(), chunk.metadata())); - var header = new IndexJournalEntryHeader(domain, url, metadata.encode()); - - indexWriter.put(header, entry); + try { + keywordInsertionExecutor.submit(() -> loadWords(domain, url, metadata, chunk)); + } + catch (RejectedExecutionException ex) { + loadWords(domain, url, metadata, chunk); + } } } + private void loadWords(EdgeId domain, + EdgeId url, + DocumentMetadata metadata, + DocumentKeywords wordSet) { + if (null == metadata) { + logger.warn("Null metadata for {}:{}", domain, url); + return; + } + + var entry = new IndexJournalEntryData(getOrInsertWordIds(wordSet.keywords(), wordSet.metadata())); + var header = new IndexJournalEntryHeader(domain, url, metadata.encode()); + + indexWriter.put(header, entry); + } + private long[] getOrInsertWordIds(String[] words, long[] meta) { long[] ids = new long[words.length*2]; int putIdx = 0; @@ -79,6 +121,10 @@ public class LoaderIndexJournalWriter { } public void close() throws Exception { + keywordInsertionExecutor.shutdown(); + while (!keywordInsertionExecutor.awaitTermination(1, TimeUnit.DAYS)) { + // ...? 
+ } indexWriter.close(); lexicon.close(); } diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDocument.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDocument.java index 2a875b58..909ec986 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDocument.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDocument.java @@ -4,6 +4,7 @@ import com.google.inject.Inject; import com.zaxxer.hikari.HikariDataSource; import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument; import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError; +import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -63,61 +64,66 @@ public class SqlLoadProcessedDocument { public void load(LoaderData data, List documents) { - try (var conn = dataSource.getConnection(); - var stmt = conn.prepareCall("CALL INSERT_PAGE_VISIT(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")) { - conn.setAutoCommit(false); + try (var conn = dataSource.getConnection()) { + try (var insertCall = conn.prepareCall("CALL INSERT_PAGE_VISIT(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)") + ) { + conn.setAutoCommit(false); - int cnt = 0; int batchOffset = 0; - for (var doc : documents) { - int urlId = data.getUrlId(doc.url()); - if (urlId <= 0) { - logger.warn("Failed to resolve ID for URL {}", doc.url()); - return; - } + int cnt = 0; + int batchOffset = 0; + for (var doc : documents) { + int urlId = data.getUrlId(doc.url()); + if (urlId <= 0) { + logger.warn("Failed to resolve ID for URL {}", doc.url()); + continue; + } - stmt.setInt(1, urlId); - stmt.setString(2, doc.state().name()); - stmt.setString(3, doc.title()); - stmt.setString(4, doc.description()); - stmt.setInt(5, doc.length()); - stmt.setInt(6, doc.htmlFeatures()); - stmt.setString(7, doc.standard()); - 
stmt.setDouble(8, doc.quality()); - stmt.setLong(9, doc.hash()); - if (doc.pubYear() != null) { - stmt.setShort(10, (short) doc.pubYear().intValue()); - } - else { - stmt.setInt(10, Types.SMALLINT); - } - stmt.addBatch(); + insertCall.setInt(1, urlId); + insertCall.setString(2, doc.state().name()); + insertCall.setString(3, doc.title()); + insertCall.setString(4, StringUtils.truncate(doc.description(), 255)); + insertCall.setInt(5, doc.length()); + insertCall.setInt(6, doc.htmlFeatures()); + insertCall.setString(7, doc.standard()); + insertCall.setDouble(8, doc.quality()); + insertCall.setLong(9, doc.hash()); + if (doc.pubYear() != null) { + insertCall.setShort(10, (short) doc.pubYear().intValue()); + } else { + insertCall.setInt(10, Types.SMALLINT); + } + insertCall.addBatch(); - if (++cnt == 100) { - var ret = stmt.executeBatch(); + if (++cnt == 100) { + var ret = insertCall.executeBatch(); + conn.commit(); + + for (int rv = 0; rv < cnt; rv++) { + if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) { + logger.warn("load({}) -- bad row count {}", documents.get(batchOffset + rv), ret[rv]); + } + } + + cnt = 0; + batchOffset += 100; + } + } + if (cnt > 0) { + var ret = insertCall.executeBatch(); conn.commit(); - for (int rv = 0; rv < cnt; rv++) { if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) { logger.warn("load({}) -- bad row count {}", documents.get(batchOffset + rv), ret[rv]); } } - - cnt = 0; - batchOffset += 100; } - } - if (cnt > 0) { - var ret = stmt.executeBatch(); - conn.commit(); - for (int rv = 0; rv < cnt; rv++) { - if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) { - logger.warn("load({}) -- bad row count {}", documents.get(batchOffset + rv), ret[rv]); - } - } - } - - conn.setAutoCommit(true); + conn.setAutoCommit(true); + } + catch (SQLException ex) { + conn.rollback(); + throw ex; + } } catch (SQLException ex) { logger.warn("SQL error inserting document", ex); diff --git 
a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDomain.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDomain.java index 1e1998c7..9ac576af 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDomain.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDomain.java @@ -14,17 +14,23 @@ import java.sql.SQLException; public class SqlLoadProcessedDomain { private final HikariDataSource dataSource; private final SqlLoadDomains loadDomains; + private final SqlLoadUrls loadUrls; private static final Logger logger = LoggerFactory.getLogger(SqlLoadProcessedDomain.class); @Inject - public SqlLoadProcessedDomain(HikariDataSource dataSource, SqlLoadDomains loadDomains) { + public SqlLoadProcessedDomain(HikariDataSource dataSource, SqlLoadDomains loadDomains, SqlLoadUrls loadUrls) { this.dataSource = dataSource; this.loadDomains = loadDomains; + this.loadUrls = loadUrls; try (var conn = dataSource.getConnection()) { try (var stmt = conn.createStatement()) { stmt.execute("DROP PROCEDURE IF EXISTS INITIALIZE_DOMAIN"); + + // Note that there should be no need to delete from EC_PAGE_DATA here as it's done via their + // CASCADE DELETE constraint on EC_URL. 
+ stmt.execute(""" CREATE PROCEDURE INITIALIZE_DOMAIN ( IN ST ENUM('ACTIVE', 'EXHAUSTED', 'SPECIAL', 'SOCIAL_MEDIA', 'BLOCKED', 'REDIR', 'ERROR', 'UNKNOWN'), @@ -32,6 +38,9 @@ public class SqlLoadProcessedDomain { IN DID INT, IN IP VARCHAR(48)) BEGIN + DELETE FROM DOMAIN_METADATA WHERE ID=DID; + DELETE FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=DID; + DELETE FROM EC_URL WHERE DOMAIN_ID=DID; UPDATE EC_DOMAIN SET INDEX_DATE=NOW(), STATE=ST, DOMAIN_ALIAS=NULL, INDEXED=GREATEST(INDEXED,IDX), IP=IP WHERE ID=DID; DELETE FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=DID; END @@ -44,21 +53,28 @@ public class SqlLoadProcessedDomain { } public void load(LoaderData data, EdgeDomain domain, DomainIndexingState state, String ip) { + data.setTargetDomain(domain); loadDomains.load(data, domain); - try (var conn = dataSource.getConnection(); - var initCall = conn.prepareCall("CALL INITIALIZE_DOMAIN(?,?,?,?)")) - { - initCall.setString(1, state.name()); - initCall.setInt(2, 1 + data.sizeHint / 100); - initCall.setInt(3, data.getDomainId(domain)); - initCall.setString(4, StringUtils.truncate(ip, 48)); - int rc = initCall.executeUpdate(); - conn.commit(); - if (rc < 1) { - logger.warn("load({},{}) -- bad rowcount {}", domain, state, rc); + try (var conn = dataSource.getConnection()) { + try (var initCall = conn.prepareCall("CALL INITIALIZE_DOMAIN(?,?,?,?)")) { + initCall.setString(1, state.name()); + initCall.setInt(2, 1 + data.sizeHint / 100); + initCall.setInt(3, data.getDomainId(domain)); + initCall.setString(4, StringUtils.truncate(ip, 48)); + int rc = initCall.executeUpdate(); + conn.commit(); + if (rc < 1) { + logger.warn("load({},{}) -- bad rowcount {}", domain, state, rc); + } + + loadUrls.loadUrlsForDomain(data, domain, 0); + } + catch (SQLException ex) { + conn.rollback(); + throw ex; } } catch (SQLException ex) { diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadUrls.java 
b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadUrls.java index 18bd32b1..4ef1509e 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadUrls.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadUrls.java @@ -1,15 +1,13 @@ package nu.marginalia.loading.loader; -import com.google.common.hash.HashFunction; -import com.google.common.hash.Hashing; import com.google.inject.Inject; import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.hash.MurmurHash3_128; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.nio.charset.StandardCharsets; import java.sql.SQLException; import java.sql.Types; import java.util.HashSet; @@ -26,81 +24,87 @@ public class SqlLoadUrls { public SqlLoadUrls(HikariDataSource dataSource) { this.dataSource = dataSource; } + private final MurmurHash3_128 murmurHash = new MurmurHash3_128(); public void load(LoaderData data, EdgeUrl[] urls) { Set affectedDomains = new HashSet<>(); - try (var conn = dataSource.getConnection(); - var insertCall = conn.prepareStatement("INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PARAM,PATH_HASH) VALUES (?,?,?,?,?,?)"); - var queryCall = conn.prepareStatement("SELECT ID, PROTO, PATH, PARAM FROM EC_URL WHERE DOMAIN_ID=?") - ) - { - conn.setAutoCommit(false); + if (urls.length == 0) + return; - int cnt = 0; int batchOffset = 0; - for (var url : urls) { - if (url.path.length() >= 255) { - logger.debug("Skipping bad URL {}", url); - continue; + int maxOldId = 0; + try (var conn = dataSource.getConnection()) { + + try (var insertStmt = conn.prepareStatement("INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PARAM,PATH_HASH) VALUES (?,?,?,?,?,?)"); + var queryMaxId = conn.prepareStatement("SELECT MAX(ID) FROM EC_URL")) { + + conn.setAutoCommit(false); + + var rs = queryMaxId.executeQuery(); + if 
(rs.next()) { + maxOldId = rs.getInt(1); } - affectedDomains.add(url.domain); - insertCall.setString(1, url.proto); - insertCall.setInt(2, data.getDomainId(url.domain)); - if (url.port != null) { - insertCall.setInt(3, url.port); + int cnt = 0; + int batchOffset = 0; + + for (var url : urls) { + if (data.getUrlId(url) != 0) + continue; + + if (url.path.length() >= 255) { + logger.info("Skipping bad URL {}", url); + continue; + } + var domainId = data.getDomainId(url.domain); + + affectedDomains.add(url.domain); + + insertStmt.setString(1, url.proto); + insertStmt.setInt(2, domainId); + if (url.port != null) { + insertStmt.setInt(3, url.port); + } else { + insertStmt.setNull(3, Types.INTEGER); + } + insertStmt.setString(4, url.path); + insertStmt.setString(5, url.param); + insertStmt.setLong(6, hashPath(url.path, url.param)); + insertStmt.addBatch(); + + if (++cnt == 1000) { + var ret = insertStmt.executeBatch(); + for (int rv = 0; rv < cnt; rv++) { + if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) { + logger.warn("load({}) -- bad row count {}", urls[batchOffset + rv], ret[rv]); + } + } + + batchOffset += cnt; + cnt = 0; + } } - else { - insertCall.setNull(3, Types.INTEGER); - } - insertCall.setString(4, url.path); - insertCall.setString(5, url.param); - insertCall.setLong(6, hashPath(url.path, url.param)); - insertCall.addBatch(); - - if (cnt++ == 1000) { - var ret = insertCall.executeBatch(); - conn.commit(); + if (cnt > 0) { + var ret = insertStmt.executeBatch(); for (int rv = 0; rv < cnt; rv++) { if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) { logger.warn("load({}) -- bad row count {}", urls[batchOffset + rv], ret[rv]); } } - - batchOffset += cnt; - cnt = 0; } - } - if (cnt > 0) { - var ret = insertCall.executeBatch(); + conn.commit(); + conn.setAutoCommit(true); - for (int rv = 0; rv < cnt; rv++) { - if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) { - logger.warn("load({}) -- bad row count {}", urls[batchOffset + rv], ret[rv]); - } + for (var domain : 
affectedDomains) { + loadUrlsForDomain(data, domain, maxOldId); } } - - conn.setAutoCommit(true); - - - for (var domain : affectedDomains) { - queryCall.setInt(1, data.getDomainId(domain)); - var rsp = queryCall.executeQuery(); - rsp.setFetchSize(1000); - - while (rsp.next()) { - int urlId = rsp.getInt(1); - String proto = rsp.getString(2); - String path = rsp.getString(3); - String param = rsp.getString(4); - - data.addUrl(new EdgeUrl(proto, domain, null, path, param), urlId); - } + catch (SQLException ex) { + conn.rollback(); + throw ex; } - } catch (SQLException ex) { logger.warn("SQL error inserting URLs", ex); @@ -110,15 +114,38 @@ public class SqlLoadUrls { } } - private static final HashFunction murmur3_128 = Hashing.murmur3_128(); + /* We use a uniqueness constraint on DOMAIN_ID and this hash instead of on the PATH and PARAM + * fields as the uniqueness index grows absurdly large for some reason, possibly due to the prevalent + * shared leading substrings in paths? + */ private long hashPath(String path, String queryParam) { - long pathHash = murmur3_128.hashString(path, StandardCharsets.UTF_8).padToLong(); + long hash = murmurHash.hashNearlyASCII(path); + if (queryParam != null) { + hash ^= murmurHash.hashNearlyASCII(queryParam); + } + return hash; + } - if (queryParam == null) { - return pathHash; - } - else { - return pathHash + murmur3_128.hashString(queryParam, StandardCharsets.UTF_8).padToLong(); + /** Loads urlIDs for the domain into `data` from the database, starting at URL ID minId. */ + public void loadUrlsForDomain(LoaderData data, EdgeDomain domain, int minId) throws SQLException { + try (var conn = dataSource.getConnection(); + var queryCall = conn.prepareStatement("SELECT ID, PROTO, PATH, PARAM FROM EC_URL WHERE DOMAIN_ID=? 
AND ID > ?")) { + + queryCall.setFetchSize(1000); + queryCall.setInt(1, data.getDomainId(domain)); + queryCall.setInt(2, minId); + + var rsp = queryCall.executeQuery(); + + while (rsp.next()) { + int urlId = rsp.getInt(1); + String proto = rsp.getString(2); + String path = rsp.getString(3); + String param = rsp.getString(4); + + data.addUrl(new EdgeUrl(proto, domain, null, path, param), urlId); + } } + } } diff --git a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadDomainLinksTest.java b/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadDomainLinksTest.java index f80a54dc..a8d85699 100644 --- a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadDomainLinksTest.java +++ b/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadDomainLinksTest.java @@ -22,7 +22,7 @@ class SqlLoadDomainLinksTest { .withDatabaseName("WMSA_prod") .withUsername("wmsa") .withPassword("wmsa") - .withInitScript("sql/current/00-base.sql") + .withInitScript("db/migration/V23_06_0_000__base.sql") .withNetworkAliases("mariadb"); HikariDataSource dataSource; diff --git a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadDomainsTest.java b/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadDomainsTest.java index 21fc2902..16d52d33 100644 --- a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadDomainsTest.java +++ b/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadDomainsTest.java @@ -19,7 +19,7 @@ class SqlLoadDomainsTest { .withDatabaseName("WMSA_prod") .withUsername("wmsa") .withPassword("wmsa") - .withInitScript("sql/current/00-base.sql") + .withInitScript("db/migration/V23_06_0_000__base.sql") .withNetworkAliases("mariadb"); @Test diff --git a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDocumentTest.java 
b/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDocumentTest.java index 0abea35c..e9dd92b6 100644 --- a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDocumentTest.java +++ b/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDocumentTest.java @@ -33,7 +33,7 @@ class SqlLoadProcessedDocumentTest { .withDatabaseName("WMSA_prod") .withUsername("wmsa") .withPassword("wmsa") - .withInitScript("sql/current/00-base.sql") + .withInitScript("db/migration/V23_06_0_000__base.sql") .withNetworkAliases("mariadb"); HikariDataSource dataSource; diff --git a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDomainTest.java b/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDomainTest.java index df61cf50..0ef662eb 100644 --- a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDomainTest.java +++ b/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDomainTest.java @@ -5,6 +5,7 @@ import nu.marginalia.loading.loader.LoaderData; import nu.marginalia.loading.loader.SqlLoadDomains; import nu.marginalia.loading.loader.SqlLoadProcessedDomain; import nu.marginalia.converting.instruction.instructions.DomainLink; +import nu.marginalia.loading.loader.SqlLoadUrls; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.crawl.DomainIndexingState; import org.junit.jupiter.api.AfterEach; @@ -26,7 +27,7 @@ class SqlLoadProcessedDomainTest { .withDatabaseName("WMSA_prod") .withUsername("wmsa") .withPassword("wmsa") - .withInitScript("sql/current/00-base.sql") + .withInitScript("db/migration/V23_06_0_000__base.sql") .withNetworkAliases("mariadb"); HikariDataSource dataSource; @@ -50,13 +51,18 @@ class SqlLoadProcessedDomainTest { @Test public void loadProcessedDomain() { - var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource)); + var 
loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource), new SqlLoadUrls(dataSource)); + loader.load(loaderData, new EdgeDomain("www.marginalia.nu"), DomainIndexingState.BLOCKED, "127.0.0.1"); + } + @Test + public void loadProcessedDomainTwice() { + var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource), new SqlLoadUrls(dataSource)); loader.load(loaderData, new EdgeDomain("www.marginalia.nu"), DomainIndexingState.BLOCKED, "127.0.0.1"); } @Test public void loadProcessedDomaiWithExtremelyLongIP() { - var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource)); + var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource), new SqlLoadUrls(dataSource)); String ip = Stream.generate(() -> "127.").limit(1024).collect(Collectors.joining()); @@ -65,7 +71,7 @@ class SqlLoadProcessedDomainTest { @Test public void loadDomainAlias() { - var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource)); + var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource), new SqlLoadUrls(dataSource)); loader.loadAlias(loaderData, new DomainLink(new EdgeDomain("memex.marginalia.nu"), new EdgeDomain("www.marginalia.nu"))); } } \ No newline at end of file diff --git a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadUrlsTest.java b/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadUrlsTest.java index cc5c1381..7fece308 100644 --- a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadUrlsTest.java +++ b/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadUrlsTest.java @@ -24,7 +24,7 @@ class SqlLoadUrlsTest { .withDatabaseName("WMSA_prod") .withUsername("wmsa") .withPassword("wmsa") - .withInitScript("sql/current/00-base.sql") + .withInitScript("db/migration/V23_06_0_000__base.sql") .withNetworkAliases("mariadb"); HikariDataSource dataSource; diff --git 
a/code/readme.md b/code/readme.md index 1de3e46c..3aca8a37 100644 --- a/code/readme.md +++ b/code/readme.md @@ -14,10 +14,11 @@ A map of the most important components and how they relate can be found below. ### Services * [core services](services-core/) "macroservices", stateful, memory hungry doing heavy lifting. +* * [control-service](services-core/control-service) * * [search](services-core/search-service) * * [index](services-core/index-service) * * [assistant](services-core/assistant-service) -* [sattelite services](services-satellite/) "microservices", stateless providing additional functionality. +* [satellite services](services-satellite/) "microservices", stateless providing additional functionality. * * [api](services-satellite/api-service) - public API * * [dating](services-satellite/dating-service) - [https://explore.marginalia.nu/](https://explore.marginalia.nu/) * * [explorer](services-satellite/explorer-service) - [https://explore2.marginalia.nu/](https://explore2.marginalia.nu/) diff --git a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/AssistantService.java b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/AssistantService.java index c0d908fd..3992986b 100644 --- a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/AssistantService.java +++ b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/AssistantService.java @@ -2,7 +2,6 @@ package nu.marginalia.assistant; import com.google.gson.Gson; import com.google.inject.Inject; -import com.google.inject.name.Named; import lombok.SneakyThrows; import nu.marginalia.assistant.eval.Units; import nu.marginalia.assistant.suggest.Suggestions; @@ -10,9 +9,7 @@ import nu.marginalia.assistant.eval.MathParser; import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.screenshot.ScreenshotService; import nu.marginalia.assistant.dict.DictionaryService; -import nu.marginalia.service.server.Initialization; 
-import nu.marginalia.service.server.MetricsServer; -import nu.marginalia.service.server.Service; +import nu.marginalia.service.server.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import spark.Request; @@ -28,18 +25,15 @@ public class AssistantService extends Service { @SneakyThrows @Inject - public AssistantService(@Named("service-host") String ip, - @Named("service-port") Integer port, - Initialization initialization, - MetricsServer metricsServer, + public AssistantService(BaseServiceParams params, DictionaryService dictionaryService, MathParser mathParser, Units units, ScreenshotService screenshotService, - Suggestions suggestions - ) + Suggestions suggestions) { - super(ip, port, initialization, metricsServer); + super(params); + this.mathParser = mathParser; this.units = units; this.suggestions = suggestions; diff --git a/code/services-core/control-service/build.gradle b/code/services-core/control-service/build.gradle new file mode 100644 index 00000000..90b832da --- /dev/null +++ b/code/services-core/control-service/build.gradle @@ -0,0 +1,71 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "5.3.3.3" + id 'application' + id 'com.palantir.docker' version '0.34.0' + id 'jvm-test-suite' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +application { + mainClass = 'nu.marginalia.control.ControlMain' + applicationName = 'control-service' +} + +tasks.distZip.enabled = false + +apply from: "$rootProject.projectDir/docker-service.gradle" + +dependencies { + implementation libs.bundles.gson + + implementation project(':code:common:db') + implementation project(':code:common:model') + implementation project(':code:common:service') + implementation project(':code:common:config') + implementation project(':code:common:renderer') + implementation project(':code:common:message-queue') + implementation project(':code:common:service-discovery') + implementation project(':code:common:service-client') + 
implementation project(':code:api:search-api') + implementation project(':code:api:index-api') + implementation project(':code:api:process-mqapi') + + + implementation libs.lombok + annotationProcessor libs.lombok + implementation libs.bundles.slf4j + + implementation libs.prometheus + implementation libs.notnull + implementation libs.guice + + implementation libs.trove + implementation libs.spark + implementation libs.fastutil + implementation libs.commons.io + implementation libs.bundles.mariadb + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito + testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4') + testImplementation 'org.testcontainers:mariadb:1.17.4' + testImplementation 'org.testcontainers:junit-jupiter:1.17.4' +} + +test { + useJUnitPlatform() +} + +task fastTests(type: Test) { + useJUnitPlatform { + excludeTags "slow" + } +} + diff --git a/code/services-core/control-service/readme.md b/code/services-core/control-service/readme.md new file mode 100644 index 00000000..82b08093 --- /dev/null +++ b/code/services-core/control-service/readme.md @@ -0,0 +1,18 @@ +# Control Service + +The control service provides an operator's user interface, and is responsible for orchestrating the various +processes of the system using Actors. + +Actors within the control service will spawn processes when necessary, by +monitoring their message queue inboxes. 
+ +## Central Classes + +* [ControlService](src/main/java/nu/marginalia/control/ControlService.java) +* [ControlActors](src/main/java/nu/marginalia/control/actor/ControlActors.java) - Class responsible for Actors' lifecycle +* [ProcessService](src/main/java/nu/marginalia/control/process/ProcessService.java) - Class responsible for spawning Processes + +## See Also + +* [processes](../../processes) +* [common/message-queue](../../common/message-queue) - The Message Queue and MQFSM abstractions \ No newline at end of file diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlMain.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlMain.java new file mode 100644 index 00000000..52307353 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlMain.java @@ -0,0 +1,30 @@ +package nu.marginalia.control; + +import com.google.inject.Guice; +import com.google.inject.Inject; +import com.google.inject.Injector; +import nu.marginalia.service.MainClass; +import nu.marginalia.service.SearchServiceDescriptors; +import nu.marginalia.service.id.ServiceId; +import nu.marginalia.service.module.ConfigurationModule; +import nu.marginalia.service.module.DatabaseModule; +import nu.marginalia.service.server.Initialization; + +public class ControlMain extends MainClass { + + @Inject + public ControlMain(ControlService service) { + } + + public static void main(String... 
args) { + init(ServiceId.Control, args); + + Injector injector = Guice.createInjector( + new DatabaseModule(), + new ControlProcessModule(), + new ConfigurationModule(SearchServiceDescriptors.descriptors, ServiceId.Control)); + + injector.getInstance(ControlMain.class); + injector.getInstance(Initialization.class).setReady(); + } +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlProcessModule.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlProcessModule.java new file mode 100644 index 00000000..3530a89b --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlProcessModule.java @@ -0,0 +1,15 @@ +package nu.marginalia.control; + +import com.google.inject.AbstractModule; +import com.google.inject.Module; +import com.google.inject.name.Names; + +import java.nio.file.Path; + +public class ControlProcessModule extends AbstractModule { + @Override + protected void configure() { + String dist = System.getProperty("distPath", System.getProperty("WMSA_HOME") + "/dist/current"); + bind(Path.class).annotatedWith(Names.named("distPath")).toInstance(Path.of(dist)); + } +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java new file mode 100644 index 00000000..2b49b249 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java @@ -0,0 +1,350 @@ +package nu.marginalia.control; + +import com.google.gson.Gson; +import com.google.inject.Inject; +import nu.marginalia.client.ServiceMonitors; +import nu.marginalia.control.actor.Actor; +import nu.marginalia.control.model.DomainComplaintModel; +import nu.marginalia.control.svc.*; +import nu.marginalia.db.storage.model.FileStorageId; +import nu.marginalia.db.storage.model.FileStorageType; +import 
nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.gson.GsonFactory; +import nu.marginalia.renderer.RendererFactory; +import nu.marginalia.service.server.*; +import org.eclipse.jetty.util.StringUtil; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import spark.Request; +import spark.Response; +import spark.Spark; + +import java.io.IOException; +import java.sql.SQLException; +import java.util.Comparator; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.stream.Collectors; + +public class ControlService extends Service { + + private final Logger logger = LoggerFactory.getLogger(getClass()); + private final Gson gson = GsonFactory.get(); + + private final ServiceMonitors monitors; + private final HeartbeatService heartbeatService; + private final EventLogService eventLogService; + private final ApiKeyService apiKeyService; + private final DomainComplaintService domainComplaintService; + private final ControlBlacklistService blacklistService; + private final ControlActorService controlActorService; + private final StaticResources staticResources; + private final MessageQueueService messageQueueService; + private final ControlFileStorageService controlFileStorageService; + + + @Inject + public ControlService(BaseServiceParams params, + ServiceMonitors monitors, + HeartbeatService heartbeatService, + EventLogService eventLogService, + RendererFactory rendererFactory, + ControlActorService controlActorService, + StaticResources staticResources, + MessageQueueService messageQueueService, + ControlFileStorageService controlFileStorageService, + ApiKeyService apiKeyService, + DomainComplaintService domainComplaintService, + ControlBlacklistService blacklistService, + ControlActionsService controlActionsService + ) throws IOException { + + super(params); + this.monitors = monitors; + this.heartbeatService = heartbeatService; + this.eventLogService = eventLogService; + this.apiKeyService = apiKeyService; + 
this.domainComplaintService = domainComplaintService; + this.blacklistService = blacklistService; + + var indexRenderer = rendererFactory.renderer("control/index"); + var servicesRenderer = rendererFactory.renderer("control/services"); + var serviceByIdRenderer = rendererFactory.renderer("control/service-by-id"); + var actorsRenderer = rendererFactory.renderer("control/actors"); + var actorDetailsRenderer = rendererFactory.renderer("control/actor-details"); + var storageRenderer = rendererFactory.renderer("control/storage-overview"); + var storageSpecsRenderer = rendererFactory.renderer("control/storage-specs"); + var storageCrawlsRenderer = rendererFactory.renderer("control/storage-crawls"); + var storageProcessedRenderer = rendererFactory.renderer("control/storage-processed"); + + var apiKeysRenderer = rendererFactory.renderer("control/api-keys"); + var domainComplaintsRenderer = rendererFactory.renderer("control/domain-complaints"); + + var messageQueueRenderer = rendererFactory.renderer("control/message-queue"); + + var storageDetailsRenderer = rendererFactory.renderer("control/storage-details"); + var updateMessageStateRenderer = rendererFactory.renderer("control/update-message-state"); + var newMessageRenderer = rendererFactory.renderer("control/new-message"); + var viewMessageRenderer = rendererFactory.renderer("control/view-message"); + + var actionsViewRenderer = rendererFactory.renderer("control/actions"); + var blacklistRenderer = rendererFactory.renderer("control/blacklist"); + + this.controlActorService = controlActorService; + + this.staticResources = staticResources; + this.messageQueueService = messageQueueService; + this.controlFileStorageService = controlFileStorageService; + + Spark.get("/public/heartbeats", (req, res) -> { + res.type("application/json"); + return heartbeatService.getServiceHeartbeats(); + }, gson::toJson); + + Spark.get("/public/", this::overviewModel, indexRenderer::render); + + Spark.get("/public/actions", (rq,rsp) -> new 
Object() , actionsViewRenderer::render); + Spark.get("/public/services", this::servicesModel, servicesRenderer::render); + Spark.get("/public/services/:id", this::serviceModel, serviceByIdRenderer::render); + Spark.get("/public/actors", this::processesModel, actorsRenderer::render); + Spark.get("/public/actors/:fsm", this::actorDetailsModel, actorDetailsRenderer::render); + + final HtmlRedirect redirectToServices = new HtmlRedirect("/services"); + final HtmlRedirect redirectToActors = new HtmlRedirect("/actors"); + final HtmlRedirect redirectToApiKeys = new HtmlRedirect("/api-keys"); + final HtmlRedirect redirectToStorage = new HtmlRedirect("/storage"); + final HtmlRedirect redirectToBlacklist = new HtmlRedirect("/blacklist"); + final HtmlRedirect redirectToComplaints = new HtmlRedirect("/complaints"); + final HtmlRedirect redirectToMessageQueue = new HtmlRedirect("/message-queue"); + + // FSMs + + Spark.post("/public/fsms/:fsm/start", controlActorService::startFsm, redirectToActors); + Spark.post("/public/fsms/:fsm/stop", controlActorService::stopFsm, redirectToActors); + + // Message Queue + + Spark.get("/public/message-queue", messageQueueService::listMessageQueueModel, messageQueueRenderer::render); + Spark.post("/public/message-queue/", messageQueueService::createMessage, redirectToMessageQueue); + Spark.get("/public/message-queue/new", messageQueueService::newMessageModel, newMessageRenderer::render); + Spark.get("/public/message-queue/:id", messageQueueService::viewMessageModel, viewMessageRenderer::render); + Spark.get("/public/message-queue/:id/reply", messageQueueService::replyMessageModel, newMessageRenderer::render); + Spark.get("/public/message-queue/:id/edit", messageQueueService::viewMessageForEditStateModel, updateMessageStateRenderer::render); + Spark.post("/public/message-queue/:id/edit", messageQueueService::editMessageState, redirectToMessageQueue); + + // Storage + Spark.get("/public/storage", this::storageModel, storageRenderer::render); + 
Spark.get("/public/storage/specs", this::storageModelSpecs, storageSpecsRenderer::render); + Spark.get("/public/storage/crawls", this::storageModelCrawls, storageCrawlsRenderer::render); + Spark.get("/public/storage/processed", this::storageModelProcessed, storageProcessedRenderer::render); + Spark.get("/public/storage/:id", this::storageDetailsModel, storageDetailsRenderer::render); + Spark.get("/public/storage/:id/file", controlFileStorageService::downloadFileFromStorage); + + // Storage Actions + + Spark.post("/public/storage/:fid/crawl", controlActorService::triggerCrawling, redirectToActors); + Spark.post("/public/storage/:fid/recrawl", controlActorService::triggerRecrawling, redirectToActors); + Spark.post("/public/storage/:fid/process", controlActorService::triggerProcessing, redirectToActors); + Spark.post("/public/storage/:fid/load", controlActorService::loadProcessedData, redirectToActors); + + Spark.post("/public/storage/specs", controlActorService::createCrawlSpecification, redirectToStorage); + Spark.post("/public/storage/:fid/delete", controlFileStorageService::flagFileForDeletionRequest, redirectToStorage); + + // Blacklist + + Spark.get("/public/blacklist", this::blacklistModel, blacklistRenderer::render); + Spark.post("/public/blacklist", this::updateBlacklist, redirectToBlacklist); + + // API Keys + + Spark.get("/public/api-keys", this::apiKeysModel, apiKeysRenderer::render); + Spark.post("/public/api-keys", this::createApiKey, redirectToApiKeys); + Spark.delete("/public/api-keys/:key", this::deleteApiKey, redirectToApiKeys); + // HTML forms don't support the DELETE verb :-( + Spark.post("/public/api-keys/:key/delete", this::deleteApiKey, redirectToApiKeys); + + Spark.get("/public/complaints", this::complaintsModel, domainComplaintsRenderer::render); + Spark.post("/public/complaints/:domain", this::reviewComplaint, redirectToComplaints); + + // Actions + + Spark.post("/public/actions/calculate-adjacencies", 
controlActionsService::calculateAdjacencies, redirectToActors); + Spark.post("/public/actions/repartition-index", controlActionsService::triggerRepartition, redirectToActors); + Spark.post("/public/actions/reconstruct-index", controlActionsService::triggerIndexReconstruction, redirectToActors); + Spark.post("/public/actions/trigger-data-exports", controlActionsService::triggerDataExports, redirectToActors); + Spark.post("/public/actions/flush-search-caches", controlActionsService::flushSearchCaches, redirectToActors); + Spark.post("/public/actions/flush-api-caches", controlActionsService::flushApiCaches, redirectToActors); + Spark.post("/public/actions/truncate-links-database", controlActionsService::truncateLinkDatabase, redirectToActors); + + Spark.get("/public/:resource", this::serveStatic); + + monitors.subscribe(this::logMonitorStateChange); + } + + private Object blacklistModel(Request request, Response response) { + return Map.of("blacklist", blacklistService.lastNAdditions(100)); + } + + private Object updateBlacklist(Request request, Response response) { + var domain = new EdgeDomain(request.queryParams("domain")); + if ("add".equals(request.queryParams("act"))) { + var comment = Objects.requireNonNullElse(request.queryParams("comment"), ""); + blacklistService.addToBlacklist(domain, comment); + } else if ("del".equals(request.queryParams("act"))) { + blacklistService.removeFromBlacklist(domain); + } + + return ""; + } + + private Object overviewModel(Request request, Response response) { + + return Map.of("processes", heartbeatService.getProcessHeartbeats(), + "jobs", heartbeatService.getTaskHeartbeats(), + "actors", controlActorService.getActorStates(), + "services", heartbeatService.getServiceHeartbeats(), + "events", eventLogService.getLastEntries(20) + ); + } + + + private Object complaintsModel(Request request, Response response) { + Map> complaintsByReviewed = + 
domainComplaintService.getComplaints().stream().collect(Collectors.partitioningBy(DomainComplaintModel::reviewed)); + + var reviewed = complaintsByReviewed.get(true); + var unreviewed = complaintsByReviewed.get(false); + + reviewed.sort(Comparator.comparing(DomainComplaintModel::reviewDate).reversed()); + unreviewed.sort(Comparator.comparing(DomainComplaintModel::fileDate).reversed()); + + return Map.of("complaintsNew", unreviewed, "complaintsReviewed", reviewed); + } + + private Object reviewComplaint(Request request, Response response) { + var domain = new EdgeDomain(request.params("domain")); + String action = request.queryParams("action"); + + logger.info("Reviewing complaint for domain {} with action {}", domain, action); + + switch (action) { + case "noop" -> domainComplaintService.reviewNoAction(domain); + case "appeal" -> domainComplaintService.approveAppealBlacklisting(domain); + case "blacklist" -> domainComplaintService.blacklistDomain(domain); + default -> throw new UnsupportedOperationException(); + } + + return ""; + } + + private Object createApiKey(Request request, Response response) { + String license = request.queryParams("license"); + String name = request.queryParams("name"); + String email = request.queryParams("email"); + int rate = Integer.parseInt(request.queryParams("rate")); + + if (StringUtil.isBlank(license) || + StringUtil.isBlank(name) || + StringUtil.isBlank(email) || + rate <= 0) + { + response.status(400); + return ""; + } + + apiKeyService.addApiKey(license, name, email, rate); + + return ""; + } + + private Object deleteApiKey(Request request, Response response) { + String licenseKey = request.params("key"); + apiKeyService.deleteApiKey(licenseKey); + return ""; + } + + private Object apiKeysModel(Request request, Response response) { + return Map.of("apikeys", apiKeyService.getApiKeys()); + } + + + @Override + public void logRequest(Request request) { + if ("GET".equals(request.requestMethod())) + return; + + 
super.logRequest(request); + } + + @Override + public void logResponse(Request request, Response response) { + if ("GET".equals(request.requestMethod())) + return; + + super.logResponse(request, response); + } + + + private Object serviceModel(Request request, Response response) { + String serviceName = request.params("id"); + + return Map.of( + "id", serviceName, + "messages", messageQueueService.getEntriesForInbox(serviceName, Long.MAX_VALUE, 20), + "events", eventLogService.getLastEntriesForService(serviceName, 20)); + } + + private Object storageModel(Request request, Response response) { + return Map.of("storage", controlFileStorageService.getStorageList()); + } + + private Object storageDetailsModel(Request request, Response response) throws SQLException { + return Map.of("storage", controlFileStorageService.getFileStorageWithRelatedEntries(FileStorageId.parse(request.params("id")))); + } + private Object storageModelSpecs(Request request, Response response) { + return Map.of("storage", controlFileStorageService.getStorageList(FileStorageType.CRAWL_SPEC)); + } + private Object storageModelCrawls(Request request, Response response) { + return Map.of("storage", controlFileStorageService.getStorageList(FileStorageType.CRAWL_DATA)); + } + private Object storageModelProcessed(Request request, Response response) { + return Map.of("storage", controlFileStorageService.getStorageList(FileStorageType.PROCESSED_DATA)); + } + private Object servicesModel(Request request, Response response) { + return Map.of("services", heartbeatService.getServiceHeartbeats(), + "events", eventLogService.getLastEntries(20)); + } + + private Object processesModel(Request request, Response response) { + var processes = heartbeatService.getProcessHeartbeats(); + var jobs = heartbeatService.getTaskHeartbeats(); + + return Map.of("processes", processes, + "jobs", jobs, + "actors", controlActorService.getActorStates(), + "messages", messageQueueService.getLastEntries(20)); + } + private Object 
actorDetailsModel(Request request, Response response) { + final Actor actor = Actor.valueOf(request.params("fsm").toUpperCase()); + final String inbox = actor.id(); + + return Map.of( + "actor", actor, + "state-graph", controlActorService.getActorStateGraph(actor), + "messages", messageQueueService.getLastEntriesForInbox(inbox, 20)); + } + private Object serveStatic(Request request, Response response) { + String resource = request.params("resource"); + + staticResources.serveStatic("control", resource, request, response); + + return ""; + } + + + private void logMonitorStateChange() { + logger.info("Service state change: {}", monitors.getRunningServices()); + } + +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/HtmlRedirect.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/HtmlRedirect.java new file mode 100644 index 00000000..ff1e2368 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/HtmlRedirect.java @@ -0,0 +1,22 @@ +package nu.marginalia.control; + +import spark.ResponseTransformer; + +public class HtmlRedirect implements ResponseTransformer { + private final String html; + + /** Because Spark doesn't have a redirect method that works with relative URLs + * (without explicitly providing the external address),we use HTML and let the + * browser resolve the relative redirect instead */ + public HtmlRedirect(String destination) { + this.html = """ + + + """.formatted(destination); + } + + @Override + public String render(Object any) throws Exception { + return html; + } +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/Actor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/Actor.java new file mode 100644 index 00000000..d9002e18 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/Actor.java @@ -0,0 +1,24 @@ +package 
nu.marginalia.control.actor; + +public enum Actor { + CRAWL, + RECRAWL, + RECONVERT_LOAD, + CONVERTER_MONITOR, + LOADER_MONITOR, + CRAWLER_MONITOR, + MESSAGE_QUEUE_MONITOR, + PROCESS_LIVENESS_MONITOR, + FILE_STORAGE_MONITOR, + ADJACENCY_CALCULATION, + CRAWL_JOB_EXTRACTOR, + EXPORT_DATA, + TRUNCATE_LINK_DATABASE, + + CONVERT; + + + public String id() { + return "fsm:" + name().toLowerCase(); + } +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java new file mode 100644 index 00000000..37cd9e9c --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java @@ -0,0 +1,130 @@ +package nu.marginalia.control.actor; + +import com.google.gson.Gson; +import com.google.inject.Inject; +import com.google.inject.Singleton; +import lombok.SneakyThrows; +import nu.marginalia.control.actor.task.*; +import nu.marginalia.control.actor.monitor.*; +import nu.marginalia.control.actor.monitor.ConverterMonitorActor; +import nu.marginalia.control.actor.monitor.LoaderMonitorActor; +import nu.marginalia.model.gson.GsonFactory; +import nu.marginalia.mq.MessageQueueFactory; +import nu.marginalia.mqsm.ActorStateMachine; +import nu.marginalia.mqsm.graph.AbstractStateGraph; +import nu.marginalia.mqsm.state.MachineState; +import nu.marginalia.service.control.ServiceEventLog; +import nu.marginalia.service.server.BaseServiceParams; + +import java.util.HashMap; +import java.util.Map; +import java.util.UUID; +import java.util.stream.Collectors; + +/** This class is responsible for starting and stopping the various actors in the controller service */ +@Singleton +public class ControlActors { + private final ServiceEventLog eventLog; + private final Gson gson; + private final MessageQueueFactory messageQueueFactory; + public Map stateMachines = new HashMap<>(); + public Map actorDefinitions = new 
HashMap<>(); + + @Inject + public ControlActors(MessageQueueFactory messageQueueFactory, + GsonFactory gsonFactory, + BaseServiceParams baseServiceParams, + ConvertActor convertActor, + ReconvertAndLoadActor reconvertAndLoadActor, + CrawlActor crawlActor, + RecrawlActor recrawlActor, + ConverterMonitorActor converterMonitorFSM, + CrawlerMonitorActor crawlerMonitorActor, + LoaderMonitorActor loaderMonitor, + MessageQueueMonitorActor messageQueueMonitor, + ProcessLivenessMonitorActor processMonitorFSM, + FileStorageMonitorActor fileStorageMonitorActor, + TriggerAdjacencyCalculationActor triggerAdjacencyCalculationActor, + CrawlJobExtractorActor crawlJobExtractorActor, + ExportDataActor exportDataActor, + TruncateLinkDatabase truncateLinkDatabase + ) { + this.messageQueueFactory = messageQueueFactory; + this.eventLog = baseServiceParams.eventLog; + this.gson = gsonFactory.get(); + + register(Actor.CRAWL, crawlActor); + register(Actor.RECRAWL, recrawlActor); + register(Actor.CONVERT, convertActor); + register(Actor.RECONVERT_LOAD, reconvertAndLoadActor); + + register(Actor.CONVERTER_MONITOR, converterMonitorFSM); + register(Actor.LOADER_MONITOR, loaderMonitor); + register(Actor.CRAWLER_MONITOR, crawlerMonitorActor); + register(Actor.MESSAGE_QUEUE_MONITOR, messageQueueMonitor); + register(Actor.PROCESS_LIVENESS_MONITOR, processMonitorFSM); + register(Actor.FILE_STORAGE_MONITOR, fileStorageMonitorActor); + + register(Actor.ADJACENCY_CALCULATION, triggerAdjacencyCalculationActor); + register(Actor.CRAWL_JOB_EXTRACTOR, crawlJobExtractorActor); + register(Actor.EXPORT_DATA, exportDataActor); + register(Actor.TRUNCATE_LINK_DATABASE, truncateLinkDatabase); + } + + private void register(Actor process, AbstractStateGraph graph) { + var sm = new ActorStateMachine(messageQueueFactory, process.id(), UUID.randomUUID(), graph); + sm.listen((function, param) -> logStateChange(process, function)); + + stateMachines.put(process, sm); + actorDefinitions.put(process, graph); + } + + 
private void logStateChange(Actor process, String state) { + eventLog.logEvent("FSM-STATE-CHANGE", process.id() + " -> " + state); + } + + public void startFrom(Actor process, String state) throws Exception { + eventLog.logEvent("FSM-START", process.id()); + + stateMachines.get(process).initFrom(state); + } + + public void start(Actor process) throws Exception { + eventLog.logEvent("FSM-START", process.id()); + + stateMachines.get(process).init(); + } + + public void startFrom(Actor process, String state, Object arg) throws Exception { + eventLog.logEvent("FSM-START", process.id()); + + stateMachines.get(process).initFrom(state, gson.toJson(arg)); + } + + public void start(Actor process, Object arg) throws Exception { + eventLog.logEvent("FSM-START", process.id()); + + stateMachines.get(process).init(gson.toJson(arg)); + } + + @SneakyThrows + public void stop(Actor fsm) { + stateMachines.get(fsm).abortExecution(); + } + + public Map<Actor, MachineState> getActorStates() { + return stateMachines.entrySet().stream().collect( + Collectors.toMap( + Map.Entry::getKey, e -> e.getValue().getState()) + ); + } + + public boolean isDirectlyInitializable(Actor actor) { + return actorDefinitions.get(actor).isDirectlyInitializable(); + } + + public AbstractStateGraph getActorDefinition(Actor actor) { + return actorDefinitions.get(actor); + } + +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/AbstractProcessSpawnerActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/AbstractProcessSpawnerActor.java new file mode 100644 index 00000000..92bbc1d6 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/AbstractProcessSpawnerActor.java @@ -0,0 +1,182 @@ +package nu.marginalia.control.actor.monitor; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.control.process.ProcessService; +import
nu.marginalia.mq.MqMessageState; +import nu.marginalia.mq.persistence.MqPersistence; +import nu.marginalia.mqsm.StateFactory; +import nu.marginalia.mqsm.graph.AbstractStateGraph; +import nu.marginalia.mqsm.graph.GraphState; +import nu.marginalia.mqsm.graph.ResumeBehavior; +import nu.marginalia.mqsm.graph.TerminalGraphState; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.sql.SQLException; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; + +@Singleton +public class AbstractProcessSpawnerActor extends AbstractStateGraph { + + private final MqPersistence persistence; + private final ProcessService processService; + + private final Logger logger = LoggerFactory.getLogger(getClass()); + + public static final String INITIAL = "INITIAL"; + public static final String MONITOR = "MONITOR"; + public static final String RUN = "RUN"; + public static final String ERROR = "ERROR"; + public static final String ABORTED = "ABORTED"; + public static final String END = "END"; + + public static final int MAX_ATTEMPTS = 3; + private final String inboxName; + private final ProcessService.ProcessId processId; + private final ExecutorService executorService = Executors.newSingleThreadExecutor(); + + @Inject + public AbstractProcessSpawnerActor(StateFactory stateFactory, + MqPersistence persistence, + ProcessService processService, + String inboxName, + ProcessService.ProcessId processId) { + super(stateFactory); + this.persistence = persistence; + this.processService = processService; + this.inboxName = inboxName; + this.processId = processId; + } + + @GraphState(name = INITIAL, next = MONITOR) + public void init() { + + } + + @GraphState(name = MONITOR, + next = MONITOR, + resume = ResumeBehavior.RETRY, + transitions = {MONITOR, RUN}, + description = """ + Monitors the inbox of the process 
for messages. + If a message is found, transition to RUN. + The state takes an optional Integer parameter errorAttempts + that is passed to run. errorAttempts is set to zero after + a few seconds of silence. + """ + ) + public void monitor(Integer errorAttempts) throws SQLException, InterruptedException { + + if (errorAttempts == null) { + errorAttempts = 0; + } + for (;;) { + var messages = persistence.eavesdrop(inboxName, 1); + + if (messages.isEmpty() && !processService.isRunning(processId)) { + TimeUnit.SECONDS.sleep(5); + + if (errorAttempts > 0) { // Reset the error counter if there is silence in the inbox + transition(MONITOR, 0); + } + // else continue + } else { + transition(RUN, errorAttempts); + } + } + } + + @GraphState(name = RUN, + resume = ResumeBehavior.RESTART, + transitions = {MONITOR, ERROR, RUN, ABORTED}, + description = """ + Runs the process. + If the process fails, retransition to RUN up to MAX_ATTEMPTS times. + After MAX_ATTEMPTS at restarting the process, transition to ERROR. + If the process is cancelled, transition to ABORTED. + If the process is successful, transition to MONITOR(errorAttempts). + """ + ) + public void run(Integer attempts) throws Exception { + if (attempts == null) + attempts = 0; + + try { + long startTime = System.currentTimeMillis(); + var exec = new TaskExecution(); + long endTime = System.currentTimeMillis(); + + if (exec.isError()) { + if (attempts < MAX_ATTEMPTS) transition(RUN, attempts + 1); + else error(); + } + else if (endTime - startTime < TimeUnit.SECONDS.toMillis(10)) { + // To avoid boot loops, we transition to error if the process + // didn't run for longer than 10 seconds. This might happen if + // the process crashes before it can reach the heartbeat and inbox + // stages of execution. In this case it would not report having acted + // on its message, and the process would be restarted forever without + // the attempts counter incrementing. 
+ error("Process terminated within 10 seconds of starting"); + } + } + catch (InterruptedException ex) { + // We get this exception when the process is cancelled by the user + + processService.kill(processId); + setCurrentMessageToDead(); + + transition(ABORTED); + } + + transition(MONITOR, attempts); + } + + /** Sets the message to dead in the database to avoid + * the service respawning on the same task when we + * re-enable this actor */ + private void setCurrentMessageToDead() { + try { + var messages = persistence.eavesdrop(inboxName, 1); + + if (messages.isEmpty()) // Possibly a race condition where the task is already finished + return; + + var theMessage = messages.iterator().next(); + persistence.updateMessageState(theMessage.msgId(), MqMessageState.DEAD); + } + catch (SQLException ex) { + logger.error("Tried but failed to set the message for " + processId + " to dead", ex); + } + } + + @TerminalGraphState(name = ABORTED, description = "The process was manually aborted") + public void aborted() throws Exception {} + + + /** Encapsulates the execution of the process in a separate thread so that + * we can interrupt the thread if the process is cancelled */ + private class TaskExecution { + private final AtomicBoolean error = new AtomicBoolean(false); + public TaskExecution() throws ExecutionException, InterruptedException { + // Run this call in a separate thread so that this thread can be interrupted waiting for it + executorService.submit(() -> { + try { + processService.trigger(processId); + } catch (Exception e) { + logger.warn("Error in triggering process", e); + error.set(true); + } + }).get(); // Wait for the process to start + } + + public boolean isError() { + return error.get(); + } + } +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/ConverterMonitorActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/ConverterMonitorActor.java new file mode 100644 
index 00000000..158b48ca --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/ConverterMonitorActor.java @@ -0,0 +1,22 @@ +package nu.marginalia.control.actor.monitor; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.control.process.ProcessService; +import nu.marginalia.mqapi.ProcessInboxNames; +import nu.marginalia.mq.persistence.MqPersistence; +import nu.marginalia.mqsm.StateFactory; + +@Singleton +public class ConverterMonitorActor extends AbstractProcessSpawnerActor { + + + @Inject + public ConverterMonitorActor(StateFactory stateFactory, + MqPersistence persistence, + ProcessService processService) { + super(stateFactory, persistence, processService, ProcessInboxNames.CONVERTER_INBOX, ProcessService.ProcessId.CONVERTER); + } + + +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/CrawlerMonitorActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/CrawlerMonitorActor.java new file mode 100644 index 00000000..cc9c73fb --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/CrawlerMonitorActor.java @@ -0,0 +1,25 @@ +package nu.marginalia.control.actor.monitor; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.control.process.ProcessService; +import nu.marginalia.mq.persistence.MqPersistence; +import nu.marginalia.mqapi.ProcessInboxNames; +import nu.marginalia.mqsm.StateFactory; + +@Singleton +public class CrawlerMonitorActor extends AbstractProcessSpawnerActor { + + @Inject + public CrawlerMonitorActor(StateFactory stateFactory, + MqPersistence persistence, + ProcessService processService) { + super(stateFactory, + persistence, + processService, + ProcessInboxNames.CRAWLER_INBOX, + ProcessService.ProcessId.CRAWLER); + } + + +} diff --git 
a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/FileStorageMonitorActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/FileStorageMonitorActor.java new file mode 100644 index 00000000..9f2ced26 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/FileStorageMonitorActor.java @@ -0,0 +1,108 @@ +package nu.marginalia.control.actor.monitor; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.db.storage.FileStorageService; +import nu.marginalia.db.storage.model.FileStorage; +import nu.marginalia.db.storage.model.FileStorageBaseType; +import nu.marginalia.db.storage.model.FileStorageId; +import nu.marginalia.mqsm.StateFactory; +import nu.marginalia.mqsm.graph.AbstractStateGraph; +import nu.marginalia.mqsm.graph.GraphState; +import nu.marginalia.mqsm.graph.ResumeBehavior; +import org.apache.commons.io.FileUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.sql.SQLException; +import java.util.List; +import java.util.Optional; +import java.util.concurrent.TimeUnit; + +@Singleton +public class FileStorageMonitorActor extends AbstractStateGraph { + + private final Logger logger = LoggerFactory.getLogger(getClass()); + // STATES + + private static final String INITIAL = "INITIAL"; + private static final String MONITOR = "MONITOR"; + private static final String PURGE = "PURGE"; + private static final String REMOVE_STALE = "REMOVE-STALE"; + private static final String END = "END"; + private final FileStorageService fileStorageService; + + + @Inject + public FileStorageMonitorActor(StateFactory stateFactory, + FileStorageService fileStorageService) { + super(stateFactory); + this.fileStorageService = fileStorageService; + } + + @GraphState(name = INITIAL, next = MONITOR) + public void init() { + } + + @GraphState(name 
= MONITOR, + next = PURGE, + resume = ResumeBehavior.RETRY, + transitions = { PURGE, REMOVE_STALE }, + description = """ + Monitor the file storage and trigger at transition to PURGE if any file storage area + has been marked for deletion. + """) + public void monitor() throws Exception { + + for (;;) { + Optional<FileStorage> toDeleteOpt = fileStorageService.findFileStorageToDelete(); + + if (toDeleteOpt.isPresent()) { + transition(PURGE, toDeleteOpt.get().id()); + } + + List<FileStorage> allStorageItems = fileStorageService.getEachFileStorage(); + var missing = allStorageItems.stream().filter(storage -> !Files.exists(storage.asPath())).findAny(); + if (missing.isPresent()) { + transition(REMOVE_STALE, missing.get().id()); + } + + fileStorageService.synchronizeStorageManifests(fileStorageService.getStorageBase(FileStorageBaseType.SLOW)); + + TimeUnit.SECONDS.sleep(10); + } + } + + @GraphState(name = PURGE, + next = MONITOR, + resume = ResumeBehavior.RETRY, + description = """ + Purge the file storage area and transition back to MONITOR. + """ + ) + public void purge(FileStorageId id) throws Exception { + var storage = fileStorageService.getStorage(id); + logger.info("Deleting {} ", storage.path()); + Path path = storage.asPath(); + + if (Files.exists(path)) { + FileUtils.deleteDirectory(path.toFile()); + } + + fileStorageService.removeFileStorage(storage.id()); + } + + @GraphState( + name = REMOVE_STALE, + next = MONITOR, + resume = ResumeBehavior.RETRY, + description = """ + Remove file storage from the database if it doesn't exist on disk.
+ """ + ) + public void removeStale(FileStorageId id) throws SQLException { + fileStorageService.removeFileStorage(id); + } +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/LoaderMonitorActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/LoaderMonitorActor.java new file mode 100644 index 00000000..fcf3b895 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/LoaderMonitorActor.java @@ -0,0 +1,24 @@ +package nu.marginalia.control.actor.monitor; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.control.process.ProcessService; +import nu.marginalia.mqapi.ProcessInboxNames; +import nu.marginalia.mq.persistence.MqPersistence; +import nu.marginalia.mqsm.StateFactory; + +@Singleton +public class LoaderMonitorActor extends AbstractProcessSpawnerActor { + + + @Inject + public LoaderMonitorActor(StateFactory stateFactory, + MqPersistence persistence, + ProcessService processService) { + + super(stateFactory, persistence, processService, + ProcessInboxNames.LOADER_INBOX, + ProcessService.ProcessId.LOADER); + } + +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/MessageQueueMonitorActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/MessageQueueMonitorActor.java new file mode 100644 index 00000000..77384b06 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/MessageQueueMonitorActor.java @@ -0,0 +1,48 @@ +package nu.marginalia.control.actor.monitor; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.mq.persistence.MqPersistence; +import nu.marginalia.mqsm.StateFactory; +import nu.marginalia.mqsm.graph.AbstractStateGraph; +import nu.marginalia.mqsm.graph.GraphState; +import 
nu.marginalia.mqsm.graph.ResumeBehavior; + +import java.util.concurrent.TimeUnit; + +@Singleton +public class MessageQueueMonitorActor extends AbstractStateGraph { + + // STATES + + private static final String INITIAL = "INITIAL"; + private static final String MONITOR = "MONITOR"; + private static final String END = "END"; + private final MqPersistence persistence; + + + @Inject + public MessageQueueMonitorActor(StateFactory stateFactory, + MqPersistence persistence) { + super(stateFactory); + this.persistence = persistence; + } + + @GraphState(name = INITIAL, next = MONITOR) + public void init() { + } + + @GraphState(name = MONITOR, next = MONITOR, resume = ResumeBehavior.RETRY, + description = """ + Periodically clean up the message queue. + """) + public void monitor() throws Exception { + + for (;;) { + persistence.reapDeadMessages(); + persistence.cleanOldMessages(); + TimeUnit.SECONDS.sleep(60); + } + } + +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/ProcessLivenessMonitorActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/ProcessLivenessMonitorActor.java new file mode 100644 index 00000000..4128f6f9 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/ProcessLivenessMonitorActor.java @@ -0,0 +1,82 @@ +package nu.marginalia.control.actor.monitor; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.control.model.ServiceHeartbeat; +import nu.marginalia.control.svc.HeartbeatService; +import nu.marginalia.control.process.ProcessService; +import nu.marginalia.mqsm.StateFactory; +import nu.marginalia.mqsm.graph.AbstractStateGraph; +import nu.marginalia.mqsm.graph.GraphState; +import nu.marginalia.mqsm.graph.ResumeBehavior; + +import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; + +@Singleton +public class ProcessLivenessMonitorActor extends 
AbstractStateGraph { + + // STATES + + private static final String INITIAL = "INITIAL"; + private static final String MONITOR = "MONITOR"; + private static final String END = "END"; + private final ProcessService processService; + private final HeartbeatService heartbeatService; + + + @Inject + public ProcessLivenessMonitorActor(StateFactory stateFactory, + ProcessService processService, + HeartbeatService heartbeatService) { + super(stateFactory); + this.processService = processService; + this.heartbeatService = heartbeatService; + } + + @GraphState(name = INITIAL, next = MONITOR) + public void init() { + } + + @GraphState(name = MONITOR, next = MONITOR, resume = ResumeBehavior.RETRY, description = """ + Periodically check to ensure that the control service's view of + running processes is agreement with the process heartbeats table. + + If the process is not running, mark the process as stopped in the table. + """) + public void monitor() throws Exception { + + for (;;) { + for (var heartbeat : heartbeatService.getProcessHeartbeats()) { + if (!heartbeat.isRunning()) { + continue; + } + + var processId = heartbeat.getProcessId(); + if (null == processId) + continue; + + if (processService.isRunning(processId) && heartbeat.lastSeenMillis() < 10000) { + continue; + } + + heartbeatService.flagProcessAsStopped(heartbeat); + } + + var livingServices = heartbeatService.getServiceHeartbeats().stream() + .filter(ServiceHeartbeat::alive) + .map(ServiceHeartbeat::uuidFull) + .collect(Collectors.toSet()); + + for (var heartbeat : heartbeatService.getTaskHeartbeats()) { + if (!livingServices.contains(heartbeat.serviceUuuidFull())) { + heartbeatService.removeTaskHeartbeat(heartbeat); + } + } + + + TimeUnit.SECONDS.sleep(60); + } + } + +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ActorProcessWatcher.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ActorProcessWatcher.java new file mode 
100644 index 00000000..d6c33608 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ActorProcessWatcher.java @@ -0,0 +1,77 @@ +package nu.marginalia.control.actor.task; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.control.process.ProcessService; +import nu.marginalia.mq.MqMessage; +import nu.marginalia.mq.outbox.MqOutbox; +import nu.marginalia.mqsm.graph.ControlFlowException; + +import java.sql.SQLException; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; + +@Singleton +public class ActorProcessWatcher { + + private final ProcessService processService; + + @Inject + public ActorProcessWatcher(ProcessService processService) { + this.processService = processService; + } + + /** Wait for a process to start, and then wait for a response from the process, + * periodically checking that the process is still running. If the process dies, + * and does not respawn, or does not start at all, a control flow exception is thrown + * that will cause the actor to transition to ERROR. + *

    + * When interrupted, the process is killed and the message is marked as dead. + */ + public MqMessage waitResponse(MqOutbox outbox, ProcessService.ProcessId processId, long msgId) + throws ControlFlowException, InterruptedException, SQLException + { + if (!waitForProcess(processId, TimeUnit.SECONDS, 30)) { + throw new ControlFlowException("ERROR", + "Process " + processId + " did not launch"); + } + + for (;;) { + try { + return outbox.waitResponse(msgId, 5, TimeUnit.SECONDS); + } + catch (InterruptedException ex) { + // Here we mark the message as dead, as it's the user that has aborted the process + // This will prevent the monitor process from attempting to respawn the process as we kill it + + outbox.flagAsDead(msgId); + processService.kill(processId); + + throw ex; + } + catch (TimeoutException ex) { + // Maybe the process died, wait a moment for it to restart + if (!waitForProcess(processId, TimeUnit.SECONDS, 30)) { + throw new ControlFlowException("ERROR", + "Process " + processId + " died and did not re-launch"); + } + } + } + } + + /** Wait the specified time for the specified process to start running (does not start the process) */ + private boolean waitForProcess(ProcessService.ProcessId processId, TimeUnit unit, int duration) throws InterruptedException { + + // Wait for process to start + long deadline = System.currentTimeMillis() + unit.toMillis(duration); + while (System.currentTimeMillis() < deadline) { + if (processService.isRunning(processId)) + return true; + + TimeUnit.MILLISECONDS.sleep(100); + } + + return false; + } + +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ConvertActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ConvertActor.java new file mode 100644 index 00000000..0bcc5293 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ConvertActor.java @@ -0,0 +1,176 @@ +package 
nu.marginalia.control.actor.task; + +import com.google.gson.Gson; +import com.google.inject.Inject; +import com.google.inject.Singleton; +import lombok.AllArgsConstructor; +import lombok.NoArgsConstructor; +import lombok.With; +import nu.marginalia.control.process.ProcessOutboxes; +import nu.marginalia.control.process.ProcessService; +import nu.marginalia.db.storage.FileStorageService; +import nu.marginalia.db.storage.model.FileStorageBaseType; +import nu.marginalia.db.storage.model.FileStorageId; +import nu.marginalia.db.storage.model.FileStorageType; +import nu.marginalia.mq.MqMessageState; +import nu.marginalia.mq.outbox.MqOutbox; +import nu.marginalia.mqapi.converting.ConvertAction; +import nu.marginalia.mqapi.converting.ConvertRequest; +import nu.marginalia.mqsm.StateFactory; +import nu.marginalia.mqsm.graph.AbstractStateGraph; +import nu.marginalia.mqsm.graph.GraphState; +import nu.marginalia.mqsm.graph.ResumeBehavior; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.nio.file.Files; +import java.nio.file.Path; + +@Singleton +public class ConvertActor extends AbstractStateGraph { + + // STATES + + public static final String CONVERT = "CONVERT"; + public static final String CONVERT_ENCYCLOPEDIA = "CONVERT_ENCYCLOPEDIA"; + public static final String CONVERT_STACKEXCHANGE = "CONVERT_STACKEXCHANGE"; + public static final String CONVERT_WAIT = "CONVERT-WAIT"; + + public static final String END = "END"; + private final ActorProcessWatcher processWatcher; + private final MqOutbox mqConverterOutbox; + private final FileStorageService storageService; + private final Gson gson; + private final Logger logger = LoggerFactory.getLogger(getClass()); + + + @AllArgsConstructor @With @NoArgsConstructor + public static class Message { + public FileStorageId crawlStorageId = null; + public FileStorageId processedStorageId = null; + public long converterMsgId = 0L; + public long loaderMsgId = 0L; + }; + + @Inject + public ConvertActor(StateFactory 
stateFactory, + ActorProcessWatcher processWatcher, + ProcessOutboxes processOutboxes, + FileStorageService storageService, + Gson gson + ) + { + super(stateFactory); + this.processWatcher = processWatcher; + this.mqConverterOutbox = processOutboxes.getConverterOutbox(); + this.storageService = storageService; + this.gson = gson; + } + + @GraphState(name = CONVERT, + next = CONVERT_WAIT, + resume = ResumeBehavior.ERROR, + description = """ + Allocate a storage area for the processed data, + then send a convert request to the converter and transition to RECONVERT_WAIT. + """ + ) + public Long convert(FileStorageId sourceStorageId) throws Exception { + // Create processed data area + + var toProcess = storageService.getStorage(sourceStorageId); + var base = storageService.getStorageBase(FileStorageBaseType.SLOW); + var processedArea = storageService.allocateTemporaryStorage(base, + FileStorageType.PROCESSED_DATA, "processed-data", + "Processed Data; " + toProcess.description()); + + storageService.relateFileStorages(toProcess.id(), processedArea.id()); + + // Pre-send convert request + var request = new ConvertRequest(ConvertAction.ConvertCrawlData, + null, + sourceStorageId, + processedArea.id()); + + return mqConverterOutbox.sendAsync(ConvertRequest.class.getSimpleName(), gson.toJson(request)); + } + + @GraphState(name = CONVERT_ENCYCLOPEDIA, + next = CONVERT_WAIT, + resume = ResumeBehavior.ERROR, + description = """ + Allocate a storage area for the processed data, + then send a convert request to the converter and transition to RECONVERT_WAIT. 
+ """ + ) + public Long convertEncyclopedia(String source) throws Exception { + // Create processed data area + + Path sourcePath = Path.of(source); + if (!Files.exists(sourcePath)) + error("Source path does not exist: " + sourcePath); + + String fileName = sourcePath.toFile().getName(); + + var base = storageService.getStorageBase(FileStorageBaseType.SLOW); + var processedArea = storageService.allocateTemporaryStorage(base, + FileStorageType.PROCESSED_DATA, "processed-data", + "Processed Encylopedia Data; " + fileName); + + // Pre-send convert request + var request = new ConvertRequest(ConvertAction.SideloadEncyclopedia, + sourcePath.toString(), + null, + processedArea.id()); + + return mqConverterOutbox.sendAsync(ConvertRequest.class.getSimpleName(), gson.toJson(request)); + } + + @GraphState(name = CONVERT_STACKEXCHANGE, + next = CONVERT_WAIT, + resume = ResumeBehavior.ERROR, + description = """ + Allocate a storage area for the processed data, + then send a convert request to the converter and transition to RECONVERT_WAIT. + """ + ) + public Long convertStackexchange(String source) throws Exception { + // Create processed data area + + Path sourcePath = Path.of(source); + if (!Files.exists(sourcePath)) + error("Source path does not exist: " + sourcePath); + + String fileName = sourcePath.toFile().getName(); + + var base = storageService.getStorageBase(FileStorageBaseType.SLOW); + var processedArea = storageService.allocateTemporaryStorage(base, + FileStorageType.PROCESSED_DATA, "processed-data", + "Processed Stackexchange Data; " + fileName); + + // Pre-send convert request + var request = new ConvertRequest(ConvertAction.SideloadStackexchange, + sourcePath.toString(), + null, + processedArea.id()); + + return mqConverterOutbox.sendAsync(ConvertRequest.class.getSimpleName(), gson.toJson(request)); + } + + @GraphState( + name = CONVERT_WAIT, + next = END, + resume = ResumeBehavior.RETRY, + description = """ + Wait for the converter to finish processing the data. 
+ """ + ) + public void convertWait(Long msgId) throws Exception { + var rsp = processWatcher.waitResponse(mqConverterOutbox, ProcessService.ProcessId.CONVERTER, msgId); + + if (rsp.state() != MqMessageState.OK) + error("Converter failed"); + } + + +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlActor.java new file mode 100644 index 00000000..40f447c1 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlActor.java @@ -0,0 +1,130 @@ +package nu.marginalia.control.actor.task; + +import com.google.gson.Gson; +import com.google.inject.Inject; +import com.google.inject.Singleton; +import lombok.AllArgsConstructor; +import lombok.NoArgsConstructor; +import lombok.With; +import nu.marginalia.control.process.ProcessOutboxes; +import nu.marginalia.control.process.ProcessService; +import nu.marginalia.db.storage.FileStorageService; +import nu.marginalia.db.storage.model.FileStorageBaseType; +import nu.marginalia.db.storage.model.FileStorageId; +import nu.marginalia.db.storage.model.FileStorageType; +import nu.marginalia.mq.MqMessageState; +import nu.marginalia.mq.outbox.MqOutbox; +import nu.marginalia.mqapi.crawling.CrawlRequest; +import nu.marginalia.mqsm.StateFactory; +import nu.marginalia.mqsm.graph.AbstractStateGraph; +import nu.marginalia.mqsm.graph.GraphState; +import nu.marginalia.mqsm.graph.ResumeBehavior; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +@Singleton +public class CrawlActor extends AbstractStateGraph { + + // STATES + + public static final String INITIAL = "INITIAL"; + public static final String CRAWL = "CRAWL"; + public static final String CRAWL_WAIT = "CRAWL-WAIT"; + public static final String END = "END"; + private final MqOutbox mqCrawlerOutbox; + private final FileStorageService storageService; + private final Gson gson; + 
private final Logger logger = LoggerFactory.getLogger(getClass()); + + private final ActorProcessWatcher processWatcher; + + + @AllArgsConstructor @With @NoArgsConstructor + public static class Message { + public FileStorageId crawlSpecId = null; + public FileStorageId crawlStorageId = null; + public long crawlerMsgId = 0L; + }; + + @Inject + public CrawlActor(StateFactory stateFactory, + ProcessOutboxes processOutboxes, + FileStorageService storageService, + Gson gson, + ActorProcessWatcher processWatcher) + { + super(stateFactory); + this.mqCrawlerOutbox = processOutboxes.getCrawlerOutbox(); + this.storageService = storageService; + this.gson = gson; + this.processWatcher = processWatcher; + } + + @GraphState(name = INITIAL, + next = CRAWL, + description = """ + Validate the input and transition to CRAWL + """) + public Message init(FileStorageId crawlStorageId) throws Exception { + if (null == crawlStorageId) { + error("This Actor requires a FileStorageId to be passed in as a parameter to INITIAL"); + } + + var storage = storageService.getStorage(crawlStorageId); + + if (storage == null) error("Bad storage id"); + if (storage.type() != FileStorageType.CRAWL_SPEC) error("Bad storage type " + storage.type()); + + return new Message().withCrawlSpecId(crawlStorageId); + } + + @GraphState(name = CRAWL, + next = CRAWL_WAIT, + resume = ResumeBehavior.ERROR, + description = """ + Allocate a storage area for the crawled data, + then send a crawl request to the crawler and transition to CRAWL_WAIT. 
+ """ + ) + public Message crawl(Message message) throws Exception { + // Create crawl data area + + var toCrawl = storageService.getStorage(message.crawlSpecId); + + var base = storageService.getStorageBase(FileStorageBaseType.SLOW); + var dataArea = storageService.allocateTemporaryStorage( + base, + FileStorageType.CRAWL_DATA, + "crawl-data", + toCrawl.description()); + + storageService.relateFileStorages(toCrawl.id(), dataArea.id()); + + // Pre-send crawl request + var request = new CrawlRequest(message.crawlSpecId, dataArea.id()); + long id = mqCrawlerOutbox.sendAsync(CrawlRequest.class.getSimpleName(), gson.toJson(request)); + + return message + .withCrawlStorageId(dataArea.id()) + .withCrawlerMsgId(id); + } + + @GraphState( + name = CRAWL_WAIT, + next = END, + resume = ResumeBehavior.RETRY, + description = """ + Wait for the crawler to finish retrieving the data. + """ + ) + public Message crawlerWait(Message message) throws Exception { + var rsp = processWatcher.waitResponse(mqCrawlerOutbox, ProcessService.ProcessId.CRAWLER, message.crawlerMsgId); + + if (rsp.state() != MqMessageState.OK) + error("Crawler failed"); + + return message; + } + + +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlJobExtractorActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlJobExtractorActor.java new file mode 100644 index 00000000..9cadc49a --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/CrawlJobExtractorActor.java @@ -0,0 +1,132 @@ +package nu.marginalia.control.actor.task; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.control.svc.ControlFileStorageService; +import nu.marginalia.control.process.ProcessService; +import nu.marginalia.db.storage.FileStorageService; +import nu.marginalia.db.storage.model.FileStorage; +import 
nu.marginalia.db.storage.model.FileStorageBaseType; +import nu.marginalia.db.storage.model.FileStorageType; +import nu.marginalia.mqsm.StateFactory; +import nu.marginalia.mqsm.graph.AbstractStateGraph; +import nu.marginalia.mqsm.graph.GraphState; +import nu.marginalia.mqsm.graph.ResumeBehavior; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.net.URL; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.atomic.AtomicBoolean; + +@Singleton +public class CrawlJobExtractorActor extends AbstractStateGraph { + + private final Logger logger = LoggerFactory.getLogger(getClass()); + // STATES + + public static final String CREATE_FROM_DB = "CREATE_FROM_DB"; + public static final String CREATE_FROM_LINK = "CREATE_FROM_LINK"; + public static final String END = "END"; + private final ProcessService processService; + private final FileStorageService fileStorageService; + private final ControlFileStorageService controlFileStorageService; + private final ExecutorService executor = Executors.newSingleThreadExecutor(); + + @Inject + public CrawlJobExtractorActor(StateFactory stateFactory, + ProcessService processService, + FileStorageService fileStorageService, + ControlFileStorageService controlFileStorageService + ) { + super(stateFactory); + this.processService = processService; + this.fileStorageService = fileStorageService; + this.controlFileStorageService = controlFileStorageService; + } + + public record CrawlJobExtractorArguments(String description) { } + public record CrawlJobExtractorArgumentsWithURL(String description, String url) { } + + @GraphState(name = CREATE_FROM_LINK, next = END, + resume = ResumeBehavior.ERROR, + description = """ + Download a list of URLs as provided, + and then spawn a CrawlJobExtractor process, + then wait for it to finish. 
+ """ + ) + public void createFromFromLink(CrawlJobExtractorArgumentsWithURL arg) throws Exception { + if (arg == null) { + error("This actor requires a CrawlJobExtractorArgumentsWithURL argument"); + } + + var base = fileStorageService.getStorageBase(FileStorageBaseType.SLOW); + var storage = fileStorageService.allocateTemporaryStorage(base, FileStorageType.CRAWL_SPEC, "crawl-spec", arg.description()); + + Path urlsTxt = storage.asPath().resolve("urls.txt"); + + try (var os = Files.newOutputStream(urlsTxt, StandardOpenOption.CREATE_NEW); + var is = new URL(arg.url()).openStream()) + { + is.transferTo(os); + } + catch (Exception ex) { + controlFileStorageService.flagFileForDeletion(storage.id()); + error("Error downloading " + arg.url()); + } + + final Path path = storage.asPath(); + + run(storage, path.resolve("crawler.spec").toString(), + "-f", urlsTxt.toString()); + } + + + @GraphState(name = CREATE_FROM_DB, next = END, + resume = ResumeBehavior.ERROR, + description = """ + Spawns a CrawlJobExtractor process that loads data from the link database, and wait for it to finish. + """ + ) + public void createFromDB(CrawlJobExtractorArguments arg) throws Exception { + if (arg == null) { + error("This actor requires a CrawlJobExtractorArguments argument"); + } + + var base = fileStorageService.getStorageBase(FileStorageBaseType.SLOW); + var storage = fileStorageService.allocateTemporaryStorage(base, FileStorageType.CRAWL_SPEC, "crawl-spec", arg.description()); + + final Path path = storage.asPath(); + + run(storage, + path.resolve("crawler.spec").toString()); + } + + private void run(FileStorage storage, String... 
args) throws Exception { + + AtomicBoolean hasError = new AtomicBoolean(false); + var future = executor.submit(() -> { + try { + processService.trigger(ProcessService.ProcessId.CRAWL_JOB_EXTRACTOR, + args); + } + catch (Exception ex) { + logger.warn("Error in creating crawl job", ex); + hasError.set(true); + } + }); + future.get(); + + if (hasError.get()) { + controlFileStorageService.flagFileForDeletion(storage.id()); + error("Error triggering adjacency calculation"); + } + + } + +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ExportDataActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ExportDataActor.java new file mode 100644 index 00000000..10227dc9 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ExportDataActor.java @@ -0,0 +1,192 @@ +package nu.marginalia.control.actor.task; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import com.zaxxer.hikari.HikariDataSource; +import lombok.AllArgsConstructor; +import lombok.NoArgsConstructor; +import lombok.With; +import nu.marginalia.db.storage.FileStorageService; +import nu.marginalia.db.storage.model.FileStorageId; +import nu.marginalia.db.storage.model.FileStorageType; +import nu.marginalia.mqsm.StateFactory; +import nu.marginalia.mqsm.graph.AbstractStateGraph; +import nu.marginalia.mqsm.graph.GraphState; +import nu.marginalia.mqsm.graph.ResumeBehavior; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedWriter; +import java.io.OutputStreamWriter; +import java.nio.file.Files; +import java.nio.file.StandardCopyOption; +import java.nio.file.StandardOpenOption; +import java.nio.file.attribute.PosixFilePermissions; +import java.util.zip.GZIPOutputStream; + +@Singleton +public class ExportDataActor extends AbstractStateGraph { + + private static final String blacklistFilename = "blacklist.csv.gz"; + private static 
final String domainsFilename = "domains.csv.gz"; + private static final String linkGraphFilename = "linkgraph.csv.gz"; + + + // STATES + public static final String INITIAL = "INITIAL"; + public static final String EXPORT_DOMAINS = "EXPORT-DOMAINS"; + public static final String EXPORT_BLACKLIST = "EXPORT-BLACKLIST"; + public static final String EXPORT_LINK_GRAPH = "EXPORT-LINK-GRAPH"; + + public static final String END = "END"; + private final FileStorageService storageService; + private final HikariDataSource dataSource; + private final Logger logger = LoggerFactory.getLogger(getClass()); + + @AllArgsConstructor @With @NoArgsConstructor + public static class Message { + public FileStorageId storageId = null; + }; + + @Inject + public ExportDataActor(StateFactory stateFactory, + FileStorageService storageService, + HikariDataSource dataSource) + { + super(stateFactory); + this.storageService = storageService; + this.dataSource = dataSource; + } + + @GraphState(name = INITIAL, + next = EXPORT_BLACKLIST, + description = """ + Find EXPORT storage area, then transition to EXPORT-BLACKLIST. + """) + public Message init(Integer i) throws Exception { + + var storage = storageService.getStorageByType(FileStorageType.EXPORT); + if (storage == null) error("Bad storage id"); + + return new Message().withStorageId(storage.id()); + } + + @GraphState(name = EXPORT_BLACKLIST, + next = EXPORT_DOMAINS, + resume = ResumeBehavior.ERROR, + description = """ + Export the blacklist from the database to the EXPORT storage area. 
+ """ + ) + public Message exportBlacklist(Message message) throws Exception { + var storage = storageService.getStorage(message.storageId); + var tmpFile = Files.createTempFile(storage.asPath(), "export", ".csv.gz", + PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--"))); + + try (var bw = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(Files.newOutputStream(tmpFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)))); + var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement("SELECT URL_DOMAIN FROM EC_DOMAIN_BLACKLIST"); + ) + { + stmt.setFetchSize(1000); + var rs = stmt.executeQuery(); + while (rs.next()) { + bw.write(rs.getString(1)); + bw.write("\n"); + } + Files.move(tmpFile, storage.asPath().resolve(blacklistFilename), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING); + } + catch (Exception ex) { + logger.error("Failed to export blacklist", ex); + error("Failed to export blacklist"); + } + finally { + Files.deleteIfExists(tmpFile); + } + + return message; + } + + @GraphState( + name = EXPORT_DOMAINS, + next = EXPORT_LINK_GRAPH, + resume = ResumeBehavior.RETRY, + description = """ + Export known domains to the EXPORT storage area. 
+ """ + ) + public Message exportDomains(Message message) throws Exception { + var storage = storageService.getStorage(message.storageId); + var tmpFile = Files.createTempFile(storage.asPath(), "export", ".csv.gz", + PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--"))); + + try (var bw = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(Files.newOutputStream(tmpFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)))); + var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement("SELECT DOMAIN_NAME, ID, INDEXED, STATE FROM EC_DOMAIN"); + ) + { + stmt.setFetchSize(1000); + var rs = stmt.executeQuery(); + while (rs.next()) { + bw.write(rs.getString("DOMAIN_NAME")); + bw.write(","); + bw.write(rs.getString("ID")); + bw.write(","); + bw.write(rs.getString("INDEXED")); + bw.write(","); + bw.write(rs.getString("STATE")); + bw.write("\n"); + } + Files.move(tmpFile, storage.asPath().resolve(domainsFilename), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING); + } + catch (Exception ex) { + logger.error("Failed to export domains", ex); + error("Failed to export domains"); + } + finally { + Files.deleteIfExists(tmpFile); + } + + return message; + } + + @GraphState( + name = EXPORT_LINK_GRAPH, + next = END, + resume = ResumeBehavior.RETRY, + description = """ + Export the domain link graph to the EXPORT storage area. 
+ """ + ) + public Message exportLinkGraph(Message message) throws Exception { + var storage = storageService.getStorage(message.storageId); + var tmpFile = Files.createTempFile(storage.asPath(), "export", ".csv.gz", + PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--"))); + + try (var bw = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(Files.newOutputStream(tmpFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)))); + var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK"); + ) + { + stmt.setFetchSize(1000); + var rs = stmt.executeQuery(); + while (rs.next()) { + bw.write(rs.getString("SOURCE_DOMAIN_ID")); + bw.write(","); + bw.write(rs.getString("DEST_DOMAIN_ID")); + bw.write("\n"); + } + Files.move(tmpFile, storage.asPath().resolve(linkGraphFilename), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING); + } + catch (Exception ex) { + logger.error("Failed to export link graph", ex); + error("Failed to export link graph"); + } + finally { + Files.deleteIfExists(tmpFile); + } + + return message; + } + +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ReconvertAndLoadActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ReconvertAndLoadActor.java new file mode 100644 index 00000000..06c982ff --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ReconvertAndLoadActor.java @@ -0,0 +1,261 @@ +package nu.marginalia.control.actor.task; + +import com.google.gson.Gson; +import com.google.inject.Inject; +import com.google.inject.Singleton; +import lombok.AllArgsConstructor; +import lombok.NoArgsConstructor; +import lombok.With; +import nu.marginalia.control.process.ProcessOutboxes; +import nu.marginalia.control.process.ProcessService; +import 
nu.marginalia.index.client.IndexClient; +import nu.marginalia.index.client.IndexMqEndpoints; +import nu.marginalia.mqapi.converting.ConvertAction; +import nu.marginalia.mqapi.converting.ConvertRequest; +import nu.marginalia.mqapi.loading.LoadRequest; +import nu.marginalia.db.storage.FileStorageService; +import nu.marginalia.db.storage.model.FileStorageBaseType; +import nu.marginalia.db.storage.model.FileStorageId; +import nu.marginalia.db.storage.model.FileStorageType; +import nu.marginalia.mq.MqMessageState; +import nu.marginalia.mq.outbox.MqOutbox; +import nu.marginalia.mqsm.StateFactory; +import nu.marginalia.mqsm.graph.AbstractStateGraph; +import nu.marginalia.mqsm.graph.GraphState; +import nu.marginalia.mqsm.graph.ResumeBehavior; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.nio.file.Files; +import java.nio.file.StandardCopyOption; + +@Singleton +public class ReconvertAndLoadActor extends AbstractStateGraph { + + // STATES + + public static final String INITIAL = "INITIAL"; + public static final String RECONVERT = "RECONVERT"; + public static final String RECONVERT_WAIT = "RECONVERT-WAIT"; + public static final String LOAD = "LOAD"; + public static final String LOAD_WAIT = "LOAD-WAIT"; + public static final String SWAP_LEXICON = "SWAP-LEXICON"; + + public static final String REPARTITION = "REPARTITION"; + public static final String REPARTITION_WAIT = "REPARTITION-WAIT"; + public static final String REINDEX = "REINDEX"; + public static final String REINDEX_WAIT = "REINDEX-WAIT"; + public static final String END = "END"; + private final ActorProcessWatcher processWatcher; + private final MqOutbox mqConverterOutbox; + private final MqOutbox mqLoaderOutbox; + private final MqOutbox indexOutbox; + private final FileStorageService storageService; + private final Gson gson; + private final Logger logger = LoggerFactory.getLogger(getClass()); + + + @AllArgsConstructor @With @NoArgsConstructor + public static class Message { + public 
FileStorageId crawlStorageId = null; + public FileStorageId processedStorageId = null; + public long converterMsgId = 0L; + public long loaderMsgId = 0L; + }; + + @Inject + public ReconvertAndLoadActor(StateFactory stateFactory, + ActorProcessWatcher processWatcher, + ProcessOutboxes processOutboxes, + FileStorageService storageService, + IndexClient indexClient, + Gson gson + ) + { + super(stateFactory); + this.processWatcher = processWatcher; + this.indexOutbox = indexClient.outbox(); + this.mqConverterOutbox = processOutboxes.getConverterOutbox(); + this.mqLoaderOutbox = processOutboxes.getLoaderOutbox(); + this.storageService = storageService; + this.gson = gson; + } + + @GraphState(name = INITIAL, + next = RECONVERT, + description = """ + Validate the input and transition to RECONVERT + """) + public Message init(FileStorageId crawlStorageId) throws Exception { + if (null == crawlStorageId) { + error("This Actor requires a FileStorageId to be passed in as a parameter to INITIAL"); + } + + var storage = storageService.getStorage(crawlStorageId); + + if (storage == null) error("Bad storage id"); + if (storage.type() != FileStorageType.CRAWL_DATA) error("Bad storage type " + storage.type()); + + return new Message().withCrawlStorageId(crawlStorageId); + } + + @GraphState(name = RECONVERT, + next = RECONVERT_WAIT, + resume = ResumeBehavior.ERROR, + description = """ + Allocate a storage area for the processed data, + then send a convert request to the converter and transition to RECONVERT_WAIT. 
+ """ + ) + public Message reconvert(Message message) throws Exception { + // Create processed data area + + var toProcess = storageService.getStorage(message.crawlStorageId); + + var base = storageService.getStorageBase(FileStorageBaseType.SLOW); + var processedArea = storageService.allocateTemporaryStorage(base, FileStorageType.PROCESSED_DATA, "processed-data", + "Processed Data; " + toProcess.description()); + + storageService.relateFileStorages(toProcess.id(), processedArea.id()); + + // Pre-send convert request + var request = new ConvertRequest(ConvertAction.ConvertCrawlData, + null, + message.crawlStorageId, + processedArea.id()); + long id = mqConverterOutbox.sendAsync(ConvertRequest.class.getSimpleName(), gson.toJson(request)); + + return message + .withProcessedStorageId(processedArea.id()) + .withConverterMsgId(id); + } + + @GraphState( + name = RECONVERT_WAIT, + next = LOAD, + resume = ResumeBehavior.RETRY, + description = """ + Wait for the converter to finish processing the data. + """ + ) + public Message reconvertWait(Message message) throws Exception { + var rsp = processWatcher.waitResponse(mqConverterOutbox, ProcessService.ProcessId.CONVERTER, message.converterMsgId); + + if (rsp.state() != MqMessageState.OK) + error("Converter failed"); + + return message; + } + + + @GraphState( + name = LOAD, + next = LOAD_WAIT, + resume = ResumeBehavior.ERROR, + description = """ + Send a load request to the loader and transition to LOAD_WAIT. + """) + public Message load(Message message) throws Exception { + + var request = new LoadRequest(message.processedStorageId); + long id = mqLoaderOutbox.sendAsync(LoadRequest.class.getSimpleName(), gson.toJson(request)); + + return message.withLoaderMsgId(id); + + } + + @GraphState( + name = LOAD_WAIT, + next = SWAP_LEXICON, + resume = ResumeBehavior.RETRY, + description = """ + Wait for the loader to finish loading the data. 
+ """ + ) + public void loadWait(Message message) throws Exception { + var rsp = processWatcher.waitResponse(mqLoaderOutbox, ProcessService.ProcessId.LOADER, message.loaderMsgId); + + if (rsp.state() != MqMessageState.OK) + error("Loader failed"); + } + + + + @GraphState( + name = SWAP_LEXICON, + next = REPARTITION, + resume = ResumeBehavior.RETRY, + description = """ + Move the lexicon from the LEXICON_STAGING area to the LEXICON_LIVE area, + then instruct the index-service to reload the lexicon. + """ + ) + public void swapLexicon(Message message) throws Exception { + var live = storageService.getStorageByType(FileStorageType.LEXICON_LIVE); + + var staging = storageService.getStorageByType(FileStorageType.LEXICON_STAGING); + var fromSource = staging.asPath().resolve("dictionary.dat"); + var liveDest = live.asPath().resolve("dictionary.dat"); + + // Swap in new lexicon + logger.info("Moving " + fromSource + " to " + liveDest); + Files.move(fromSource, liveDest, StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.ATOMIC_MOVE); + } + + + @GraphState( + name = REPARTITION, + next = REPARTITION_WAIT, + description = """ + Instruct the index-service to repartition the index then transition to REPARTITION_WAIT. + """ + ) + public Long repartition() throws Exception { + return indexOutbox.sendAsync(IndexMqEndpoints.INDEX_REPARTITION, ""); + } + + @GraphState( + name = REPARTITION_WAIT, + next = REINDEX, + resume = ResumeBehavior.RETRY, + description = """ + Wait for the index-service to finish repartitioning the index. + """ + ) + public void repartitionReply(Long id) throws Exception { + var rsp = indexOutbox.waitResponse(id); + + if (rsp.state() != MqMessageState.OK) { + error("Repartition failed"); + } + } + + @GraphState( + name = REINDEX, + next = REINDEX_WAIT, + description = """ + Instruct the index-service to reindex the data then transition to REINDEX_WAIT. 
+ """ + ) + public Long reindex() throws Exception { + return indexOutbox.sendAsync(IndexMqEndpoints.INDEX_REINDEX, ""); + } + + @GraphState( + name = REINDEX_WAIT, + next = END, + resume = ResumeBehavior.RETRY, + description = """ + Wait for the index-service to finish reindexing the data. + """ + ) + public void reindexReply(Long id) throws Exception { + var rsp = indexOutbox.waitResponse(id); + + if (rsp.state() != MqMessageState.OK) { + error("Repartition failed"); + } + } + + +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/RecrawlActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/RecrawlActor.java new file mode 100644 index 00000000..c4253a0d --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/RecrawlActor.java @@ -0,0 +1,141 @@ +package nu.marginalia.control.actor.task; + +import com.google.gson.Gson; +import com.google.inject.Inject; +import com.google.inject.Singleton; +import lombok.AllArgsConstructor; +import lombok.NoArgsConstructor; +import lombok.With; +import nu.marginalia.control.process.ProcessOutboxes; +import nu.marginalia.control.process.ProcessService; +import nu.marginalia.db.storage.FileStorageService; +import nu.marginalia.db.storage.model.FileStorage; +import nu.marginalia.db.storage.model.FileStorageId; +import nu.marginalia.db.storage.model.FileStorageType; +import nu.marginalia.mq.MqMessageState; +import nu.marginalia.mq.outbox.MqOutbox; +import nu.marginalia.mqapi.crawling.CrawlRequest; +import nu.marginalia.mqsm.StateFactory; +import nu.marginalia.mqsm.graph.AbstractStateGraph; +import nu.marginalia.mqsm.graph.GraphState; +import nu.marginalia.mqsm.graph.ResumeBehavior; + +import java.nio.file.Files; +import java.sql.SQLException; +import java.util.Optional; + +@Singleton +public class RecrawlActor extends AbstractStateGraph { + + // STATES + + public static final String INITIAL = "INITIAL"; + 
public static final String CRAWL = "CRAWL"; + public static final String CRAWL_WAIT = "CRAWL-WAIT"; + public static final String END = "END"; + private final MqOutbox mqCrawlerOutbox; + private final FileStorageService storageService; + private final Gson gson; + private final ActorProcessWatcher processWatcher; + + + @AllArgsConstructor @With @NoArgsConstructor + public static class RecrawlMessage { + public FileStorageId crawlSpecId = null; + public FileStorageId crawlStorageId = null; + public long crawlerMsgId = 0L; + }; + + public static RecrawlMessage recrawlFromCrawlData(FileStorageId crawlData) { + return new RecrawlMessage(null, crawlData, 0L); + } + public static RecrawlMessage recrawlFromCrawlDataAndCralSpec(FileStorageId crawlData, FileStorageId crawlSpec) { + return new RecrawlMessage(crawlSpec, crawlData, 0L); + } + + @Inject + public RecrawlActor(StateFactory stateFactory, + ActorProcessWatcher processWatcher, + ProcessOutboxes processOutboxes, + FileStorageService storageService, + Gson gson + ) + { + super(stateFactory); + this.processWatcher = processWatcher; + this.mqCrawlerOutbox = processOutboxes.getCrawlerOutbox(); + this.storageService = storageService; + this.gson = gson; + } + + @GraphState(name = INITIAL, + next = CRAWL, + description = """ + Validate the input and transition to CRAWL + """) + public RecrawlMessage init(RecrawlMessage recrawlMessage) throws Exception { + if (null == recrawlMessage) { + error("This Actor requires a message as an argument"); + } + + var crawlStorage = storageService.getStorage(recrawlMessage.crawlStorageId); + FileStorage specStorage; + + if (recrawlMessage.crawlSpecId != null) { + specStorage = storageService.getStorage(recrawlMessage.crawlSpecId); + } + else { + specStorage = getSpec(crawlStorage).orElse(null); + } + + if (specStorage == null) error("Bad storage id"); + if (specStorage.type() != FileStorageType.CRAWL_SPEC) error("Bad storage type " + specStorage.type()); + if (crawlStorage == null) 
error("Bad storage id"); + if (crawlStorage.type() != FileStorageType.CRAWL_DATA) error("Bad storage type " + specStorage.type()); + + Files.deleteIfExists(crawlStorage.asPath().resolve("crawler.log")); + + return recrawlMessage + .withCrawlSpecId(specStorage.id()); + } + + private Optional getSpec(FileStorage crawlStorage) throws SQLException { + return storageService.getSourceFromStorage(crawlStorage) + .stream() + .filter(storage -> storage.type().equals(FileStorageType.CRAWL_SPEC)) + .findFirst(); + } + + @GraphState(name = CRAWL, + next = CRAWL_WAIT, + resume = ResumeBehavior.ERROR, + description = """ + Send a crawl request to the crawler and transition to CRAWL_WAIT. + """ + ) + public RecrawlMessage crawl(RecrawlMessage recrawlMessage) throws Exception { + // Pre-send crawl request + var request = new CrawlRequest(recrawlMessage.crawlSpecId, recrawlMessage.crawlStorageId); + long id = mqCrawlerOutbox.sendAsync(CrawlRequest.class.getSimpleName(), gson.toJson(request)); + + return recrawlMessage.withCrawlerMsgId(id); + } + + @GraphState( + name = CRAWL_WAIT, + next = END, + resume = ResumeBehavior.RETRY, + description = """ + Wait for the crawler to finish retrieving the data. 
+ """ + ) + public RecrawlMessage crawlerWait(RecrawlMessage recrawlMessage) throws Exception { + var rsp = processWatcher.waitResponse(mqCrawlerOutbox, ProcessService.ProcessId.CRAWLER, recrawlMessage.crawlerMsgId); + + if (rsp.state() != MqMessageState.OK) + error("Crawler failed"); + + return recrawlMessage; + } + +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/TriggerAdjacencyCalculationActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/TriggerAdjacencyCalculationActor.java new file mode 100644 index 00000000..7441b437 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/TriggerAdjacencyCalculationActor.java @@ -0,0 +1,59 @@ +package nu.marginalia.control.actor.task; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.control.process.ProcessService; +import nu.marginalia.mqsm.StateFactory; +import nu.marginalia.mqsm.graph.AbstractStateGraph; +import nu.marginalia.mqsm.graph.GraphState; +import nu.marginalia.mqsm.graph.ResumeBehavior; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.atomic.AtomicBoolean; + +@Singleton +public class TriggerAdjacencyCalculationActor extends AbstractStateGraph { + + private final Logger logger = LoggerFactory.getLogger(getClass()); + // STATES + + private static final String INITIAL = "INITIAL"; + private static final String END = "END"; + private final ProcessService processService; + private final ExecutorService executor = Executors.newSingleThreadExecutor(); + + @Inject + public TriggerAdjacencyCalculationActor(StateFactory stateFactory, + ProcessService processService) { + super(stateFactory); + this.processService = processService; + } + + @GraphState(name = INITIAL, next = END, + resume = ResumeBehavior.ERROR, + 
description = """ + Spawns a WebsitesAdjacenciesCalculator process and waits for it to finish. + """ + ) + public void init(Integer unused) throws Exception { + AtomicBoolean hasError = new AtomicBoolean(false); + var future = executor.submit(() -> { + try { + processService.trigger(ProcessService.ProcessId.ADJACENCIES_CALCULATOR, "load"); + } + catch (Exception ex) { + logger.warn("Error triggering adjacency calculation", ex); + hasError.set(true); + } + }); + future.get(); + + if (hasError.get()) { + error("Error triggering adjacency calculation"); + } + } + +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/TruncateLinkDatabase.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/TruncateLinkDatabase.java new file mode 100644 index 00000000..355620e5 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/TruncateLinkDatabase.java @@ -0,0 +1,78 @@ +package nu.marginalia.control.actor.task; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import com.zaxxer.hikari.HikariDataSource; +import lombok.AllArgsConstructor; +import lombok.NoArgsConstructor; +import lombok.With; +import nu.marginalia.db.storage.model.FileStorageId; +import nu.marginalia.mqsm.StateFactory; +import nu.marginalia.mqsm.graph.AbstractStateGraph; +import nu.marginalia.mqsm.graph.GraphState; +import nu.marginalia.mqsm.graph.ResumeBehavior; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.sql.SQLException; + +@Singleton +public class TruncateLinkDatabase extends AbstractStateGraph { + + + // STATES + public static final String INITIAL = "INITIAL"; + public static final String FLUSH_DATABASE = "FLUSH_DATABASE"; + + public static final String END = "END"; + private final HikariDataSource dataSource; + private final Logger logger = LoggerFactory.getLogger(getClass()); + + @AllArgsConstructor @With @NoArgsConstructor + public 
static class Message { + public FileStorageId storageId = null; + }; + + @Inject + public TruncateLinkDatabase(StateFactory stateFactory, + HikariDataSource dataSource) + { + super(stateFactory); + this.dataSource = dataSource; + } + + @GraphState(name = INITIAL, + next = FLUSH_DATABASE, + description = """ + Initial stage + """) + public void init(Integer i) throws Exception { + + } + + @GraphState(name = FLUSH_DATABASE, + next = END, + resume = ResumeBehavior.ERROR, + description = """ + Truncate the domain and link tables. + """ + ) + public void exportBlacklist() throws Exception { + try (var conn = dataSource.getConnection(); + var stmt = conn.createStatement()) + { + stmt.executeUpdate("SET FOREIGN_KEY_CHECKS = 0"); + stmt.executeUpdate("TRUNCATE TABLE EC_PAGE_DATA"); + stmt.executeUpdate("TRUNCATE TABLE EC_URL"); + stmt.executeUpdate("TRUNCATE TABLE EC_DOMAIN_LINK"); + stmt.executeUpdate("TRUNCATE TABLE DOMAIN_METADATA"); + stmt.executeUpdate("SET FOREIGN_KEY_CHECKS = 1"); + } + catch (SQLException ex) { + logger.error("Failed to truncate tables", ex); + error("Failed to truncate tables"); + } + } + + +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/model/ActorRunState.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/ActorRunState.java new file mode 100644 index 00000000..152af472 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/ActorRunState.java @@ -0,0 +1,18 @@ +package nu.marginalia.control.model; + +public record ActorRunState(String name, String state, boolean terminal, boolean canStart) { + public String stateIcon() { + if (terminal) { + return "\uD83D\uDE34"; + } + else if (state.equals("MONITOR")) { + return "\uD83D\uDD26"; + } + else if (state.endsWith("WAIT") || state.endsWith("REPLY")) { + return "\uD83D\uDD59"; + } + else { + return "\uD83C\uDFC3"; + } + } +} diff --git 
a/code/services-core/control-service/src/main/java/nu/marginalia/control/model/ActorState.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/ActorState.java new file mode 100644 index 00000000..676f3ed2 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/ActorState.java @@ -0,0 +1,19 @@ +package nu.marginalia.control.model; + +import nu.marginalia.mqsm.graph.GraphState; + +import java.util.Arrays; +import java.util.List; +import java.util.stream.Stream; + +public record ActorState(String name, + boolean current, + List transitions, + String description) { + public ActorState(GraphState gs, boolean current) { + this(gs.name(), current, toTransitions(gs.next(), gs.transitions()), gs.description()); + } + private static List toTransitions(String next, String[] transitions) { + return Stream.concat(Stream.of(next), Arrays.stream(transitions)).distinct().toList(); + } +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/model/ActorStateGraph.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/ActorStateGraph.java new file mode 100644 index 00000000..a9d7b783 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/ActorStateGraph.java @@ -0,0 +1,51 @@ +package nu.marginalia.control.model; + +import nu.marginalia.mqsm.graph.AbstractStateGraph; +import nu.marginalia.mqsm.graph.GraphState; +import nu.marginalia.mqsm.state.MachineState; + +import java.util.*; +import java.util.stream.Collectors; + +public record ActorStateGraph(List states) { + + public ActorStateGraph(AbstractStateGraph graph, MachineState currentState) { + this(getStateList(graph, currentState)); + } + + private static List getStateList( + AbstractStateGraph graph, + MachineState currentState) + { + Map declaredStates = graph.declaredStates().stream().collect(Collectors.toMap(GraphState::name, gs -> gs)); + Set seenStates = 
new HashSet<>(declaredStates.size()); + LinkedList edge = new LinkedList<>(); + + List statesList = new ArrayList<>(declaredStates.size()); + + edge.add(declaredStates.get("INITIAL")); + + while (!edge.isEmpty()) { + var first = edge.removeFirst(); + if (first == null || !seenStates.add(first)) { + continue; + } + statesList.add(new ActorState(first, currentState.name().equals(first.name()))); + + edge.add(declaredStates.get(first.next())); + + for (var transition : first.transitions()) { + edge.add(declaredStates.get(transition)); + } + } + + if (!declaredStates.containsKey("ERROR")) { + statesList.add(new ActorState("ERROR", currentState.name().equals("ERROR"), List.of(), "Terminal error state")); + } + if (!declaredStates.containsKey("END")) { + statesList.add(new ActorState("END", currentState.name().equals("END"), List.of(), "The machine terminated successfully")); + } + + return statesList; + } +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/model/ApiKeyModel.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/ApiKeyModel.java new file mode 100644 index 00000000..15eda2ba --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/ApiKeyModel.java @@ -0,0 +1,2 @@ +package nu.marginalia.control.model; +public record ApiKeyModel(String licenseKey, String license, String name, String email, int rate) {} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/model/BlacklistedDomainModel.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/BlacklistedDomainModel.java new file mode 100644 index 00000000..e7db4805 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/BlacklistedDomainModel.java @@ -0,0 +1,6 @@ +package nu.marginalia.control.model; + +import nu.marginalia.model.EdgeDomain; + +public record BlacklistedDomainModel(EdgeDomain domain, String 
comment) { +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/model/DomainComplaintCategory.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/DomainComplaintCategory.java new file mode 100644 index 00000000..d1743ba9 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/DomainComplaintCategory.java @@ -0,0 +1,28 @@ +package nu.marginalia.control.model; + +public enum DomainComplaintCategory { + SPAM("spam"), + FREEBOOTING("freebooting"), + BROKEN("broken"), + SHOCK("shock"), + BLACKLIST("blacklist"), + UNKNOWN("unknown"); + + private final String categoryName; + + DomainComplaintCategory(String categoryName) { + this.categoryName = categoryName; + } + + public String categoryName() { + return categoryName; + } + public static DomainComplaintCategory fromCategoryName(String categoryName) { + for (DomainComplaintCategory category : values()) { + if (category.categoryName().equals(categoryName)) { + return category; + } + } + return UNKNOWN; + } +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/model/DomainComplaintModel.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/DomainComplaintModel.java new file mode 100644 index 00000000..603b6fc8 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/DomainComplaintModel.java @@ -0,0 +1,17 @@ +package nu.marginalia.control.model; + +public record DomainComplaintModel(String domain, + DomainComplaintCategory category, + String description, + String sample, + String decision, + String fileDate, + String reviewDate, + boolean reviewed) +{ + + public boolean isAppeal() { + return category == DomainComplaintCategory.BLACKLIST; + } + +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/model/EventLogEntry.java 
b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/EventLogEntry.java new file mode 100644 index 00000000..d044ca91 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/EventLogEntry.java @@ -0,0 +1,19 @@ +package nu.marginalia.control.model; + +public record EventLogEntry( + String serviceName, + String instanceFull, + String eventTime, + String eventType, + String eventMessage) +{ + public String instance() { + return instanceFull.substring(0, 8); + } + public String instanceColor() { + return '#' + instanceFull.substring(0, 6); + } + public String instanceColor2() { + return '#' + instanceFull.substring(25, 31); + } +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/model/FileStorageBaseWithStorage.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/FileStorageBaseWithStorage.java new file mode 100644 index 00000000..7411e3c7 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/FileStorageBaseWithStorage.java @@ -0,0 +1,10 @@ +package nu.marginalia.control.model; + +import nu.marginalia.db.storage.model.FileStorageBase; + +import java.util.List; + +public record FileStorageBaseWithStorage(FileStorageBase base, + List storage) +{ +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/model/FileStorageFileModel.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/FileStorageFileModel.java new file mode 100644 index 00000000..41da73e8 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/FileStorageFileModel.java @@ -0,0 +1,7 @@ +package nu.marginalia.control.model; + +public record FileStorageFileModel(String filename, + String mTime, + String size) +{ +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithActions.java 
b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithActions.java new file mode 100644 index 00000000..4ef9a394 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithActions.java @@ -0,0 +1,24 @@ +package nu.marginalia.control.model; + +import nu.marginalia.db.storage.model.FileStorage; +import nu.marginalia.db.storage.model.FileStorageType; + +public record FileStorageWithActions(FileStorage storage) { + public boolean isCrawlable() { + return storage.type() == FileStorageType.CRAWL_SPEC; + } + public boolean isRecrawlable() { + return storage.type() == FileStorageType.CRAWL_DATA; + } + + public boolean isLoadable() { + return storage.type() == FileStorageType.PROCESSED_DATA; + } + public boolean isConvertible() { + return storage.type() == FileStorageType.CRAWL_DATA; + } + public boolean isDeletable() { + return storage.type() == FileStorageType.PROCESSED_DATA + || storage.type() == FileStorageType.BACKUP; + } +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithRelatedEntries.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithRelatedEntries.java new file mode 100644 index 00000000..608ccdca --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithRelatedEntries.java @@ -0,0 +1,13 @@ +package nu.marginalia.control.model; + +import nu.marginalia.db.storage.model.FileStorage; +import nu.marginalia.db.storage.model.FileStorageType; + +import java.util.List; + +public record FileStorageWithRelatedEntries(FileStorageWithActions self, + List related, + List files + ) { + +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/model/MessageQueueEntry.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/MessageQueueEntry.java new file mode 100644 index 
00000000..c90bda76 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/MessageQueueEntry.java @@ -0,0 +1,55 @@ +package nu.marginalia.control.model; + +public record MessageQueueEntry ( + long id, + long relatedId, + String senderInbox, + String recipientInbox, + String function, + String payload, + String ownerInstanceFull, + long ownerTick, + String state, + String createdTime, + String updatedTime, + int ttl +) +{ + public boolean hasRelatedMessage() { + return relatedId > 0; + } + public String ownerInstance() { + if (ownerInstanceFull == null) { + return ""; + } + + return ownerInstanceFull.substring(0, 8); + } + public String ownerInstanceColor() { + if (ownerInstanceFull == null) { + return "#000000"; + } + return '#' + ownerInstanceFull.substring(0, 6); + } + public String ownerInstanceColor2() { + if (ownerInstanceFull == null) { + return "#000000"; + } + + return '#' + ownerInstanceFull.substring(25, 31); + } + + public String stateCode() { + if (state == null) { + return ""; + } + return switch (state) { + case "NEW" -> "\uD83D\uDC23"; + case "ACK" -> "\uD83D\uDD27"; + case "ERR" -> "\u274C"; + case "OK" -> "\u2705"; + case "DEAD" -> "\uD83D\uDC80"; + default -> ""; + }; + } +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java new file mode 100644 index 00000000..accb3351 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java @@ -0,0 +1,60 @@ +package nu.marginalia.control.model; + +import nu.marginalia.control.process.ProcessService; + +public record ProcessHeartbeat( + String processId, + String processBase, + String uuidFull, + double lastSeenMillis, + Integer progress, + String status +) { + public String uuid() { + return uuidFull.substring(0, 8); + } + public String uuidColor() { + 
return '#' + uuidFull.substring(0, 6); + } + public String uuidColor2() { + return '#' + uuidFull.substring(25, 31); + } + public boolean isMissing() { + return lastSeenMillis > 10000; + } + public boolean isStopped() { + return "STOPPED".equals(status); + } + public boolean isRunning() { + return "RUNNING".equals(status); + } + public String progressStyle() { + if ("RUNNING".equals(status) && progress != null) { + return """ + background: linear-gradient(90deg, #ccc 0%%, #ccc %d%%, #fff %d%%) + """.formatted(progress, progress, progress); + } + return ""; + } + + public ProcessService.ProcessId getProcessId() { + return switch (processBase) { + case "converter" -> ProcessService.ProcessId.CONVERTER; + case "crawler" -> ProcessService.ProcessId.CRAWLER; + case "loader" -> ProcessService.ProcessId.LOADER; + case "website-adjacencies-calculator" -> ProcessService.ProcessId.ADJACENCIES_CALCULATOR; + case "crawl-job-extractor" -> ProcessService.ProcessId.CRAWL_JOB_EXTRACTOR; + default -> null; + }; + } + + public String displayName() { + var pid = getProcessId(); + if (pid != null) { + return pid.name(); + } + else { + return processBase; + } + } +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/model/ServiceHeartbeat.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/ServiceHeartbeat.java new file mode 100644 index 00000000..f43d9058 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/ServiceHeartbeat.java @@ -0,0 +1,22 @@ +package nu.marginalia.control.model; + +public record ServiceHeartbeat( + String serviceId, + String serviceBase, + String uuidFull, + double lastSeenMillis, + boolean alive +) { + public boolean isMissing() { + return lastSeenMillis > 10000; + } + public String uuid() { + return uuidFull.substring(0, 8); + } + public String uuidColor() { + return '#' + uuidFull.substring(0, 6); + } + public String uuidColor2() { + return '#' + 
uuidFull.substring(25, 31); + } +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/model/TaskHeartbeat.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/TaskHeartbeat.java new file mode 100644 index 00000000..84d5bcd5 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/TaskHeartbeat.java @@ -0,0 +1,29 @@ +package nu.marginalia.control.model; + + +public record TaskHeartbeat( + String taskName, + String taskBase, + String serviceUuuidFull, + double lastSeenMillis, + Integer progress, + String stage, + String status +) { + public boolean isStopped() { + return "STOPPED".equals(status); + } + public boolean isRunning() { + return "RUNNING".equals(status); + } + + public String progressStyle() { + if ("RUNNING".equals(status) && progress != null) { + return """ + background: linear-gradient(90deg, #ccc 0%%, #ccc %d%%, #fff %d%%) + """.formatted(progress, progress, progress); + } + return ""; + } + +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/process/ProcessOutboxes.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/process/ProcessOutboxes.java new file mode 100644 index 00000000..b5b74406 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/process/ProcessOutboxes.java @@ -0,0 +1,47 @@ +package nu.marginalia.control.process; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.mqapi.ProcessInboxNames; +import nu.marginalia.mq.outbox.MqOutbox; +import nu.marginalia.mq.persistence.MqPersistence; +import nu.marginalia.service.server.BaseServiceParams; + +@Singleton +public class ProcessOutboxes { + private final MqOutbox converterOutbox; + private final MqOutbox loaderOutbox; + private final MqOutbox crawlerOutbox; + + @Inject + public ProcessOutboxes(BaseServiceParams params, MqPersistence persistence) { + 
converterOutbox = new MqOutbox(persistence, + ProcessInboxNames.CONVERTER_INBOX, + params.configuration.serviceName(), + params.configuration.instanceUuid() + ); + loaderOutbox = new MqOutbox(persistence, + ProcessInboxNames.LOADER_INBOX, + params.configuration.serviceName(), + params.configuration.instanceUuid() + ); + crawlerOutbox = new MqOutbox(persistence, + ProcessInboxNames.CRAWLER_INBOX, + params.configuration.serviceName(), + params.configuration.instanceUuid() + ); + } + + + public MqOutbox getConverterOutbox() { + return converterOutbox; + } + + public MqOutbox getLoaderOutbox() { + return loaderOutbox; + } + + public MqOutbox getCrawlerOutbox() { + return crawlerOutbox; + } +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/process/ProcessService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/process/ProcessService.java new file mode 100644 index 00000000..25583f43 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/process/ProcessService.java @@ -0,0 +1,156 @@ +package nu.marginalia.control.process; + +import com.google.inject.name.Named; +import nu.marginalia.service.control.ServiceEventLog; +import nu.marginalia.service.server.BaseServiceParams; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.slf4j.Marker; +import org.slf4j.MarkerFactory; + +import javax.inject.Inject; +import javax.inject.Singleton; +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.*; +import java.util.concurrent.ConcurrentHashMap; + +@Singleton +public class ProcessService { + private final Logger logger = LoggerFactory.getLogger(getClass()); + private final Marker processMarker = MarkerFactory.getMarker("PROCESS"); + + private final ServiceEventLog eventLog; + private final Path distPath; + + private final ConcurrentHashMap processes = new ConcurrentHashMap<>(); + 
+ public enum ProcessId { + CRAWLER("crawler-process/bin/crawler-process"), + CONVERTER("converter-process/bin/converter-process"), + LOADER("loader-process/bin/loader-process"), + ADJACENCIES_CALCULATOR("website-adjacencies-calculator/bin/website-adjacencies-calculator"), + CRAWL_JOB_EXTRACTOR("crawl-job-extractor-process/bin/crawl-job-extractor-process") + ; + + public final String path; + ProcessId(String path) { + this.path = path; + } + }; + + @Inject + public ProcessService(BaseServiceParams params, + @Named("distPath") Path distPath) { + this.eventLog = params.eventLog; + this.distPath = distPath; + } + + public boolean trigger(ProcessId processId) throws Exception { + return trigger(processId, new String[0]); + } + + public boolean trigger(ProcessId processId, String... parameters) throws Exception { + final String processPath = distPath.resolve(processId.path).toString(); + final String[] env = createEnvironmentVariables(); + final String[] args = createCommandArguments(processPath, parameters); + + Process process; + + if (!Files.exists(Path.of(processPath))) { + logger.error("Process not found: {}", processPath); + return false; + } + + logger.info("Starting process: {}: {} // {}", processId, Arrays.toString(args), Arrays.toString(env)); + + synchronized (processes) { + if (processes.containsKey(processId)) return false; + process = Runtime.getRuntime().exec(args, env); + processes.put(processId, process); + } + + try (var es = new BufferedReader(new InputStreamReader(process.getErrorStream())); + var os = new BufferedReader(new InputStreamReader(process.getInputStream())) + ) { + eventLog.logEvent("PROCESS-STARTED", processId.toString()); + + while (process.isAlive()) { + if (es.ready()) + logger.warn(processMarker, es.readLine()); + if (os.ready()) + logger.info(processMarker, os.readLine()); + } + + final int returnCode = process.waitFor(); + logger.info("Process {} terminated with code {}", processId, returnCode); + return 0 == returnCode; + } + 
catch (Exception ex) { + logger.info("Process {} terminated with code exception", processId); + throw ex; + } + finally { + eventLog.logEvent("PROCESS-EXIT", processId.toString()); + processes.remove(processId); + } + } + + private String[] createCommandArguments(String processPath, String[] parameters) { + final String[] args = new String[parameters.length + 1]; + args[0] = processPath; + System.arraycopy(parameters, 0, args, 1, parameters.length); + return args; + } + + public boolean isRunning(ProcessId processId) { + return processes.containsKey(processId); + } + + public boolean kill(ProcessId processId) { + Process process = processes.get(processId); + if (process == null) return false; + + eventLog.logEvent("PROCESS-KILL", processId.toString()); + process.destroy(); + + return true; + } + + /** These environment variables are propagated from the parent process to the child process, + * along with WMSA_HOME, but it has special logic */ + private final List propagatedEnvironmentVariables = List.of( + "JAVA_HOME", + "CONVERTER_PROCESS_OPTS", +// "LOADER_PROCESS_OPTS", + "CRAWLER_PROCESS_OPTS"); + + private String[] createEnvironmentVariables() { + List opts = new ArrayList<>(); + + String WMSA_HOME = System.getenv("WMSA_HOME"); + + if (WMSA_HOME == null || WMSA_HOME.isBlank()) { + WMSA_HOME = "/var/lib/wmsa"; + } + + opts.add(env2str("WMSA_HOME", WMSA_HOME)); + opts.add(env2str("JAVA_OPTS", "")); // We explicitly empty this to avoid inheriting the parent process' JAVA_OPTS + + for (String envKey : propagatedEnvironmentVariables) { + String envValue = System.getenv(envKey); + if (envValue != null && !envValue.isBlank()) { + opts.add(env2str(envKey, envValue)); + } + } + + return opts.toArray(String[]::new); + } + + private String env2str(String key, String val) { + return key + "=" + val; + } + +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ApiKeyService.java 
b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ApiKeyService.java new file mode 100644 index 00000000..505d2220 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ApiKeyService.java @@ -0,0 +1,93 @@ +package nu.marginalia.control.svc; + +import com.google.inject.Inject; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.control.model.ApiKeyModel; + +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.List; +import java.util.UUID; + +public class ApiKeyService { + + private final HikariDataSource dataSource; + + @Inject + public ApiKeyService(HikariDataSource dataSource) { + this.dataSource = dataSource; + } + + public List getApiKeys() { + try (var conn = dataSource.getConnection()) { + try (var stmt = conn.prepareStatement(""" + SELECT LICENSE_KEY, LICENSE, NAME, EMAIL, RATE FROM EC_API_KEY + """)) { + List ret = new ArrayList<>(100); + var rs = stmt.executeQuery(); + while (rs.next()) { + ret.add(new ApiKeyModel( + rs.getString("LICENSE_KEY"), + rs.getString("LICENSE"), + rs.getString("NAME"), + rs.getString("EMAIL"), + rs.getInt("RATE"))); + } + return ret; + } + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + } + + public ApiKeyModel addApiKey(String license, String name, String email, int rate) { + try (var conn = dataSource.getConnection()) { + try (var insertStmt = conn.prepareStatement(""" + INSERT INTO EC_API_KEY (LICENSE_KEY, LICENSE, NAME, EMAIL, RATE) SELECT SHA(?), ?, ?, ?, ? + """); + // we could do SELECT SHA(?) 
here I guess if performance was a factor, but it's not + var queryStmt = conn.prepareStatement("SELECT LICENSE_KEY FROM EC_API_KEY WHERE LICENSE_KEY = SHA(?)") + ) { + final String seedString = UUID.randomUUID() + "-" + name + "-" + email; + + insertStmt.setString(1, seedString); + insertStmt.setString(2, license); + insertStmt.setString(3, name); + insertStmt.setString(4, email); + insertStmt.setInt(5, rate); + insertStmt.executeUpdate(); + + queryStmt.setString(1, seedString); + var rs = queryStmt.executeQuery(); + if (rs.next()) { + return new ApiKeyModel( + rs.getString("LICENSE_KEY"), + license, + name, + email, + rate); + } + + throw new RuntimeException("Failed to insert key"); + } + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + } + + public void deleteApiKey(String key) { + try (var conn = dataSource.getConnection()) { + try (var stmt = conn.prepareStatement(""" + DELETE FROM EC_API_KEY WHERE LICENSE_KEY = ? + """)) { + stmt.setString(1, key); + stmt.executeUpdate(); + } + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + } +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActionsService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActionsService.java new file mode 100644 index 00000000..4425ac52 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActionsService.java @@ -0,0 +1,110 @@ +package nu.marginalia.control.svc; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.control.actor.ControlActors; +import nu.marginalia.control.actor.Actor; +import nu.marginalia.index.client.IndexClient; +import nu.marginalia.index.client.IndexMqEndpoints; +import nu.marginalia.mq.MessageQueueFactory; +import nu.marginalia.mq.outbox.MqOutbox; +import nu.marginalia.search.client.SearchClient; +import nu.marginalia.search.client.SearchMqEndpoints; 
+import nu.marginalia.service.control.ServiceEventLog; +import nu.marginalia.service.id.ServiceId; +import spark.Request; +import spark.Response; +import spark.Spark; + +import java.util.UUID; + +@Singleton +public class ControlActionsService { + + private final ControlActors actors; + private final SearchClient searchClient; + private final IndexClient indexClient; + private final MqOutbox apiOutbox; + private final ServiceEventLog eventLog; + + @Inject + public ControlActionsService(ControlActors actors, + SearchClient searchClient, + IndexClient indexClient, + MessageQueueFactory mqFactory, + ServiceEventLog eventLog) { + + this.actors = actors; + this.searchClient = searchClient; + this.indexClient = indexClient; + this.apiOutbox = createApiOutbox(mqFactory); + this.eventLog = eventLog; + + } + + /** This is a hack to get around the fact that the API service is not a core service + * and lacks a proper internal API + */ + private MqOutbox createApiOutbox(MessageQueueFactory mqFactory) { + String inboxName = ServiceId.Api.name + ":" + "0"; + String outboxName = System.getProperty("service-name", UUID.randomUUID().toString()); + return mqFactory.createOutbox(inboxName, outboxName, UUID.randomUUID()); + } + + public Object calculateAdjacencies(Request request, Response response) throws Exception { + eventLog.logEvent("USER-ACTION", "CALCULATE-ADJACENCIES"); + + actors.start(Actor.ADJACENCY_CALCULATION); + + return ""; + } + + public Object triggerDataExports(Request request, Response response) throws Exception { + eventLog.logEvent("USER-ACTION", "EXPORT-DATA"); + actors.start(Actor.EXPORT_DATA); + + return ""; + } + + public Object flushSearchCaches(Request request, Response response) throws Exception { + eventLog.logEvent("USER-ACTION", "FLUSH-SEARCH-CACHES"); + searchClient.outbox().sendNotice(SearchMqEndpoints.FLUSH_CACHES, ""); + + return ""; + } + + public Object flushApiCaches(Request request, Response response) throws Exception { + 
eventLog.logEvent("USER-ACTION", "FLUSH-API-CACHES"); + apiOutbox.sendNotice("FLUSH_CACHES", ""); + + return ""; + } + + public Object truncateLinkDatabase(Request request, Response response) throws Exception { + + String footgunLicense = request.queryParams("footgun-license"); + + if (!"YES".equals(footgunLicense)) { + Spark.halt(403); + return "You must agree to the footgun license to truncate the link database"; + } + + eventLog.logEvent("USER-ACTION", "FLUSH-LINK-DATABASE"); + + actors.start(Actor.TRUNCATE_LINK_DATABASE); + + return ""; + } + + public Object triggerRepartition(Request request, Response response) throws Exception { + indexClient.outbox().sendAsync(IndexMqEndpoints.INDEX_REPARTITION, ""); + + return null; + } + + public Object triggerIndexReconstruction(Request request, Response response) throws Exception { + indexClient.outbox().sendAsync(IndexMqEndpoints.INDEX_REINDEX, ""); + + return null; + } +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActorService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActorService.java new file mode 100644 index 00000000..ddfbbe58 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActorService.java @@ -0,0 +1,125 @@ +package nu.marginalia.control.svc; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.control.actor.ControlActors; +import nu.marginalia.control.actor.task.CrawlJobExtractorActor; +import nu.marginalia.control.actor.task.ReconvertAndLoadActor; +import nu.marginalia.control.actor.task.RecrawlActor; +import nu.marginalia.control.actor.Actor; +import nu.marginalia.control.model.ActorRunState; +import nu.marginalia.control.model.ActorStateGraph; +import nu.marginalia.db.storage.model.FileStorageId; +import nu.marginalia.mqsm.state.MachineState; +import spark.Request; +import spark.Response; + +import 
java.util.Comparator; + +@Singleton +public class ControlActorService { + private final ControlActors controlActors; + + @Inject + public ControlActorService(ControlActors controlActors) { + this.controlActors = controlActors; + } + + public Object getActorStateGraph(Actor actor) { + var currentState = controlActors.getActorStates().get(actor); + + return new ActorStateGraph(controlActors.getActorDefinition(actor), currentState); + } + + public Object startFsm(Request req, Response rsp) throws Exception { + controlActors.start( + Actor.valueOf(req.params("fsm").toUpperCase()) + ); + return ""; + } + + public Object stopFsm(Request req, Response rsp) throws Exception { + controlActors.stop( + Actor.valueOf(req.params("fsm").toUpperCase()) + ); + return ""; + } + + public Object triggerCrawling(Request request, Response response) throws Exception { + controlActors.start( + Actor.CRAWL, + FileStorageId.parse(request.params("fid")) + ); + return ""; + } + + public Object triggerRecrawling(Request request, Response response) throws Exception { + controlActors.start( + Actor.RECRAWL, + RecrawlActor.recrawlFromCrawlData( + FileStorageId.parse(request.params("fid")) + ) + ); + return ""; + } + public Object triggerProcessing(Request request, Response response) throws Exception { + controlActors.start( + Actor.RECONVERT_LOAD, + FileStorageId.parse(request.params("fid")) + ); + return ""; + } + + public Object loadProcessedData(Request request, Response response) throws Exception { + var fid = FileStorageId.parse(request.params("fid")); + + // Start the FSM from the intermediate state that triggers the load + controlActors.startFrom( + Actor.RECONVERT_LOAD, + ReconvertAndLoadActor.LOAD, + new ReconvertAndLoadActor.Message(null, fid, 0L, 0L) + ); + + return ""; + } + + public Object getActorStates() { + return controlActors.getActorStates().entrySet().stream().map(e -> { + + final MachineState state = e.getValue(); + final String machineName = e.getKey().name(); + final 
String stateName = state.name(); + final boolean terminal = state.isFinal(); + final boolean canStart = controlActors.isDirectlyInitializable(e.getKey()) && terminal; + + return new ActorRunState(machineName, stateName, terminal, canStart); + }) + .filter(s -> !s.terminal() || s.canStart()) + .sorted(Comparator.comparing(ActorRunState::name)) + .toList(); + } + + public Object createCrawlSpecification(Request request, Response response) throws Exception { + final String description = request.queryParams("description"); + final String url = request.queryParams("url"); + final String source = request.queryParams("source"); + + if ("db".equals(source)) { + controlActors.startFrom(Actor.CRAWL_JOB_EXTRACTOR, + CrawlJobExtractorActor.CREATE_FROM_DB, + new CrawlJobExtractorActor.CrawlJobExtractorArguments(description) + ); + } + else if ("download".equals(source)) { + controlActors.startFrom(Actor.CRAWL_JOB_EXTRACTOR, + CrawlJobExtractorActor.CREATE_FROM_LINK, + new CrawlJobExtractorActor.CrawlJobExtractorArgumentsWithURL(description, url) + ); + } + else { + throw new IllegalArgumentException("Unknown source: " + source); + } + + return ""; + } +} \ No newline at end of file diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlBlacklistService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlBlacklistService.java new file mode 100644 index 00000000..d23a06e2 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlBlacklistService.java @@ -0,0 +1,79 @@ +package nu.marginalia.control.svc; + +import com.google.inject.Inject; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.control.model.BlacklistedDomainModel; +import nu.marginalia.model.EdgeDomain; + +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.List; + +public class ControlBlacklistService { + + private final HikariDataSource dataSource; + + 
@Inject + public ControlBlacklistService(HikariDataSource dataSource) { + this.dataSource = dataSource; + } + + public void addToBlacklist(EdgeDomain domain, String comment) { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + INSERT IGNORE INTO EC_DOMAIN_BLACKLIST (URL_DOMAIN, COMMENT) VALUES (?, ?) + """)) { + stmt.setString(1, domain.toString()); + stmt.setString(2, comment); + stmt.executeUpdate(); + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + } + + public void removeFromBlacklist(EdgeDomain domain) { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + DELETE FROM EC_DOMAIN_BLACKLIST WHERE URL_DOMAIN=? + """)) { + stmt.setString(1, domain.toString()); + stmt.addBatch(); + stmt.setString(1, domain.domain); + stmt.addBatch(); + stmt.executeBatch(); + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + } + + public List lastNAdditions(int n) { + final List ret = new ArrayList<>(n); + + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT URL_DOMAIN, COMMENT + FROM EC_DOMAIN_BLACKLIST + ORDER BY ID DESC + LIMIT ? 
+ """)) { + stmt.setInt(1, n); + + var rs = stmt.executeQuery(); + while (rs.next()) { + ret.add(new BlacklistedDomainModel( + new EdgeDomain(rs.getString("URL_DOMAIN")), + rs.getString("COMMENT") + ) + ); + } + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + + return ret; + + } +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlFileStorageService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlFileStorageService.java new file mode 100644 index 00000000..f80287f4 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlFileStorageService.java @@ -0,0 +1,200 @@ +package nu.marginalia.control.svc; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import com.zaxxer.hikari.HikariDataSource; +import lombok.SneakyThrows; +import nu.marginalia.control.model.*; +import nu.marginalia.db.storage.FileStorageService; +import nu.marginalia.db.storage.model.*; +import nu.marginalia.mqsm.graph.AbstractStateGraph; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import spark.Request; +import spark.Response; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.sql.SQLException; +import java.util.*; + +@Singleton +public class ControlFileStorageService { + private final HikariDataSource dataSource; + private final FileStorageService fileStorageService; + private Logger logger = LoggerFactory.getLogger(getClass()); + + @Inject + public ControlFileStorageService(HikariDataSource dataSource, FileStorageService fileStorageService) { + this.dataSource = dataSource; + this.fileStorageService = fileStorageService; + } + + public Object flagFileForDeletionRequest(Request request, Response response) throws SQLException { + FileStorageId fid = new FileStorageId(Long.parseLong(request.params(":fid"))); + flagFileForDeletion(fid); + return ""; + } + + public 
void flagFileForDeletion(FileStorageId id) throws SQLException { + try (var conn = dataSource.getConnection(); + var flagStmt = conn.prepareStatement("UPDATE FILE_STORAGE SET DO_PURGE = TRUE WHERE ID = ?")) { + flagStmt.setLong(1, id.id()); + flagStmt.executeUpdate(); + } + } + + @SneakyThrows + public List getStorageList() { + var storageIds = getFileStorageIds(); + return makeFileStorageBaseWithStorage(storageIds); + } + + @SneakyThrows + public List getStorageList(FileStorageType type) { + var storageIds = getFileStorageIds(type); + return makeFileStorageBaseWithStorage(storageIds); + } + + private List getFileStorageIds() throws SQLException { + List storageIds = new ArrayList<>(); + + try (var conn = dataSource.getConnection(); + var storageByIdStmt = conn.prepareStatement("SELECT ID FROM FILE_STORAGE")) { + var rs = storageByIdStmt.executeQuery(); + while (rs.next()) { + storageIds.add(new FileStorageId(rs.getLong("ID"))); + } + } + + return storageIds; + } + + private List getFileStorageIds(FileStorageType type) throws SQLException { + List storageIds = new ArrayList<>(); + + try (var conn = dataSource.getConnection(); + var storageByIdStmt = conn.prepareStatement("SELECT ID FROM FILE_STORAGE WHERE TYPE = ?")) { + storageByIdStmt.setString(1, type.name()); + var rs = storageByIdStmt.executeQuery(); + while (rs.next()) { + storageIds.add(new FileStorageId(rs.getLong("ID"))); + } + } + + return storageIds; + } + + private List makeFileStorageBaseWithStorage(List storageIds) throws SQLException { + + Map fileStorageBaseByBaseId = new HashMap<>(); + Map> fileStoragByBaseId = new HashMap<>(); + + for (var id : storageIds) { + var storage = fileStorageService.getStorage(id); + fileStorageBaseByBaseId.computeIfAbsent(storage.base().id(), k -> storage.base()); + fileStoragByBaseId.computeIfAbsent(storage.base().id(), k -> new ArrayList<>()).add(new FileStorageWithActions(storage)); + } + + List result = new ArrayList<>(); + for (var baseId : 
fileStorageBaseByBaseId.keySet()) { + result.add(new FileStorageBaseWithStorage(fileStorageBaseByBaseId.get(baseId), + fileStoragByBaseId.get(baseId) + + )); + } + + return result; + } + + public FileStorageWithRelatedEntries getFileStorageWithRelatedEntries(FileStorageId id) throws SQLException { + var storage = fileStorageService.getStorage(id); + var related = getRelatedEntries(id); + + List files = new ArrayList<>(); + + try (var filesStream = Files.list(storage.asPath())) { + filesStream + .filter(Files::isRegularFile) + .map(this::createFileModel) + .sorted(Comparator.comparing(FileStorageFileModel::filename)) + .forEach(files::add); + } + catch (IOException ex) { + logger.error("Failed to list files in storage", ex); + } + + return new FileStorageWithRelatedEntries(new FileStorageWithActions(storage), related, files); + } + + private FileStorageFileModel createFileModel(Path p) { + try { + String mTime = Files.getLastModifiedTime(p).toInstant().toString(); + String size; + if (Files.isDirectory(p)) { + size = "-"; + } + else { + long sizeBytes = Files.size(p); + + if (sizeBytes < 1024) size = sizeBytes + " B"; + else if (sizeBytes < 1024 * 1024) size = sizeBytes / 1024 + " KB"; + else if (sizeBytes < 1024 * 1024 * 1024) size = sizeBytes / (1024 * 1024) + " MB"; + else size = sizeBytes / (1024 * 1024 * 1024) + " GB"; + } + + return new FileStorageFileModel(p.toFile().getName(), mTime, size); + } + catch (IOException ex) { + throw new RuntimeException(ex); + } + } + private List getRelatedEntries(FileStorageId id) { + List ret = new ArrayList<>(); + try (var conn = dataSource.getConnection(); + var relatedIds = conn.prepareStatement(""" + (SELECT SOURCE_ID AS ID FROM FILE_STORAGE_RELATION WHERE TARGET_ID = ?) + UNION + (SELECT TARGET_ID AS ID FROM FILE_STORAGE_RELATION WHERE SOURCE_ID = ?) 
+ """)) + { + + relatedIds.setLong(1, id.id()); + relatedIds.setLong(2, id.id()); + var rs = relatedIds.executeQuery(); + while (rs.next()) { + ret.add(fileStorageService.getStorage(new FileStorageId(rs.getLong("ID")))); + } + } catch (SQLException throwables) { + throwables.printStackTrace(); + } + return ret; + } + + public Object downloadFileFromStorage(Request request, Response response) throws SQLException { + var fileStorageId = FileStorageId.parse(request.params("id")); + String filename = request.queryParams("name"); + + Path root = fileStorageService.getStorage(fileStorageId).asPath(); + Path filePath = root.resolve(filename).normalize(); + + if (!filePath.startsWith(root)) { + response.status(403); + return ""; + } + + if (filePath.endsWith(".txt") || filePath.endsWith(".log")) response.type("text/plain"); + else response.type("application/octet-stream"); + + try (var is = Files.newInputStream(filePath)) { + is.transferTo(response.raw().getOutputStream()); + } + catch (IOException ex) { + logger.error("Failed to download file", ex); + throw new RuntimeException(ex); + } + + return ""; + } +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/DomainComplaintService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/DomainComplaintService.java new file mode 100644 index 00000000..758d0313 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/DomainComplaintService.java @@ -0,0 +1,91 @@ +package nu.marginalia.control.svc; + +import com.google.inject.Inject; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.control.model.DomainComplaintCategory; +import nu.marginalia.control.model.DomainComplaintModel; +import nu.marginalia.model.EdgeDomain; + +import java.sql.SQLException; +import java.sql.Timestamp; +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; + + +/** Service for handling domain complaints. 
This code has an user-facing correspondent in + * SearchFlagSiteService in search-service + */ +public class DomainComplaintService { + private final HikariDataSource dataSource; + private final ControlBlacklistService blacklistService; + + @Inject + public DomainComplaintService(HikariDataSource dataSource, + ControlBlacklistService blacklistService + ) { + this.dataSource = dataSource; + this.blacklistService = blacklistService; + } + + public List getComplaints() { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT EC_DOMAIN.DOMAIN_NAME AS DOMAIN, CATEGORY, DESCRIPTION, SAMPLE, FILE_DATE, REVIEWED, DECISION, REVIEW_DATE + FROM DOMAIN_COMPLAINT LEFT JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_COMPLAINT.DOMAIN_ID + """)) { + List complaints = new ArrayList<>(); + var rs = stmt.executeQuery(); + while (rs.next()) { + complaints.add(new DomainComplaintModel( + rs.getString("DOMAIN"), + DomainComplaintCategory.fromCategoryName(rs.getString("CATEGORY")), + rs.getString("DESCRIPTION"), + rs.getString("SAMPLE"), + rs.getString("DECISION"), + rs.getTimestamp("FILE_DATE").toLocalDateTime().toString(), + Optional.ofNullable(rs.getTimestamp("REVIEW_DATE")) + .map(Timestamp::toLocalDateTime).map(Object::toString).orElse(null), + rs.getBoolean("REVIEWED") + )); + } + return complaints; + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + } + + public void approveAppealBlacklisting(EdgeDomain domain) { + blacklistService.removeFromBlacklist(domain); + setDecision(domain, "APPROVED"); + } + + public void blacklistDomain(EdgeDomain domain) { + blacklistService.addToBlacklist(domain, "Domain complaint"); + + setDecision(domain, "BLACKLISTED"); + } + + public void reviewNoAction(EdgeDomain domain) { + setDecision(domain, "REJECTED"); + } + + + + private void setDecision(EdgeDomain domain, String decision) { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + UPDATE DOMAIN_COMPLAINT SET 
DECISION=?, REVIEW_DATE=NOW() + WHERE DOMAIN_ID=(SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?) + AND DECISION IS NULL + """)) { + stmt.setString(1, decision); + stmt.setString(2, domain.toString()); + stmt.executeUpdate(); + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + } +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/EventLogService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/EventLogService.java new file mode 100644 index 00000000..d2cd6bcb --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/EventLogService.java @@ -0,0 +1,110 @@ +package nu.marginalia.control.svc; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.control.model.EventLogEntry; + +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.List; + +@Singleton +public class EventLogService { + + private final HikariDataSource dataSource; + + @Inject + public EventLogService(HikariDataSource dataSource) { + this.dataSource = dataSource; + } + + public List getLastEntries(int n) { + try (var conn = dataSource.getConnection(); + var query = conn.prepareStatement(""" + SELECT SERVICE_NAME, INSTANCE, EVENT_TIME, EVENT_TYPE, EVENT_MESSAGE + FROM SERVICE_EVENTLOG ORDER BY ID DESC LIMIT ? 
+ """)) { + + query.setInt(1, n); + List entries = new ArrayList<>(n); + var rs = query.executeQuery(); + while (rs.next()) { + entries.add(new EventLogEntry( + rs.getString("SERVICE_NAME"), + rs.getString("INSTANCE"), + rs.getTimestamp("EVENT_TIME").toLocalDateTime().toLocalTime().toString(), + rs.getString("EVENT_TYPE"), + rs.getString("EVENT_MESSAGE") + )); + } + return entries; + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + } + + public List getLastEntriesForService(String serviceName, int n) { + try (var conn = dataSource.getConnection(); + var query = conn.prepareStatement(""" + SELECT SERVICE_NAME, INSTANCE, EVENT_TIME, EVENT_TYPE, EVENT_MESSAGE + FROM SERVICE_EVENTLOG + WHERE SERVICE_NAME = ? + ORDER BY ID DESC + LIMIT ? + """)) { + + query.setString(1, serviceName); + query.setInt(2, n); + + List entries = new ArrayList<>(n); + var rs = query.executeQuery(); + while (rs.next()) { + entries.add(new EventLogEntry( + rs.getString("SERVICE_NAME"), + rs.getString("INSTANCE"), + rs.getTimestamp("EVENT_TIME").toLocalDateTime().toLocalTime().toString(), + rs.getString("EVENT_TYPE"), + rs.getString("EVENT_MESSAGE") + )); + } + return entries; + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + } + + + public List getLastEntriesForInstance(String instance, int n) { + try (var conn = dataSource.getConnection(); + var query = conn.prepareStatement(""" + SELECT SERVICE_NAME, INSTANCE, EVENT_TIME, EVENT_TYPE, EVENT_MESSAGE + FROM SERVICE_EVENTLOG + WHERE INSTANCE = ? + ORDER BY ID DESC + LIMIT ? 
+ """)) { + + query.setString(1, instance); + query.setInt(2, n); + + List entries = new ArrayList<>(n); + var rs = query.executeQuery(); + while (rs.next()) { + entries.add(new EventLogEntry( + rs.getString("SERVICE_NAME"), + rs.getString("INSTANCE"), + rs.getTimestamp("EVENT_TIME").toLocalDateTime().toLocalTime().toString(), + rs.getString("EVENT_TYPE"), + rs.getString("EVENT_MESSAGE") + )); + } + return entries; + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + } +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/HeartbeatService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/HeartbeatService.java new file mode 100644 index 00000000..1379924a --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/HeartbeatService.java @@ -0,0 +1,146 @@ +package nu.marginalia.control.svc; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.control.model.ProcessHeartbeat; +import nu.marginalia.control.model.ServiceHeartbeat; +import nu.marginalia.control.model.TaskHeartbeat; +import nu.marginalia.service.control.ServiceEventLog; + +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.List; + +@Singleton +public class HeartbeatService { + private final HikariDataSource dataSource; + private final ServiceEventLog eventLogService; + + @Inject + public HeartbeatService(HikariDataSource dataSource, + ServiceEventLog eventLogService) { + this.dataSource = dataSource; + this.eventLogService = eventLogService; + } + + public List getServiceHeartbeats() { + List heartbeats = new ArrayList<>(); + + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT SERVICE_NAME, SERVICE_BASE, INSTANCE, ALIVE, + TIMESTAMPDIFF(MICROSECOND, HEARTBEAT_TIME, CURRENT_TIMESTAMP(6)) AS TSDIFF + FROM SERVICE_HEARTBEAT + """)) { + + 
var rs = stmt.executeQuery(); + while (rs.next()) { + heartbeats.add(new ServiceHeartbeat( + rs.getString("SERVICE_NAME"), + rs.getString("SERVICE_BASE"), + rs.getString("INSTANCE"), + rs.getLong("TSDIFF") / 1000., + rs.getBoolean("ALIVE") + )); + } + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + + return heartbeats; + } + + public List getTaskHeartbeats() { + List heartbeats = new ArrayList<>(); + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT TASK_NAME, TASK_BASE, SERVICE_INSTANCE, STATUS, STAGE_NAME, PROGRESS, TIMESTAMPDIFF(MICROSECOND, TASK_HEARTBEAT.HEARTBEAT_TIME, CURRENT_TIMESTAMP(6)) AS TSDIFF + FROM TASK_HEARTBEAT + INNER JOIN SERVICE_HEARTBEAT ON SERVICE_HEARTBEAT.`INSTANCE` = SERVICE_INSTANCE + """)) { + var rs = stmt.executeQuery(); + while (rs.next()) { + int progress = rs.getInt("PROGRESS"); + heartbeats.add(new TaskHeartbeat( + rs.getString("TASK_NAME"), + rs.getString("TASK_BASE"), + rs.getString("SERVICE_INSTANCE"), + rs.getLong("TSDIFF") / 1000., + progress < 0 ? null : progress, + rs.getString("STAGE_NAME"), + rs.getString("STATUS") + )); + } + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + return heartbeats; + } + + public void removeTaskHeartbeat(TaskHeartbeat heartbeat) { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + DELETE FROM TASK_HEARTBEAT + WHERE SERVICE_INSTANCE = ? 
+ """)) { + + stmt.setString(1, heartbeat.serviceUuuidFull()); + stmt.executeUpdate(); + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + } + + public List getProcessHeartbeats() { + List heartbeats = new ArrayList<>(); + + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT PROCESS_NAME, PROCESS_BASE, INSTANCE, STATUS, PROGRESS, + TIMESTAMPDIFF(MICROSECOND, HEARTBEAT_TIME, CURRENT_TIMESTAMP(6)) AS TSDIFF + FROM PROCESS_HEARTBEAT + """)) { + + var rs = stmt.executeQuery(); + while (rs.next()) { + int progress = rs.getInt("PROGRESS"); + heartbeats.add(new ProcessHeartbeat( + rs.getString("PROCESS_NAME"), + rs.getString("PROCESS_BASE"), + rs.getString("INSTANCE"), + rs.getLong("TSDIFF") / 1000., + progress < 0 ? null : progress, + rs.getString("STATUS") + )); + } + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + + return heartbeats; + } + + public void flagProcessAsStopped(ProcessHeartbeat processHeartbeat) { + eventLogService.logEvent("PROCESS-MISSING", "Marking stale process heartbeat " + + processHeartbeat.processId() + " / " + processHeartbeat.uuidFull() + " as stopped"); + + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + UPDATE PROCESS_HEARTBEAT + SET STATUS = 'STOPPED' + WHERE INSTANCE = ? 
+ """)) { + + stmt.setString(1, processHeartbeat.uuidFull()); + stmt.executeUpdate(); + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + } +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueService.java new file mode 100644 index 00000000..1d74a5bf --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/MessageQueueService.java @@ -0,0 +1,315 @@ +package nu.marginalia.control.svc; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.control.model.MessageQueueEntry; +import nu.marginalia.mq.MqMessageState; +import nu.marginalia.mq.persistence.MqPersistence; +import spark.Request; +import spark.Response; + +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Optional; + +@Singleton +public class MessageQueueService { + + private final HikariDataSource dataSource; + private final MqPersistence persistence; + + @Inject + public MessageQueueService(HikariDataSource dataSource, MqPersistence persistence) { + this.dataSource = dataSource; + this.persistence = persistence; + } + + + public Object viewMessageModel(Request request, Response response) { + return Map.of("message", getMessage(Long.parseLong(request.params("id"))), + "relatedMessages", getRelatedMessages(Long.parseLong(request.params("id")))); + } + + + public Object listMessageQueueModel(Request request, Response response) { + String inboxParam = request.queryParams("inbox"); + String instanceParam = request.queryParams("instance"); + String afterParam = request.queryParams("after"); + + long afterId = Optional.ofNullable(afterParam).map(Long::parseLong).orElse(Long.MAX_VALUE); + + List entries; + + String mqFilter = 
"filter=none"; + if (inboxParam != null) { + mqFilter = "inbox=" + inboxParam; + entries = getEntriesForInbox(inboxParam, afterId, 20); + } + else if (instanceParam != null) { + mqFilter = "instance=" + instanceParam; + entries = getEntriesForInstance(instanceParam, afterId, 20); + } + else { + entries = getEntries(afterId, 20); + } + + Object next; + + if (entries.size() == 20) + next = entries.stream().mapToLong(MessageQueueEntry::id).min().getAsLong(); + else + next = ""; + + Object prev = afterParam == null ? "" : afterParam; + + return Map.of("messages", entries, + "next", next, + "prev", prev, + "mqFilter", mqFilter); + } + + public Object newMessageModel(Request request, Response response) { + String idParam = request.queryParams("id"); + if (null == idParam) + return Map.of("relatedId", "-1"); + + var message = getMessage(Long.parseLong(idParam)); + if (message != null) + return message; + + return Map.of("relatedId", "-1"); + } + + public Object replyMessageModel(Request request, Response response) { + String idParam = request.params("id"); + + var message = getMessage(Long.parseLong(idParam)); + + return Map.of("relatedId", message.id(), + "recipientInbox", message.senderInbox(), + "function", "REPLY"); + } + + public Object createMessage(Request request, Response response) throws Exception { + String recipient = request.queryParams("recipientInbox"); + String sender = request.queryParams("senderInbox"); + String relatedMessage = request.queryParams("relatedId"); + String function = request.queryParams("function"); + String payload = request.queryParams("payload"); + + persistence.sendNewMessage(recipient, + sender.isBlank() ? null : sender, + relatedMessage == null ? 
null : Long.parseLong(relatedMessage), + function, + payload, + null); + + return ""; + } + + public Object viewMessageForEditStateModel(Request request, Response response) throws SQLException { + return persistence.getMessage(Long.parseLong(request.params("id"))); + } + + public Object editMessageState(Request request, Response response) throws SQLException { + MqMessageState state = MqMessageState.valueOf(request.queryParams("state")); + long id = Long.parseLong(request.params("id")); + persistence.updateMessageState(id, state); + return ""; + } + + public List getLastEntries(int n) { + try (var conn = dataSource.getConnection(); + var query = conn.prepareStatement(""" + SELECT ID, RELATED_ID, SENDER_INBOX, RECIPIENT_INBOX, FUNCTION, PAYLOAD, OWNER_INSTANCE, OWNER_TICK, STATE, CREATED_TIME, UPDATED_TIME, TTL + FROM MESSAGE_QUEUE + ORDER BY ID DESC + LIMIT ? + """)) { + + query.setInt(1, n); + List entries = new ArrayList<>(n); + var rs = query.executeQuery(); + while (rs.next()) { + entries.add(newEntry(rs)); + } + return entries; + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + } + public MessageQueueEntry getMessage(long id) { + try (var conn = dataSource.getConnection(); + var query = conn.prepareStatement(""" + SELECT ID, RELATED_ID, SENDER_INBOX, RECIPIENT_INBOX, FUNCTION, PAYLOAD, OWNER_INSTANCE, OWNER_TICK, STATE, CREATED_TIME, UPDATED_TIME, TTL + FROM MESSAGE_QUEUE + WHERE ID=? + """)) { + + query.setLong(1, id); + + var rs = query.executeQuery(); + if (rs.next()) { + return newEntry(rs); + } + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + return null; + } + + public Object getLastEntriesForInbox(String inbox, int n) { + try (var conn = dataSource.getConnection(); + var query = conn.prepareStatement(""" + SELECT ID, RELATED_ID, SENDER_INBOX, RECIPIENT_INBOX, FUNCTION, PAYLOAD, OWNER_INSTANCE, OWNER_TICK, STATE, CREATED_TIME, UPDATED_TIME, TTL + FROM MESSAGE_QUEUE + WHERE RECIPIENT_INBOX=? 
+ ORDER BY ID DESC + LIMIT ? + """)) { + + query.setString(1, inbox); + query.setInt(2, n); + List entries = new ArrayList<>(n); + var rs = query.executeQuery(); + while (rs.next()) { + entries.add(newEntry(rs)); + } + return entries; + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + } + + public List getEntriesForInbox(String inbox, long afterId, int n) { + try (var conn = dataSource.getConnection(); + var query = conn.prepareStatement(""" + SELECT ID, RELATED_ID, SENDER_INBOX, RECIPIENT_INBOX, FUNCTION, PAYLOAD, OWNER_INSTANCE, OWNER_TICK, STATE, CREATED_TIME, UPDATED_TIME, TTL + FROM MESSAGE_QUEUE + WHERE ID < ? AND (RECIPIENT_INBOX = ? OR SENDER_INBOX = ?) + ORDER BY ID DESC + LIMIT ? + """)) { + + query.setLong(1, afterId); + query.setString(2, inbox); + query.setString(3, inbox); + query.setInt(4, n); + + List entries = new ArrayList<>(n); + var rs = query.executeQuery(); + while (rs.next()) { + entries.add(newEntry(rs)); + } + return entries; + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + } + + public List getEntriesForInstance(String instance, long afterId, int n) { + try (var conn = dataSource.getConnection(); + var query = conn.prepareStatement(""" + SELECT ID, RELATED_ID, SENDER_INBOX, RECIPIENT_INBOX, FUNCTION, PAYLOAD, OWNER_INSTANCE, OWNER_TICK, STATE, CREATED_TIME, UPDATED_TIME, TTL + FROM MESSAGE_QUEUE + WHERE ID < ? AND OWNER_INSTANCE = ? + ORDER BY ID DESC + LIMIT ? 
+ """)) { + + query.setLong(1, afterId); + query.setString(2, instance); + query.setInt(3, n); + + List entries = new ArrayList<>(n); + var rs = query.executeQuery(); + while (rs.next()) { + entries.add(newEntry(rs)); + } + return entries; + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + } + + public List getEntries(long afterId, int n) { + try (var conn = dataSource.getConnection(); + var query = conn.prepareStatement(""" + SELECT ID, RELATED_ID, SENDER_INBOX, RECIPIENT_INBOX, FUNCTION, PAYLOAD, OWNER_INSTANCE, OWNER_TICK, STATE, CREATED_TIME, UPDATED_TIME, TTL + FROM MESSAGE_QUEUE + WHERE ID < ? + ORDER BY ID DESC + LIMIT ? + """)) { + + query.setLong(1, afterId); + query.setInt(2, n); + + List entries = new ArrayList<>(n); + var rs = query.executeQuery(); + while (rs.next()) { + entries.add(newEntry(rs)); + } + return entries; + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + } + + public List getRelatedMessages(long relatedId) { + try (var conn = dataSource.getConnection(); + var query = conn.prepareStatement(""" + (SELECT ID, RELATED_ID, SENDER_INBOX, RECIPIENT_INBOX, FUNCTION, PAYLOAD, OWNER_INSTANCE, OWNER_TICK, STATE, CREATED_TIME, UPDATED_TIME, TTL + FROM MESSAGE_QUEUE + WHERE RELATED_ID = ? + ORDER BY ID DESC + LIMIT 100) + UNION + (SELECT ID, RELATED_ID, SENDER_INBOX, RECIPIENT_INBOX, FUNCTION, PAYLOAD, OWNER_INSTANCE, OWNER_TICK, STATE, CREATED_TIME, UPDATED_TIME, TTL + FROM MESSAGE_QUEUE + WHERE ID = (SELECT RELATED_ID FROM MESSAGE_QUEUE WHERE ID=?) 
+ ORDER BY ID DESC + LIMIT 100) + """)) { + + query.setLong(1, relatedId); + query.setLong(2, relatedId); + + List entries = new ArrayList<>(100); + var rs = query.executeQuery(); + while (rs.next()) { + entries.add(newEntry(rs)); + } + return entries; + } + catch (SQLException ex) { + throw new RuntimeException(ex); + } + } + + private MessageQueueEntry newEntry(ResultSet rs) throws SQLException { + return new MessageQueueEntry( + rs.getLong("ID"), + rs.getLong("RELATED_ID"), + rs.getString("SENDER_INBOX"), + rs.getString("RECIPIENT_INBOX"), + rs.getString("FUNCTION"), + rs.getString("PAYLOAD"), + rs.getString("OWNER_INSTANCE"), + rs.getLong("OWNER_TICK"), + rs.getString("STATE"), + rs.getTimestamp("CREATED_TIME").toLocalDateTime().toLocalTime().toString(), + rs.getTimestamp("UPDATED_TIME").toLocalDateTime().toLocalTime().toString(), + rs.getInt("TTL")); + } +} diff --git a/code/services-core/control-service/src/main/resources/static/control/refresh.js b/code/services-core/control-service/src/main/resources/static/control/refresh.js new file mode 100644 index 00000000..0ee10bbf --- /dev/null +++ b/code/services-core/control-service/src/main/resources/static/control/refresh.js @@ -0,0 +1,25 @@ +function refresh(ids) { + fetch(window.location.href) + .then(response => response.text()) + .then(html => { + const parser = new DOMParser(); + const newDocument = parser.parseFromString(html, "text/html"); + + ids.forEach(id => { + const newElement = newDocument.getElementById(id); + const targetElement = document.getElementById(id); + + if (newElement == null) + return; + if (targetElement == null) + return; + + if (!newElement.isEqualNode(targetElement)) { + targetElement.replaceWith(document.importNode(newElement, true)) + } + }); + }) + .catch(error => { + console.error("Error fetching webpage:", error); + }); +} \ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/static/control/style.css 
b/code/services-core/control-service/src/main/resources/static/control/style.css new file mode 100644 index 00000000..a248a499 --- /dev/null +++ b/code/services-core/control-service/src/main/resources/static/control/style.css @@ -0,0 +1,113 @@ +body { + background-color: #f8f8ee; + font-family: sans-serif; + line-height: 1.6; + + display: grid; + grid-template-columns: 20ch auto; + grid-gap: 1em; + grid-template-areas: + "left right"; +} +section nav.tabs > a { + color: #000; + text-decoration: none; + background-color: #ccc; + padding: 0.5ch; + border-radius: .5ch; +} +section nav.tabs a.selected { + background-color: #eee; +} + +.toggle-switch-off { + border-left: 5px solid #f00; + width: 8ch; +} +.toggle-switch-on { + border-right: 5px solid #080; + width: 8ch; +} +.toggle-switch-active { + border-left: 5px solid #00f; + border-right: 5px solid #00f; + width: 8ch; +} +#services .missing { + color: #800; +} +.uuidPip { + margin-left: 0.25ch; + border-radius: 2ch; + border: 1px solid #ccc; +} +h1 { + font-family: serif; +} +table { + font-family: monospace; +} +th { text-align: left; } +td,th { padding-right: 1ch; border: 1px solid #ccc; } + +tr:nth-of-type(2n) { + background-color: #eee; +} + + +table.table-rh-2 tr:nth-of-type(4n+1) { background-color: #eee; } +table.table-rh-2 tr:nth-of-type(4n+2) { background-color: #eee; } +table.table-rh-2 tr:nth-of-type(4n+3) { background-color: unset; } +table.table-rh-2 tr:nth-of-type(4n) { background-color: unset; } + +table.table-rh-2 tr:nth-of-type(4n) td, +table.table-rh-2 tr:nth-of-type(4n) th { border-bottom: 1px solid #888; } +table.table-rh-2 tr:nth-of-type(4n+2) td, +table.table-rh-2 tr:nth-of-type(4n+2) th { border-bottom: 1px solid #888; } + +table.table-rh-3 tr:nth-of-type(6n+1) { background-color: #eee; } +table.table-rh-3 tr:nth-of-type(6n+2) { background-color: #eee; } +table.table-rh-3 tr:nth-of-type(6n+3) { background-color: #eee; } +table.table-rh-3 tr:nth-of-type(6n+4) { background-color: unset; } 
+table.table-rh-3 tr:nth-of-type(6n+5) { background-color: unset; } +table.table-rh-3 tr:nth-of-type(6n) { background-color: unset; } + +table.table-rh-3 tr:nth-of-type(6n) td, +table.table-rh-3 tr:nth-of-type(6n) th { border-bottom: 1px solid #888; } +table.table-rh-3 tr:nth-of-type(6n+3) td, +table.table-rh-3 tr:nth-of-type(6n+3) th { border-bottom: 1px solid #888; } + +body > nav { + grid-area: left; +} +nav ul { + list-style-type: none; + padding: 0; +} +nav ul li { + line-height: 2; +} +nav ul li a { + text-decoration: none; + padding: 0.5ch; + display: block; + color: #000; + background-color: #ccc; +} +nav ul li a:focus { + text-decoration: underline; +} +nav ul li a.current { + color: #000; + background-color: #fff; +} + +body > section { + grid-area: right; +} + +#state-graph .current-state td:first-of-type { + border-right: 1em solid #000; + font-weight: bold; + border-color: #000; +} \ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/actions.hdb b/code/services-core/control-service/src/main/resources/templates/control/actions.hdb new file mode 100644 index 00000000..dda6ea0d --- /dev/null +++ b/code/services-core/control-service/src/main/resources/templates/control/actions.hdb @@ -0,0 +1,103 @@ + + + + Control Service + + + + + {{> control/partials/nav}} +

    +

    Actions

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    ActionTrigger
    Trigger Adjacency Calculation

    + This will trigger a recalculation of website similarities, which affects + the rankings calculations. +

    +
    + +
    +
    Repartition Index

    + This will recalculate the rankings and search sets for the index. +

    +
    + +
    +
    Reconstruct Index

    + This will reconstruct the index from the index journal. +

    +
    + +
    +
    Flush search-service Caches

    + This will instruct the search-service to flush its caches, + getting rid of any stale data. This may rarely be necessary after + reloading the index. +

    +
    + +
    +
    Flush api-service Caches

    + This will instruct the api-service to flush its caches, + getting rid of any stale data. This will be necessary after + changes to the API licenses directly through the database. +

    +
    + +
    +
    Trigger Data Exports

    + This exports the data from the database into a set of CSV files +

    +
    + +
    +
    + WARNING -- Destructive Actions Below This Line +
    Truncate Links Database.

    + This will drop all known URLs and domain links.
    + This action is not reversible. +

    +
    +
    + +

    + +
    +
    +
    + + \ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/actor-details.hdb b/code/services-core/control-service/src/main/resources/templates/control/actor-details.hdb new file mode 100644 index 00000000..d3d807e6 --- /dev/null +++ b/code/services-core/control-service/src/main/resources/templates/control/actor-details.hdb @@ -0,0 +1,22 @@ + + + + Control Service + + + + + {{> control/partials/nav}} +
    +

    {{actor}}

    + {{> control/partials/actor-state-graph}} + {{> control/partials/message-queue-table}} +
    + + + + \ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/actors.hdb b/code/services-core/control-service/src/main/resources/templates/control/actors.hdb new file mode 100644 index 00000000..c669ce46 --- /dev/null +++ b/code/services-core/control-service/src/main/resources/templates/control/actors.hdb @@ -0,0 +1,22 @@ + + + + Control Service + + + + + {{> control/partials/nav}} +
    + {{> control/partials/processes-table}} + {{> control/partials/actors-table}} + {{> control/partials/message-queue-table}} +
    + + + + \ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/api-keys.hdb b/code/services-core/control-service/src/main/resources/templates/control/api-keys.hdb new file mode 100644 index 00000000..e58b6b8a --- /dev/null +++ b/code/services-core/control-service/src/main/resources/templates/control/api-keys.hdb @@ -0,0 +1,61 @@ + + + + Control Service + + + + +{{> control/partials/nav}} +
    + +

    API Keys

    + + + + + + + + + + + + {{#each apikeys}} + + + + + + + + + + + {{/each}} +
    Key 
    LicenseNameContactRate
    {{licenseKey}} +
    + +
    +
    {{license}}{{name}}{{email}}{{rate}}
    +

    Add New

    +
    +
    +
    +
    +
    +
    +
    +
    +

    + +
    +
    + + + + \ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/blacklist.hdb b/code/services-core/control-service/src/main/resources/templates/control/blacklist.hdb new file mode 100644 index 00000000..5622659c --- /dev/null +++ b/code/services-core/control-service/src/main/resources/templates/control/blacklist.hdb @@ -0,0 +1,68 @@ + + + + Control Service + + + + +{{> control/partials/nav}} +
    +

    Blacklist

    + +

    + The blacklist is a list of sanctioned domains that will not be + crawled, indexed, or returned from the search results. +

    + + + + + + + + + + + + + +
    DescriptionAction
    Add To Blacklist

    + This will add the given domain to the blacklist. +

    +
    +  
    +   +
    +
    + +
    +
    Remove from blacklist

    + Remove the specified domain from the blacklist. This will ensure that + the domain is not blacklisted; in doing so, it may remove the root domain + from the blacklist as well. +

    +
    +   +
    +
    + +
    +
    + +

    Recent Additions

    + + + + + + {{#each blacklist}} + + + + + {{/each}} +
    DomainComment
    {{domain}}{{comment}}
    +
    + + \ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/domain-complaints.hdb b/code/services-core/control-service/src/main/resources/templates/control/domain-complaints.hdb new file mode 100644 index 00000000..ac1f6c88 --- /dev/null +++ b/code/services-core/control-service/src/main/resources/templates/control/domain-complaints.hdb @@ -0,0 +1,111 @@ + + + + Control Service + + + + +{{> control/partials/nav}} +
    + +

    Domain Complaints

    + {{#unless complaintsNew}} +

    No new complaints!

    + {{/unless}} + {{#if complaintsNew}} + + + + + + + + + + + + + + + {{#each complaintsNew}} + + + + + + + + + + + + + + {{/each}} +
    DateCategory
    DomainSample
    Description
    {{fileDate}}{{category}} +
    + + + +
    +
    {{domain}}{{sample}}
    {{description}}
    + {{/if}} + + {{#if complaintsReviewed}} +

    Review Log

    + + + + + + + + + + + + + + + {{#each complaintsReviewed}} + + + + + + + + + + + + + {{/each}} +
    Review DateCategoryAction
    DomainSample
    Description
    {{fileDate}}{{category}} + {{decision}} +
    {{domain}}{{sample}}
    {{description}}
    + {{/if}} +
    + + + + \ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/index.hdb b/code/services-core/control-service/src/main/resources/templates/control/index.hdb new file mode 100644 index 00000000..43c189fd --- /dev/null +++ b/code/services-core/control-service/src/main/resources/templates/control/index.hdb @@ -0,0 +1,23 @@ + + + + Control Service + + + + + {{> control/partials/nav}} +
    + {{> control/partials/services-table }} + {{> control/partials/processes-table}} + {{> control/partials/actors-table}} + {{> control/partials/events-table }} +
    + + + + diff --git a/code/services-core/control-service/src/main/resources/templates/control/message-queue.hdb b/code/services-core/control-service/src/main/resources/templates/control/message-queue.hdb new file mode 100644 index 00000000..cc5b5da9 --- /dev/null +++ b/code/services-core/control-service/src/main/resources/templates/control/message-queue.hdb @@ -0,0 +1,20 @@ + + + + Control Service + + + + + {{> control/partials/nav}} +
    + {{> control/partials/message-queue-table }} +
    + + + + \ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/new-message.hdb b/code/services-core/control-service/src/main/resources/templates/control/new-message.hdb new file mode 100644 index 00000000..91242ba4 --- /dev/null +++ b/code/services-core/control-service/src/main/resources/templates/control/new-message.hdb @@ -0,0 +1,44 @@ + + + +Message Queue | New Message + +{{> control/partials/nav}} +
    +

    Create Message

    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    FieldValue
    + +
    +
    +
    + + \ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/partials/actor-state-graph.hdb b/code/services-core/control-service/src/main/resources/templates/control/partials/actor-state-graph.hdb new file mode 100644 index 00000000..dece5f62 --- /dev/null +++ b/code/services-core/control-service/src/main/resources/templates/control/partials/actor-state-graph.hdb @@ -0,0 +1,16 @@ +

    Actor State Graph

    + + + + + + + + {{#each state-graph.states}} + + + + + + {{/each}} +
    StateTransitionsDescription
    {{name}}{{#each transitions}} {{.}} {{/each}}{{description}}
    diff --git a/code/services-core/control-service/src/main/resources/templates/control/partials/actors-table.hdb b/code/services-core/control-service/src/main/resources/templates/control/partials/actors-table.hdb new file mode 100644 index 00000000..a09e16a4 --- /dev/null +++ b/code/services-core/control-service/src/main/resources/templates/control/partials/actors-table.hdb @@ -0,0 +1,69 @@ +

    Actors

    + + + + + + + {{#each actors}} + + + + + + {{/each}} +
    ActorStateAction
    {{name}}{{stateIcon}} {{state}} + {{#unless terminal}} +
    + +
    + {{/unless}} + {{#if terminal}} +
    + +
    + {{/if}} + +
    + + \ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/partials/events-table.hdb b/code/services-core/control-service/src/main/resources/templates/control/partials/events-table.hdb new file mode 100644 index 00000000..23324a13 --- /dev/null +++ b/code/services-core/control-service/src/main/resources/templates/control/partials/events-table.hdb @@ -0,0 +1,23 @@ +

    Events

    + + + + + + + + + + {{#each events}} + + + + + + + + {{/each}} +
    Service NameInstanceEvent TimeTypeMessage
    {{serviceName}} +    + {{instance}} + {{eventTime}}{{eventType}}{{eventMessage}}
    \ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/partials/message-queue-table.hdb b/code/services-core/control-service/src/main/resources/templates/control/partials/message-queue-table.hdb new file mode 100644 index 00000000..d71d0941 --- /dev/null +++ b/code/services-core/control-service/src/main/resources/templates/control/partials/message-queue-table.hdb @@ -0,0 +1,53 @@ +

    Message Queue

    + + + + + + + + + + + + + + + + {{#each messages}} + + + + + + + + + + + + + + + + + {{/each}} + + + + + +
    State
    TTL
    Msg ID
    Related ID
    Recipient
    Sender
    Function
    Payload
    Owner Instance
    Owner Tick
    Created
    Updated
    + [Add Message] +
    {{stateCode}} {{state}}{{id}}{{recipientInbox}}{{function}} +    {{ownerInstance}} + {{createdTime}}
    {{ttl}} + {{#if hasRelatedMessage}} + {{relatedId}} + {{else}} + {{relatedId}} + {{/if}} + {{senderInbox}}{{payload}}{{ownerTick}}{{updatedTime}}
    + {{#if prev}}Prev{{/if}} + {{#if next}}Next{{/if}} +
    diff --git a/code/services-core/control-service/src/main/resources/templates/control/partials/nav.hdb b/code/services-core/control-service/src/main/resources/templates/control/partials/nav.hdb new file mode 100644 index 00000000..05086051 --- /dev/null +++ b/code/services-core/control-service/src/main/resources/templates/control/partials/nav.hdb @@ -0,0 +1,16 @@ + \ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/partials/processes-table.hdb b/code/services-core/control-service/src/main/resources/templates/control/partials/processes-table.hdb new file mode 100644 index 00000000..50ab8d58 --- /dev/null +++ b/code/services-core/control-service/src/main/resources/templates/control/partials/processes-table.hdb @@ -0,0 +1,41 @@ + +

    Processes

    + + + + + + + + + + {{#each processes}} + + + + + + + + {{/each}} +
    Process IDUUIDStatusProgressLast Seen (ms)
    {{displayName}} +   {{uuid}} + {{status}}{{#if progress}}{{progress}}%{{/if}}{{#unless isStopped}}{{lastSeenMillis}}{{/unless}}
    + +

    Jobs

    + + + + + + + + {{#each jobs}} + + + + + + + {{/each}} +
    Process IDStatusProgressLast Seen (ms)
    {{taskBase}}{{status}}{{#if progress}}{{progress}}%{{/if}} {{stage}}{{#unless isStopped}}{{lastSeenMillis}}{{/unless}}
    \ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/partials/services-table.hdb b/code/services-core/control-service/src/main/resources/templates/control/partials/services-table.hdb new file mode 100644 index 00000000..5da46a83 --- /dev/null +++ b/code/services-core/control-service/src/main/resources/templates/control/partials/services-table.hdb @@ -0,0 +1,18 @@ +

    Services

    + + + + + + + {{#each services}} + + + + + + {{/each}} +
    Service IDUUIDLast Seen (ms)
    {{serviceId}} +    + {{uuid}} + {{lastSeenMillis}}
    \ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/partials/storage-table.hdb b/code/services-core/control-service/src/main/resources/templates/control/partials/storage-table.hdb new file mode 100644 index 00000000..d46aafa8 --- /dev/null +++ b/code/services-core/control-service/src/main/resources/templates/control/partials/storage-table.hdb @@ -0,0 +1,32 @@ + + {{#each storage}} + + + + + + + + + + + + + + + + + + + {{#each storage}} + + + + + + + {{/each}} + {{/each}} +
    TypeNamePathPermit Temp
    {{base.type}}{{base.name}}{{base.path}}{{base.permitTemp}}
    TypePathDescription
    + Info + {{storage.type}}{{storage.path}}{{storage.description}}
    \ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/partials/storage-types.hdb b/code/services-core/control-service/src/main/resources/templates/control/partials/storage-types.hdb new file mode 100644 index 00000000..575797f9 --- /dev/null +++ b/code/services-core/control-service/src/main/resources/templates/control/partials/storage-types.hdb @@ -0,0 +1,6 @@ + \ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/service-by-id.hdb b/code/services-core/control-service/src/main/resources/templates/control/service-by-id.hdb new file mode 100644 index 00000000..f350ac5a --- /dev/null +++ b/code/services-core/control-service/src/main/resources/templates/control/service-by-id.hdb @@ -0,0 +1,22 @@ + + + + Control Service + + + + + {{> control/partials/nav}} +
    +

    Services/{{id}}

    + {{> control/partials/events-table }} + {{> control/partials/message-queue-table }} +
    + + + + \ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/services.hdb b/code/services-core/control-service/src/main/resources/templates/control/services.hdb new file mode 100644 index 00000000..2e73dd92 --- /dev/null +++ b/code/services-core/control-service/src/main/resources/templates/control/services.hdb @@ -0,0 +1,21 @@ + + + + Control Service + + + + + {{> control/partials/nav}} +
    + {{> control/partials/services-table }} + {{> control/partials/events-table }} +
    + + + + \ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/storage-crawls.hdb b/code/services-core/control-service/src/main/resources/templates/control/storage-crawls.hdb new file mode 100644 index 00000000..627072a3 --- /dev/null +++ b/code/services-core/control-service/src/main/resources/templates/control/storage-crawls.hdb @@ -0,0 +1,28 @@ + + + + Control Service + + + + + {{> control/partials/nav}} +
    + {{> control/partials/storage-types}} +

    Crawl Data

    + {{> control/partials/storage-table}} + +

    About

    +

    Crawl data is the content of websites that have been downloaded by the crawler.

    +

    Crawl data can be turned into processed data, and loaded into the index to make + it searchable.

    +
    + + + + + \ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/storage-details.hdb b/code/services-core/control-service/src/main/resources/templates/control/storage-details.hdb new file mode 100644 index 00000000..e1574fd5 --- /dev/null +++ b/code/services-core/control-service/src/main/resources/templates/control/storage-details.hdb @@ -0,0 +1,122 @@ + + + + Control Service + + + + + {{> control/partials/nav}} +
    + {{> control/partials/storage-types}} +

    Storage Details

    + {{#with storage.self.storage}} + + + + + + + + + + + +
    TypePathDetails
    {{type}}{{path}}{{description}}
    + {{/with}} + + {{#if storage.files}} +

    Contents

    + + + + + + + {{#each storage.files}} + + + + + {{/each}} +
    File NameLast ModSize
    + {{filename}} + {{mTime}}{{size}}
    + {{/if}} + +

    Actions

    + + + + + + {{#with storage.self}} + {{#if isCrawlable}} + + + + + + + {{/if}} + {{#if isLoadable}} + + + + + + + {{/if}} + {{#if isConvertible}} + + + + + + + {{/if}} + {{#if isRecrawlable}} + + + + + + + {{/if}} + {{#if isDeletable}} + + + + + + + {{/if}} + {{/with}} +
    DescriptionTrigger
    Perform a full re-crawl of this data
    Load this data into index
    Process and load this data into index
    Perform a re-crawl of this data
    Delete this data
    + {{#if storage.related}} +

    Related

    + + + + + + + {{#each storage.related}} + + + + + + {{/each}} +
    TypePathDetails
    {{type}}{{path}}{{description}}
    + {{/if}} +
    + + + + + \ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/storage-overview.hdb b/code/services-core/control-service/src/main/resources/templates/control/storage-overview.hdb new file mode 100644 index 00000000..f80253c7 --- /dev/null +++ b/code/services-core/control-service/src/main/resources/templates/control/storage-overview.hdb @@ -0,0 +1,52 @@ + + + + Control Service + + + + + {{> control/partials/nav}} +
    + {{> control/partials/storage-types}} +

    Storage

    + + {{#each storage}} + + + + + + + + + + + + + + + + + + + {{#each storage}} + + + + + + + {{/each}} + {{/each}} +
    TypeNamePathPermit Temp
    {{base.type}}{{base.name}}{{base.path}}{{base.permitTemp}}
    TypePathDescription
    + {{storage.type}}{{storage.path}}{{storage.description}}
    +
    + + + + \ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/storage-processed.hdb b/code/services-core/control-service/src/main/resources/templates/control/storage-processed.hdb new file mode 100644 index 00000000..9a0da6c7 --- /dev/null +++ b/code/services-core/control-service/src/main/resources/templates/control/storage-processed.hdb @@ -0,0 +1,26 @@ + + + + Control Service + + + + + {{> control/partials/nav}} +
    + {{> control/partials/storage-types}} +

    Processed Data

    + {{> control/partials/storage-table}} + +

    About

    +

    Processed data is crawl data that has been analyzed, and had its keywords extracted, + and is ready to be loaded into the index.

    +
    + + + + \ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/storage-specs.hdb b/code/services-core/control-service/src/main/resources/templates/control/storage-specs.hdb new file mode 100644 index 00000000..c1e64963 --- /dev/null +++ b/code/services-core/control-service/src/main/resources/templates/control/storage-specs.hdb @@ -0,0 +1,64 @@ + + + + Control Service + + + + + {{> control/partials/nav}} +
    + {{> control/partials/storage-types}} + +

    Crawl Specifications

    + {{> control/partials/storage-table}} + +

    About

    + +

    Crawling specifications are a work order for the crawler, in essence a list of domains that are to be crawled, + combined with a list of known URLs for each domain, and instructions on how deep to crawl. The crawler requires + a specification in order to understand what to do. +

    +

    + A crawling specification can either be generated from the links in the database, or from a list of domains + provided via a URL that links to a text file. +

    +

    Create New Specification

    + +

    To create a new specification fill out the form below.

    +
    +
    +
    +
    +

    (This is how you'll be able to find the + specification later so give it a good and descriptive name)

    + +

    Source

    +
    +
    + +
    + +
    +
    + +
    + + + + \ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/update-message-state.hdb b/code/services-core/control-service/src/main/resources/templates/control/update-message-state.hdb new file mode 100644 index 00000000..7d2a16ee --- /dev/null +++ b/code/services-core/control-service/src/main/resources/templates/control/update-message-state.hdb @@ -0,0 +1,60 @@ + + + +Update ID + +{{> control/partials/nav}} +
    +

    Update Message State

    +

    Update the state of a message in the message queue. This may be useful to prevent an actor +from resuming an action when this is not desirable. Setting an old message to 'NEW' will +erase information about its owner, and inboxes will consider the message new again.

    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    FieldValue
    + +
    +
    +

    Note that while setting a message to NEW or in some instances ACK typically causes an Actor + to act on the message, setting a message in ACK to ERR or DEAD will not stop action, but only + prevent resumption of action. To stop a running actor, use the Actors view and press the toggle.

    +
    + + \ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/view-message.hdb b/code/services-core/control-service/src/main/resources/templates/control/view-message.hdb new file mode 100644 index 00000000..fb52f440 --- /dev/null +++ b/code/services-core/control-service/src/main/resources/templates/control/view-message.hdb @@ -0,0 +1,57 @@ + + + +Message Queue | New Message + +{{> control/partials/nav}} +
    +

    View Message {{id}}

    + {{#with message}} + + + + + + + + + + + + +
    FieldValueAction
    id{{id}}[Copy Message]
    recipientInbox{{recipientInbox}}
    state{{state}}[Edit State]
    senderInbox{{senderInbox}}{{#if senderInbox}}[Reply]{{/if}}
    relatedId + {{#if hasRelatedMessage}} + {{relatedId}} + {{else}} + {{relatedId}} + {{/if}} +
    function{{function}}
    payload + +
    Created{{createdTime}}
    Updated{{updatedTime}}
    + {{/with}} + + {{#if relatedMessages}} +

    Related Messages

    + + + + + + + + + {{#each relatedMessages}} + + + + + + + + {{/each}} +
    IDRecipient InboxSender InboxFunctionState
    {{id}}{{recipientInbox}}{{senderInbox}}{{function}}{{state}}
    + {{/if}} +
    + + + \ No newline at end of file diff --git a/code/services-core/control-service/src/test/java/nu/marginalia/control/svc/ApiKeyServiceTest.java b/code/services-core/control-service/src/test/java/nu/marginalia/control/svc/ApiKeyServiceTest.java new file mode 100644 index 00000000..7bb8536a --- /dev/null +++ b/code/services-core/control-service/src/test/java/nu/marginalia/control/svc/ApiKeyServiceTest.java @@ -0,0 +1,101 @@ +package nu.marginalia.control.svc; + +import com.zaxxer.hikari.HikariConfig; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.control.model.ApiKeyModel; +import org.junit.jupiter.api.*; +import org.junit.jupiter.api.parallel.Execution; +import org.testcontainers.containers.MariaDBContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +import java.sql.SQLException; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.*; +import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD; + +@Testcontainers +@Execution(SAME_THREAD) +@Tag("slow") +public class ApiKeyServiceTest { + @Container + static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") + .withDatabaseName("WMSA_prod") + .withUsername("wmsa") + .withPassword("wmsa") + .withInitScript("db/migration/V23_06_0_006__api_key.sql") + .withNetworkAliases("mariadb"); + + static HikariDataSource dataSource; + @BeforeAll + public static void setup() { + HikariConfig config = new HikariConfig(); + config.setJdbcUrl(mariaDBContainer.getJdbcUrl()); + config.setUsername("wmsa"); + config.setPassword("wmsa"); + + dataSource = new HikariDataSource(config); + } + + @AfterAll + public static void tearDown() { + dataSource.close(); + mariaDBContainer.close(); + } + + @AfterEach + public void cleanDb() { + try (var conn = dataSource.getConnection(); var stmt = conn.createStatement()) { + stmt.executeUpdate("TRUNCATE TABLE EC_API_KEY"); + } catch (SQLException e) { + 
e.printStackTrace(); + } + } + + @Test + void getKeys() { + var apiKeyService = new ApiKeyService(dataSource); + apiKeyService.addApiKey("public domain", "bob dobbs", "bob@dobbstown.com", 30); + apiKeyService.addApiKey("public domain", "connie dobbs", "cdobbs@dobbstown.com", 15); + + var keys = apiKeyService.getApiKeys(); + System.out.println(keys); + assertEquals(2, keys.size()); + } + + @Test + void addApiKey() { + var apiKeyService = new ApiKeyService(dataSource); + apiKeyService.addApiKey("public domain", "bob dobbs", "bob@dobbstown.com", 30); + + var keys = apiKeyService.getApiKeys(); + + System.out.println(keys); + assertEquals(1, keys.size()); + + var key = keys.get(0); + + assertEquals("public domain", key.license()); + assertEquals("bob dobbs", key.name()); + assertEquals("bob@dobbstown.com", key.email()); + assertEquals(30, key.rate()); + assertNotNull(key.licenseKey()); + } + + @Test + void deleteApiKey() { + var apiKeyService = new ApiKeyService(dataSource); + apiKeyService.addApiKey("public domain", "bob dobbs", "bob@dobbstown.com", 30); + + List keys = apiKeyService.getApiKeys(); + + assertEquals(1, keys.size()); + + String licenseKey= keys.get(0).licenseKey(); + apiKeyService.deleteApiKey(licenseKey); + + keys = apiKeyService.getApiKeys(); + assertEquals(0, keys.size()); + } +} \ No newline at end of file diff --git a/code/services-core/index-service/build.gradle b/code/services-core/index-service/build.gradle index 103d736f..4801e722 100644 --- a/code/services-core/index-service/build.gradle +++ b/code/services-core/index-service/build.gradle @@ -23,6 +23,7 @@ java { dependencies { implementation project(':code:common:config') implementation project(':code:common:model') + implementation project(':code:common:db') implementation project(':code:common:service') implementation project(':code:api:index-api') implementation project(':code:common:service-discovery') diff --git 
a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexModule.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexModule.java index 1e674d01..e0a3b2de 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexModule.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexModule.java @@ -2,12 +2,17 @@ package nu.marginalia.index; import com.google.inject.AbstractModule; import com.google.inject.Provides; +import com.google.inject.Singleton; import lombok.SneakyThrows; +import nu.marginalia.db.storage.FileStorageService; +import nu.marginalia.db.storage.model.FileStorageType; import nu.marginalia.index.config.RankingSettings; import nu.marginalia.WmsaHome; import nu.marginalia.lexicon.KeywordLexicon; import nu.marginalia.lexicon.KeywordLexiconReadOnlyView; import nu.marginalia.lexicon.journal.KeywordLexiconJournal; +import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode; +import nu.marginalia.service.control.ServiceEventLog; import java.nio.file.Path; @@ -20,13 +25,19 @@ public class IndexModule extends AbstractModule { @Provides @SneakyThrows - private KeywordLexiconReadOnlyView createLexicon() { - return new KeywordLexiconReadOnlyView( - new KeywordLexicon( - new KeywordLexiconJournal(WmsaHome.getDisk("index-write").resolve("dictionary.dat").toFile() - ) - ) - ); + @Singleton + private KeywordLexiconReadOnlyView createLexicon(ServiceEventLog eventLog, FileStorageService fileStorageService) { + try { + eventLog.logEvent("INDEX-LEXICON-LOAD-BEGIN", ""); + + var area = fileStorageService.getStorageByType(FileStorageType.LEXICON_LIVE); + var path = area.asPath().resolve("dictionary.dat"); + + return new KeywordLexiconReadOnlyView(new KeywordLexicon(new KeywordLexiconJournal(path.toFile(), KeywordLexiconJournalMode.READ_ONLY))); + } + finally { + eventLog.logEvent("INDEX-LEXICON-LOAD-OK", ""); + } } @Provides diff --git 
a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexService.java index 8d4a7984..a0ff5582 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexService.java @@ -2,16 +2,16 @@ package nu.marginalia.index; import com.google.gson.Gson; import com.google.inject.Inject; -import com.google.inject.name.Named; import io.reactivex.rxjava3.schedulers.Schedulers; +import nu.marginalia.index.client.IndexMqEndpoints; import nu.marginalia.index.index.SearchIndex; import nu.marginalia.index.svc.IndexOpsService; import nu.marginalia.index.svc.IndexQueryService; import nu.marginalia.index.svc.IndexSearchSetsService; import nu.marginalia.model.gson.GsonFactory; -import nu.marginalia.service.server.Initialization; -import nu.marginalia.service.server.MetricsServer; -import nu.marginalia.service.server.Service; +import nu.marginalia.service.control.ServiceEventLog; +import nu.marginalia.service.server.*; +import nu.marginalia.service.server.mq.MqRequest; import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -34,28 +34,29 @@ public class IndexService extends Service { private final IndexServicesFactory servicesFactory; private final IndexSearchSetsService searchSetsService; + private final ServiceEventLog eventLog; @Inject - public IndexService(@Named("service-host") String ip, - @Named("service-port") Integer port, - Initialization init, - MetricsServer metricsServer, + public IndexService(BaseServiceParams params, IndexOpsService opsService, IndexQueryService indexQueryService, SearchIndex searchIndex, IndexServicesFactory servicesFactory, - IndexSearchSetsService searchSetsService) + IndexSearchSetsService searchSetsService, + ServiceEventLog eventLog) { - super(ip, port, init, metricsServer); + 
super(params); + this.opsService = opsService; this.searchIndex = searchIndex; this.servicesFactory = servicesFactory; this.searchSetsService = searchSetsService; + this.eventLog = eventLog; final Gson gson = GsonFactory.get(); - this.init = init; + this.init = params.initialization; Spark.post("/search/", indexQueryService::search, gson::toJson); @@ -73,6 +74,38 @@ public class IndexService extends Service { volatile boolean initialized = false; + @MqRequest(endpoint = IndexMqEndpoints.INDEX_RELOAD_LEXICON) + public String reloadLexicon(String message) throws Exception { + + if (!opsService.reloadLexicon()) { + throw new IllegalStateException("Ops lock busy"); + } + + return "ok"; + } + + + @MqRequest(endpoint = IndexMqEndpoints.INDEX_REPARTITION) + public String repartition(String message) { + if (!opsService.repartition()) { + throw new IllegalStateException("Ops lock busy"); + } + return "ok"; + } + + @MqRequest(endpoint = IndexMqEndpoints.INDEX_REINDEX) + public String reindex(String message) throws Exception { + if (!opsService.reindex()) { + throw new IllegalStateException("Ops lock busy"); + } + + return "ok"; + } + @MqRequest(endpoint = IndexMqEndpoints.INDEX_IS_BLOCKED) + public String isBlocked(String message) throws Exception { + return Boolean.valueOf(opsService.isBusy()).toString(); + } + public void initialize() { if (!initialized) { init.waitReady(); @@ -94,9 +127,11 @@ public class IndexService extends Service { } try { + eventLog.logEvent("INDEX-AUTO-CONVERT-BEGIN", ""); logger.info("Auto-converting"); searchSetsService.recalculateAll(); searchIndex.switchIndex(); + eventLog.logEvent("INDEX-AUTO-CONVERT-END", ""); logger.info("Auto-conversion finished!"); } catch (IOException ex) { diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexServicesFactory.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexServicesFactory.java index ec43819a..9e0c2a04 100644 --- 
a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexServicesFactory.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexServicesFactory.java @@ -2,20 +2,19 @@ package nu.marginalia.index; import com.google.inject.Inject; import com.google.inject.Singleton; -import com.google.inject.name.Named; +import nu.marginalia.db.storage.FileStorageService; +import nu.marginalia.db.storage.model.FileStorageType; import nu.marginalia.index.forward.ForwardIndexConverter; import nu.marginalia.index.forward.ForwardIndexReader; import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile; -import nu.marginalia.index.journal.writer.IndexJournalWriter; -import nu.marginalia.index.journal.writer.IndexJournalWriterImpl; import nu.marginalia.index.priority.ReverseIndexPriorityConverter; import nu.marginalia.index.full.ReverseIndexFullConverter; import nu.marginalia.index.priority.ReverseIndexPriorityReader; import nu.marginalia.index.priority.ReverseIndexPriorityParameters; import nu.marginalia.index.full.ReverseIndexFullReader; -import nu.marginalia.lexicon.KeywordLexicon; import nu.marginalia.ranking.DomainRankings; import nu.marginalia.index.index.SearchIndexReader; +import nu.marginalia.service.control.ServiceHeartbeat; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -24,16 +23,20 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.StandardCopyOption; +import java.sql.SQLException; import java.util.concurrent.Callable; import java.util.stream.Stream; @Singleton public class IndexServicesFactory { private final Path tmpFileDir; + private final ServiceHeartbeat heartbeat; + private final Path liveStorage; + private final Path stagingStorage; private final Logger logger = LoggerFactory.getLogger(getClass()); - private final PartitionedDataFile writerIndexFile; + private final Path writerIndexFile; private final PartitionedDataFile fwdIndexDocId; 
private final PartitionedDataFile fwdIndexDocData; @@ -50,28 +53,30 @@ public class IndexServicesFactory { @Inject public IndexServicesFactory( - @Named("tmp-file-dir") Path tmpFileDir, - @Named("partition-root-slow") Path partitionRootSlow, - @Named("partition-root-fast") Path partitionRootFast - ) throws IOException { + ServiceHeartbeat heartbeat, + FileStorageService fileStorageService + ) throws IOException, SQLException { + this.heartbeat = heartbeat; - this.tmpFileDir = tmpFileDir; + liveStorage = fileStorageService.getStorageByType(FileStorageType.INDEX_LIVE).asPath(); + stagingStorage = fileStorageService.getStorageByType(FileStorageType.INDEX_STAGING).asPath(); + tmpFileDir = fileStorageService.getStorageByType(FileStorageType.INDEX_STAGING).asPath().resolve("tmp"); + searchSetsBase = fileStorageService.getStorageByType(FileStorageType.SEARCH_SETS).asPath(); - this.writerIndexFile = new PartitionedDataFile(partitionRootSlow, "page-index.dat"); - - fwdIndexDocId = new PartitionedDataFile(partitionRootFast, "fwd-doc-id.dat"); - fwdIndexDocData = new PartitionedDataFile(partitionRootFast, "fwd-doc-data.dat"); - - revIndexDoc = new PartitionedDataFile(partitionRootFast, "rev-doc.dat"); - revIndexWords = new PartitionedDataFile(partitionRootFast, "rev-words.dat"); - - revPrioIndexDoc = new PartitionedDataFile(partitionRootFast, "rev-prio-doc.dat"); - revPrioIndexWords = new PartitionedDataFile(partitionRootFast, "rev-prio-words.dat"); - - searchSetsBase = partitionRootSlow.resolve("search-sets"); - if (!Files.isDirectory(searchSetsBase)) { - Files.createDirectory(searchSetsBase); + if (!Files.exists(tmpFileDir)) { + Files.createDirectories(tmpFileDir); } + + writerIndexFile = stagingStorage.resolve("page-index.dat"); + + fwdIndexDocId = new PartitionedDataFile(liveStorage, "fwd-doc-id.dat"); + fwdIndexDocData = new PartitionedDataFile(liveStorage, "fwd-doc-data.dat"); + + revIndexDoc = new PartitionedDataFile(liveStorage, "rev-doc.dat"); + revIndexWords = new 
PartitionedDataFile(liveStorage, "rev-words.dat"); + + revPrioIndexDoc = new PartitionedDataFile(liveStorage, "rev-prio-doc.dat"); + revPrioIndexWords = new PartitionedDataFile(liveStorage, "rev-prio-words.dat"); } public Path getSearchSetsBase() { @@ -80,7 +85,7 @@ public class IndexServicesFactory { public boolean isPreconvertedIndexPresent() { return Stream.of( - writerIndexFile.get(LIVE_PART).toPath() + writerIndexFile ).allMatch(Files::exists); } @@ -95,23 +100,34 @@ public class IndexServicesFactory { ).noneMatch(Files::exists); } - public IndexJournalWriter createIndexJournalWriter(KeywordLexicon lexicon) throws IOException { - return new IndexJournalWriterImpl(lexicon, writerIndexFile.get(LIVE_PART).toPath()); + enum ConvertSteps { + FORWARD_INDEX, + FULL_REVERSE_INDEX, + PRIORITY_REVERSE_INDEX, + FINISHED } - public void convertIndex(DomainRankings domainRankings) throws IOException { - convertForwardIndex(domainRankings); - convertFullReverseIndex(domainRankings); - convertPriorityReverseIndex(domainRankings); + try (var hb = heartbeat.createServiceTaskHeartbeat(ConvertSteps.class, "index-conversion")) { + hb.progress(ConvertSteps.FORWARD_INDEX); + convertForwardIndex(domainRankings); + + hb.progress(ConvertSteps.FULL_REVERSE_INDEX); + convertFullReverseIndex(domainRankings); + + hb.progress(ConvertSteps.PRIORITY_REVERSE_INDEX); + convertPriorityReverseIndex(domainRankings); + + hb.progress(ConvertSteps.FINISHED); + } } private void convertFullReverseIndex(DomainRankings domainRankings) throws IOException { - var source = writerIndexFile.get(0).toPath(); + logger.info("Converting full reverse index {}", writerIndexFile); - logger.info("Converting full reverse index {}", source); - - var journalReader = new IndexJournalReaderSingleCompressedFile(source); - var converter = new ReverseIndexFullConverter(tmpFileDir, + var journalReader = new IndexJournalReaderSingleCompressedFile(writerIndexFile); + var converter = new ReverseIndexFullConverter( + heartbeat, 
+ tmpFileDir, journalReader, domainRankings, revIndexWords.get(NEXT_PART).toPath(), @@ -124,14 +140,13 @@ public class IndexServicesFactory { private void convertPriorityReverseIndex(DomainRankings domainRankings) throws IOException { - var source = writerIndexFile.get(0).toPath(); + logger.info("Converting priority reverse index {}", writerIndexFile); - logger.info("Converting priority reverse index {}", source); - - var journalReader = new IndexJournalReaderSingleCompressedFile(source, null, + var journalReader = new IndexJournalReaderSingleCompressedFile(writerIndexFile, null, ReverseIndexPriorityParameters::filterPriorityRecord); - var converter = new ReverseIndexPriorityConverter(tmpFileDir, + var converter = new ReverseIndexPriorityConverter(heartbeat, + tmpFileDir, journalReader, domainRankings, revPrioIndexWords.get(NEXT_PART).toPath(), @@ -144,11 +159,11 @@ public class IndexServicesFactory { private void convertForwardIndex(DomainRankings domainRankings) throws IOException { - var source = writerIndexFile.get(0); - logger.info("Converting forward index data {}", source); + logger.info("Converting forward index data {}", writerIndexFile); - new ForwardIndexConverter(source, + new ForwardIndexConverter(heartbeat, + writerIndexFile.toFile(), fwdIndexDocId.get(NEXT_PART).toPath(), fwdIndexDocData.get(NEXT_PART).toPath(), domainRankings) diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexTablesModule.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexTablesModule.java index c26ca5e3..d4bf43c9 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexTablesModule.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexTablesModule.java @@ -1,20 +1,10 @@ package nu.marginalia.index; import com.google.inject.AbstractModule; -import com.google.inject.name.Names; -import nu.marginalia.WmsaHome; - -import java.nio.file.Path; public class 
IndexTablesModule extends AbstractModule { public void configure() { - bind(Path.class).annotatedWith(Names.named("partition-root-slow")).toInstance(WmsaHome.getDisk("index-write")); - bind(Path.class).annotatedWith(Names.named("partition-root-fast")).toInstance(WmsaHome.getDisk("index-read")); - - bind(Path.class).annotatedWith(Names.named("partition-root-slow-tmp")).toInstance(WmsaHome.getDisk("tmp-slow")); - bind(Path.class).annotatedWith(Names.named("tmp-file-dir")).toInstance(WmsaHome.getDisk("tmp-fast")); - } } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java index c218caab..397c291c 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java @@ -6,6 +6,7 @@ import nu.marginalia.index.IndexServicesFactory; import nu.marginalia.index.query.*; import nu.marginalia.index.query.filter.QueryFilterStepFromPredicate; import nu.marginalia.index.svc.IndexSearchSetsService; +import nu.marginalia.service.control.ServiceEventLog; import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -36,10 +37,15 @@ public class SearchIndex { private final IndexServicesFactory servicesFactory; private final IndexSearchSetsService searchSetsService; + private final ServiceEventLog eventLog; + @Inject - public SearchIndex(@NotNull IndexServicesFactory servicesFactory, IndexSearchSetsService searchSetsService) { + public SearchIndex(@NotNull IndexServicesFactory servicesFactory, + IndexSearchSetsService searchSetsService, + ServiceEventLog eventLog) { this.servicesFactory = servicesFactory; this.searchSetsService = searchSetsService; + this.eventLog = eventLog; } public void init() { @@ -51,7 +57,13 @@ public class SearchIndex { if (indexReader == null) { 
indexReader = servicesFactory.getSearchIndexReader(); + eventLog.logEvent("INDEX-INIT", "Index loaded"); } + else { + eventLog.logEvent("INDEX-INIT", "No index loaded"); + } + + } catch (Exception ex) { logger.error("Uncaught exception", ex); @@ -63,9 +75,12 @@ public class SearchIndex { public boolean switchIndex() throws IOException { + eventLog.logEvent("CONVERT-INDEX-BEGIN", ""); servicesFactory.convertIndex(searchSetsService.getDomainRankings()); + eventLog.logEvent("CONVERT-INDEX-END", ""); System.gc(); + eventLog.logEvent("INDEX-SWITCH-BEGIN", ""); Lock lock = indexReplacementLock.writeLock(); try { lock.lock(); @@ -73,11 +88,15 @@ public class SearchIndex { servicesFactory.switchFilesJob().call(); indexReader = servicesFactory.getSearchIndexReader(); + + eventLog.logEvent("INDEX-SWITCH-OK", ""); } catch (Exception ex) { + eventLog.logEvent("INDEX-SWITCH-ERR", ""); logger.error("Uncaught exception", ex); } finally { + lock.unlock(); } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexOpsService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexOpsService.java index 34ed2927..22e514d8 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexOpsService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexOpsService.java @@ -3,6 +3,7 @@ package nu.marginalia.index.svc; import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.index.index.SearchIndex; +import nu.marginalia.lexicon.KeywordLexiconReadOnlyView; import spark.Request; import spark.Response; import spark.Spark; @@ -18,18 +19,34 @@ public class IndexOpsService { private final SearchIndex index; private final IndexSearchSetsService searchSetService; + private final KeywordLexiconReadOnlyView lexicon; @Inject public IndexOpsService(SearchIndex index, - IndexSearchSetsService searchSetService) { + IndexSearchSetsService searchSetService, 
+ KeywordLexiconReadOnlyView lexicon) { this.index = index; this.searchSetService = searchSetService; + this.lexicon = lexicon; } public boolean isBusy() { return opsLock.isLocked(); } + public boolean repartition() { + return run(searchSetService::recalculateAll); + } + public boolean reindex() throws Exception { + return run(() -> { + return index.switchIndex() && lexicon.suggestReload(); + }).isPresent(); + } + + public boolean reloadLexicon() throws Exception { + return run(lexicon::suggestReload).isPresent(); + } + public Object repartitionEndpoint(Request request, Response response) throws Exception { if (!run(searchSetService::recalculateAll)) { @@ -73,5 +90,6 @@ public class IndexOpsService { } } + } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java index 512c735e..3d886158 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java @@ -2,9 +2,13 @@ package nu.marginalia.index.svc; import com.google.inject.Inject; import com.google.inject.Singleton; +import it.unimi.dsi.fastutil.ints.IntOpenHashSet; import lombok.SneakyThrows; +import nu.marginalia.db.DomainTypes; import nu.marginalia.index.IndexServicesFactory; import nu.marginalia.index.searchset.SearchSet; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.id.EdgeIdList; import nu.marginalia.ranking.ReversePageRank; import nu.marginalia.ranking.StandardPageRank; import nu.marginalia.ranking.accumulator.RankingResultHashMapAccumulator; @@ -17,6 +21,7 @@ import nu.marginalia.index.config.RankingSettings; import nu.marginalia.ranking.DomainRankings; import nu.marginalia.index.client.model.query.SearchSetIdentifier; import nu.marginalia.index.db.DbUpdateRanks; +import 
nu.marginalia.service.control.ServiceHeartbeat; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -25,7 +30,8 @@ import java.io.IOException; @Singleton public class IndexSearchSetsService { private final Logger logger = LoggerFactory.getLogger(getClass()); - private final RankingDomainFetcher rankingDomains; + private final DomainTypes domainTypes; + private final ServiceHeartbeat heartbeat; private final DbUpdateRanks dbUpdateRanks; private final RankingDomainFetcher similarityDomains; private final RankingSettings rankingSettings; @@ -35,19 +41,23 @@ public class IndexSearchSetsService { private volatile RankingSearchSet retroSet; private volatile RankingSearchSet smallWebSet; private volatile RankingSearchSet academiaSet; + private volatile RankingSearchSet blogsSet; private final SearchSet anySet = new SearchSetAny(); // The ranking value of the domains used in sorting the domains private volatile DomainRankings domainRankings = new DomainRankings(); @Inject - public IndexSearchSetsService(RankingDomainFetcher rankingDomains, + public IndexSearchSetsService(DomainTypes domainTypes, + ServiceHeartbeat heartbeat, + RankingDomainFetcher rankingDomains, RankingDomainFetcherForSimilarityData similarityDomains, RankingSettings rankingSettings, IndexServicesFactory servicesFactory, DbUpdateRanks dbUpdateRanks) throws IOException { + this.domainTypes = domainTypes; + this.heartbeat = heartbeat; - this.rankingDomains = rankingDomains; this.dbUpdateRanks = dbUpdateRanks; if (similarityDomains.hasData()) { @@ -64,6 +74,7 @@ public class IndexSearchSetsService { smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, servicesFactory.getSearchSetsBase().resolve("small-web.dat")); academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, servicesFactory.getSearchSetsBase().resolve("academia.dat")); retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, servicesFactory.getSearchSetsBase().resolve("retro.dat")); + blogsSet = new 
RankingSearchSet(SearchSetIdentifier.BLOGS, servicesFactory.getSearchSetsBase().resolve("blogs.dat")); } public DomainRankings getDomainRankings() { @@ -79,14 +90,38 @@ public class IndexSearchSetsService { case RETRO -> retroSet; case ACADEMIA -> academiaSet; case SMALLWEB -> smallWebSet; + case BLOGS -> blogsSet; }; } + enum RepartitionSteps { + UPDATE_ACADEMIA, + UPDATE_RETRO, + UPDATE_SMALL_WEB, + UPDATE_BLOGS, + UPDATE_RANKINGS, + FINISHED + } public void recalculateAll() { - updateAcademiaDomainsSet(); - updateRetroDomainsSet(); - updateSmallWebDomainsSet(); - updateDomainRankings(); + try (var processHeartbeat = heartbeat.createServiceTaskHeartbeat(RepartitionSteps.class, "repartitionAll")) { + + processHeartbeat.progress(RepartitionSteps.UPDATE_ACADEMIA); + updateAcademiaDomainsSet(); + + processHeartbeat.progress(RepartitionSteps.UPDATE_RETRO); + updateRetroDomainsSet(); + + processHeartbeat.progress(RepartitionSteps.UPDATE_SMALL_WEB); + updateSmallWebDomainsSet(); + + processHeartbeat.progress(RepartitionSteps.UPDATE_BLOGS); + updateBlogsSet(); + + processHeartbeat.progress(RepartitionSteps.UPDATE_RANKINGS); + updateDomainRankings(); + + processHeartbeat.progress(RepartitionSteps.FINISHED); + } } private void updateDomainRankings() { @@ -131,6 +166,23 @@ public class IndexSearchSetsService { } } + @SneakyThrows + public void updateBlogsSet() { + EdgeIdList knownDomains = domainTypes.getKnownDomainsByType(DomainTypes.Type.BLOG); + + if (knownDomains.isEmpty()) { + // FIXME: We don't want to reload the entire list every time, but we do want to do it sometimes. Actor maybe? 
+ domainTypes.reloadDomainsList(DomainTypes.Type.BLOG); + knownDomains = domainTypes.getKnownDomainsByType(DomainTypes.Type.BLOG); + } + + synchronized (this) { + blogsSet = new RankingSearchSet(SearchSetIdentifier.BLOGS, blogsSet.source, new IntOpenHashSet(knownDomains.values())); + blogsSet.write(); + } + } + + @SneakyThrows public void updateAcademiaDomainsSet() { var entry = rankingSettings.academia; diff --git a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java index fd92354b..37030b1f 100644 --- a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java +++ b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java @@ -18,6 +18,7 @@ import nu.marginalia.lexicon.KeywordLexicon; import nu.marginalia.model.idx.WordFlags; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.model.idx.WordMetadata; +import nu.marginalia.service.control.ServiceHeartbeat; import nu.marginalia.service.server.Initialization; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Assertions; @@ -49,6 +50,9 @@ public class IndexQueryServiceIntegrationTest { @Inject KeywordLexicon keywordLexicon; + @Inject + ServiceHeartbeat heartbeat; + @Inject IndexJournalWriter indexJournalWriter; diff --git a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTestModule.java b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTestModule.java index a2962027..0801bc77 100644 --- a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTestModule.java +++ 
b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTestModule.java @@ -1,24 +1,33 @@ package nu.marginalia.index.svc; import com.google.inject.AbstractModule; -import com.google.inject.name.Names; -import nu.marginalia.WmsaHome; +import nu.marginalia.db.storage.FileStorageService; +import nu.marginalia.db.storage.model.FileStorage; +import nu.marginalia.db.storage.model.FileStorageType; import nu.marginalia.index.IndexServicesFactory; import nu.marginalia.index.journal.writer.IndexJournalWriter; import nu.marginalia.index.journal.writer.IndexJournalWriterImpl; import nu.marginalia.lexicon.KeywordLexicon; import nu.marginalia.lexicon.KeywordLexiconReadOnlyView; import nu.marginalia.lexicon.journal.KeywordLexiconJournal; +import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode; import nu.marginalia.ranking.DomainRankings; import nu.marginalia.index.svc.searchset.SearchSetAny; import nu.marginalia.index.util.TestUtil; import nu.marginalia.index.client.model.query.SearchSetIdentifier; +import nu.marginalia.service.control.ServiceEventLog; +import nu.marginalia.service.control.ServiceHeartbeat; +import nu.marginalia.service.control.ServiceTaskHeartbeat; +import nu.marginalia.service.id.ServiceId; +import nu.marginalia.service.module.ServiceConfiguration; import org.mockito.Mockito; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; +import java.sql.SQLException; import java.util.Random; +import java.util.UUID; import static org.mockito.Mockito.when; @@ -46,8 +55,23 @@ public class IndexQueryServiceIntegrationTestModule extends AbstractModule { protected void configure() { try { - var servicesFactory = new IndexServicesFactory(Path.of("/tmp"), - slowDir, fastDir + var fileStorageServiceMock = Mockito.mock(FileStorageService.class); + + when(fileStorageServiceMock.getStorageByType(FileStorageType.SEARCH_SETS)).thenReturn(new FileStorage(null, null, null, fastDir.toString(), null)); + 
when(fileStorageServiceMock.getStorageByType(FileStorageType.LEXICON_LIVE)).thenReturn(new FileStorage(null, null, null, fastDir.toString(), null)); + when(fileStorageServiceMock.getStorageByType(FileStorageType.LEXICON_STAGING)).thenReturn(new FileStorage(null, null, null, fastDir.toString(), null)); + when(fileStorageServiceMock.getStorageByType(FileStorageType.INDEX_LIVE)).thenReturn(new FileStorage(null, null, null, fastDir.toString(), null)); + when(fileStorageServiceMock.getStorageByType(FileStorageType.INDEX_STAGING)).thenReturn(new FileStorage(null, null, null, slowDir.toString(), null)); + + var serviceHeartbeat = Mockito.mock(ServiceHeartbeat.class); + // RIP fairies + when(serviceHeartbeat.createServiceTaskHeartbeat(Mockito.any(), Mockito.any())) + .thenReturn(Mockito.mock(ServiceTaskHeartbeat.class)); + bind(ServiceHeartbeat.class).toInstance(serviceHeartbeat); + + var servicesFactory = new IndexServicesFactory( + serviceHeartbeat, + fileStorageServiceMock ); bind(IndexServicesFactory.class).toInstance(servicesFactory); @@ -56,15 +80,29 @@ public class IndexQueryServiceIntegrationTestModule extends AbstractModule { when(setsServiceMock.getDomainRankings()).thenReturn(new DomainRankings()); bind(IndexSearchSetsService.class).toInstance(setsServiceMock); - var keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(slowDir.resolve("dictionary.dat").toFile())); + var keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal( + slowDir.resolve("dictionary.dat").toFile(), + KeywordLexiconJournalMode.READ_WRITE) + ); bind(KeywordLexicon.class).toInstance(keywordLexicon); bind(KeywordLexiconReadOnlyView.class).toInstance(new KeywordLexiconReadOnlyView(keywordLexicon)); - bind(IndexJournalWriter.class).toInstance(servicesFactory.createIndexJournalWriter(keywordLexicon)); + bind(ServiceEventLog.class).toInstance(Mockito.mock(ServiceEventLog.class)); - bind(String.class).annotatedWith(Names.named("service-host")).toInstance("127.0.0.1"); - 
bind(Integer.class).annotatedWith(Names.named("service-port")).toProvider(this::randomPort); - } catch (IOException e) { + + bind(IndexJournalWriter.class).toInstance(new IndexJournalWriterImpl(keywordLexicon, + slowDir.resolve("page-index.dat"))); + + bind(ServiceConfiguration.class).toInstance(new ServiceConfiguration( + ServiceId.Index, + 0, + "127.0.0.1", + randomPort(), + randomPort(), + UUID.randomUUID() + )); + + } catch (IOException | SQLException e) { throw new RuntimeException(e); } diff --git a/code/services-core/readme.md b/code/services-core/readme.md index 1af591d3..8a66f0c5 100644 --- a/code/services-core/readme.md +++ b/code/services-core/readme.md @@ -8,5 +8,8 @@ The cores services constitute the main functionality of the search engine. * The [index-service](index-service/) contains the indexes, it answers questions about which documents contain which terms. +* The [control-service](control-service/) provides an operator's user interface, and is responsible + for orchestrating the various processes of the system. + * The [assistant-service](assistant-service/) helps the search service with spelling suggestions other peripheral functionality. 
\ No newline at end of file diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/SearchService.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/SearchService.java index 66953dde..5fe5751e 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/SearchService.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/SearchService.java @@ -2,17 +2,16 @@ package nu.marginalia.search; import com.google.gson.Gson; import com.google.inject.Inject; -import com.google.inject.name.Named; import lombok.SneakyThrows; import nu.marginalia.WebsiteUrl; import nu.marginalia.client.Context; import nu.marginalia.model.gson.GsonFactory; +import nu.marginalia.search.client.SearchMqEndpoints; +import nu.marginalia.search.db.DbUrlDetailsQuery; import nu.marginalia.search.svc.SearchFrontPageService; import nu.marginalia.search.svc.*; -import nu.marginalia.service.server.Initialization; -import nu.marginalia.service.server.MetricsServer; -import nu.marginalia.service.server.Service; -import nu.marginalia.service.server.StaticResources; +import nu.marginalia.service.server.*; +import nu.marginalia.service.server.mq.MqNotification; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import spark.Request; @@ -25,17 +24,16 @@ import java.nio.charset.StandardCharsets; public class SearchService extends Service { private final WebsiteUrl websiteUrl; + private final DbUrlDetailsQuery dbUrlDetailsQuery; private final StaticResources staticResources; private static final Logger logger = LoggerFactory.getLogger(SearchService.class); @SneakyThrows @Inject - public SearchService(@Named("service-host") String ip, - @Named("service-port") Integer port, - Initialization initialization, - MetricsServer metricsServer, + public SearchService(BaseServiceParams params, WebsiteUrl websiteUrl, + DbUrlDetailsQuery dbUrlDetailsQuery, StaticResources staticResources, SearchFrontPageService 
frontPageService, SearchErrorPageService errorPageService, @@ -44,9 +42,10 @@ public class SearchService extends Service { SearchQueryService searchQueryService, SearchApiQueryService apiQueryService ) { - super(ip, port, initialization, metricsServer); + super(params); this.websiteUrl = websiteUrl; + this.dbUrlDetailsQuery = dbUrlDetailsQuery; this.staticResources = staticResources; Spark.staticFiles.expireTime(600); @@ -77,6 +76,12 @@ public class SearchService extends Service { Spark.awaitInitialization(); } + @MqNotification(endpoint = SearchMqEndpoints.FLUSH_CACHES) + public void flushCaches(String unusedArg) { + logger.info("Flushing caches"); + dbUrlDetailsQuery.clearCaches(); + } + private Object serveStatic(Request request, Response response) { String resource = request.params("resource"); staticResources.serveStatic("search", resource, request, response); diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java index b732cb18..c913d0ce 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java @@ -10,6 +10,7 @@ import java.util.Objects; public enum SearchProfile { DEFAULT("default", SearchSetIdentifier.RETRO), MODERN("modern", SearchSetIdentifier.SMALLWEB), + BLOGOSPHERE("blogosphere", SearchSetIdentifier.BLOGS), CORPO("corpo", SearchSetIdentifier.NONE), YOLO("yolo", SearchSetIdentifier.NONE), VINTAGE("vintage", SearchSetIdentifier.NONE), diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java index b44f2551..2de12536 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java +++ 
b/code/services-core/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java @@ -121,7 +121,8 @@ public class UrlDetails { for (var problem :EnumSet.of( HtmlFeature.JS, - HtmlFeature.TRACKING_INNOCENT, + HtmlFeature.TRACKING, + HtmlFeature.TRACKING_ADTECH, HtmlFeature.AFFILIATE_LINK, HtmlFeature.COOKIES, HtmlFeature.ADVERTISEMENT)) { @@ -156,7 +157,7 @@ public class UrlDetails { return HtmlFeature.hasFeature(features, HtmlFeature.JS); } public boolean isTracking() { - return HtmlFeature.hasFeature(features, HtmlFeature.TRACKING_INNOCENT); + return HtmlFeature.hasFeature(features, HtmlFeature.TRACKING); } public boolean isAffiliate() { return HtmlFeature.hasFeature(features, HtmlFeature.AFFILIATE_LINK); diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/siteinfo/DomainInformationService.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/siteinfo/DomainInformationService.java index 35ce81b7..7863c17b 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/siteinfo/DomainInformationService.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/siteinfo/DomainInformationService.java @@ -115,14 +115,11 @@ public class DomainInformationService { public boolean isBlacklisted(EdgeDomain domain) { try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN_BLACKLIST WHERE URL_DOMAIN=?")) { + try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN_BLACKLIST WHERE URL_DOMAIN IN (?,?)")) { stmt.setString(1, domain.domain); + stmt.setString(2, domain.toString()); var rsp = stmt.executeQuery(); - if (rsp.next()) { - return true; - } else { - return false; - } + return rsp.next(); } } } diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchApiQueryService.java 
b/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchApiQueryService.java index a84cdaee..ad9d3fd6 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchApiQueryService.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchApiQueryService.java @@ -48,6 +48,7 @@ public class SearchApiQueryService { case "1" -> SearchProfile.MODERN; case "2" -> SearchProfile.DEFAULT; case "3" -> SearchProfile.CORPO_CLEAN; + case "blogosphere" -> SearchProfile.BLOGOSPHERE; default -> SearchProfile.CORPO_CLEAN; }; } diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchFlagSiteService.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchFlagSiteService.java index 5eb960a5..33d0165d 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchFlagSiteService.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchFlagSiteService.java @@ -17,6 +17,9 @@ import java.util.Map; import java.util.function.Function; import java.util.stream.Collectors; +/** Service for handling flagging sites. This code has an admin-facing correspondent in + * DomainComplaintService in control-service + */ public class SearchFlagSiteService { private final MustacheRenderer formTemplate; private final HikariDataSource dataSource; @@ -83,9 +86,9 @@ public class SearchFlagSiteService { try (var conn = dataSource.getConnection(); var complaintsStmt = conn.prepareStatement(""" - SELECT CATEGORY, FILE_DATE, REVIEWED, DECISION + SELECT CATEGORY, FILE_DATE, REVIEWED, DECISION FROM DOMAIN_COMPLAINT - WHERE DOMAIN_ID=? + WHERE DOMAIN_ID=? 
"""); var stmt = conn.prepareStatement( """ diff --git a/code/services-core/search-service/src/main/resources/templates/search/parts/search-form.hdb b/code/services-core/search-service/src/main/resources/templates/search/parts/search-form.hdb index fba8f3c7..8d5b4b79 100644 --- a/code/services-core/search-service/src/main/resources/templates/search/parts/search-form.hdb +++ b/code/services-core/search-service/src/main/resources/templates/search/parts/search-form.hdb @@ -9,7 +9,7 @@