From b941604135d42526a474e1c90ed46dd63a3fec77 Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Fri, 22 Nov 2024 13:58:57 +0100
Subject: [PATCH] (live-crawler) Alter DbDomainIdRegistry to make inserts if an
 id is missing, as this is apparently a rare scenario we need to deal with.

---
 .../livecrawler/LiveCrawlerMain.java          |  7 ++-
 .../loading/domains/DbDomainIdRegistry.java   | 91 ++++++++++++++++---
 .../domains/DbDomainIdRegistryTest.java       | 51 ++++++++++
 3 files changed, 141 insertions(+), 8 deletions(-)
 create mode 100644 code/processes/loading-process/test/nu/marginalia/loading/domains/DbDomainIdRegistryTest.java

diff --git a/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlerMain.java b/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlerMain.java
index edc90909..d05925bb 100644
--- a/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlerMain.java
+++ b/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlerMain.java
@@ -4,6 +4,7 @@ import com.google.gson.Gson;
 import com.google.inject.Guice;
 import com.google.inject.Inject;
 import com.google.inject.Injector;
+import com.zaxxer.hikari.HikariDataSource;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.api.feeds.FeedsClient;
 import nu.marginalia.converting.ConverterModule;
@@ -56,6 +57,7 @@ public class LiveCrawlerMain extends ProcessMainClass {
     private final FileStorageService fileStorageService;
     private final KeywordLoaderService keywordLoaderService;
     private final DocumentLoaderService documentLoaderService;
+    private final HikariDataSource dataSource;
 
     @Inject
     public LiveCrawlerMain(FeedsClient feedsClient,
@@ -68,7 +70,7 @@ public class LiveCrawlerMain extends ProcessMainClass {
                            DomainProcessor domainProcessor,
                            FileStorageService fileStorageService,
                            KeywordLoaderService keywordLoaderService,
-                           DocumentLoaderService documentLoaderService)
+                           DocumentLoaderService documentLoaderService, HikariDataSource dataSource)
             throws Exception
     {
         super(messageQueueFactory, config, gson, LIVE_CRAWLER_INBOX);
@@ -81,6 +83,7 @@ public class LiveCrawlerMain extends ProcessMainClass {
         this.fileStorageService = fileStorageService;
         this.keywordLoaderService = keywordLoaderService;
         this.documentLoaderService = documentLoaderService;
+        this.dataSource = dataSource;
 
         domainBlacklist.waitUntilLoaded();
     }
@@ -201,7 +204,7 @@ public class LiveCrawlerMain extends ProcessMainClass {
 
             LoaderInputData lid = new LoaderInputData(tempPath, 1);
 
-            DomainIdRegistry domainIdRegistry = new DbDomainIdRegistry(domainQueries);
+            DomainIdRegistry domainIdRegistry = new DbDomainIdRegistry(dataSource);
             keywordLoaderService.loadKeywords(domainIdRegistry, heartbeat, lid);
             documentLoaderService.loadDocuments(domainIdRegistry, heartbeat, lid);
 
diff --git a/code/processes/loading-process/java/nu/marginalia/loading/domains/DbDomainIdRegistry.java b/code/processes/loading-process/java/nu/marginalia/loading/domains/DbDomainIdRegistry.java
--- a/code/processes/loading-process/java/nu/marginalia/loading/domains/DbDomainIdRegistry.java
+++ b/code/processes/loading-process/java/nu/marginalia/loading/domains/DbDomainIdRegistry.java
@@ -1,18 +1,97 @@
 package nu.marginalia.loading.domains;
 
-import nu.marginalia.db.DbDomainQueries;
+import com.zaxxer.hikari.HikariDataSource;
 import nu.marginalia.model.EdgeDomain;
 
-public class DbDomainIdRegistry implements DomainIdRegistry {
-    private final DbDomainQueries dbDomainQueries;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.util.OptionalInt;
 
-    public DbDomainIdRegistry(DbDomainQueries dbDomainQueries) {
-        this.dbDomainQueries = dbDomainQueries;
+/**
+ * {@link DomainIdRegistry} backed by the EC_DOMAIN table. Domains that have
+ * no row yet are inserted on first lookup.
+ */
+public class DbDomainIdRegistry implements DomainIdRegistry {
+    /** Upper bound on lookup-then-insert rounds in getDomainId. */
+    private static final int MAX_ATTEMPTS = 3;
+
+    private final HikariDataSource dataSource;
+
+    public DbDomainIdRegistry(HikariDataSource dataSource) {
+        this.dataSource = dataSource;
     }
 
+    /**
+     * Returns the id of the EC_DOMAIN row for the given domain, inserting
+     * the row first if it does not exist.
+     *
+     * @param domainName the domain name to resolve
+     * @return the value of the ID column for the domain
+     * @throws IllegalStateException if no id was found or generated after
+     *         MAX_ATTEMPTS rounds
+     * @throws RuntimeException if a database call fails
+     */
     @Override
     public int getDomainId(String domainName) {
-        return dbDomainQueries.getDomainId(new EdgeDomain(domainName));
+        // The insert writes domain.toString(), so the lookup uses the same string
+        var domain = new EdgeDomain(domainName);
+
+        try {
+            for (int attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
+                var existingId = findDomainId(domain);
+                if (existingId.isPresent()) {
+                    return existingId.getAsInt();
+                }
+
+                // Insert the domain if it doesn't exist (unlikely)
+                var insertedId = insertDomain(domain);
+                if (insertedId.isPresent()) {
+                    return insertedId.getAsInt();
+                }
+
+                // INSERT IGNORE added no row, e.g. because another thread inserted the
+                // domain first (doubly unlikely); the next round looks the id up again
+            }
+        }
+        catch (SQLException e) {
+            throw new RuntimeException("Failed to resolve domain ID for " + domainName, e);
+        }
+
+        throw new IllegalStateException("No domain ID found or generated for " + domainName);
     }
 
+    /** Looks up the ID of the EC_DOMAIN row for the domain, if one exists. */
+    private OptionalInt findDomainId(EdgeDomain domain) throws SQLException {
+        try (var conn = dataSource.getConnection();
+             var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
+
+            stmt.setString(1, domain.toString());
+
+            try (var rsp = stmt.executeQuery()) {
+                return rsp.next() ? OptionalInt.of(rsp.getInt(1)) : OptionalInt.empty();
+            }
+        }
+    }
+
+    /**
+     * Runs INSERT IGNORE for the domain with node affinity 0.
+     *
+     * @return the generated key, or empty if the statement reported none
+     */
+    private OptionalInt insertDomain(EdgeDomain domain) throws SQLException {
+        try (var conn = dataSource.getConnection();
+             var stmt = conn.prepareStatement("INSERT IGNORE INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) VALUES (?, ?, ?)",
+                     Statement.RETURN_GENERATED_KEYS)) {
+
+            stmt.setString(1, domain.toString());
+            stmt.setString(2, domain.getTopDomain());
+            stmt.setInt(3, 0); // "up for grabs" node affinity
+            stmt.executeUpdate();
+
+            try (var gk = stmt.getGeneratedKeys()) {
+                return gk.next() ? OptionalInt.of(gk.getInt(1)) : OptionalInt.empty();
+            }
+        }
+    }
+
     @Override
diff --git a/code/processes/loading-process/test/nu/marginalia/loading/domains/DbDomainIdRegistryTest.java b/code/processes/loading-process/test/nu/marginalia/loading/domains/DbDomainIdRegistryTest.java
new file mode 100644
--- /dev/null
+++ b/code/processes/loading-process/test/nu/marginalia/loading/domains/DbDomainIdRegistryTest.java
@@ -0,0 +1,51 @@
+package nu.marginalia.loading.domains;
+
+import com.zaxxer.hikari.HikariConfig;
+import com.zaxxer.hikari.HikariDataSource;
+import nu.marginalia.test.TestMigrationLoader;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Tag;
+import org.junit.jupiter.api.Test;
+import org.testcontainers.containers.MariaDBContainer;
+import org.testcontainers.junit.jupiter.Container;
+import org.testcontainers.junit.jupiter.Testcontainers;
+
+@Testcontainers
+@Tag("slow")
+class DbDomainIdRegistryTest {
+
+    @Container
+    static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
+            .withDatabaseName("WMSA_prod")
+            .withUsername("wmsa")
+            .withPassword("wmsa")
+            .withNetworkAliases("mariadb");
+
+    static HikariDataSource dataSource;
+
+    @BeforeAll
+    public static void setup() {
+        HikariConfig config = new HikariConfig();
+        config.setJdbcUrl(mariaDBContainer.getJdbcUrl());
+        config.setUsername("wmsa");
+        config.setPassword("wmsa");
+
+        dataSource = new HikariDataSource(config);
+
+        TestMigrationLoader.flywayMigration(dataSource);
+    }
+
+    @AfterAll
+    public static void tearDown() {
+        dataSource.close();
+    }
+
+    @Test
+    void getDomainId() {
+        // The first call inserts the domain; the second must find the same row
+        Assertions.assertEquals(1, new DbDomainIdRegistry(dataSource).getDomainId("test.com"));
+        Assertions.assertEquals(1, new DbDomainIdRegistry(dataSource).getDomainId("test.com"));
+    }
+}
\ No newline at end of file