(live-crawler) Alter DbDomainIdRegistry to make inserts if an id is missing, as this is apparently a rare scenario we need to deal with.

This commit is contained in:
Viktor Lofgren 2024-11-22 13:58:57 +01:00
parent 52eb5bc84f
commit b941604135
3 changed files with 92 additions and 8 deletions

View File

@ -4,6 +4,7 @@ import com.google.gson.Gson;
import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.WmsaHome;
import nu.marginalia.api.feeds.FeedsClient;
import nu.marginalia.converting.ConverterModule;
@ -56,6 +57,7 @@ public class LiveCrawlerMain extends ProcessMainClass {
private final FileStorageService fileStorageService;
private final KeywordLoaderService keywordLoaderService;
private final DocumentLoaderService documentLoaderService;
private final HikariDataSource dataSource;
@Inject
public LiveCrawlerMain(FeedsClient feedsClient,
@ -68,7 +70,7 @@ public class LiveCrawlerMain extends ProcessMainClass {
DomainProcessor domainProcessor,
FileStorageService fileStorageService,
KeywordLoaderService keywordLoaderService,
DocumentLoaderService documentLoaderService)
DocumentLoaderService documentLoaderService, HikariDataSource dataSource)
throws Exception
{
super(messageQueueFactory, config, gson, LIVE_CRAWLER_INBOX);
@ -81,6 +83,7 @@ public class LiveCrawlerMain extends ProcessMainClass {
this.fileStorageService = fileStorageService;
this.keywordLoaderService = keywordLoaderService;
this.documentLoaderService = documentLoaderService;
this.dataSource = dataSource;
domainBlacklist.waitUntilLoaded();
}
@ -201,7 +204,7 @@ public class LiveCrawlerMain extends ProcessMainClass {
LoaderInputData lid = new LoaderInputData(tempPath, 1);
DomainIdRegistry domainIdRegistry = new DbDomainIdRegistry(domainQueries);
DomainIdRegistry domainIdRegistry = new DbDomainIdRegistry(dataSource);
keywordLoaderService.loadKeywords(domainIdRegistry, heartbeat, lid);
documentLoaderService.loadDocuments(domainIdRegistry, heartbeat, lid);

View File

@ -1,18 +1,55 @@
package nu.marginalia.loading.domains;
import nu.marginalia.db.DbDomainQueries;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.model.EdgeDomain;
public class DbDomainIdRegistry implements DomainIdRegistry {
private final DbDomainQueries dbDomainQueries;
import java.sql.Statement;
public DbDomainIdRegistry(DbDomainQueries dbDomainQueries) {
this.dbDomainQueries = dbDomainQueries;
public class DbDomainIdRegistry implements DomainIdRegistry {
private final HikariDataSource dataSource;
/**
 * Creates a registry backed by the given data source.
 *
 * @param dataSource data source from which connections are obtained
 *                   whenever a domain ID is resolved
 */
public DbDomainIdRegistry(final HikariDataSource dataSource) {
    this.dataSource = dataSource;
}
/**
 * Resolves the ID of a domain in the EC_DOMAIN table, inserting a new row
 * when no row exists for the name yet.
 *
 * <p>The lookup is attempted first; the insert is only a fallback for the
 * rare case where the domain is missing. If the insert turns out to be a
 * no-op (no generated key is returned, e.g. because another writer created
 * the row in the meantime), the ID is looked up once more instead of
 * recursing, so the method always terminates.
 *
 * @param domainName name of the domain to resolve
 * @return the value of the ID column for the domain
 * @throws RuntimeException if a query or the insert fails
 * @throws IllegalStateException if the insert was ignored and the domain
 *         still cannot be found afterwards
 */
@Override
public int getDomainId(String domainName) {
    Integer knownId = lookupExistingDomainId(domainName);
    if (knownId != null) {
        return knownId;
    }

    // Insert the domain if it doesn't exist (unlikely)
    String insertedName = domainName;
    try (var conn = dataSource.getConnection();
         var stmt = conn.prepareStatement("INSERT IGNORE INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) VALUES (?, ?, ?)",
                 Statement.RETURN_GENERATED_KEYS)) {
        var domain = new EdgeDomain(domainName);
        // Remember the exact value written to DOMAIN_NAME; it is the string form
        // of the EdgeDomain and is not guaranteed to equal the raw input.
        insertedName = domain.toString();

        stmt.setString(1, insertedName);
        stmt.setString(2, domain.getTopDomain());
        stmt.setInt(3, 0); // "up for grabs" node affinity
        stmt.executeUpdate();

        try (var generatedKeys = stmt.getGeneratedKeys()) {
            if (generatedKeys.next()) {
                return generatedKeys.getInt(1);
            }
        }
    }
    catch (Exception e) {
        throw new RuntimeException("Failed to insert domain '" + domainName + "'", e);
    }

    // No generated key: the INSERT IGNORE did not create a row, in the doubly
    // unlikely event that the domain was inserted by another thread. Look the
    // row up under the name we tried to insert, then under the raw input.
    Integer racedId = lookupExistingDomainId(insertedName);
    if (racedId == null && !insertedName.equals(domainName)) {
        racedId = lookupExistingDomainId(domainName);
    }
    if (racedId != null) {
        return racedId;
    }

    throw new IllegalStateException("Domain '" + domainName
            + "' was neither inserted nor found in EC_DOMAIN");
}

/**
 * Looks up the ID of a row in EC_DOMAIN by its DOMAIN_NAME.
 *
 * @param name value to match against DOMAIN_NAME
 * @return the ID of the first matching row, or {@code null} if there is none
 * @throws RuntimeException if the query fails
 */
private Integer lookupExistingDomainId(String name) {
    try (var conn = dataSource.getConnection();
         var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
        stmt.setString(1, name);
        try (var rows = stmt.executeQuery()) {
            return rows.next() ? Integer.valueOf(rows.getInt(1)) : null;
        }
    }
    catch (Exception e) {
        throw new RuntimeException("Failed to query domain ID", e);
    }
}
@Override

View File

@ -0,0 +1,44 @@
package nu.marginalia.loading.domains;
import com.zaxxer.hikari.HikariConfig;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.test.TestMigrationLoader;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.testcontainers.containers.MariaDBContainer;
import org.testcontainers.junit.jupiter.Container;
import org.testcontainers.junit.jupiter.Testcontainers;
/**
 * Integration test for {@link DbDomainIdRegistry} running against a MariaDB
 * instance started through Testcontainers.
 */
@Testcontainers
@Tag("slow")
class DbDomainIdRegistryTest {
    @Container
    static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
            .withDatabaseName("WMSA_prod")
            .withUsername("wmsa")
            .withPassword("wmsa")
            .withNetworkAliases("mariadb");

    static HikariDataSource dataSource;

    /** Opens a connection pool against the container and applies the schema migrations. */
    @BeforeAll
    public static void setup() {
        HikariConfig config = new HikariConfig();
        config.setJdbcUrl(mariaDBContainer.getJdbcUrl());
        config.setUsername("wmsa");
        config.setPassword("wmsa");

        dataSource = new HikariDataSource(config);

        TestMigrationLoader.flywayMigration(dataSource);
    }

    /** Closes the connection pool so no connections are left open after the test class finishes. */
    @AfterAll
    public static void tearDown() {
        if (dataSource != null) {
            dataSource.close();
        }
    }

    @Test
    void getDomainId() {
        // First call finds no row and has to insert the domain.
        // NOTE(review): the expected ID of 1 assumes the migrations leave EC_DOMAIN empty.
        Assertions.assertEquals(1, new DbDomainIdRegistry(dataSource).getDomainId("test.com"));

        // Second call, through a fresh registry instance, must find the existing row
        // rather than insert a duplicate.
        Assertions.assertEquals(1, new DbDomainIdRegistry(dataSource).getDomainId("test.com"));

        // A different domain must not be mapped onto the same ID.
        Assertions.assertNotEquals(1, new DbDomainIdRegistry(dataSource).getDomainId("example.com"));
    }
}