mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
(live-crawler) Alter DbDomainIdRegistry to make inserts if an id is missing, as this is apparently a rare scenario we need to deal with.
This commit is contained in:
parent
52eb5bc84f
commit
b941604135
@ -4,6 +4,7 @@ import com.google.gson.Gson;
|
||||
import com.google.inject.Guice;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Injector;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.api.feeds.FeedsClient;
|
||||
import nu.marginalia.converting.ConverterModule;
|
||||
@ -56,6 +57,7 @@ public class LiveCrawlerMain extends ProcessMainClass {
|
||||
private final FileStorageService fileStorageService;
|
||||
private final KeywordLoaderService keywordLoaderService;
|
||||
private final DocumentLoaderService documentLoaderService;
|
||||
private final HikariDataSource dataSource;
|
||||
|
||||
@Inject
|
||||
public LiveCrawlerMain(FeedsClient feedsClient,
|
||||
@ -68,7 +70,7 @@ public class LiveCrawlerMain extends ProcessMainClass {
|
||||
DomainProcessor domainProcessor,
|
||||
FileStorageService fileStorageService,
|
||||
KeywordLoaderService keywordLoaderService,
|
||||
DocumentLoaderService documentLoaderService)
|
||||
DocumentLoaderService documentLoaderService, HikariDataSource dataSource)
|
||||
throws Exception
|
||||
{
|
||||
super(messageQueueFactory, config, gson, LIVE_CRAWLER_INBOX);
|
||||
@ -81,6 +83,7 @@ public class LiveCrawlerMain extends ProcessMainClass {
|
||||
this.fileStorageService = fileStorageService;
|
||||
this.keywordLoaderService = keywordLoaderService;
|
||||
this.documentLoaderService = documentLoaderService;
|
||||
this.dataSource = dataSource;
|
||||
|
||||
domainBlacklist.waitUntilLoaded();
|
||||
}
|
||||
@ -201,7 +204,7 @@ public class LiveCrawlerMain extends ProcessMainClass {
|
||||
|
||||
LoaderInputData lid = new LoaderInputData(tempPath, 1);
|
||||
|
||||
DomainIdRegistry domainIdRegistry = new DbDomainIdRegistry(domainQueries);
|
||||
DomainIdRegistry domainIdRegistry = new DbDomainIdRegistry(dataSource);
|
||||
|
||||
keywordLoaderService.loadKeywords(domainIdRegistry, heartbeat, lid);
|
||||
documentLoaderService.loadDocuments(domainIdRegistry, heartbeat, lid);
|
||||
|
@ -1,18 +1,55 @@
|
||||
package nu.marginalia.loading.domains;
|
||||
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
|
||||
public class DbDomainIdRegistry implements DomainIdRegistry {
|
||||
private final DbDomainQueries dbDomainQueries;
|
||||
import java.sql.Statement;
|
||||
|
||||
public DbDomainIdRegistry(DbDomainQueries dbDomainQueries) {
|
||||
this.dbDomainQueries = dbDomainQueries;
|
||||
public class DbDomainIdRegistry implements DomainIdRegistry {
|
||||
private final HikariDataSource dataSource;
|
||||
|
||||
/**
 * Creates a registry backed by the given connection pool.
 *
 * @param dataSource pool that {@code getDomainId} borrows its database connections from
 */
public DbDomainIdRegistry(HikariDataSource dataSource) {
    this.dataSource = dataSource;
}
|
||||
|
||||
@Override
|
||||
public int getDomainId(String domainName) {
|
||||
return dbDomainQueries.getDomainId(new EdgeDomain(domainName));
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
|
||||
|
||||
stmt.setString(1, domainName);
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return rsp.getInt(1);
|
||||
}
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new RuntimeException("Failed to query domain ID", e);
|
||||
}
|
||||
|
||||
// Insert the domain if it doesn't exist (unlikely)
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("INSERT IGNORE INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) VALUES (?, ?, ?)",
|
||||
Statement.RETURN_GENERATED_KEYS)) {
|
||||
|
||||
var domain = new EdgeDomain(domainName);
|
||||
|
||||
stmt.setString(1, domain.toString());
|
||||
stmt.setString(2, domain.getTopDomain());
|
||||
stmt.setInt(3, 0); // "up for grabs" node affinity
|
||||
stmt.executeUpdate();
|
||||
|
||||
var gk = stmt.getGeneratedKeys();
|
||||
if (gk.next()) {
|
||||
return gk.getInt(1);
|
||||
}
|
||||
else {
|
||||
// recurse in the doubly unlikely event that the domain was inserted by another thread
|
||||
return getDomainId(domainName);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -0,0 +1,44 @@
|
||||
package nu.marginalia.loading.domains;
|
||||
|
||||
import com.zaxxer.hikari.HikariConfig;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.test.TestMigrationLoader;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.testcontainers.containers.MariaDBContainer;
|
||||
import org.testcontainers.junit.jupiter.Container;
|
||||
import org.testcontainers.junit.jupiter.Testcontainers;
|
||||
|
||||
@Testcontainers
|
||||
@Tag("slow")
|
||||
class DbDomainIdRegistryTest {
|
||||
|
||||
@Container
|
||||
static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
|
||||
.withDatabaseName("WMSA_prod")
|
||||
.withUsername("wmsa")
|
||||
.withPassword("wmsa")
|
||||
.withNetworkAliases("mariadb");
|
||||
|
||||
static HikariDataSource dataSource;
|
||||
|
||||
@BeforeAll
|
||||
public static void setup() {
|
||||
HikariConfig config = new HikariConfig();
|
||||
config.setJdbcUrl(mariaDBContainer.getJdbcUrl());
|
||||
config.setUsername("wmsa");
|
||||
config.setPassword("wmsa");
|
||||
|
||||
dataSource = new HikariDataSource(config);
|
||||
|
||||
TestMigrationLoader.flywayMigration(dataSource);
|
||||
}
|
||||
|
||||
@Test
|
||||
void getDomainId() {
|
||||
Assertions.assertEquals(1, new DbDomainIdRegistry(dataSource).getDomainId("test.com"));
|
||||
Assertions.assertEquals(1, new DbDomainIdRegistry(dataSource).getDomainId("test.com"));
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user