mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(loader) Don't truncate the entire links table on load
This behavior is an old vestige from the days of only having a single loader process. We'd truncate the links table because doing inserts/updates was too slow. This was also important because we had 32-bit IDs, and there are a lot of links between domains to go around... Instead, we now delete the rows associated with the current node with a stored procedure, PURGE_LINKS_TABLE. We also update the PRIMARY KEY to a BIGINT. We'd need to load the data in excess of a billion times to hit an ID rollover, so it'll be fine.
This commit is contained in:
parent
fd77e62a13
commit
f58a9f46be
@ -0,0 +1,10 @@
|
||||
-- Widen EC_DOMAIN_LINK.ID to an auto-incrementing BIGINT.
-- Per the commit message, rows are now deleted and re-inserted per node instead of
-- the table being truncated, so the wider ID keeps the counter from rolling over.
ALTER TABLE WMSA_prod.EC_DOMAIN_LINK
|
||||
MODIFY COLUMN ID BIGINT NOT NULL AUTO_INCREMENT;
|
||||
|
||||
-- PURGE_LINKS_TABLE(nodeId): deletes the EC_DOMAIN_LINK rows whose source domain
-- (EC_DOMAIN_LINK.SOURCE_DOMAIN_ID joined to EC_DOMAIN.ID) has NODE_AFFINITY = nodeId.
-- Rows whose source domain carries a different NODE_AFFINITY value are not matched
-- by the WHERE clause and are left in place.
--
-- NOTE(review): NODE_AFFINITY is unqualified; presumably it is a column of EC_DOMAIN --
-- confirm against the schema.
-- NOTE(review): the procedure name and EC_DOMAIN_LINK are not schema-qualified here,
-- while EC_DOMAIN is qualified with WMSA_prod; this relies on the default schema of
-- the session that runs the script -- confirm that is intended.
CREATE OR REPLACE PROCEDURE PURGE_LINKS_TABLE (IN nodeId INT)
|
||||
BEGIN
|
||||
DELETE EC_DOMAIN_LINK
|
||||
FROM EC_DOMAIN_LINK INNER JOIN WMSA_prod.EC_DOMAIN
|
||||
ON EC_DOMAIN_LINK.SOURCE_DOMAIN_ID = EC_DOMAIN.ID
|
||||
WHERE NODE_AFFINITY = nodeId;
|
||||
END;
|
@ -3,6 +3,7 @@ package nu.marginalia.loading.links;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.ProcessConfiguration;
|
||||
import nu.marginalia.io.processed.DomainLinkRecordParquetFileReader;
|
||||
import nu.marginalia.io.processed.ProcessedDataFileNames;
|
||||
import nu.marginalia.loading.LoaderInputData;
|
||||
@ -23,10 +24,12 @@ public class DomainLinksLoaderService {
|
||||
|
||||
private final HikariDataSource dataSource;
|
||||
private static final Logger logger = LoggerFactory.getLogger(DomainLinksLoaderService.class);
|
||||
|
||||
private final int nodeId;
|
||||
/**
 * Creates the links loader service.
 *
 * <p>Stores the injected data source and records the node id obtained from
 * {@code processConfiguration.node()}.
 *
 * <p>NOTE(review): this span is a rendered diff hunk without +/- markers; the
 * single-argument signature line appears to be the removed (pre-change) side and the
 * two-argument signature the added side -- confirm against the repository.
 *
 * @param dataSource           connection pool used for database access
 * @param processConfiguration supplies the node id via {@code node()}
 */
@Inject
|
||||
public DomainLinksLoaderService(HikariDataSource dataSource) {
|
||||
public DomainLinksLoaderService(HikariDataSource dataSource,
|
||||
ProcessConfiguration processConfiguration) {
|
||||
this.dataSource = dataSource;
|
||||
this.nodeId = processConfiguration.node();
|
||||
}
|
||||
|
||||
public boolean loadLinks(DomainIdRegistry domainIdRegistry,
|
||||
@ -54,11 +57,12 @@ public class DomainLinksLoaderService {
|
||||
}
|
||||
|
||||
/**
 * Clears link data from EC_DOMAIN_LINK by calling the {@code PURGE_LINKS_TABLE}
 * stored procedure with this service's {@code nodeId} as its only argument.
 *
 * <p>The connection and the callable statement are opened in a try-with-resources
 * block, so both are closed when the call finishes or fails.
 *
 * <p>NOTE(review): this span is a rendered diff hunk without +/- markers; the
 * "Truncating" log line, the {@code createStatement()} line and the
 * {@code TRUNCATE TABLE} statement appear to be the removed (pre-change) side --
 * confirm against the repository.
 *
 * @throws SQLException if obtaining the connection or executing the call fails
 */
private void dropLinkData() throws SQLException {
|
||||
logger.info("Truncating EC_DOMAIN_LINK");
|
||||
logger.info("Clearing EC_DOMAIN_LINK");
|
||||
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.createStatement()) {
|
||||
stmt.executeUpdate("TRUNCATE TABLE EC_DOMAIN_LINK");
|
||||
var call = conn.prepareCall("CALL PURGE_LINKS_TABLE(?)")) {
|
||||
call.setInt(1, nodeId);
|
||||
|
||||
call.executeUpdate();
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -84,7 +88,7 @@ public class DomainLinksLoaderService {
|
||||
|
||||
connection = dataSource.getConnection();
|
||||
insertStatement = connection.prepareStatement("""
|
||||
INSERT INTO EC_DOMAIN_LINK(SOURCE_DOMAIN_ID, DEST_DOMAIN_ID)
|
||||
INSERT IGNORE INTO EC_DOMAIN_LINK(SOURCE_DOMAIN_ID, DEST_DOMAIN_ID)
|
||||
VALUES (?, ?)
|
||||
""");
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user