mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(loader) Don't truncate the entire links table on load
This behavior is a vestige from the days of having only a single loader process. We'd truncate the links table because doing inserts/updates was too slow. This was also important because we had 32-bit IDs, and there are a lot of links between domains to go around... Instead, we now delete only the rows associated with the current node, using a stored procedure, PURGE_LINKS_TABLE. We also update the PRIMARY KEY to a BIGINT. We'd need to load the data in excess of a billion times to hit an ID rollover, so it'll be fine.
This commit is contained in:
parent
fd77e62a13
commit
f58a9f46be
@ -0,0 +1,10 @@
|
|||||||
|
ALTER TABLE WMSA_prod.EC_DOMAIN_LINK
|
||||||
|
MODIFY COLUMN ID BIGINT NOT NULL AUTO_INCREMENT;
|
||||||
|
|
||||||
|
CREATE OR REPLACE PROCEDURE PURGE_LINKS_TABLE (IN nodeId INT)
|
||||||
|
BEGIN
|
||||||
|
DELETE EC_DOMAIN_LINK
|
||||||
|
FROM EC_DOMAIN_LINK INNER JOIN WMSA_prod.EC_DOMAIN
|
||||||
|
ON EC_DOMAIN_LINK.SOURCE_DOMAIN_ID = EC_DOMAIN.ID
|
||||||
|
WHERE NODE_AFFINITY = nodeId;
|
||||||
|
END;
|
@ -3,6 +3,7 @@ package nu.marginalia.loading.links;
|
|||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
import com.zaxxer.hikari.HikariDataSource;
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
|
import nu.marginalia.ProcessConfiguration;
|
||||||
import nu.marginalia.io.processed.DomainLinkRecordParquetFileReader;
|
import nu.marginalia.io.processed.DomainLinkRecordParquetFileReader;
|
||||||
import nu.marginalia.io.processed.ProcessedDataFileNames;
|
import nu.marginalia.io.processed.ProcessedDataFileNames;
|
||||||
import nu.marginalia.loading.LoaderInputData;
|
import nu.marginalia.loading.LoaderInputData;
|
||||||
@ -23,10 +24,12 @@ public class DomainLinksLoaderService {
|
|||||||
|
|
||||||
private final HikariDataSource dataSource;
|
private final HikariDataSource dataSource;
|
||||||
private static final Logger logger = LoggerFactory.getLogger(DomainLinksLoaderService.class);
|
private static final Logger logger = LoggerFactory.getLogger(DomainLinksLoaderService.class);
|
||||||
|
private final int nodeId;
|
||||||
@Inject
|
@Inject
|
||||||
public DomainLinksLoaderService(HikariDataSource dataSource) {
|
public DomainLinksLoaderService(HikariDataSource dataSource,
|
||||||
|
ProcessConfiguration processConfiguration) {
|
||||||
this.dataSource = dataSource;
|
this.dataSource = dataSource;
|
||||||
|
this.nodeId = processConfiguration.node();
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean loadLinks(DomainIdRegistry domainIdRegistry,
|
public boolean loadLinks(DomainIdRegistry domainIdRegistry,
|
||||||
@ -54,11 +57,12 @@ public class DomainLinksLoaderService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private void dropLinkData() throws SQLException {
|
private void dropLinkData() throws SQLException {
|
||||||
logger.info("Truncating EC_DOMAIN_LINK");
|
logger.info("Clearing EC_DOMAIN_LINK");
|
||||||
|
|
||||||
try (var conn = dataSource.getConnection();
|
try (var conn = dataSource.getConnection();
|
||||||
var stmt = conn.createStatement()) {
|
var call = conn.prepareCall("CALL PURGE_LINKS_TABLE(?)")) {
|
||||||
stmt.executeUpdate("TRUNCATE TABLE EC_DOMAIN_LINK");
|
call.setInt(1, nodeId);
|
||||||
|
call.executeUpdate();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -84,7 +88,7 @@ public class DomainLinksLoaderService {
|
|||||||
|
|
||||||
connection = dataSource.getConnection();
|
connection = dataSource.getConnection();
|
||||||
insertStatement = connection.prepareStatement("""
|
insertStatement = connection.prepareStatement("""
|
||||||
INSERT INTO EC_DOMAIN_LINK(SOURCE_DOMAIN_ID, DEST_DOMAIN_ID)
|
INSERT IGNORE INTO EC_DOMAIN_LINK(SOURCE_DOMAIN_ID, DEST_DOMAIN_ID)
|
||||||
VALUES (?, ?)
|
VALUES (?, ?)
|
||||||
""");
|
""");
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user