From e710e057e2d2419903a45c0cc3bb1bab260276b8 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 25 Aug 2023 13:45:03 +0200 Subject: [PATCH] (db) Remove EC_URL and EC_PAGE_DATA from mariadb database --- ...l => V23_09_0_000__filestorage_livedb.sql} | 0 .../V23_09_1_000__filestorage_livedb.sql | 3 + .../converting/UpdateDomainStatistics.java | 71 ------------------- .../loader/SqlLoadProcessedDomain.java | 4 -- .../actor/task/TruncateLinkDatabase.java | 4 +- 5 files changed, 4 insertions(+), 78 deletions(-) rename code/common/db/src/main/resources/db/migration/{V23_07_1_000__filestorage_livedb.sql => V23_09_0_000__filestorage_livedb.sql} (100%) create mode 100644 code/common/db/src/main/resources/db/migration/V23_09_1_000__filestorage_livedb.sql delete mode 100644 code/processes/converting-process/src/main/java/nu/marginalia/converting/UpdateDomainStatistics.java diff --git a/code/common/db/src/main/resources/db/migration/V23_07_1_000__filestorage_livedb.sql b/code/common/db/src/main/resources/db/migration/V23_09_0_000__filestorage_livedb.sql similarity index 100% rename from code/common/db/src/main/resources/db/migration/V23_07_1_000__filestorage_livedb.sql rename to code/common/db/src/main/resources/db/migration/V23_09_0_000__filestorage_livedb.sql diff --git a/code/common/db/src/main/resources/db/migration/V23_09_1_000__filestorage_livedb.sql b/code/common/db/src/main/resources/db/migration/V23_09_1_000__filestorage_livedb.sql new file mode 100644 index 00000000..ad3775ec --- /dev/null +++ b/code/common/db/src/main/resources/db/migration/V23_09_1_000__filestorage_livedb.sql @@ -0,0 +1,3 @@ +DROP VIEW EC_URL_VIEW; +DROP TABLE EC_PAGE_DATA; +DROP TABLE EC_URL; \ No newline at end of file diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/UpdateDomainStatistics.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/UpdateDomainStatistics.java deleted file mode 100644 index a59c7426..00000000 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/UpdateDomainStatistics.java +++ /dev/null @@ -1,71 +0,0 @@ -package nu.marginalia.converting; - -import com.zaxxer.hikari.HikariDataSource; -import gnu.trove.map.hash.TIntIntHashMap; -import nu.marginalia.service.module.DatabaseModule; - -import java.sql.SQLException; - -public class UpdateDomainStatistics { - private final HikariDataSource dataSource; - - public UpdateDomainStatistics(HikariDataSource dataSource) { - this.dataSource = dataSource; - } - - public static void main(String... args) throws SQLException { - new UpdateDomainStatistics(new DatabaseModule().provideConnection()).run(); - } - - public void run() throws SQLException { - - // This looks weird, but it's actually much faster than doing the computations with SQL queries - // - // ... in part because we can assume the data is immutable and don't mind consuming egregious - // resources - - try (var conn = dataSource.getConnection(); - var stmt = conn.createStatement(); - var domainInfoQuery = conn.prepareStatement("SELECT DOMAIN_ID, VISITED, STATE='ok' FROM EC_URL"); - var insertDomainInfo = conn.prepareStatement("INSERT INTO DOMAIN_METADATA(ID,KNOWN_URLS,VISITED_URLS,GOOD_URLS) VALUES (?, ?, ?, ?)") - ) { - - stmt.executeUpdate("DELETE FROM DOMAIN_METADATA"); - - TIntIntHashMap knownUrls = new TIntIntHashMap(1_000_000, 0.75f, 0, 0); - TIntIntHashMap visitedUrls = new TIntIntHashMap(1_000_000, 0.75f, 0, 0); - TIntIntHashMap goodUrls = new TIntIntHashMap(1_000_000, 0.75f, 0, 0); - - domainInfoQuery.setFetchSize(10_000); - var rsp = domainInfoQuery.executeQuery(); - while (rsp.next()) { - int domainId = rsp.getInt(1); - boolean visited = rsp.getBoolean(2); - boolean stateOk = rsp.getBoolean(3); - - knownUrls.adjustOrPutValue(domainId, 1, 1); - if (visited) { - visitedUrls.adjustOrPutValue(domainId, 1, 1); - if (stateOk) { - goodUrls.adjustOrPutValue(domainId, 1, 1); - } - } - } - - int i = 0; - for (int domainId : knownUrls.keys()) { - insertDomainInfo.setInt(1, domainId); - insertDomainInfo.setInt(2, knownUrls.get(domainId)); - insertDomainInfo.setInt(3, visitedUrls.get(domainId)); - insertDomainInfo.setInt(4, goodUrls.get(domainId)); - insertDomainInfo.addBatch(); - if ((++i % 1000) == 0) { - insertDomainInfo.executeBatch(); - } - } - if ((i % 1000) != 0) { - insertDomainInfo.executeBatch(); - } - } - } -} diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDomain.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDomain.java index 17a423ed..9bf94816 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDomain.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDomain.java @@ -26,9 +26,6 @@ public class SqlLoadProcessedDomain { try (var stmt = conn.createStatement()) { stmt.execute("DROP PROCEDURE IF EXISTS INITIALIZE_DOMAIN"); - // Note that there should be no need to delete from EC_PAGE_DATA here as it's done via their - // CASCADE DELETE constraint on EC_URL. - stmt.execute(""" CREATE PROCEDURE INITIALIZE_DOMAIN ( IN ST ENUM('ACTIVE', 'EXHAUSTED', 'SPECIAL', 'SOCIAL_MEDIA', 'BLOCKED', 'REDIR', 'ERROR', 'UNKNOWN'), @@ -38,7 +35,6 @@ public class SqlLoadProcessedDomain { BEGIN DELETE FROM DOMAIN_METADATA WHERE ID=DID; DELETE FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=DID; - DELETE FROM EC_URL WHERE DOMAIN_ID=DID; UPDATE EC_DOMAIN SET INDEX_DATE=NOW(), STATE=ST, DOMAIN_ALIAS=NULL, INDEXED=GREATEST(INDEXED,IDX), IP=IP WHERE ID=DID; DELETE FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=DID; END diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/TruncateLinkDatabase.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/TruncateLinkDatabase.java index f44545b9..70dd06a3 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/TruncateLinkDatabase.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/TruncateLinkDatabase.java @@ -62,13 +62,11 @@ public class TruncateLinkDatabase extends AbstractActorPrototype { Truncate the domain and link tables. """ ) - public void exportBlacklist() throws Exception { + public void flushDatabase() throws Exception { try (var conn = dataSource.getConnection(); var stmt = conn.createStatement()) { stmt.executeUpdate("SET FOREIGN_KEY_CHECKS = 0"); - stmt.executeUpdate("TRUNCATE TABLE EC_PAGE_DATA"); - stmt.executeUpdate("TRUNCATE TABLE EC_URL"); stmt.executeUpdate("TRUNCATE TABLE EC_DOMAIN_LINK"); stmt.executeUpdate("TRUNCATE TABLE DOMAIN_METADATA"); stmt.executeUpdate("SET FOREIGN_KEY_CHECKS = 1");