(db) Remove EC_URL and EC_PAGE_DATA from mariadb database

This commit is contained in:
Viktor Lofgren 2023-08-25 13:45:03 +02:00
parent 28188a6e59
commit e710e057e2
5 changed files with 4 additions and 78 deletions

View File

@ -0,0 +1,3 @@
-- Remove the EC_URL and EC_PAGE_DATA tables, and the EC_URL_VIEW view, from the MariaDB schema.
-- NOTE(review): the view is dropped first, presumably because it selects from the tables
-- below -- confirm against the view definition.
DROP VIEW EC_URL_VIEW;
-- EC_PAGE_DATA is dropped before EC_URL: it carries a CASCADE DELETE constraint on EC_URL.
DROP TABLE EC_PAGE_DATA;
DROP TABLE EC_URL;

View File

@ -1,71 +0,0 @@
package nu.marginalia.converting;
import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.map.hash.TIntIntHashMap;
import nu.marginalia.service.module.DatabaseModule;
import java.sql.SQLException;
/**
 * Stand-alone job that rebuilds the {@code DOMAIN_METADATA} table from the rows of {@code EC_URL}.
 *
 * <p>For every domain id found in {@code EC_URL} it counts the known URLs, the visited URLs, and
 * the visited URLs whose state is {@code 'ok'}, then replaces the contents of
 * {@code DOMAIN_METADATA} with those counts.
 */
public class UpdateDomainStatistics {
    /** Number of queued inserts after which the pending batch is sent to the database. */
    private static final int INSERT_BATCH_SIZE = 1000;

    private final HikariDataSource dataSource;

    /**
     * @param dataSource connection pool used to read {@code EC_URL} and write {@code DOMAIN_METADATA}
     */
    public UpdateDomainStatistics(HikariDataSource dataSource) {
        this.dataSource = dataSource;
    }

    /**
     * Command-line entry point: obtains a data source from {@link DatabaseModule}, runs the job,
     * and closes the data source afterwards.
     *
     * @param args ignored
     * @throws SQLException if any database operation fails
     */
    public static void main(String... args) throws SQLException {
        // Close the pool once the job is done instead of leaving it open until JVM exit.
        try (var pool = new DatabaseModule().provideConnection()) {
            new UpdateDomainStatistics(pool).run();
        }
    }

    /**
     * Empties {@code DOMAIN_METADATA}, tallies per-domain URL counts from {@code EC_URL} in memory,
     * and inserts one row per domain in batches of {@value #INSERT_BATCH_SIZE}.
     *
     * @throws SQLException if any database operation fails
     */
    public void run() throws SQLException {
        // This looks weird, but it's actually much faster than doing the computations with SQL queries
        //
        // ... in part because we can assume the data is immutable and don't mind consuming egregious
        // resources
        try (var conn = dataSource.getConnection();
             var stmt = conn.createStatement();
             var domainInfoQuery = conn.prepareStatement("SELECT DOMAIN_ID, VISITED, STATE='ok' FROM EC_URL");
             var insertDomainInfo = conn.prepareStatement("INSERT INTO DOMAIN_METADATA(ID,KNOWN_URLS,VISITED_URLS,GOOD_URLS) VALUES (?, ?, ?, ?)")
        ) {
            stmt.executeUpdate("DELETE FROM DOMAIN_METADATA");

            // The last constructor argument is the value returned for absent keys, so a domain
            // with no visited or good URLs reads back as 0 below.
            TIntIntHashMap knownUrls = new TIntIntHashMap(1_000_000, 0.75f, 0, 0);
            TIntIntHashMap visitedUrls = new TIntIntHashMap(1_000_000, 0.75f, 0, 0);
            TIntIntHashMap goodUrls = new TIntIntHashMap(1_000_000, 0.75f, 0, 0);

            domainInfoQuery.setFetchSize(10_000);

            // Scope the result set so it is released as soon as the tally is complete.
            try (var rsp = domainInfoQuery.executeQuery()) {
                while (rsp.next()) {
                    int domainId = rsp.getInt(1);
                    boolean visited = rsp.getBoolean(2);
                    boolean stateOk = rsp.getBoolean(3);

                    knownUrls.adjustOrPutValue(domainId, 1, 1);
                    if (visited) {
                        visitedUrls.adjustOrPutValue(domainId, 1, 1);
                        // A URL only counts as good when it was visited and its state is 'ok'.
                        if (stateOk) {
                            goodUrls.adjustOrPutValue(domainId, 1, 1);
                        }
                    }
                }
            }

            int queued = 0;
            for (int domainId : knownUrls.keys()) {
                insertDomainInfo.setInt(1, domainId);
                insertDomainInfo.setInt(2, knownUrls.get(domainId));
                insertDomainInfo.setInt(3, visitedUrls.get(domainId));
                insertDomainInfo.setInt(4, goodUrls.get(domainId));
                insertDomainInfo.addBatch();

                if ((++queued % INSERT_BATCH_SIZE) == 0) {
                    insertDomainInfo.executeBatch();
                }
            }
            // Flush whatever is left over from the last, partially filled batch.
            if ((queued % INSERT_BATCH_SIZE) != 0) {
                insertDomainInfo.executeBatch();
            }
        }
    }
}

View File

@ -26,9 +26,6 @@ public class SqlLoadProcessedDomain {
try (var stmt = conn.createStatement()) {
stmt.execute("DROP PROCEDURE IF EXISTS INITIALIZE_DOMAIN");
// Note that there should be no need to delete from EC_PAGE_DATA here as it's done via their
// CASCADE DELETE constraint on EC_URL.
stmt.execute("""
CREATE PROCEDURE INITIALIZE_DOMAIN (
IN ST ENUM('ACTIVE', 'EXHAUSTED', 'SPECIAL', 'SOCIAL_MEDIA', 'BLOCKED', 'REDIR', 'ERROR', 'UNKNOWN'),
@ -38,7 +35,6 @@ public class SqlLoadProcessedDomain {
BEGIN
DELETE FROM DOMAIN_METADATA WHERE ID=DID;
DELETE FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=DID;
DELETE FROM EC_URL WHERE DOMAIN_ID=DID;
UPDATE EC_DOMAIN SET INDEX_DATE=NOW(), STATE=ST, DOMAIN_ALIAS=NULL, INDEXED=GREATEST(INDEXED,IDX), IP=IP WHERE ID=DID;
DELETE FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=DID;
END

View File

@ -62,13 +62,11 @@ public class TruncateLinkDatabase extends AbstractActorPrototype {
Truncate the domain and link tables.
"""
)
public void exportBlacklist() throws Exception {
public void flushDatabase() throws Exception {
try (var conn = dataSource.getConnection();
var stmt = conn.createStatement())
{
stmt.executeUpdate("SET FOREIGN_KEY_CHECKS = 0");
stmt.executeUpdate("TRUNCATE TABLE EC_PAGE_DATA");
stmt.executeUpdate("TRUNCATE TABLE EC_URL");
stmt.executeUpdate("TRUNCATE TABLE EC_DOMAIN_LINK");
stmt.executeUpdate("TRUNCATE TABLE DOMAIN_METADATA");
stmt.executeUpdate("SET FOREIGN_KEY_CHECKS = 1");