mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
(db) Remove EC_URL and EC_PAGE_DATA from mariadb database
This commit is contained in:
parent
28188a6e59
commit
e710e057e2
@ -0,0 +1,3 @@
|
||||
-- Removes the URL-level tables and their view from the database schema.
-- The view is dropped before the tables; presumably it selects from EC_URL
-- (TODO confirm against the view definition).
DROP VIEW EC_URL_VIEW;
|
||||
-- EC_PAGE_DATA is dropped before EC_URL. NOTE(review): EC_PAGE_DATA appears to
-- reference EC_URL through a cascade-delete constraint, which would make this
-- order required -- verify against the schema.
DROP TABLE EC_PAGE_DATA;
|
||||
-- Dropped last, after the view and EC_PAGE_DATA above.
DROP TABLE EC_URL;
|
@ -1,71 +0,0 @@
|
||||
package nu.marginalia.converting;
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import gnu.trove.map.hash.TIntIntHashMap;
|
||||
import nu.marginalia.service.module.DatabaseModule;
|
||||
|
||||
import java.sql.SQLException;
|
||||
|
||||
public class UpdateDomainStatistics {
|
||||
private final HikariDataSource dataSource;
|
||||
|
||||
public UpdateDomainStatistics(HikariDataSource dataSource) {
|
||||
this.dataSource = dataSource;
|
||||
}
|
||||
|
||||
public static void main(String... args) throws SQLException {
|
||||
new UpdateDomainStatistics(new DatabaseModule().provideConnection()).run();
|
||||
}
|
||||
|
||||
public void run() throws SQLException {
|
||||
|
||||
// This looks weird, but it's actually much faster than doing the computations with SQL queries
|
||||
//
|
||||
// ... in part because we can assume the data is immutable and don't mind consuming egregious
|
||||
// resources
|
||||
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.createStatement();
|
||||
var domainInfoQuery = conn.prepareStatement("SELECT DOMAIN_ID, VISITED, STATE='ok' FROM EC_URL");
|
||||
var insertDomainInfo = conn.prepareStatement("INSERT INTO DOMAIN_METADATA(ID,KNOWN_URLS,VISITED_URLS,GOOD_URLS) VALUES (?, ?, ?, ?)")
|
||||
) {
|
||||
|
||||
stmt.executeUpdate("DELETE FROM DOMAIN_METADATA");
|
||||
|
||||
TIntIntHashMap knownUrls = new TIntIntHashMap(1_000_000, 0.75f, 0, 0);
|
||||
TIntIntHashMap visitedUrls = new TIntIntHashMap(1_000_000, 0.75f, 0, 0);
|
||||
TIntIntHashMap goodUrls = new TIntIntHashMap(1_000_000, 0.75f, 0, 0);
|
||||
|
||||
domainInfoQuery.setFetchSize(10_000);
|
||||
var rsp = domainInfoQuery.executeQuery();
|
||||
while (rsp.next()) {
|
||||
int domainId = rsp.getInt(1);
|
||||
boolean visited = rsp.getBoolean(2);
|
||||
boolean stateOk = rsp.getBoolean(3);
|
||||
|
||||
knownUrls.adjustOrPutValue(domainId, 1, 1);
|
||||
if (visited) {
|
||||
visitedUrls.adjustOrPutValue(domainId, 1, 1);
|
||||
if (stateOk) {
|
||||
goodUrls.adjustOrPutValue(domainId, 1, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int i = 0;
|
||||
for (int domainId : knownUrls.keys()) {
|
||||
insertDomainInfo.setInt(1, domainId);
|
||||
insertDomainInfo.setInt(2, knownUrls.get(domainId));
|
||||
insertDomainInfo.setInt(3, visitedUrls.get(domainId));
|
||||
insertDomainInfo.setInt(4, goodUrls.get(domainId));
|
||||
insertDomainInfo.addBatch();
|
||||
if ((++i % 1000) == 0) {
|
||||
insertDomainInfo.executeBatch();
|
||||
}
|
||||
}
|
||||
if ((i % 1000) != 0) {
|
||||
insertDomainInfo.executeBatch();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -26,9 +26,6 @@ public class SqlLoadProcessedDomain {
|
||||
try (var stmt = conn.createStatement()) {
|
||||
stmt.execute("DROP PROCEDURE IF EXISTS INITIALIZE_DOMAIN");
|
||||
|
||||
// Note that there should be no need to delete from EC_PAGE_DATA here as it's done via their
|
||||
// CASCADE DELETE constraint on EC_URL.
|
||||
|
||||
stmt.execute("""
|
||||
CREATE PROCEDURE INITIALIZE_DOMAIN (
|
||||
IN ST ENUM('ACTIVE', 'EXHAUSTED', 'SPECIAL', 'SOCIAL_MEDIA', 'BLOCKED', 'REDIR', 'ERROR', 'UNKNOWN'),
|
||||
@ -38,7 +35,6 @@ public class SqlLoadProcessedDomain {
|
||||
BEGIN
|
||||
DELETE FROM DOMAIN_METADATA WHERE ID=DID;
|
||||
DELETE FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=DID;
|
||||
DELETE FROM EC_URL WHERE DOMAIN_ID=DID;
|
||||
UPDATE EC_DOMAIN SET INDEX_DATE=NOW(), STATE=ST, DOMAIN_ALIAS=NULL, INDEXED=GREATEST(INDEXED,IDX), IP=IP WHERE ID=DID;
|
||||
DELETE FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=DID;
|
||||
END
|
||||
|
@ -62,13 +62,11 @@ public class TruncateLinkDatabase extends AbstractActorPrototype {
|
||||
Truncate the domain and link tables.
|
||||
"""
|
||||
)
|
||||
public void exportBlacklist() throws Exception {
|
||||
public void flushDatabase() throws Exception {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.createStatement())
|
||||
{
|
||||
stmt.executeUpdate("SET FOREIGN_KEY_CHECKS = 0");
|
||||
stmt.executeUpdate("TRUNCATE TABLE EC_PAGE_DATA");
|
||||
stmt.executeUpdate("TRUNCATE TABLE EC_URL");
|
||||
stmt.executeUpdate("TRUNCATE TABLE EC_DOMAIN_LINK");
|
||||
stmt.executeUpdate("TRUNCATE TABLE DOMAIN_METADATA");
|
||||
stmt.executeUpdate("SET FOREIGN_KEY_CHECKS = 1");
|
||||
|
Loading…
Reference in New Issue
Block a user