Use 64-bit path hash for EC_URL

This commit is contained in:
vlofgren 2022-06-08 16:52:46 +02:00
parent 2faaed3393
commit 495e6a1639
2 changed files with 9 additions and 4 deletions

View File

@ -1,11 +1,13 @@
package nu.marginalia.wmsa.edge.converting.loader; package nu.marginalia.wmsa.edge.converting.loader;
import com.google.common.hash.Hashing;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource; import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.EdgeUrl;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.nio.charset.StandardCharsets;
import java.sql.SQLException; import java.sql.SQLException;
import java.sql.Types; import java.sql.Types;
@ -28,7 +30,7 @@ public class SqlLoadUrls {
IN DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci, IN DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,
IN PORT INT, IN PORT INT,
IN PATH VARCHAR(255), IN PATH VARCHAR(255),
IN PATH_HASH INT IN PATH_HASH BIGINT
) )
BEGIN BEGIN
INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PATH_HASH) SELECT PROTO,ID,PORT,PATH,PATH_HASH FROM EC_DOMAIN WHERE DOMAIN_NAME=DOMAIN; INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PATH_HASH) SELECT PROTO,ID,PORT,PATH,PATH_HASH FROM EC_DOMAIN WHERE DOMAIN_NAME=DOMAIN;
@ -59,7 +61,7 @@ public class SqlLoadUrls {
insertCall.setNull(3, Types.INTEGER); insertCall.setNull(3, Types.INTEGER);
} }
insertCall.setString(4, url.path); insertCall.setString(4, url.path);
insertCall.setInt(5, url.path.hashCode()); insertCall.setLong(5, hashPath(url.path));
insertCall.addBatch(); insertCall.addBatch();
} }
var ret = insertCall.executeBatch(); var ret = insertCall.executeBatch();
@ -91,4 +93,8 @@ public class SqlLoadUrls {
logger.warn("SQL error inserting URLs", ex); logger.warn("SQL error inserting URLs", ex);
} }
} }
/**
 * Hashes a URL path into a 64-bit value.
 *
 * <p>The path is encoded as UTF-8 and run through Guava's 128-bit murmur3
 * hash function; the result is then narrowed to a {@code long} with
 * {@code asLong()}.
 *
 * @param path the URL path to hash
 * @return the {@code long} derived from the murmur3_128 hash of {@code path}
 */
private long hashPath(String path) {
    final var murmur = Hashing.murmur3_128();
    final var digest = murmur.hashString(path, StandardCharsets.UTF_8);
    return digest.asLong();
}
} }

View File

@ -50,7 +50,7 @@ CREATE TABLE IF NOT EXISTS EC_URL (
PATH VARCHAR(255) NOT NULL COLLATE utf8mb4_bin, PATH VARCHAR(255) NOT NULL COLLATE utf8mb4_bin,
PORT INT, PORT INT,
PATH_HASH INT NOT NULL COMMENT "Hash of PATH for uniqueness check by domain", PATH_HASH BIGINT NOT NULL COMMENT "Hash of PATH for uniqueness check by domain",
VISITED BOOLEAN NOT NULL DEFAULT FALSE, VISITED BOOLEAN NOT NULL DEFAULT FALSE,
STATE ENUM('ok', 'redirect', 'dead', 'archived', 'disqualified') NOT NULL DEFAULT 'ok', STATE ENUM('ok', 'redirect', 'dead', 'archived', 'disqualified') NOT NULL DEFAULT 'ok',
@ -173,7 +173,6 @@ CREATE TABLE IF NOT EXISTS EC_API_KEY (
); );
CREATE INDEX IF NOT EXISTS EC_DOMAIN_INDEXED_INDEX ON EC_DOMAIN (INDEXED); CREATE INDEX IF NOT EXISTS EC_DOMAIN_INDEXED_INDEX ON EC_DOMAIN (INDEXED);
CREATE INDEX IF NOT EXISTS EC_URL_VISITED ON EC_URL (VISITED);
CREATE INDEX IF NOT EXISTS EC_DOMAIN_TOP_DOMAIN ON EC_DOMAIN (DOMAIN_TOP); CREATE INDEX IF NOT EXISTS EC_DOMAIN_TOP_DOMAIN ON EC_DOMAIN (DOMAIN_TOP);
---; ---;