From 495e6a1639dc070dfe8596caf36442dc9e2c8507 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Wed, 8 Jun 2022 16:52:46 +0200 Subject: [PATCH] Use 64 bit path hash for EC_URL --- .../wmsa/edge/converting/loader/SqlLoadUrls.java | 10 ++++++++-- .../src/main/resources/sql/edge-crawler-cache.sql | 3 +-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java index fd698c82..ba9ae43a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java @@ -1,11 +1,13 @@ package nu.marginalia.wmsa.edge.converting.loader; +import com.google.common.hash.Hashing; import com.google.inject.Inject; import com.zaxxer.hikari.HikariDataSource; import nu.marginalia.wmsa.edge.model.EdgeUrl; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.nio.charset.StandardCharsets; import java.sql.SQLException; import java.sql.Types; @@ -28,7 +30,7 @@ public class SqlLoadUrls { IN DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci, IN PORT INT, IN PATH VARCHAR(255), - IN PATH_HASH INT + IN PATH_HASH BIGINT ) BEGIN INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PATH_HASH) SELECT PROTO,ID,PORT,PATH,PATH_HASH FROM EC_DOMAIN WHERE DOMAIN_NAME=DOMAIN; @@ -59,7 +61,7 @@ public class SqlLoadUrls { insertCall.setNull(3, Types.INTEGER); } insertCall.setString(4, url.path); - insertCall.setInt(5, url.path.hashCode()); + insertCall.setLong(5, hashPath(url.path)); insertCall.addBatch(); } var ret = insertCall.executeBatch(); @@ -91,4 +93,8 @@ public class SqlLoadUrls { logger.warn("SQL error inserting URLs", ex); } } + + private long hashPath(String path) { + return Hashing.murmur3_128().hashString(path, StandardCharsets.UTF_8).asLong(); + } } diff --git a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql index 6c99eccf..2e517ac9 100644 --- a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql +++ b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql @@ -50,7 +50,7 @@ CREATE TABLE IF NOT EXISTS EC_URL ( PATH VARCHAR(255) NOT NULL COLLATE utf8mb4_bin, PORT INT, - PATH_HASH INT NOT NULL COMMENT "Hash of PATH for uniqueness check by domain", + PATH_HASH BIGINT NOT NULL COMMENT "Hash of PATH for uniqueness check by domain", VISITED BOOLEAN NOT NULL DEFAULT FALSE, STATE ENUM('ok', 'redirect', 'dead', 'archived', 'disqualified') NOT NULL DEFAULT 'ok', @@ -173,7 +173,6 @@ CREATE TABLE IF NOT EXISTS EC_API_KEY ( ); CREATE INDEX IF NOT EXISTS EC_DOMAIN_INDEXED_INDEX ON EC_DOMAIN (INDEXED); -CREATE INDEX IF NOT EXISTS EC_URL_VISITED ON EC_URL (VISITED); CREATE INDEX IF NOT EXISTS EC_DOMAIN_TOP_DOMAIN ON EC_DOMAIN (DOMAIN_TOP); ---;