Use 64-bit path hash for EC_URL

This commit is contained in:
vlofgren 2022-06-08 16:52:46 +02:00
parent 2faaed3393
commit 495e6a1639
2 changed files with 9 additions and 4 deletions

View File

@ -1,11 +1,13 @@
package nu.marginalia.wmsa.edge.converting.loader;
import com.google.common.hash.Hashing;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.nio.charset.StandardCharsets;
import java.sql.SQLException;
import java.sql.Types;
@ -28,7 +30,7 @@ public class SqlLoadUrls {
IN DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,
IN PORT INT,
IN PATH VARCHAR(255),
IN PATH_HASH INT
IN PATH_HASH BIGINT
)
BEGIN
INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PATH_HASH) SELECT PROTO,ID,PORT,PATH,PATH_HASH FROM EC_DOMAIN WHERE DOMAIN_NAME=DOMAIN;
@ -59,7 +61,7 @@ public class SqlLoadUrls {
insertCall.setNull(3, Types.INTEGER);
}
insertCall.setString(4, url.path);
insertCall.setInt(5, url.path.hashCode());
insertCall.setLong(5, hashPath(url.path));
insertCall.addBatch();
}
var ret = insertCall.executeBatch();
@ -91,4 +93,8 @@ public class SqlLoadUrls {
logger.warn("SQL error inserting URLs", ex);
}
}
/**
 * Computes the 64-bit hash of a URL path that is bound to the
 * {@code PATH_HASH} parameter of the URL insert call.
 *
 * <p>The path is encoded as UTF-8 and run through Guava's 128-bit Murmur3
 * hash function; the result is then narrowed to a {@code long} via
 * {@code HashCode.asLong()}.
 *
 * @param path the URL path to hash
 * @return the {@code long} view of the Murmur3 hash of {@code path}
 */
private long hashPath(String path) {
    var murmurHash = Hashing.murmur3_128().hashString(path, StandardCharsets.UTF_8);
    return murmurHash.asLong();
}
}

View File

@ -50,7 +50,7 @@ CREATE TABLE IF NOT EXISTS EC_URL (
PATH VARCHAR(255) NOT NULL COLLATE utf8mb4_bin,
PORT INT,
PATH_HASH INT NOT NULL COMMENT "Hash of PATH for uniqueness check by domain",
PATH_HASH BIGINT NOT NULL COMMENT "Hash of PATH for uniqueness check by domain",
VISITED BOOLEAN NOT NULL DEFAULT FALSE,
STATE ENUM('ok', 'redirect', 'dead', 'archived', 'disqualified') NOT NULL DEFAULT 'ok',
@ -173,7 +173,6 @@ CREATE TABLE IF NOT EXISTS EC_API_KEY (
);
CREATE INDEX IF NOT EXISTS EC_DOMAIN_INDEXED_INDEX ON EC_DOMAIN (INDEXED);
CREATE INDEX IF NOT EXISTS EC_URL_VISITED ON EC_URL (VISITED);
CREATE INDEX IF NOT EXISTS EC_DOMAIN_TOP_DOMAIN ON EC_DOMAIN (DOMAIN_TOP);
---;