mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
Use 64 bit path hash for EC_URL
This commit is contained in:
parent
2faaed3393
commit
495e6a1639
@ -1,11 +1,13 @@
|
||||
package nu.marginalia.wmsa.edge.converting.loader;
|
||||
|
||||
import com.google.common.hash.Hashing;
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.sql.SQLException;
|
||||
import java.sql.Types;
|
||||
|
||||
@ -28,7 +30,7 @@ public class SqlLoadUrls {
|
||||
IN DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,
|
||||
IN PORT INT,
|
||||
IN PATH VARCHAR(255),
|
||||
IN PATH_HASH INT
|
||||
IN PATH_HASH BIGINT
|
||||
)
|
||||
BEGIN
|
||||
INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PATH_HASH) SELECT PROTO,ID,PORT,PATH,PATH_HASH FROM EC_DOMAIN WHERE DOMAIN_NAME=DOMAIN;
|
||||
@ -59,7 +61,7 @@ public class SqlLoadUrls {
|
||||
insertCall.setNull(3, Types.INTEGER);
|
||||
}
|
||||
insertCall.setString(4, url.path);
|
||||
insertCall.setInt(5, url.path.hashCode());
|
||||
insertCall.setLong(5, hashPath(url.path));
|
||||
insertCall.addBatch();
|
||||
}
|
||||
var ret = insertCall.executeBatch();
|
||||
@ -91,4 +93,8 @@ public class SqlLoadUrls {
|
||||
logger.warn("SQL error inserting URLs", ex);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Computes the 64-bit hash of a URL path that the batch insert binds as the
 * PATH_HASH argument (parameter 5) of the insert call.
 *
 * <p>The path is encoded as UTF-8 and hashed with Guava's 128-bit Murmur3
 * function; only the {@code long} returned by {@code HashCode.asLong()} is kept,
 * so the result fits the BIGINT PATH_HASH column.
 *
 * @param path the URL path to hash
 * @return the hash of {@code path} as a signed 64-bit value
 */
private long hashPath(String path) {
|
||||
return Hashing.murmur3_128().hashString(path, StandardCharsets.UTF_8).asLong();
|
||||
}
|
||||
}
|
||||
|
@ -50,7 +50,7 @@ CREATE TABLE IF NOT EXISTS EC_URL (
|
||||
PATH VARCHAR(255) NOT NULL COLLATE utf8mb4_bin,
|
||||
PORT INT,
|
||||
|
||||
PATH_HASH INT NOT NULL COMMENT "Hash of PATH for uniqueness check by domain",
|
||||
PATH_HASH BIGINT NOT NULL COMMENT "Hash of PATH for uniqueness check by domain",
|
||||
VISITED BOOLEAN NOT NULL DEFAULT FALSE,
|
||||
|
||||
STATE ENUM('ok', 'redirect', 'dead', 'archived', 'disqualified') NOT NULL DEFAULT 'ok',
|
||||
@ -173,7 +173,6 @@ CREATE TABLE IF NOT EXISTS EC_API_KEY (
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS EC_DOMAIN_INDEXED_INDEX ON EC_DOMAIN (INDEXED);
|
||||
CREATE INDEX IF NOT EXISTS EC_URL_VISITED ON EC_URL (VISITED);
|
||||
CREATE INDEX IF NOT EXISTS EC_DOMAIN_TOP_DOMAIN ON EC_DOMAIN (DOMAIN_TOP);
|
||||
|
||||
---;
|
||||
|
Loading…
Reference in New Issue
Block a user