mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
Use 64 bit path hash for EC_URL
This commit is contained in:
parent
2faaed3393
commit
495e6a1639
@ -1,11 +1,13 @@
|
|||||||
package nu.marginalia.wmsa.edge.converting.loader;
|
package nu.marginalia.wmsa.edge.converting.loader;
|
||||||
|
|
||||||
|
import com.google.common.hash.Hashing;
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.zaxxer.hikari.HikariDataSource;
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.sql.SQLException;
|
import java.sql.SQLException;
|
||||||
import java.sql.Types;
|
import java.sql.Types;
|
||||||
|
|
||||||
@ -28,7 +30,7 @@ public class SqlLoadUrls {
|
|||||||
IN DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,
|
IN DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,
|
||||||
IN PORT INT,
|
IN PORT INT,
|
||||||
IN PATH VARCHAR(255),
|
IN PATH VARCHAR(255),
|
||||||
IN PATH_HASH INT
|
IN PATH_HASH BIGINT
|
||||||
)
|
)
|
||||||
BEGIN
|
BEGIN
|
||||||
INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PATH_HASH) SELECT PROTO,ID,PORT,PATH,PATH_HASH FROM EC_DOMAIN WHERE DOMAIN_NAME=DOMAIN;
|
INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PATH_HASH) SELECT PROTO,ID,PORT,PATH,PATH_HASH FROM EC_DOMAIN WHERE DOMAIN_NAME=DOMAIN;
|
||||||
@ -59,7 +61,7 @@ public class SqlLoadUrls {
|
|||||||
insertCall.setNull(3, Types.INTEGER);
|
insertCall.setNull(3, Types.INTEGER);
|
||||||
}
|
}
|
||||||
insertCall.setString(4, url.path);
|
insertCall.setString(4, url.path);
|
||||||
insertCall.setInt(5, url.path.hashCode());
|
insertCall.setLong(5, hashPath(url.path));
|
||||||
insertCall.addBatch();
|
insertCall.addBatch();
|
||||||
}
|
}
|
||||||
var ret = insertCall.executeBatch();
|
var ret = insertCall.executeBatch();
|
||||||
@ -91,4 +93,8 @@ public class SqlLoadUrls {
|
|||||||
logger.warn("SQL error inserting URLs", ex);
|
logger.warn("SQL error inserting URLs", ex);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Computes the hash of a URL path that is bound as the fifth parameter
 * (the BIGINT PATH_HASH argument) of the URL insert call.
 *
 * <p>The path is encoded as UTF-8, hashed with Guava's {@code murmur3_128}
 * hash function, and the resulting hash code is returned through
 * {@code asLong()}, giving a 64-bit value in place of the 32-bit
 * {@code String.hashCode()} that the insert call used previously.
 *
 * @param path the URL path to hash
 * @return the hash of {@code path} as a {@code long}
 */
private long hashPath(String path) {
|
||||||
|
return Hashing.murmur3_128().hashString(path, StandardCharsets.UTF_8).asLong();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -50,7 +50,7 @@ CREATE TABLE IF NOT EXISTS EC_URL (
|
|||||||
PATH VARCHAR(255) NOT NULL COLLATE utf8mb4_bin,
|
PATH VARCHAR(255) NOT NULL COLLATE utf8mb4_bin,
|
||||||
PORT INT,
|
PORT INT,
|
||||||
|
|
||||||
PATH_HASH INT NOT NULL COMMENT "Hash of PATH for uniqueness check by domain",
|
PATH_HASH BIGINT NOT NULL COMMENT "Hash of PATH for uniqueness check by domain",
|
||||||
VISITED BOOLEAN NOT NULL DEFAULT FALSE,
|
VISITED BOOLEAN NOT NULL DEFAULT FALSE,
|
||||||
|
|
||||||
STATE ENUM('ok', 'redirect', 'dead', 'archived', 'disqualified') NOT NULL DEFAULT 'ok',
|
STATE ENUM('ok', 'redirect', 'dead', 'archived', 'disqualified') NOT NULL DEFAULT 'ok',
|
||||||
@ -173,7 +173,6 @@ CREATE TABLE IF NOT EXISTS EC_API_KEY (
|
|||||||
);
|
);
|
||||||
|
|
||||||
CREATE INDEX IF NOT EXISTS EC_DOMAIN_INDEXED_INDEX ON EC_DOMAIN (INDEXED);
|
CREATE INDEX IF NOT EXISTS EC_DOMAIN_INDEXED_INDEX ON EC_DOMAIN (INDEXED);
|
||||||
CREATE INDEX IF NOT EXISTS EC_URL_VISITED ON EC_URL (VISITED);
|
|
||||||
CREATE INDEX IF NOT EXISTS EC_DOMAIN_TOP_DOMAIN ON EC_DOMAIN (DOMAIN_TOP);
|
CREATE INDEX IF NOT EXISTS EC_DOMAIN_TOP_DOMAIN ON EC_DOMAIN (DOMAIN_TOP);
|
||||||
|
|
||||||
---;
|
---;
|
||||||
|
Loading…
Reference in New Issue
Block a user