mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
WIP: Database refactoring
This commit is contained in:
parent
0e65384781
commit
c915664fcc
@ -1,49 +0,0 @@
|
||||
package nu.marginalia.util.ranking;
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import gnu.trove.list.TIntList;
|
||||
import gnu.trove.list.array.TIntArrayList;
|
||||
import gnu.trove.map.hash.TIntIntHashMap;
|
||||
import it.unimi.dsi.fastutil.ints.IntArrays;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.sql.SQLException;
|
||||
|
||||
public class AcademiaRank {
|
||||
private final TIntArrayList result;
|
||||
private static final Logger logger = LoggerFactory.getLogger(AcademiaRank.class);
|
||||
|
||||
public AcademiaRank(HikariDataSource ds, String... origins) throws IOException {
|
||||
|
||||
TIntList rankingResults = new BetterStandardPageRank(ds, origins).pageRank(100_000);
|
||||
TIntIntHashMap idToRanking = new TIntIntHashMap(100_000, 0.5f, -1, 1_000_000_000);
|
||||
|
||||
for (int i = 0; i < rankingResults.size(); i++) {
|
||||
idToRanking.put(rankingResults.get(i), i);
|
||||
}
|
||||
|
||||
result = new TIntArrayList(10000);
|
||||
try (var conn = ds.getConnection();
|
||||
var stmt = conn.prepareStatement("select EC_DOMAIN.ID,COUNT(SOURCE_DOMAIN_ID) AS CNT from EC_DOMAIN INNER JOIN DOMAIN_METADATA ON DOMAIN_METADATA.ID=EC_DOMAIN.ID INNER JOIN EC_DOMAIN_LINK ON EC_DOMAIN_LINK.DEST_DOMAIN_ID=EC_DOMAIN.ID WHERE INDEXED>0 AND STATE>=0 AND STATE<2 AND ((VISITED_URLS>1000+1500*RANK AND RANK<1) OR (GOOD_URLS>1000 AND URL_PART LIKE '%edu')) GROUP BY EC_DOMAIN.ID HAVING CNT<1500 ORDER BY RANK ASC")) {
|
||||
|
||||
stmt.setFetchSize(1000);
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
result.add(rsp.getInt(1));
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.error("SQL error", ex);
|
||||
}
|
||||
|
||||
int[] internalArray = result.toArray();
|
||||
IntArrays.quickSort(internalArray, (a,b) -> idToRanking.get(a) - idToRanking.get(b));
|
||||
result.set(0, internalArray);
|
||||
}
|
||||
|
||||
public TIntArrayList getResult() {
|
||||
return result;
|
||||
}
|
||||
}
|
@ -72,10 +72,10 @@ public abstract class RankingAlgorithm {
|
||||
|
||||
String s;
|
||||
if (getNames) {
|
||||
s = "SELECT EC_DOMAIN.ID,URL_PART,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
|
||||
s = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
|
||||
}
|
||||
else {
|
||||
s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
|
||||
s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
|
||||
}
|
||||
try (var stmt = conn.prepareStatement(s)) {
|
||||
stmt.setFetchSize(10000);
|
||||
@ -84,7 +84,7 @@ public abstract class RankingAlgorithm {
|
||||
int id = rsp.getInt(1);
|
||||
if (!spamDomains.contains(id)) {
|
||||
|
||||
domainsById.put(id, new DomainData(id, rsp.getString(2), rsp.getInt(3), rsp.getInt(4), rsp.getInt(5), false));
|
||||
domainsById.put(id, new DomainData(id, rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5), false));
|
||||
|
||||
domainIndexToId.put(domainIndexToId.size(), id);
|
||||
domainIdToIndex.put(id, domainIdToIndex.size());
|
||||
@ -125,7 +125,7 @@ public abstract class RankingAlgorithm {
|
||||
}
|
||||
}
|
||||
|
||||
try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART LIKE ?")) {
|
||||
try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME LIKE ?")) {
|
||||
for (var seed : this.originDomains) {
|
||||
stmt.setString(1, seed);
|
||||
var rsp = stmt.executeQuery();
|
||||
@ -159,10 +159,10 @@ public abstract class RankingAlgorithm {
|
||||
try (var conn = dataSource.getConnection()) {
|
||||
String s;
|
||||
if (getNames) {
|
||||
s = "SELECT EC_DOMAIN.ID,URL_PART,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
|
||||
s = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
|
||||
}
|
||||
else {
|
||||
s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
|
||||
s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
|
||||
}
|
||||
try (var stmt = conn.prepareStatement(s)) {
|
||||
stmt.setFetchSize(10000);
|
||||
@ -172,7 +172,7 @@ public abstract class RankingAlgorithm {
|
||||
int id = rsp.getInt(1);
|
||||
|
||||
if (!spamDomains.contains(id)) {
|
||||
domainsById.put(id, new DomainData(id, rsp.getString(2), rsp.getInt(3), rsp.getInt(4), rsp.getInt(5), true));
|
||||
domainsById.put(id, new DomainData(id, rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5), true));
|
||||
|
||||
domainIndexToId.put(domainIndexToId.size(), id);
|
||||
domainIdToIndex.put(id, domainIdToIndex.size());
|
||||
@ -451,7 +451,7 @@ public abstract class RankingAlgorithm {
|
||||
public final int id;
|
||||
public final String name;
|
||||
private int alias;
|
||||
private int state;
|
||||
private EdgeDomainIndexingState state;
|
||||
public final int knownUrls;
|
||||
public boolean peripheral;
|
||||
|
||||
@ -465,11 +465,11 @@ public abstract class RankingAlgorithm {
|
||||
}
|
||||
|
||||
public boolean isSpecial() {
|
||||
return EdgeDomainIndexingState.SPECIAL.code == state;
|
||||
return EdgeDomainIndexingState.SPECIAL == state;
|
||||
}
|
||||
|
||||
public boolean isSocialMedia() {
|
||||
return EdgeDomainIndexingState.SOCIAL_MEDIA.code == state;
|
||||
return EdgeDomainIndexingState.SOCIAL_MEDIA == state;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -66,7 +66,7 @@ public class OldReversePageRankV2 {
|
||||
originDomains.add("memex.marginalia.nu");
|
||||
|
||||
try (var conn = dataSource.getConnection()) {
|
||||
try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE FROM EC_DOMAIN WHERE INDEXED>1 AND STATE>=0 AND QUALITY_RAW>=-10")) {
|
||||
try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE FROM EC_DOMAIN WHERE INDEXED>1 AND IS_ALIVE")) {
|
||||
stmt.setFetchSize(10000);
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
@ -90,7 +90,7 @@ public class OldReversePageRankV2 {
|
||||
}
|
||||
}
|
||||
|
||||
try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART=?")) {
|
||||
try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
|
||||
stmt.setFetchSize(10000);
|
||||
|
||||
for (var seed : this.originDomains) {
|
||||
|
@ -48,7 +48,7 @@ public class StandardPageRank {
|
||||
originDomains.addAll(Arrays.asList(origins));
|
||||
|
||||
try (var conn = dataSource.getConnection()) {
|
||||
try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE,URL_PART FROM EC_DOMAIN WHERE INDEXED>1 AND STATE>=0 AND QUALITY>=-10")) {
|
||||
try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE,DOMAIN_NAME FROM EC_DOMAIN WHERE INDEXED>1 AND IS_ALIVE AND QUALITY>=-10")) {
|
||||
stmt.setFetchSize(10000);
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
@ -78,7 +78,7 @@ public class StandardPageRank {
|
||||
}
|
||||
}
|
||||
|
||||
try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART=?")) {
|
||||
try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
|
||||
for (var seed : this.originDomains) {
|
||||
stmt.setString(1, seed);
|
||||
var rsp = stmt.executeQuery();
|
||||
|
@ -50,7 +50,7 @@ public class DedupTool {
|
||||
Map<Integer, Map<Integer, List<Data>>> domainToHashToUrl = new HashMap<>();
|
||||
|
||||
try (var conn = ds.getConnection();
|
||||
var fetchStmt = conn.prepareStatement("SELECT URL_TOP_DOMAIN_ID,DATA_HASH,URL,EC_URL.ID,EC_DOMAIN.URL_PART FROM EC_URL INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE DATA_HASH IS NOT NULL");
|
||||
var fetchStmt = conn.prepareStatement("SELECT URL_TOP_DOMAIN_ID,DATA_HASH,URL,EC_URL.ID,EC_DOMAIN.DOMAIN_NAME FROM EC_URL INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE DATA_HASH IS NOT NULL");
|
||||
var updateStmt = conn.prepareStatement("UPDATE EC_URL SET STATE='redirect' WHERE ID=?")
|
||||
|
||||
) {
|
||||
|
@ -112,10 +112,10 @@ public class PerusePageRankV2 {
|
||||
try (var conn = dataSource.getConnection()) {
|
||||
String s;
|
||||
if (getNames) {
|
||||
s = "SELECT EC_DOMAIN.ID,URL_PART,DOMAIN_ALIAS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 GROUP BY EC_DOMAIN.ID";
|
||||
s = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) GROUP BY EC_DOMAIN.ID";
|
||||
}
|
||||
else {
|
||||
s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS FROM EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 GROUP BY EC_DOMAIN.ID";
|
||||
s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS FROM EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) GROUP BY EC_DOMAIN.ID";
|
||||
}
|
||||
try (var stmt = conn.prepareStatement(s)) {
|
||||
stmt.setFetchSize(10000);
|
||||
|
@ -1,30 +0,0 @@
|
||||
package nu.marginalia.util.ranking.tool;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.ranking.AcademiaRank;
|
||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||
import org.mariadb.jdbc.Driver;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
public class TestAcademiaRankTool {
|
||||
|
||||
@SneakyThrows
|
||||
public static void main(String... args) {
|
||||
Driver driver = new Driver();
|
||||
var conn = new DatabaseModule().provideConnection();
|
||||
|
||||
var rank = new AcademiaRank(new DatabaseModule().provideConnection(), "www.perseus.tufts.edu", "xroads.virginia.edu");
|
||||
var res = rank.getResult();
|
||||
|
||||
try (var c = conn.getConnection(); var stmt = c.prepareStatement("SELECT URL_PART FROM EC_DOMAIN WHERE ID=?")) {
|
||||
for (int i = 0; i < Math.min(res.size(), 100); i++) {
|
||||
stmt.setInt(1, res.getQuick(i));
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next())
|
||||
System.out.println(rsp.getString(1));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -83,11 +83,6 @@ public class UpdateDomainRanksTool {
|
||||
}
|
||||
}
|
||||
|
||||
logger.info("Recalculating quality");
|
||||
try (var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET QUALITY=-5*RANK+IF(RANK=1,RANK*GREATEST(QUALITY_RAW,QUALITY_ORIGINAL)/2, 0)")) {
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
|
||||
} catch (SQLException | InterruptedException throwables) {
|
||||
throwables.printStackTrace();
|
||||
}
|
||||
|
@ -94,9 +94,6 @@ public class UpdateDomainRanksTool2 {
|
||||
}
|
||||
|
||||
logger.info("Recalculating quality");
|
||||
try (var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET QUALITY=-5*RANK+IF(RANK=1,RANK*GREATEST(QUALITY_RAW,QUALITY_ORIGINAL)/2, 0)")) {
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
|
||||
} catch (SQLException | InterruptedException throwables) {
|
||||
throwables.printStackTrace();
|
||||
|
@ -29,7 +29,7 @@ public class ReindexTriggerMain {
|
||||
.build();
|
||||
|
||||
try (var ds = db.provideConnection(); var conn = ds.getConnection(); var stmt = conn.createStatement()) {
|
||||
var rs = stmt.executeQuery("SELECT ID, URL_PART, STATE, INDEXED FROM EC_DOMAIN LIMIT 100");
|
||||
var rs = stmt.executeQuery("SELECT ID, DOMAIN_NAME, STATE, INDEXED FROM EC_DOMAIN LIMIT 100");
|
||||
while (rs.next()) {
|
||||
System.out.printf("%d %s %s %d\n",
|
||||
rs.getInt(1),
|
||||
@ -38,7 +38,7 @@ public class ReindexTriggerMain {
|
||||
rs.getInt(4));
|
||||
}
|
||||
|
||||
rs = stmt.executeQuery("SELECT ID, DOMAIN_ID, URL, VISITED, STATE FROM EC_URL LIMIT 100");
|
||||
rs = stmt.executeQuery("SELECT ID, DOMAIN_ID, PATH, VISITED, STATE FROM EC_URL LIMIT 100");
|
||||
while (rs.next()) {
|
||||
System.out.printf("%d %d %s %d %s\n",
|
||||
rs.getInt(1),
|
||||
|
@ -14,7 +14,7 @@ public interface Interpreter {
|
||||
void loadRssFeed(EdgeUrl[] rssFeed);
|
||||
void loadDomainLink(DomainLink[] links);
|
||||
|
||||
void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, double quality);
|
||||
void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip);
|
||||
void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument);
|
||||
void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError);
|
||||
|
||||
|
@ -6,11 +6,11 @@ import nu.marginalia.wmsa.edge.converting.interpreter.Interpreter;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
||||
|
||||
public record LoadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, double quality) implements Instruction {
|
||||
public record LoadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) implements Instruction {
|
||||
|
||||
@Override
|
||||
public void apply(Interpreter interpreter) {
|
||||
interpreter.loadProcessedDomain(domain, state, quality);
|
||||
interpreter.loadProcessedDomain(domain, state, ip);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -76,9 +76,9 @@ public class Loader implements Interpreter {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, double quality) {
|
||||
logger.debug("loadProcessedDomain({}, {}, {})", domain, state, quality);
|
||||
sqlLoadProcessedDomain.load(data, domain, state, quality);
|
||||
public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) {
|
||||
logger.debug("loadProcessedDomain({}, {}, {})", domain, state, ip);
|
||||
sqlLoadProcessedDomain.load(data, domain, state, ip);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -30,7 +30,7 @@ public class SqlLoadDomainLinks {
|
||||
INSERT IGNORE INTO EC_DOMAIN_LINK (SOURCE_DOMAIN_ID, DEST_DOMAIN_ID)
|
||||
SELECT SOURCE.ID,DEST.ID
|
||||
FROM EC_DOMAIN SOURCE INNER JOIN EC_DOMAIN DEST
|
||||
ON SOURCE.URL_PART=FROM_DOMAIN AND DEST.URL_PART=TO_DOMAIN;
|
||||
ON SOURCE.DOMAIN_NAME=FROM_DOMAIN AND DEST.DOMAIN_NAME=TO_DOMAIN;
|
||||
END
|
||||
""");
|
||||
}
|
||||
@ -61,8 +61,8 @@ public class SqlLoadDomainLinks {
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (SQLException sql) {
|
||||
sql.printStackTrace();
|
||||
catch (SQLException ex) {
|
||||
logger.warn("SQL error inserting domain links", ex);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -25,15 +25,9 @@ public class SqlLoadDomains {
|
||||
stmt.execute("""
|
||||
CREATE PROCEDURE INSERT_DOMAIN (
|
||||
IN DOMAIN_NAME VARCHAR(255),
|
||||
IN SUB_DOMAIN VARCHAR(255),
|
||||
IN TOP_DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci)
|
||||
BEGIN
|
||||
INSERT IGNORE INTO EC_TOP_DOMAIN (URL_PART) VALUES (TOP_DOMAIN);
|
||||
|
||||
INSERT IGNORE INTO EC_DOMAIN(URL_PART, URL_SUBDOMAIN, URL_TOP_DOMAIN_ID)
|
||||
SELECT DOMAIN_NAME,SUB_DOMAIN,ID
|
||||
FROM EC_TOP_DOMAIN
|
||||
WHERE EC_TOP_DOMAIN.URL_PART=TOP_DOMAIN;
|
||||
INSERT IGNORE INTO EC_DOMAIN(DOMAIN_NAME, DOMAIN_TOP) VALUES (DOMAIN_NAME, TOP_DOMAIN);
|
||||
END
|
||||
""");
|
||||
}
|
||||
@ -46,10 +40,9 @@ public class SqlLoadDomains {
|
||||
public void load(LoaderData data, EdgeDomain domain) {
|
||||
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?,?)")) {
|
||||
try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?)")) {
|
||||
insertCall.setString(1, domain.toString());
|
||||
insertCall.setString(2, domain.subDomain);
|
||||
insertCall.setString(3, domain.domain);
|
||||
insertCall.setString(2, domain.domain);
|
||||
insertCall.addBatch();
|
||||
|
||||
var ret = insertCall.executeUpdate();
|
||||
@ -57,12 +50,11 @@ public class SqlLoadDomains {
|
||||
logger.warn("load({}) -- bad row count {}", domain, ret);
|
||||
}
|
||||
|
||||
connection.commit();
|
||||
findIdForTargetDomain(connection, data);
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
ex.printStackTrace();
|
||||
logger.warn("SQL error inserting domain", ex);
|
||||
}
|
||||
|
||||
|
||||
@ -73,12 +65,11 @@ public class SqlLoadDomains {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
connection.setAutoCommit(false);
|
||||
|
||||
try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?,?)")) {
|
||||
try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?)")) {
|
||||
|
||||
for (var domain : domains) {
|
||||
insertCall.setString(1, domain.toString());
|
||||
insertCall.setString(2, domain.subDomain);
|
||||
insertCall.setString(3, domain.domain);
|
||||
insertCall.setString(2, domain.domain);
|
||||
insertCall.addBatch();
|
||||
}
|
||||
var ret = insertCall.executeBatch();
|
||||
@ -95,7 +86,7 @@ public class SqlLoadDomains {
|
||||
findIdForTargetDomain(connection, data);
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
ex.printStackTrace();
|
||||
logger.warn("SQL error inserting domains", ex);
|
||||
}
|
||||
}
|
||||
|
||||
@ -104,7 +95,7 @@ public class SqlLoadDomains {
|
||||
return;
|
||||
}
|
||||
|
||||
try (var query = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART=?"))
|
||||
try (var query = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?"))
|
||||
{
|
||||
|
||||
var targetDomain = data.getTargetDomain();
|
||||
@ -118,7 +109,7 @@ public class SqlLoadDomains {
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
ex.printStackTrace();
|
||||
logger.warn("SQL error finding id for domain", ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -31,14 +31,13 @@ public class SqlLoadProcessedDocument {
|
||||
IN TITLE VARCHAR(255),
|
||||
IN DESCRIPTION VARCHAR(255),
|
||||
IN LENGTH INT,
|
||||
IN QUALITY_MEASURE DOUBLE,
|
||||
IN FEATURES INT,
|
||||
IN STANDARD VARCHAR(32),
|
||||
IN HASH INT)
|
||||
BEGIN
|
||||
SET FOREIGN_KEY_CHECKS=0;
|
||||
REPLACE INTO EC_PAGE_DATA(ID, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES) VALUES (URL_ID, TITLE, DESCRIPTION, LENGTH, STANDARD, FEATURES);
|
||||
UPDATE EC_URL SET VISITED=1, STATE=STATE, QUALITY_MEASURE=QUALITY_MEASURE, DATA_HASH=HASH WHERE ID=URL_ID;
|
||||
REPLACE INTO EC_PAGE_DATA(ID, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH) VALUES (URL_ID, TITLE, DESCRIPTION, LENGTH, STANDARD, FEATURES, HASH);
|
||||
UPDATE EC_URL SET VISITED=1, STATE=STATE WHERE ID=URL_ID;
|
||||
SET FOREIGN_KEY_CHECKS=1;
|
||||
END
|
||||
""");
|
||||
@ -47,7 +46,7 @@ public class SqlLoadProcessedDocument {
|
||||
IN URL_ID INT,
|
||||
IN STATE VARCHAR(32))
|
||||
BEGIN
|
||||
UPDATE EC_URL SET VISITED=1, STATE=STATE, QUALITY_MEASURE=-100, DATA_HASH=NULL WHERE ID=URL_ID;
|
||||
UPDATE EC_URL SET VISITED=1, STATE=STATE WHERE ID=URL_ID;
|
||||
END
|
||||
""");
|
||||
|
||||
@ -60,7 +59,8 @@ public class SqlLoadProcessedDocument {
|
||||
|
||||
public void load(LoaderData data, List<LoadProcessedDocument> documents) {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareCall("CALL INSERT_PAGE_VISIT(?, ?, ?, ?, ?, ?, ?, ?, ?)")) {
|
||||
var stmt = conn.prepareCall("CALL INSERT_PAGE_VISIT(?, ?, ?, ?, ?, ?, ?, ?)")) {
|
||||
conn.setAutoCommit(false);
|
||||
|
||||
for (var doc : documents) {
|
||||
int urlId = data.getUrlId(doc.url());
|
||||
@ -74,10 +74,9 @@ public class SqlLoadProcessedDocument {
|
||||
stmt.setString(3, doc.title());
|
||||
stmt.setString(4, doc.description());
|
||||
stmt.setInt(5, doc.length());
|
||||
stmt.setDouble(6, doc.quality());
|
||||
stmt.setInt(7, doc.htmlFeatures());
|
||||
stmt.setString(8, doc.standard().name());
|
||||
stmt.setInt(9, (int) doc.hash());
|
||||
stmt.setInt(6, doc.htmlFeatures());
|
||||
stmt.setString(7, doc.standard().name());
|
||||
stmt.setInt(8, (int) doc.hash());
|
||||
stmt.addBatch();
|
||||
}
|
||||
var ret = stmt.executeBatch();
|
||||
@ -89,8 +88,8 @@ public class SqlLoadProcessedDocument {
|
||||
}
|
||||
|
||||
conn.commit();
|
||||
} catch (SQLException e) {
|
||||
e.printStackTrace();
|
||||
} catch (SQLException ex) {
|
||||
logger.warn("SQL error inserting document", ex);
|
||||
}
|
||||
|
||||
|
||||
@ -117,8 +116,8 @@ public class SqlLoadProcessedDocument {
|
||||
logger.warn("load({}) -- bad row count {}", documents.get(rv), ret[rv]);
|
||||
}
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
e.printStackTrace();
|
||||
} catch (SQLException ex) {
|
||||
logger.warn("SQL error inserting failed document", ex);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -25,12 +25,12 @@ public class SqlLoadProcessedDomain {
|
||||
stmt.execute("DROP PROCEDURE IF EXISTS INITIALIZE_DOMAIN");
|
||||
stmt.execute("""
|
||||
CREATE PROCEDURE INITIALIZE_DOMAIN (
|
||||
IN ST INT,
|
||||
IN ST ENUM('ACTIVE', 'EXHAUSTED', 'SPECIAL', 'SOCIAL_MEDIA', 'BLOCKED', 'REDIR', 'ERROR', 'UNKNOWN'),
|
||||
IN IDX INT,
|
||||
IN QUAL DOUBLE,
|
||||
IN DID INT)
|
||||
IN DID INT,
|
||||
IN IP VARCHAR(32))
|
||||
BEGIN
|
||||
UPDATE EC_DOMAIN SET INDEX_DATE=NOW(), STATE=ST, DOMAIN_ALIAS=NULL, INDEXED=GREATEST(INDEXED,IDX), QUALITY=QUAL, QUALITY_RAW=QUAL, QUALITY_ORIGINAL=QUAL WHERE ID=DID;
|
||||
UPDATE EC_DOMAIN SET INDEX_DATE=NOW(), STATE=ST, DOMAIN_ALIAS=NULL, INDEXED=GREATEST(INDEXED,IDX), IP=IP WHERE ID=DID;
|
||||
DELETE FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=DID;
|
||||
END
|
||||
""");
|
||||
@ -41,7 +41,7 @@ public class SqlLoadProcessedDomain {
|
||||
}
|
||||
}
|
||||
|
||||
public void load(LoaderData data, EdgeDomain domain, EdgeDomainIndexingState state, double quality) {
|
||||
public void load(LoaderData data, EdgeDomain domain, EdgeDomainIndexingState state, String ip) {
|
||||
data.setTargetDomain(domain);
|
||||
|
||||
loadDomains.load(data, domain);
|
||||
@ -49,18 +49,17 @@ public class SqlLoadProcessedDomain {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var initCall = conn.prepareCall("CALL INITIALIZE_DOMAIN(?,?,?,?)"))
|
||||
{
|
||||
initCall.setInt(1, state.code);
|
||||
initCall.setString(1, state.name());
|
||||
initCall.setInt(2, 1 + data.sizeHint / 100);
|
||||
initCall.setDouble(3, quality);
|
||||
initCall.setInt(4, data.getDomainId(domain));
|
||||
initCall.setInt(3, data.getDomainId(domain));
|
||||
initCall.setString(4, ip);
|
||||
int rc = initCall.executeUpdate();
|
||||
if (rc < 1) {
|
||||
logger.warn("load({},{},{}) -- bad rowcount {}", domain, state, quality, rc);
|
||||
logger.warn("load({},{}) -- bad rowcount {}", domain, state, rc);
|
||||
}
|
||||
conn.commit();
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
ex.printStackTrace();
|
||||
logger.warn("SQL error initializing domain", ex);
|
||||
}
|
||||
|
||||
}
|
||||
@ -69,9 +68,9 @@ public class SqlLoadProcessedDomain {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("""
|
||||
UPDATE EC_DOMAIN TARGET
|
||||
INNER JOIN EC_DOMAIN ALIAS ON ALIAS.URL_PART=?
|
||||
INNER JOIN EC_DOMAIN ALIAS ON ALIAS.DOMAIN_NAME=?
|
||||
SET TARGET.DOMAIN_ALIAS=ALIAS.ID
|
||||
WHERE TARGET.URL_PART=?
|
||||
WHERE TARGET.DOMAIN_NAME=?
|
||||
""")) {
|
||||
stmt.setString(1, link.to().toString());
|
||||
stmt.setString(2, link.from().toString());
|
||||
@ -81,7 +80,7 @@ public class SqlLoadProcessedDomain {
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
ex.printStackTrace();
|
||||
logger.warn("SQL error inserting domain alias", ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -25,12 +25,13 @@ public class SqlLoadUrls {
|
||||
stmt.execute("""
|
||||
CREATE PROCEDURE INSERT_URL (
|
||||
IN PROTO VARCHAR(255),
|
||||
IN DOMAIN_NAME VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,
|
||||
IN DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,
|
||||
IN PORT INT,
|
||||
IN URL VARCHAR(255)
|
||||
IN PATH VARCHAR(255),
|
||||
IN PATH_HASH INT
|
||||
)
|
||||
BEGIN
|
||||
INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,URL) SELECT PROTO,ID,PORT,URL FROM EC_DOMAIN WHERE URL_PART=DOMAIN_NAME;
|
||||
INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PATH_HASH) SELECT PROTO,ID,PORT,PATH,PATH_HASH FROM EC_DOMAIN WHERE DOMAIN_NAME=DOMAIN;
|
||||
END
|
||||
""");
|
||||
}
|
||||
@ -42,8 +43,8 @@ public class SqlLoadUrls {
|
||||
|
||||
public void load(LoaderData data, EdgeUrl[] urls) {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var insertCall = conn.prepareCall("CALL INSERT_URL(?,?,?,?)");
|
||||
var queryCall = conn.prepareStatement("SELECT ID, PROTO, URL FROM EC_URL WHERE DOMAIN_ID=?")
|
||||
var insertCall = conn.prepareCall("CALL INSERT_URL(?,?,?,?, ?)");
|
||||
var queryCall = conn.prepareStatement("SELECT ID, PROTO, PATH FROM EC_URL WHERE DOMAIN_ID=?")
|
||||
)
|
||||
{
|
||||
conn.setAutoCommit(false);
|
||||
@ -58,6 +59,7 @@ public class SqlLoadUrls {
|
||||
insertCall.setNull(3, Types.INTEGER);
|
||||
}
|
||||
insertCall.setString(4, url.path);
|
||||
insertCall.setInt(5, url.path.hashCode());
|
||||
insertCall.addBatch();
|
||||
}
|
||||
var ret = insertCall.executeBatch();
|
||||
@ -86,7 +88,7 @@ public class SqlLoadUrls {
|
||||
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
ex.printStackTrace();
|
||||
logger.warn("SQL error inserting URLs", ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -15,7 +15,7 @@ public class InstructionsCompiler {
|
||||
public List<Instruction> compile(ProcessedDomain domain) {
|
||||
List<Instruction> ret = new ArrayList<>(domain.size()*4);
|
||||
|
||||
ret.add(new LoadProcessedDomain(domain.domain, domain.state, domain.averageQuality().orElse(-5.)));
|
||||
ret.add(new LoadProcessedDomain(domain.domain, domain.state, domain.ip));
|
||||
|
||||
if (domain.documents != null) {
|
||||
compileUrls(ret, domain.documents);
|
||||
|
@ -34,11 +34,10 @@ public class CrawlJobExtractorMain {
|
||||
|
||||
private static final String domainsSql =
|
||||
"""
|
||||
SELECT ID, LOWER(EC_DOMAIN.URL_PART)
|
||||
SELECT ID, LOWER(EC_DOMAIN.DOMAIN_NAME)
|
||||
FROM EC_DOMAIN
|
||||
WHERE QUALITY_RAW>-100
|
||||
AND INDEXED>0
|
||||
AND STATE<2
|
||||
WHERE INDEXED>0
|
||||
AND STATE='ACTIVE' OR STATE='EXHAUSTED'
|
||||
ORDER BY
|
||||
INDEX_DATE ASC,
|
||||
DISCOVER_DATE ASC,
|
||||
@ -49,8 +48,8 @@ public class CrawlJobExtractorMain {
|
||||
|
||||
private static final String urlsSql =
|
||||
"""
|
||||
SELECT CONCAT(PROTO, "://", ?, URL)
|
||||
FROM EC_URL
|
||||
SELECT URL
|
||||
FROM EC_URL_VIEW
|
||||
WHERE DOMAIN_ID=?
|
||||
ORDER BY
|
||||
VISITED DESC,
|
||||
|
@ -30,19 +30,19 @@ public class CrawlJobExtractorPageRankMain {
|
||||
"""
|
||||
SELECT ID
|
||||
FROM EC_DOMAIN
|
||||
WHERE URL_PART=?
|
||||
WHERE DOMAIN_NAME=?
|
||||
""";
|
||||
private static final String specificDomainSqlFromId =
|
||||
"""
|
||||
SELECT LOWER(URL_PART)
|
||||
SELECT LOWER(DOMAIN_NAME)
|
||||
FROM EC_DOMAIN
|
||||
WHERE ID=?
|
||||
""";
|
||||
|
||||
private static final String urlsSql =
|
||||
"""
|
||||
SELECT CONCAT(PROTO, "://", ?, URL)
|
||||
FROM EC_URL
|
||||
SELECT URL
|
||||
FROM EC_URL_VIEW
|
||||
WHERE DOMAIN_ID=?
|
||||
ORDER BY
|
||||
VISITED DESC,
|
||||
|
@ -16,22 +16,14 @@ public interface EdgeDataStoreDao {
|
||||
boolean isBlacklisted(EdgeDomain domain);
|
||||
|
||||
EdgeId<EdgeDomain> getDomainId(EdgeDomain domain);
|
||||
EdgeId<EdgeUrl> getUrlId(EdgeUrl domain);
|
||||
EdgeUrl getUrl(EdgeId<EdgeUrl> id);
|
||||
EdgeUrlDetails getUrlDetails(EdgeId<EdgeUrl> id);
|
||||
|
||||
List<BrowseResult> getDomainNeighbors(EdgeId<EdgeDomain> domainId, EdgeDomainBlacklist backlist, int count);
|
||||
List<BrowseResult> getDomainNeighborsAdjacent(EdgeId<EdgeDomain> domainId, EdgeDomainBlacklist backlist, int count);
|
||||
List<BrowseResult> getRandomDomains(int count, EdgeDomainBlacklist backlist);
|
||||
List<EdgeUrlDetails> getUrlDetailsMulti(List<EdgeId<EdgeUrl>> ids);
|
||||
List<EdgeId<EdgeDomain>> getDomainIdsFromUrlIds(Collection<EdgeId<EdgeUrl>> urlIds);
|
||||
|
||||
|
||||
EdgeDomain getDomain(EdgeId<EdgeDomain> id);
|
||||
|
||||
List<EdgeId<EdgeUrl>> inboudUrls(EdgeId<EdgeUrl> id, int limit);
|
||||
List<EdgeId<EdgeUrl>> outboundUrls(EdgeId<EdgeUrl> id, int limit);
|
||||
|
||||
Optional<EdgeId<EdgeUrl>> resolveAmbiguousDomain(String name);
|
||||
|
||||
|
||||
@ -48,9 +40,6 @@ public interface EdgeDataStoreDao {
|
||||
|
||||
List<EdgeDomain> getLinkingDomains(EdgeId<EdgeDomain> domainId);
|
||||
|
||||
List<EdgeUrl> getNewUrls(EdgeId<EdgeDomain> domainId, Collection<EdgeUrl> links);
|
||||
|
||||
double getRank(EdgeId<EdgeDomain> domainId);
|
||||
|
||||
void updateDomainIndexTimestamp(EdgeDomain domain, EdgeDomainIndexingState state, EdgeDomain alias, int minIndexed);
|
||||
}
|
||||
|
@ -71,7 +71,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
return domainIdCache.get(domain, () -> {
|
||||
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART=?")) {
|
||||
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
|
||||
stmt.setString(1, domain.toString());
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
@ -86,104 +86,14 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
@SneakyThrows
|
||||
public EdgeId<EdgeUrl> getUrlId(EdgeUrl url) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
return urlIdCache.get(url, () -> {
|
||||
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_URL_VIEW WHERE URL_PATH=? AND URL_DOMAIN=? AND URL_PROTO=?")) {
|
||||
stmt.setString(1, url.path);
|
||||
stmt.setString(2, url.domain.toString());
|
||||
stmt.setString(3, url.proto);
|
||||
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return new EdgeId<>(rsp.getInt(1));
|
||||
}
|
||||
}
|
||||
// Lenient mode for http->https upgrades etc
|
||||
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_URL_VIEW WHERE URL_PATH=? AND URL_DOMAIN=?")) {
|
||||
stmt.setString(1, url.path);
|
||||
stmt.setString(2, url.domain.toString());
|
||||
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return new EdgeId<>(rsp.getInt(1));
|
||||
}
|
||||
}
|
||||
throw new NoSuchElementException(url.toString());
|
||||
});
|
||||
}
|
||||
catch (UncheckedExecutionException ex) {
|
||||
throw ex.getCause();
|
||||
private <T> String idList(List<EdgeId<T>> ids) {
|
||||
StringJoiner j = new StringJoiner(",", "(", ")");
|
||||
for (var id : ids) {
|
||||
j.add(Integer.toString(id.getId()));
|
||||
}
|
||||
return j.toString();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public List<EdgeId<EdgeDomain>> getDomainIdsFromUrlIds(Collection<EdgeId<EdgeUrl>> urlIds) {
|
||||
List<EdgeId<EdgeDomain>> results = new ArrayList<>(urlIds.size());
|
||||
|
||||
if (urlIds.isEmpty())
|
||||
return results;
|
||||
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT DOMAIN_ID FROM EC_URL WHERE ID IN " + urlIds
|
||||
.stream()
|
||||
.map(EdgeId::getId)
|
||||
.map(Object::toString)
|
||||
.collect(Collectors.joining(",", "(", ")"))))
|
||||
{
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
results.add(new EdgeId<>(rsp.getInt(1)));
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
static final Pattern badChars = Pattern.compile("[';\\\\]");
|
||||
private String saneString(String s) {
|
||||
return "\'"+badChars.matcher(s).replaceAll("?")+"\'";
|
||||
}
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public EdgeUrl getUrl(EdgeId<EdgeUrl> id) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.createStatement()) {
|
||||
var rsp = stmt.executeQuery("SELECT URL_PROTO, URL_DOMAIN,URL_PORT,URL_PATH FROM EC_URL_VIEW WHERE ID=" + id.getId());
|
||||
if (rsp.next()) {
|
||||
return new EdgeUrl(rsp.getString(1), new EdgeDomain(rsp.getString(2)), rsp.getInt(3), rsp.getString(4));
|
||||
}
|
||||
throw new NoSuchElementException();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public EdgeUrlDetails getUrlDetails(EdgeId<EdgeUrl> id) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.createStatement()) {
|
||||
var rsp = stmt.executeQuery("SELECT ID,URL_PROTO,URL_DOMAIN,URL_PORT,URL_PATH,TITLE,DESCRIPTION,URL_QUALITY_MEASURE,DOMAIN_QUALITY_MEASURE,IFNULL(EC_DOMAIN_LINK_AGGREGATE.LINKS,1),WORDS_TOTAL,FORMAT,FEATURES,\"\",QUALITY_RAW,DOMAIN_STATE,DATA_HASH FROM EC_URL_VIEW LEFT JOIN EC_DOMAIN_LINK_AGGREGATE ON EC_DOMAIN_LINK_AGGREGATE.DOMAIN_ID=EC_URL_VIEW.DOMAIN_ID WHERE ID=" + id.getId());
|
||||
if (rsp.next()) {
|
||||
EdgeUrl url = new EdgeUrl(rsp.getString(2), new EdgeDomain(rsp.getString(3)), rsp.getInt(4), rsp.getString(5));
|
||||
return new EdgeUrlDetails(rsp.getInt(1), url, rsp.getString(6), rsp.getString(7), rsp.getDouble(8), rsp.getDouble(15), rsp.getDouble(9), rsp.getInt(10), rsp.getInt(11), rsp.getString(12), rsp.getInt(13), EdgePageScoreAdjustment.zero(), Integer.MAX_VALUE, Double.MAX_VALUE, rsp.getString(14), rsp.getInt(16), 0, rsp.getInt(17));
|
||||
}
|
||||
throw new NoSuchElementException();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public List<EdgeUrlDetails> getUrlDetailsMulti(List<EdgeId<EdgeUrl>> ids) {
|
||||
@ -193,16 +103,38 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
List<EdgeUrlDetails> result = new ArrayList<>(ids.size());
|
||||
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
// This is SQL-injection safe, the IDs are of type int
|
||||
String idString = ids.stream().map(EdgeId::getId).map(Objects::toString).collect(Collectors.joining(",", "(", ")"));
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT ID,URL_PROTO,URL_DOMAIN,URL_PORT,URL_PATH,TITLE,DESCRIPTION,URL_QUALITY_MEASURE,DOMAIN_QUALITY_MEASURE,IFNULL(EC_DOMAIN_LINK_AGGREGATE.LINKS,1),WORDS_TOTAL,FORMAT,FEATURES,\"\",QUALITY_RAW,DOMAIN_STATE,DATA_HASH FROM EC_URL_VIEW LEFT JOIN EC_DOMAIN_LINK_AGGREGATE ON EC_DOMAIN_LINK_AGGREGATE.DOMAIN_ID=EC_URL_VIEW.DOMAIN_ID WHERE ID IN " + idString)) {
|
||||
String idString = idList(ids);
|
||||
|
||||
try (var stmt = connection.prepareStatement(
|
||||
"""
|
||||
SELECT ID, URL,
|
||||
TITLE, DESCRIPTION,
|
||||
WORDS_TOTAL, FORMAT, FEATURES,
|
||||
IP, DOMAIN_STATE, DATA_HASH
|
||||
FROM EC_URL_VIEW WHERE ID IN
|
||||
""" + idString)) {
|
||||
// "SELECT ID,URL_PROTO,URL_DOMAIN,URL_PORT,URL_PATH,TITLE,DESCRIPTION,URL_QUALITY_MEASURE,DOMAIN_QUALITY_MEASURE,IFNULL(EC_DOMAIN_LINK_AGGREGATE.LINKS,1),WORDS_TOTAL,FORMAT,FEATURES,\"\",QUALITY_RAW,DOMAIN_STATE,DATA_HASH FROM EC_URL_VIEW LEFT JOIN EC_DOMAIN_LINK_AGGREGATE ON EC_DOMAIN_LINK_AGGREGATE.DOMAIN_ID=EC_URL_VIEW.DOMAIN_ID WHERE ID IN " + idString)) {
|
||||
stmt.setFetchSize(ids.size());
|
||||
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
EdgeUrl url = new EdgeUrl(rsp.getString(2), new EdgeDomain(rsp.getString(3)), rsp.getInt(4), rsp.getString(5));
|
||||
var val = new EdgeUrlDetails(rsp.getInt(1), url, rsp.getString(6), rsp.getString(7), rsp.getDouble(8), rsp.getDouble(15), rsp.getDouble(9), rsp.getInt(10), rsp.getInt(11), rsp.getString(12), rsp.getInt(13), EdgePageScoreAdjustment.zero(), Integer.MAX_VALUE, Double.MAX_VALUE, rsp.getString(14), rsp.getInt(16), 0, rsp.getInt(17));
|
||||
EdgeUrl url = new EdgeUrl(rsp.getString(2));
|
||||
var val = new EdgeUrlDetails(rsp.getInt(1), url,
|
||||
rsp.getString(3), // title
|
||||
rsp.getString(4), // description
|
||||
-5, // quality
|
||||
rsp.getInt(5), // wordsTotal
|
||||
rsp.getString(6), // format
|
||||
rsp.getInt(7), // features
|
||||
rsp.getString(8), // ip
|
||||
EdgeDomainIndexingState.valueOf(rsp.getString(9)), // domainState
|
||||
rsp.getInt(10), // dataHash
|
||||
EdgePageScoreAdjustment.zero(), // urlQualityAdjustment
|
||||
Integer.MAX_VALUE, // rankingId
|
||||
Double.MAX_VALUE, // termScore
|
||||
0 // queryLength
|
||||
);
|
||||
if (val.urlQuality >= QUALITY_LOWER_BOUND_CUTOFF) {
|
||||
result.add(val);
|
||||
}
|
||||
@ -214,75 +146,6 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<BrowseResult> getDomainNeighbors(EdgeId<EdgeDomain> domainId, EdgeDomainBlacklist blacklist, int count) {
|
||||
final Set<BrowseResult> domains = new HashSet<>(count*3);
|
||||
|
||||
final String q = "SELECT EC_DOMAIN.ID AS NEIGHBOR_ID, URL_PART from EC_DOMAIN_NEIGHBORS INNER JOIN EC_DOMAIN ON NEIGHBOR_ID=EC_DOMAIN.ID WHERE STATE<2 AND DOMAIN_ALIAS IS NULL AND EC_DOMAIN_NEIGHBORS.DOMAIN_ID = ? ORDER BY ADJ_IDX LIMIT ?";
|
||||
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement(q)) {
|
||||
stmt.setFetchSize(count);
|
||||
stmt.setInt(1, domainId.getId());
|
||||
stmt.setInt(2, count);
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
int id = rsp.getInt(1);
|
||||
String domain = rsp.getString(2);
|
||||
|
||||
if (!blacklist.isBlacklisted(id)) {
|
||||
var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/");
|
||||
|
||||
domains.add(new BrowseResult(url, id));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
final String q2 = "SELECT EC_DOMAIN.ID, URL_PART FROM EC_DOMAIN_LINK INNER JOIN EC_DOMAIN ON DEST_DOMAIN_ID=EC_DOMAIN.ID WHERE SOURCE_DOMAIN_ID=? AND STATE<2 AND DOMAIN_ALIAS IS NULL GROUP BY EC_DOMAIN.ID ORDER BY RANK ASC LIMIT ?";
|
||||
try (var stmt = connection.prepareStatement(q2)) {
|
||||
|
||||
stmt.setFetchSize(count);
|
||||
stmt.setInt(1, domainId.getId());
|
||||
stmt.setInt(2, count);
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
int id = rsp.getInt(1);
|
||||
String domain = rsp.getString(2);
|
||||
|
||||
if (!blacklist.isBlacklisted(id)) {
|
||||
var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/");
|
||||
|
||||
domains.add(new BrowseResult(url, id));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
final String q3 = "SELECT EC_DOMAIN.ID, URL_PART FROM EC_DOMAIN_LINK INNER JOIN EC_DOMAIN ON DEST_DOMAIN_ID=EC_DOMAIN.ID WHERE DEST_DOMAIN_ID=? AND STATE<2 AND DOMAIN_ALIAS IS NULL GROUP BY EC_DOMAIN.ID ORDER BY RANK ASC LIMIT ?";
|
||||
try (var stmt = connection.prepareStatement(q3)) {
|
||||
stmt.setFetchSize(count);
|
||||
stmt.setInt(1, domainId.getId());
|
||||
stmt.setInt(2, count);
|
||||
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
int id = rsp.getInt(1);
|
||||
String domain = rsp.getString(2);
|
||||
|
||||
if (!blacklist.isBlacklisted(id)) {
|
||||
var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/");
|
||||
|
||||
domains.add(new BrowseResult(url, id));
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (SQLException throwables) {
|
||||
throwables.printStackTrace();
|
||||
}
|
||||
|
||||
|
||||
return new ArrayList<>(domains);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public List<BrowseResult> getDomainNeighborsAdjacent(EdgeId<EdgeDomain> domainId, EdgeDomainBlacklist blacklist, int count) {
|
||||
@ -357,7 +220,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
|
||||
if (domains.size() < count/2) {
|
||||
final String q3 = """
|
||||
SELECT EC_DOMAIN.ID, URL_PART
|
||||
SELECT EC_DOMAIN.ID, URL_PART
|
||||
FROM EC_DOMAIN
|
||||
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
||||
INNER JOIN EC_DOMAIN_LINK B ON B.SOURCE_DOMAIN_ID=EC_DOMAIN.ID
|
||||
@ -399,7 +262,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
@Override
|
||||
public List<BrowseResult> getRandomDomains(int count, EdgeDomainBlacklist blacklist) {
|
||||
|
||||
final String q = "SELECT DOMAIN_ID,URL_PART FROM EC_RANDOM_DOMAINS INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE STATE<2 AND DOMAIN_ALIAS IS NULL ORDER BY RAND() LIMIT ?";
|
||||
final String q = "SELECT DOMAIN_ID,DOMAIN_NAME FROM EC_RANDOM_DOMAINS INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE STATE<2 AND DOMAIN_ALIAS IS NULL ORDER BY RAND() LIMIT ?";
|
||||
List<BrowseResult> domains = new ArrayList<>(count);
|
||||
try (var conn = dataSource.getConnection()) {
|
||||
try (var stmt = conn.prepareStatement(q)) {
|
||||
@ -428,7 +291,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
public EdgeDomain getDomain(EdgeId<EdgeDomain> id) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT URL_PART FROM EC_DOMAIN WHERE ID=?")) {
|
||||
try (var stmt = connection.prepareStatement("SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE ID=?")) {
|
||||
stmt.setInt(1, id.getId());
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
@ -439,55 +302,11 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
}
|
||||
}
|
||||
|
||||
@Override @SneakyThrows
|
||||
public List<EdgeId<EdgeUrl>> inboudUrls(EdgeId<EdgeUrl> id, int limit) {
|
||||
|
||||
List<EdgeId<EdgeUrl>> ret = new ArrayList<>();
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt =
|
||||
connection.prepareStatement("SELECT SRC_URL_ID FROM EC_RELATED_LINKS_IN WHERE DEST_URL_ID=? ORDER BY SRC_URL_QUALITY DESC LIMIT ?")) {
|
||||
stmt.setFetchSize(limit);
|
||||
stmt.setInt(1, id.getId());
|
||||
stmt.setInt(2, limit);
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
ret.add(new EdgeId<>(rsp.getInt(1)));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
@Override @SneakyThrows
|
||||
public List<EdgeId<EdgeUrl>> outboundUrls(EdgeId<EdgeUrl> id, int limit) {
|
||||
|
||||
List<EdgeId<EdgeUrl>> ret = new ArrayList<>();
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt =
|
||||
connection.prepareStatement("SELECT DEST_URL_ID FROM EC_RELATED_LINKS_IN WHERE SRC_URL_ID=? ORDER BY SRC_URL_QUALITY DESC LIMIT ?")) {
|
||||
stmt.setFetchSize(limit);
|
||||
stmt.setInt(1, id.getId());
|
||||
stmt.setInt(2, limit);
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
ret.add(new EdgeId<>(rsp.getInt(1)));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Optional<EdgeId<EdgeUrl>> resolveAmbiguousDomain(String name) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) {
|
||||
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
|
||||
stmt.setString(1, name);
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
@ -495,7 +314,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
}
|
||||
}
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) {
|
||||
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
|
||||
stmt.setString(1, "https://"+name);
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
@ -503,7 +322,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
}
|
||||
}
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) {
|
||||
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
|
||||
stmt.setString(1, "http://"+name);
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
@ -511,7 +330,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
}
|
||||
}
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) {
|
||||
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
|
||||
stmt.setString(1, "https://www."+name);
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
@ -519,7 +338,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
}
|
||||
}
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) {
|
||||
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
|
||||
stmt.setString(1, "http://www."+name);
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
@ -682,27 +501,6 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<EdgeUrl> getNewUrls(EdgeId<EdgeDomain> domainId, Collection<EdgeUrl> links) {
|
||||
Map<String, EdgeUrl> edgeUrlByPath = links.stream().collect(Collectors.toMap(EdgeUrl::getPath, Function.identity(), (a,b)->a));
|
||||
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement("SELECT URL FROM EC_URL WHERE DOMAIN_ID=?")) {
|
||||
stmt.setFetchSize(500);
|
||||
stmt.setInt(1, domainId.getId());
|
||||
var rs = stmt.executeQuery();
|
||||
while (rs.next()) {
|
||||
edgeUrlByPath.remove(rs.getString(1));
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (Exception ex) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
return new ArrayList<>(edgeUrlByPath.values());
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getRank(EdgeId<EdgeDomain> domainId) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
@ -722,47 +520,5 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void updateDomainIndexTimestamp(EdgeDomain domain, EdgeDomainIndexingState state, EdgeDomain alias, int minIndexed) {
|
||||
try (var connection = dataSource.getConnection();
|
||||
var stmt = connection.prepareStatement("UPDATE EC_DOMAIN SET INDEX_DATE=NOW(), STATE=?, DOMAIN_ALIAS=?, INDEXED=GREATEST(INDEXED,?) WHERE ID=?")) {
|
||||
stmt.setInt(1, state.code);
|
||||
if (null == alias) {
|
||||
stmt.setNull(2, Types.INTEGER);
|
||||
}
|
||||
else {
|
||||
stmt.setInt(2, getDomainId(alias).getId());
|
||||
}
|
||||
|
||||
stmt.setInt(3, minIndexed);
|
||||
stmt.setInt(4, getDomainId(domain).getId());
|
||||
stmt.executeUpdate();
|
||||
connection.commit();
|
||||
}
|
||||
catch (SQLException throwables) {
|
||||
logger.error("SQL error", throwables);
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private double getDomainQuality(Connection connection, EdgeDomain src) {
|
||||
try (var stmt = connection.prepareStatement("SELECT QUALITY_RAW FROM EC_DOMAIN WHERE URL_PART=?")) {
|
||||
stmt.setString(1, src.toString());
|
||||
var res = stmt.executeQuery();
|
||||
|
||||
if (res.next()) {
|
||||
var q = res.getDouble(1);
|
||||
if (q > 0.5) {
|
||||
logger.warn("gDQ({}) -> 1", src);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.error("DB error", ex);
|
||||
}
|
||||
|
||||
return -5;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -50,7 +50,7 @@ public class EdgeDomainBlacklistImpl implements EdgeDomainBlacklist {
|
||||
final TIntHashSet result = new TIntHashSet(1_000_000);
|
||||
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement("SELECT EC_DOMAIN.ID FROM EC_DOMAIN INNER JOIN EC_TOP_DOMAIN ON EC_DOMAIN.URL_TOP_DOMAIN_ID = EC_TOP_DOMAIN.ID INNER JOIN EC_DOMAIN_BLACKLIST ON EC_DOMAIN_BLACKLIST.URL_DOMAIN = EC_TOP_DOMAIN.URL_PART")) {
|
||||
try (var stmt = connection.prepareStatement("SELECT EC_DOMAIN.ID FROM EC_DOMAIN INNER JOIN EC_DOMAIN_BLACKLIST ON EC_DOMAIN_BLACKLIST.URL_DOMAIN = EC_DOMAIN.DOMAIN_TOP")) {
|
||||
stmt.setFetchSize(1000);
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
|
@ -30,29 +30,13 @@ public class SearchIndexDao {
|
||||
logger.info("SearchIndexDao ranking settings = {}", rankingSettings);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public TIntHashSet getSpamDomains() {
|
||||
final TIntHashSet result = new TIntHashSet(1_000_000);
|
||||
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement("SELECT EC_DOMAIN.ID FROM EC_DOMAIN INNER JOIN EC_TOP_DOMAIN ON EC_DOMAIN.URL_TOP_DOMAIN_ID = EC_TOP_DOMAIN.ID INNER JOIN EC_DOMAIN_BLACKLIST ON EC_DOMAIN_BLACKLIST.URL_DOMAIN = EC_TOP_DOMAIN.URL_PART")) {
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
result.add(rsp.getInt(1));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public TIntHashSet goodUrls() {
|
||||
TIntHashSet domains = new TIntHashSet(10_000_000, 0.5f, -1);
|
||||
TIntHashSet urls = new TIntHashSet(100_000_000, 0.5f, -1);
|
||||
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_ALIAS IS NULL AND STATE>=0")) {
|
||||
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_ALIAS IS NULL AND IS_ALIVE")) {
|
||||
stmt.setFetchSize(10_000);
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
|
@ -16,25 +16,24 @@ public class EdgeUrlDetails {
|
||||
public String description;
|
||||
|
||||
public double urlQuality;
|
||||
public double urlQualityRaw;
|
||||
public double domainQuality;
|
||||
|
||||
public int links; // DEAD
|
||||
public int words;
|
||||
public String format;
|
||||
public int features;
|
||||
|
||||
public EdgePageScoreAdjustment urlQualityAdjustment;
|
||||
|
||||
public long rankingId;
|
||||
public double termScore;
|
||||
|
||||
public String ip; // BROKEN
|
||||
public int domainState;
|
||||
public int queryLength;
|
||||
public EdgeDomainIndexingState domainState;
|
||||
|
||||
|
||||
public int dataHash;
|
||||
|
||||
public EdgePageScoreAdjustment urlQualityAdjustment;
|
||||
public long rankingId;
|
||||
public double termScore;
|
||||
public int queryLength;
|
||||
|
||||
public long rankingIdAdjustment() {
|
||||
int penalty = 0;
|
||||
|
||||
@ -136,7 +135,7 @@ public class EdgeUrlDetails {
|
||||
return HtmlFeature.hasFeature(features, HtmlFeature.COOKIES);
|
||||
}
|
||||
public boolean isSpecialDomain() {
|
||||
return domainState == EdgeDomainIndexingState.SPECIAL.code;
|
||||
return domainState == EdgeDomainIndexingState.SPECIAL;
|
||||
}
|
||||
public int getLogRank() { return (int) Math.round(Math.min(Math.log(1+rankingId),10)); }
|
||||
|
||||
|
@ -107,7 +107,7 @@ public class SearchResultDecorator {
|
||||
|
||||
private double calculateTermScore(IndexBlock block, EdgeSearchResultItem resultItem, EdgeUrlDetails details) {
|
||||
return valuator.evaluateTerms(resultItem.scores, block, details.words) / Math.sqrt(1 + resultItem.queryLength)
|
||||
+ ((details.domainState == EdgeDomainIndexingState.SPECIAL.code) ? 1.25 : 0);
|
||||
+ ((details.domainState == EdgeDomainIndexingState.SPECIAL) ? 1.25 : 0);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1,24 +1,11 @@
|
||||
DROP TABLE IF EXISTS EC_URL_LINK;
|
||||
DROP VIEW IF EXISTS EC_PAGE_VIEW;
|
||||
|
||||
DROP TABLE IF EXISTS DISC_DOMAIN_TAG;
|
||||
DROP TABLE IF EXISTS DISC_TAG;
|
||||
DROP TABLE IF EXISTS DISC_USER;
|
||||
|
||||
DROP TABLE IF EXISTS EC_DOMAIN_NEIGHBORS;
|
||||
DROP TABLE IF EXISTS DOMAIN_METADATA;
|
||||
DROP TABLE IF EXISTS EC_FEED_URL;
|
||||
DROP TABLE IF EXISTS EC_DOMAIN_LINK;
|
||||
DROP TABLE IF EXISTS EC_PAGE_DATA;
|
||||
DROP TABLE IF EXISTS EC_URL;
|
||||
DROP TABLE IF EXISTS EC_DOMAIN_NEIGHBORS;
|
||||
DROP TABLE IF EXISTS EC_DOMAIN;
|
||||
DROP TABLE IF EXISTS EC_TOP_DOMAIN;
|
||||
DROP TABLE IF EXISTS EC_URL_DETAILS;
|
||||
DROP VIEW IF EXISTS EC_URL_VIEW;
|
||||
DROP VIEW IF EXISTS EC_URL_PART_HASH;
|
||||
|
||||
DROP TABLE IF EXISTS EC_URL_WORD;
|
||||
DROP TABLE IF EXISTS EC_DICTIONARY;
|
||||
DROP TABLE IF EXISTS DOMAIN_METADATA;
|
||||
|
||||
CREATE TABLE IF NOT EXISTS DOMAIN_METADATA (
|
||||
ID INT PRIMARY KEY,
|
||||
@ -27,52 +14,31 @@ CREATE TABLE IF NOT EXISTS DOMAIN_METADATA (
|
||||
GOOD_URLS INT DEFAULT 0
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS EC_TOP_DOMAIN (
|
||||
ID INT PRIMARY KEY AUTO_INCREMENT,
|
||||
URL_PART VARCHAR(255) UNIQUE NOT NULL,
|
||||
ALIVE BOOLEAN DEFAULT TRUE NOT NULL
|
||||
)
|
||||
CHARACTER SET utf8mb4
|
||||
COLLATE utf8mb4_unicode_ci;
|
||||
|
||||
CREATE TABLE IF NOT EXISTS EC_DOMAIN (
|
||||
ID INT PRIMARY KEY AUTO_INCREMENT,
|
||||
URL_PART VARCHAR(255) UNIQUE NOT NULL,
|
||||
INDEXED INT DEFAULT 0 NOT NULL,
|
||||
QUALITY DOUBLE DEFAULT -5 NOT NULL,
|
||||
QUALITY_RAW DOUBLE DEFAULT -5 NOT NULL,
|
||||
QUALITY_ORIGINAL DOUBLE DEFAULT -5 NOT NULL,
|
||||
|
||||
URL_TOP_DOMAIN_ID INT NOT NULL,
|
||||
URL_SUBDOMAIN VARCHAR(255) NOT NULL,
|
||||
STATE INT DEFAULT 0 NOT NULL,
|
||||
DOMAIN_NAME VARCHAR(255) UNIQUE NOT NULL,
|
||||
DOMAIN_TOP VARCHAR(255) NOT NULL,
|
||||
|
||||
INDEXED INT DEFAULT 0 NOT NULL COMMENT "~number of documents visited / 100",
|
||||
STATE ENUM('ACTIVE', 'EXHAUSTED', 'SPECIAL', 'SOCIAL_MEDIA', 'BLOCKED', 'REDIR', 'ERROR', 'UNKNOWN') NOT NULL DEFAULT 'active' COMMENT "@see EdgeDomainIndexingState",
|
||||
|
||||
RANK DOUBLE,
|
||||
|
||||
DOMAIN_ALIAS INTEGER,
|
||||
IP VARCHAR(32),
|
||||
|
||||
INDEX_DATE TIMESTAMP DEFAULT NOW(),
|
||||
DISCOVER_DATE TIMESTAMP DEFAULT NOW(),
|
||||
|
||||
FOREIGN KEY (URL_TOP_DOMAIN_ID) REFERENCES EC_TOP_DOMAIN(ID) ON DELETE CASCADE
|
||||
)
|
||||
CHARACTER SET utf8mb4
|
||||
COLLATE utf8mb4_unicode_ci;
|
||||
|
||||
CREATE TABLE IF NOT EXISTS EC_DOMAIN_HISTORY (
|
||||
ID INT PRIMARY KEY AUTO_INCREMENT,
|
||||
URL_PART VARCHAR(255) UNIQUE NOT NULL,
|
||||
QUALITY_MEASURE DOUBLE DEFAULT -5 NOT NULL,
|
||||
INBOUND_LINKS INT DEFAULT 1,
|
||||
LINK_ADJUSTED_QUALITY DOUBLE GENERATED ALWAYS AS (0.3*QUALITY_MEASURE + 0.7*QUALITY_MEASURE / GREATEST(1, INBOUND_LINKS)),
|
||||
RANK DOUBLE
|
||||
IS_ALIVE BOOLEAN AS (STATE='ACTIVE' OR STATE='EXHAUSTED' OR STATE='SPECIAL' OR STATE='SOCIAL_MEDIA') VIRTUAL
|
||||
)
|
||||
CHARACTER SET utf8mb4
|
||||
COLLATE utf8mb4_unicode_ci;
|
||||
|
||||
CREATE TABLE IF NOT EXISTS EC_DOMAIN_BLACKLIST (
|
||||
ID INT PRIMARY KEY AUTO_INCREMENT,
|
||||
URL_DOMAIN VARCHAR(255) UNIQUE NOT NULL
|
||||
URL_DOMAIN VARCHAR(255) UNIQUE NOT NULL
|
||||
)
|
||||
CHARACTER SET utf8mb4
|
||||
COLLATE utf8mb4_unicode_ci;
|
||||
@ -81,18 +47,15 @@ CREATE TABLE IF NOT EXISTS EC_URL (
|
||||
ID INT PRIMARY KEY AUTO_INCREMENT,
|
||||
DOMAIN_ID INT NOT NULL,
|
||||
PROTO ENUM('http','https','gemini') NOT NULL,
|
||||
URL VARCHAR(255) NOT NULL,
|
||||
PATH VARCHAR(255) NOT NULL COLLATE utf8mb4_bin,
|
||||
PORT INT,
|
||||
|
||||
PATH_HASH INT NOT NULL COMMENT "Hash of PATH for uniqueness check by domain",
|
||||
VISITED BOOLEAN NOT NULL DEFAULT FALSE,
|
||||
DATA_HASH INTEGER,
|
||||
QUALITY_MEASURE DOUBLE,
|
||||
|
||||
STATE ENUM('ok', 'redirect', 'dead', 'archived', 'disqualified') NOT NULL DEFAULT 'ok',
|
||||
|
||||
IP VARCHAR(32),
|
||||
|
||||
CONSTRAINT CONS UNIQUE (DOMAIN_ID, URL),
|
||||
CONSTRAINT CONS UNIQUE (DOMAIN_ID, PATH_HASH),
|
||||
FOREIGN KEY (DOMAIN_ID) REFERENCES EC_DOMAIN(ID) ON DELETE CASCADE
|
||||
)
|
||||
CHARACTER SET utf8mb4
|
||||
@ -101,13 +64,14 @@ COLLATE utf8mb4_unicode_ci;
|
||||
CREATE TABLE IF NOT EXISTS EC_PAGE_DATA (
|
||||
ID INT PRIMARY KEY AUTO_INCREMENT,
|
||||
|
||||
TITLE VARCHAR(255),
|
||||
DESCRIPTION VARCHAR(255),
|
||||
TITLE VARCHAR(255) NOT NULL,
|
||||
DESCRIPTION VARCHAR(255) NOT NULL,
|
||||
|
||||
WORDS_DISTINCT INTEGER,
|
||||
WORDS_TOTAL INTEGER,
|
||||
FORMAT VARCHAR(8),
|
||||
FEATURES INT,
|
||||
WORDS_TOTAL INTEGER NOT NULL,
|
||||
FORMAT ENUM('PLAIN', 'UNKNOWN', 'HTML123', 'HTML4', 'XHTML', 'HTML5', 'MARKDOWN') NOT NULL,
|
||||
FEATURES INT COMMENT "Bit-encoded feature set of document, @see HtmlFeature" NOT NULL,
|
||||
|
||||
DATA_HASH INTEGER NOT NULL,
|
||||
|
||||
FOREIGN KEY (ID) REFERENCES EC_URL(ID) ON DELETE CASCADE
|
||||
)
|
||||
@ -115,13 +79,9 @@ CHARACTER SET utf8mb4
|
||||
COLLATE utf8mb4_unicode_ci;
|
||||
|
||||
CREATE TABLE EC_FEED_URL (
|
||||
ID INT PRIMARY KEY AUTO_INCREMENT,
|
||||
DOMAIN_ID INT NOT NULL,
|
||||
PROTO VARCHAR(8) NOT NULL,
|
||||
URL VARCHAR(255) NOT NULL,
|
||||
PORT INT,
|
||||
URL VARCHAR(255) PRIMARY KEY,
|
||||
DOMAIN_ID INT,
|
||||
|
||||
CONSTRAINT CONS UNIQUE (DOMAIN_ID, URL),
|
||||
FOREIGN KEY (DOMAIN_ID) REFERENCES EC_DOMAIN(ID) ON DELETE CASCADE
|
||||
)
|
||||
CHARACTER SET utf8mb4
|
||||
@ -150,29 +110,23 @@ CREATE TABLE IF NOT EXISTS EC_DOMAIN_LINK (
|
||||
FOREIGN KEY (DEST_DOMAIN_ID) REFERENCES EC_DOMAIN(ID) ON DELETE CASCADE
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS EC_DOMAIN_LINK_AGGREGATE (
|
||||
DOMAIN_ID INT PRIMARY KEY NOT NULL,
|
||||
LINKS INT
|
||||
);
|
||||
|
||||
CREATE OR REPLACE VIEW EC_URL_VIEW AS
|
||||
SELECT
|
||||
EC_DOMAIN.URL_PART AS URL_DOMAIN,
|
||||
EC_URL.URL AS URL_PATH,
|
||||
EC_TOP_DOMAIN.URL_PART AS URL_TOP,
|
||||
IF(PORT IS NULL,
|
||||
CONCAT(EC_URL.PROTO, "://", EC_DOMAIN.DOMAIN_NAME, EC_URL.PATH),
|
||||
CONCAT(EC_URL.PROTO, "://", EC_DOMAIN.DOMAIN_NAME, ":", EC_URL.PORT, EC_URL.PATH))
|
||||
AS URL,
|
||||
EC_URL.PATH_HASH AS PATH_HASH,
|
||||
EC_URL.PATH AS PATH,
|
||||
EC_DOMAIN.DOMAIN_NAME AS DOMAIN_NAME,
|
||||
EC_DOMAIN.DOMAIN_TOP AS DOMAIN_TOP,
|
||||
EC_URL.ID AS ID,
|
||||
EC_DOMAIN.ID AS DOMAIN_ID,
|
||||
EC_TOP_DOMAIN.ID AS TOP_DOMAIN_ID,
|
||||
EC_URL.PROTO AS URL_PROTO,
|
||||
EC_URL.PORT AS URL_PORT,
|
||||
EC_URL.VISITED AS VISITED,
|
||||
EC_URL.DATA_HASH AS DATA_HASH,
|
||||
EC_URL.QUALITY_MEASURE AS URL_QUALITY_MEASURE,
|
||||
EC_DOMAIN.QUALITY AS DOMAIN_QUALITY_MEASURE,
|
||||
EC_DOMAIN.QUALITY_RAW AS QUALITY_RAW,
|
||||
EC_PAGE_DATA.DATA_HASH AS DATA_HASH,
|
||||
EC_PAGE_DATA.TITLE AS TITLE,
|
||||
EC_PAGE_DATA.DESCRIPTION AS DESCRIPTION,
|
||||
EC_URL.IP AS IP,
|
||||
EC_DOMAIN.IP AS IP,
|
||||
EC_DOMAIN.STATE AS STATE,
|
||||
EC_PAGE_DATA.WORDS_TOTAL AS WORDS_TOTAL,
|
||||
EC_PAGE_DATA.FORMAT AS FORMAT,
|
||||
@ -183,59 +137,32 @@ CREATE OR REPLACE VIEW EC_URL_VIEW AS
|
||||
LEFT JOIN EC_PAGE_DATA
|
||||
ON EC_PAGE_DATA.ID = EC_URL.ID
|
||||
INNER JOIN EC_DOMAIN
|
||||
ON EC_URL.DOMAIN_ID = EC_DOMAIN.ID
|
||||
INNER JOIN EC_TOP_DOMAIN
|
||||
ON EC_DOMAIN.URL_TOP_DOMAIN_ID=EC_TOP_DOMAIN.ID;
|
||||
|
||||
CREATE OR REPLACE VIEW EC_DISCOVER_TASKS_VIEW AS
|
||||
SELECT
|
||||
ID,
|
||||
URL_PART
|
||||
FROM EC_DOMAIN
|
||||
WHERE
|
||||
DOMAIN_ALIAS IS NULL
|
||||
AND INDEXED = 0
|
||||
ORDER BY QUALITY DESC, ID ASC;
|
||||
ON EC_URL.DOMAIN_ID = EC_DOMAIN.ID;
|
||||
|
||||
CREATE OR REPLACE VIEW EC_RELATED_LINKS_VIEW AS
|
||||
SELECT
|
||||
SOURCE_DOMAIN_ID,
|
||||
SOURCE_DOMAIN.URL_PART AS SOURCE_URL,
|
||||
SOURCE_TOP_DOMAIN.URL_PART AS SOURCE_TOP_URL,
|
||||
SOURCE_DOMAIN.DOMAIN_NAME AS SOURCE_DOMAIN,
|
||||
SOURCE_DOMAIN.DOMAIN_TOP AS SOURCE_TOP_DOMAIN,
|
||||
DEST_DOMAIN_ID,
|
||||
DEST_DOMAIN.URL_PART AS DEST_URL,
|
||||
DEST_TOP_DOMAIN.URL_PART AS DEST_TOP_URL
|
||||
DEST_DOMAIN.DOMAIN_NAME AS DEST_DOMAIN,
|
||||
DEST_DOMAIN.DOMAIN_TOP AS DEST_TOP_DOMAIN
|
||||
FROM EC_DOMAIN_LINK
|
||||
INNER JOIN EC_DOMAIN AS SOURCE_DOMAIN
|
||||
ON SOURCE_DOMAIN.ID=SOURCE_DOMAIN_ID
|
||||
INNER JOIN EC_TOP_DOMAIN AS SOURCE_TOP_DOMAIN
|
||||
ON SOURCE_TOP_DOMAIN.ID=SOURCE_DOMAIN.URL_TOP_DOMAIN_ID
|
||||
INNER JOIN EC_DOMAIN AS DEST_DOMAIN
|
||||
ON DEST_DOMAIN.ID=DEST_DOMAIN_ID
|
||||
INNER JOIN EC_TOP_DOMAIN AS DEST_TOP_DOMAIN
|
||||
ON DEST_TOP_DOMAIN.ID=DEST_DOMAIN.URL_TOP_DOMAIN_ID
|
||||
;
|
||||
|
||||
CREATE OR REPLACE VIEW EC_RELATED_LINKS_IN AS
|
||||
SELECT
|
||||
IN_URL.ID AS SRC_URL_ID,
|
||||
IN_URL.QUALITY_MEASURE AS SRC_URL_QUALITY,
|
||||
OUT_URL.ID AS DEST_URL_ID,
|
||||
OUT_URL.QUALITY_MEASURE AS DEST_URL_QUALITY
|
||||
FROM EC_URL AS IN_URL
|
||||
INNER JOIN EC_DOMAIN_LINK
|
||||
ON IN_URL.DOMAIN_ID=EC_DOMAIN_LINK.SOURCE_DOMAIN_ID
|
||||
INNER JOIN EC_URL AS OUT_URL
|
||||
ON OUT_URL.DOMAIN_ID=EC_DOMAIN_LINK.DEST_DOMAIN_ID
|
||||
WHERE IN_URL.VISITED=TRUE
|
||||
AND IN_URL.DATA_HASH IS NOT NULL
|
||||
AND OUT_URL.VISITED=TRUE
|
||||
AND OUT_URL.DATA_HASH IS NOT NULL;
|
||||
|
||||
CREATE TABLE IF NOT EXISTS EC_DOMAIN_BACKLINKS (
|
||||
ID INT PRIMARY KEY,
|
||||
LINKEDNESS INT
|
||||
);
|
||||
OUT_URL.ID AS DEST_URL_ID
|
||||
FROM EC_DOMAIN_LINK
|
||||
INNER JOIN EC_URL AS IN_URL ON IN_URL.DOMAIN_ID=EC_DOMAIN_LINK.SOURCE_DOMAIN_ID
|
||||
INNER JOIN EC_URL AS OUT_URL ON OUT_URL.DOMAIN_ID=EC_DOMAIN_LINK.DEST_DOMAIN_ID
|
||||
WHERE IN_URL.VISITED AND IN_URL.STATE = 'ok'
|
||||
AND OUT_URL.VISITED AND OUT_URL.STATE = 'ok';
|
||||
|
||||
CREATE TABLE IF NOT EXISTS EC_API_KEY (
|
||||
LICENSE_KEY VARCHAR(255) UNIQUE,
|
||||
@ -245,16 +172,9 @@ CREATE TABLE IF NOT EXISTS EC_API_KEY (
|
||||
RATE INT DEFAULT 10
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS EC_DOMAIN_RANK_INDEX ON EC_DOMAIN (RANK);
|
||||
CREATE INDEX IF NOT EXISTS EC_DOMAIN_QUALITY_INDEX ON EC_DOMAIN (QUALITY,STATE);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS EC_DOMAIN_INDEXED_INDEX ON EC_DOMAIN (INDEXED);
|
||||
CREATE INDEX IF NOT EXISTS EC_DOMAIN_ID_INDEXED_INDEX ON EC_DOMAIN (ID, INDEXED);
|
||||
CREATE INDEX IF NOT EXISTS EC_DOMAIN_TRIO ON EC_DOMAIN (STATE, DOMAIN_ALIAS, INDEXED, QUALITY);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS EC_URL_VISITED ON EC_URL (VISITED);
|
||||
CREATE INDEX IF NOT EXISTS EC_URL_VISITED_STATE ON EC_URL (VISITED, STATE);
|
||||
CREATE INDEX IF NOT EXISTS EC_URL_IP ON EC_URL (IP);
|
||||
CREATE INDEX IF NOT EXISTS EC_DOMAIN_TOP_DOMAIN ON EC_DOMAIN (DOMAIN_TOP);
|
||||
|
||||
---;
|
||||
|
||||
|
@ -43,7 +43,7 @@ public class TestUtil {
|
||||
logger.info("Running script {}", scriptFile);
|
||||
try (var scriptStream = ClassLoader.getSystemResourceAsStream(scriptFile);
|
||||
var stmt = conn.createStatement()) {
|
||||
for (String s : new String(scriptStream.readAllBytes()).split(";")) {
|
||||
for (String s : new String(scriptStream.readAllBytes()).split("(;|---)")) {
|
||||
if (!s.isBlank()) {
|
||||
try {
|
||||
Assertions.assertTrue(stmt.executeUpdate(s) >= 0);
|
||||
|
@ -0,0 +1,51 @@
|
||||
package nu.marginalia.wmsa.edge.converting.loader;
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.util.TestUtil;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.parallel.Execution;
|
||||
import org.junit.jupiter.api.parallel.ExecutionMode;
|
||||
import org.junit.jupiter.api.parallel.ResourceAccessMode;
|
||||
import org.junit.jupiter.api.parallel.ResourceLock;
|
||||
|
||||
import java.net.URISyntaxException;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
@ResourceLock(value = "mariadb", mode = ResourceAccessMode.READ_WRITE)
|
||||
@Execution(ExecutionMode.SAME_THREAD)
|
||||
@Tag("db")
|
||||
class SqlLoadDomainLinksTest {
|
||||
|
||||
HikariDataSource dataSource;
|
||||
LoaderData loaderData;
|
||||
@BeforeEach
|
||||
public void setUp() {
|
||||
dataSource = TestUtil.getConnection();
|
||||
TestUtil.evalScript(dataSource, "sql/edge-crawler-cache.sql");
|
||||
|
||||
var loadDomains = new SqlLoadDomains(dataSource);
|
||||
loaderData = new LoaderData(10);
|
||||
|
||||
loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu"));
|
||||
loadDomains.load(loaderData, new EdgeDomain[] { new EdgeDomain("www.marginalia.nu"), new EdgeDomain("memex.marginalia.nu") });
|
||||
}
|
||||
|
||||
@AfterEach
|
||||
public void tearDown() {
|
||||
dataSource.close();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void loadDomainLinks() throws URISyntaxException {
|
||||
var loader = new SqlLoadDomainLinks(dataSource);
|
||||
loader.load(new DomainLink[] { new DomainLink(new EdgeDomain("www.marginalia.nu"), new EdgeDomain("memex.marginalia.nu")) });
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,54 @@
|
||||
package nu.marginalia.wmsa.edge.converting.loader;
|
||||
|
||||
import nu.marginalia.util.TestUtil;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.parallel.Execution;
|
||||
import org.junit.jupiter.api.parallel.ExecutionMode;
|
||||
import org.junit.jupiter.api.parallel.ResourceAccessMode;
|
||||
import org.junit.jupiter.api.parallel.ResourceLock;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
@ResourceLock(value = "mariadb", mode = ResourceAccessMode.READ_WRITE)
|
||||
@Execution(ExecutionMode.SAME_THREAD)
|
||||
@Tag("db")
|
||||
class SqlLoadDomainsTest {
|
||||
|
||||
|
||||
@Test
|
||||
public void loadDomain() {
|
||||
|
||||
try (var dataSource = TestUtil.getConnection()) {
|
||||
TestUtil.evalScript(dataSource, "sql/edge-crawler-cache.sql");
|
||||
|
||||
var loadDomains = new SqlLoadDomains(dataSource);
|
||||
var loaderData = new LoaderData(10);
|
||||
|
||||
loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu"));
|
||||
loadDomains.load(loaderData, new EdgeDomain("www.marginalia.nu"));
|
||||
|
||||
assertTrue(loaderData.getDomainId(new EdgeDomain("www.marginalia.nu")) >= 0);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void loadDomains() {
|
||||
|
||||
try (var dataSource = TestUtil.getConnection()) {
|
||||
TestUtil.evalScript(dataSource, "sql/edge-crawler-cache.sql");
|
||||
|
||||
var loadDomains = new SqlLoadDomains(dataSource);
|
||||
var loaderData = new LoaderData(10);
|
||||
|
||||
loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu"));
|
||||
loadDomains.load(loaderData, new EdgeDomain[] { new EdgeDomain("www.marginalia.nu"), new EdgeDomain("memex.marginalia.nu") });
|
||||
|
||||
assertTrue(loaderData.getDomainId(new EdgeDomain("www.marginalia.nu")) >= 0);
|
||||
assertTrue(loaderData.getDomainId(new EdgeDomain("memex.marginalia.nu")) >= 0);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
@ -0,0 +1,68 @@
|
||||
package nu.marginalia.wmsa.edge.converting.loader;
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.util.TestUtil;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.parallel.Execution;
|
||||
import org.junit.jupiter.api.parallel.ExecutionMode;
|
||||
import org.junit.jupiter.api.parallel.ResourceAccessMode;
|
||||
import org.junit.jupiter.api.parallel.ResourceLock;
|
||||
|
||||
import java.net.URISyntaxException;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
@ResourceLock(value = "mariadb", mode = ResourceAccessMode.READ_WRITE)
|
||||
@Execution(ExecutionMode.SAME_THREAD)
|
||||
@Tag("db")
|
||||
class SqlLoadProcessedDocumentTest {
|
||||
HikariDataSource dataSource;
|
||||
LoaderData loaderData;
|
||||
@BeforeEach
|
||||
public void setUp() throws URISyntaxException {
|
||||
dataSource = TestUtil.getConnection();
|
||||
TestUtil.evalScript(dataSource, "sql/edge-crawler-cache.sql");
|
||||
|
||||
var loadDomains = new SqlLoadDomains(dataSource);
|
||||
var loadUrls = new SqlLoadUrls(dataSource);
|
||||
|
||||
loaderData = new LoaderData(10);
|
||||
|
||||
loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu"));
|
||||
loadDomains.load(loaderData, new EdgeDomain("www.marginalia.nu"));
|
||||
|
||||
loadUrls.load(loaderData, new EdgeUrl[]{new EdgeUrl("https://www.marginalia.nu/")});
|
||||
|
||||
}
|
||||
|
||||
@AfterEach
|
||||
public void tearDown() {
|
||||
dataSource.close();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void loadProcessedDocument() throws URISyntaxException {
|
||||
var loader = new SqlLoadProcessedDocument(dataSource);
|
||||
loader.load(loaderData, List.of(new LoadProcessedDocument(
|
||||
new EdgeUrl("https://www.marginalia.nu/"),
|
||||
EdgeUrlState.OK,
|
||||
"TITLE",
|
||||
"DESCR",
|
||||
HtmlFeature.encode(Set.of(HtmlFeature.AFFILIATE_LINK)),
|
||||
EdgeHtmlStandard.HTML5,
|
||||
100,
|
||||
12345,
|
||||
-5
|
||||
)));
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,52 @@
|
||||
package nu.marginalia.wmsa.edge.converting.loader;
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.util.TestUtil;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.parallel.Execution;
|
||||
import org.junit.jupiter.api.parallel.ExecutionMode;
|
||||
import org.junit.jupiter.api.parallel.ResourceAccessMode;
|
||||
import org.junit.jupiter.api.parallel.ResourceLock;
|
||||
|
||||
import java.net.URISyntaxException;
|
||||
|
||||
@ResourceLock(value = "mariadb", mode = ResourceAccessMode.READ_WRITE)
|
||||
@Execution(ExecutionMode.SAME_THREAD)
|
||||
@Tag("db")
|
||||
class SqlLoadProcessedDomainTest {
|
||||
HikariDataSource dataSource;
|
||||
LoaderData loaderData;
|
||||
@BeforeEach
|
||||
public void setUp() {
|
||||
dataSource = TestUtil.getConnection();
|
||||
TestUtil.evalScript(dataSource, "sql/edge-crawler-cache.sql");
|
||||
|
||||
var loadDomains = new SqlLoadDomains(dataSource);
|
||||
loaderData = new LoaderData(10);
|
||||
|
||||
loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu"));
|
||||
loadDomains.load(loaderData, new EdgeDomain[]{ new EdgeDomain("www.marginalia.nu"), new EdgeDomain("memex.marginalia.nu") });
|
||||
}
|
||||
|
||||
@AfterEach
|
||||
public void tearDown() {
|
||||
dataSource.close();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void loadProcessedDomain() {
|
||||
var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource));
|
||||
loader.load(loaderData, new EdgeDomain("www.marginalia.nu"), EdgeDomainIndexingState.BLOCKED, "127.0.0.1");
|
||||
}
|
||||
@Test
|
||||
public void loadDomainAlias() {
|
||||
var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource));
|
||||
loader.loadAlias(loaderData, new DomainLink(new EdgeDomain("memex.marginalia.nu"), new EdgeDomain("www.marginalia.nu")));
|
||||
}
|
||||
}
|
@ -0,0 +1,49 @@
|
||||
package nu.marginalia.wmsa.edge.converting.loader;
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.util.TestUtil;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.parallel.Execution;
|
||||
import org.junit.jupiter.api.parallel.ExecutionMode;
|
||||
import org.junit.jupiter.api.parallel.ResourceAccessMode;
|
||||
import org.junit.jupiter.api.parallel.ResourceLock;
|
||||
|
||||
import java.net.URISyntaxException;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
@ResourceLock(value = "mariadb", mode = ResourceAccessMode.READ_WRITE)
|
||||
@Execution(ExecutionMode.SAME_THREAD)
|
||||
@Tag("db")
|
||||
class SqlLoadUrlsTest {
|
||||
HikariDataSource dataSource;
|
||||
LoaderData loaderData;
|
||||
@BeforeEach
|
||||
public void setUp() {
|
||||
dataSource = TestUtil.getConnection();
|
||||
TestUtil.evalScript(dataSource, "sql/edge-crawler-cache.sql");
|
||||
|
||||
var loadDomains = new SqlLoadDomains(dataSource);
|
||||
loaderData = new LoaderData(10);
|
||||
|
||||
loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu"));
|
||||
loadDomains.load(loaderData, new EdgeDomain("www.marginalia.nu"));
|
||||
}
|
||||
|
||||
@AfterEach
|
||||
public void tearDown() {
|
||||
dataSource.close();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void loadUrl() throws URISyntaxException {
|
||||
var loadUrls = new SqlLoadUrls(dataSource);
|
||||
loadUrls.load(loaderData, new EdgeUrl[] { new EdgeUrl("https://www.marginalia.nu/") });
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue
Block a user