WIP: Database refactoring

This commit is contained in:
vlofgren 2022-06-07 22:34:53 +02:00
parent 0e65384781
commit c915664fcc
34 changed files with 451 additions and 626 deletions

View File

@ -1,49 +0,0 @@
package nu.marginalia.util.ranking;
import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList;
import gnu.trove.map.hash.TIntIntHashMap;
import it.unimi.dsi.fastutil.ints.IntArrays;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.sql.SQLException;
/**
 * Selects a set of candidate domain ids from the database and orders them by
 * their position in a page rank computed from the given origin domains.
 *
 * <p>All work happens in the constructor; the finished, sorted id list is
 * exposed through {@link #getResult()}.
 */
public class AcademiaRank {
    private final TIntArrayList result;
    private static final Logger logger = LoggerFactory.getLogger(AcademiaRank.class);

    /** Query producing the candidate domain ids (first column) that are subsequently re-ordered. */
    private static final String CANDIDATE_DOMAINS_SQL = "select EC_DOMAIN.ID,COUNT(SOURCE_DOMAIN_ID) AS CNT from EC_DOMAIN INNER JOIN DOMAIN_METADATA ON DOMAIN_METADATA.ID=EC_DOMAIN.ID INNER JOIN EC_DOMAIN_LINK ON EC_DOMAIN_LINK.DEST_DOMAIN_ID=EC_DOMAIN.ID WHERE INDEXED>0 AND STATE>=0 AND STATE<2 AND ((VISITED_URLS>1000+1500*RANK AND RANK<1) OR (GOOD_URLS>1000 AND URL_PART LIKE '%edu')) GROUP BY EC_DOMAIN.ID HAVING CNT<1500 ORDER BY RANK ASC";

    /**
     * Computes the ranking and loads the candidate domains.
     *
     * <p>An SQL failure while loading candidates is logged and not rethrown; the
     * result then contains whatever ids were read before the failure (possibly none).
     *
     * @param ds      data source used both for the page rank computation and the candidate query
     * @param origins origin domain names handed to {@code BetterStandardPageRank}
     * @throws IOException declared by the original signature; kept for caller compatibility
     */
    public AcademiaRank(HikariDataSource ds, String... origins) throws IOException {
        TIntList rankingResults = new BetterStandardPageRank(ds, origins).pageRank(100_000);

        // Map each ranked id to its index in the ranking output. The last constructor
        // argument is the value returned for ids that are not in the map, so unranked
        // ids compare as 1_000_000_000 and end up after every ranked id.
        TIntIntHashMap idToRanking = new TIntIntHashMap(100_000, 0.5f, -1, 1_000_000_000);
        for (int i = 0; i < rankingResults.size(); i++) {
            idToRanking.put(rankingResults.get(i), i);
        }

        result = new TIntArrayList(10000);

        try (var conn = ds.getConnection();
             var stmt = conn.prepareStatement(CANDIDATE_DOMAINS_SQL)) {
            stmt.setFetchSize(1000);
            // Close the result set explicitly instead of relying on statement close.
            try (var rsp = stmt.executeQuery()) {
                while (rsp.next()) {
                    result.add(rsp.getInt(1));
                }
            }
        }
        catch (SQLException ex) {
            logger.error("SQL error", ex);
        }

        // Sort a copy of the ids by ranking position, then write the sorted values
        // back over the list contents starting at offset 0.
        int[] internalArray = result.toArray();
        IntArrays.quickSort(internalArray,
                (a, b) -> Integer.compare(idToRanking.get(a), idToRanking.get(b)));
        result.set(0, internalArray);
    }

    /**
     * @return the candidate domain ids, ordered by ascending ranking position
     */
    public TIntArrayList getResult() {
        return result;
    }
}

View File

@ -72,10 +72,10 @@ public abstract class RankingAlgorithm {
String s;
if (getNames) {
s = "SELECT EC_DOMAIN.ID,URL_PART,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
s = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
}
else {
s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
}
try (var stmt = conn.prepareStatement(s)) {
stmt.setFetchSize(10000);
@ -84,7 +84,7 @@ public abstract class RankingAlgorithm {
int id = rsp.getInt(1);
if (!spamDomains.contains(id)) {
domainsById.put(id, new DomainData(id, rsp.getString(2), rsp.getInt(3), rsp.getInt(4), rsp.getInt(5), false));
domainsById.put(id, new DomainData(id, rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5), false));
domainIndexToId.put(domainIndexToId.size(), id);
domainIdToIndex.put(id, domainIdToIndex.size());
@ -125,7 +125,7 @@ public abstract class RankingAlgorithm {
}
}
try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART LIKE ?")) {
try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME LIKE ?")) {
for (var seed : this.originDomains) {
stmt.setString(1, seed);
var rsp = stmt.executeQuery();
@ -159,10 +159,10 @@ public abstract class RankingAlgorithm {
try (var conn = dataSource.getConnection()) {
String s;
if (getNames) {
s = "SELECT EC_DOMAIN.ID,URL_PART,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
s = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
}
else {
s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
}
try (var stmt = conn.prepareStatement(s)) {
stmt.setFetchSize(10000);
@ -172,7 +172,7 @@ public abstract class RankingAlgorithm {
int id = rsp.getInt(1);
if (!spamDomains.contains(id)) {
domainsById.put(id, new DomainData(id, rsp.getString(2), rsp.getInt(3), rsp.getInt(4), rsp.getInt(5), true));
domainsById.put(id, new DomainData(id, rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5), true));
domainIndexToId.put(domainIndexToId.size(), id);
domainIdToIndex.put(id, domainIdToIndex.size());
@ -451,7 +451,7 @@ public abstract class RankingAlgorithm {
public final int id;
public final String name;
private int alias;
private int state;
private EdgeDomainIndexingState state;
public final int knownUrls;
public boolean peripheral;
@ -465,11 +465,11 @@ public abstract class RankingAlgorithm {
}
public boolean isSpecial() {
return EdgeDomainIndexingState.SPECIAL.code == state;
return EdgeDomainIndexingState.SPECIAL == state;
}
public boolean isSocialMedia() {
return EdgeDomainIndexingState.SOCIAL_MEDIA.code == state;
return EdgeDomainIndexingState.SOCIAL_MEDIA == state;
}
}

View File

@ -66,7 +66,7 @@ public class OldReversePageRankV2 {
originDomains.add("memex.marginalia.nu");
try (var conn = dataSource.getConnection()) {
try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE FROM EC_DOMAIN WHERE INDEXED>1 AND STATE>=0 AND QUALITY_RAW>=-10")) {
try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE FROM EC_DOMAIN WHERE INDEXED>1 AND IS_ALIVE")) {
stmt.setFetchSize(10000);
var rsp = stmt.executeQuery();
while (rsp.next()) {
@ -90,7 +90,7 @@ public class OldReversePageRankV2 {
}
}
try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART=?")) {
try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
stmt.setFetchSize(10000);
for (var seed : this.originDomains) {

View File

@ -48,7 +48,7 @@ public class StandardPageRank {
originDomains.addAll(Arrays.asList(origins));
try (var conn = dataSource.getConnection()) {
try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE,URL_PART FROM EC_DOMAIN WHERE INDEXED>1 AND STATE>=0 AND QUALITY>=-10")) {
try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE,DOMAIN_NAME FROM EC_DOMAIN WHERE INDEXED>1 AND IS_ALIVE AND QUALITY>=-10")) {
stmt.setFetchSize(10000);
var rsp = stmt.executeQuery();
while (rsp.next()) {
@ -78,7 +78,7 @@ public class StandardPageRank {
}
}
try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART=?")) {
try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
for (var seed : this.originDomains) {
stmt.setString(1, seed);
var rsp = stmt.executeQuery();

View File

@ -50,7 +50,7 @@ public class DedupTool {
Map<Integer, Map<Integer, List<Data>>> domainToHashToUrl = new HashMap<>();
try (var conn = ds.getConnection();
var fetchStmt = conn.prepareStatement("SELECT URL_TOP_DOMAIN_ID,DATA_HASH,URL,EC_URL.ID,EC_DOMAIN.URL_PART FROM EC_URL INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE DATA_HASH IS NOT NULL");
var fetchStmt = conn.prepareStatement("SELECT URL_TOP_DOMAIN_ID,DATA_HASH,URL,EC_URL.ID,EC_DOMAIN.DOMAIN_NAME FROM EC_URL INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE DATA_HASH IS NOT NULL");
var updateStmt = conn.prepareStatement("UPDATE EC_URL SET STATE='redirect' WHERE ID=?")
) {

View File

@ -112,10 +112,10 @@ public class PerusePageRankV2 {
try (var conn = dataSource.getConnection()) {
String s;
if (getNames) {
s = "SELECT EC_DOMAIN.ID,URL_PART,DOMAIN_ALIAS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 GROUP BY EC_DOMAIN.ID";
s = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) GROUP BY EC_DOMAIN.ID";
}
else {
s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS FROM EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 GROUP BY EC_DOMAIN.ID";
s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS FROM EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) GROUP BY EC_DOMAIN.ID";
}
try (var stmt = conn.prepareStatement(s)) {
stmt.setFetchSize(10000);

View File

@ -1,30 +0,0 @@
package nu.marginalia.util.ranking.tool;
import lombok.SneakyThrows;
import nu.marginalia.util.ranking.AcademiaRank;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import org.mariadb.jdbc.Driver;
import java.io.IOException;
/**
 * Command line helper that computes an {@link AcademiaRank} for two fixed origin
 * domains and prints the names of up to the first 100 resulting domains.
 */
public class TestAcademiaRankTool {

    /**
     * Runs the ranking and prints one domain name per line to standard output.
     *
     * @param args ignored
     */
    @SneakyThrows
    public static void main(String... args) {
        // NOTE(review): the instance is unused; presumably kept so the MariaDB
        // driver class is loaded before any connection is made — confirm.
        Driver driver = new Driver();

        // One data source is shared by the ranking and the name lookup, and is
        // closed when done (previously two were created and neither was closed).
        try (var ds = new DatabaseModule().provideConnection()) {
            var rank = new AcademiaRank(ds, "www.perseus.tufts.edu", "xroads.virginia.edu");
            var res = rank.getResult();

            try (var c = ds.getConnection();
                 var stmt = c.prepareStatement("SELECT URL_PART FROM EC_DOMAIN WHERE ID=?")) {
                int limit = Math.min(res.size(), 100);
                for (int i = 0; i < limit; i++) {
                    stmt.setInt(1, res.getQuick(i));
                    try (var rsp = stmt.executeQuery()) {
                        while (rsp.next()) {
                            System.out.println(rsp.getString(1));
                        }
                    }
                }
            }
        }
    }
}

View File

@ -83,11 +83,6 @@ public class UpdateDomainRanksTool {
}
}
logger.info("Recalculating quality");
try (var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET QUALITY=-5*RANK+IF(RANK=1,RANK*GREATEST(QUALITY_RAW,QUALITY_ORIGINAL)/2, 0)")) {
stmt.executeUpdate();
}
} catch (SQLException | InterruptedException throwables) {
throwables.printStackTrace();
}

View File

@ -94,9 +94,6 @@ public class UpdateDomainRanksTool2 {
}
logger.info("Recalculating quality");
try (var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET QUALITY=-5*RANK+IF(RANK=1,RANK*GREATEST(QUALITY_RAW,QUALITY_ORIGINAL)/2, 0)")) {
stmt.executeUpdate();
}
} catch (SQLException | InterruptedException throwables) {
throwables.printStackTrace();

View File

@ -29,7 +29,7 @@ public class ReindexTriggerMain {
.build();
try (var ds = db.provideConnection(); var conn = ds.getConnection(); var stmt = conn.createStatement()) {
var rs = stmt.executeQuery("SELECT ID, URL_PART, STATE, INDEXED FROM EC_DOMAIN LIMIT 100");
var rs = stmt.executeQuery("SELECT ID, DOMAIN_NAME, STATE, INDEXED FROM EC_DOMAIN LIMIT 100");
while (rs.next()) {
System.out.printf("%d %s %s %d\n",
rs.getInt(1),
@ -38,7 +38,7 @@ public class ReindexTriggerMain {
rs.getInt(4));
}
rs = stmt.executeQuery("SELECT ID, DOMAIN_ID, URL, VISITED, STATE FROM EC_URL LIMIT 100");
rs = stmt.executeQuery("SELECT ID, DOMAIN_ID, PATH, VISITED, STATE FROM EC_URL LIMIT 100");
while (rs.next()) {
System.out.printf("%d %d %s %d %s\n",
rs.getInt(1),

View File

@ -14,7 +14,7 @@ public interface Interpreter {
void loadRssFeed(EdgeUrl[] rssFeed);
void loadDomainLink(DomainLink[] links);
void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, double quality);
void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip);
void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument);
void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError);

View File

@ -6,11 +6,11 @@ import nu.marginalia.wmsa.edge.converting.interpreter.Interpreter;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
public record LoadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, double quality) implements Instruction {
public record LoadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) implements Instruction {
@Override
public void apply(Interpreter interpreter) {
interpreter.loadProcessedDomain(domain, state, quality);
interpreter.loadProcessedDomain(domain, state, ip);
}
@Override

View File

@ -76,9 +76,9 @@ public class Loader implements Interpreter {
}
@Override
public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, double quality) {
logger.debug("loadProcessedDomain({}, {}, {})", domain, state, quality);
sqlLoadProcessedDomain.load(data, domain, state, quality);
public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) {
logger.debug("loadProcessedDomain({}, {}, {})", domain, state, ip);
sqlLoadProcessedDomain.load(data, domain, state, ip);
}
@Override

View File

@ -30,7 +30,7 @@ public class SqlLoadDomainLinks {
INSERT IGNORE INTO EC_DOMAIN_LINK (SOURCE_DOMAIN_ID, DEST_DOMAIN_ID)
SELECT SOURCE.ID,DEST.ID
FROM EC_DOMAIN SOURCE INNER JOIN EC_DOMAIN DEST
ON SOURCE.URL_PART=FROM_DOMAIN AND DEST.URL_PART=TO_DOMAIN;
ON SOURCE.DOMAIN_NAME=FROM_DOMAIN AND DEST.DOMAIN_NAME=TO_DOMAIN;
END
""");
}
@ -61,8 +61,8 @@ public class SqlLoadDomainLinks {
}
}
}
catch (SQLException sql) {
sql.printStackTrace();
catch (SQLException ex) {
logger.warn("SQL error inserting domain links", ex);
}
}

View File

@ -25,15 +25,9 @@ public class SqlLoadDomains {
stmt.execute("""
CREATE PROCEDURE INSERT_DOMAIN (
IN DOMAIN_NAME VARCHAR(255),
IN SUB_DOMAIN VARCHAR(255),
IN TOP_DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci)
BEGIN
INSERT IGNORE INTO EC_TOP_DOMAIN (URL_PART) VALUES (TOP_DOMAIN);
INSERT IGNORE INTO EC_DOMAIN(URL_PART, URL_SUBDOMAIN, URL_TOP_DOMAIN_ID)
SELECT DOMAIN_NAME,SUB_DOMAIN,ID
FROM EC_TOP_DOMAIN
WHERE EC_TOP_DOMAIN.URL_PART=TOP_DOMAIN;
INSERT IGNORE INTO EC_DOMAIN(DOMAIN_NAME, DOMAIN_TOP) VALUES (DOMAIN_NAME, TOP_DOMAIN);
END
""");
}
@ -46,10 +40,9 @@ public class SqlLoadDomains {
public void load(LoaderData data, EdgeDomain domain) {
try (var connection = dataSource.getConnection()) {
try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?,?)")) {
try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?)")) {
insertCall.setString(1, domain.toString());
insertCall.setString(2, domain.subDomain);
insertCall.setString(3, domain.domain);
insertCall.setString(2, domain.domain);
insertCall.addBatch();
var ret = insertCall.executeUpdate();
@ -57,12 +50,11 @@ public class SqlLoadDomains {
logger.warn("load({}) -- bad row count {}", domain, ret);
}
connection.commit();
findIdForTargetDomain(connection, data);
}
}
catch (SQLException ex) {
ex.printStackTrace();
logger.warn("SQL error inserting domain", ex);
}
@ -73,12 +65,11 @@ public class SqlLoadDomains {
try (var connection = dataSource.getConnection()) {
connection.setAutoCommit(false);
try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?,?)")) {
try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?)")) {
for (var domain : domains) {
insertCall.setString(1, domain.toString());
insertCall.setString(2, domain.subDomain);
insertCall.setString(3, domain.domain);
insertCall.setString(2, domain.domain);
insertCall.addBatch();
}
var ret = insertCall.executeBatch();
@ -95,7 +86,7 @@ public class SqlLoadDomains {
findIdForTargetDomain(connection, data);
}
catch (SQLException ex) {
ex.printStackTrace();
logger.warn("SQL error inserting domains", ex);
}
}
@ -104,7 +95,7 @@ public class SqlLoadDomains {
return;
}
try (var query = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART=?"))
try (var query = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?"))
{
var targetDomain = data.getTargetDomain();
@ -118,7 +109,7 @@ public class SqlLoadDomains {
}
}
catch (SQLException ex) {
ex.printStackTrace();
logger.warn("SQL error finding id for domain", ex);
}
}
}

View File

@ -31,14 +31,13 @@ public class SqlLoadProcessedDocument {
IN TITLE VARCHAR(255),
IN DESCRIPTION VARCHAR(255),
IN LENGTH INT,
IN QUALITY_MEASURE DOUBLE,
IN FEATURES INT,
IN STANDARD VARCHAR(32),
IN HASH INT)
BEGIN
SET FOREIGN_KEY_CHECKS=0;
REPLACE INTO EC_PAGE_DATA(ID, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES) VALUES (URL_ID, TITLE, DESCRIPTION, LENGTH, STANDARD, FEATURES);
UPDATE EC_URL SET VISITED=1, STATE=STATE, QUALITY_MEASURE=QUALITY_MEASURE, DATA_HASH=HASH WHERE ID=URL_ID;
REPLACE INTO EC_PAGE_DATA(ID, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH) VALUES (URL_ID, TITLE, DESCRIPTION, LENGTH, STANDARD, FEATURES, HASH);
UPDATE EC_URL SET VISITED=1, STATE=STATE WHERE ID=URL_ID;
SET FOREIGN_KEY_CHECKS=1;
END
""");
@ -47,7 +46,7 @@ public class SqlLoadProcessedDocument {
IN URL_ID INT,
IN STATE VARCHAR(32))
BEGIN
UPDATE EC_URL SET VISITED=1, STATE=STATE, QUALITY_MEASURE=-100, DATA_HASH=NULL WHERE ID=URL_ID;
UPDATE EC_URL SET VISITED=1, STATE=STATE WHERE ID=URL_ID;
END
""");
@ -60,7 +59,8 @@ public class SqlLoadProcessedDocument {
public void load(LoaderData data, List<LoadProcessedDocument> documents) {
try (var conn = dataSource.getConnection();
var stmt = conn.prepareCall("CALL INSERT_PAGE_VISIT(?, ?, ?, ?, ?, ?, ?, ?, ?)")) {
var stmt = conn.prepareCall("CALL INSERT_PAGE_VISIT(?, ?, ?, ?, ?, ?, ?, ?)")) {
conn.setAutoCommit(false);
for (var doc : documents) {
int urlId = data.getUrlId(doc.url());
@ -74,10 +74,9 @@ public class SqlLoadProcessedDocument {
stmt.setString(3, doc.title());
stmt.setString(4, doc.description());
stmt.setInt(5, doc.length());
stmt.setDouble(6, doc.quality());
stmt.setInt(7, doc.htmlFeatures());
stmt.setString(8, doc.standard().name());
stmt.setInt(9, (int) doc.hash());
stmt.setInt(6, doc.htmlFeatures());
stmt.setString(7, doc.standard().name());
stmt.setInt(8, (int) doc.hash());
stmt.addBatch();
}
var ret = stmt.executeBatch();
@ -89,8 +88,8 @@ public class SqlLoadProcessedDocument {
}
conn.commit();
} catch (SQLException e) {
e.printStackTrace();
} catch (SQLException ex) {
logger.warn("SQL error inserting document", ex);
}
@ -117,8 +116,8 @@ public class SqlLoadProcessedDocument {
logger.warn("load({}) -- bad row count {}", documents.get(rv), ret[rv]);
}
}
} catch (SQLException e) {
e.printStackTrace();
} catch (SQLException ex) {
logger.warn("SQL error inserting failed document", ex);
}
}

View File

@ -25,12 +25,12 @@ public class SqlLoadProcessedDomain {
stmt.execute("DROP PROCEDURE IF EXISTS INITIALIZE_DOMAIN");
stmt.execute("""
CREATE PROCEDURE INITIALIZE_DOMAIN (
IN ST INT,
IN ST ENUM('ACTIVE', 'EXHAUSTED', 'SPECIAL', 'SOCIAL_MEDIA', 'BLOCKED', 'REDIR', 'ERROR', 'UNKNOWN'),
IN IDX INT,
IN QUAL DOUBLE,
IN DID INT)
IN DID INT,
IN IP VARCHAR(32))
BEGIN
UPDATE EC_DOMAIN SET INDEX_DATE=NOW(), STATE=ST, DOMAIN_ALIAS=NULL, INDEXED=GREATEST(INDEXED,IDX), QUALITY=QUAL, QUALITY_RAW=QUAL, QUALITY_ORIGINAL=QUAL WHERE ID=DID;
UPDATE EC_DOMAIN SET INDEX_DATE=NOW(), STATE=ST, DOMAIN_ALIAS=NULL, INDEXED=GREATEST(INDEXED,IDX), IP=IP WHERE ID=DID;
DELETE FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=DID;
END
""");
@ -41,7 +41,7 @@ public class SqlLoadProcessedDomain {
}
}
public void load(LoaderData data, EdgeDomain domain, EdgeDomainIndexingState state, double quality) {
public void load(LoaderData data, EdgeDomain domain, EdgeDomainIndexingState state, String ip) {
data.setTargetDomain(domain);
loadDomains.load(data, domain);
@ -49,18 +49,17 @@ public class SqlLoadProcessedDomain {
try (var conn = dataSource.getConnection();
var initCall = conn.prepareCall("CALL INITIALIZE_DOMAIN(?,?,?,?)"))
{
initCall.setInt(1, state.code);
initCall.setString(1, state.name());
initCall.setInt(2, 1 + data.sizeHint / 100);
initCall.setDouble(3, quality);
initCall.setInt(4, data.getDomainId(domain));
initCall.setInt(3, data.getDomainId(domain));
initCall.setString(4, ip);
int rc = initCall.executeUpdate();
if (rc < 1) {
logger.warn("load({},{},{}) -- bad rowcount {}", domain, state, quality, rc);
logger.warn("load({},{}) -- bad rowcount {}", domain, state, rc);
}
conn.commit();
}
catch (SQLException ex) {
ex.printStackTrace();
logger.warn("SQL error initializing domain", ex);
}
}
@ -69,9 +68,9 @@ public class SqlLoadProcessedDomain {
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("""
UPDATE EC_DOMAIN TARGET
INNER JOIN EC_DOMAIN ALIAS ON ALIAS.URL_PART=?
INNER JOIN EC_DOMAIN ALIAS ON ALIAS.DOMAIN_NAME=?
SET TARGET.DOMAIN_ALIAS=ALIAS.ID
WHERE TARGET.URL_PART=?
WHERE TARGET.DOMAIN_NAME=?
""")) {
stmt.setString(1, link.to().toString());
stmt.setString(2, link.from().toString());
@ -81,7 +80,7 @@ public class SqlLoadProcessedDomain {
}
}
catch (SQLException ex) {
ex.printStackTrace();
logger.warn("SQL error inserting domain alias", ex);
}
}
}

View File

@ -25,12 +25,13 @@ public class SqlLoadUrls {
stmt.execute("""
CREATE PROCEDURE INSERT_URL (
IN PROTO VARCHAR(255),
IN DOMAIN_NAME VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,
IN DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,
IN PORT INT,
IN URL VARCHAR(255)
IN PATH VARCHAR(255),
IN PATH_HASH INT
)
BEGIN
INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,URL) SELECT PROTO,ID,PORT,URL FROM EC_DOMAIN WHERE URL_PART=DOMAIN_NAME;
INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PATH_HASH) SELECT PROTO,ID,PORT,PATH,PATH_HASH FROM EC_DOMAIN WHERE DOMAIN_NAME=DOMAIN;
END
""");
}
@ -42,8 +43,8 @@ public class SqlLoadUrls {
public void load(LoaderData data, EdgeUrl[] urls) {
try (var conn = dataSource.getConnection();
var insertCall = conn.prepareCall("CALL INSERT_URL(?,?,?,?)");
var queryCall = conn.prepareStatement("SELECT ID, PROTO, URL FROM EC_URL WHERE DOMAIN_ID=?")
var insertCall = conn.prepareCall("CALL INSERT_URL(?,?,?,?, ?)");
var queryCall = conn.prepareStatement("SELECT ID, PROTO, PATH FROM EC_URL WHERE DOMAIN_ID=?")
)
{
conn.setAutoCommit(false);
@ -58,6 +59,7 @@ public class SqlLoadUrls {
insertCall.setNull(3, Types.INTEGER);
}
insertCall.setString(4, url.path);
insertCall.setInt(5, url.path.hashCode());
insertCall.addBatch();
}
var ret = insertCall.executeBatch();
@ -86,7 +88,7 @@ public class SqlLoadUrls {
}
catch (SQLException ex) {
ex.printStackTrace();
logger.warn("SQL error inserting URLs", ex);
}
}
}

View File

@ -15,7 +15,7 @@ public class InstructionsCompiler {
public List<Instruction> compile(ProcessedDomain domain) {
List<Instruction> ret = new ArrayList<>(domain.size()*4);
ret.add(new LoadProcessedDomain(domain.domain, domain.state, domain.averageQuality().orElse(-5.)));
ret.add(new LoadProcessedDomain(domain.domain, domain.state, domain.ip));
if (domain.documents != null) {
compileUrls(ret, domain.documents);

View File

@ -34,11 +34,10 @@ public class CrawlJobExtractorMain {
private static final String domainsSql =
"""
SELECT ID, LOWER(EC_DOMAIN.URL_PART)
SELECT ID, LOWER(EC_DOMAIN.DOMAIN_NAME)
FROM EC_DOMAIN
WHERE QUALITY_RAW>-100
AND INDEXED>0
AND STATE<2
WHERE INDEXED>0
AND STATE='ACTIVE' OR STATE='EXHAUSTED'
ORDER BY
INDEX_DATE ASC,
DISCOVER_DATE ASC,
@ -49,8 +48,8 @@ public class CrawlJobExtractorMain {
private static final String urlsSql =
"""
SELECT CONCAT(PROTO, "://", ?, URL)
FROM EC_URL
SELECT URL
FROM EC_URL_VIEW
WHERE DOMAIN_ID=?
ORDER BY
VISITED DESC,

View File

@ -30,19 +30,19 @@ public class CrawlJobExtractorPageRankMain {
"""
SELECT ID
FROM EC_DOMAIN
WHERE URL_PART=?
WHERE DOMAIN_NAME=?
""";
private static final String specificDomainSqlFromId =
"""
SELECT LOWER(URL_PART)
SELECT LOWER(DOMAIN_NAME)
FROM EC_DOMAIN
WHERE ID=?
""";
private static final String urlsSql =
"""
SELECT CONCAT(PROTO, "://", ?, URL)
FROM EC_URL
SELECT URL
FROM EC_URL_VIEW
WHERE DOMAIN_ID=?
ORDER BY
VISITED DESC,

View File

@ -16,22 +16,14 @@ public interface EdgeDataStoreDao {
boolean isBlacklisted(EdgeDomain domain);
EdgeId<EdgeDomain> getDomainId(EdgeDomain domain);
EdgeId<EdgeUrl> getUrlId(EdgeUrl domain);
EdgeUrl getUrl(EdgeId<EdgeUrl> id);
EdgeUrlDetails getUrlDetails(EdgeId<EdgeUrl> id);
List<BrowseResult> getDomainNeighbors(EdgeId<EdgeDomain> domainId, EdgeDomainBlacklist backlist, int count);
List<BrowseResult> getDomainNeighborsAdjacent(EdgeId<EdgeDomain> domainId, EdgeDomainBlacklist backlist, int count);
List<BrowseResult> getRandomDomains(int count, EdgeDomainBlacklist backlist);
List<EdgeUrlDetails> getUrlDetailsMulti(List<EdgeId<EdgeUrl>> ids);
List<EdgeId<EdgeDomain>> getDomainIdsFromUrlIds(Collection<EdgeId<EdgeUrl>> urlIds);
EdgeDomain getDomain(EdgeId<EdgeDomain> id);
List<EdgeId<EdgeUrl>> inboudUrls(EdgeId<EdgeUrl> id, int limit);
List<EdgeId<EdgeUrl>> outboundUrls(EdgeId<EdgeUrl> id, int limit);
Optional<EdgeId<EdgeUrl>> resolveAmbiguousDomain(String name);
@ -48,9 +40,6 @@ public interface EdgeDataStoreDao {
List<EdgeDomain> getLinkingDomains(EdgeId<EdgeDomain> domainId);
List<EdgeUrl> getNewUrls(EdgeId<EdgeDomain> domainId, Collection<EdgeUrl> links);
double getRank(EdgeId<EdgeDomain> domainId);
void updateDomainIndexTimestamp(EdgeDomain domain, EdgeDomainIndexingState state, EdgeDomain alias, int minIndexed);
}

View File

@ -71,7 +71,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
try (var connection = dataSource.getConnection()) {
return domainIdCache.get(domain, () -> {
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART=?")) {
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
stmt.setString(1, domain.toString());
var rsp = stmt.executeQuery();
if (rsp.next()) {
@ -86,104 +86,14 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
}
}
@Override
@SneakyThrows
public EdgeId<EdgeUrl> getUrlId(EdgeUrl url) {
try (var connection = dataSource.getConnection()) {
return urlIdCache.get(url, () -> {
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_URL_VIEW WHERE URL_PATH=? AND URL_DOMAIN=? AND URL_PROTO=?")) {
stmt.setString(1, url.path);
stmt.setString(2, url.domain.toString());
stmt.setString(3, url.proto);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return new EdgeId<>(rsp.getInt(1));
}
}
// Lenient mode for http->https upgrades etc
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_URL_VIEW WHERE URL_PATH=? AND URL_DOMAIN=?")) {
stmt.setString(1, url.path);
stmt.setString(2, url.domain.toString());
var rsp = stmt.executeQuery();
if (rsp.next()) {
return new EdgeId<>(rsp.getInt(1));
}
}
throw new NoSuchElementException(url.toString());
});
}
catch (UncheckedExecutionException ex) {
throw ex.getCause();
private <T> String idList(List<EdgeId<T>> ids) {
StringJoiner j = new StringJoiner(",", "(", ")");
for (var id : ids) {
j.add(Integer.toString(id.getId()));
}
return j.toString();
}
@SneakyThrows
@Override
public List<EdgeId<EdgeDomain>> getDomainIdsFromUrlIds(Collection<EdgeId<EdgeUrl>> urlIds) {
List<EdgeId<EdgeDomain>> results = new ArrayList<>(urlIds.size());
if (urlIds.isEmpty())
return results;
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT DOMAIN_ID FROM EC_URL WHERE ID IN " + urlIds
.stream()
.map(EdgeId::getId)
.map(Object::toString)
.collect(Collectors.joining(",", "(", ")"))))
{
var rsp = stmt.executeQuery();
while (rsp.next()) {
results.add(new EdgeId<>(rsp.getInt(1)));
}
}
}
return results;
}
static final Pattern badChars = Pattern.compile("[';\\\\]");
private String saneString(String s) {
return "\'"+badChars.matcher(s).replaceAll("?")+"\'";
}
@SneakyThrows
@Override
public EdgeUrl getUrl(EdgeId<EdgeUrl> id) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.createStatement()) {
var rsp = stmt.executeQuery("SELECT URL_PROTO, URL_DOMAIN,URL_PORT,URL_PATH FROM EC_URL_VIEW WHERE ID=" + id.getId());
if (rsp.next()) {
return new EdgeUrl(rsp.getString(1), new EdgeDomain(rsp.getString(2)), rsp.getInt(3), rsp.getString(4));
}
throw new NoSuchElementException();
}
}
}
@SneakyThrows
@Override
public EdgeUrlDetails getUrlDetails(EdgeId<EdgeUrl> id) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.createStatement()) {
var rsp = stmt.executeQuery("SELECT ID,URL_PROTO,URL_DOMAIN,URL_PORT,URL_PATH,TITLE,DESCRIPTION,URL_QUALITY_MEASURE,DOMAIN_QUALITY_MEASURE,IFNULL(EC_DOMAIN_LINK_AGGREGATE.LINKS,1),WORDS_TOTAL,FORMAT,FEATURES,\"\",QUALITY_RAW,DOMAIN_STATE,DATA_HASH FROM EC_URL_VIEW LEFT JOIN EC_DOMAIN_LINK_AGGREGATE ON EC_DOMAIN_LINK_AGGREGATE.DOMAIN_ID=EC_URL_VIEW.DOMAIN_ID WHERE ID=" + id.getId());
if (rsp.next()) {
EdgeUrl url = new EdgeUrl(rsp.getString(2), new EdgeDomain(rsp.getString(3)), rsp.getInt(4), rsp.getString(5));
return new EdgeUrlDetails(rsp.getInt(1), url, rsp.getString(6), rsp.getString(7), rsp.getDouble(8), rsp.getDouble(15), rsp.getDouble(9), rsp.getInt(10), rsp.getInt(11), rsp.getString(12), rsp.getInt(13), EdgePageScoreAdjustment.zero(), Integer.MAX_VALUE, Double.MAX_VALUE, rsp.getString(14), rsp.getInt(16), 0, rsp.getInt(17));
}
throw new NoSuchElementException();
}
}
}
@SneakyThrows
@Override
public List<EdgeUrlDetails> getUrlDetailsMulti(List<EdgeId<EdgeUrl>> ids) {
@ -193,16 +103,38 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
List<EdgeUrlDetails> result = new ArrayList<>(ids.size());
try (var connection = dataSource.getConnection()) {
// This is SQL-injection safe, the IDs are of type int
String idString = ids.stream().map(EdgeId::getId).map(Objects::toString).collect(Collectors.joining(",", "(", ")"));
try (var stmt = connection.prepareStatement("SELECT ID,URL_PROTO,URL_DOMAIN,URL_PORT,URL_PATH,TITLE,DESCRIPTION,URL_QUALITY_MEASURE,DOMAIN_QUALITY_MEASURE,IFNULL(EC_DOMAIN_LINK_AGGREGATE.LINKS,1),WORDS_TOTAL,FORMAT,FEATURES,\"\",QUALITY_RAW,DOMAIN_STATE,DATA_HASH FROM EC_URL_VIEW LEFT JOIN EC_DOMAIN_LINK_AGGREGATE ON EC_DOMAIN_LINK_AGGREGATE.DOMAIN_ID=EC_URL_VIEW.DOMAIN_ID WHERE ID IN " + idString)) {
String idString = idList(ids);
try (var stmt = connection.prepareStatement(
"""
SELECT ID, URL,
TITLE, DESCRIPTION,
WORDS_TOTAL, FORMAT, FEATURES,
IP, DOMAIN_STATE, DATA_HASH
FROM EC_URL_VIEW WHERE ID IN
""" + idString)) {
// "SELECT ID,URL_PROTO,URL_DOMAIN,URL_PORT,URL_PATH,TITLE,DESCRIPTION,URL_QUALITY_MEASURE,DOMAIN_QUALITY_MEASURE,IFNULL(EC_DOMAIN_LINK_AGGREGATE.LINKS,1),WORDS_TOTAL,FORMAT,FEATURES,\"\",QUALITY_RAW,DOMAIN_STATE,DATA_HASH FROM EC_URL_VIEW LEFT JOIN EC_DOMAIN_LINK_AGGREGATE ON EC_DOMAIN_LINK_AGGREGATE.DOMAIN_ID=EC_URL_VIEW.DOMAIN_ID WHERE ID IN " + idString)) {
stmt.setFetchSize(ids.size());
var rsp = stmt.executeQuery();
while (rsp.next()) {
EdgeUrl url = new EdgeUrl(rsp.getString(2), new EdgeDomain(rsp.getString(3)), rsp.getInt(4), rsp.getString(5));
var val = new EdgeUrlDetails(rsp.getInt(1), url, rsp.getString(6), rsp.getString(7), rsp.getDouble(8), rsp.getDouble(15), rsp.getDouble(9), rsp.getInt(10), rsp.getInt(11), rsp.getString(12), rsp.getInt(13), EdgePageScoreAdjustment.zero(), Integer.MAX_VALUE, Double.MAX_VALUE, rsp.getString(14), rsp.getInt(16), 0, rsp.getInt(17));
EdgeUrl url = new EdgeUrl(rsp.getString(2));
var val = new EdgeUrlDetails(rsp.getInt(1), url,
rsp.getString(3), // title
rsp.getString(4), // description
-5, // quality
rsp.getInt(5), // wordsTotal
rsp.getString(6), // format
rsp.getInt(7), // features
rsp.getString(8), // ip
EdgeDomainIndexingState.valueOf(rsp.getString(9)), // domainState
rsp.getInt(10), // dataHash
EdgePageScoreAdjustment.zero(), // urlQualityAdjustment
Integer.MAX_VALUE, // rankingId
Double.MAX_VALUE, // termScore
0 // queryLength
);
if (val.urlQuality >= QUALITY_LOWER_BOUND_CUTOFF) {
result.add(val);
}
@ -214,75 +146,6 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
return result;
}
/**
 * Collects browse results for domains related to {@code domainId}.
 *
 * Three queries are run in order on a single connection, each limited to
 * {@code count} rows: the EC_DOMAIN_NEIGHBORS table ordered by ADJ_IDX, then
 * two EC_DOMAIN_LINK lookups ordered by RANK. Rows whose domain id is
 * blacklisted are dropped and the remainder are de-duplicated through a set,
 * so the order of the returned list is unspecified.
 *
 * @param domainId  the domain whose neighbors are requested
 * @param blacklist filter consulted for every candidate domain id
 * @param count     row limit (and fetch size) applied to each individual query
 * @return the collected results; on an SQL error the error is logged and
 *         whatever was collected before the failure is returned
 */
@Override
public List<BrowseResult> getDomainNeighbors(EdgeId<EdgeDomain> domainId, EdgeDomainBlacklist blacklist, int count) {
    final Set<BrowseResult> domains = new HashSet<>(count*3);

    final String q = "SELECT EC_DOMAIN.ID AS NEIGHBOR_ID, URL_PART from EC_DOMAIN_NEIGHBORS INNER JOIN EC_DOMAIN ON NEIGHBOR_ID=EC_DOMAIN.ID WHERE STATE<2 AND DOMAIN_ALIAS IS NULL AND EC_DOMAIN_NEIGHBORS.DOMAIN_ID = ? ORDER BY ADJ_IDX LIMIT ?";
    final String q2 = "SELECT EC_DOMAIN.ID, URL_PART FROM EC_DOMAIN_LINK INNER JOIN EC_DOMAIN ON DEST_DOMAIN_ID=EC_DOMAIN.ID WHERE SOURCE_DOMAIN_ID=? AND STATE<2 AND DOMAIN_ALIAS IS NULL GROUP BY EC_DOMAIN.ID ORDER BY RANK ASC LIMIT ?";
    // NOTE(review): q3 joins on DEST_DOMAIN_ID and also filters on DEST_DOMAIN_ID=?,
    // so it looks like it can only match the queried domain itself. Confirm whether
    // a join on SOURCE_DOMAIN_ID (inbound linkers) was intended; left unchanged here.
    final String q3 = "SELECT EC_DOMAIN.ID, URL_PART FROM EC_DOMAIN_LINK INNER JOIN EC_DOMAIN ON DEST_DOMAIN_ID=EC_DOMAIN.ID WHERE DEST_DOMAIN_ID=? AND STATE<2 AND DOMAIN_ALIAS IS NULL GROUP BY EC_DOMAIN.ID ORDER BY RANK ASC LIMIT ?";

    try (var connection = dataSource.getConnection()) {
        collectNeighborDomains(connection, q, domainId, blacklist, count, domains);
        collectNeighborDomains(connection, q2, domainId, blacklist, count, domains);
        collectNeighborDomains(connection, q3, domainId, blacklist, count, domains);
    } catch (SQLException ex) {
        // Best effort: report through the class logger rather than stderr
        // and return the partial result.
        logger.error("SQL error", ex);
    }

    return new ArrayList<>(domains);
}

/**
 * Runs one neighbor query (parameter 1 = domain id, parameter 2 = row limit)
 * and adds every non-blacklisted (id, domain name) row to {@code domains}.
 */
private void collectNeighborDomains(Connection connection,
                                    String query,
                                    EdgeId<EdgeDomain> domainId,
                                    EdgeDomainBlacklist blacklist,
                                    int count,
                                    Set<BrowseResult> domains) throws SQLException {
    try (var stmt = connection.prepareStatement(query)) {
        stmt.setFetchSize(count);
        stmt.setInt(1, domainId.getId());
        stmt.setInt(2, count);

        try (var rsp = stmt.executeQuery()) {
            while (rsp.next()) {
                int id = rsp.getInt(1);
                String domain = rsp.getString(2);

                if (!blacklist.isBlacklisted(id)) {
                    var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/");
                    domains.add(new BrowseResult(url, id));
                }
            }
        }
    }
}
@Override
public List<BrowseResult> getDomainNeighborsAdjacent(EdgeId<EdgeDomain> domainId, EdgeDomainBlacklist blacklist, int count) {
@ -357,7 +220,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
if (domains.size() < count/2) {
final String q3 = """
SELECT EC_DOMAIN.ID, URL_PART
SELECT EC_DOMAIN.ID, URL_PART
FROM EC_DOMAIN
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
INNER JOIN EC_DOMAIN_LINK B ON B.SOURCE_DOMAIN_ID=EC_DOMAIN.ID
@ -399,7 +262,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
@Override
public List<BrowseResult> getRandomDomains(int count, EdgeDomainBlacklist blacklist) {
final String q = "SELECT DOMAIN_ID,URL_PART FROM EC_RANDOM_DOMAINS INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE STATE<2 AND DOMAIN_ALIAS IS NULL ORDER BY RAND() LIMIT ?";
final String q = "SELECT DOMAIN_ID,DOMAIN_NAME FROM EC_RANDOM_DOMAINS INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE STATE<2 AND DOMAIN_ALIAS IS NULL ORDER BY RAND() LIMIT ?";
List<BrowseResult> domains = new ArrayList<>(count);
try (var conn = dataSource.getConnection()) {
try (var stmt = conn.prepareStatement(q)) {
@ -428,7 +291,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
public EdgeDomain getDomain(EdgeId<EdgeDomain> id) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT URL_PART FROM EC_DOMAIN WHERE ID=?")) {
try (var stmt = connection.prepareStatement("SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE ID=?")) {
stmt.setInt(1, id.getId());
var rsp = stmt.executeQuery();
if (rsp.next()) {
@ -439,55 +302,11 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
}
}
/**
 * Looks up URL ids in the EC_RELATED_LINKS_IN view whose DEST_URL_ID equals
 * the given id, ordered by SRC_URL_QUALITY descending.
 *
 * @param id    the destination URL id to match
 * @param limit maximum number of rows to return (also used as fetch size)
 * @return the SRC_URL_ID values of the matching rows, in query order
 */
@Override @SneakyThrows
public List<EdgeId<EdgeUrl>> inboudUrls(EdgeId<EdgeUrl> id, int limit) {
    final String sql = "SELECT SRC_URL_ID FROM EC_RELATED_LINKS_IN WHERE DEST_URL_ID=? ORDER BY SRC_URL_QUALITY DESC LIMIT ?";
    final List<EdgeId<EdgeUrl>> sourceIds = new ArrayList<>();

    try (var conn = dataSource.getConnection();
         var query = conn.prepareStatement(sql)) {
        query.setFetchSize(limit);
        query.setInt(1, id.getId());
        query.setInt(2, limit);

        for (var rows = query.executeQuery(); rows.next(); ) {
            sourceIds.add(new EdgeId<>(rows.getInt(1)));
        }
    }

    return sourceIds;
}
/**
 * Looks up URL ids in the EC_RELATED_LINKS_IN view whose SRC_URL_ID equals
 * the given id.
 *
 * NOTE(review): rows are ordered by SRC_URL_QUALITY even though the
 * destination ids are what is selected; confirm whether DEST_URL_QUALITY
 * was intended.
 *
 * @param id    the source URL id to match
 * @param limit maximum number of rows to return (also used as fetch size)
 * @return the DEST_URL_ID values of the matching rows, in query order
 */
@Override @SneakyThrows
public List<EdgeId<EdgeUrl>> outboundUrls(EdgeId<EdgeUrl> id, int limit) {
    final String sql = "SELECT DEST_URL_ID FROM EC_RELATED_LINKS_IN WHERE SRC_URL_ID=? ORDER BY SRC_URL_QUALITY DESC LIMIT ?";
    final List<EdgeId<EdgeUrl>> destinationIds = new ArrayList<>();

    try (var conn = dataSource.getConnection();
         var query = conn.prepareStatement(sql)) {
        query.setFetchSize(limit);
        query.setInt(1, id.getId());
        query.setInt(2, limit);

        for (var rows = query.executeQuery(); rows.next(); ) {
            destinationIds.add(new EdgeId<>(rows.getInt(1)));
        }
    }

    return destinationIds;
}
@Override
public Optional<EdgeId<EdgeUrl>> resolveAmbiguousDomain(String name) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) {
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
stmt.setString(1, name);
var rsp = stmt.executeQuery();
if (rsp.next()) {
@ -495,7 +314,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
}
}
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) {
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
stmt.setString(1, "https://"+name);
var rsp = stmt.executeQuery();
if (rsp.next()) {
@ -503,7 +322,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
}
}
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) {
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
stmt.setString(1, "http://"+name);
var rsp = stmt.executeQuery();
if (rsp.next()) {
@ -511,7 +330,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
}
}
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) {
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
stmt.setString(1, "https://www."+name);
var rsp = stmt.executeQuery();
if (rsp.next()) {
@ -519,7 +338,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
}
}
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) {
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
stmt.setString(1, "http://www."+name);
var rsp = stmt.executeQuery();
if (rsp.next()) {
@ -682,27 +501,6 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
return Collections.emptyList();
}
/**
 * Filters {@code links} down to those not already stored for the domain.
 *
 * The links are keyed by {@link EdgeUrl#getPath()} (the first link wins when
 * two share a path), then every value of the URL column in EC_URL for the
 * domain is removed from that map; what remains is returned.
 *
 * @param domainId the domain whose stored URLs are consulted
 * @param links    candidate URLs
 * @return the candidates with no matching EC_URL row, or an empty list if
 *         the lookup fails for any reason (the failure is logged)
 */
@Override
public List<EdgeUrl> getNewUrls(EdgeId<EdgeDomain> domainId, Collection<EdgeUrl> links) {
    Map<String, EdgeUrl> edgeUrlByPath = links.stream().collect(Collectors.toMap(EdgeUrl::getPath, Function.identity(), (a,b)->a));

    try (var connection = dataSource.getConnection();
         var stmt = connection.prepareStatement("SELECT URL FROM EC_URL WHERE DOMAIN_ID=?")) {
        stmt.setFetchSize(500);
        stmt.setInt(1, domainId.getId());

        try (var rs = stmt.executeQuery()) {
            while (rs.next()) {
                edgeUrlByPath.remove(rs.getString(1));
            }
        }
    }
    catch (Exception ex) {
        // Deliberately best-effort: callers get an empty list on failure,
        // but the cause is no longer discarded silently.
        logger.error("Failed to look up known URLs for domain {}", domainId, ex);
        return Collections.emptyList();
    }

    return new ArrayList<>(edgeUrlByPath.values());
}
@Override
public double getRank(EdgeId<EdgeDomain> domainId) {
try (var connection = dataSource.getConnection()) {
@ -722,47 +520,5 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
return 1;
}
/**
 * Updates the EC_DOMAIN row of {@code domain}: sets INDEX_DATE to NOW(),
 * writes the state code and alias, and raises INDEXED to at least
 * {@code minIndexed}. The change is committed explicitly; an SQLException
 * is logged and not propagated.
 *
 * @param domain     domain whose row is updated (resolved via getDomainId)
 * @param state      new indexing state; its {@code code} is stored
 * @param alias      alias domain, or null to store SQL NULL in DOMAIN_ALIAS
 * @param minIndexed lower bound for the INDEXED column
 */
@Override
public void updateDomainIndexTimestamp(EdgeDomain domain, EdgeDomainIndexingState state, EdgeDomain alias, int minIndexed) {
    final String sql = "UPDATE EC_DOMAIN SET INDEX_DATE=NOW(), STATE=?, DOMAIN_ALIAS=?, INDEXED=GREATEST(INDEXED,?) WHERE ID=?";

    try (var conn = dataSource.getConnection();
         var update = conn.prepareStatement(sql)) {
        update.setInt(1, state.code);

        if (alias != null) {
            update.setInt(2, getDomainId(alias).getId());
        }
        else {
            update.setNull(2, Types.INTEGER);
        }

        update.setInt(3, minIndexed);
        update.setInt(4, getDomainId(domain).getId());

        update.executeUpdate();
        conn.commit();
    }
    catch (SQLException sqlError) {
        logger.error("SQL error", sqlError);
    }
}
/**
 * Reads QUALITY_RAW for the EC_DOMAIN row whose URL_PART equals
 * {@code src.toString()}.
 *
 * Returns 0 when a row exists and -5 when no row exists or the query fails
 * (the failure is logged). The stored value itself is never returned; it
 * only triggers a warning when it exceeds 0.5.
 *
 * NOTE(review): the warning text says "-> 1" while the method returns 0;
 * this looks like leftover diagnostics, confirm the intended return value.
 */
@SneakyThrows
private double getDomainQuality(Connection connection, EdgeDomain src) {
    try (var query = connection.prepareStatement("SELECT QUALITY_RAW FROM EC_DOMAIN WHERE URL_PART=?")) {
        query.setString(1, src.toString());

        var row = query.executeQuery();
        if (row.next()) {
            final double rawQuality = row.getDouble(1);
            if (rawQuality > 0.5) {
                logger.warn("gDQ({}) -> 1", src);
            }
            return 0;
        }
    }
    catch (SQLException sqlError) {
        logger.error("DB error", sqlError);
    }

    return -5;
}
}

View File

@ -50,7 +50,7 @@ public class EdgeDomainBlacklistImpl implements EdgeDomainBlacklist {
final TIntHashSet result = new TIntHashSet(1_000_000);
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT EC_DOMAIN.ID FROM EC_DOMAIN INNER JOIN EC_TOP_DOMAIN ON EC_DOMAIN.URL_TOP_DOMAIN_ID = EC_TOP_DOMAIN.ID INNER JOIN EC_DOMAIN_BLACKLIST ON EC_DOMAIN_BLACKLIST.URL_DOMAIN = EC_TOP_DOMAIN.URL_PART")) {
try (var stmt = connection.prepareStatement("SELECT EC_DOMAIN.ID FROM EC_DOMAIN INNER JOIN EC_DOMAIN_BLACKLIST ON EC_DOMAIN_BLACKLIST.URL_DOMAIN = EC_DOMAIN.DOMAIN_TOP")) {
stmt.setFetchSize(1000);
var rsp = stmt.executeQuery();
while (rsp.next()) {

View File

@ -30,29 +30,13 @@ public class SearchIndexDao {
logger.info("SearchIndexDao ranking settings = {}", rankingSettings);
}
/**
 * Loads the ids of all EC_DOMAIN rows whose top domain (via EC_TOP_DOMAIN)
 * has a URL_PART listed in EC_DOMAIN_BLACKLIST.URL_DOMAIN.
 *
 * @return a set of the matching domain ids
 */
@SneakyThrows
public TIntHashSet getSpamDomains() {
    final String sql = "SELECT EC_DOMAIN.ID FROM EC_DOMAIN INNER JOIN EC_TOP_DOMAIN ON EC_DOMAIN.URL_TOP_DOMAIN_ID = EC_TOP_DOMAIN.ID INNER JOIN EC_DOMAIN_BLACKLIST ON EC_DOMAIN_BLACKLIST.URL_DOMAIN = EC_TOP_DOMAIN.URL_PART";
    final TIntHashSet spamDomainIds = new TIntHashSet(1_000_000);

    try (var conn = dataSource.getConnection();
         var query = conn.prepareStatement(sql)) {
        for (var rows = query.executeQuery(); rows.next(); ) {
            spamDomainIds.add(rows.getInt(1));
        }
    }

    return spamDomainIds;
}
@SneakyThrows
public TIntHashSet goodUrls() {
TIntHashSet domains = new TIntHashSet(10_000_000, 0.5f, -1);
TIntHashSet urls = new TIntHashSet(100_000_000, 0.5f, -1);
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_ALIAS IS NULL AND STATE>=0")) {
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_ALIAS IS NULL AND IS_ALIVE")) {
stmt.setFetchSize(10_000);
var rsp = stmt.executeQuery();
while (rsp.next()) {

View File

@ -16,25 +16,24 @@ public class EdgeUrlDetails {
public String description;
public double urlQuality;
public double urlQualityRaw;
public double domainQuality;
public int links; // DEAD
public int words;
public String format;
public int features;
public EdgePageScoreAdjustment urlQualityAdjustment;
public long rankingId;
public double termScore;
public String ip; // BROKEN
public int domainState;
public int queryLength;
public EdgeDomainIndexingState domainState;
public int dataHash;
public EdgePageScoreAdjustment urlQualityAdjustment;
public long rankingId;
public double termScore;
public int queryLength;
public long rankingIdAdjustment() {
int penalty = 0;
@ -136,7 +135,7 @@ public class EdgeUrlDetails {
return HtmlFeature.hasFeature(features, HtmlFeature.COOKIES);
}
public boolean isSpecialDomain() {
return domainState == EdgeDomainIndexingState.SPECIAL.code;
return domainState == EdgeDomainIndexingState.SPECIAL;
}
public int getLogRank() { return (int) Math.round(Math.min(Math.log(1+rankingId),10)); }

View File

@ -107,7 +107,7 @@ public class SearchResultDecorator {
private double calculateTermScore(IndexBlock block, EdgeSearchResultItem resultItem, EdgeUrlDetails details) {
return valuator.evaluateTerms(resultItem.scores, block, details.words) / Math.sqrt(1 + resultItem.queryLength)
+ ((details.domainState == EdgeDomainIndexingState.SPECIAL.code) ? 1.25 : 0);
+ ((details.domainState == EdgeDomainIndexingState.SPECIAL) ? 1.25 : 0);
}
}

View File

@ -1,24 +1,11 @@
DROP TABLE IF EXISTS EC_URL_LINK;
DROP VIEW IF EXISTS EC_PAGE_VIEW;
DROP TABLE IF EXISTS DISC_DOMAIN_TAG;
DROP TABLE IF EXISTS DISC_TAG;
DROP TABLE IF EXISTS DISC_USER;
DROP TABLE IF EXISTS EC_DOMAIN_NEIGHBORS;
DROP TABLE IF EXISTS DOMAIN_METADATA;
DROP TABLE IF EXISTS EC_FEED_URL;
DROP TABLE IF EXISTS EC_DOMAIN_LINK;
DROP TABLE IF EXISTS EC_PAGE_DATA;
DROP TABLE IF EXISTS EC_URL;
DROP TABLE IF EXISTS EC_DOMAIN_NEIGHBORS;
DROP TABLE IF EXISTS EC_DOMAIN;
DROP TABLE IF EXISTS EC_TOP_DOMAIN;
DROP TABLE IF EXISTS EC_URL_DETAILS;
DROP VIEW IF EXISTS EC_URL_VIEW;
DROP VIEW IF EXISTS EC_URL_PART_HASH;
DROP TABLE IF EXISTS EC_URL_WORD;
DROP TABLE IF EXISTS EC_DICTIONARY;
DROP TABLE IF EXISTS DOMAIN_METADATA;
CREATE TABLE IF NOT EXISTS DOMAIN_METADATA (
ID INT PRIMARY KEY,
@ -27,52 +14,31 @@ CREATE TABLE IF NOT EXISTS DOMAIN_METADATA (
GOOD_URLS INT DEFAULT 0
);
CREATE TABLE IF NOT EXISTS EC_TOP_DOMAIN (
ID INT PRIMARY KEY AUTO_INCREMENT,
URL_PART VARCHAR(255) UNIQUE NOT NULL,
ALIVE BOOLEAN DEFAULT TRUE NOT NULL
)
CHARACTER SET utf8mb4
COLLATE utf8mb4_unicode_ci;
CREATE TABLE IF NOT EXISTS EC_DOMAIN (
ID INT PRIMARY KEY AUTO_INCREMENT,
URL_PART VARCHAR(255) UNIQUE NOT NULL,
INDEXED INT DEFAULT 0 NOT NULL,
QUALITY DOUBLE DEFAULT -5 NOT NULL,
QUALITY_RAW DOUBLE DEFAULT -5 NOT NULL,
QUALITY_ORIGINAL DOUBLE DEFAULT -5 NOT NULL,
URL_TOP_DOMAIN_ID INT NOT NULL,
URL_SUBDOMAIN VARCHAR(255) NOT NULL,
STATE INT DEFAULT 0 NOT NULL,
DOMAIN_NAME VARCHAR(255) UNIQUE NOT NULL,
DOMAIN_TOP VARCHAR(255) NOT NULL,
INDEXED INT DEFAULT 0 NOT NULL COMMENT "~number of documents visited / 100",
STATE ENUM('ACTIVE', 'EXHAUSTED', 'SPECIAL', 'SOCIAL_MEDIA', 'BLOCKED', 'REDIR', 'ERROR', 'UNKNOWN') NOT NULL DEFAULT 'active' COMMENT "@see EdgeDomainIndexingState",
RANK DOUBLE,
DOMAIN_ALIAS INTEGER,
IP VARCHAR(32),
INDEX_DATE TIMESTAMP DEFAULT NOW(),
DISCOVER_DATE TIMESTAMP DEFAULT NOW(),
FOREIGN KEY (URL_TOP_DOMAIN_ID) REFERENCES EC_TOP_DOMAIN(ID) ON DELETE CASCADE
)
CHARACTER SET utf8mb4
COLLATE utf8mb4_unicode_ci;
CREATE TABLE IF NOT EXISTS EC_DOMAIN_HISTORY (
ID INT PRIMARY KEY AUTO_INCREMENT,
URL_PART VARCHAR(255) UNIQUE NOT NULL,
QUALITY_MEASURE DOUBLE DEFAULT -5 NOT NULL,
INBOUND_LINKS INT DEFAULT 1,
LINK_ADJUSTED_QUALITY DOUBLE GENERATED ALWAYS AS (0.3*QUALITY_MEASURE + 0.7*QUALITY_MEASURE / GREATEST(1, INBOUND_LINKS)),
RANK DOUBLE
IS_ALIVE BOOLEAN AS (STATE='ACTIVE' OR STATE='EXHAUSTED' OR STATE='SPECIAL' OR STATE='SOCIAL_MEDIA') VIRTUAL
)
CHARACTER SET utf8mb4
COLLATE utf8mb4_unicode_ci;
CREATE TABLE IF NOT EXISTS EC_DOMAIN_BLACKLIST (
ID INT PRIMARY KEY AUTO_INCREMENT,
URL_DOMAIN VARCHAR(255) UNIQUE NOT NULL
URL_DOMAIN VARCHAR(255) UNIQUE NOT NULL
)
CHARACTER SET utf8mb4
COLLATE utf8mb4_unicode_ci;
@ -81,18 +47,15 @@ CREATE TABLE IF NOT EXISTS EC_URL (
ID INT PRIMARY KEY AUTO_INCREMENT,
DOMAIN_ID INT NOT NULL,
PROTO ENUM('http','https','gemini') NOT NULL,
URL VARCHAR(255) NOT NULL,
PATH VARCHAR(255) NOT NULL COLLATE utf8mb4_bin,
PORT INT,
PATH_HASH INT NOT NULL COMMENT "Hash of PATH for uniqueness check by domain",
VISITED BOOLEAN NOT NULL DEFAULT FALSE,
DATA_HASH INTEGER,
QUALITY_MEASURE DOUBLE,
STATE ENUM('ok', 'redirect', 'dead', 'archived', 'disqualified') NOT NULL DEFAULT 'ok',
IP VARCHAR(32),
CONSTRAINT CONS UNIQUE (DOMAIN_ID, URL),
CONSTRAINT CONS UNIQUE (DOMAIN_ID, PATH_HASH),
FOREIGN KEY (DOMAIN_ID) REFERENCES EC_DOMAIN(ID) ON DELETE CASCADE
)
CHARACTER SET utf8mb4
@ -101,13 +64,14 @@ COLLATE utf8mb4_unicode_ci;
CREATE TABLE IF NOT EXISTS EC_PAGE_DATA (
ID INT PRIMARY KEY AUTO_INCREMENT,
TITLE VARCHAR(255),
DESCRIPTION VARCHAR(255),
TITLE VARCHAR(255) NOT NULL,
DESCRIPTION VARCHAR(255) NOT NULL,
WORDS_DISTINCT INTEGER,
WORDS_TOTAL INTEGER,
FORMAT VARCHAR(8),
FEATURES INT,
WORDS_TOTAL INTEGER NOT NULL,
FORMAT ENUM('PLAIN', 'UNKNOWN', 'HTML123', 'HTML4', 'XHTML', 'HTML5', 'MARKDOWN') NOT NULL,
FEATURES INT COMMENT "Bit-encoded feature set of document, @see HtmlFeature" NOT NULL,
DATA_HASH INTEGER NOT NULL,
FOREIGN KEY (ID) REFERENCES EC_URL(ID) ON DELETE CASCADE
)
@ -115,13 +79,9 @@ CHARACTER SET utf8mb4
COLLATE utf8mb4_unicode_ci;
CREATE TABLE EC_FEED_URL (
ID INT PRIMARY KEY AUTO_INCREMENT,
DOMAIN_ID INT NOT NULL,
PROTO VARCHAR(8) NOT NULL,
URL VARCHAR(255) NOT NULL,
PORT INT,
URL VARCHAR(255) PRIMARY KEY,
DOMAIN_ID INT,
CONSTRAINT CONS UNIQUE (DOMAIN_ID, URL),
FOREIGN KEY (DOMAIN_ID) REFERENCES EC_DOMAIN(ID) ON DELETE CASCADE
)
CHARACTER SET utf8mb4
@ -150,29 +110,23 @@ CREATE TABLE IF NOT EXISTS EC_DOMAIN_LINK (
FOREIGN KEY (DEST_DOMAIN_ID) REFERENCES EC_DOMAIN(ID) ON DELETE CASCADE
);
CREATE TABLE IF NOT EXISTS EC_DOMAIN_LINK_AGGREGATE (
DOMAIN_ID INT PRIMARY KEY NOT NULL,
LINKS INT
);
CREATE OR REPLACE VIEW EC_URL_VIEW AS
SELECT
EC_DOMAIN.URL_PART AS URL_DOMAIN,
EC_URL.URL AS URL_PATH,
EC_TOP_DOMAIN.URL_PART AS URL_TOP,
IF(PORT IS NULL,
CONCAT(EC_URL.PROTO, "://", EC_DOMAIN.DOMAIN_NAME, EC_URL.PATH),
CONCAT(EC_URL.PROTO, "://", EC_DOMAIN.DOMAIN_NAME, ":", EC_URL.PORT, EC_URL.PATH))
AS URL,
EC_URL.PATH_HASH AS PATH_HASH,
EC_URL.PATH AS PATH,
EC_DOMAIN.DOMAIN_NAME AS DOMAIN_NAME,
EC_DOMAIN.DOMAIN_TOP AS DOMAIN_TOP,
EC_URL.ID AS ID,
EC_DOMAIN.ID AS DOMAIN_ID,
EC_TOP_DOMAIN.ID AS TOP_DOMAIN_ID,
EC_URL.PROTO AS URL_PROTO,
EC_URL.PORT AS URL_PORT,
EC_URL.VISITED AS VISITED,
EC_URL.DATA_HASH AS DATA_HASH,
EC_URL.QUALITY_MEASURE AS URL_QUALITY_MEASURE,
EC_DOMAIN.QUALITY AS DOMAIN_QUALITY_MEASURE,
EC_DOMAIN.QUALITY_RAW AS QUALITY_RAW,
EC_PAGE_DATA.DATA_HASH AS DATA_HASH,
EC_PAGE_DATA.TITLE AS TITLE,
EC_PAGE_DATA.DESCRIPTION AS DESCRIPTION,
EC_URL.IP AS IP,
EC_DOMAIN.IP AS IP,
EC_DOMAIN.STATE AS STATE,
EC_PAGE_DATA.WORDS_TOTAL AS WORDS_TOTAL,
EC_PAGE_DATA.FORMAT AS FORMAT,
@ -183,59 +137,32 @@ CREATE OR REPLACE VIEW EC_URL_VIEW AS
LEFT JOIN EC_PAGE_DATA
ON EC_PAGE_DATA.ID = EC_URL.ID
INNER JOIN EC_DOMAIN
ON EC_URL.DOMAIN_ID = EC_DOMAIN.ID
INNER JOIN EC_TOP_DOMAIN
ON EC_DOMAIN.URL_TOP_DOMAIN_ID=EC_TOP_DOMAIN.ID;
CREATE OR REPLACE VIEW EC_DISCOVER_TASKS_VIEW AS
SELECT
ID,
URL_PART
FROM EC_DOMAIN
WHERE
DOMAIN_ALIAS IS NULL
AND INDEXED = 0
ORDER BY QUALITY DESC, ID ASC;
ON EC_URL.DOMAIN_ID = EC_DOMAIN.ID;
CREATE OR REPLACE VIEW EC_RELATED_LINKS_VIEW AS
SELECT
SOURCE_DOMAIN_ID,
SOURCE_DOMAIN.URL_PART AS SOURCE_URL,
SOURCE_TOP_DOMAIN.URL_PART AS SOURCE_TOP_URL,
SOURCE_DOMAIN.DOMAIN_NAME AS SOURCE_DOMAIN,
SOURCE_DOMAIN.DOMAIN_TOP AS SOURCE_TOP_DOMAIN,
DEST_DOMAIN_ID,
DEST_DOMAIN.URL_PART AS DEST_URL,
DEST_TOP_DOMAIN.URL_PART AS DEST_TOP_URL
DEST_DOMAIN.DOMAIN_NAME AS DEST_DOMAIN,
DEST_DOMAIN.DOMAIN_TOP AS DEST_TOP_DOMAIN
FROM EC_DOMAIN_LINK
INNER JOIN EC_DOMAIN AS SOURCE_DOMAIN
ON SOURCE_DOMAIN.ID=SOURCE_DOMAIN_ID
INNER JOIN EC_TOP_DOMAIN AS SOURCE_TOP_DOMAIN
ON SOURCE_TOP_DOMAIN.ID=SOURCE_DOMAIN.URL_TOP_DOMAIN_ID
INNER JOIN EC_DOMAIN AS DEST_DOMAIN
ON DEST_DOMAIN.ID=DEST_DOMAIN_ID
INNER JOIN EC_TOP_DOMAIN AS DEST_TOP_DOMAIN
ON DEST_TOP_DOMAIN.ID=DEST_DOMAIN.URL_TOP_DOMAIN_ID
;
CREATE OR REPLACE VIEW EC_RELATED_LINKS_IN AS
SELECT
IN_URL.ID AS SRC_URL_ID,
IN_URL.QUALITY_MEASURE AS SRC_URL_QUALITY,
OUT_URL.ID AS DEST_URL_ID,
OUT_URL.QUALITY_MEASURE AS DEST_URL_QUALITY
FROM EC_URL AS IN_URL
INNER JOIN EC_DOMAIN_LINK
ON IN_URL.DOMAIN_ID=EC_DOMAIN_LINK.SOURCE_DOMAIN_ID
INNER JOIN EC_URL AS OUT_URL
ON OUT_URL.DOMAIN_ID=EC_DOMAIN_LINK.DEST_DOMAIN_ID
WHERE IN_URL.VISITED=TRUE
AND IN_URL.DATA_HASH IS NOT NULL
AND OUT_URL.VISITED=TRUE
AND OUT_URL.DATA_HASH IS NOT NULL;
CREATE TABLE IF NOT EXISTS EC_DOMAIN_BACKLINKS (
ID INT PRIMARY KEY,
LINKEDNESS INT
);
OUT_URL.ID AS DEST_URL_ID
FROM EC_DOMAIN_LINK
INNER JOIN EC_URL AS IN_URL ON IN_URL.DOMAIN_ID=EC_DOMAIN_LINK.SOURCE_DOMAIN_ID
INNER JOIN EC_URL AS OUT_URL ON OUT_URL.DOMAIN_ID=EC_DOMAIN_LINK.DEST_DOMAIN_ID
WHERE IN_URL.VISITED AND IN_URL.STATE = 'ok'
AND OUT_URL.VISITED AND OUT_URL.STATE = 'ok';
CREATE TABLE IF NOT EXISTS EC_API_KEY (
LICENSE_KEY VARCHAR(255) UNIQUE,
@ -245,16 +172,9 @@ CREATE TABLE IF NOT EXISTS EC_API_KEY (
RATE INT DEFAULT 10
);
CREATE INDEX IF NOT EXISTS EC_DOMAIN_RANK_INDEX ON EC_DOMAIN (RANK);
CREATE INDEX IF NOT EXISTS EC_DOMAIN_QUALITY_INDEX ON EC_DOMAIN (QUALITY,STATE);
CREATE INDEX IF NOT EXISTS EC_DOMAIN_INDEXED_INDEX ON EC_DOMAIN (INDEXED);
CREATE INDEX IF NOT EXISTS EC_DOMAIN_ID_INDEXED_INDEX ON EC_DOMAIN (ID, INDEXED);
CREATE INDEX IF NOT EXISTS EC_DOMAIN_TRIO ON EC_DOMAIN (STATE, DOMAIN_ALIAS, INDEXED, QUALITY);
CREATE INDEX IF NOT EXISTS EC_URL_VISITED ON EC_URL (VISITED);
CREATE INDEX IF NOT EXISTS EC_URL_VISITED_STATE ON EC_URL (VISITED, STATE);
CREATE INDEX IF NOT EXISTS EC_URL_IP ON EC_URL (IP);
CREATE INDEX IF NOT EXISTS EC_DOMAIN_TOP_DOMAIN ON EC_DOMAIN (DOMAIN_TOP);
---;

View File

@ -43,7 +43,7 @@ public class TestUtil {
logger.info("Running script {}", scriptFile);
try (var scriptStream = ClassLoader.getSystemResourceAsStream(scriptFile);
var stmt = conn.createStatement()) {
for (String s : new String(scriptStream.readAllBytes()).split(";")) {
for (String s : new String(scriptStream.readAllBytes()).split("(;|---)")) {
if (!s.isBlank()) {
try {
Assertions.assertTrue(stmt.executeUpdate(s) >= 0);

View File

@ -0,0 +1,51 @@
package nu.marginalia.wmsa.edge.converting.loader;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.util.TestUtil;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.parallel.Execution;
import org.junit.jupiter.api.parallel.ExecutionMode;
import org.junit.jupiter.api.parallel.ResourceAccessMode;
import org.junit.jupiter.api.parallel.ResourceLock;
import java.net.URISyntaxException;
import static org.junit.jupiter.api.Assertions.assertTrue;
/**
 * Database-backed smoke test for {@link SqlLoadDomainLinks}.
 *
 * Each test starts from the schema script sql/edge-crawler-cache.sql with two
 * domains loaded; the test itself passes as long as loading a link between
 * them does not throw.
 */
@ResourceLock(value = "mariadb", mode = ResourceAccessMode.READ_WRITE)
@Execution(ExecutionMode.SAME_THREAD)
@Tag("db")
class SqlLoadDomainLinksTest {
    private static final String WWW_DOMAIN = "www.marginalia.nu";
    private static final String MEMEX_DOMAIN = "memex.marginalia.nu";

    HikariDataSource dataSource;
    LoaderData loaderData;

    /** Re-creates the schema and loads both domains so links can refer to them. */
    @BeforeEach
    public void setUp() {
        dataSource = TestUtil.getConnection();
        TestUtil.evalScript(dataSource, "sql/edge-crawler-cache.sql");

        var domainLoader = new SqlLoadDomains(dataSource);
        loaderData = new LoaderData(10);
        loaderData.setTargetDomain(new EdgeDomain(WWW_DOMAIN));

        EdgeDomain[] knownDomains = { new EdgeDomain(WWW_DOMAIN), new EdgeDomain(MEMEX_DOMAIN) };
        domainLoader.load(loaderData, knownDomains);
    }

    @AfterEach
    public void tearDown() {
        dataSource.close();
    }

    /** Loads a single link from the www domain to the memex domain. */
    @Test
    public void loadDomainLinks() throws URISyntaxException {
        var linkLoader = new SqlLoadDomainLinks(dataSource);
        var link = new DomainLink(new EdgeDomain(WWW_DOMAIN), new EdgeDomain(MEMEX_DOMAIN));

        linkLoader.load(new DomainLink[] { link });
    }
}

View File

@ -0,0 +1,54 @@
package nu.marginalia.wmsa.edge.converting.loader;
import nu.marginalia.util.TestUtil;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.parallel.Execution;
import org.junit.jupiter.api.parallel.ExecutionMode;
import org.junit.jupiter.api.parallel.ResourceAccessMode;
import org.junit.jupiter.api.parallel.ResourceLock;
import static org.junit.jupiter.api.Assertions.*;
/**
 * Database-backed tests for {@link SqlLoadDomains}: after loading, every
 * loaded domain must resolve to a non-negative id in the {@link LoaderData}.
 */
@ResourceLock(value = "mariadb", mode = ResourceAccessMode.READ_WRITE)
@Execution(ExecutionMode.SAME_THREAD)
@Tag("db")
class SqlLoadDomainsTest {
    private static final String WWW_DOMAIN = "www.marginalia.nu";
    private static final String MEMEX_DOMAIN = "memex.marginalia.nu";

    /** Loads a single domain through the single-domain overload. */
    @Test
    public void loadDomain() {
        try (var ds = TestUtil.getConnection()) {
            TestUtil.evalScript(ds, "sql/edge-crawler-cache.sql");

            var domainLoader = new SqlLoadDomains(ds);
            var data = new LoaderData(10);
            data.setTargetDomain(new EdgeDomain(WWW_DOMAIN));

            domainLoader.load(data, new EdgeDomain(WWW_DOMAIN));

            assertTrue(data.getDomainId(new EdgeDomain(WWW_DOMAIN)) >= 0);
        }
    }

    /** Loads two domains through the array overload and checks both ids. */
    @Test
    public void loadDomains() {
        try (var ds = TestUtil.getConnection()) {
            TestUtil.evalScript(ds, "sql/edge-crawler-cache.sql");

            var domainLoader = new SqlLoadDomains(ds);
            var data = new LoaderData(10);
            data.setTargetDomain(new EdgeDomain(WWW_DOMAIN));

            EdgeDomain[] batch = { new EdgeDomain(WWW_DOMAIN), new EdgeDomain(MEMEX_DOMAIN) };
            domainLoader.load(data, batch);

            assertTrue(data.getDomainId(new EdgeDomain(WWW_DOMAIN)) >= 0);
            assertTrue(data.getDomainId(new EdgeDomain(MEMEX_DOMAIN)) >= 0);
        }
    }
}

View File

@ -0,0 +1,68 @@
package nu.marginalia.wmsa.edge.converting.loader;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.util.TestUtil;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument;
import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.parallel.Execution;
import org.junit.jupiter.api.parallel.ExecutionMode;
import org.junit.jupiter.api.parallel.ResourceAccessMode;
import org.junit.jupiter.api.parallel.ResourceLock;
import java.net.URISyntaxException;
import java.util.List;
import java.util.Set;
/**
 * Database-backed smoke test for {@link SqlLoadProcessedDocument}.
 *
 * Each test starts from the schema script sql/edge-crawler-cache.sql with one
 * domain and one URL loaded; the test passes as long as loading a processed
 * document for that URL does not throw.
 */
@ResourceLock(value = "mariadb", mode = ResourceAccessMode.READ_WRITE)
@Execution(ExecutionMode.SAME_THREAD)
@Tag("db")
class SqlLoadProcessedDocumentTest {
    private static final String WWW_DOMAIN = "www.marginalia.nu";
    private static final String ROOT_URL = "https://www.marginalia.nu/";

    HikariDataSource dataSource;
    LoaderData loaderData;

    /** Re-creates the schema, then loads the domain and the URL the document refers to. */
    @BeforeEach
    public void setUp() throws URISyntaxException {
        dataSource = TestUtil.getConnection();
        TestUtil.evalScript(dataSource, "sql/edge-crawler-cache.sql");

        var domainLoader = new SqlLoadDomains(dataSource);
        var urlLoader = new SqlLoadUrls(dataSource);

        loaderData = new LoaderData(10);
        loaderData.setTargetDomain(new EdgeDomain(WWW_DOMAIN));

        domainLoader.load(loaderData, new EdgeDomain(WWW_DOMAIN));
        urlLoader.load(loaderData, new EdgeUrl[]{ new EdgeUrl(ROOT_URL) });
    }

    @AfterEach
    public void tearDown() {
        dataSource.close();
    }

    /** Loads one processed document for the URL prepared in setUp. */
    @Test
    public void loadProcessedDocument() throws URISyntaxException {
        var documentLoader = new SqlLoadProcessedDocument(dataSource);

        var document = new LoadProcessedDocument(
                new EdgeUrl(ROOT_URL),
                EdgeUrlState.OK,
                "TITLE",
                "DESCR",
                HtmlFeature.encode(Set.of(HtmlFeature.AFFILIATE_LINK)),
                EdgeHtmlStandard.HTML5,
                100,
                12345,
                -5
        );

        documentLoader.load(loaderData, List.of(document));
    }
}

View File

@ -0,0 +1,52 @@
package nu.marginalia.wmsa.edge.converting.loader;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.util.TestUtil;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.parallel.Execution;
import org.junit.jupiter.api.parallel.ExecutionMode;
import org.junit.jupiter.api.parallel.ResourceAccessMode;
import org.junit.jupiter.api.parallel.ResourceLock;
import java.net.URISyntaxException;
/**
 * Database-backed smoke tests for {@link SqlLoadProcessedDomain}.
 *
 * Each test starts from the schema script sql/edge-crawler-cache.sql with two
 * domains loaded; the tests pass as long as the loader calls do not throw.
 */
@ResourceLock(value = "mariadb", mode = ResourceAccessMode.READ_WRITE)
@Execution(ExecutionMode.SAME_THREAD)
@Tag("db")
class SqlLoadProcessedDomainTest {
    private static final String WWW_DOMAIN = "www.marginalia.nu";
    private static final String MEMEX_DOMAIN = "memex.marginalia.nu";

    HikariDataSource dataSource;
    LoaderData loaderData;

    /** Re-creates the schema and loads both domains used by the tests. */
    @BeforeEach
    public void setUp() {
        dataSource = TestUtil.getConnection();
        TestUtil.evalScript(dataSource, "sql/edge-crawler-cache.sql");

        var domainLoader = new SqlLoadDomains(dataSource);
        loaderData = new LoaderData(10);
        loaderData.setTargetDomain(new EdgeDomain(WWW_DOMAIN));

        EdgeDomain[] knownDomains = { new EdgeDomain(WWW_DOMAIN), new EdgeDomain(MEMEX_DOMAIN) };
        domainLoader.load(loaderData, knownDomains);
    }

    @AfterEach
    public void tearDown() {
        dataSource.close();
    }

    /** Loads the www domain with state BLOCKED and a loopback IP string. */
    @Test
    public void loadProcessedDomain() {
        var domainStateLoader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource));

        domainStateLoader.load(loaderData, new EdgeDomain(WWW_DOMAIN), EdgeDomainIndexingState.BLOCKED, "127.0.0.1");
    }

    /** Loads an alias link from the memex domain to the www domain. */
    @Test
    public void loadDomainAlias() {
        var domainStateLoader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource));
        var aliasLink = new DomainLink(new EdgeDomain(MEMEX_DOMAIN), new EdgeDomain(WWW_DOMAIN));

        domainStateLoader.loadAlias(loaderData, aliasLink);
    }
}

View File

@ -0,0 +1,49 @@
package nu.marginalia.wmsa.edge.converting.loader;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.util.TestUtil;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.parallel.Execution;
import org.junit.jupiter.api.parallel.ExecutionMode;
import org.junit.jupiter.api.parallel.ResourceAccessMode;
import org.junit.jupiter.api.parallel.ResourceLock;
import java.net.URISyntaxException;
import static org.junit.jupiter.api.Assertions.assertTrue;
/**
 * Database-backed smoke test for {@link SqlLoadUrls}.
 *
 * Each test starts from the schema script sql/edge-crawler-cache.sql with one
 * domain loaded; the test passes as long as loading a URL on that domain does
 * not throw.
 */
@ResourceLock(value = "mariadb", mode = ResourceAccessMode.READ_WRITE)
@Execution(ExecutionMode.SAME_THREAD)
@Tag("db")
class SqlLoadUrlsTest {
    private static final String WWW_DOMAIN = "www.marginalia.nu";

    HikariDataSource dataSource;
    LoaderData loaderData;

    /** Re-creates the schema and loads the domain the URL belongs to. */
    @BeforeEach
    public void setUp() {
        dataSource = TestUtil.getConnection();
        TestUtil.evalScript(dataSource, "sql/edge-crawler-cache.sql");

        var domainLoader = new SqlLoadDomains(dataSource);
        loaderData = new LoaderData(10);
        loaderData.setTargetDomain(new EdgeDomain(WWW_DOMAIN));

        domainLoader.load(loaderData, new EdgeDomain(WWW_DOMAIN));
    }

    @AfterEach
    public void tearDown() {
        dataSource.close();
    }

    /** Loads the root URL of the www domain. */
    @Test
    public void loadUrl() throws URISyntaxException {
        var urlLoader = new SqlLoadUrls(dataSource);
        EdgeUrl[] urls = { new EdgeUrl("https://www.marginalia.nu/") };

        urlLoader.load(loaderData, urls);
    }
}