mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
WIP: Database refactoring
This commit is contained in:
parent
0e65384781
commit
c915664fcc
@ -1,49 +0,0 @@
|
|||||||
package nu.marginalia.util.ranking;
|
|
||||||
|
|
||||||
import com.zaxxer.hikari.HikariDataSource;
|
|
||||||
import gnu.trove.list.TIntList;
|
|
||||||
import gnu.trove.list.array.TIntArrayList;
|
|
||||||
import gnu.trove.map.hash.TIntIntHashMap;
|
|
||||||
import it.unimi.dsi.fastutil.ints.IntArrays;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.sql.SQLException;
|
|
||||||
|
|
||||||
public class AcademiaRank {
|
|
||||||
private final TIntArrayList result;
|
|
||||||
private static final Logger logger = LoggerFactory.getLogger(AcademiaRank.class);
|
|
||||||
|
|
||||||
public AcademiaRank(HikariDataSource ds, String... origins) throws IOException {
|
|
||||||
|
|
||||||
TIntList rankingResults = new BetterStandardPageRank(ds, origins).pageRank(100_000);
|
|
||||||
TIntIntHashMap idToRanking = new TIntIntHashMap(100_000, 0.5f, -1, 1_000_000_000);
|
|
||||||
|
|
||||||
for (int i = 0; i < rankingResults.size(); i++) {
|
|
||||||
idToRanking.put(rankingResults.get(i), i);
|
|
||||||
}
|
|
||||||
|
|
||||||
result = new TIntArrayList(10000);
|
|
||||||
try (var conn = ds.getConnection();
|
|
||||||
var stmt = conn.prepareStatement("select EC_DOMAIN.ID,COUNT(SOURCE_DOMAIN_ID) AS CNT from EC_DOMAIN INNER JOIN DOMAIN_METADATA ON DOMAIN_METADATA.ID=EC_DOMAIN.ID INNER JOIN EC_DOMAIN_LINK ON EC_DOMAIN_LINK.DEST_DOMAIN_ID=EC_DOMAIN.ID WHERE INDEXED>0 AND STATE>=0 AND STATE<2 AND ((VISITED_URLS>1000+1500*RANK AND RANK<1) OR (GOOD_URLS>1000 AND URL_PART LIKE '%edu')) GROUP BY EC_DOMAIN.ID HAVING CNT<1500 ORDER BY RANK ASC")) {
|
|
||||||
|
|
||||||
stmt.setFetchSize(1000);
|
|
||||||
var rsp = stmt.executeQuery();
|
|
||||||
while (rsp.next()) {
|
|
||||||
result.add(rsp.getInt(1));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
catch (SQLException ex) {
|
|
||||||
logger.error("SQL error", ex);
|
|
||||||
}
|
|
||||||
|
|
||||||
int[] internalArray = result.toArray();
|
|
||||||
IntArrays.quickSort(internalArray, (a,b) -> idToRanking.get(a) - idToRanking.get(b));
|
|
||||||
result.set(0, internalArray);
|
|
||||||
}
|
|
||||||
|
|
||||||
public TIntArrayList getResult() {
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
}
|
|
@ -72,10 +72,10 @@ public abstract class RankingAlgorithm {
|
|||||||
|
|
||||||
String s;
|
String s;
|
||||||
if (getNames) {
|
if (getNames) {
|
||||||
s = "SELECT EC_DOMAIN.ID,URL_PART,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
|
s = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
|
s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
|
||||||
}
|
}
|
||||||
try (var stmt = conn.prepareStatement(s)) {
|
try (var stmt = conn.prepareStatement(s)) {
|
||||||
stmt.setFetchSize(10000);
|
stmt.setFetchSize(10000);
|
||||||
@ -84,7 +84,7 @@ public abstract class RankingAlgorithm {
|
|||||||
int id = rsp.getInt(1);
|
int id = rsp.getInt(1);
|
||||||
if (!spamDomains.contains(id)) {
|
if (!spamDomains.contains(id)) {
|
||||||
|
|
||||||
domainsById.put(id, new DomainData(id, rsp.getString(2), rsp.getInt(3), rsp.getInt(4), rsp.getInt(5), false));
|
domainsById.put(id, new DomainData(id, rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5), false));
|
||||||
|
|
||||||
domainIndexToId.put(domainIndexToId.size(), id);
|
domainIndexToId.put(domainIndexToId.size(), id);
|
||||||
domainIdToIndex.put(id, domainIdToIndex.size());
|
domainIdToIndex.put(id, domainIdToIndex.size());
|
||||||
@ -125,7 +125,7 @@ public abstract class RankingAlgorithm {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART LIKE ?")) {
|
try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME LIKE ?")) {
|
||||||
for (var seed : this.originDomains) {
|
for (var seed : this.originDomains) {
|
||||||
stmt.setString(1, seed);
|
stmt.setString(1, seed);
|
||||||
var rsp = stmt.executeQuery();
|
var rsp = stmt.executeQuery();
|
||||||
@ -159,10 +159,10 @@ public abstract class RankingAlgorithm {
|
|||||||
try (var conn = dataSource.getConnection()) {
|
try (var conn = dataSource.getConnection()) {
|
||||||
String s;
|
String s;
|
||||||
if (getNames) {
|
if (getNames) {
|
||||||
s = "SELECT EC_DOMAIN.ID,URL_PART,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
|
s = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
|
s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
|
||||||
}
|
}
|
||||||
try (var stmt = conn.prepareStatement(s)) {
|
try (var stmt = conn.prepareStatement(s)) {
|
||||||
stmt.setFetchSize(10000);
|
stmt.setFetchSize(10000);
|
||||||
@ -172,7 +172,7 @@ public abstract class RankingAlgorithm {
|
|||||||
int id = rsp.getInt(1);
|
int id = rsp.getInt(1);
|
||||||
|
|
||||||
if (!spamDomains.contains(id)) {
|
if (!spamDomains.contains(id)) {
|
||||||
domainsById.put(id, new DomainData(id, rsp.getString(2), rsp.getInt(3), rsp.getInt(4), rsp.getInt(5), true));
|
domainsById.put(id, new DomainData(id, rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5), true));
|
||||||
|
|
||||||
domainIndexToId.put(domainIndexToId.size(), id);
|
domainIndexToId.put(domainIndexToId.size(), id);
|
||||||
domainIdToIndex.put(id, domainIdToIndex.size());
|
domainIdToIndex.put(id, domainIdToIndex.size());
|
||||||
@ -451,7 +451,7 @@ public abstract class RankingAlgorithm {
|
|||||||
public final int id;
|
public final int id;
|
||||||
public final String name;
|
public final String name;
|
||||||
private int alias;
|
private int alias;
|
||||||
private int state;
|
private EdgeDomainIndexingState state;
|
||||||
public final int knownUrls;
|
public final int knownUrls;
|
||||||
public boolean peripheral;
|
public boolean peripheral;
|
||||||
|
|
||||||
@ -465,11 +465,11 @@ public abstract class RankingAlgorithm {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public boolean isSpecial() {
|
public boolean isSpecial() {
|
||||||
return EdgeDomainIndexingState.SPECIAL.code == state;
|
return EdgeDomainIndexingState.SPECIAL == state;
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean isSocialMedia() {
|
public boolean isSocialMedia() {
|
||||||
return EdgeDomainIndexingState.SOCIAL_MEDIA.code == state;
|
return EdgeDomainIndexingState.SOCIAL_MEDIA == state;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -66,7 +66,7 @@ public class OldReversePageRankV2 {
|
|||||||
originDomains.add("memex.marginalia.nu");
|
originDomains.add("memex.marginalia.nu");
|
||||||
|
|
||||||
try (var conn = dataSource.getConnection()) {
|
try (var conn = dataSource.getConnection()) {
|
||||||
try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE FROM EC_DOMAIN WHERE INDEXED>1 AND STATE>=0 AND QUALITY_RAW>=-10")) {
|
try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE FROM EC_DOMAIN WHERE INDEXED>1 AND IS_ALIVE")) {
|
||||||
stmt.setFetchSize(10000);
|
stmt.setFetchSize(10000);
|
||||||
var rsp = stmt.executeQuery();
|
var rsp = stmt.executeQuery();
|
||||||
while (rsp.next()) {
|
while (rsp.next()) {
|
||||||
@ -90,7 +90,7 @@ public class OldReversePageRankV2 {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART=?")) {
|
try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
|
||||||
stmt.setFetchSize(10000);
|
stmt.setFetchSize(10000);
|
||||||
|
|
||||||
for (var seed : this.originDomains) {
|
for (var seed : this.originDomains) {
|
||||||
|
@ -48,7 +48,7 @@ public class StandardPageRank {
|
|||||||
originDomains.addAll(Arrays.asList(origins));
|
originDomains.addAll(Arrays.asList(origins));
|
||||||
|
|
||||||
try (var conn = dataSource.getConnection()) {
|
try (var conn = dataSource.getConnection()) {
|
||||||
try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE,URL_PART FROM EC_DOMAIN WHERE INDEXED>1 AND STATE>=0 AND QUALITY>=-10")) {
|
try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE,DOMAIN_NAME FROM EC_DOMAIN WHERE INDEXED>1 AND IS_ALIVE AND QUALITY>=-10")) {
|
||||||
stmt.setFetchSize(10000);
|
stmt.setFetchSize(10000);
|
||||||
var rsp = stmt.executeQuery();
|
var rsp = stmt.executeQuery();
|
||||||
while (rsp.next()) {
|
while (rsp.next()) {
|
||||||
@ -78,7 +78,7 @@ public class StandardPageRank {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART=?")) {
|
try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
|
||||||
for (var seed : this.originDomains) {
|
for (var seed : this.originDomains) {
|
||||||
stmt.setString(1, seed);
|
stmt.setString(1, seed);
|
||||||
var rsp = stmt.executeQuery();
|
var rsp = stmt.executeQuery();
|
||||||
|
@ -50,7 +50,7 @@ public class DedupTool {
|
|||||||
Map<Integer, Map<Integer, List<Data>>> domainToHashToUrl = new HashMap<>();
|
Map<Integer, Map<Integer, List<Data>>> domainToHashToUrl = new HashMap<>();
|
||||||
|
|
||||||
try (var conn = ds.getConnection();
|
try (var conn = ds.getConnection();
|
||||||
var fetchStmt = conn.prepareStatement("SELECT URL_TOP_DOMAIN_ID,DATA_HASH,URL,EC_URL.ID,EC_DOMAIN.URL_PART FROM EC_URL INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE DATA_HASH IS NOT NULL");
|
var fetchStmt = conn.prepareStatement("SELECT URL_TOP_DOMAIN_ID,DATA_HASH,URL,EC_URL.ID,EC_DOMAIN.DOMAIN_NAME FROM EC_URL INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE DATA_HASH IS NOT NULL");
|
||||||
var updateStmt = conn.prepareStatement("UPDATE EC_URL SET STATE='redirect' WHERE ID=?")
|
var updateStmt = conn.prepareStatement("UPDATE EC_URL SET STATE='redirect' WHERE ID=?")
|
||||||
|
|
||||||
) {
|
) {
|
||||||
|
@ -112,10 +112,10 @@ public class PerusePageRankV2 {
|
|||||||
try (var conn = dataSource.getConnection()) {
|
try (var conn = dataSource.getConnection()) {
|
||||||
String s;
|
String s;
|
||||||
if (getNames) {
|
if (getNames) {
|
||||||
s = "SELECT EC_DOMAIN.ID,URL_PART,DOMAIN_ALIAS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 GROUP BY EC_DOMAIN.ID";
|
s = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) GROUP BY EC_DOMAIN.ID";
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS FROM EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 GROUP BY EC_DOMAIN.ID";
|
s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS FROM EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) GROUP BY EC_DOMAIN.ID";
|
||||||
}
|
}
|
||||||
try (var stmt = conn.prepareStatement(s)) {
|
try (var stmt = conn.prepareStatement(s)) {
|
||||||
stmt.setFetchSize(10000);
|
stmt.setFetchSize(10000);
|
||||||
|
@ -1,30 +0,0 @@
|
|||||||
package nu.marginalia.util.ranking.tool;
|
|
||||||
|
|
||||||
import lombok.SneakyThrows;
|
|
||||||
import nu.marginalia.util.ranking.AcademiaRank;
|
|
||||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
|
||||||
import org.mariadb.jdbc.Driver;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
public class TestAcademiaRankTool {
|
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
public static void main(String... args) {
|
|
||||||
Driver driver = new Driver();
|
|
||||||
var conn = new DatabaseModule().provideConnection();
|
|
||||||
|
|
||||||
var rank = new AcademiaRank(new DatabaseModule().provideConnection(), "www.perseus.tufts.edu", "xroads.virginia.edu");
|
|
||||||
var res = rank.getResult();
|
|
||||||
|
|
||||||
try (var c = conn.getConnection(); var stmt = c.prepareStatement("SELECT URL_PART FROM EC_DOMAIN WHERE ID=?")) {
|
|
||||||
for (int i = 0; i < Math.min(res.size(), 100); i++) {
|
|
||||||
stmt.setInt(1, res.getQuick(i));
|
|
||||||
var rsp = stmt.executeQuery();
|
|
||||||
while (rsp.next())
|
|
||||||
System.out.println(rsp.getString(1));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -83,11 +83,6 @@ public class UpdateDomainRanksTool {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.info("Recalculating quality");
|
|
||||||
try (var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET QUALITY=-5*RANK+IF(RANK=1,RANK*GREATEST(QUALITY_RAW,QUALITY_ORIGINAL)/2, 0)")) {
|
|
||||||
stmt.executeUpdate();
|
|
||||||
}
|
|
||||||
|
|
||||||
} catch (SQLException | InterruptedException throwables) {
|
} catch (SQLException | InterruptedException throwables) {
|
||||||
throwables.printStackTrace();
|
throwables.printStackTrace();
|
||||||
}
|
}
|
||||||
|
@ -94,9 +94,6 @@ public class UpdateDomainRanksTool2 {
|
|||||||
}
|
}
|
||||||
|
|
||||||
logger.info("Recalculating quality");
|
logger.info("Recalculating quality");
|
||||||
try (var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET QUALITY=-5*RANK+IF(RANK=1,RANK*GREATEST(QUALITY_RAW,QUALITY_ORIGINAL)/2, 0)")) {
|
|
||||||
stmt.executeUpdate();
|
|
||||||
}
|
|
||||||
|
|
||||||
} catch (SQLException | InterruptedException throwables) {
|
} catch (SQLException | InterruptedException throwables) {
|
||||||
throwables.printStackTrace();
|
throwables.printStackTrace();
|
||||||
|
@ -29,7 +29,7 @@ public class ReindexTriggerMain {
|
|||||||
.build();
|
.build();
|
||||||
|
|
||||||
try (var ds = db.provideConnection(); var conn = ds.getConnection(); var stmt = conn.createStatement()) {
|
try (var ds = db.provideConnection(); var conn = ds.getConnection(); var stmt = conn.createStatement()) {
|
||||||
var rs = stmt.executeQuery("SELECT ID, URL_PART, STATE, INDEXED FROM EC_DOMAIN LIMIT 100");
|
var rs = stmt.executeQuery("SELECT ID, DOMAIN_NAME, STATE, INDEXED FROM EC_DOMAIN LIMIT 100");
|
||||||
while (rs.next()) {
|
while (rs.next()) {
|
||||||
System.out.printf("%d %s %s %d\n",
|
System.out.printf("%d %s %s %d\n",
|
||||||
rs.getInt(1),
|
rs.getInt(1),
|
||||||
@ -38,7 +38,7 @@ public class ReindexTriggerMain {
|
|||||||
rs.getInt(4));
|
rs.getInt(4));
|
||||||
}
|
}
|
||||||
|
|
||||||
rs = stmt.executeQuery("SELECT ID, DOMAIN_ID, URL, VISITED, STATE FROM EC_URL LIMIT 100");
|
rs = stmt.executeQuery("SELECT ID, DOMAIN_ID, PATH, VISITED, STATE FROM EC_URL LIMIT 100");
|
||||||
while (rs.next()) {
|
while (rs.next()) {
|
||||||
System.out.printf("%d %d %s %d %s\n",
|
System.out.printf("%d %d %s %d %s\n",
|
||||||
rs.getInt(1),
|
rs.getInt(1),
|
||||||
|
@ -14,7 +14,7 @@ public interface Interpreter {
|
|||||||
void loadRssFeed(EdgeUrl[] rssFeed);
|
void loadRssFeed(EdgeUrl[] rssFeed);
|
||||||
void loadDomainLink(DomainLink[] links);
|
void loadDomainLink(DomainLink[] links);
|
||||||
|
|
||||||
void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, double quality);
|
void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip);
|
||||||
void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument);
|
void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument);
|
||||||
void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError);
|
void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError);
|
||||||
|
|
||||||
|
@ -6,11 +6,11 @@ import nu.marginalia.wmsa.edge.converting.interpreter.Interpreter;
|
|||||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
||||||
|
|
||||||
public record LoadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, double quality) implements Instruction {
|
public record LoadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) implements Instruction {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void apply(Interpreter interpreter) {
|
public void apply(Interpreter interpreter) {
|
||||||
interpreter.loadProcessedDomain(domain, state, quality);
|
interpreter.loadProcessedDomain(domain, state, ip);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -76,9 +76,9 @@ public class Loader implements Interpreter {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, double quality) {
|
public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) {
|
||||||
logger.debug("loadProcessedDomain({}, {}, {})", domain, state, quality);
|
logger.debug("loadProcessedDomain({}, {}, {})", domain, state, ip);
|
||||||
sqlLoadProcessedDomain.load(data, domain, state, quality);
|
sqlLoadProcessedDomain.load(data, domain, state, ip);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -30,7 +30,7 @@ public class SqlLoadDomainLinks {
|
|||||||
INSERT IGNORE INTO EC_DOMAIN_LINK (SOURCE_DOMAIN_ID, DEST_DOMAIN_ID)
|
INSERT IGNORE INTO EC_DOMAIN_LINK (SOURCE_DOMAIN_ID, DEST_DOMAIN_ID)
|
||||||
SELECT SOURCE.ID,DEST.ID
|
SELECT SOURCE.ID,DEST.ID
|
||||||
FROM EC_DOMAIN SOURCE INNER JOIN EC_DOMAIN DEST
|
FROM EC_DOMAIN SOURCE INNER JOIN EC_DOMAIN DEST
|
||||||
ON SOURCE.URL_PART=FROM_DOMAIN AND DEST.URL_PART=TO_DOMAIN;
|
ON SOURCE.DOMAIN_NAME=FROM_DOMAIN AND DEST.DOMAIN_NAME=TO_DOMAIN;
|
||||||
END
|
END
|
||||||
""");
|
""");
|
||||||
}
|
}
|
||||||
@ -61,8 +61,8 @@ public class SqlLoadDomainLinks {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
catch (SQLException sql) {
|
catch (SQLException ex) {
|
||||||
sql.printStackTrace();
|
logger.warn("SQL error inserting domain links", ex);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -25,15 +25,9 @@ public class SqlLoadDomains {
|
|||||||
stmt.execute("""
|
stmt.execute("""
|
||||||
CREATE PROCEDURE INSERT_DOMAIN (
|
CREATE PROCEDURE INSERT_DOMAIN (
|
||||||
IN DOMAIN_NAME VARCHAR(255),
|
IN DOMAIN_NAME VARCHAR(255),
|
||||||
IN SUB_DOMAIN VARCHAR(255),
|
|
||||||
IN TOP_DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci)
|
IN TOP_DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci)
|
||||||
BEGIN
|
BEGIN
|
||||||
INSERT IGNORE INTO EC_TOP_DOMAIN (URL_PART) VALUES (TOP_DOMAIN);
|
INSERT IGNORE INTO EC_DOMAIN(DOMAIN_NAME, DOMAIN_TOP) VALUES (DOMAIN_NAME, TOP_DOMAIN);
|
||||||
|
|
||||||
INSERT IGNORE INTO EC_DOMAIN(URL_PART, URL_SUBDOMAIN, URL_TOP_DOMAIN_ID)
|
|
||||||
SELECT DOMAIN_NAME,SUB_DOMAIN,ID
|
|
||||||
FROM EC_TOP_DOMAIN
|
|
||||||
WHERE EC_TOP_DOMAIN.URL_PART=TOP_DOMAIN;
|
|
||||||
END
|
END
|
||||||
""");
|
""");
|
||||||
}
|
}
|
||||||
@ -46,10 +40,9 @@ public class SqlLoadDomains {
|
|||||||
public void load(LoaderData data, EdgeDomain domain) {
|
public void load(LoaderData data, EdgeDomain domain) {
|
||||||
|
|
||||||
try (var connection = dataSource.getConnection()) {
|
try (var connection = dataSource.getConnection()) {
|
||||||
try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?,?)")) {
|
try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?)")) {
|
||||||
insertCall.setString(1, domain.toString());
|
insertCall.setString(1, domain.toString());
|
||||||
insertCall.setString(2, domain.subDomain);
|
insertCall.setString(2, domain.domain);
|
||||||
insertCall.setString(3, domain.domain);
|
|
||||||
insertCall.addBatch();
|
insertCall.addBatch();
|
||||||
|
|
||||||
var ret = insertCall.executeUpdate();
|
var ret = insertCall.executeUpdate();
|
||||||
@ -57,12 +50,11 @@ public class SqlLoadDomains {
|
|||||||
logger.warn("load({}) -- bad row count {}", domain, ret);
|
logger.warn("load({}) -- bad row count {}", domain, ret);
|
||||||
}
|
}
|
||||||
|
|
||||||
connection.commit();
|
|
||||||
findIdForTargetDomain(connection, data);
|
findIdForTargetDomain(connection, data);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
catch (SQLException ex) {
|
catch (SQLException ex) {
|
||||||
ex.printStackTrace();
|
logger.warn("SQL error inserting domain", ex);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -73,12 +65,11 @@ public class SqlLoadDomains {
|
|||||||
try (var connection = dataSource.getConnection()) {
|
try (var connection = dataSource.getConnection()) {
|
||||||
connection.setAutoCommit(false);
|
connection.setAutoCommit(false);
|
||||||
|
|
||||||
try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?,?)")) {
|
try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?)")) {
|
||||||
|
|
||||||
for (var domain : domains) {
|
for (var domain : domains) {
|
||||||
insertCall.setString(1, domain.toString());
|
insertCall.setString(1, domain.toString());
|
||||||
insertCall.setString(2, domain.subDomain);
|
insertCall.setString(2, domain.domain);
|
||||||
insertCall.setString(3, domain.domain);
|
|
||||||
insertCall.addBatch();
|
insertCall.addBatch();
|
||||||
}
|
}
|
||||||
var ret = insertCall.executeBatch();
|
var ret = insertCall.executeBatch();
|
||||||
@ -95,7 +86,7 @@ public class SqlLoadDomains {
|
|||||||
findIdForTargetDomain(connection, data);
|
findIdForTargetDomain(connection, data);
|
||||||
}
|
}
|
||||||
catch (SQLException ex) {
|
catch (SQLException ex) {
|
||||||
ex.printStackTrace();
|
logger.warn("SQL error inserting domains", ex);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -104,7 +95,7 @@ public class SqlLoadDomains {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
try (var query = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART=?"))
|
try (var query = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?"))
|
||||||
{
|
{
|
||||||
|
|
||||||
var targetDomain = data.getTargetDomain();
|
var targetDomain = data.getTargetDomain();
|
||||||
@ -118,7 +109,7 @@ public class SqlLoadDomains {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
catch (SQLException ex) {
|
catch (SQLException ex) {
|
||||||
ex.printStackTrace();
|
logger.warn("SQL error finding id for domain", ex);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -31,14 +31,13 @@ public class SqlLoadProcessedDocument {
|
|||||||
IN TITLE VARCHAR(255),
|
IN TITLE VARCHAR(255),
|
||||||
IN DESCRIPTION VARCHAR(255),
|
IN DESCRIPTION VARCHAR(255),
|
||||||
IN LENGTH INT,
|
IN LENGTH INT,
|
||||||
IN QUALITY_MEASURE DOUBLE,
|
|
||||||
IN FEATURES INT,
|
IN FEATURES INT,
|
||||||
IN STANDARD VARCHAR(32),
|
IN STANDARD VARCHAR(32),
|
||||||
IN HASH INT)
|
IN HASH INT)
|
||||||
BEGIN
|
BEGIN
|
||||||
SET FOREIGN_KEY_CHECKS=0;
|
SET FOREIGN_KEY_CHECKS=0;
|
||||||
REPLACE INTO EC_PAGE_DATA(ID, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES) VALUES (URL_ID, TITLE, DESCRIPTION, LENGTH, STANDARD, FEATURES);
|
REPLACE INTO EC_PAGE_DATA(ID, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH) VALUES (URL_ID, TITLE, DESCRIPTION, LENGTH, STANDARD, FEATURES, HASH);
|
||||||
UPDATE EC_URL SET VISITED=1, STATE=STATE, QUALITY_MEASURE=QUALITY_MEASURE, DATA_HASH=HASH WHERE ID=URL_ID;
|
UPDATE EC_URL SET VISITED=1, STATE=STATE WHERE ID=URL_ID;
|
||||||
SET FOREIGN_KEY_CHECKS=1;
|
SET FOREIGN_KEY_CHECKS=1;
|
||||||
END
|
END
|
||||||
""");
|
""");
|
||||||
@ -47,7 +46,7 @@ public class SqlLoadProcessedDocument {
|
|||||||
IN URL_ID INT,
|
IN URL_ID INT,
|
||||||
IN STATE VARCHAR(32))
|
IN STATE VARCHAR(32))
|
||||||
BEGIN
|
BEGIN
|
||||||
UPDATE EC_URL SET VISITED=1, STATE=STATE, QUALITY_MEASURE=-100, DATA_HASH=NULL WHERE ID=URL_ID;
|
UPDATE EC_URL SET VISITED=1, STATE=STATE WHERE ID=URL_ID;
|
||||||
END
|
END
|
||||||
""");
|
""");
|
||||||
|
|
||||||
@ -60,7 +59,8 @@ public class SqlLoadProcessedDocument {
|
|||||||
|
|
||||||
public void load(LoaderData data, List<LoadProcessedDocument> documents) {
|
public void load(LoaderData data, List<LoadProcessedDocument> documents) {
|
||||||
try (var conn = dataSource.getConnection();
|
try (var conn = dataSource.getConnection();
|
||||||
var stmt = conn.prepareCall("CALL INSERT_PAGE_VISIT(?, ?, ?, ?, ?, ?, ?, ?, ?)")) {
|
var stmt = conn.prepareCall("CALL INSERT_PAGE_VISIT(?, ?, ?, ?, ?, ?, ?, ?)")) {
|
||||||
|
conn.setAutoCommit(false);
|
||||||
|
|
||||||
for (var doc : documents) {
|
for (var doc : documents) {
|
||||||
int urlId = data.getUrlId(doc.url());
|
int urlId = data.getUrlId(doc.url());
|
||||||
@ -74,10 +74,9 @@ public class SqlLoadProcessedDocument {
|
|||||||
stmt.setString(3, doc.title());
|
stmt.setString(3, doc.title());
|
||||||
stmt.setString(4, doc.description());
|
stmt.setString(4, doc.description());
|
||||||
stmt.setInt(5, doc.length());
|
stmt.setInt(5, doc.length());
|
||||||
stmt.setDouble(6, doc.quality());
|
stmt.setInt(6, doc.htmlFeatures());
|
||||||
stmt.setInt(7, doc.htmlFeatures());
|
stmt.setString(7, doc.standard().name());
|
||||||
stmt.setString(8, doc.standard().name());
|
stmt.setInt(8, (int) doc.hash());
|
||||||
stmt.setInt(9, (int) doc.hash());
|
|
||||||
stmt.addBatch();
|
stmt.addBatch();
|
||||||
}
|
}
|
||||||
var ret = stmt.executeBatch();
|
var ret = stmt.executeBatch();
|
||||||
@ -89,8 +88,8 @@ public class SqlLoadProcessedDocument {
|
|||||||
}
|
}
|
||||||
|
|
||||||
conn.commit();
|
conn.commit();
|
||||||
} catch (SQLException e) {
|
} catch (SQLException ex) {
|
||||||
e.printStackTrace();
|
logger.warn("SQL error inserting document", ex);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -117,8 +116,8 @@ public class SqlLoadProcessedDocument {
|
|||||||
logger.warn("load({}) -- bad row count {}", documents.get(rv), ret[rv]);
|
logger.warn("load({}) -- bad row count {}", documents.get(rv), ret[rv]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (SQLException e) {
|
} catch (SQLException ex) {
|
||||||
e.printStackTrace();
|
logger.warn("SQL error inserting failed document", ex);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -25,12 +25,12 @@ public class SqlLoadProcessedDomain {
|
|||||||
stmt.execute("DROP PROCEDURE IF EXISTS INITIALIZE_DOMAIN");
|
stmt.execute("DROP PROCEDURE IF EXISTS INITIALIZE_DOMAIN");
|
||||||
stmt.execute("""
|
stmt.execute("""
|
||||||
CREATE PROCEDURE INITIALIZE_DOMAIN (
|
CREATE PROCEDURE INITIALIZE_DOMAIN (
|
||||||
IN ST INT,
|
IN ST ENUM('ACTIVE', 'EXHAUSTED', 'SPECIAL', 'SOCIAL_MEDIA', 'BLOCKED', 'REDIR', 'ERROR', 'UNKNOWN'),
|
||||||
IN IDX INT,
|
IN IDX INT,
|
||||||
IN QUAL DOUBLE,
|
IN DID INT,
|
||||||
IN DID INT)
|
IN IP VARCHAR(32))
|
||||||
BEGIN
|
BEGIN
|
||||||
UPDATE EC_DOMAIN SET INDEX_DATE=NOW(), STATE=ST, DOMAIN_ALIAS=NULL, INDEXED=GREATEST(INDEXED,IDX), QUALITY=QUAL, QUALITY_RAW=QUAL, QUALITY_ORIGINAL=QUAL WHERE ID=DID;
|
UPDATE EC_DOMAIN SET INDEX_DATE=NOW(), STATE=ST, DOMAIN_ALIAS=NULL, INDEXED=GREATEST(INDEXED,IDX), IP=IP WHERE ID=DID;
|
||||||
DELETE FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=DID;
|
DELETE FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=DID;
|
||||||
END
|
END
|
||||||
""");
|
""");
|
||||||
@ -41,7 +41,7 @@ public class SqlLoadProcessedDomain {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void load(LoaderData data, EdgeDomain domain, EdgeDomainIndexingState state, double quality) {
|
public void load(LoaderData data, EdgeDomain domain, EdgeDomainIndexingState state, String ip) {
|
||||||
data.setTargetDomain(domain);
|
data.setTargetDomain(domain);
|
||||||
|
|
||||||
loadDomains.load(data, domain);
|
loadDomains.load(data, domain);
|
||||||
@ -49,18 +49,17 @@ public class SqlLoadProcessedDomain {
|
|||||||
try (var conn = dataSource.getConnection();
|
try (var conn = dataSource.getConnection();
|
||||||
var initCall = conn.prepareCall("CALL INITIALIZE_DOMAIN(?,?,?,?)"))
|
var initCall = conn.prepareCall("CALL INITIALIZE_DOMAIN(?,?,?,?)"))
|
||||||
{
|
{
|
||||||
initCall.setInt(1, state.code);
|
initCall.setString(1, state.name());
|
||||||
initCall.setInt(2, 1 + data.sizeHint / 100);
|
initCall.setInt(2, 1 + data.sizeHint / 100);
|
||||||
initCall.setDouble(3, quality);
|
initCall.setInt(3, data.getDomainId(domain));
|
||||||
initCall.setInt(4, data.getDomainId(domain));
|
initCall.setString(4, ip);
|
||||||
int rc = initCall.executeUpdate();
|
int rc = initCall.executeUpdate();
|
||||||
if (rc < 1) {
|
if (rc < 1) {
|
||||||
logger.warn("load({},{},{}) -- bad rowcount {}", domain, state, quality, rc);
|
logger.warn("load({},{}) -- bad rowcount {}", domain, state, rc);
|
||||||
}
|
}
|
||||||
conn.commit();
|
|
||||||
}
|
}
|
||||||
catch (SQLException ex) {
|
catch (SQLException ex) {
|
||||||
ex.printStackTrace();
|
logger.warn("SQL error initializing domain", ex);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -69,9 +68,9 @@ public class SqlLoadProcessedDomain {
|
|||||||
try (var conn = dataSource.getConnection();
|
try (var conn = dataSource.getConnection();
|
||||||
var stmt = conn.prepareStatement("""
|
var stmt = conn.prepareStatement("""
|
||||||
UPDATE EC_DOMAIN TARGET
|
UPDATE EC_DOMAIN TARGET
|
||||||
INNER JOIN EC_DOMAIN ALIAS ON ALIAS.URL_PART=?
|
INNER JOIN EC_DOMAIN ALIAS ON ALIAS.DOMAIN_NAME=?
|
||||||
SET TARGET.DOMAIN_ALIAS=ALIAS.ID
|
SET TARGET.DOMAIN_ALIAS=ALIAS.ID
|
||||||
WHERE TARGET.URL_PART=?
|
WHERE TARGET.DOMAIN_NAME=?
|
||||||
""")) {
|
""")) {
|
||||||
stmt.setString(1, link.to().toString());
|
stmt.setString(1, link.to().toString());
|
||||||
stmt.setString(2, link.from().toString());
|
stmt.setString(2, link.from().toString());
|
||||||
@ -81,7 +80,7 @@ public class SqlLoadProcessedDomain {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
catch (SQLException ex) {
|
catch (SQLException ex) {
|
||||||
ex.printStackTrace();
|
logger.warn("SQL error inserting domain alias", ex);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -25,12 +25,13 @@ public class SqlLoadUrls {
|
|||||||
stmt.execute("""
|
stmt.execute("""
|
||||||
CREATE PROCEDURE INSERT_URL (
|
CREATE PROCEDURE INSERT_URL (
|
||||||
IN PROTO VARCHAR(255),
|
IN PROTO VARCHAR(255),
|
||||||
IN DOMAIN_NAME VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,
|
IN DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,
|
||||||
IN PORT INT,
|
IN PORT INT,
|
||||||
IN URL VARCHAR(255)
|
IN PATH VARCHAR(255),
|
||||||
|
IN PATH_HASH INT
|
||||||
)
|
)
|
||||||
BEGIN
|
BEGIN
|
||||||
INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,URL) SELECT PROTO,ID,PORT,URL FROM EC_DOMAIN WHERE URL_PART=DOMAIN_NAME;
|
INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PATH_HASH) SELECT PROTO,ID,PORT,PATH,PATH_HASH FROM EC_DOMAIN WHERE DOMAIN_NAME=DOMAIN;
|
||||||
END
|
END
|
||||||
""");
|
""");
|
||||||
}
|
}
|
||||||
@ -42,8 +43,8 @@ public class SqlLoadUrls {
|
|||||||
|
|
||||||
public void load(LoaderData data, EdgeUrl[] urls) {
|
public void load(LoaderData data, EdgeUrl[] urls) {
|
||||||
try (var conn = dataSource.getConnection();
|
try (var conn = dataSource.getConnection();
|
||||||
var insertCall = conn.prepareCall("CALL INSERT_URL(?,?,?,?)");
|
var insertCall = conn.prepareCall("CALL INSERT_URL(?,?,?,?, ?)");
|
||||||
var queryCall = conn.prepareStatement("SELECT ID, PROTO, URL FROM EC_URL WHERE DOMAIN_ID=?")
|
var queryCall = conn.prepareStatement("SELECT ID, PROTO, PATH FROM EC_URL WHERE DOMAIN_ID=?")
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
conn.setAutoCommit(false);
|
conn.setAutoCommit(false);
|
||||||
@ -58,6 +59,7 @@ public class SqlLoadUrls {
|
|||||||
insertCall.setNull(3, Types.INTEGER);
|
insertCall.setNull(3, Types.INTEGER);
|
||||||
}
|
}
|
||||||
insertCall.setString(4, url.path);
|
insertCall.setString(4, url.path);
|
||||||
|
insertCall.setInt(5, url.path.hashCode());
|
||||||
insertCall.addBatch();
|
insertCall.addBatch();
|
||||||
}
|
}
|
||||||
var ret = insertCall.executeBatch();
|
var ret = insertCall.executeBatch();
|
||||||
@ -86,7 +88,7 @@ public class SqlLoadUrls {
|
|||||||
|
|
||||||
}
|
}
|
||||||
catch (SQLException ex) {
|
catch (SQLException ex) {
|
||||||
ex.printStackTrace();
|
logger.warn("SQL error inserting URLs", ex);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -15,7 +15,7 @@ public class InstructionsCompiler {
|
|||||||
public List<Instruction> compile(ProcessedDomain domain) {
|
public List<Instruction> compile(ProcessedDomain domain) {
|
||||||
List<Instruction> ret = new ArrayList<>(domain.size()*4);
|
List<Instruction> ret = new ArrayList<>(domain.size()*4);
|
||||||
|
|
||||||
ret.add(new LoadProcessedDomain(domain.domain, domain.state, domain.averageQuality().orElse(-5.)));
|
ret.add(new LoadProcessedDomain(domain.domain, domain.state, domain.ip));
|
||||||
|
|
||||||
if (domain.documents != null) {
|
if (domain.documents != null) {
|
||||||
compileUrls(ret, domain.documents);
|
compileUrls(ret, domain.documents);
|
||||||
|
@ -34,11 +34,10 @@ public class CrawlJobExtractorMain {
|
|||||||
|
|
||||||
private static final String domainsSql =
|
private static final String domainsSql =
|
||||||
"""
|
"""
|
||||||
SELECT ID, LOWER(EC_DOMAIN.URL_PART)
|
SELECT ID, LOWER(EC_DOMAIN.DOMAIN_NAME)
|
||||||
FROM EC_DOMAIN
|
FROM EC_DOMAIN
|
||||||
WHERE QUALITY_RAW>-100
|
WHERE INDEXED>0
|
||||||
AND INDEXED>0
|
AND STATE='ACTIVE' OR STATE='EXHAUSTED'
|
||||||
AND STATE<2
|
|
||||||
ORDER BY
|
ORDER BY
|
||||||
INDEX_DATE ASC,
|
INDEX_DATE ASC,
|
||||||
DISCOVER_DATE ASC,
|
DISCOVER_DATE ASC,
|
||||||
@ -49,8 +48,8 @@ public class CrawlJobExtractorMain {
|
|||||||
|
|
||||||
private static final String urlsSql =
|
private static final String urlsSql =
|
||||||
"""
|
"""
|
||||||
SELECT CONCAT(PROTO, "://", ?, URL)
|
SELECT URL
|
||||||
FROM EC_URL
|
FROM EC_URL_VIEW
|
||||||
WHERE DOMAIN_ID=?
|
WHERE DOMAIN_ID=?
|
||||||
ORDER BY
|
ORDER BY
|
||||||
VISITED DESC,
|
VISITED DESC,
|
||||||
|
@ -30,19 +30,19 @@ public class CrawlJobExtractorPageRankMain {
|
|||||||
"""
|
"""
|
||||||
SELECT ID
|
SELECT ID
|
||||||
FROM EC_DOMAIN
|
FROM EC_DOMAIN
|
||||||
WHERE URL_PART=?
|
WHERE DOMAIN_NAME=?
|
||||||
""";
|
""";
|
||||||
private static final String specificDomainSqlFromId =
|
private static final String specificDomainSqlFromId =
|
||||||
"""
|
"""
|
||||||
SELECT LOWER(URL_PART)
|
SELECT LOWER(DOMAIN_NAME)
|
||||||
FROM EC_DOMAIN
|
FROM EC_DOMAIN
|
||||||
WHERE ID=?
|
WHERE ID=?
|
||||||
""";
|
""";
|
||||||
|
|
||||||
private static final String urlsSql =
|
private static final String urlsSql =
|
||||||
"""
|
"""
|
||||||
SELECT CONCAT(PROTO, "://", ?, URL)
|
SELECT URL
|
||||||
FROM EC_URL
|
FROM EC_URL_VIEW
|
||||||
WHERE DOMAIN_ID=?
|
WHERE DOMAIN_ID=?
|
||||||
ORDER BY
|
ORDER BY
|
||||||
VISITED DESC,
|
VISITED DESC,
|
||||||
|
@ -16,22 +16,14 @@ public interface EdgeDataStoreDao {
|
|||||||
boolean isBlacklisted(EdgeDomain domain);
|
boolean isBlacklisted(EdgeDomain domain);
|
||||||
|
|
||||||
EdgeId<EdgeDomain> getDomainId(EdgeDomain domain);
|
EdgeId<EdgeDomain> getDomainId(EdgeDomain domain);
|
||||||
EdgeId<EdgeUrl> getUrlId(EdgeUrl domain);
|
|
||||||
EdgeUrl getUrl(EdgeId<EdgeUrl> id);
|
|
||||||
EdgeUrlDetails getUrlDetails(EdgeId<EdgeUrl> id);
|
|
||||||
|
|
||||||
List<BrowseResult> getDomainNeighbors(EdgeId<EdgeDomain> domainId, EdgeDomainBlacklist backlist, int count);
|
|
||||||
List<BrowseResult> getDomainNeighborsAdjacent(EdgeId<EdgeDomain> domainId, EdgeDomainBlacklist backlist, int count);
|
List<BrowseResult> getDomainNeighborsAdjacent(EdgeId<EdgeDomain> domainId, EdgeDomainBlacklist backlist, int count);
|
||||||
List<BrowseResult> getRandomDomains(int count, EdgeDomainBlacklist backlist);
|
List<BrowseResult> getRandomDomains(int count, EdgeDomainBlacklist backlist);
|
||||||
List<EdgeUrlDetails> getUrlDetailsMulti(List<EdgeId<EdgeUrl>> ids);
|
List<EdgeUrlDetails> getUrlDetailsMulti(List<EdgeId<EdgeUrl>> ids);
|
||||||
List<EdgeId<EdgeDomain>> getDomainIdsFromUrlIds(Collection<EdgeId<EdgeUrl>> urlIds);
|
|
||||||
|
|
||||||
|
|
||||||
EdgeDomain getDomain(EdgeId<EdgeDomain> id);
|
EdgeDomain getDomain(EdgeId<EdgeDomain> id);
|
||||||
|
|
||||||
List<EdgeId<EdgeUrl>> inboudUrls(EdgeId<EdgeUrl> id, int limit);
|
|
||||||
List<EdgeId<EdgeUrl>> outboundUrls(EdgeId<EdgeUrl> id, int limit);
|
|
||||||
|
|
||||||
Optional<EdgeId<EdgeUrl>> resolveAmbiguousDomain(String name);
|
Optional<EdgeId<EdgeUrl>> resolveAmbiguousDomain(String name);
|
||||||
|
|
||||||
|
|
||||||
@ -48,9 +40,6 @@ public interface EdgeDataStoreDao {
|
|||||||
|
|
||||||
List<EdgeDomain> getLinkingDomains(EdgeId<EdgeDomain> domainId);
|
List<EdgeDomain> getLinkingDomains(EdgeId<EdgeDomain> domainId);
|
||||||
|
|
||||||
List<EdgeUrl> getNewUrls(EdgeId<EdgeDomain> domainId, Collection<EdgeUrl> links);
|
|
||||||
|
|
||||||
double getRank(EdgeId<EdgeDomain> domainId);
|
double getRank(EdgeId<EdgeDomain> domainId);
|
||||||
|
|
||||||
void updateDomainIndexTimestamp(EdgeDomain domain, EdgeDomainIndexingState state, EdgeDomain alias, int minIndexed);
|
|
||||||
}
|
}
|
||||||
|
@ -71,7 +71,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
|||||||
try (var connection = dataSource.getConnection()) {
|
try (var connection = dataSource.getConnection()) {
|
||||||
|
|
||||||
return domainIdCache.get(domain, () -> {
|
return domainIdCache.get(domain, () -> {
|
||||||
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART=?")) {
|
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
|
||||||
stmt.setString(1, domain.toString());
|
stmt.setString(1, domain.toString());
|
||||||
var rsp = stmt.executeQuery();
|
var rsp = stmt.executeQuery();
|
||||||
if (rsp.next()) {
|
if (rsp.next()) {
|
||||||
@ -86,103 +86,13 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
private <T> String idList(List<EdgeId<T>> ids) {
|
||||||
@SneakyThrows
|
StringJoiner j = new StringJoiner(",", "(", ")");
|
||||||
public EdgeId<EdgeUrl> getUrlId(EdgeUrl url) {
|
for (var id : ids) {
|
||||||
try (var connection = dataSource.getConnection()) {
|
j.add(Integer.toString(id.getId()));
|
||||||
|
|
||||||
return urlIdCache.get(url, () -> {
|
|
||||||
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_URL_VIEW WHERE URL_PATH=? AND URL_DOMAIN=? AND URL_PROTO=?")) {
|
|
||||||
stmt.setString(1, url.path);
|
|
||||||
stmt.setString(2, url.domain.toString());
|
|
||||||
stmt.setString(3, url.proto);
|
|
||||||
|
|
||||||
var rsp = stmt.executeQuery();
|
|
||||||
if (rsp.next()) {
|
|
||||||
return new EdgeId<>(rsp.getInt(1));
|
|
||||||
}
|
}
|
||||||
|
return j.toString();
|
||||||
}
|
}
|
||||||
// Lenient mode for http->https upgrades etc
|
|
||||||
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_URL_VIEW WHERE URL_PATH=? AND URL_DOMAIN=?")) {
|
|
||||||
stmt.setString(1, url.path);
|
|
||||||
stmt.setString(2, url.domain.toString());
|
|
||||||
|
|
||||||
var rsp = stmt.executeQuery();
|
|
||||||
if (rsp.next()) {
|
|
||||||
return new EdgeId<>(rsp.getInt(1));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
throw new NoSuchElementException(url.toString());
|
|
||||||
});
|
|
||||||
}
|
|
||||||
catch (UncheckedExecutionException ex) {
|
|
||||||
throw ex.getCause();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
@Override
|
|
||||||
public List<EdgeId<EdgeDomain>> getDomainIdsFromUrlIds(Collection<EdgeId<EdgeUrl>> urlIds) {
|
|
||||||
List<EdgeId<EdgeDomain>> results = new ArrayList<>(urlIds.size());
|
|
||||||
|
|
||||||
if (urlIds.isEmpty())
|
|
||||||
return results;
|
|
||||||
|
|
||||||
try (var connection = dataSource.getConnection()) {
|
|
||||||
|
|
||||||
try (var stmt = connection.prepareStatement("SELECT DOMAIN_ID FROM EC_URL WHERE ID IN " + urlIds
|
|
||||||
.stream()
|
|
||||||
.map(EdgeId::getId)
|
|
||||||
.map(Object::toString)
|
|
||||||
.collect(Collectors.joining(",", "(", ")"))))
|
|
||||||
{
|
|
||||||
var rsp = stmt.executeQuery();
|
|
||||||
while (rsp.next()) {
|
|
||||||
results.add(new EdgeId<>(rsp.getInt(1)));
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return results;
|
|
||||||
}
|
|
||||||
|
|
||||||
static final Pattern badChars = Pattern.compile("[';\\\\]");
|
|
||||||
private String saneString(String s) {
|
|
||||||
return "\'"+badChars.matcher(s).replaceAll("?")+"\'";
|
|
||||||
}
|
|
||||||
@SneakyThrows
|
|
||||||
@Override
|
|
||||||
public EdgeUrl getUrl(EdgeId<EdgeUrl> id) {
|
|
||||||
try (var connection = dataSource.getConnection()) {
|
|
||||||
|
|
||||||
try (var stmt = connection.createStatement()) {
|
|
||||||
var rsp = stmt.executeQuery("SELECT URL_PROTO, URL_DOMAIN,URL_PORT,URL_PATH FROM EC_URL_VIEW WHERE ID=" + id.getId());
|
|
||||||
if (rsp.next()) {
|
|
||||||
return new EdgeUrl(rsp.getString(1), new EdgeDomain(rsp.getString(2)), rsp.getInt(3), rsp.getString(4));
|
|
||||||
}
|
|
||||||
throw new NoSuchElementException();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
@Override
|
|
||||||
public EdgeUrlDetails getUrlDetails(EdgeId<EdgeUrl> id) {
|
|
||||||
try (var connection = dataSource.getConnection()) {
|
|
||||||
|
|
||||||
try (var stmt = connection.createStatement()) {
|
|
||||||
var rsp = stmt.executeQuery("SELECT ID,URL_PROTO,URL_DOMAIN,URL_PORT,URL_PATH,TITLE,DESCRIPTION,URL_QUALITY_MEASURE,DOMAIN_QUALITY_MEASURE,IFNULL(EC_DOMAIN_LINK_AGGREGATE.LINKS,1),WORDS_TOTAL,FORMAT,FEATURES,\"\",QUALITY_RAW,DOMAIN_STATE,DATA_HASH FROM EC_URL_VIEW LEFT JOIN EC_DOMAIN_LINK_AGGREGATE ON EC_DOMAIN_LINK_AGGREGATE.DOMAIN_ID=EC_URL_VIEW.DOMAIN_ID WHERE ID=" + id.getId());
|
|
||||||
if (rsp.next()) {
|
|
||||||
EdgeUrl url = new EdgeUrl(rsp.getString(2), new EdgeDomain(rsp.getString(3)), rsp.getInt(4), rsp.getString(5));
|
|
||||||
return new EdgeUrlDetails(rsp.getInt(1), url, rsp.getString(6), rsp.getString(7), rsp.getDouble(8), rsp.getDouble(15), rsp.getDouble(9), rsp.getInt(10), rsp.getInt(11), rsp.getString(12), rsp.getInt(13), EdgePageScoreAdjustment.zero(), Integer.MAX_VALUE, Double.MAX_VALUE, rsp.getString(14), rsp.getInt(16), 0, rsp.getInt(17));
|
|
||||||
}
|
|
||||||
throw new NoSuchElementException();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
@Override
|
@Override
|
||||||
@ -193,16 +103,38 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
|||||||
List<EdgeUrlDetails> result = new ArrayList<>(ids.size());
|
List<EdgeUrlDetails> result = new ArrayList<>(ids.size());
|
||||||
|
|
||||||
try (var connection = dataSource.getConnection()) {
|
try (var connection = dataSource.getConnection()) {
|
||||||
// This is SQL-injection safe, the IDs are of type int
|
|
||||||
String idString = ids.stream().map(EdgeId::getId).map(Objects::toString).collect(Collectors.joining(",", "(", ")"));
|
|
||||||
|
|
||||||
try (var stmt = connection.prepareStatement("SELECT ID,URL_PROTO,URL_DOMAIN,URL_PORT,URL_PATH,TITLE,DESCRIPTION,URL_QUALITY_MEASURE,DOMAIN_QUALITY_MEASURE,IFNULL(EC_DOMAIN_LINK_AGGREGATE.LINKS,1),WORDS_TOTAL,FORMAT,FEATURES,\"\",QUALITY_RAW,DOMAIN_STATE,DATA_HASH FROM EC_URL_VIEW LEFT JOIN EC_DOMAIN_LINK_AGGREGATE ON EC_DOMAIN_LINK_AGGREGATE.DOMAIN_ID=EC_URL_VIEW.DOMAIN_ID WHERE ID IN " + idString)) {
|
String idString = idList(ids);
|
||||||
|
|
||||||
|
try (var stmt = connection.prepareStatement(
|
||||||
|
"""
|
||||||
|
SELECT ID, URL,
|
||||||
|
TITLE, DESCRIPTION,
|
||||||
|
WORDS_TOTAL, FORMAT, FEATURES,
|
||||||
|
IP, DOMAIN_STATE, DATA_HASH
|
||||||
|
FROM EC_URL_VIEW WHERE ID IN
|
||||||
|
""" + idString)) {
|
||||||
|
// "SELECT ID,URL_PROTO,URL_DOMAIN,URL_PORT,URL_PATH,TITLE,DESCRIPTION,URL_QUALITY_MEASURE,DOMAIN_QUALITY_MEASURE,IFNULL(EC_DOMAIN_LINK_AGGREGATE.LINKS,1),WORDS_TOTAL,FORMAT,FEATURES,\"\",QUALITY_RAW,DOMAIN_STATE,DATA_HASH FROM EC_URL_VIEW LEFT JOIN EC_DOMAIN_LINK_AGGREGATE ON EC_DOMAIN_LINK_AGGREGATE.DOMAIN_ID=EC_URL_VIEW.DOMAIN_ID WHERE ID IN " + idString)) {
|
||||||
stmt.setFetchSize(ids.size());
|
stmt.setFetchSize(ids.size());
|
||||||
|
|
||||||
var rsp = stmt.executeQuery();
|
var rsp = stmt.executeQuery();
|
||||||
while (rsp.next()) {
|
while (rsp.next()) {
|
||||||
EdgeUrl url = new EdgeUrl(rsp.getString(2), new EdgeDomain(rsp.getString(3)), rsp.getInt(4), rsp.getString(5));
|
EdgeUrl url = new EdgeUrl(rsp.getString(2));
|
||||||
var val = new EdgeUrlDetails(rsp.getInt(1), url, rsp.getString(6), rsp.getString(7), rsp.getDouble(8), rsp.getDouble(15), rsp.getDouble(9), rsp.getInt(10), rsp.getInt(11), rsp.getString(12), rsp.getInt(13), EdgePageScoreAdjustment.zero(), Integer.MAX_VALUE, Double.MAX_VALUE, rsp.getString(14), rsp.getInt(16), 0, rsp.getInt(17));
|
var val = new EdgeUrlDetails(rsp.getInt(1), url,
|
||||||
|
rsp.getString(3), // title
|
||||||
|
rsp.getString(4), // description
|
||||||
|
-5, // quality
|
||||||
|
rsp.getInt(5), // wordsTotal
|
||||||
|
rsp.getString(6), // foramt
|
||||||
|
rsp.getInt(7), // features
|
||||||
|
rsp.getString(8), // ip
|
||||||
|
EdgeDomainIndexingState.valueOf(rsp.getString(9)), // domainState
|
||||||
|
rsp.getInt(10), // dataHash
|
||||||
|
EdgePageScoreAdjustment.zero(), // urlQualityAdjustment
|
||||||
|
Integer.MAX_VALUE, // rankingId
|
||||||
|
Double.MAX_VALUE, // termScore
|
||||||
|
0 // queryLength
|
||||||
|
);
|
||||||
if (val.urlQuality >= QUALITY_LOWER_BOUND_CUTOFF) {
|
if (val.urlQuality >= QUALITY_LOWER_BOUND_CUTOFF) {
|
||||||
result.add(val);
|
result.add(val);
|
||||||
}
|
}
|
||||||
@ -214,75 +146,6 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public List<BrowseResult> getDomainNeighbors(EdgeId<EdgeDomain> domainId, EdgeDomainBlacklist blacklist, int count) {
|
|
||||||
final Set<BrowseResult> domains = new HashSet<>(count*3);
|
|
||||||
|
|
||||||
final String q = "SELECT EC_DOMAIN.ID AS NEIGHBOR_ID, URL_PART from EC_DOMAIN_NEIGHBORS INNER JOIN EC_DOMAIN ON NEIGHBOR_ID=EC_DOMAIN.ID WHERE STATE<2 AND DOMAIN_ALIAS IS NULL AND EC_DOMAIN_NEIGHBORS.DOMAIN_ID = ? ORDER BY ADJ_IDX LIMIT ?";
|
|
||||||
|
|
||||||
try (var connection = dataSource.getConnection()) {
|
|
||||||
try (var stmt = connection.prepareStatement(q)) {
|
|
||||||
stmt.setFetchSize(count);
|
|
||||||
stmt.setInt(1, domainId.getId());
|
|
||||||
stmt.setInt(2, count);
|
|
||||||
var rsp = stmt.executeQuery();
|
|
||||||
while (rsp.next()) {
|
|
||||||
int id = rsp.getInt(1);
|
|
||||||
String domain = rsp.getString(2);
|
|
||||||
|
|
||||||
if (!blacklist.isBlacklisted(id)) {
|
|
||||||
var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/");
|
|
||||||
|
|
||||||
domains.add(new BrowseResult(url, id));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
final String q2 = "SELECT EC_DOMAIN.ID, URL_PART FROM EC_DOMAIN_LINK INNER JOIN EC_DOMAIN ON DEST_DOMAIN_ID=EC_DOMAIN.ID WHERE SOURCE_DOMAIN_ID=? AND STATE<2 AND DOMAIN_ALIAS IS NULL GROUP BY EC_DOMAIN.ID ORDER BY RANK ASC LIMIT ?";
|
|
||||||
try (var stmt = connection.prepareStatement(q2)) {
|
|
||||||
|
|
||||||
stmt.setFetchSize(count);
|
|
||||||
stmt.setInt(1, domainId.getId());
|
|
||||||
stmt.setInt(2, count);
|
|
||||||
var rsp = stmt.executeQuery();
|
|
||||||
while (rsp.next()) {
|
|
||||||
int id = rsp.getInt(1);
|
|
||||||
String domain = rsp.getString(2);
|
|
||||||
|
|
||||||
if (!blacklist.isBlacklisted(id)) {
|
|
||||||
var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/");
|
|
||||||
|
|
||||||
domains.add(new BrowseResult(url, id));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
final String q3 = "SELECT EC_DOMAIN.ID, URL_PART FROM EC_DOMAIN_LINK INNER JOIN EC_DOMAIN ON DEST_DOMAIN_ID=EC_DOMAIN.ID WHERE DEST_DOMAIN_ID=? AND STATE<2 AND DOMAIN_ALIAS IS NULL GROUP BY EC_DOMAIN.ID ORDER BY RANK ASC LIMIT ?";
|
|
||||||
try (var stmt = connection.prepareStatement(q3)) {
|
|
||||||
stmt.setFetchSize(count);
|
|
||||||
stmt.setInt(1, domainId.getId());
|
|
||||||
stmt.setInt(2, count);
|
|
||||||
|
|
||||||
var rsp = stmt.executeQuery();
|
|
||||||
while (rsp.next()) {
|
|
||||||
int id = rsp.getInt(1);
|
|
||||||
String domain = rsp.getString(2);
|
|
||||||
|
|
||||||
if (!blacklist.isBlacklisted(id)) {
|
|
||||||
var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/");
|
|
||||||
|
|
||||||
domains.add(new BrowseResult(url, id));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (SQLException throwables) {
|
|
||||||
throwables.printStackTrace();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
return new ArrayList<>(domains);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public List<BrowseResult> getDomainNeighborsAdjacent(EdgeId<EdgeDomain> domainId, EdgeDomainBlacklist blacklist, int count) {
|
public List<BrowseResult> getDomainNeighborsAdjacent(EdgeId<EdgeDomain> domainId, EdgeDomainBlacklist blacklist, int count) {
|
||||||
@ -399,7 +262,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
|||||||
@Override
|
@Override
|
||||||
public List<BrowseResult> getRandomDomains(int count, EdgeDomainBlacklist blacklist) {
|
public List<BrowseResult> getRandomDomains(int count, EdgeDomainBlacklist blacklist) {
|
||||||
|
|
||||||
final String q = "SELECT DOMAIN_ID,URL_PART FROM EC_RANDOM_DOMAINS INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE STATE<2 AND DOMAIN_ALIAS IS NULL ORDER BY RAND() LIMIT ?";
|
final String q = "SELECT DOMAIN_ID,DOMAIN_NAME FROM EC_RANDOM_DOMAINS INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE STATE<2 AND DOMAIN_ALIAS IS NULL ORDER BY RAND() LIMIT ?";
|
||||||
List<BrowseResult> domains = new ArrayList<>(count);
|
List<BrowseResult> domains = new ArrayList<>(count);
|
||||||
try (var conn = dataSource.getConnection()) {
|
try (var conn = dataSource.getConnection()) {
|
||||||
try (var stmt = conn.prepareStatement(q)) {
|
try (var stmt = conn.prepareStatement(q)) {
|
||||||
@ -428,7 +291,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
|||||||
public EdgeDomain getDomain(EdgeId<EdgeDomain> id) {
|
public EdgeDomain getDomain(EdgeId<EdgeDomain> id) {
|
||||||
try (var connection = dataSource.getConnection()) {
|
try (var connection = dataSource.getConnection()) {
|
||||||
|
|
||||||
try (var stmt = connection.prepareStatement("SELECT URL_PART FROM EC_DOMAIN WHERE ID=?")) {
|
try (var stmt = connection.prepareStatement("SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE ID=?")) {
|
||||||
stmt.setInt(1, id.getId());
|
stmt.setInt(1, id.getId());
|
||||||
var rsp = stmt.executeQuery();
|
var rsp = stmt.executeQuery();
|
||||||
if (rsp.next()) {
|
if (rsp.next()) {
|
||||||
@ -439,55 +302,11 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override @SneakyThrows
|
|
||||||
public List<EdgeId<EdgeUrl>> inboudUrls(EdgeId<EdgeUrl> id, int limit) {
|
|
||||||
|
|
||||||
List<EdgeId<EdgeUrl>> ret = new ArrayList<>();
|
|
||||||
try (var connection = dataSource.getConnection()) {
|
|
||||||
|
|
||||||
try (var stmt =
|
|
||||||
connection.prepareStatement("SELECT SRC_URL_ID FROM EC_RELATED_LINKS_IN WHERE DEST_URL_ID=? ORDER BY SRC_URL_QUALITY DESC LIMIT ?")) {
|
|
||||||
stmt.setFetchSize(limit);
|
|
||||||
stmt.setInt(1, id.getId());
|
|
||||||
stmt.setInt(2, limit);
|
|
||||||
var rsp = stmt.executeQuery();
|
|
||||||
while (rsp.next()) {
|
|
||||||
ret.add(new EdgeId<>(rsp.getInt(1)));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Override @SneakyThrows
|
|
||||||
public List<EdgeId<EdgeUrl>> outboundUrls(EdgeId<EdgeUrl> id, int limit) {
|
|
||||||
|
|
||||||
List<EdgeId<EdgeUrl>> ret = new ArrayList<>();
|
|
||||||
try (var connection = dataSource.getConnection()) {
|
|
||||||
|
|
||||||
try (var stmt =
|
|
||||||
connection.prepareStatement("SELECT DEST_URL_ID FROM EC_RELATED_LINKS_IN WHERE SRC_URL_ID=? ORDER BY SRC_URL_QUALITY DESC LIMIT ?")) {
|
|
||||||
stmt.setFetchSize(limit);
|
|
||||||
stmt.setInt(1, id.getId());
|
|
||||||
stmt.setInt(2, limit);
|
|
||||||
var rsp = stmt.executeQuery();
|
|
||||||
while (rsp.next()) {
|
|
||||||
ret.add(new EdgeId<>(rsp.getInt(1)));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Optional<EdgeId<EdgeUrl>> resolveAmbiguousDomain(String name) {
|
public Optional<EdgeId<EdgeUrl>> resolveAmbiguousDomain(String name) {
|
||||||
try (var connection = dataSource.getConnection()) {
|
try (var connection = dataSource.getConnection()) {
|
||||||
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) {
|
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
|
||||||
stmt.setString(1, name);
|
stmt.setString(1, name);
|
||||||
var rsp = stmt.executeQuery();
|
var rsp = stmt.executeQuery();
|
||||||
if (rsp.next()) {
|
if (rsp.next()) {
|
||||||
@ -495,7 +314,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) {
|
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
|
||||||
stmt.setString(1, "https://"+name);
|
stmt.setString(1, "https://"+name);
|
||||||
var rsp = stmt.executeQuery();
|
var rsp = stmt.executeQuery();
|
||||||
if (rsp.next()) {
|
if (rsp.next()) {
|
||||||
@ -503,7 +322,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) {
|
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
|
||||||
stmt.setString(1, "http://"+name);
|
stmt.setString(1, "http://"+name);
|
||||||
var rsp = stmt.executeQuery();
|
var rsp = stmt.executeQuery();
|
||||||
if (rsp.next()) {
|
if (rsp.next()) {
|
||||||
@ -511,7 +330,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) {
|
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
|
||||||
stmt.setString(1, "https://www."+name);
|
stmt.setString(1, "https://www."+name);
|
||||||
var rsp = stmt.executeQuery();
|
var rsp = stmt.executeQuery();
|
||||||
if (rsp.next()) {
|
if (rsp.next()) {
|
||||||
@ -519,7 +338,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) {
|
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
|
||||||
stmt.setString(1, "http://www."+name);
|
stmt.setString(1, "http://www."+name);
|
||||||
var rsp = stmt.executeQuery();
|
var rsp = stmt.executeQuery();
|
||||||
if (rsp.next()) {
|
if (rsp.next()) {
|
||||||
@ -682,27 +501,6 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
|||||||
return Collections.emptyList();
|
return Collections.emptyList();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public List<EdgeUrl> getNewUrls(EdgeId<EdgeDomain> domainId, Collection<EdgeUrl> links) {
|
|
||||||
Map<String, EdgeUrl> edgeUrlByPath = links.stream().collect(Collectors.toMap(EdgeUrl::getPath, Function.identity(), (a,b)->a));
|
|
||||||
|
|
||||||
try (var connection = dataSource.getConnection()) {
|
|
||||||
try (var stmt = connection.prepareStatement("SELECT URL FROM EC_URL WHERE DOMAIN_ID=?")) {
|
|
||||||
stmt.setFetchSize(500);
|
|
||||||
stmt.setInt(1, domainId.getId());
|
|
||||||
var rs = stmt.executeQuery();
|
|
||||||
while (rs.next()) {
|
|
||||||
edgeUrlByPath.remove(rs.getString(1));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
catch (Exception ex) {
|
|
||||||
return Collections.emptyList();
|
|
||||||
}
|
|
||||||
return new ArrayList<>(edgeUrlByPath.values());
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double getRank(EdgeId<EdgeDomain> domainId) {
|
public double getRank(EdgeId<EdgeDomain> domainId) {
|
||||||
try (var connection = dataSource.getConnection()) {
|
try (var connection = dataSource.getConnection()) {
|
||||||
@ -722,47 +520,5 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public void updateDomainIndexTimestamp(EdgeDomain domain, EdgeDomainIndexingState state, EdgeDomain alias, int minIndexed) {
|
|
||||||
try (var connection = dataSource.getConnection();
|
|
||||||
var stmt = connection.prepareStatement("UPDATE EC_DOMAIN SET INDEX_DATE=NOW(), STATE=?, DOMAIN_ALIAS=?, INDEXED=GREATEST(INDEXED,?) WHERE ID=?")) {
|
|
||||||
stmt.setInt(1, state.code);
|
|
||||||
if (null == alias) {
|
|
||||||
stmt.setNull(2, Types.INTEGER);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
stmt.setInt(2, getDomainId(alias).getId());
|
|
||||||
}
|
|
||||||
|
|
||||||
stmt.setInt(3, minIndexed);
|
|
||||||
stmt.setInt(4, getDomainId(domain).getId());
|
|
||||||
stmt.executeUpdate();
|
|
||||||
connection.commit();
|
|
||||||
}
|
|
||||||
catch (SQLException throwables) {
|
|
||||||
logger.error("SQL error", throwables);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
private double getDomainQuality(Connection connection, EdgeDomain src) {
|
|
||||||
try (var stmt = connection.prepareStatement("SELECT QUALITY_RAW FROM EC_DOMAIN WHERE URL_PART=?")) {
|
|
||||||
stmt.setString(1, src.toString());
|
|
||||||
var res = stmt.executeQuery();
|
|
||||||
|
|
||||||
if (res.next()) {
|
|
||||||
var q = res.getDouble(1);
|
|
||||||
if (q > 0.5) {
|
|
||||||
logger.warn("gDQ({}) -> 1", src);
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
catch (SQLException ex) {
|
|
||||||
logger.error("DB error", ex);
|
|
||||||
}
|
|
||||||
|
|
||||||
return -5;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -50,7 +50,7 @@ public class EdgeDomainBlacklistImpl implements EdgeDomainBlacklist {
|
|||||||
final TIntHashSet result = new TIntHashSet(1_000_000);
|
final TIntHashSet result = new TIntHashSet(1_000_000);
|
||||||
|
|
||||||
try (var connection = dataSource.getConnection()) {
|
try (var connection = dataSource.getConnection()) {
|
||||||
try (var stmt = connection.prepareStatement("SELECT EC_DOMAIN.ID FROM EC_DOMAIN INNER JOIN EC_TOP_DOMAIN ON EC_DOMAIN.URL_TOP_DOMAIN_ID = EC_TOP_DOMAIN.ID INNER JOIN EC_DOMAIN_BLACKLIST ON EC_DOMAIN_BLACKLIST.URL_DOMAIN = EC_TOP_DOMAIN.URL_PART")) {
|
try (var stmt = connection.prepareStatement("SELECT EC_DOMAIN.ID FROM EC_DOMAIN INNER JOIN EC_DOMAIN_BLACKLIST ON EC_DOMAIN_BLACKLIST.URL_DOMAIN = EC_DOMAIN.DOMAIN_TOP")) {
|
||||||
stmt.setFetchSize(1000);
|
stmt.setFetchSize(1000);
|
||||||
var rsp = stmt.executeQuery();
|
var rsp = stmt.executeQuery();
|
||||||
while (rsp.next()) {
|
while (rsp.next()) {
|
||||||
|
@ -30,29 +30,13 @@ public class SearchIndexDao {
|
|||||||
logger.info("SearchIndexDao ranking settings = {}", rankingSettings);
|
logger.info("SearchIndexDao ranking settings = {}", rankingSettings);
|
||||||
}
|
}
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
public TIntHashSet getSpamDomains() {
|
|
||||||
final TIntHashSet result = new TIntHashSet(1_000_000);
|
|
||||||
|
|
||||||
try (var connection = dataSource.getConnection()) {
|
|
||||||
try (var stmt = connection.prepareStatement("SELECT EC_DOMAIN.ID FROM EC_DOMAIN INNER JOIN EC_TOP_DOMAIN ON EC_DOMAIN.URL_TOP_DOMAIN_ID = EC_TOP_DOMAIN.ID INNER JOIN EC_DOMAIN_BLACKLIST ON EC_DOMAIN_BLACKLIST.URL_DOMAIN = EC_TOP_DOMAIN.URL_PART")) {
|
|
||||||
var rsp = stmt.executeQuery();
|
|
||||||
while (rsp.next()) {
|
|
||||||
result.add(rsp.getInt(1));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public TIntHashSet goodUrls() {
|
public TIntHashSet goodUrls() {
|
||||||
TIntHashSet domains = new TIntHashSet(10_000_000, 0.5f, -1);
|
TIntHashSet domains = new TIntHashSet(10_000_000, 0.5f, -1);
|
||||||
TIntHashSet urls = new TIntHashSet(100_000_000, 0.5f, -1);
|
TIntHashSet urls = new TIntHashSet(100_000_000, 0.5f, -1);
|
||||||
|
|
||||||
try (var connection = dataSource.getConnection()) {
|
try (var connection = dataSource.getConnection()) {
|
||||||
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_ALIAS IS NULL AND STATE>=0")) {
|
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_ALIAS IS NULL AND IS_ALIVE")) {
|
||||||
stmt.setFetchSize(10_000);
|
stmt.setFetchSize(10_000);
|
||||||
var rsp = stmt.executeQuery();
|
var rsp = stmt.executeQuery();
|
||||||
while (rsp.next()) {
|
while (rsp.next()) {
|
||||||
|
@ -16,25 +16,24 @@ public class EdgeUrlDetails {
|
|||||||
public String description;
|
public String description;
|
||||||
|
|
||||||
public double urlQuality;
|
public double urlQuality;
|
||||||
public double urlQualityRaw;
|
|
||||||
public double domainQuality;
|
|
||||||
|
|
||||||
public int links; // DEAD
|
|
||||||
public int words;
|
public int words;
|
||||||
public String format;
|
public String format;
|
||||||
public int features;
|
public int features;
|
||||||
|
|
||||||
public EdgePageScoreAdjustment urlQualityAdjustment;
|
|
||||||
|
|
||||||
public long rankingId;
|
|
||||||
public double termScore;
|
|
||||||
|
|
||||||
public String ip; // BROKEN
|
public String ip; // BROKEN
|
||||||
public int domainState;
|
public EdgeDomainIndexingState domainState;
|
||||||
public int queryLength;
|
|
||||||
|
|
||||||
public int dataHash;
|
public int dataHash;
|
||||||
|
|
||||||
|
public EdgePageScoreAdjustment urlQualityAdjustment;
|
||||||
|
public long rankingId;
|
||||||
|
public double termScore;
|
||||||
|
public int queryLength;
|
||||||
|
|
||||||
public long rankingIdAdjustment() {
|
public long rankingIdAdjustment() {
|
||||||
int penalty = 0;
|
int penalty = 0;
|
||||||
|
|
||||||
@ -136,7 +135,7 @@ public class EdgeUrlDetails {
|
|||||||
return HtmlFeature.hasFeature(features, HtmlFeature.COOKIES);
|
return HtmlFeature.hasFeature(features, HtmlFeature.COOKIES);
|
||||||
}
|
}
|
||||||
public boolean isSpecialDomain() {
|
public boolean isSpecialDomain() {
|
||||||
return domainState == EdgeDomainIndexingState.SPECIAL.code;
|
return domainState == EdgeDomainIndexingState.SPECIAL;
|
||||||
}
|
}
|
||||||
public int getLogRank() { return (int) Math.round(Math.min(Math.log(1+rankingId),10)); }
|
public int getLogRank() { return (int) Math.round(Math.min(Math.log(1+rankingId),10)); }
|
||||||
|
|
||||||
|
@ -107,7 +107,7 @@ public class SearchResultDecorator {
|
|||||||
|
|
||||||
private double calculateTermScore(IndexBlock block, EdgeSearchResultItem resultItem, EdgeUrlDetails details) {
|
private double calculateTermScore(IndexBlock block, EdgeSearchResultItem resultItem, EdgeUrlDetails details) {
|
||||||
return valuator.evaluateTerms(resultItem.scores, block, details.words) / Math.sqrt(1 + resultItem.queryLength)
|
return valuator.evaluateTerms(resultItem.scores, block, details.words) / Math.sqrt(1 + resultItem.queryLength)
|
||||||
+ ((details.domainState == EdgeDomainIndexingState.SPECIAL.code) ? 1.25 : 0);
|
+ ((details.domainState == EdgeDomainIndexingState.SPECIAL) ? 1.25 : 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -1,24 +1,11 @@
|
|||||||
DROP TABLE IF EXISTS EC_URL_LINK;
|
DROP TABLE IF EXISTS DOMAIN_METADATA;
|
||||||
DROP VIEW IF EXISTS EC_PAGE_VIEW;
|
|
||||||
|
|
||||||
DROP TABLE IF EXISTS DISC_DOMAIN_TAG;
|
|
||||||
DROP TABLE IF EXISTS DISC_TAG;
|
|
||||||
DROP TABLE IF EXISTS DISC_USER;
|
|
||||||
|
|
||||||
DROP TABLE IF EXISTS EC_DOMAIN_NEIGHBORS;
|
|
||||||
DROP TABLE IF EXISTS EC_FEED_URL;
|
DROP TABLE IF EXISTS EC_FEED_URL;
|
||||||
DROP TABLE IF EXISTS EC_DOMAIN_LINK;
|
DROP TABLE IF EXISTS EC_DOMAIN_LINK;
|
||||||
DROP TABLE IF EXISTS EC_PAGE_DATA;
|
DROP TABLE IF EXISTS EC_PAGE_DATA;
|
||||||
DROP TABLE IF EXISTS EC_URL;
|
DROP TABLE IF EXISTS EC_URL;
|
||||||
|
DROP TABLE IF EXISTS EC_DOMAIN_NEIGHBORS;
|
||||||
DROP TABLE IF EXISTS EC_DOMAIN;
|
DROP TABLE IF EXISTS EC_DOMAIN;
|
||||||
DROP TABLE IF EXISTS EC_TOP_DOMAIN;
|
|
||||||
DROP TABLE IF EXISTS EC_URL_DETAILS;
|
|
||||||
DROP VIEW IF EXISTS EC_URL_VIEW;
|
|
||||||
DROP VIEW IF EXISTS EC_URL_PART_HASH;
|
|
||||||
|
|
||||||
DROP TABLE IF EXISTS EC_URL_WORD;
|
|
||||||
DROP TABLE IF EXISTS EC_DICTIONARY;
|
|
||||||
DROP TABLE IF EXISTS DOMAIN_METADATA;
|
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS DOMAIN_METADATA (
|
CREATE TABLE IF NOT EXISTS DOMAIN_METADATA (
|
||||||
ID INT PRIMARY KEY,
|
ID INT PRIMARY KEY,
|
||||||
@ -27,45 +14,24 @@ CREATE TABLE IF NOT EXISTS DOMAIN_METADATA (
|
|||||||
GOOD_URLS INT DEFAULT 0
|
GOOD_URLS INT DEFAULT 0
|
||||||
);
|
);
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS EC_TOP_DOMAIN (
|
|
||||||
ID INT PRIMARY KEY AUTO_INCREMENT,
|
|
||||||
URL_PART VARCHAR(255) UNIQUE NOT NULL,
|
|
||||||
ALIVE BOOLEAN DEFAULT TRUE NOT NULL
|
|
||||||
)
|
|
||||||
CHARACTER SET utf8mb4
|
|
||||||
COLLATE utf8mb4_unicode_ci;
|
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS EC_DOMAIN (
|
CREATE TABLE IF NOT EXISTS EC_DOMAIN (
|
||||||
ID INT PRIMARY KEY AUTO_INCREMENT,
|
ID INT PRIMARY KEY AUTO_INCREMENT,
|
||||||
URL_PART VARCHAR(255) UNIQUE NOT NULL,
|
|
||||||
INDEXED INT DEFAULT 0 NOT NULL,
|
|
||||||
QUALITY DOUBLE DEFAULT -5 NOT NULL,
|
|
||||||
QUALITY_RAW DOUBLE DEFAULT -5 NOT NULL,
|
|
||||||
QUALITY_ORIGINAL DOUBLE DEFAULT -5 NOT NULL,
|
|
||||||
|
|
||||||
URL_TOP_DOMAIN_ID INT NOT NULL,
|
DOMAIN_NAME VARCHAR(255) UNIQUE NOT NULL,
|
||||||
URL_SUBDOMAIN VARCHAR(255) NOT NULL,
|
DOMAIN_TOP VARCHAR(255) NOT NULL,
|
||||||
STATE INT DEFAULT 0 NOT NULL,
|
|
||||||
|
INDEXED INT DEFAULT 0 NOT NULL COMMENT "~number of documents visited / 100",
|
||||||
|
STATE ENUM('ACTIVE', 'EXHAUSTED', 'SPECIAL', 'SOCIAL_MEDIA', 'BLOCKED', 'REDIR', 'ERROR', 'UNKNOWN') NOT NULL DEFAULT 'active' COMMENT "@see EdgeDomainIndexingState",
|
||||||
|
|
||||||
RANK DOUBLE,
|
RANK DOUBLE,
|
||||||
|
|
||||||
DOMAIN_ALIAS INTEGER,
|
DOMAIN_ALIAS INTEGER,
|
||||||
|
IP VARCHAR(32),
|
||||||
|
|
||||||
INDEX_DATE TIMESTAMP DEFAULT NOW(),
|
INDEX_DATE TIMESTAMP DEFAULT NOW(),
|
||||||
DISCOVER_DATE TIMESTAMP DEFAULT NOW(),
|
DISCOVER_DATE TIMESTAMP DEFAULT NOW(),
|
||||||
|
|
||||||
FOREIGN KEY (URL_TOP_DOMAIN_ID) REFERENCES EC_TOP_DOMAIN(ID) ON DELETE CASCADE
|
IS_ALIVE BOOLEAN AS (STATE='ACTIVE' OR STATE='EXHAUSTED' OR STATE='SPECIAL' OR STATE='SOCIAL_MEDIA') VIRTUAL
|
||||||
)
|
|
||||||
CHARACTER SET utf8mb4
|
|
||||||
COLLATE utf8mb4_unicode_ci;
|
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS EC_DOMAIN_HISTORY (
|
|
||||||
ID INT PRIMARY KEY AUTO_INCREMENT,
|
|
||||||
URL_PART VARCHAR(255) UNIQUE NOT NULL,
|
|
||||||
QUALITY_MEASURE DOUBLE DEFAULT -5 NOT NULL,
|
|
||||||
INBOUND_LINKS INT DEFAULT 1,
|
|
||||||
LINK_ADJUSTED_QUALITY DOUBLE GENERATED ALWAYS AS (0.3*QUALITY_MEASURE + 0.7*QUALITY_MEASURE / GREATEST(1, INBOUND_LINKS)),
|
|
||||||
RANK DOUBLE
|
|
||||||
)
|
)
|
||||||
CHARACTER SET utf8mb4
|
CHARACTER SET utf8mb4
|
||||||
COLLATE utf8mb4_unicode_ci;
|
COLLATE utf8mb4_unicode_ci;
|
||||||
@ -81,18 +47,15 @@ CREATE TABLE IF NOT EXISTS EC_URL (
|
|||||||
ID INT PRIMARY KEY AUTO_INCREMENT,
|
ID INT PRIMARY KEY AUTO_INCREMENT,
|
||||||
DOMAIN_ID INT NOT NULL,
|
DOMAIN_ID INT NOT NULL,
|
||||||
PROTO ENUM('http','https','gemini') NOT NULL,
|
PROTO ENUM('http','https','gemini') NOT NULL,
|
||||||
URL VARCHAR(255) NOT NULL,
|
PATH VARCHAR(255) NOT NULL COLLATE utf8mb4_bin,
|
||||||
PORT INT,
|
PORT INT,
|
||||||
|
|
||||||
|
PATH_HASH INT NOT NULL COMMENT "Hash of PATH for uniqueness check by domain",
|
||||||
VISITED BOOLEAN NOT NULL DEFAULT FALSE,
|
VISITED BOOLEAN NOT NULL DEFAULT FALSE,
|
||||||
DATA_HASH INTEGER,
|
|
||||||
QUALITY_MEASURE DOUBLE,
|
|
||||||
|
|
||||||
STATE ENUM('ok', 'redirect', 'dead', 'archived', 'disqualified') NOT NULL DEFAULT 'ok',
|
STATE ENUM('ok', 'redirect', 'dead', 'archived', 'disqualified') NOT NULL DEFAULT 'ok',
|
||||||
|
|
||||||
IP VARCHAR(32),
|
CONSTRAINT CONS UNIQUE (DOMAIN_ID, PATH_HASH),
|
||||||
|
|
||||||
CONSTRAINT CONS UNIQUE (DOMAIN_ID, URL),
|
|
||||||
FOREIGN KEY (DOMAIN_ID) REFERENCES EC_DOMAIN(ID) ON DELETE CASCADE
|
FOREIGN KEY (DOMAIN_ID) REFERENCES EC_DOMAIN(ID) ON DELETE CASCADE
|
||||||
)
|
)
|
||||||
CHARACTER SET utf8mb4
|
CHARACTER SET utf8mb4
|
||||||
@ -101,13 +64,14 @@ COLLATE utf8mb4_unicode_ci;
|
|||||||
CREATE TABLE IF NOT EXISTS EC_PAGE_DATA (
|
CREATE TABLE IF NOT EXISTS EC_PAGE_DATA (
|
||||||
ID INT PRIMARY KEY AUTO_INCREMENT,
|
ID INT PRIMARY KEY AUTO_INCREMENT,
|
||||||
|
|
||||||
TITLE VARCHAR(255),
|
TITLE VARCHAR(255) NOT NULL,
|
||||||
DESCRIPTION VARCHAR(255),
|
DESCRIPTION VARCHAR(255) NOT NULL,
|
||||||
|
|
||||||
WORDS_DISTINCT INTEGER,
|
WORDS_TOTAL INTEGER NOT NULL,
|
||||||
WORDS_TOTAL INTEGER,
|
FORMAT ENUM('PLAIN', 'UNKNOWN', 'HTML123', 'HTML4', 'XHTML', 'HTML5', 'MARKDOWN') NOT NULL,
|
||||||
FORMAT VARCHAR(8),
|
FEATURES INT COMMENT "Bit-encoded feature set of document, @see HtmlFeature" NOT NULL,
|
||||||
FEATURES INT,
|
|
||||||
|
DATA_HASH INTEGER NOT NULL,
|
||||||
|
|
||||||
FOREIGN KEY (ID) REFERENCES EC_URL(ID) ON DELETE CASCADE
|
FOREIGN KEY (ID) REFERENCES EC_URL(ID) ON DELETE CASCADE
|
||||||
)
|
)
|
||||||
@ -115,13 +79,9 @@ CHARACTER SET utf8mb4
|
|||||||
COLLATE utf8mb4_unicode_ci;
|
COLLATE utf8mb4_unicode_ci;
|
||||||
|
|
||||||
CREATE TABLE EC_FEED_URL (
|
CREATE TABLE EC_FEED_URL (
|
||||||
ID INT PRIMARY KEY AUTO_INCREMENT,
|
URL VARCHAR(255) PRIMARY KEY,
|
||||||
DOMAIN_ID INT NOT NULL,
|
DOMAIN_ID INT,
|
||||||
PROTO VARCHAR(8) NOT NULL,
|
|
||||||
URL VARCHAR(255) NOT NULL,
|
|
||||||
PORT INT,
|
|
||||||
|
|
||||||
CONSTRAINT CONS UNIQUE (DOMAIN_ID, URL),
|
|
||||||
FOREIGN KEY (DOMAIN_ID) REFERENCES EC_DOMAIN(ID) ON DELETE CASCADE
|
FOREIGN KEY (DOMAIN_ID) REFERENCES EC_DOMAIN(ID) ON DELETE CASCADE
|
||||||
)
|
)
|
||||||
CHARACTER SET utf8mb4
|
CHARACTER SET utf8mb4
|
||||||
@ -150,29 +110,23 @@ CREATE TABLE IF NOT EXISTS EC_DOMAIN_LINK (
|
|||||||
FOREIGN KEY (DEST_DOMAIN_ID) REFERENCES EC_DOMAIN(ID) ON DELETE CASCADE
|
FOREIGN KEY (DEST_DOMAIN_ID) REFERENCES EC_DOMAIN(ID) ON DELETE CASCADE
|
||||||
);
|
);
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS EC_DOMAIN_LINK_AGGREGATE (
|
|
||||||
DOMAIN_ID INT PRIMARY KEY NOT NULL,
|
|
||||||
LINKS INT
|
|
||||||
);
|
|
||||||
|
|
||||||
CREATE OR REPLACE VIEW EC_URL_VIEW AS
|
CREATE OR REPLACE VIEW EC_URL_VIEW AS
|
||||||
SELECT
|
SELECT
|
||||||
EC_DOMAIN.URL_PART AS URL_DOMAIN,
|
IF(PORT IS NULL,
|
||||||
EC_URL.URL AS URL_PATH,
|
CONCAT(EC_URL.PROTO, "://", EC_DOMAIN.DOMAIN_NAME, EC_URL.PATH),
|
||||||
EC_TOP_DOMAIN.URL_PART AS URL_TOP,
|
CONCAT(EC_URL.PROTO, "://", EC_DOMAIN.DOMAIN_NAME, ":", EC_URL.PORT, EC_URL.PATH))
|
||||||
|
AS URL,
|
||||||
|
EC_URL.PATH_HASH AS PATH_HASH,
|
||||||
|
EC_URL.PATH AS PATH,
|
||||||
|
EC_DOMAIN.DOMAIN_NAME AS DOMAIN_NAME,
|
||||||
|
EC_DOMAIN.DOMAIN_TOP AS DOMAIN_TOP,
|
||||||
EC_URL.ID AS ID,
|
EC_URL.ID AS ID,
|
||||||
EC_DOMAIN.ID AS DOMAIN_ID,
|
EC_DOMAIN.ID AS DOMAIN_ID,
|
||||||
EC_TOP_DOMAIN.ID AS TOP_DOMAIN_ID,
|
|
||||||
EC_URL.PROTO AS URL_PROTO,
|
|
||||||
EC_URL.PORT AS URL_PORT,
|
|
||||||
EC_URL.VISITED AS VISITED,
|
EC_URL.VISITED AS VISITED,
|
||||||
EC_URL.DATA_HASH AS DATA_HASH,
|
EC_PAGE_DATA.DATA_HASH AS DATA_HASH,
|
||||||
EC_URL.QUALITY_MEASURE AS URL_QUALITY_MEASURE,
|
|
||||||
EC_DOMAIN.QUALITY AS DOMAIN_QUALITY_MEASURE,
|
|
||||||
EC_DOMAIN.QUALITY_RAW AS QUALITY_RAW,
|
|
||||||
EC_PAGE_DATA.TITLE AS TITLE,
|
EC_PAGE_DATA.TITLE AS TITLE,
|
||||||
EC_PAGE_DATA.DESCRIPTION AS DESCRIPTION,
|
EC_PAGE_DATA.DESCRIPTION AS DESCRIPTION,
|
||||||
EC_URL.IP AS IP,
|
EC_DOMAIN.IP AS IP,
|
||||||
EC_DOMAIN.STATE AS STATE,
|
EC_DOMAIN.STATE AS STATE,
|
||||||
EC_PAGE_DATA.WORDS_TOTAL AS WORDS_TOTAL,
|
EC_PAGE_DATA.WORDS_TOTAL AS WORDS_TOTAL,
|
||||||
EC_PAGE_DATA.FORMAT AS FORMAT,
|
EC_PAGE_DATA.FORMAT AS FORMAT,
|
||||||
@ -183,59 +137,32 @@ CREATE OR REPLACE VIEW EC_URL_VIEW AS
|
|||||||
LEFT JOIN EC_PAGE_DATA
|
LEFT JOIN EC_PAGE_DATA
|
||||||
ON EC_PAGE_DATA.ID = EC_URL.ID
|
ON EC_PAGE_DATA.ID = EC_URL.ID
|
||||||
INNER JOIN EC_DOMAIN
|
INNER JOIN EC_DOMAIN
|
||||||
ON EC_URL.DOMAIN_ID = EC_DOMAIN.ID
|
ON EC_URL.DOMAIN_ID = EC_DOMAIN.ID;
|
||||||
INNER JOIN EC_TOP_DOMAIN
|
|
||||||
ON EC_DOMAIN.URL_TOP_DOMAIN_ID=EC_TOP_DOMAIN.ID;
|
|
||||||
|
|
||||||
CREATE OR REPLACE VIEW EC_DISCOVER_TASKS_VIEW AS
|
|
||||||
SELECT
|
|
||||||
ID,
|
|
||||||
URL_PART
|
|
||||||
FROM EC_DOMAIN
|
|
||||||
WHERE
|
|
||||||
DOMAIN_ALIAS IS NULL
|
|
||||||
AND INDEXED = 0
|
|
||||||
ORDER BY QUALITY DESC, ID ASC;
|
|
||||||
|
|
||||||
CREATE OR REPLACE VIEW EC_RELATED_LINKS_VIEW AS
|
CREATE OR REPLACE VIEW EC_RELATED_LINKS_VIEW AS
|
||||||
SELECT
|
SELECT
|
||||||
SOURCE_DOMAIN_ID,
|
SOURCE_DOMAIN_ID,
|
||||||
SOURCE_DOMAIN.URL_PART AS SOURCE_URL,
|
SOURCE_DOMAIN.DOMAIN_NAME AS SOURCE_DOMAIN,
|
||||||
SOURCE_TOP_DOMAIN.URL_PART AS SOURCE_TOP_URL,
|
SOURCE_DOMAIN.DOMAIN_TOP AS SOURCE_TOP_DOMAIN,
|
||||||
DEST_DOMAIN_ID,
|
DEST_DOMAIN_ID,
|
||||||
DEST_DOMAIN.URL_PART AS DEST_URL,
|
DEST_DOMAIN.DOMAIN_NAME AS DEST_DOMAIN,
|
||||||
DEST_TOP_DOMAIN.URL_PART AS DEST_TOP_URL
|
DEST_DOMAIN.DOMAIN_TOP AS DEST_TOP_DOMAIN
|
||||||
FROM EC_DOMAIN_LINK
|
FROM EC_DOMAIN_LINK
|
||||||
INNER JOIN EC_DOMAIN AS SOURCE_DOMAIN
|
INNER JOIN EC_DOMAIN AS SOURCE_DOMAIN
|
||||||
ON SOURCE_DOMAIN.ID=SOURCE_DOMAIN_ID
|
ON SOURCE_DOMAIN.ID=SOURCE_DOMAIN_ID
|
||||||
INNER JOIN EC_TOP_DOMAIN AS SOURCE_TOP_DOMAIN
|
|
||||||
ON SOURCE_TOP_DOMAIN.ID=SOURCE_DOMAIN.URL_TOP_DOMAIN_ID
|
|
||||||
INNER JOIN EC_DOMAIN AS DEST_DOMAIN
|
INNER JOIN EC_DOMAIN AS DEST_DOMAIN
|
||||||
ON DEST_DOMAIN.ID=DEST_DOMAIN_ID
|
ON DEST_DOMAIN.ID=DEST_DOMAIN_ID
|
||||||
INNER JOIN EC_TOP_DOMAIN AS DEST_TOP_DOMAIN
|
|
||||||
ON DEST_TOP_DOMAIN.ID=DEST_DOMAIN.URL_TOP_DOMAIN_ID
|
|
||||||
;
|
;
|
||||||
|
|
||||||
CREATE OR REPLACE VIEW EC_RELATED_LINKS_IN AS
|
CREATE OR REPLACE VIEW EC_RELATED_LINKS_IN AS
|
||||||
SELECT
|
SELECT
|
||||||
IN_URL.ID AS SRC_URL_ID,
|
IN_URL.ID AS SRC_URL_ID,
|
||||||
IN_URL.QUALITY_MEASURE AS SRC_URL_QUALITY,
|
OUT_URL.ID AS DEST_URL_ID
|
||||||
OUT_URL.ID AS DEST_URL_ID,
|
FROM EC_DOMAIN_LINK
|
||||||
OUT_URL.QUALITY_MEASURE AS DEST_URL_QUALITY
|
INNER JOIN EC_URL AS IN_URL ON IN_URL.DOMAIN_ID=EC_DOMAIN_LINK.SOURCE_DOMAIN_ID
|
||||||
FROM EC_URL AS IN_URL
|
INNER JOIN EC_URL AS OUT_URL ON OUT_URL.DOMAIN_ID=EC_DOMAIN_LINK.DEST_DOMAIN_ID
|
||||||
INNER JOIN EC_DOMAIN_LINK
|
WHERE IN_URL.VISITED AND IN_URL.STATE = 'ok'
|
||||||
ON IN_URL.DOMAIN_ID=EC_DOMAIN_LINK.SOURCE_DOMAIN_ID
|
AND OUT_URL.VISITED AND OUT_URL.STATE = 'ok';
|
||||||
INNER JOIN EC_URL AS OUT_URL
|
|
||||||
ON OUT_URL.DOMAIN_ID=EC_DOMAIN_LINK.DEST_DOMAIN_ID
|
|
||||||
WHERE IN_URL.VISITED=TRUE
|
|
||||||
AND IN_URL.DATA_HASH IS NOT NULL
|
|
||||||
AND OUT_URL.VISITED=TRUE
|
|
||||||
AND OUT_URL.DATA_HASH IS NOT NULL;
|
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS EC_DOMAIN_BACKLINKS (
|
|
||||||
ID INT PRIMARY KEY,
|
|
||||||
LINKEDNESS INT
|
|
||||||
);
|
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS EC_API_KEY (
|
CREATE TABLE IF NOT EXISTS EC_API_KEY (
|
||||||
LICENSE_KEY VARCHAR(255) UNIQUE,
|
LICENSE_KEY VARCHAR(255) UNIQUE,
|
||||||
@ -245,16 +172,9 @@ CREATE TABLE IF NOT EXISTS EC_API_KEY (
|
|||||||
RATE INT DEFAULT 10
|
RATE INT DEFAULT 10
|
||||||
);
|
);
|
||||||
|
|
||||||
CREATE INDEX IF NOT EXISTS EC_DOMAIN_RANK_INDEX ON EC_DOMAIN (RANK);
|
|
||||||
CREATE INDEX IF NOT EXISTS EC_DOMAIN_QUALITY_INDEX ON EC_DOMAIN (QUALITY,STATE);
|
|
||||||
|
|
||||||
CREATE INDEX IF NOT EXISTS EC_DOMAIN_INDEXED_INDEX ON EC_DOMAIN (INDEXED);
|
CREATE INDEX IF NOT EXISTS EC_DOMAIN_INDEXED_INDEX ON EC_DOMAIN (INDEXED);
|
||||||
CREATE INDEX IF NOT EXISTS EC_DOMAIN_ID_INDEXED_INDEX ON EC_DOMAIN (ID, INDEXED);
|
|
||||||
CREATE INDEX IF NOT EXISTS EC_DOMAIN_TRIO ON EC_DOMAIN (STATE, DOMAIN_ALIAS, INDEXED, QUALITY);
|
|
||||||
|
|
||||||
CREATE INDEX IF NOT EXISTS EC_URL_VISITED ON EC_URL (VISITED);
|
CREATE INDEX IF NOT EXISTS EC_URL_VISITED ON EC_URL (VISITED);
|
||||||
CREATE INDEX IF NOT EXISTS EC_URL_VISITED_STATE ON EC_URL (VISITED, STATE);
|
CREATE INDEX IF NOT EXISTS EC_DOMAIN_TOP_DOMAIN ON EC_DOMAIN (DOMAIN_TOP);
|
||||||
CREATE INDEX IF NOT EXISTS EC_URL_IP ON EC_URL (IP);
|
|
||||||
|
|
||||||
---;
|
---;
|
||||||
|
|
||||||
|
@ -43,7 +43,7 @@ public class TestUtil {
|
|||||||
logger.info("Running script {}", scriptFile);
|
logger.info("Running script {}", scriptFile);
|
||||||
try (var scriptStream = ClassLoader.getSystemResourceAsStream(scriptFile);
|
try (var scriptStream = ClassLoader.getSystemResourceAsStream(scriptFile);
|
||||||
var stmt = conn.createStatement()) {
|
var stmt = conn.createStatement()) {
|
||||||
for (String s : new String(scriptStream.readAllBytes()).split(";")) {
|
for (String s : new String(scriptStream.readAllBytes()).split("(;|---)")) {
|
||||||
if (!s.isBlank()) {
|
if (!s.isBlank()) {
|
||||||
try {
|
try {
|
||||||
Assertions.assertTrue(stmt.executeUpdate(s) >= 0);
|
Assertions.assertTrue(stmt.executeUpdate(s) >= 0);
|
||||||
|
@ -0,0 +1,51 @@
|
|||||||
|
package nu.marginalia.wmsa.edge.converting.loader;
|
||||||
|
|
||||||
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
|
import nu.marginalia.util.TestUtil;
|
||||||
|
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink;
|
||||||
|
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||||
|
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||||
|
import org.junit.jupiter.api.AfterEach;
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.Tag;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.junit.jupiter.api.parallel.Execution;
|
||||||
|
import org.junit.jupiter.api.parallel.ExecutionMode;
|
||||||
|
import org.junit.jupiter.api.parallel.ResourceAccessMode;
|
||||||
|
import org.junit.jupiter.api.parallel.ResourceLock;
|
||||||
|
|
||||||
|
import java.net.URISyntaxException;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||||
|
|
||||||
|
@ResourceLock(value = "mariadb", mode = ResourceAccessMode.READ_WRITE)
|
||||||
|
@Execution(ExecutionMode.SAME_THREAD)
|
||||||
|
@Tag("db")
|
||||||
|
class SqlLoadDomainLinksTest {
|
||||||
|
|
||||||
|
HikariDataSource dataSource;
|
||||||
|
LoaderData loaderData;
|
||||||
|
@BeforeEach
|
||||||
|
public void setUp() {
|
||||||
|
dataSource = TestUtil.getConnection();
|
||||||
|
TestUtil.evalScript(dataSource, "sql/edge-crawler-cache.sql");
|
||||||
|
|
||||||
|
var loadDomains = new SqlLoadDomains(dataSource);
|
||||||
|
loaderData = new LoaderData(10);
|
||||||
|
|
||||||
|
loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu"));
|
||||||
|
loadDomains.load(loaderData, new EdgeDomain[] { new EdgeDomain("www.marginalia.nu"), new EdgeDomain("memex.marginalia.nu") });
|
||||||
|
}
|
||||||
|
|
||||||
|
@AfterEach
|
||||||
|
public void tearDown() {
|
||||||
|
dataSource.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void loadDomainLinks() throws URISyntaxException {
|
||||||
|
var loader = new SqlLoadDomainLinks(dataSource);
|
||||||
|
loader.load(new DomainLink[] { new DomainLink(new EdgeDomain("www.marginalia.nu"), new EdgeDomain("memex.marginalia.nu")) });
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,54 @@
|
|||||||
|
package nu.marginalia.wmsa.edge.converting.loader;
|
||||||
|
|
||||||
|
import nu.marginalia.util.TestUtil;
|
||||||
|
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||||
|
import org.junit.jupiter.api.Tag;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.junit.jupiter.api.parallel.Execution;
|
||||||
|
import org.junit.jupiter.api.parallel.ExecutionMode;
|
||||||
|
import org.junit.jupiter.api.parallel.ResourceAccessMode;
|
||||||
|
import org.junit.jupiter.api.parallel.ResourceLock;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
|
|
||||||
|
@ResourceLock(value = "mariadb", mode = ResourceAccessMode.READ_WRITE)
|
||||||
|
@Execution(ExecutionMode.SAME_THREAD)
|
||||||
|
@Tag("db")
|
||||||
|
class SqlLoadDomainsTest {
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void loadDomain() {
|
||||||
|
|
||||||
|
try (var dataSource = TestUtil.getConnection()) {
|
||||||
|
TestUtil.evalScript(dataSource, "sql/edge-crawler-cache.sql");
|
||||||
|
|
||||||
|
var loadDomains = new SqlLoadDomains(dataSource);
|
||||||
|
var loaderData = new LoaderData(10);
|
||||||
|
|
||||||
|
loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu"));
|
||||||
|
loadDomains.load(loaderData, new EdgeDomain("www.marginalia.nu"));
|
||||||
|
|
||||||
|
assertTrue(loaderData.getDomainId(new EdgeDomain("www.marginalia.nu")) >= 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void loadDomains() {
|
||||||
|
|
||||||
|
try (var dataSource = TestUtil.getConnection()) {
|
||||||
|
TestUtil.evalScript(dataSource, "sql/edge-crawler-cache.sql");
|
||||||
|
|
||||||
|
var loadDomains = new SqlLoadDomains(dataSource);
|
||||||
|
var loaderData = new LoaderData(10);
|
||||||
|
|
||||||
|
loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu"));
|
||||||
|
loadDomains.load(loaderData, new EdgeDomain[] { new EdgeDomain("www.marginalia.nu"), new EdgeDomain("memex.marginalia.nu") });
|
||||||
|
|
||||||
|
assertTrue(loaderData.getDomainId(new EdgeDomain("www.marginalia.nu")) >= 0);
|
||||||
|
assertTrue(loaderData.getDomainId(new EdgeDomain("memex.marginalia.nu")) >= 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,68 @@
|
|||||||
|
package nu.marginalia.wmsa.edge.converting.loader;
|
||||||
|
|
||||||
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
|
import nu.marginalia.util.TestUtil;
|
||||||
|
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument;
|
||||||
|
import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
|
||||||
|
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||||
|
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||||
|
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
|
||||||
|
import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState;
|
||||||
|
import org.junit.jupiter.api.AfterEach;
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.Tag;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.junit.jupiter.api.parallel.Execution;
|
||||||
|
import org.junit.jupiter.api.parallel.ExecutionMode;
|
||||||
|
import org.junit.jupiter.api.parallel.ResourceAccessMode;
|
||||||
|
import org.junit.jupiter.api.parallel.ResourceLock;
|
||||||
|
|
||||||
|
import java.net.URISyntaxException;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
@ResourceLock(value = "mariadb", mode = ResourceAccessMode.READ_WRITE)
|
||||||
|
@Execution(ExecutionMode.SAME_THREAD)
|
||||||
|
@Tag("db")
|
||||||
|
class SqlLoadProcessedDocumentTest {
|
||||||
|
HikariDataSource dataSource;
|
||||||
|
LoaderData loaderData;
|
||||||
|
@BeforeEach
|
||||||
|
public void setUp() throws URISyntaxException {
|
||||||
|
dataSource = TestUtil.getConnection();
|
||||||
|
TestUtil.evalScript(dataSource, "sql/edge-crawler-cache.sql");
|
||||||
|
|
||||||
|
var loadDomains = new SqlLoadDomains(dataSource);
|
||||||
|
var loadUrls = new SqlLoadUrls(dataSource);
|
||||||
|
|
||||||
|
loaderData = new LoaderData(10);
|
||||||
|
|
||||||
|
loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu"));
|
||||||
|
loadDomains.load(loaderData, new EdgeDomain("www.marginalia.nu"));
|
||||||
|
|
||||||
|
loadUrls.load(loaderData, new EdgeUrl[]{new EdgeUrl("https://www.marginalia.nu/")});
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@AfterEach
|
||||||
|
public void tearDown() {
|
||||||
|
dataSource.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void loadProcessedDocument() throws URISyntaxException {
|
||||||
|
var loader = new SqlLoadProcessedDocument(dataSource);
|
||||||
|
loader.load(loaderData, List.of(new LoadProcessedDocument(
|
||||||
|
new EdgeUrl("https://www.marginalia.nu/"),
|
||||||
|
EdgeUrlState.OK,
|
||||||
|
"TITLE",
|
||||||
|
"DESCR",
|
||||||
|
HtmlFeature.encode(Set.of(HtmlFeature.AFFILIATE_LINK)),
|
||||||
|
EdgeHtmlStandard.HTML5,
|
||||||
|
100,
|
||||||
|
12345,
|
||||||
|
-5
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,52 @@
|
|||||||
|
package nu.marginalia.wmsa.edge.converting.loader;
|
||||||
|
|
||||||
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
|
import nu.marginalia.util.TestUtil;
|
||||||
|
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink;
|
||||||
|
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||||
|
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
||||||
|
import org.junit.jupiter.api.AfterEach;
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.Tag;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.junit.jupiter.api.parallel.Execution;
|
||||||
|
import org.junit.jupiter.api.parallel.ExecutionMode;
|
||||||
|
import org.junit.jupiter.api.parallel.ResourceAccessMode;
|
||||||
|
import org.junit.jupiter.api.parallel.ResourceLock;
|
||||||
|
|
||||||
|
import java.net.URISyntaxException;
|
||||||
|
|
||||||
|
@ResourceLock(value = "mariadb", mode = ResourceAccessMode.READ_WRITE)
|
||||||
|
@Execution(ExecutionMode.SAME_THREAD)
|
||||||
|
@Tag("db")
|
||||||
|
class SqlLoadProcessedDomainTest {
|
||||||
|
HikariDataSource dataSource;
|
||||||
|
LoaderData loaderData;
|
||||||
|
@BeforeEach
|
||||||
|
public void setUp() {
|
||||||
|
dataSource = TestUtil.getConnection();
|
||||||
|
TestUtil.evalScript(dataSource, "sql/edge-crawler-cache.sql");
|
||||||
|
|
||||||
|
var loadDomains = new SqlLoadDomains(dataSource);
|
||||||
|
loaderData = new LoaderData(10);
|
||||||
|
|
||||||
|
loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu"));
|
||||||
|
loadDomains.load(loaderData, new EdgeDomain[]{ new EdgeDomain("www.marginalia.nu"), new EdgeDomain("memex.marginalia.nu") });
|
||||||
|
}
|
||||||
|
|
||||||
|
@AfterEach
|
||||||
|
public void tearDown() {
|
||||||
|
dataSource.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void loadProcessedDomain() {
|
||||||
|
var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource));
|
||||||
|
loader.load(loaderData, new EdgeDomain("www.marginalia.nu"), EdgeDomainIndexingState.BLOCKED, "127.0.0.1");
|
||||||
|
}
|
||||||
|
@Test
|
||||||
|
public void loadDomainAlias() {
|
||||||
|
var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource));
|
||||||
|
loader.loadAlias(loaderData, new DomainLink(new EdgeDomain("memex.marginalia.nu"), new EdgeDomain("www.marginalia.nu")));
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,49 @@
|
|||||||
|
package nu.marginalia.wmsa.edge.converting.loader;
|
||||||
|
|
||||||
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
|
import nu.marginalia.util.TestUtil;
|
||||||
|
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||||
|
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||||
|
import org.junit.jupiter.api.AfterEach;
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.Tag;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.junit.jupiter.api.parallel.Execution;
|
||||||
|
import org.junit.jupiter.api.parallel.ExecutionMode;
|
||||||
|
import org.junit.jupiter.api.parallel.ResourceAccessMode;
|
||||||
|
import org.junit.jupiter.api.parallel.ResourceLock;
|
||||||
|
|
||||||
|
import java.net.URISyntaxException;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||||
|
|
||||||
|
@ResourceLock(value = "mariadb", mode = ResourceAccessMode.READ_WRITE)
|
||||||
|
@Execution(ExecutionMode.SAME_THREAD)
|
||||||
|
@Tag("db")
|
||||||
|
class SqlLoadUrlsTest {
|
||||||
|
HikariDataSource dataSource;
|
||||||
|
LoaderData loaderData;
|
||||||
|
@BeforeEach
|
||||||
|
public void setUp() {
|
||||||
|
dataSource = TestUtil.getConnection();
|
||||||
|
TestUtil.evalScript(dataSource, "sql/edge-crawler-cache.sql");
|
||||||
|
|
||||||
|
var loadDomains = new SqlLoadDomains(dataSource);
|
||||||
|
loaderData = new LoaderData(10);
|
||||||
|
|
||||||
|
loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu"));
|
||||||
|
loadDomains.load(loaderData, new EdgeDomain("www.marginalia.nu"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@AfterEach
|
||||||
|
public void tearDown() {
|
||||||
|
dataSource.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void loadUrl() throws URISyntaxException {
|
||||||
|
var loadUrls = new SqlLoadUrls(dataSource);
|
||||||
|
loadUrls.load(loaderData, new EdgeUrl[] { new EdgeUrl("https://www.marginalia.nu/") });
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user