WIP: Refactored ranking algorithms to separate database code from ranking code

This commit is contained in:
vlofgren 2022-06-08 16:18:00 +02:00
parent 026ba714b5
commit 5e472fe121
16 changed files with 488 additions and 512 deletions

View File

@ -1,15 +1,11 @@
package nu.marginalia.util.ranking;
import com.zaxxer.hikari.HikariDataSource;
import java.io.IOException;
public class BetterReversePageRank extends RankingAlgorithm {
public BetterReversePageRank(HikariDataSource dataSource, String... origins) {
super(dataSource, origins);
public BetterReversePageRank(RankingDomainFetcher domains, String... origins) {
super(domains, origins);
}
@Override

View File

@ -1,14 +1,10 @@
package nu.marginalia.util.ranking;
import com.zaxxer.hikari.HikariDataSource;
import java.io.IOException;
public class BetterStandardPageRank extends RankingAlgorithm {
public BetterStandardPageRank(HikariDataSource dataSource, String... origins) {
super(dataSource, origins);
public BetterStandardPageRank(RankingDomainFetcher domains, String... origins) {
super(domains, origins);
}
@Override

View File

@ -1,15 +1,11 @@
package nu.marginalia.util.ranking;
import com.zaxxer.hikari.HikariDataSource;
import java.io.IOException;
public class BuggyReversePageRank extends RankingAlgorithm {
public BuggyReversePageRank(HikariDataSource dataSource, String... origins) {
super(dataSource, origins);
public BuggyReversePageRank(RankingDomainFetcher domains, String... origins) {
super(domains, origins);
}
@Override

View File

@ -1,14 +1,10 @@
package nu.marginalia.util.ranking;
import com.zaxxer.hikari.HikariDataSource;
import java.io.IOException;
public class BuggyStandardPageRank extends RankingAlgorithm {
public BuggyStandardPageRank(HikariDataSource dataSource, String... origins) {
super(dataSource, origins);
public BuggyStandardPageRank(RankingDomainFetcher domains, String... origins) {
super(domains, origins);
}
@Override

View File

@ -1,35 +1,26 @@
package nu.marginalia.util.ranking;
import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList;
import gnu.trove.map.hash.TIntIntHashMap;
import gnu.trove.map.hash.TIntObjectHashMap;
import gnu.trove.set.hash.TIntHashSet;
import it.unimi.dsi.fastutil.ints.IntComparator;
import lombok.AllArgsConstructor;
import lombok.Data;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.sql.SQLException;
import java.util.*;
import java.util.function.IntToDoubleFunction;
import java.util.stream.IntStream;
import it.unimi.dsi.fastutil.ints.IntArrays;
public abstract class RankingAlgorithm {
final TIntObjectHashMap<DomainData> domainsById = new TIntObjectHashMap<>();
final TIntObjectHashMap<RankingDomainData> domainsById = new TIntObjectHashMap<>();
final TIntIntHashMap domainIndexToId = new TIntIntHashMap();
final TIntIntHashMap domainIdToIndex = new TIntIntHashMap();
private final TIntHashSet spamDomains;
private final HikariDataSource dataSource;
TIntArrayList[] linkDataSrc2Dest;
TIntArrayList[] linkDataDest2Src;
@ -41,10 +32,14 @@ public abstract class RankingAlgorithm {
private static final boolean getNames = true;
private final Logger logger = LoggerFactory.getLogger(getClass());
private RankingDomainFetcher domains;
public static void main(String... args) throws IOException {
var rpr = new BuggyReversePageRank(new DatabaseModule().provideConnection(), "wiki.xxiivv.com");
var spr = new BuggyStandardPageRank(new DatabaseModule().provideConnection(), "memex.marginalia.nu");
var ds = new DatabaseModule().provideConnection();
var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
var rpr = new BuggyReversePageRank(domains, "wiki.xxiivv.com");
var spr = new BuggyStandardPageRank(domains, "memex.marginalia.nu");
var rankVector = spr.pageRankVector();
var norm = rankVector.norm();
@ -61,164 +56,97 @@ public abstract class RankingAlgorithm {
return domainsById.get(id).peripheral;
}
public RankingAlgorithm(HikariDataSource dataSource, String... origins) {
this.dataSource = dataSource;
var blacklist = new EdgeDomainBlacklistImpl(dataSource);
public RankingAlgorithm(RankingDomainFetcher domains, String... origins) {
this.domains = domains;
spamDomains = blacklist.getSpamDomains();
originDomains.addAll(Arrays.asList(origins));
try (var conn = dataSource.getConnection()) {
domains.getDomains(domainData -> {
int id = domainData.id;
String s;
if (getNames) {
s = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
domainsById.put(id, domainData);
domainIndexToId.put(domainIndexToId.size(), id);
domainIdToIndex.put(id, domainIdToIndex.size());
});
linkDataSrc2Dest = new TIntArrayList[domainIndexToId.size()];
linkDataDest2Src = new TIntArrayList[domainIndexToId.size()];
domains.eachDomainLink((src, dst) -> {
if (src == dst) return;
if (domainsById.contains(src) && domainsById.contains(dst)) {
int srcIdx = domainIdToIndex.get(src);
int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias());
if (linkDataSrc2Dest[srcIdx] == null) {
linkDataSrc2Dest[srcIdx] = new TIntArrayList();
}
linkDataSrc2Dest[srcIdx].add(dstIdx);
if (linkDataDest2Src[dstIdx] == null) {
linkDataDest2Src[dstIdx] = new TIntArrayList();
}
linkDataDest2Src[dstIdx].add(srcIdx);
}
});
for (var namePattern : this.originDomains) {
domains.domainsByPattern(namePattern, i -> {
int ival = domainIdToIndex.get(i);
if (ival != domainIdToIndex.getNoEntryValue() || domainIndexToId.get(0) == i) {
originDomainIds.add(ival);
}
else {
s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
logger.debug("No value for {}", i);
}
try (var stmt = conn.prepareStatement(s)) {
stmt.setFetchSize(10000);
var rsp = stmt.executeQuery();
while (rsp.next()) {
int id = rsp.getInt(1);
if (!spamDomains.contains(id)) {
domainsById.put(id, new DomainData(id, rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5), false));
domainIndexToId.put(domainIndexToId.size(), id);
domainIdToIndex.put(id, domainIdToIndex.size());
}
}
}
linkDataSrc2Dest = new TIntArrayList[domainIndexToId.size()];
linkDataDest2Src = new TIntArrayList[domainIndexToId.size()];
try (var stmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK")) {
stmt.setFetchSize(10000);
var rsp = stmt.executeQuery();
while (rsp.next()) {
int src = rsp.getInt(1);
int dst = rsp.getInt(2);
if (src == dst) continue;
if (domainsById.contains(src) && domainsById.contains(dst)) {
int srcIdx = domainIdToIndex.get(src);
int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias());
if (linkDataSrc2Dest[srcIdx] == null) {
linkDataSrc2Dest[srcIdx] = new TIntArrayList();
}
linkDataSrc2Dest[srcIdx].add(dstIdx);
if (linkDataDest2Src[dstIdx] == null) {
linkDataDest2Src[dstIdx] = new TIntArrayList();
}
linkDataDest2Src[dstIdx].add(srcIdx);
}
}
}
try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME LIKE ?")) {
for (var seed : this.originDomains) {
stmt.setString(1, seed);
var rsp = stmt.executeQuery();
while (rsp.next()) {
int i = rsp.getInt(1);
int ival = domainIdToIndex.get(i);
if (ival != domainIdToIndex.getNoEntryValue() || domainIndexToId.get(0) == i) {
originDomainIds.add(ival);
}
else {
logger.debug("No value for {}", i);
}
}
logger.debug("{} -> {}", seed, originDomainIds.size());
}
}
logger.info("Origin Domains: {}", originDomainIds.size());
} catch (SQLException throwables) {
logger.error("SQL error", throwables);
});
}
logger.info("Origin Domains: {}", originDomainIds.size());
}
public void addPeripheralNodes(boolean includeErrorStates) {
public void addPeripheralNodes() {
int newNodesIdxCutoff = domainIdToIndex.size();
logger.info("Inserting peripheral nodes");
try (var conn = dataSource.getConnection()) {
String s;
if (getNames) {
s = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
domains.getPeripheralDomains(domainData -> {
int id = domainData.id;
if (domainsById.put(id, domainData) == null) { // true if id was not already present
domainIndexToId.put(domainIndexToId.size(), id);
domainIdToIndex.put(id, domainIdToIndex.size());
}
else {
s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
}
try (var stmt = conn.prepareStatement(s)) {
stmt.setFetchSize(10000);
var rsp = stmt.executeQuery();
});
while (rsp.next()) {
int id = rsp.getInt(1);
linkDataSrc2Dest = Arrays.copyOf(linkDataSrc2Dest, domainIndexToId.size());
linkDataDest2Src = Arrays.copyOf(linkDataDest2Src, domainIndexToId.size());
if (!spamDomains.contains(id)) {
domainsById.put(id, new DomainData(id, rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5), true));
domains.eachDomainLink((src, dst) -> {
if (src == dst) return;
domainIndexToId.put(domainIndexToId.size(), id);
domainIdToIndex.put(id, domainIdToIndex.size());
}
if (domainsById.contains(src) && domainsById.contains(dst)) {
int srcIdx = domainIdToIndex.get(src);
int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias());
// This looks like a bug, but it improves the results
if (srcIdx < newNodesIdxCutoff || dstIdx < newNodesIdxCutoff)
return;
if (linkDataSrc2Dest[srcIdx] == null) {
linkDataSrc2Dest[srcIdx] = new TIntArrayList();
}
linkDataSrc2Dest[srcIdx].add(dstIdx);
}
linkDataSrc2Dest = Arrays.copyOf(linkDataSrc2Dest, domainIndexToId.size());
linkDataDest2Src = Arrays.copyOf(linkDataDest2Src, domainIndexToId.size());
try (var stmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK")) {
stmt.setFetchSize(10000);
var rsp = stmt.executeQuery();
while (rsp.next()) {
int src = rsp.getInt(1);
int dst = rsp.getInt(2);
if (src == dst) continue;
if (domainsById.contains(src) && domainsById.contains(dst)) {
int srcIdx = domainIdToIndex.get(src);
int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias());
// This looks like a bug, but it improves the results
if (srcIdx < newNodesIdxCutoff || dstIdx < newNodesIdxCutoff)
continue;
if (linkDataSrc2Dest[srcIdx] == null) {
linkDataSrc2Dest[srcIdx] = new TIntArrayList();
}
linkDataSrc2Dest[srcIdx].add(dstIdx);
if (linkDataDest2Src[dstIdx] == null) {
linkDataDest2Src[dstIdx] = new TIntArrayList();
}
linkDataDest2Src[dstIdx].add(srcIdx);
}
if (linkDataDest2Src[dstIdx] == null) {
linkDataDest2Src[dstIdx] = new TIntArrayList();
}
linkDataDest2Src[dstIdx].add(srcIdx);
}
} catch (SQLException throwables) {
logger.error("SQL error", throwables);
}
});
logger.info("Peripheral nodes inserted {} -> {}", newNodesIdxCutoff, domainIdToIndex.size());
}
@ -271,14 +199,14 @@ public abstract class RankingAlgorithm {
return rank.getRanking(resultCount);
}
public TIntList pageRankWithPeripheralNodes(int resultCount, boolean includeErrorStates) {
public TIntList pageRankWithPeripheralNodes(int resultCount) {
RankVector rank = new RankVector(1.d / domainsById.size());
int iter_max = 100;
for (int i = 0; i < iter_max; i++) {
if (i == iter_max-1) {
addPeripheralNodes(includeErrorStates);
addPeripheralNodes();
}
RankVector newRank = createNewRankVector(rank);
@ -323,7 +251,7 @@ public abstract class RankingAlgorithm {
abstract RankVector createNewRankVector(RankVector rank);
public boolean includeInRanking(DomainData data) {
public boolean includeInRanking(RankingDomainData data) {
if (data.isAlias())
return false;
if (data.isSpecial())
@ -445,32 +373,4 @@ public abstract class RankingAlgorithm {
}
}
/**
 * Per-domain record held by the ranking algorithm.
 *
 * Lombok's {@code @Data} and {@code @AllArgsConstructor} generate the accessors
 * and a constructor taking the fields in declaration order.
 */
@Data
@AllArgsConstructor
static class DomainData {
    public final int id;
    public final String name;
    private int alias;
    private EdgeDomainIndexingState state;
    public final int knownUrls;
    public boolean peripheral;

    /** Returns the alias id when one is set (non-zero), otherwise this domain's own id. */
    public int resolveAlias() {
        return (alias != 0) ? alias : id;
    }

    /** A non-zero alias marks this domain as an alias of another domain. */
    public boolean isAlias() {
        return 0 != alias;
    }

    /** True when the indexing state is {@code SPECIAL}. */
    public boolean isSpecial() {
        return state == EdgeDomainIndexingState.SPECIAL;
    }

    /** True when the indexing state is {@code SOCIAL_MEDIA}. */
    public boolean isSocialMedia() {
        return state == EdgeDomainIndexingState.SOCIAL_MEDIA;
    }
}
}

View File

@ -0,0 +1,33 @@
package nu.marginalia.util.ranking;
import lombok.AllArgsConstructor;
import lombok.Data;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
/**
 * Per-domain record handed from the domain fetcher to the ranking algorithm.
 *
 * Lombok's {@code @Data} and {@code @AllArgsConstructor} generate the accessors
 * and a constructor taking the fields in declaration order.
 */
@Data
@AllArgsConstructor
class RankingDomainData {
    public final int id;
    public final String name;
    private int alias;
    private EdgeDomainIndexingState state;
    public final int knownUrls;
    public boolean peripheral;

    /** Returns the alias id when one is set (non-zero), otherwise this domain's own id. */
    public int resolveAlias() {
        return (alias != 0) ? alias : id;
    }

    /** A non-zero alias marks this domain as an alias of another domain. */
    public boolean isAlias() {
        return 0 != alias;
    }

    /** True when the indexing state is {@code SPECIAL}. */
    public boolean isSpecial() {
        return state == EdgeDomainIndexingState.SPECIAL;
    }

    /** True when the indexing state is {@code SOCIAL_MEDIA}. */
    public boolean isSocialMedia() {
        return state == EdgeDomainIndexingState.SOCIAL_MEDIA;
    }
}

View File

@ -0,0 +1,105 @@
package nu.marginalia.util.ranking;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.sql.SQLException;
import java.util.function.Consumer;
import java.util.function.IntConsumer;
/**
 * Database access used by the ranking algorithms: streams domains, domain
 * links and pattern-matched domain ids to caller-supplied consumers.
 *
 * SQL failures are logged and end the current iteration early; they are not
 * propagated to the caller, so a consumer may see only part of the data.
 */
public class RankingDomainFetcher {
    private final HikariDataSource dataSource;
    private final EdgeDomainBlacklistImpl blacklist;
    private final Logger logger = LoggerFactory.getLogger(getClass());

    /** When false, an empty string is selected in place of DOMAIN_NAME. */
    private final boolean getNames = false;

    @Inject
    public RankingDomainFetcher(HikariDataSource dataSource, EdgeDomainBlacklistImpl blacklist) {
        this.dataSource = dataSource;
        this.blacklist = blacklist;
    }

    /**
     * Feeds every non-blacklisted domain matched by the core-domain query
     * (inner join against EC_DOMAIN_LINK as link source) to {@code consumer}.
     * The records are created with {@code peripheral == false}.
     */
    public void getDomains(Consumer<RankingDomainData> consumer) {
        String query;
        if (getNames) {
            query = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
        }
        else {
            query = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
        }

        getDomains(query, false, consumer);
    }

    /**
     * Feeds every non-blacklisted domain matched by the peripheral-domain query
     * (left join against EC_DOMAIN_LINK with no matching link row) to
     * {@code consumer}. The records are created with {@code peripheral == true}.
     */
    public void getPeripheralDomains(Consumer<RankingDomainData> consumer) {
        String query;
        if (getNames) {
            query = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
        }
        else {
            query = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
        }

        // Peripheral domains must carry the flag; it was previously hard-coded to false.
        getDomains(query, true, consumer);
    }

    /**
     * Runs {@code query} and passes one record per non-blacklisted row to
     * {@code consumer}.
     *
     * @param query      SQL yielding (id, name, alias, state, knownUrls) columns
     * @param peripheral value stored in the {@code peripheral} field of each record
     * @param consumer   receiver of the records
     */
    private void getDomains(String query, boolean peripheral, Consumer<RankingDomainData> consumer) {
        try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(query)) {
            stmt.setFetchSize(10000);
            try (var rsp = stmt.executeQuery()) {
                while (rsp.next()) {
                    int id = rsp.getInt(1);
                    if (!blacklist.isBlacklisted(id)) {
                        consumer.accept(new RankingDomainData(id, rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5), peripheral));
                    }
                }
            }
        }
        catch (SQLException ex) {
            logger.error("Failed to fetch domains", ex);
        }
    }

    /**
     * Passes every (source, destination) row of EC_DOMAIN_LINK to {@code consumer}.
     * No filtering is applied here.
     */
    public void eachDomainLink(DomainLinkConsumer consumer) {
        try (var conn = dataSource.getConnection();
             var stmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK"))
        {
            stmt.setFetchSize(10000);
            try (var rsp = stmt.executeQuery()) {
                while (rsp.next()) {
                    int src = rsp.getInt(1);
                    int dst = rsp.getInt(2);

                    consumer.accept(src, dst);
                }
            }
        }
        catch (SQLException ex) {
            logger.error("Failed to fetch domain links", ex);
        }
    }

    /**
     * Passes the id of every EC_DOMAIN row whose DOMAIN_NAME matches the SQL
     * LIKE {@code pattern} to {@code idConsumer}.
     */
    public void domainsByPattern(String pattern, IntConsumer idConsumer) {
        try (var conn = dataSource.getConnection();
             var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME LIKE ?")) {
            stmt.setString(1, pattern);
            try (var rsp = stmt.executeQuery()) {
                while (rsp.next()) {
                    idConsumer.accept(rsp.getInt(1));
                }
            }
        }
        catch (SQLException ex) {
            logger.error("Failed to fetch domains by pattern", ex);
        }
    }

    /** Callback receiving one domain link as a pair of domain ids. */
    @FunctionalInterface
    public interface DomainLinkConsumer {
        void accept(int from, int to);
    }
}

View File

@ -3,12 +3,13 @@ package nu.marginalia.util.ranking.tool;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.util.ranking.BuggyStandardPageRank;
import nu.marginalia.util.ranking.RankingDomainFetcher;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
import org.mariadb.jdbc.Driver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.sql.SQLException;
import java.util.HashSet;
import java.util.Set;
@ -43,12 +44,14 @@ public class UpdateDomainRanksTool {
var uploader = new Thread(() -> uploadThread(conn), "Uploader");
logger.info("Ranking");
var spr = new BuggyStandardPageRank(new DatabaseModule().provideConnection(),"memex.marginalia.nu");
var ds = new DatabaseModule().provideConnection();
var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
var spr = new BuggyStandardPageRank(domains, "memex.marginalia.nu");
rankMax = spr.size()*2;
uploader.start();
spr.pageRankWithPeripheralNodes(rankMax, false).forEach(i -> {
spr.pageRankWithPeripheralNodes(rankMax).forEach(i -> {
try {
uploadQueue.put(i);
} catch (InterruptedException e) {

View File

@ -3,12 +3,13 @@ package nu.marginalia.util.ranking.tool;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.util.ranking.BetterReversePageRank;
import nu.marginalia.util.ranking.RankingDomainFetcher;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
import org.mariadb.jdbc.Driver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.sql.SQLException;
import java.util.HashSet;
import java.util.Set;
@ -45,7 +46,9 @@ public class UpdateDomainRanksTool2 {
logger.info("Ranking");
// "memex.marginalia.nu", "wiki.xxiivv.com", "bikobatanari.art", "sadgrl.online", "lileks.com",
// "www.rep.routledge.com", "www.personal.kent.edu", "xroads.virginia.edu", "classics.mit.edu", "faculty.washington.edu", "monadnock.net"
var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
var ds = new DatabaseModule().provideConnection();
var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
var rpr = new BetterReversePageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
// var rpr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), "%edu");
// var spr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), "memex.marginalia.nu");
@ -58,7 +61,7 @@ public class UpdateDomainRanksTool2 {
rankMax = rpr.size();
rpr.pageRankWithPeripheralNodes(rankMax, false).forEach(i -> {
rpr.pageRankWithPeripheralNodes(rankMax).forEach(i -> {
try {
uploadQueue.put(i);
} catch (InterruptedException e) {

View File

@ -6,6 +6,7 @@ import com.google.common.hash.Hashing;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.util.ranking.RankingDomainFetcher;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
@ -73,10 +74,12 @@ public class CrawlJobExtractorPageRankMain {
Gson gson = new GsonBuilder().create();
var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
var ds = new DatabaseModule().provideConnection();
var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
var rpr = new BetterReversePageRank(domains, "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
rpr.setMaxKnownUrls(750);
var targetDomainIds = rpr.pageRankWithPeripheralNodes(rpr.size(), false);
var targetDomainIds = rpr.pageRankWithPeripheralNodes(rpr.size());
try (var out = new PrintWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outFile.toFile()))))) {
final var extractor = new CrawlJobExtractorPageRankMain(new DatabaseModule().provideConnection());

View File

@ -13,33 +13,14 @@ import java.util.Optional;
/**
 * Data access interface for domain and URL information.
 * The default implementation is EdgeDataStoreDaoImpl, as declared by the
 * {@code @ImplementedBy} annotation.
 */
@ImplementedBy(EdgeDataStoreDaoImpl.class)
public interface EdgeDataStoreDao {
/** Reports whether the given domain is blacklisted. */
boolean isBlacklisted(EdgeDomain domain);
/** Returns the id of the given domain. */
EdgeId<EdgeDomain> getDomainId(EdgeDomain domain);
// NOTE(review): parameter name "backlist" is presumably a typo for "blacklist".
/** Returns browse results for domains neighboring {@code domainId}; presumably at most {@code count} — confirm. */
List<BrowseResult> getDomainNeighborsAdjacent(EdgeId<EdgeDomain> domainId, EdgeDomainBlacklist backlist, int count);
// NOTE(review): parameter name "backlist" is presumably a typo for "blacklist".
/** Returns browse results for randomly chosen domains; presumably at most {@code count} — confirm. */
List<BrowseResult> getRandomDomains(int count, EdgeDomainBlacklist backlist);
/** Returns details for the URLs with the given ids. */
List<EdgeUrlDetails> getUrlDetailsMulti(List<EdgeId<EdgeUrl>> ids);
/** Returns the domain with the given id. */
EdgeDomain getDomain(EdgeId<EdgeDomain> id);
// NOTE(review): the return type is an EdgeUrl id although the argument is a domain name — confirm intent.
/** Attempts to resolve a loosely specified domain name to an id; empty when nothing matches. */
Optional<EdgeId<EdgeUrl>> resolveAmbiguousDomain(String name);
/** Returns the number of known pages for the domain. */
int getPagesKnown(EdgeId<EdgeDomain> domainId);
/** Returns the number of visited pages for the domain. */
int getPagesVisited(EdgeId<EdgeDomain> domainId);
/** Returns the number of indexed pages for the domain. */
int getPagesIndexed(EdgeId<EdgeDomain> domainId);
/** Returns the number of links pointing at the domain. */
int getIncomingLinks(EdgeId<EdgeDomain> domainId);
/** Returns the number of links originating from the domain. */
int getOutboundLinks(EdgeId<EdgeDomain> domainId);
/** Returns the quality value of the domain. */
double getDomainQuality(EdgeId<EdgeDomain> domainId);
/** Returns the indexing state of the domain. */
EdgeDomainIndexingState getDomainState(EdgeId<EdgeDomain> domainId);
/** Returns domains that link to the given domain. */
List<EdgeDomain> getLinkingDomains(EdgeId<EdgeDomain> domainId);
/** Returns the rank of the domain. */
double getRank(EdgeId<EdgeDomain> domainId);
}

View File

@ -33,7 +33,6 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
private final Cache<EdgeUrl, EdgeId<EdgeUrl>> urlIdCache = CacheBuilder.newBuilder().maximumSize(100_000).build();
private final Cache<EdgeDomain, EdgeId<EdgeDomain>> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
private static final String DEFAULT_PROTOCOL = "http";
public static double QUALITY_LOWER_BOUND_CUTOFF = -15.;
@Inject
public EdgeDataStoreDaoImpl(HikariDataSource dataSource)
@ -48,23 +47,6 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
domainIdCache.invalidateAll();
}
/**
 * Reports whether EC_DOMAIN_BLACKLIST contains a row whose URL_DOMAIN equals
 * the given domain's name. Any exception is rethrown via {@code @SneakyThrows}.
 */
@SneakyThrows
@Override
public boolean isBlacklisted(EdgeDomain domain) {
    final String sql = "SELECT ID FROM EC_DOMAIN_BLACKLIST WHERE URL_DOMAIN=?";

    try (var conn = dataSource.getConnection();
         var query = conn.prepareStatement(sql)) {
        query.setString(1, domain.domain);
        var rows = query.executeQuery();
        // A single matching row is enough to consider the domain blacklisted.
        return rows.next();
    }
}
@SneakyThrows
@Override
public EdgeId<EdgeDomain> getDomainId(EdgeDomain domain) {
@ -108,13 +90,12 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
try (var stmt = connection.prepareStatement(
"""
SELECT ID, URL,
SELECT ID, URL,
TITLE, DESCRIPTION,
WORDS_TOTAL, FORMAT, FEATURES,
WORDS_TOTAL, FORMAT, FEATURES,
IP, DOMAIN_STATE, DATA_HASH
FROM EC_URL_VIEW WHERE ID IN
""" + idString)) {
// "SELECT ID,URL_PROTO,URL_DOMAIN,URL_PORT,URL_PATH,TITLE,DESCRIPTION,URL_QUALITY_MEASURE,DOMAIN_QUALITY_MEASURE,IFNULL(EC_DOMAIN_LINK_AGGREGATE.LINKS,1),WORDS_TOTAL,FORMAT,FEATURES,\"\",QUALITY_RAW,DOMAIN_STATE,DATA_HASH FROM EC_URL_VIEW LEFT JOIN EC_DOMAIN_LINK_AGGREGATE ON EC_DOMAIN_LINK_AGGREGATE.DOMAIN_ID=EC_URL_VIEW.DOMAIN_ID WHERE ID IN " + idString)) {
stmt.setFetchSize(ids.size());
var rsp = stmt.executeQuery();
@ -125,7 +106,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
rsp.getString(4), // description
-5, // quality
rsp.getInt(5), // wordsTotal
rsp.getString(6), // foramt
rsp.getString(6), // format
rsp.getInt(7), // features
rsp.getString(8), // ip
EdgeDomainIndexingState.valueOf(rsp.getString(9)), // domainState
@ -179,9 +160,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
String domain = rsp.getString(2);
if (!blacklist.isBlacklisted(id)) {
var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/");
domains.add(new BrowseResult(url, id));
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id));
}
}
}
@ -210,9 +189,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
String domain = rsp.getString(2);
if (!blacklist.isBlacklisted(id)) {
var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/");
domains.add(new BrowseResult(url, id));
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id));
}
}
}
@ -244,9 +221,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
String domain = rsp.getString(2);
if (!blacklist.isBlacklisted(id)) {
var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/");
domains.add(new BrowseResult(url, id));
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id));
}
}
}
@ -262,7 +237,15 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
@Override
public List<BrowseResult> getRandomDomains(int count, EdgeDomainBlacklist blacklist) {
final String q = "SELECT DOMAIN_ID,DOMAIN_NAME FROM EC_RANDOM_DOMAINS INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE STATE<2 AND DOMAIN_ALIAS IS NULL ORDER BY RAND() LIMIT ?";
final String q = """
SELECT DOMAIN_ID, DOMAIN_NAME
FROM EC_RANDOM_DOMAINS
INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID
WHERE STATE<2
AND DOMAIN_ALIAS IS NULL
ORDER BY RAND()
LIMIT ?
""";
List<BrowseResult> domains = new ArrayList<>(count);
try (var conn = dataSource.getConnection()) {
try (var stmt = conn.prepareStatement(q)) {
@ -273,9 +256,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
String domain = rsp.getString(2);
if (!blacklist.isBlacklisted(id)) {
var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/");
domains.add(new BrowseResult(url, id));
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id));
}
}
}
@ -302,223 +283,4 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
}
}
/**
 * Resolves a loosely specified domain name to an id.
 *
 * The name is looked up in EC_DOMAIN as given, then with the prefixes
 * {@code https://}, {@code http://}, {@code https://www.} and
 * {@code http://www.}, in that order; the first matching row wins. The
 * returned id is the row's DOMAIN_ALIAS when set, otherwise its own ID.
 *
 * @param name the domain name to resolve
 * @return the resolved id, or empty when nothing matched or a SQL error occurred
 */
@Override
public Optional<EdgeId<EdgeUrl>> resolveAmbiguousDomain(String name) {
    // Candidate spellings, tried in order.
    final String[] prefixes = { "", "https://", "http://", "https://www.", "http://www." };

    try (var connection = dataSource.getConnection();
         var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
        for (String prefix : prefixes) {
            // Pass the bare name through untouched so a null name stays null.
            stmt.setString(1, prefix.isEmpty() ? name : prefix + name);
            try (var rsp = stmt.executeQuery()) {
                if (rsp.next()) {
                    return Optional.of(new EdgeId<>(rsp.getInt(1)));
                }
            }
        }
    } catch (SQLException throwables) {
        // Keep the cause in the log instead of dropping it.
        logger.info("Could not resolve domain id for {}", name, throwables);
    }

    return Optional.empty();
}
/**
 * Reads KNOWN_URLS from DOMAIN_METADATA for the given domain id.
 * Returns 0 when no row exists or when the query fails (the failure is logged);
 * a failure to obtain the connection is rethrown via {@code @SneakyThrows}.
 */
@SneakyThrows
@Override
public int getPagesKnown(EdgeId<EdgeDomain> domainId) {
    final String sql = "SELECT KNOWN_URLS FROM DOMAIN_METADATA WHERE ID=?";

    try (var conn = dataSource.getConnection()) {
        try (var query = conn.prepareStatement(sql)) {
            query.setInt(1, domainId.getId());
            var rows = query.executeQuery();
            return rows.next() ? rows.getInt(1) : 0;
        } catch (Exception ex) {
            logger.error("DB error", ex);
            return 0;
        }
    }
}
/**
 * Reads VISITED_URLS from DOMAIN_METADATA for the given domain id.
 * Returns 0 when no row exists or when the query fails (the failure is logged);
 * a failure to obtain the connection is rethrown via {@code @SneakyThrows}.
 */
@SneakyThrows
@Override
public int getPagesVisited(EdgeId<EdgeDomain> domainId) {
    final String sql = "SELECT VISITED_URLS FROM DOMAIN_METADATA WHERE ID=?";

    try (var conn = dataSource.getConnection()) {
        try (var query = conn.prepareStatement(sql)) {
            query.setInt(1, domainId.getId());
            var rows = query.executeQuery();
            return rows.next() ? rows.getInt(1) : 0;
        } catch (Exception ex) {
            logger.error("DB error", ex);
            return 0;
        }
    }
}
/**
 * Reads GOOD_URLS from DOMAIN_METADATA for the given domain id.
 * Returns 0 when no row exists or when the query fails (the failure is logged);
 * a failure to obtain the connection is rethrown via {@code @SneakyThrows}.
 */
@SneakyThrows
@Override
public int getPagesIndexed(EdgeId<EdgeDomain> domainId) {
    final String sql = "SELECT GOOD_URLS FROM DOMAIN_METADATA WHERE ID=?";

    try (var conn = dataSource.getConnection()) {
        try (var query = conn.prepareStatement(sql)) {
            query.setInt(1, domainId.getId());
            var rows = query.executeQuery();
            return rows.next() ? rows.getInt(1) : 0;
        } catch (Exception ex) {
            logger.error("DB error", ex);
            return 0;
        }
    }
}
/**
 * Counts EC_DOMAIN_LINK rows whose DEST_DOMAIN_ID is the given domain id.
 * Returns 0 when the query yields no row or fails (the failure is logged);
 * a failure to obtain the connection is rethrown via {@code @SneakyThrows}.
 */
@SneakyThrows
@Override
public int getIncomingLinks(EdgeId<EdgeDomain> domainId) {
    final String sql = "SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE DEST_DOMAIN_ID=?";

    try (var conn = dataSource.getConnection()) {
        try (var query = conn.prepareStatement(sql)) {
            query.setInt(1, domainId.getId());
            var rows = query.executeQuery();
            return rows.next() ? rows.getInt(1) : 0;
        } catch (Exception ex) {
            logger.error("DB error", ex);
            return 0;
        }
    }
}
/**
 * Counts EC_DOMAIN_LINK rows whose SOURCE_DOMAIN_ID is the given domain id.
 * Returns 0 when the query yields no row or fails (the failure is logged);
 * a failure to obtain the connection is rethrown via {@code @SneakyThrows}.
 */
@SneakyThrows
@Override
public int getOutboundLinks(EdgeId<EdgeDomain> domainId) {
    final String sql = "SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=?";

    try (var conn = dataSource.getConnection()) {
        try (var query = conn.prepareStatement(sql)) {
            query.setInt(1, domainId.getId());
            var rows = query.executeQuery();
            return rows.next() ? rows.getInt(1) : 0;
        } catch (Exception ex) {
            logger.error("DB error", ex);
            return 0;
        }
    }
}
/**
 * Reads QUALITY from EC_DOMAIN for the given domain id.
 * Returns -5 when no row exists or when the query fails (the failure is logged);
 * a failure to obtain the connection is rethrown via {@code @SneakyThrows}.
 */
@SneakyThrows
@Override
public double getDomainQuality(EdgeId<EdgeDomain> domainId) {
    final String sql = "SELECT QUALITY FROM EC_DOMAIN WHERE ID=?";

    try (var conn = dataSource.getConnection()) {
        try (var query = conn.prepareStatement(sql)) {
            query.setInt(1, domainId.getId());
            var rows = query.executeQuery();
            return rows.next() ? rows.getDouble(1) : -5;
        } catch (Exception ex) {
            logger.error("DB error", ex);
            return -5;
        }
    }
}
/**
 * Reads STATE from EC_DOMAIN for the given domain id and maps it through
 * {@code EdgeDomainIndexingState.fromCode}.
 *
 * @param domainId id of the domain to look up
 * @return the mapped state, or {@code ERROR} when no row exists or any
 *         database error occurs (errors are logged, not propagated)
 */
@Override
public EdgeDomainIndexingState getDomainState(EdgeId<EdgeDomain> domainId) {
    try (var connection = dataSource.getConnection()) {
        try (var stmt = connection.prepareStatement("SELECT STATE FROM EC_DOMAIN WHERE ID=?")) {
            stmt.setInt(1, domainId.getId());
            try (var rsp = stmt.executeQuery()) {
                if (rsp.next()) {
                    // NOTE(review): other queries in this project parse STATE with valueOf(getString) — confirm the column type.
                    return EdgeDomainIndexingState.fromCode(rsp.getInt(1));
                }
            }
        } catch (Exception ex) {
            logger.error("DB error", ex);
        }
    } catch (SQLException throwables) {
        // Route connection failures through the logger rather than stderr.
        logger.error("DB error", throwables);
    }
    return EdgeDomainIndexingState.ERROR;
}
/**
 * Lists up to 25 SOURCE_URL values from EC_RELATED_LINKS_VIEW whose DEST_DOMAIN_ID
 * equals the given domain id, ordered by SOURCE_DOMAIN_ID, each wrapped in an
 * {@code EdgeDomain}.
 *
 * Every failure (obtaining the connection, running the query, closing resources)
 * is reported through the class logger; nothing is written to stderr.
 *
 * @param domainId id of the link destination domain
 * @return the domains found, or an empty list when the lookup fails
 */
@Override
public List<EdgeDomain> getLinkingDomains(EdgeId<EdgeDomain> domainId) {
    final String sql = "SELECT SOURCE_URL FROM EC_RELATED_LINKS_VIEW WHERE DEST_DOMAIN_ID=? ORDER BY SOURCE_DOMAIN_ID LIMIT 25";

    try (var connection = dataSource.getConnection();
         var stmt = connection.prepareStatement(sql)) {
        stmt.setInt(1, domainId.getId());

        // Sized to the LIMIT of the query.
        List<EdgeDomain> results = new ArrayList<>(25);
        var rsp = stmt.executeQuery();
        while (rsp.next()) {
            results.add(new EdgeDomain(rsp.getString(1)));
        }
        return results;
    } catch (Exception ex) {
        // Single handler so connection failures are logged the same way as query failures.
        logger.error("DB error", ex);
    }
    // Partial results are discarded on failure.
    return Collections.emptyList();
}
/**
 * Reads {@code IFNULL(RANK, 1)} from EC_DOMAIN for the given domain id.
 *
 * Every failure (obtaining the connection, running the query, closing resources)
 * is reported through the class logger; nothing is written to stderr.
 *
 * @param domainId id of the domain to look up
 * @return the stored rank (1 when the column is NULL), or 1 when no row matches
 *         or the lookup fails
 */
@Override
public double getRank(EdgeId<EdgeDomain> domainId) {
    try (var connection = dataSource.getConnection();
         var stmt = connection.prepareStatement("SELECT IFNULL(RANK, 1) FROM EC_DOMAIN WHERE ID=?")) {
        stmt.setInt(1, domainId.getId());
        var rsp = stmt.executeQuery();
        if (rsp.next()) {
            return rsp.getDouble(1);
        }
    } catch (Exception ex) {
        // Single handler so connection failures are logged the same way as query failures.
        logger.error("DB error", ex);
    }
    return 1;
}
}

View File

@ -10,7 +10,7 @@ import lombok.SneakyThrows;
import nu.marginalia.util.ranking.BetterReversePageRank;
import nu.marginalia.util.ranking.BetterStandardPageRank;
import nu.marginalia.util.ranking.BuggyStandardPageRank;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.util.ranking.RankingDomainFetcher;
import nu.marginalia.wmsa.edge.index.model.RankingSettings;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -18,14 +18,17 @@ import org.slf4j.LoggerFactory;
@Singleton
public class SearchIndexDao {
private final HikariDataSource dataSource;
private RankingDomainFetcher rankingDomains;
private final RankingSettings rankingSettings;
private final Logger logger = LoggerFactory.getLogger(getClass());
/**
 * Stores the injected collaborators and logs the ranking settings that were supplied.
 *
 * @param dataSource      connection pool kept for this DAO's own queries
 * @param rankingDomains  domain fetcher handed to the ranking algorithms
 * @param rankingSettings settings object whose value is logged at construction
 */
@Inject
public SearchIndexDao(HikariDataSource dataSource,
                      RankingDomainFetcher rankingDomains,
                      RankingSettings rankingSettings)
{
    this.rankingSettings = rankingSettings;
    this.rankingDomains = rankingDomains;
    this.dataSource = dataSource;

    logger.info("SearchIndexDao ranking settings = {}", rankingSettings);
}
@ -63,36 +66,36 @@ public class SearchIndexDao {
@SneakyThrows
public TIntList getRetroDomains() {
var spr = new BetterStandardPageRank(dataSource,rankingSettings.retro.toArray(String[]::new));
return spr.pageRankWithPeripheralNodes(spr.size()/2, false);
var spr = new BetterStandardPageRank(rankingDomains,rankingSettings.retro.toArray(String[]::new));
return spr.pageRankWithPeripheralNodes(spr.size()/2);
}
@SneakyThrows
public TIntList getSmallWebDomains() {
var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), rankingSettings.small.toArray(String[]::new));
var rpr = new BetterReversePageRank(rankingDomains, rankingSettings.small.toArray(String[]::new));
rpr.setMaxKnownUrls(750);
return rpr.pageRankWithPeripheralNodes(rpr.size(), false);
return rpr.pageRankWithPeripheralNodes(rpr.size());
}
@SneakyThrows
public TIntList getAcademiaDomains() {
var spr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), rankingSettings.academia.toArray(String[]::new));
return spr.pageRankWithPeripheralNodes(spr.size()/2, false);
var spr = new BetterStandardPageRank(rankingDomains, rankingSettings.academia.toArray(String[]::new));
return spr.pageRankWithPeripheralNodes(spr.size()/2);
}
@SneakyThrows
public TIntList getStandardDomains() {
var spr = new BuggyStandardPageRank(dataSource,rankingSettings.standard.toArray(String[]::new));
return spr.pageRankWithPeripheralNodes(spr.size()/2, false);
var spr = new BuggyStandardPageRank(rankingDomains,rankingSettings.standard.toArray(String[]::new));
return spr.pageRankWithPeripheralNodes(spr.size()/2);
}
@SneakyThrows
public TIntList getSpecialDomains() {
TIntArrayList results = new TIntArrayList();
try (var connection = dataSource.getConnection();
var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE STATE=2")
var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE STATE='SPECIAL'")
) {
var rs = stmt.executeQuery();
while (rs.next()) {

View File

@ -54,8 +54,11 @@ public class EdgeDomain implements WideHashable {
}
}
}
}
/**
 * Builds an {@code EdgeUrl} for this domain with protocol "http" and path "/".
 *
 * @return a new URL pointing at the root path of this domain
 */
public EdgeUrl toRootUrl() {
    // Set default protocol to http, as most https websites redirect http->https, but few http websites redirect https->http
    final String defaultProtocol = "http";
    final String rootPath = "/";

    return new EdgeUrl(defaultProtocol, this, null, rootPath);
}
public String toString() {

View File

@ -1,24 +1,43 @@
package nu.marginalia.wmsa.edge.search.siteinfo;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao;
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDaoImpl;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
import nu.marginalia.wmsa.edge.search.model.DomainInformation;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.inject.Inject;
import javax.inject.Singleton;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Optional;
/*
TODO: This class needs to be refactored, a lot of
these SQL queries are redundant and can be
collapsed into one single query that fetches
all the information
*/
@Singleton
public class DomainInformationService {
private EdgeDataStoreDao dataStore;
private EdgeDataStoreDaoImpl dataStoreDao;
private HikariDataSource dataSource;
private final Logger logger = LoggerFactory.getLogger(getClass());
@Inject
public DomainInformationService(EdgeDataStoreDao dataStore) {
this.dataStore = dataStore;
public DomainInformationService(
EdgeDataStoreDaoImpl dataStoreDao,
HikariDataSource dataSource) {
this.dataStoreDao = dataStoreDao;
this.dataSource = dataSource;
}
@ -28,29 +47,29 @@ public class DomainInformationService {
if (domainId == null) {
return Optional.empty();
}
EdgeDomain domain = dataStore.getDomain(domainId);
EdgeDomain domain = dataStoreDao.getDomain(domainId);
boolean blacklisted = dataStore.isBlacklisted(domain);
int pagesKnown = dataStore.getPagesKnown(domainId);
int pagesVisited = dataStore.getPagesVisited(domainId);
int pagesIndexed = dataStore.getPagesIndexed(domainId);
int incomingLinks = dataStore.getIncomingLinks(domainId);
int outboundLinks = dataStore.getOutboundLinks(domainId);
double rank = Math.round(10000.0*(1.0-dataStore.getRank(domainId)))/100;
EdgeDomainIndexingState state = dataStore.getDomainState(domainId);
double nominalQuality = Math.round(100*100*Math.exp(dataStore.getDomainQuality(domainId)))/100.;
List<EdgeDomain> linkingDomains = dataStore.getLinkingDomains(domainId);
boolean blacklisted = isBlacklisted(domain);
int pagesKnown = getPagesKnown(domainId);
int pagesVisited = getPagesVisited(domainId);
int pagesIndexed = getPagesIndexed(domainId);
int incomingLinks = getIncomingLinks(domainId);
int outboundLinks = getOutboundLinks(domainId);
double rank = Math.round(10000.0*(1.0-getRank(domainId)))/100;
EdgeDomainIndexingState state = getDomainState(domainId);
double nominalQuality = Math.round(100*100*Math.exp(getDomainQuality(domainId)))/100.;
List<EdgeDomain> linkingDomains = getLinkingDomains(domainId);
return Optional.of(new DomainInformation(domain, blacklisted, pagesKnown, pagesVisited, pagesIndexed, incomingLinks, outboundLinks, nominalQuality, rank, state, linkingDomains));
}
private EdgeId<EdgeDomain> getDomainFromPartial(String site) {
try {
return dataStore.getDomainId(new EdgeDomain(site));
return dataStoreDao.getDomainId(new EdgeDomain(site));
}
catch (Exception ex) {
try {
return dataStore.getDomainId(new EdgeDomain(site));
return dataStoreDao.getDomainId(new EdgeDomain(site));
}
catch (Exception ex2) {
return null;
@ -58,4 +77,178 @@ public class DomainInformationService {
}
}
/**
 * Checks whether EC_DOMAIN_BLACKLIST has a row whose URL_DOMAIN equals the
 * domain's {@code domain} field.
 *
 * Nothing is caught here; any database failure propagates to the caller,
 * undeclared, via {@code @SneakyThrows}.
 *
 * @param domain domain to check
 * @return true when at least one matching row exists
 */
@SneakyThrows
public boolean isBlacklisted(EdgeDomain domain) {
    try (var conn = dataSource.getConnection();
         var query = conn.prepareStatement("SELECT ID FROM EC_DOMAIN_BLACKLIST WHERE URL_DOMAIN=?")) {
        query.setString(1, domain.domain);

        // A first row existing is the whole answer.
        return query.executeQuery().next();
    }
}
/**
 * Reads the KNOWN_URLS column of DOMAIN_METADATA for the given domain id.
 *
 * Failures while preparing or running the query are logged and reported as 0.
 * The outer try has no catch, so failures obtaining or closing the connection
 * propagate to the caller, undeclared, via {@code @SneakyThrows}.
 *
 * @param domainId id of the domain to look up
 * @return the stored value, or 0 when no row matches or the query fails
 */
@SneakyThrows
public int getPagesKnown(EdgeId<EdgeDomain> domainId) {
    final String sql = "SELECT KNOWN_URLS FROM DOMAIN_METADATA WHERE ID=?";

    try (var conn = dataSource.getConnection()) {
        try (var query = conn.prepareStatement(sql)) {
            query.setInt(1, domainId.getId());

            var rows = query.executeQuery();
            return rows.next() ? rows.getInt(1) : 0;
        }
        catch (Exception ex) {
            logger.error("DB error", ex);
        }
        // Only reached after a logged query failure.
        return 0;
    }
}
/**
 * Reads the VISITED_URLS column of DOMAIN_METADATA for the given domain id.
 *
 * Failures while preparing or running the query are logged and reported as 0.
 * The outer try has no catch, so failures obtaining or closing the connection
 * propagate to the caller, undeclared, via {@code @SneakyThrows}.
 *
 * @param domainId id of the domain to look up
 * @return the stored value, or 0 when no row matches or the query fails
 */
@SneakyThrows
public int getPagesVisited(EdgeId<EdgeDomain> domainId) {
    final String sql = "SELECT VISITED_URLS FROM DOMAIN_METADATA WHERE ID=?";

    try (var conn = dataSource.getConnection()) {
        try (var query = conn.prepareStatement(sql)) {
            query.setInt(1, domainId.getId());

            var rows = query.executeQuery();
            return rows.next() ? rows.getInt(1) : 0;
        }
        catch (Exception ex) {
            logger.error("DB error", ex);
        }
        // Only reached after a logged query failure.
        return 0;
    }
}
/**
 * Reads the GOOD_URLS column of DOMAIN_METADATA for the given domain id.
 *
 * Failures while preparing or running the query are logged and reported as 0.
 * The outer try has no catch, so failures obtaining or closing the connection
 * propagate to the caller, undeclared, via {@code @SneakyThrows}.
 *
 * @param domainId id of the domain to look up
 * @return the stored value, or 0 when no row matches or the query fails
 */
@SneakyThrows
public int getPagesIndexed(EdgeId<EdgeDomain> domainId) {
    final String sql = "SELECT GOOD_URLS FROM DOMAIN_METADATA WHERE ID=?";

    try (var conn = dataSource.getConnection()) {
        try (var query = conn.prepareStatement(sql)) {
            query.setInt(1, domainId.getId());

            var rows = query.executeQuery();
            return rows.next() ? rows.getInt(1) : 0;
        }
        catch (Exception ex) {
            logger.error("DB error", ex);
        }
        // Only reached after a logged query failure.
        return 0;
    }
}
/**
 * Counts the EC_DOMAIN_LINK rows whose DEST_DOMAIN_ID equals the given domain id.
 *
 * Failures while preparing or running the query are logged and reported as 0.
 * The outer try has no catch, so failures obtaining or closing the connection
 * propagate to the caller, undeclared, via {@code @SneakyThrows}.
 *
 * @param domainId id of the link destination domain
 * @return the row count, or 0 when the query yields nothing or fails
 */
@SneakyThrows
public int getIncomingLinks(EdgeId<EdgeDomain> domainId) {
    final String sql = "SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE DEST_DOMAIN_ID=?";

    try (var conn = dataSource.getConnection()) {
        try (var query = conn.prepareStatement(sql)) {
            query.setInt(1, domainId.getId());

            var rows = query.executeQuery();
            return rows.next() ? rows.getInt(1) : 0;
        }
        catch (Exception ex) {
            logger.error("DB error", ex);
        }
        // Only reached after a logged query failure.
        return 0;
    }
}
/**
 * Counts the EC_DOMAIN_LINK rows whose SOURCE_DOMAIN_ID equals the given domain id.
 *
 * Failures while preparing or running the query are logged and reported as 0.
 * The outer try has no catch, so failures obtaining or closing the connection
 * propagate to the caller, undeclared, via {@code @SneakyThrows}.
 *
 * @param domainId id of the link source domain
 * @return the row count, or 0 when the query yields nothing or fails
 */
@SneakyThrows
public int getOutboundLinks(EdgeId<EdgeDomain> domainId) {
    final String sql = "SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=?";

    try (var conn = dataSource.getConnection()) {
        try (var query = conn.prepareStatement(sql)) {
            query.setInt(1, domainId.getId());

            var rows = query.executeQuery();
            return rows.next() ? rows.getInt(1) : 0;
        }
        catch (Exception ex) {
            logger.error("DB error", ex);
        }
        // Only reached after a logged query failure.
        return 0;
    }
}
/**
 * Reads the QUALITY column of EC_DOMAIN for the given domain id.
 *
 * Failures while preparing or running the query are logged and reported as -5.
 * The outer try has no catch, so failures obtaining or closing the connection
 * propagate to the caller, undeclared, via {@code @SneakyThrows}.
 *
 * @param domainId id of the domain to look up
 * @return the stored quality, or -5 when no row matches or the query fails
 */
@SneakyThrows
public double getDomainQuality(EdgeId<EdgeDomain> domainId) {
    final String sql = "SELECT QUALITY FROM EC_DOMAIN WHERE ID=?";

    try (var conn = dataSource.getConnection()) {
        try (var query = conn.prepareStatement(sql)) {
            query.setInt(1, domainId.getId());

            var rows = query.executeQuery();
            return rows.next() ? rows.getDouble(1) : -5;
        }
        catch (Exception ex) {
            logger.error("DB error", ex);
        }
        // Only reached after a logged query failure.
        return -5;
    }
}
/**
 * Reads the STATE column of EC_DOMAIN for the given domain id and maps it through
 * {@code EdgeDomainIndexingState.fromCode}.
 *
 * Every failure (obtaining the connection, running the query, closing resources)
 * is reported through the class logger; nothing is written to stderr.
 *
 * @param domainId id of the domain to look up
 * @return the mapped state, or {@code EdgeDomainIndexingState.ERROR} when no row
 *         matches or the lookup fails
 */
public EdgeDomainIndexingState getDomainState(EdgeId<EdgeDomain> domainId) {
    try (var connection = dataSource.getConnection();
         var stmt = connection.prepareStatement("SELECT STATE FROM EC_DOMAIN WHERE ID=?")) {
        stmt.setInt(1, domainId.getId());
        var rsp = stmt.executeQuery();
        if (rsp.next()) {
            // NOTE(review): STATE is read as an integer code here; confirm this still
            // matches the column type (another query in this change filters on STATE='SPECIAL').
            return EdgeDomainIndexingState.fromCode(rsp.getInt(1));
        }
    } catch (Exception ex) {
        // Single handler so connection failures are logged the same way as query failures.
        logger.error("DB error", ex);
    }
    return EdgeDomainIndexingState.ERROR;
}
/**
 * Lists up to 25 SOURCE_URL values from EC_RELATED_LINKS_VIEW whose DEST_DOMAIN_ID
 * equals the given domain id, ordered by SOURCE_DOMAIN_ID, each wrapped in an
 * {@code EdgeDomain}.
 *
 * Every failure (obtaining the connection, running the query, closing resources)
 * is reported through the class logger; nothing is written to stderr.
 *
 * @param domainId id of the link destination domain
 * @return the domains found, or an empty list when the lookup fails
 */
public List<EdgeDomain> getLinkingDomains(EdgeId<EdgeDomain> domainId) {
    final String sql = "SELECT SOURCE_URL FROM EC_RELATED_LINKS_VIEW WHERE DEST_DOMAIN_ID=? ORDER BY SOURCE_DOMAIN_ID LIMIT 25";

    try (var connection = dataSource.getConnection();
         var stmt = connection.prepareStatement(sql)) {
        stmt.setInt(1, domainId.getId());

        // Sized to the LIMIT of the query.
        List<EdgeDomain> results = new ArrayList<>(25);
        var rsp = stmt.executeQuery();
        while (rsp.next()) {
            results.add(new EdgeDomain(rsp.getString(1)));
        }
        return results;
    } catch (Exception ex) {
        // Single handler so connection failures are logged the same way as query failures.
        logger.error("DB error", ex);
    }
    // Partial results are discarded on failure.
    return Collections.emptyList();
}
/**
 * Reads {@code IFNULL(RANK, 1)} from EC_DOMAIN for the given domain id.
 *
 * Every failure (obtaining the connection, running the query, closing resources)
 * is reported through the class logger; nothing is written to stderr.
 *
 * @param domainId id of the domain to look up
 * @return the stored rank (1 when the column is NULL), or 1 when no row matches
 *         or the lookup fails
 */
public double getRank(EdgeId<EdgeDomain> domainId) {
    try (var connection = dataSource.getConnection();
         var stmt = connection.prepareStatement("SELECT IFNULL(RANK, 1) FROM EC_DOMAIN WHERE ID=?")) {
        stmt.setInt(1, domainId.getId());
        var rsp = stmt.executeQuery();
        if (rsp.next()) {
            return rsp.getDouble(1);
        }
    } catch (Exception ex) {
        // Single handler so connection failures are logged the same way as query failures.
        logger.error("DB error", ex);
    }
    return 1;
}
}

View File

@ -3,6 +3,7 @@ package nu.marginalia.wmsa.edge.tools;
import com.google.inject.Inject;
import gnu.trove.set.hash.TIntHashSet;
import lombok.SneakyThrows;
import nu.marginalia.util.ranking.RankingDomainFetcher;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
@ -59,7 +60,9 @@ public class IndexMergerMain {
}
var hikari = new DatabaseModule().provideConnection();
var partitioner = new SearchIndexPartitioner(new SearchIndexDao(hikari, new RankingSettings()));
var ds = new DatabaseModule().provideConnection();
var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
var partitioner = new SearchIndexPartitioner(new SearchIndexDao(hikari, domains, new RankingSettings()));
var blacklist = new EdgeDomainBlacklistImpl(hikari);
new IndexMergerMain(file1, file2, outputFile, partitioner, blacklist);