From 5e472fe121fcf42de70c3286a1a191686382e4ca Mon Sep 17 00:00:00 2001 From: vlofgren Date: Wed, 8 Jun 2022 16:18:00 +0200 Subject: [PATCH] WIP: Refactored ranking algorithms to separate database code from ranking code --- .../util/ranking/BetterReversePageRank.java | 8 +- .../util/ranking/BetterStandardPageRank.java | 8 +- .../util/ranking/BuggyReversePageRank.java | 8 +- .../util/ranking/BuggyStandardPageRank.java | 8 +- .../util/ranking/RankingAlgorithm.java | 256 +++++------------ .../util/ranking/RankingDomainData.java | 33 +++ .../util/ranking/RankingDomainFetcher.java | 105 +++++++ .../ranking/tool/UpdateDomainRanksTool.java | 9 +- .../ranking/tool/UpdateDomainRanksTool2.java | 9 +- .../CrawlJobExtractorPageRankMain.java | 7 +- .../wmsa/edge/data/dao/EdgeDataStoreDao.java | 21 +- .../edge/data/dao/EdgeDataStoreDaoImpl.java | 270 ++---------------- .../edge/index/service/SearchIndexDao.java | 23 +- .../wmsa/edge/model/EdgeDomain.java | 5 +- .../siteinfo/DomainInformationService.java | 225 +++++++++++++-- .../wmsa/edge/tools/IndexMergerMain.java | 5 +- 16 files changed, 488 insertions(+), 512 deletions(-) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainData.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainFetcher.java diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterReversePageRank.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterReversePageRank.java index f2889ad6..7d3b17c4 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterReversePageRank.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterReversePageRank.java @@ -1,15 +1,11 @@ package nu.marginalia.util.ranking; -import com.zaxxer.hikari.HikariDataSource; - -import java.io.IOException; - public class BetterReversePageRank extends RankingAlgorithm { - public BetterReversePageRank(HikariDataSource dataSource, String... origins) { - super(dataSource, origins); + public BetterReversePageRank(RankingDomainFetcher domains, String... origins) { + super(domains, origins); } @Override diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterStandardPageRank.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterStandardPageRank.java index 5b64fa73..f1f9b0b1 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterStandardPageRank.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterStandardPageRank.java @@ -1,14 +1,10 @@ package nu.marginalia.util.ranking; -import com.zaxxer.hikari.HikariDataSource; - -import java.io.IOException; - public class BetterStandardPageRank extends RankingAlgorithm { - public BetterStandardPageRank(HikariDataSource dataSource, String... origins) { - super(dataSource, origins); + public BetterStandardPageRank(RankingDomainFetcher domains, String... origins) { + super(domains, origins); } @Override diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyReversePageRank.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyReversePageRank.java index 1e87776c..485ba353 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyReversePageRank.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyReversePageRank.java @@ -1,15 +1,11 @@ package nu.marginalia.util.ranking; -import com.zaxxer.hikari.HikariDataSource; - -import java.io.IOException; - public class BuggyReversePageRank extends RankingAlgorithm { - public BuggyReversePageRank(HikariDataSource dataSource, String... origins) { - super(dataSource, origins); + public BuggyReversePageRank(RankingDomainFetcher domains, String... origins) { + super(domains, origins); } @Override diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyStandardPageRank.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyStandardPageRank.java index a3d7b87e..836bcdfe 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyStandardPageRank.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyStandardPageRank.java @@ -1,14 +1,10 @@ package nu.marginalia.util.ranking; -import com.zaxxer.hikari.HikariDataSource; - -import java.io.IOException; - public class BuggyStandardPageRank extends RankingAlgorithm { - public BuggyStandardPageRank(HikariDataSource dataSource, String... origins) { - super(dataSource, origins); + public BuggyStandardPageRank(RankingDomainFetcher domains, String... origins) { + super(domains, origins); } @Override diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingAlgorithm.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingAlgorithm.java index b07285d4..875031f1 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingAlgorithm.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingAlgorithm.java @@ -1,35 +1,26 @@ package nu.marginalia.util.ranking; -import com.zaxxer.hikari.HikariDataSource; import gnu.trove.list.TIntList; import gnu.trove.list.array.TIntArrayList; import gnu.trove.map.hash.TIntIntHashMap; import gnu.trove.map.hash.TIntObjectHashMap; -import gnu.trove.set.hash.TIntHashSet; import it.unimi.dsi.fastutil.ints.IntComparator; -import lombok.AllArgsConstructor; -import lombok.Data; import nu.marginalia.wmsa.configuration.module.DatabaseModule; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; -import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; -import java.sql.SQLException; import java.util.*; import java.util.function.IntToDoubleFunction; import java.util.stream.IntStream; import it.unimi.dsi.fastutil.ints.IntArrays; public abstract class RankingAlgorithm { - final TIntObjectHashMap domainsById = new TIntObjectHashMap<>(); + final TIntObjectHashMap domainsById = new TIntObjectHashMap<>(); final TIntIntHashMap domainIndexToId = new TIntIntHashMap(); final TIntIntHashMap domainIdToIndex = new TIntIntHashMap(); - private final TIntHashSet spamDomains; - private final HikariDataSource dataSource; - TIntArrayList[] linkDataSrc2Dest; TIntArrayList[] linkDataDest2Src; @@ -41,10 +32,14 @@ public abstract class RankingAlgorithm { private static final boolean getNames = true; private final Logger logger = LoggerFactory.getLogger(getClass()); + private RankingDomainFetcher domains; public static void main(String... args) throws IOException { - var rpr = new BuggyReversePageRank(new DatabaseModule().provideConnection(), "wiki.xxiivv.com"); - var spr = new BuggyStandardPageRank(new DatabaseModule().provideConnection(), "memex.marginalia.nu"); + var ds = new DatabaseModule().provideConnection(); + var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds)); + + var rpr = new BuggyReversePageRank(domains, "wiki.xxiivv.com"); + var spr = new BuggyStandardPageRank(domains, "memex.marginalia.nu"); var rankVector = spr.pageRankVector(); var norm = rankVector.norm(); @@ -61,164 +56,97 @@ public abstract class RankingAlgorithm { return domainsById.get(id).peripheral; } - public RankingAlgorithm(HikariDataSource dataSource, String... origins) { - this.dataSource = dataSource; - var blacklist = new EdgeDomainBlacklistImpl(dataSource); + public RankingAlgorithm(RankingDomainFetcher domains, String... origins) { + this.domains = domains; - spamDomains = blacklist.getSpamDomains(); originDomains.addAll(Arrays.asList(origins)); - try (var conn = dataSource.getConnection()) { + domains.getDomains(domainData -> { + int id = domainData.id; - String s; - if (getNames) { - s = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID"; + domainsById.put(id, domainData); + + domainIndexToId.put(domainIndexToId.size(), id); + domainIdToIndex.put(id, domainIdToIndex.size()); + }); + + linkDataSrc2Dest = new TIntArrayList[domainIndexToId.size()]; + linkDataDest2Src = new TIntArrayList[domainIndexToId.size()]; + + domains.eachDomainLink((src, dst) -> { + if (src == dst) return; + + if (domainsById.contains(src) && domainsById.contains(dst)) { + + int srcIdx = domainIdToIndex.get(src); + int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias()); + + if (linkDataSrc2Dest[srcIdx] == null) { + linkDataSrc2Dest[srcIdx] = new TIntArrayList(); + } + linkDataSrc2Dest[srcIdx].add(dstIdx); + + if (linkDataDest2Src[dstIdx] == null) { + linkDataDest2Src[dstIdx] = new TIntArrayList(); + } + linkDataDest2Src[dstIdx].add(srcIdx); + } + }); + + for (var namePattern : this.originDomains) { + domains.domainsByPattern(namePattern, i -> { + int ival = domainIdToIndex.get(i); + if (ival != domainIdToIndex.getNoEntryValue() || domainIndexToId.get(0) == i) { + originDomainIds.add(ival); } else { - s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID"; + logger.debug("No value for {}", i); } - try (var stmt = conn.prepareStatement(s)) { - stmt.setFetchSize(10000); - var rsp = stmt.executeQuery(); - while (rsp.next()) { - int id = rsp.getInt(1); - if (!spamDomains.contains(id)) { - - domainsById.put(id, new DomainData(id, rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5), false)); - - domainIndexToId.put(domainIndexToId.size(), id); - domainIdToIndex.put(id, domainIdToIndex.size()); - } - } - } - - - linkDataSrc2Dest = new TIntArrayList[domainIndexToId.size()]; - linkDataDest2Src = new TIntArrayList[domainIndexToId.size()]; - - try (var stmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK")) { - stmt.setFetchSize(10000); - - var rsp = stmt.executeQuery(); - - while (rsp.next()) { - int src = rsp.getInt(1); - int dst = rsp.getInt(2); - - if (src == dst) continue; - - if (domainsById.contains(src) && domainsById.contains(dst)) { - - int srcIdx = domainIdToIndex.get(src); - int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias()); - - if (linkDataSrc2Dest[srcIdx] == null) { - linkDataSrc2Dest[srcIdx] = new TIntArrayList(); - } - linkDataSrc2Dest[srcIdx].add(dstIdx); - - if (linkDataDest2Src[dstIdx] == null) { - linkDataDest2Src[dstIdx] = new TIntArrayList(); - } - linkDataDest2Src[dstIdx].add(srcIdx); - } - } - } - - try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME LIKE ?")) { - for (var seed : this.originDomains) { - stmt.setString(1, seed); - var rsp = stmt.executeQuery(); - while (rsp.next()) { - int i = rsp.getInt(1); - int ival = domainIdToIndex.get(i); - if (ival != domainIdToIndex.getNoEntryValue() || domainIndexToId.get(0) == i) { - originDomainIds.add(ival); - } - else { - logger.debug("No value for {}", i); - } - } - logger.debug("{} -> {}", seed, originDomainIds.size()); - } - } - - logger.info("Origin Domains: {}", originDomainIds.size()); - - } catch (SQLException throwables) { - logger.error("SQL error", throwables); + }); } + logger.info("Origin Domains: {}", originDomainIds.size()); } - public void addPeripheralNodes(boolean includeErrorStates) { + public void addPeripheralNodes() { int newNodesIdxCutoff = domainIdToIndex.size(); logger.info("Inserting peripheral nodes"); - try (var conn = dataSource.getConnection()) { - String s; - if (getNames) { - s = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID"; + domains.getPeripheralDomains(domainData -> { + int id = domainData.id; + + if (domainsById.put(id, domainData) == null) { // true if id was not already present + domainIndexToId.put(domainIndexToId.size(), id); + domainIdToIndex.put(id, domainIdToIndex.size()); } - else { - s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID"; - } - try (var stmt = conn.prepareStatement(s)) { - stmt.setFetchSize(10000); - var rsp = stmt.executeQuery(); + }); - while (rsp.next()) { - int id = rsp.getInt(1); + linkDataSrc2Dest = Arrays.copyOf(linkDataSrc2Dest, domainIndexToId.size()); + linkDataDest2Src = Arrays.copyOf(linkDataDest2Src, domainIndexToId.size()); - if (!spamDomains.contains(id)) { - domainsById.put(id, new DomainData(id, rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5), true)); + domains.eachDomainLink((src, dst) -> { + if (src == dst) return; - domainIndexToId.put(domainIndexToId.size(), id); - domainIdToIndex.put(id, domainIdToIndex.size()); - } + if (domainsById.contains(src) && domainsById.contains(dst)) { + int srcIdx = domainIdToIndex.get(src); + int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias()); + + // This looks like a bug, but it improves the results + if (srcIdx < newNodesIdxCutoff || dstIdx < newNodesIdxCutoff) + return; + + if (linkDataSrc2Dest[srcIdx] == null) { + linkDataSrc2Dest[srcIdx] = new TIntArrayList(); } + linkDataSrc2Dest[srcIdx].add(dstIdx); - } - - linkDataSrc2Dest = Arrays.copyOf(linkDataSrc2Dest, domainIndexToId.size()); - linkDataDest2Src = Arrays.copyOf(linkDataDest2Src, domainIndexToId.size()); - - try (var stmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK")) { - stmt.setFetchSize(10000); - - var rsp = stmt.executeQuery(); - - while (rsp.next()) { - int src = rsp.getInt(1); - int dst = rsp.getInt(2); - - if (src == dst) continue; - - if (domainsById.contains(src) && domainsById.contains(dst)) { - - int srcIdx = domainIdToIndex.get(src); - int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias()); - - // This looks like a bug, but it improves the results - if (srcIdx < newNodesIdxCutoff || dstIdx < newNodesIdxCutoff) - continue; - - if (linkDataSrc2Dest[srcIdx] == null) { - linkDataSrc2Dest[srcIdx] = new TIntArrayList(); - } - linkDataSrc2Dest[srcIdx].add(dstIdx); - - if (linkDataDest2Src[dstIdx] == null) { - linkDataDest2Src[dstIdx] = new TIntArrayList(); - } - linkDataDest2Src[dstIdx].add(srcIdx); - } + if (linkDataDest2Src[dstIdx] == null) { + linkDataDest2Src[dstIdx] = new TIntArrayList(); } + linkDataDest2Src[dstIdx].add(srcIdx); } - } catch (SQLException throwables) { - logger.error("SQL error", throwables); - } + }); logger.info("Peripheral nodes inserted {} -> {}", newNodesIdxCutoff, domainIdToIndex.size()); } @@ -271,14 +199,14 @@ public abstract class RankingAlgorithm { return rank.getRanking(resultCount); } - public TIntList pageRankWithPeripheralNodes(int resultCount, boolean includeErrorStates) { + public TIntList pageRankWithPeripheralNodes(int resultCount) { RankVector rank = new RankVector(1.d / domainsById.size()); int iter_max = 100; for (int i = 0; i < iter_max; i++) { if (i == iter_max-1) { - addPeripheralNodes(includeErrorStates); + addPeripheralNodes(); } RankVector newRank = createNewRankVector(rank); @@ -323,7 +251,7 @@ public abstract class RankingAlgorithm { abstract RankVector createNewRankVector(RankVector rank); - public boolean includeInRanking(DomainData data) { + public boolean includeInRanking(RankingDomainData data) { if (data.isAlias()) return false; if (data.isSpecial()) @@ -445,32 +373,4 @@ public abstract class RankingAlgorithm { } } - @Data - @AllArgsConstructor - static class DomainData { - public final int id; - public final String name; - private int alias; - private EdgeDomainIndexingState state; - public final int knownUrls; - public boolean peripheral; - - public int resolveAlias() { - if (alias == 0) return id; - return alias; - } - - public boolean isAlias() { - return alias != 0; - } - - public boolean isSpecial() { - return EdgeDomainIndexingState.SPECIAL == state; - } - - public boolean isSocialMedia() { - return EdgeDomainIndexingState.SOCIAL_MEDIA == state; - } - } - } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainData.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainData.java new file mode 100644 index 00000000..c29ed704 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainData.java @@ -0,0 +1,33 @@ +package nu.marginalia.util.ranking; + +import lombok.AllArgsConstructor; +import lombok.Data; +import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; + +@Data +@AllArgsConstructor +class RankingDomainData { + public final int id; + public final String name; + private int alias; + private EdgeDomainIndexingState state; + public final int knownUrls; + public boolean peripheral; + + public int resolveAlias() { + if (alias == 0) return id; + return alias; + } + + public boolean isAlias() { + return alias != 0; + } + + public boolean isSpecial() { + return EdgeDomainIndexingState.SPECIAL == state; + } + + public boolean isSocialMedia() { + return EdgeDomainIndexingState.SOCIAL_MEDIA == state; + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainFetcher.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainFetcher.java new file mode 100644 index 00000000..79285a83 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainFetcher.java @@ -0,0 +1,105 @@ +package nu.marginalia.util.ranking; + +import com.google.inject.Inject; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; +import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.sql.SQLException; +import java.util.function.Consumer; +import java.util.function.IntConsumer; + +public class RankingDomainFetcher { + private final HikariDataSource dataSource; + private final EdgeDomainBlacklistImpl blacklist; + private final Logger logger = LoggerFactory.getLogger(getClass()); + + private final boolean getNames = false; + + @Inject + public RankingDomainFetcher(HikariDataSource dataSource, EdgeDomainBlacklistImpl blacklist) { + this.dataSource = dataSource; + this.blacklist = blacklist; + } + + public void getDomains(Consumer consumer) { + String query; + if (getNames) { + query = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID"; + } + else { + query = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID"; + } + + getDomains(query, consumer); + } + + + public void getPeripheralDomains(Consumer consumer) { + String query; + if (getNames) { + query = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID"; + } + else { + query = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID"; + } + + getDomains(query, consumer); + } + + private void getDomains(String query, Consumer consumer) { + try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(query)) { + stmt.setFetchSize(10000); + var rsp = stmt.executeQuery(); + while (rsp.next()) { + int id = rsp.getInt(1); + if (!blacklist.isBlacklisted(id)) { + consumer.accept(new RankingDomainData(id, rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5), false)); + } + } + } + catch (SQLException ex) { + logger.error("Failed to fetch domains", ex); + } + } + + public void eachDomainLink(DomainLinkConsumer consumer) { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK")) + { + stmt.setFetchSize(10000); + + var rsp = stmt.executeQuery(); + + while (rsp.next()) { + int src = rsp.getInt(1); + int dst = rsp.getInt(2); + + consumer.accept(src, dst); + } + } + catch (SQLException ex) { + logger.error("Failed to fetch domain links", ex); + } + } + + public void domainsByPattern(String pattern, IntConsumer idConsumer) { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME LIKE ?")) { + stmt.setString(1, pattern); + var rsp = stmt.executeQuery(); + while (rsp.next()) { + idConsumer.accept(rsp.getInt(1)); + } + } + catch (SQLException ex) { + logger.error("Failed to fetch domains by pattern", ex); + } + } + + public interface DomainLinkConsumer { + void accept(int from, int to); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool.java index 5660d9a7..f80d307f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool.java @@ -3,12 +3,13 @@ package nu.marginalia.util.ranking.tool; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; import nu.marginalia.util.ranking.BuggyStandardPageRank; +import nu.marginalia.util.ranking.RankingDomainFetcher; import nu.marginalia.wmsa.configuration.module.DatabaseModule; +import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; import org.mariadb.jdbc.Driver; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; import java.sql.SQLException; import java.util.HashSet; import java.util.Set; @@ -43,12 +44,14 @@ public class UpdateDomainRanksTool { var uploader = new Thread(() -> uploadThread(conn), "Uploader"); logger.info("Ranking"); - var spr = new BuggyStandardPageRank(new DatabaseModule().provideConnection(),"memex.marginalia.nu"); + var ds = new DatabaseModule().provideConnection(); + var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds)); + var spr = new BuggyStandardPageRank(domains, "memex.marginalia.nu"); rankMax = spr.size()*2; uploader.start(); - spr.pageRankWithPeripheralNodes(rankMax, false).forEach(i -> { + spr.pageRankWithPeripheralNodes(rankMax).forEach(i -> { try { uploadQueue.put(i); } catch (InterruptedException e) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool2.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool2.java index ec48cd17..f46fb390 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool2.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool2.java @@ -3,12 +3,13 @@ package nu.marginalia.util.ranking.tool; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; import nu.marginalia.util.ranking.BetterReversePageRank; +import nu.marginalia.util.ranking.RankingDomainFetcher; import nu.marginalia.wmsa.configuration.module.DatabaseModule; +import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; import org.mariadb.jdbc.Driver; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; import java.sql.SQLException; import java.util.HashSet; import java.util.Set; @@ -45,7 +46,9 @@ public class UpdateDomainRanksTool2 { logger.info("Ranking"); // "memex.marginalia.nu", "wiki.xxiivv.com", "bikobatanari.art", "sadgrl.online", "lileks.com", // "www.rep.routledge.com", "www.personal.kent.edu", "xroads.virginia.edu", "classics.mit.edu", "faculty.washington.edu", "monadnock.net" - var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org"); + var ds = new DatabaseModule().provideConnection(); + var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds)); + var rpr = new BetterReversePageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org"); // var rpr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), "%edu"); // var spr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), "memex.marginalia.nu"); @@ -58,7 +61,7 @@ public class UpdateDomainRanksTool2 { rankMax = rpr.size(); - rpr.pageRankWithPeripheralNodes(rankMax, false).forEach(i -> { + rpr.pageRankWithPeripheralNodes(rankMax).forEach(i -> { try { uploadQueue.put(i); } catch (InterruptedException e) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorPageRankMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorPageRankMain.java index 53997194..ea1946fc 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorPageRankMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorPageRankMain.java @@ -6,6 +6,7 @@ import com.google.common.hash.Hashing; import com.google.gson.Gson; import com.google.gson.GsonBuilder; import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.util.ranking.RankingDomainFetcher; import nu.marginalia.wmsa.configuration.module.DatabaseModule; import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; @@ -73,10 +74,12 @@ public class CrawlJobExtractorPageRankMain { Gson gson = new GsonBuilder().create(); - var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org"); + var ds = new DatabaseModule().provideConnection(); + var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds)); + var rpr = new BetterReversePageRank(domains, "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org"); rpr.setMaxKnownUrls(750); - var targetDomainIds = rpr.pageRankWithPeripheralNodes(rpr.size(), false); + var targetDomainIds = rpr.pageRankWithPeripheralNodes(rpr.size()); try (var out = new PrintWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outFile.toFile()))))) { final var extractor = new CrawlJobExtractorPageRankMain(new DatabaseModule().provideConnection()); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDao.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDao.java index c87088f6..2f309b07 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDao.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDao.java @@ -13,33 +13,14 @@ import java.util.Optional; @ImplementedBy(EdgeDataStoreDaoImpl.class) public interface EdgeDataStoreDao { - boolean isBlacklisted(EdgeDomain domain); - EdgeId getDomainId(EdgeDomain domain); List getDomainNeighborsAdjacent(EdgeId domainId, EdgeDomainBlacklist backlist, int count); + List getRandomDomains(int count, EdgeDomainBlacklist backlist); List getUrlDetailsMulti(List> ids); - EdgeDomain getDomain(EdgeId id); - Optional> resolveAmbiguousDomain(String name); - - - int getPagesKnown(EdgeId domainId); - int getPagesVisited(EdgeId domainId); - int getPagesIndexed(EdgeId domainId); - - int getIncomingLinks(EdgeId domainId); - int getOutboundLinks(EdgeId domainId); - - double getDomainQuality(EdgeId domainId); - - EdgeDomainIndexingState getDomainState(EdgeId domainId); - - List getLinkingDomains(EdgeId domainId); - - double getRank(EdgeId domainId); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java index 2519a745..430e7603 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java @@ -33,7 +33,6 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { private final Cache> urlIdCache = CacheBuilder.newBuilder().maximumSize(100_000).build(); private final Cache> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build(); - private static final String DEFAULT_PROTOCOL = "http"; public static double QUALITY_LOWER_BOUND_CUTOFF = -15.; @Inject public EdgeDataStoreDaoImpl(HikariDataSource dataSource) @@ -48,23 +47,6 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { domainIdCache.invalidateAll(); } - @SneakyThrows - @Override - public boolean isBlacklisted(EdgeDomain domain) { - - try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN_BLACKLIST WHERE URL_DOMAIN=?")) { - stmt.setString(1, domain.domain); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return true; - } else { - return false; - } - } - } - } - @SneakyThrows @Override public EdgeId getDomainId(EdgeDomain domain) { @@ -108,13 +90,12 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { try (var stmt = connection.prepareStatement( """ - SELECT ID, URL, + SELECT ID, URL, TITLE, DESCRIPTION, - WORDS_TOTAL, FORMAT, FEATURES, + WORDS_TOTAL, FORMAT, FEATURES, IP, DOMAIN_STATE, DATA_HASH FROM EC_URL_VIEW WHERE ID IN """ + idString)) { -// "SELECT ID,URL_PROTO,URL_DOMAIN,URL_PORT,URL_PATH,TITLE,DESCRIPTION,URL_QUALITY_MEASURE,DOMAIN_QUALITY_MEASURE,IFNULL(EC_DOMAIN_LINK_AGGREGATE.LINKS,1),WORDS_TOTAL,FORMAT,FEATURES,\"\",QUALITY_RAW,DOMAIN_STATE,DATA_HASH FROM EC_URL_VIEW LEFT JOIN EC_DOMAIN_LINK_AGGREGATE ON EC_DOMAIN_LINK_AGGREGATE.DOMAIN_ID=EC_URL_VIEW.DOMAIN_ID WHERE ID IN " + idString)) { stmt.setFetchSize(ids.size()); var rsp = stmt.executeQuery(); @@ -125,7 +106,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { rsp.getString(4), // description -5, // quality rsp.getInt(5), // wordsTotal - rsp.getString(6), // foramt + rsp.getString(6), // format rsp.getInt(7), // features rsp.getString(8), // ip EdgeDomainIndexingState.valueOf(rsp.getString(9)), // domainState @@ -179,9 +160,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { String domain = rsp.getString(2); if (!blacklist.isBlacklisted(id)) { - var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/"); - - domains.add(new BrowseResult(url, id)); + domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id)); } } } @@ -210,9 +189,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { String domain = rsp.getString(2); if (!blacklist.isBlacklisted(id)) { - var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/"); - - domains.add(new BrowseResult(url, id)); + domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id)); } } } @@ -244,9 +221,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { String domain = rsp.getString(2); if (!blacklist.isBlacklisted(id)) { - var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/"); - - domains.add(new BrowseResult(url, id)); + domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id)); } } } @@ -262,7 +237,15 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { @Override public List getRandomDomains(int count, EdgeDomainBlacklist blacklist) { - final String q = "SELECT DOMAIN_ID,DOMAIN_NAME FROM EC_RANDOM_DOMAINS INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE STATE<2 AND DOMAIN_ALIAS IS NULL ORDER BY RAND() LIMIT ?"; + final String q = """ + SELECT DOMAIN_ID, DOMAIN_NAME + FROM EC_RANDOM_DOMAINS + INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID + WHERE STATE<2 + AND DOMAIN_ALIAS IS NULL + ORDER BY RAND() + LIMIT ? + """; List domains = new ArrayList<>(count); try (var conn = dataSource.getConnection()) { try (var stmt = conn.prepareStatement(q)) { @@ -273,9 +256,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { String domain = rsp.getString(2); if (!blacklist.isBlacklisted(id)) { - var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/"); - - domains.add(new BrowseResult(url, id)); + domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id)); } } } @@ -302,223 +283,4 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { } } - - @Override - public Optional> resolveAmbiguousDomain(String name) { - try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) { - stmt.setString(1, name); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return Optional.of(new EdgeId<>(rsp.getInt(1))); - } - } - - try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) { - stmt.setString(1, "https://"+name); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return Optional.of(new EdgeId<>(rsp.getInt(1))); - } - } - - try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) { - stmt.setString(1, "http://"+name); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return Optional.of(new EdgeId<>(rsp.getInt(1))); - } - } - - try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) { - stmt.setString(1, "https://www."+name); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return Optional.of(new EdgeId<>(rsp.getInt(1))); - } - } - - try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) { - stmt.setString(1, "http://www."+name); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return Optional.of(new EdgeId<>(rsp.getInt(1))); - } - } - - } catch (SQLException throwables) { - logger.info("Could not resolve domain id for {}", name); - } - - return Optional.empty(); - } - - @SneakyThrows - @Override - public int getPagesKnown(EdgeId domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT KNOWN_URLS FROM DOMAIN_METADATA WHERE ID=?")) { - stmt.setInt(1, domainId.getId()); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return rsp.getInt(1); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - return 0; - } - } - - @SneakyThrows - @Override - public int getPagesVisited(EdgeId domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT VISITED_URLS FROM DOMAIN_METADATA WHERE ID=?")) { - stmt.setInt(1, domainId.getId()); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return rsp.getInt(1); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - return 0; - } - } - - - @SneakyThrows - @Override - public int getPagesIndexed(EdgeId domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT GOOD_URLS FROM DOMAIN_METADATA WHERE ID=?")) { - stmt.setInt(1, domainId.getId()); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return rsp.getInt(1); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - return 0; - } - } - - @SneakyThrows - @Override - public int getIncomingLinks(EdgeId domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE DEST_DOMAIN_ID=?")) { - stmt.setInt(1, domainId.getId()); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return rsp.getInt(1); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - return 0; - } - } - @SneakyThrows - @Override - public int getOutboundLinks(EdgeId domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=?")) { - stmt.setInt(1, domainId.getId()); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return rsp.getInt(1); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - return 0; - } - } - - @SneakyThrows - @Override - public double getDomainQuality(EdgeId domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT QUALITY FROM EC_DOMAIN WHERE ID=?")) { - stmt.setInt(1, domainId.getId()); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return rsp.getDouble(1); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - return -5; - } - } - - @Override - public EdgeDomainIndexingState getDomainState(EdgeId domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT STATE FROM EC_DOMAIN WHERE ID=?")) { - stmt.setInt(1, domainId.getId()); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return EdgeDomainIndexingState.fromCode(rsp.getInt(1)); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - } catch (SQLException throwables) { - throwables.printStackTrace(); - } - return EdgeDomainIndexingState.ERROR; - } - - @Override - public List getLinkingDomains(EdgeId domainId) { - try (var connection = dataSource.getConnection()) { - List results = new ArrayList<>(25); - try (var stmt = connection.prepareStatement("SELECT SOURCE_URL FROM EC_RELATED_LINKS_VIEW WHERE DEST_DOMAIN_ID=? ORDER BY SOURCE_DOMAIN_ID LIMIT 25")) { - stmt.setInt(1, domainId.getId()); - var rsp = stmt.executeQuery(); - while (rsp.next()) { - results.add(new EdgeDomain(rsp.getString(1))); - } - return results; - } catch (Exception ex) { - logger.error("DB error", ex); - } - - } catch (SQLException throwables) { - throwables.printStackTrace(); - } - return Collections.emptyList(); - } - - @Override - public double getRank(EdgeId domainId) { - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT IFNULL(RANK, 1) FROM EC_DOMAIN WHERE ID=?")) { - stmt.setInt(1, domainId.getId()); - var rsp = stmt.executeQuery(); - if (rsp.next()) { - return rsp.getDouble(1); - } - } catch (Exception ex) { - logger.error("DB error", ex); - } - } catch (SQLException throwables) { - throwables.printStackTrace(); - } - return 1; - } - - } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexDao.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexDao.java index c42fcf53..a12b249e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexDao.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexDao.java @@ -10,7 +10,7 @@ import lombok.SneakyThrows; import nu.marginalia.util.ranking.BetterReversePageRank; import nu.marginalia.util.ranking.BetterStandardPageRank; import nu.marginalia.util.ranking.BuggyStandardPageRank; -import nu.marginalia.wmsa.configuration.module.DatabaseModule; +import nu.marginalia.util.ranking.RankingDomainFetcher; import nu.marginalia.wmsa.edge.index.model.RankingSettings; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -18,14 +18,17 @@ import org.slf4j.LoggerFactory; @Singleton public class SearchIndexDao { private final HikariDataSource dataSource; + private RankingDomainFetcher rankingDomains; private final RankingSettings rankingSettings; private final Logger logger = LoggerFactory.getLogger(getClass()); @Inject public SearchIndexDao(HikariDataSource dataSource, + RankingDomainFetcher rankingDomains, RankingSettings rankingSettings) { this.dataSource = dataSource; + this.rankingDomains = rankingDomains; this.rankingSettings = rankingSettings; logger.info("SearchIndexDao ranking settings = {}", rankingSettings); } @@ -63,36 +66,36 @@ public class SearchIndexDao { @SneakyThrows public TIntList getRetroDomains() { - var spr = new BetterStandardPageRank(dataSource,rankingSettings.retro.toArray(String[]::new)); - return spr.pageRankWithPeripheralNodes(spr.size()/2, false); + var spr = new BetterStandardPageRank(rankingDomains,rankingSettings.retro.toArray(String[]::new)); + return spr.pageRankWithPeripheralNodes(spr.size()/2); } @SneakyThrows public TIntList getSmallWebDomains() { - var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), rankingSettings.small.toArray(String[]::new)); + var rpr = new BetterReversePageRank(rankingDomains, rankingSettings.small.toArray(String[]::new)); rpr.setMaxKnownUrls(750); - return rpr.pageRankWithPeripheralNodes(rpr.size(), false); + return rpr.pageRankWithPeripheralNodes(rpr.size()); } @SneakyThrows public TIntList getAcademiaDomains() { - var spr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), rankingSettings.academia.toArray(String[]::new)); - return spr.pageRankWithPeripheralNodes(spr.size()/2, false); + var spr = new BetterStandardPageRank(rankingDomains, rankingSettings.academia.toArray(String[]::new)); + return spr.pageRankWithPeripheralNodes(spr.size()/2); } @SneakyThrows public TIntList getStandardDomains() { - var spr = new BuggyStandardPageRank(dataSource,rankingSettings.standard.toArray(String[]::new)); - return spr.pageRankWithPeripheralNodes(spr.size()/2, false); + var spr = new BuggyStandardPageRank(rankingDomains,rankingSettings.standard.toArray(String[]::new)); + return spr.pageRankWithPeripheralNodes(spr.size()/2); } @SneakyThrows public TIntList getSpecialDomains() { TIntArrayList results = new TIntArrayList(); try (var connection = dataSource.getConnection(); - var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE STATE=2") + var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE STATE='SPECIAL'") ) { var rs = stmt.executeQuery(); while (rs.next()) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java index cb778947..8daf790a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java @@ -54,8 +54,11 @@ public class EdgeDomain implements WideHashable { } } } + } - + public EdgeUrl toRootUrl() { + // Set default protocol to http, as most https websites redirect http->https, but few http websites redirect https->http + return new EdgeUrl("http", this, null, "/"); } public String toString() { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java index 54179d64..1d3fd2b2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java @@ -1,24 +1,43 @@ package nu.marginalia.wmsa.edge.search.siteinfo; +import com.zaxxer.hikari.HikariDataSource; +import lombok.SneakyThrows; import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao; +import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDaoImpl; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeId; import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; import nu.marginalia.wmsa.edge.search.model.DomainInformation; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import javax.inject.Inject; import javax.inject.Singleton; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.Collections; import java.util.List; import java.util.Optional; +/* + TODO: This class needs to be refactored, a lot of + these SQL queries are redundant and can be + collapsed into one single query that fetches + all the information + */ @Singleton public class DomainInformationService { - private EdgeDataStoreDao dataStore; + private EdgeDataStoreDaoImpl dataStoreDao; + private HikariDataSource dataSource; + private final Logger logger = LoggerFactory.getLogger(getClass()); @Inject - public DomainInformationService(EdgeDataStoreDao dataStore) { - this.dataStore = dataStore; + public DomainInformationService( + EdgeDataStoreDaoImpl dataStoreDao, + HikariDataSource dataSource) { + this.dataStoreDao = dataStoreDao; + this.dataSource = dataSource; } @@ -28,29 +47,29 @@ public class DomainInformationService { if (domainId == null) { return Optional.empty(); } - EdgeDomain domain = dataStore.getDomain(domainId); + EdgeDomain domain = dataStoreDao.getDomain(domainId); - boolean blacklisted = dataStore.isBlacklisted(domain); - int pagesKnown = dataStore.getPagesKnown(domainId); - int pagesVisited = dataStore.getPagesVisited(domainId); - int pagesIndexed = dataStore.getPagesIndexed(domainId); - int incomingLinks = dataStore.getIncomingLinks(domainId); - int outboundLinks = dataStore.getOutboundLinks(domainId); - double rank = Math.round(10000.0*(1.0-dataStore.getRank(domainId)))/100; - EdgeDomainIndexingState state = dataStore.getDomainState(domainId); - double nominalQuality = Math.round(100*100*Math.exp(dataStore.getDomainQuality(domainId)))/100.; - List linkingDomains = dataStore.getLinkingDomains(domainId); + boolean blacklisted = isBlacklisted(domain); + int pagesKnown = getPagesKnown(domainId); + int pagesVisited = getPagesVisited(domainId); + int pagesIndexed = getPagesIndexed(domainId); + int incomingLinks = getIncomingLinks(domainId); + int outboundLinks = getOutboundLinks(domainId); + double rank = Math.round(10000.0*(1.0-getRank(domainId)))/100; + EdgeDomainIndexingState state = getDomainState(domainId); + double nominalQuality = Math.round(100*100*Math.exp(getDomainQuality(domainId)))/100.; + List linkingDomains = getLinkingDomains(domainId); return Optional.of(new DomainInformation(domain, blacklisted, pagesKnown, pagesVisited, pagesIndexed, incomingLinks, outboundLinks, nominalQuality, rank, state, linkingDomains)); } private EdgeId getDomainFromPartial(String site) { try { - return dataStore.getDomainId(new EdgeDomain(site)); + return dataStoreDao.getDomainId(new EdgeDomain(site)); } catch (Exception ex) { try { - return dataStore.getDomainId(new EdgeDomain(site)); + return dataStoreDao.getDomainId(new EdgeDomain(site)); } catch (Exception ex2) { return null; @@ -58,4 +77,178 @@ public class DomainInformationService { } } + + @SneakyThrows + public boolean isBlacklisted(EdgeDomain domain) { + + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN_BLACKLIST WHERE URL_DOMAIN=?")) { + stmt.setString(1, domain.domain); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return true; + } else { + return false; + } + } + } + } + + @SneakyThrows + public int getPagesKnown(EdgeId domainId) { + try (var connection = dataSource.getConnection()) { + + try (var stmt = connection.prepareStatement("SELECT KNOWN_URLS FROM DOMAIN_METADATA WHERE ID=?")) { + stmt.setInt(1, domainId.getId()); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return rsp.getInt(1); + } + } catch (Exception ex) { + logger.error("DB error", ex); + } + return 0; + } + } + + @SneakyThrows + public int getPagesVisited(EdgeId domainId) { + try (var connection = dataSource.getConnection()) { + + try (var stmt = connection.prepareStatement("SELECT VISITED_URLS FROM DOMAIN_METADATA WHERE ID=?")) { + stmt.setInt(1, domainId.getId()); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return rsp.getInt(1); + } + } catch (Exception ex) { + logger.error("DB error", ex); + } + return 0; + } + } + + + @SneakyThrows + public int getPagesIndexed(EdgeId domainId) { + try (var connection = dataSource.getConnection()) { + + try (var stmt = connection.prepareStatement("SELECT GOOD_URLS FROM DOMAIN_METADATA WHERE ID=?")) { + stmt.setInt(1, domainId.getId()); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return rsp.getInt(1); + } + } catch (Exception ex) { + logger.error("DB error", ex); + } + return 0; + } + } + + @SneakyThrows + public int getIncomingLinks(EdgeId domainId) { + try (var connection = dataSource.getConnection()) { + + try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE DEST_DOMAIN_ID=?")) { + stmt.setInt(1, domainId.getId()); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return rsp.getInt(1); + } + } catch (Exception ex) { + logger.error("DB error", ex); + } + return 0; + } + } + @SneakyThrows + public int getOutboundLinks(EdgeId domainId) { + try (var connection = dataSource.getConnection()) { + + try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=?")) { + stmt.setInt(1, domainId.getId()); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return rsp.getInt(1); + } + } catch (Exception ex) { + logger.error("DB error", ex); + } + return 0; + } + } + + @SneakyThrows + public double getDomainQuality(EdgeId domainId) { + try (var connection = dataSource.getConnection()) { + + try (var stmt = connection.prepareStatement("SELECT QUALITY FROM EC_DOMAIN WHERE ID=?")) { + stmt.setInt(1, domainId.getId()); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return rsp.getDouble(1); + } + } catch (Exception ex) { + logger.error("DB error", ex); + } + return -5; + } + } + + public EdgeDomainIndexingState getDomainState(EdgeId domainId) { + try (var connection = dataSource.getConnection()) { + + try (var stmt = connection.prepareStatement("SELECT STATE FROM EC_DOMAIN WHERE ID=?")) { + stmt.setInt(1, domainId.getId()); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return EdgeDomainIndexingState.fromCode(rsp.getInt(1)); + } + } catch (Exception ex) { + logger.error("DB error", ex); + } + } catch (SQLException throwables) { + throwables.printStackTrace(); + } + return EdgeDomainIndexingState.ERROR; + } + + public List getLinkingDomains(EdgeId domainId) { + try (var connection = dataSource.getConnection()) { + List results = new ArrayList<>(25); + try (var stmt = connection.prepareStatement("SELECT SOURCE_URL FROM EC_RELATED_LINKS_VIEW WHERE DEST_DOMAIN_ID=? ORDER BY SOURCE_DOMAIN_ID LIMIT 25")) { + stmt.setInt(1, domainId.getId()); + var rsp = stmt.executeQuery(); + while (rsp.next()) { + results.add(new EdgeDomain(rsp.getString(1))); + } + return results; + } catch (Exception ex) { + logger.error("DB error", ex); + } + + } catch (SQLException throwables) { + throwables.printStackTrace(); + } + return Collections.emptyList(); + } + + public double getRank(EdgeId domainId) { + try (var connection = dataSource.getConnection()) { + + try (var stmt = connection.prepareStatement("SELECT IFNULL(RANK, 1) FROM EC_DOMAIN WHERE ID=?")) { + stmt.setInt(1, domainId.getId()); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + return rsp.getDouble(1); + } + } catch (Exception ex) { + logger.error("DB error", ex); + } + } catch (SQLException throwables) { + throwables.printStackTrace(); + } + return 1; + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexMergerMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexMergerMain.java index bb946238..1251f626 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexMergerMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexMergerMain.java @@ -3,6 +3,7 @@ package nu.marginalia.wmsa.edge.tools; import com.google.inject.Inject; import gnu.trove.set.hash.TIntHashSet; import lombok.SneakyThrows; +import nu.marginalia.util.ranking.RankingDomainFetcher; import nu.marginalia.wmsa.configuration.module.DatabaseModule; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; @@ -59,7 +60,9 @@ public class IndexMergerMain { } var hikari = new DatabaseModule().provideConnection(); - var partitioner = new SearchIndexPartitioner(new SearchIndexDao(hikari, new RankingSettings())); + var ds = new DatabaseModule().provideConnection(); + var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds)); + var partitioner = new SearchIndexPartitioner(new SearchIndexDao(hikari, domains, new RankingSettings())); var blacklist = new EdgeDomainBlacklistImpl(hikari); new IndexMergerMain(file1, file2, outputFile, partitioner, blacklist);