From bcadfc965d36713b04abb33f1129c1d90d92d6d4 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 12 Feb 2023 10:28:53 +0100 Subject: [PATCH] Use new cosine-similarity ranking algorithm --- .../ranking/data/RankingDomainFetcher.java | 2 + ...RankingDomainFetcherForSimilarityData.java | 54 +++++++- .../index/svc/EdgeIndexSearchSetsService.java | 120 +++++++++++------- 3 files changed, 122 insertions(+), 54 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/data/RankingDomainFetcher.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/data/RankingDomainFetcher.java index 397d9fb5..ff2b7e18 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/data/RankingDomainFetcher.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/data/RankingDomainFetcher.java @@ -1,6 +1,7 @@ package nu.marginalia.wmsa.edge.index.ranking.data; import com.google.inject.Inject; +import com.google.inject.Singleton; import com.zaxxer.hikari.HikariDataSource; import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl; import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; @@ -11,6 +12,7 @@ import java.sql.SQLException; import java.util.function.Consumer; import java.util.function.IntConsumer; +@Singleton public class RankingDomainFetcher { protected final HikariDataSource dataSource; protected final EdgeDomainBlacklistImpl blacklist; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/data/RankingDomainFetcherForSimilarityData.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/data/RankingDomainFetcherForSimilarityData.java index dbbede55..dddaeebb 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/data/RankingDomainFetcherForSimilarityData.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/data/RankingDomainFetcherForSimilarityData.java @@ -1,14 +1,41 @@ package nu.marginalia.wmsa.edge.index.ranking.data; +import com.google.inject.Inject; +import com.google.inject.Singleton; import com.zaxxer.hikari.HikariDataSource; import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl; +import org.slf4j.LoggerFactory; import java.sql.SQLException; import java.util.function.Consumer; +@Singleton public class RankingDomainFetcherForSimilarityData extends RankingDomainFetcher { + final boolean hasData; + + @Inject public RankingDomainFetcherForSimilarityData(HikariDataSource dataSource, EdgeDomainBlacklistImpl blacklist) { super(dataSource, blacklist); + + hasData = isDomainNeighborTablePopulated(dataSource); + } + + private static boolean isDomainNeighborTablePopulated(HikariDataSource dataSource) { + try (var conn = dataSource.getConnection(); + var stmt = conn.createStatement(); + var rs = stmt.executeQuery("SELECT DOMAIN_ID FROM EC_DOMAIN_NEIGHBORS_2 LIMIT 1")) { + + return rs.next(); + } + catch (SQLException ex) { + LoggerFactory + .getLogger(RankingDomainFetcherForSimilarityData.class) + .error("Failed to get count from EC_DOMAIN_NEIGHBORS_2", ex); + return false; + } + } + public boolean hasData() { + return hasData; } public void eachDomainLink(DomainLinkConsumer consumer) { @@ -45,19 +72,32 @@ public class RankingDomainFetcherForSimilarityData extends RankingDomainFetcher // HAVING COUNT(SOURCE_DOMAIN_ID)>5 // """; - String query = - """ - SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0) - FROM EC_DOMAIN - INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID - GROUP BY EC_DOMAIN.ID - """; + String query; + if (getNames) { + query = + """ + SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0) + FROM EC_DOMAIN + INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID + GROUP BY EC_DOMAIN.ID + """; + } + else { + query = + """ + SELECT EC_DOMAIN.ID,"",DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0) + FROM EC_DOMAIN + INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID + GROUP BY EC_DOMAIN.ID + """; + } getDomains(query, consumer); } public void getPeripheralDomains(Consumer consumer) { + // This is not relevant for this variant of pagerank since it is bidirectional } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexSearchSetsService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexSearchSetsService.java index 68a2a9c8..834caf67 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexSearchSetsService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexSearchSetsService.java @@ -5,35 +5,56 @@ import com.google.inject.Singleton; import lombok.SneakyThrows; import nu.marginalia.wmsa.edge.index.ranking.ReversePageRank; import nu.marginalia.wmsa.edge.index.ranking.StandardPageRank; +import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultHashMapAccumulator; import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher; import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultBitSetAccumulator; -import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultHashMapAccumulator; import nu.marginalia.wmsa.edge.index.IndexServicesFactory; import nu.marginalia.wmsa.edge.index.model.RankingSettings; import nu.marginalia.wmsa.edge.index.postings.DomainRankings; +import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcherForSimilarityData; import nu.marginalia.wmsa.edge.index.svc.searchset.RankingSearchSet; import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSet; import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetAny; import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetIdentifier; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.IOException; @Singleton public class EdgeIndexSearchSetsService { + private final Logger logger = LoggerFactory.getLogger(getClass()); private final RankingDomainFetcher rankingDomains; + private final RankingDomainFetcher similarityDomains; private final RankingSettings rankingSettings; - private final SearchSet anySet = new SearchSetAny(); + + + // Below are binary indices that are used to constrain a search private volatile RankingSearchSet retroSet; private volatile RankingSearchSet smallWebSet; private volatile RankingSearchSet academiaSet; + private final SearchSet anySet = new SearchSetAny(); + // The ranking value of the domains used in sorting the domains private volatile DomainRankings domainRankings = new DomainRankings(); @Inject public EdgeIndexSearchSetsService(RankingDomainFetcher rankingDomains, + RankingDomainFetcherForSimilarityData similarityDomains, RankingSettings rankingSettings, IndexServicesFactory servicesFactory) throws IOException { + this.rankingDomains = rankingDomains; + + if (similarityDomains.hasData()) { + this.similarityDomains = similarityDomains; + } + else { + // on test environments the cosine similarity graph may not be present + logger.info("Domain similarity is not present, falling back on link graph"); + this.similarityDomains = rankingDomains; + } + this.rankingSettings = rankingSettings; smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, servicesFactory.getSearchSetsBase().resolve("small-web.dat")); @@ -41,51 +62,6 @@ public class EdgeIndexSearchSetsService { retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, servicesFactory.getSearchSetsBase().resolve("retro.dat")); } - public void recalculateAll() { - updateAcademiaDomains(); - updateRetroDomains(); - updateSmallWebDomains(); - } - - @SneakyThrows - public void updateRetroDomains() { - var spr = new StandardPageRank(rankingDomains,rankingSettings.retro.toArray(String[]::new)); - var data = spr.pageRankWithPeripheralNodes(spr.size() / 2, RankingResultBitSetAccumulator::new); - - synchronized (this) { - retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, retroSet.source, data); - retroSet.write(); - } - - var ranks = spr.pageRankWithPeripheralNodes(spr.size() / 2, () -> new RankingResultHashMapAccumulator(100_000)); - synchronized (this) { - domainRankings = new DomainRankings(ranks); - } - } - - @SneakyThrows - public void updateSmallWebDomains() { - var rpr = new ReversePageRank(rankingDomains, rankingSettings.small.toArray(String[]::new)); - rpr.setMaxKnownUrls(750); - var data = rpr.pageRankWithPeripheralNodes(rpr.size(), RankingResultBitSetAccumulator::new); - - synchronized (this) { - smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, smallWebSet.source, data); - smallWebSet.write(); - } - } - - @SneakyThrows - public void updateAcademiaDomains() { - var spr = new StandardPageRank(rankingDomains, rankingSettings.academia.toArray(String[]::new)); - var data = spr.pageRankWithPeripheralNodes(spr.size()/2, RankingResultBitSetAccumulator::new); - - synchronized (this) { - academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, academiaSet.source, data); - academiaSet.write(); - } - } - public DomainRankings getDomainRankings() { return domainRankings; } @@ -101,4 +77,54 @@ public class EdgeIndexSearchSetsService { case SMALLWEB -> smallWebSet; }; } + + public void recalculateAll() { + updateAcademiaDomainsSet(); + updateRetroDomainsSet(); + updateSmallWebDomainsSet(); + updateDomainRankings(); + } + + private void updateDomainRankings() { + var spr = new StandardPageRank(similarityDomains, rankingSettings.retro.toArray(String[]::new)); + + var ranks = spr.pageRankWithPeripheralNodes(spr.size() / 2, () -> new RankingResultHashMapAccumulator(100_000)); + synchronized (this) { + domainRankings = new DomainRankings(ranks); + } + } + + @SneakyThrows + public void updateRetroDomainsSet() { + var spr = new StandardPageRank(similarityDomains, rankingSettings.retro.toArray(String[]::new)); + var data = spr.pageRankWithPeripheralNodes(spr.size() / 2, RankingResultBitSetAccumulator::new); + + synchronized (this) { + retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, retroSet.source, data); + retroSet.write(); + } + } + + @SneakyThrows + public void updateSmallWebDomainsSet() { + var rpr = new ReversePageRank(similarityDomains, rankingSettings.small.toArray(String[]::new)); + rpr.setMaxKnownUrls(750); + var data = rpr.pageRankWithPeripheralNodes(rpr.size(), RankingResultBitSetAccumulator::new); + + synchronized (this) { + smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, smallWebSet.source, data); + smallWebSet.write(); + } + } + + @SneakyThrows + public void updateAcademiaDomainsSet() { + var spr = new StandardPageRank(similarityDomains, rankingSettings.academia.toArray(String[]::new)); + var data = spr.pageRankWithPeripheralNodes(spr.size()/2, RankingResultBitSetAccumulator::new); + + synchronized (this) { + academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, academiaSet.source, data); + academiaSet.write(); + } + } }