Use new cosine-similarity ranking algorithm

This commit is contained in:
Viktor Lofgren 2023-02-12 10:28:53 +01:00
parent 3e1297064c
commit bcadfc965d
3 changed files with 122 additions and 54 deletions

View File

@ -1,6 +1,7 @@
package nu.marginalia.wmsa.edge.index.ranking.data;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
@ -11,6 +12,7 @@ import java.sql.SQLException;
import java.util.function.Consumer;
import java.util.function.IntConsumer;
@Singleton
public class RankingDomainFetcher {
protected final HikariDataSource dataSource;
protected final EdgeDomainBlacklistImpl blacklist;

View File

@ -1,14 +1,41 @@
package nu.marginalia.wmsa.edge.index.ranking.data;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
import org.slf4j.LoggerFactory;
import java.sql.SQLException;
import java.util.function.Consumer;
@Singleton
public class RankingDomainFetcherForSimilarityData extends RankingDomainFetcher {
final boolean hasData;
/**
 * Creates the fetcher and, as part of construction, probes the database to
 * record whether EC_DOMAIN_NEIGHBORS_2 contains at least one row.
 *
 * @param dataSource connection pool passed to the superclass and used for the probe
 * @param blacklist  domain blacklist passed to the superclass
 */
@Inject
public RankingDomainFetcherForSimilarityData(HikariDataSource dataSource, EdgeDomainBlacklistImpl blacklist) {
    super(dataSource, blacklist);

    // Evaluated once; the flag is final and never refreshed afterwards
    this.hasData = isDomainNeighborTablePopulated(dataSource);
}
/**
 * Checks whether EC_DOMAIN_NEIGHBORS_2 yields at least one row.
 *
 * @param dataSource pool from which a connection is borrowed for the probe
 * @return {@code true} if the probe query returns a row; {@code false} if it
 *         returns none or if an {@link SQLException} is raised (the exception
 *         is logged at error level and not propagated)
 */
private static boolean isDomainNeighborTablePopulated(HikariDataSource dataSource) {
    final String probeSql = "SELECT DOMAIN_ID FROM EC_DOMAIN_NEIGHBORS_2 LIMIT 1";

    try (var connection = dataSource.getConnection();
         var statement = connection.createStatement();
         var rows = statement.executeQuery(probeSql)) {
        // Only the presence of a row matters, not its value
        return rows.next();
    }
    catch (SQLException ex) {
        var log = LoggerFactory.getLogger(RankingDomainFetcherForSimilarityData.class);
        log.error("Failed to get count from EC_DOMAIN_NEIGHBORS_2", ex);
        return false;
    }
}
/**
 * @return the value of the {@code hasData} flag held by this fetcher
 */
public boolean hasData() {
    return this.hasData;
}
public void eachDomainLink(DomainLinkConsumer consumer) {
@ -45,19 +72,32 @@ public class RankingDomainFetcherForSimilarityData extends RankingDomainFetcher
// HAVING COUNT(SOURCE_DOMAIN_ID)>5
// """;
String query =
"""
SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0)
FROM EC_DOMAIN
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
GROUP BY EC_DOMAIN.ID
""";
String query;
if (getNames) {
query =
"""
SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0)
FROM EC_DOMAIN
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
GROUP BY EC_DOMAIN.ID
""";
}
else {
query =
"""
SELECT EC_DOMAIN.ID,"",DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0)
FROM EC_DOMAIN
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
GROUP BY EC_DOMAIN.ID
""";
}
getDomains(query, consumer);
}
/**
 * Intentionally does nothing: the supplied consumer is never invoked.
 * <p>
 * NOTE(review): presumably this replaces a superclass implementation in
 * {@code RankingDomainFetcher}; if so, an {@code @Override} annotation would
 * make that explicit -- confirm against the parent class.
 *
 * @param consumer ignored
 */
public void getPeripheralDomains(Consumer<RankingDomainData> consumer) {
// This is not relevant for this variant of pagerank since it is bidirectional
}
}

View File

@ -5,35 +5,56 @@ import com.google.inject.Singleton;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.index.ranking.ReversePageRank;
import nu.marginalia.wmsa.edge.index.ranking.StandardPageRank;
import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultHashMapAccumulator;
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher;
import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultBitSetAccumulator;
import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultHashMapAccumulator;
import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
import nu.marginalia.wmsa.edge.index.model.RankingSettings;
import nu.marginalia.wmsa.edge.index.postings.DomainRankings;
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcherForSimilarityData;
import nu.marginalia.wmsa.edge.index.svc.searchset.RankingSearchSet;
import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSet;
import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetAny;
import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetIdentifier;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
@Singleton
public class EdgeIndexSearchSetsService {
private final Logger logger = LoggerFactory.getLogger(getClass());
private final RankingDomainFetcher rankingDomains;
private final RankingDomainFetcher similarityDomains;
private final RankingSettings rankingSettings;
private final SearchSet anySet = new SearchSetAny();
// Below are binary indices that are used to constrain a search
private volatile RankingSearchSet retroSet;
private volatile RankingSearchSet smallWebSet;
private volatile RankingSearchSet academiaSet;
private final SearchSet anySet = new SearchSetAny();
// The ranking value of the domains used in sorting the domains
private volatile DomainRankings domainRankings = new DomainRankings();
@Inject
public EdgeIndexSearchSetsService(RankingDomainFetcher rankingDomains,
RankingDomainFetcherForSimilarityData similarityDomains,
RankingSettings rankingSettings,
IndexServicesFactory servicesFactory) throws IOException {
this.rankingDomains = rankingDomains;
if (similarityDomains.hasData()) {
this.similarityDomains = similarityDomains;
}
else {
// on test environments the cosine similarity graph may not be present
logger.info("Domain similarity is not present, falling back on link graph");
this.similarityDomains = rankingDomains;
}
this.rankingSettings = rankingSettings;
smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, servicesFactory.getSearchSetsBase().resolve("small-web.dat"));
@ -41,51 +62,6 @@ public class EdgeIndexSearchSetsService {
retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, servicesFactory.getSearchSetsBase().resolve("retro.dat"));
}
/**
 * Recomputes the academia, retro and small-web search sets, in that order.
 * The retro update additionally replaces the domain rankings.
 */
public void recalculateAll() {
updateAcademiaDomains();
updateRetroDomains();
updateSmallWebDomains();
}
/**
 * Runs a {@link StandardPageRank} over {@code rankingDomains}, configured with
 * the {@code rankingSettings.retro} entries, two times: first to rebuild and
 * write out the retro search set, then to replace the domain rankings.
 * Each field replacement happens while holding this object's monitor.
 */
@SneakyThrows
public void updateRetroDomains() {
    var retroOrigins = rankingSettings.retro.toArray(String[]::new);
    var pageRank = new StandardPageRank(rankingDomains, retroOrigins);

    // Pass 1: bit-set result backing the RETRO search set
    var retroMembers = pageRank.pageRankWithPeripheralNodes(pageRank.size() / 2, RankingResultBitSetAccumulator::new);
    synchronized (this) {
        var rebuiltSet = new RankingSearchSet(SearchSetIdentifier.RETRO, retroSet.source, retroMembers);
        retroSet = rebuiltSet;
        rebuiltSet.write();
    }

    // Pass 2: hash-map result used as the per-domain ranking values
    var rankValues = pageRank.pageRankWithPeripheralNodes(pageRank.size() / 2, () -> new RankingResultHashMapAccumulator(100_000));
    synchronized (this) {
        domainRankings = new DomainRankings(rankValues);
    }
}
/**
 * Rebuilds the small-web search set from a {@link ReversePageRank} over
 * {@code rankingDomains}, configured with the {@code rankingSettings.small}
 * entries and a max-known-URLs setting of 750, then writes the set out.
 * The field replacement happens while holding this object's monitor.
 */
@SneakyThrows
public void updateSmallWebDomains() {
    var smallWebOrigins = rankingSettings.small.toArray(String[]::new);
    var reverseRank = new ReversePageRank(rankingDomains, smallWebOrigins);
    reverseRank.setMaxKnownUrls(750);

    var smallWebMembers = reverseRank.pageRankWithPeripheralNodes(reverseRank.size(), RankingResultBitSetAccumulator::new);
    synchronized (this) {
        var rebuiltSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, smallWebSet.source, smallWebMembers);
        smallWebSet = rebuiltSet;
        rebuiltSet.write();
    }
}
/**
 * Rebuilds the academia search set from a {@link StandardPageRank} over
 * {@code rankingDomains}, configured with the {@code rankingSettings.academia}
 * entries, then writes the set out. The field replacement happens while
 * holding this object's monitor.
 */
@SneakyThrows
public void updateAcademiaDomains() {
    var academiaOrigins = rankingSettings.academia.toArray(String[]::new);
    var pageRank = new StandardPageRank(rankingDomains, academiaOrigins);

    var academiaMembers = pageRank.pageRankWithPeripheralNodes(pageRank.size()/2, RankingResultBitSetAccumulator::new);
    synchronized (this) {
        var rebuiltSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, academiaSet.source, academiaMembers);
        academiaSet = rebuiltSet;
        rebuiltSet.write();
    }
}
/**
 * @return the {@link DomainRankings} instance currently held by this service;
 *         the backing field is volatile and is reassigned as a whole when
 *         rankings are recalculated
 */
public DomainRankings getDomainRankings() {
    return this.domainRankings;
}
@ -101,4 +77,54 @@ public class EdgeIndexSearchSetsService {
case SMALLWEB -> smallWebSet;
};
}
/**
 * Recomputes the academia, retro and small-web search sets, in that order,
 * and then the domain rankings.
 */
public void recalculateAll() {
updateAcademiaDomainsSet();
updateRetroDomainsSet();
updateSmallWebDomainsSet();
updateDomainRankings();
}
/**
 * Replaces the domain rankings with the result of a {@link StandardPageRank}
 * over {@code similarityDomains}, configured with the
 * {@code rankingSettings.retro} entries. The field replacement happens while
 * holding this object's monitor.
 */
private void updateDomainRankings() {
    var retroOrigins = rankingSettings.retro.toArray(String[]::new);
    var pageRank = new StandardPageRank(similarityDomains, retroOrigins);

    var rankValues = pageRank.pageRankWithPeripheralNodes(pageRank.size() / 2, () -> new RankingResultHashMapAccumulator(100_000));
    synchronized (this) {
        domainRankings = new DomainRankings(rankValues);
    }
}
/**
 * Rebuilds the retro search set from a {@link StandardPageRank} over
 * {@code similarityDomains}, configured with the {@code rankingSettings.retro}
 * entries, then writes the set out. The field replacement happens while
 * holding this object's monitor.
 */
@SneakyThrows
public void updateRetroDomainsSet() {
    var retroOrigins = rankingSettings.retro.toArray(String[]::new);
    var pageRank = new StandardPageRank(similarityDomains, retroOrigins);

    var retroMembers = pageRank.pageRankWithPeripheralNodes(pageRank.size() / 2, RankingResultBitSetAccumulator::new);
    synchronized (this) {
        var rebuiltSet = new RankingSearchSet(SearchSetIdentifier.RETRO, retroSet.source, retroMembers);
        retroSet = rebuiltSet;
        rebuiltSet.write();
    }
}
/**
 * Rebuilds the small-web search set from a {@link ReversePageRank} over
 * {@code similarityDomains}, configured with the {@code rankingSettings.small}
 * entries and a max-known-URLs setting of 750, then writes the set out.
 * The field replacement happens while holding this object's monitor.
 */
@SneakyThrows
public void updateSmallWebDomainsSet() {
    var smallWebOrigins = rankingSettings.small.toArray(String[]::new);
    var reverseRank = new ReversePageRank(similarityDomains, smallWebOrigins);
    reverseRank.setMaxKnownUrls(750);

    var smallWebMembers = reverseRank.pageRankWithPeripheralNodes(reverseRank.size(), RankingResultBitSetAccumulator::new);
    synchronized (this) {
        var rebuiltSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, smallWebSet.source, smallWebMembers);
        smallWebSet = rebuiltSet;
        rebuiltSet.write();
    }
}
/**
 * Rebuilds the academia search set from a {@link StandardPageRank} over
 * {@code similarityDomains}, configured with the
 * {@code rankingSettings.academia} entries, then writes the set out. The field
 * replacement happens while holding this object's monitor.
 */
@SneakyThrows
public void updateAcademiaDomainsSet() {
    var academiaOrigins = rankingSettings.academia.toArray(String[]::new);
    var pageRank = new StandardPageRank(similarityDomains, academiaOrigins);

    var academiaMembers = pageRank.pageRankWithPeripheralNodes(pageRank.size()/2, RankingResultBitSetAccumulator::new);
    synchronized (this) {
        var rebuiltSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, academiaSet.source, academiaMembers);
        academiaSet = rebuiltSet;
        rebuiltSet.write();
    }
}
}