mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
Use new cosine-similarity ranking algorithm
This commit is contained in:
parent
3e1297064c
commit
bcadfc965d
@ -1,6 +1,7 @@
|
||||
package nu.marginalia.wmsa.edge.index.ranking.data;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
||||
@ -11,6 +12,7 @@ import java.sql.SQLException;
|
||||
import java.util.function.Consumer;
|
||||
import java.util.function.IntConsumer;
|
||||
|
||||
@Singleton
|
||||
public class RankingDomainFetcher {
|
||||
protected final HikariDataSource dataSource;
|
||||
protected final EdgeDomainBlacklistImpl blacklist;
|
||||
|
@ -1,14 +1,41 @@
|
||||
package nu.marginalia.wmsa.edge.index.ranking.data;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
@Singleton
|
||||
public class RankingDomainFetcherForSimilarityData extends RankingDomainFetcher {
|
||||
final boolean hasData;
|
||||
|
||||
/**
 * Creates the fetcher and records, once at construction time, whether the
 * EC_DOMAIN_NEIGHBORS_2 table currently yields at least one row.
 *
 * @param dataSource data source handed to the superclass and used for the probe query
 * @param blacklist  domain blacklist handed to the superclass
 */
@Inject
public RankingDomainFetcherForSimilarityData(HikariDataSource dataSource, EdgeDomainBlacklistImpl blacklist) {
    super(dataSource, blacklist);

    // Evaluated exactly once; the result is cached in a final field.
    this.hasData = isDomainNeighborTablePopulated(dataSource);
}
|
||||
|
||||
private static boolean isDomainNeighborTablePopulated(HikariDataSource dataSource) {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.createStatement();
|
||||
var rs = stmt.executeQuery("SELECT DOMAIN_ID FROM EC_DOMAIN_NEIGHBORS_2 LIMIT 1")) {
|
||||
|
||||
return rs.next();
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
LoggerFactory
|
||||
.getLogger(RankingDomainFetcherForSimilarityData.class)
|
||||
.error("Failed to get count from EC_DOMAIN_NEIGHBORS_2", ex);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
public boolean hasData() {
|
||||
return hasData;
|
||||
}
|
||||
|
||||
public void eachDomainLink(DomainLinkConsumer consumer) {
|
||||
@ -45,19 +72,32 @@ public class RankingDomainFetcherForSimilarityData extends RankingDomainFetcher
|
||||
// HAVING COUNT(SOURCE_DOMAIN_ID)>5
|
||||
// """;
|
||||
|
||||
String query =
|
||||
"""
|
||||
SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0)
|
||||
FROM EC_DOMAIN
|
||||
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
||||
GROUP BY EC_DOMAIN.ID
|
||||
""";
|
||||
String query;
|
||||
if (getNames) {
|
||||
query =
|
||||
"""
|
||||
SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0)
|
||||
FROM EC_DOMAIN
|
||||
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
||||
GROUP BY EC_DOMAIN.ID
|
||||
""";
|
||||
}
|
||||
else {
|
||||
query =
|
||||
"""
|
||||
SELECT EC_DOMAIN.ID,"",DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0)
|
||||
FROM EC_DOMAIN
|
||||
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
||||
GROUP BY EC_DOMAIN.ID
|
||||
""";
|
||||
}
|
||||
|
||||
getDomains(query, consumer);
|
||||
}
|
||||
|
||||
|
||||
public void getPeripheralDomains(Consumer<RankingDomainData> consumer) {
|
||||
// This is not relevant for this variant of pagerank since it is bidirectional
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -5,35 +5,56 @@ import com.google.inject.Singleton;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.ReversePageRank;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.StandardPageRank;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultHashMapAccumulator;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultBitSetAccumulator;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultHashMapAccumulator;
|
||||
import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
|
||||
import nu.marginalia.wmsa.edge.index.model.RankingSettings;
|
||||
import nu.marginalia.wmsa.edge.index.postings.DomainRankings;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcherForSimilarityData;
|
||||
import nu.marginalia.wmsa.edge.index.svc.searchset.RankingSearchSet;
|
||||
import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSet;
|
||||
import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetAny;
|
||||
import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetIdentifier;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
@Singleton
|
||||
public class EdgeIndexSearchSetsService {
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final RankingDomainFetcher rankingDomains;
|
||||
private final RankingDomainFetcher similarityDomains;
|
||||
private final RankingSettings rankingSettings;
|
||||
private final SearchSet anySet = new SearchSetAny();
|
||||
|
||||
|
||||
// Below are binary indices that are used to constrain a search
|
||||
private volatile RankingSearchSet retroSet;
|
||||
private volatile RankingSearchSet smallWebSet;
|
||||
private volatile RankingSearchSet academiaSet;
|
||||
private final SearchSet anySet = new SearchSetAny();
|
||||
|
||||
// The ranking value of the domains used in sorting the domains
|
||||
private volatile DomainRankings domainRankings = new DomainRankings();
|
||||
|
||||
@Inject
|
||||
public EdgeIndexSearchSetsService(RankingDomainFetcher rankingDomains,
|
||||
RankingDomainFetcherForSimilarityData similarityDomains,
|
||||
RankingSettings rankingSettings,
|
||||
IndexServicesFactory servicesFactory) throws IOException {
|
||||
|
||||
this.rankingDomains = rankingDomains;
|
||||
|
||||
if (similarityDomains.hasData()) {
|
||||
this.similarityDomains = similarityDomains;
|
||||
}
|
||||
else {
|
||||
// In test environments the cosine-similarity graph may not be present
|
||||
logger.info("Domain similarity is not present, falling back on link graph");
|
||||
this.similarityDomains = rankingDomains;
|
||||
}
|
||||
|
||||
this.rankingSettings = rankingSettings;
|
||||
|
||||
smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, servicesFactory.getSearchSetsBase().resolve("small-web.dat"));
|
||||
@ -41,51 +62,6 @@ public class EdgeIndexSearchSetsService {
|
||||
retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, servicesFactory.getSearchSetsBase().resolve("retro.dat"));
|
||||
}
|
||||
|
||||
public void recalculateAll() {
|
||||
updateAcademiaDomains();
|
||||
updateRetroDomains();
|
||||
updateSmallWebDomains();
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void updateRetroDomains() {
|
||||
var spr = new StandardPageRank(rankingDomains,rankingSettings.retro.toArray(String[]::new));
|
||||
var data = spr.pageRankWithPeripheralNodes(spr.size() / 2, RankingResultBitSetAccumulator::new);
|
||||
|
||||
synchronized (this) {
|
||||
retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, retroSet.source, data);
|
||||
retroSet.write();
|
||||
}
|
||||
|
||||
var ranks = spr.pageRankWithPeripheralNodes(spr.size() / 2, () -> new RankingResultHashMapAccumulator(100_000));
|
||||
synchronized (this) {
|
||||
domainRankings = new DomainRankings(ranks);
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void updateSmallWebDomains() {
|
||||
var rpr = new ReversePageRank(rankingDomains, rankingSettings.small.toArray(String[]::new));
|
||||
rpr.setMaxKnownUrls(750);
|
||||
var data = rpr.pageRankWithPeripheralNodes(rpr.size(), RankingResultBitSetAccumulator::new);
|
||||
|
||||
synchronized (this) {
|
||||
smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, smallWebSet.source, data);
|
||||
smallWebSet.write();
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void updateAcademiaDomains() {
|
||||
var spr = new StandardPageRank(rankingDomains, rankingSettings.academia.toArray(String[]::new));
|
||||
var data = spr.pageRankWithPeripheralNodes(spr.size()/2, RankingResultBitSetAccumulator::new);
|
||||
|
||||
synchronized (this) {
|
||||
academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, academiaSet.source, data);
|
||||
academiaSet.write();
|
||||
}
|
||||
}
|
||||
|
||||
public DomainRankings getDomainRankings() {
|
||||
return domainRankings;
|
||||
}
|
||||
@ -101,4 +77,54 @@ public class EdgeIndexSearchSetsService {
|
||||
case SMALLWEB -> smallWebSet;
|
||||
};
|
||||
}
|
||||
|
||||
public void recalculateAll() {
|
||||
updateAcademiaDomainsSet();
|
||||
updateRetroDomainsSet();
|
||||
updateSmallWebDomainsSet();
|
||||
updateDomainRankings();
|
||||
}
|
||||
|
||||
private void updateDomainRankings() {
|
||||
var spr = new StandardPageRank(similarityDomains, rankingSettings.retro.toArray(String[]::new));
|
||||
|
||||
var ranks = spr.pageRankWithPeripheralNodes(spr.size() / 2, () -> new RankingResultHashMapAccumulator(100_000));
|
||||
synchronized (this) {
|
||||
domainRankings = new DomainRankings(ranks);
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void updateRetroDomainsSet() {
|
||||
var spr = new StandardPageRank(similarityDomains, rankingSettings.retro.toArray(String[]::new));
|
||||
var data = spr.pageRankWithPeripheralNodes(spr.size() / 2, RankingResultBitSetAccumulator::new);
|
||||
|
||||
synchronized (this) {
|
||||
retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, retroSet.source, data);
|
||||
retroSet.write();
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void updateSmallWebDomainsSet() {
|
||||
var rpr = new ReversePageRank(similarityDomains, rankingSettings.small.toArray(String[]::new));
|
||||
rpr.setMaxKnownUrls(750);
|
||||
var data = rpr.pageRankWithPeripheralNodes(rpr.size(), RankingResultBitSetAccumulator::new);
|
||||
|
||||
synchronized (this) {
|
||||
smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, smallWebSet.source, data);
|
||||
smallWebSet.write();
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void updateAcademiaDomainsSet() {
|
||||
var spr = new StandardPageRank(similarityDomains, rankingSettings.academia.toArray(String[]::new));
|
||||
var data = spr.pageRankWithPeripheralNodes(spr.size()/2, RankingResultBitSetAccumulator::new);
|
||||
|
||||
synchronized (this) {
|
||||
academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, academiaSet.source, data);
|
||||
academiaSet.write();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user