mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
Use new cosine-similarity ranking algorithm
This commit is contained in:
parent
3e1297064c
commit
bcadfc965d
@ -1,6 +1,7 @@
|
|||||||
package nu.marginalia.wmsa.edge.index.ranking.data;
|
package nu.marginalia.wmsa.edge.index.ranking.data;
|
||||||
|
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.Singleton;
|
||||||
import com.zaxxer.hikari.HikariDataSource;
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
|
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
|
||||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
||||||
@ -11,6 +12,7 @@ import java.sql.SQLException;
|
|||||||
import java.util.function.Consumer;
|
import java.util.function.Consumer;
|
||||||
import java.util.function.IntConsumer;
|
import java.util.function.IntConsumer;
|
||||||
|
|
||||||
|
@Singleton
|
||||||
public class RankingDomainFetcher {
|
public class RankingDomainFetcher {
|
||||||
protected final HikariDataSource dataSource;
|
protected final HikariDataSource dataSource;
|
||||||
protected final EdgeDomainBlacklistImpl blacklist;
|
protected final EdgeDomainBlacklistImpl blacklist;
|
||||||
|
@ -1,14 +1,41 @@
|
|||||||
package nu.marginalia.wmsa.edge.index.ranking.data;
|
package nu.marginalia.wmsa.edge.index.ranking.data;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.Singleton;
|
||||||
import com.zaxxer.hikari.HikariDataSource;
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
|
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.sql.SQLException;
|
import java.sql.SQLException;
|
||||||
import java.util.function.Consumer;
|
import java.util.function.Consumer;
|
||||||
|
|
||||||
|
@Singleton
|
||||||
public class RankingDomainFetcherForSimilarityData extends RankingDomainFetcher {
|
public class RankingDomainFetcherForSimilarityData extends RankingDomainFetcher {
|
||||||
|
final boolean hasData;
|
||||||
|
|
||||||
|
@Inject
|
||||||
public RankingDomainFetcherForSimilarityData(HikariDataSource dataSource, EdgeDomainBlacklistImpl blacklist) {
|
public RankingDomainFetcherForSimilarityData(HikariDataSource dataSource, EdgeDomainBlacklistImpl blacklist) {
|
||||||
super(dataSource, blacklist);
|
super(dataSource, blacklist);
|
||||||
|
|
||||||
|
hasData = isDomainNeighborTablePopulated(dataSource);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static boolean isDomainNeighborTablePopulated(HikariDataSource dataSource) {
|
||||||
|
try (var conn = dataSource.getConnection();
|
||||||
|
var stmt = conn.createStatement();
|
||||||
|
var rs = stmt.executeQuery("SELECT DOMAIN_ID FROM EC_DOMAIN_NEIGHBORS_2 LIMIT 1")) {
|
||||||
|
|
||||||
|
return rs.next();
|
||||||
|
}
|
||||||
|
catch (SQLException ex) {
|
||||||
|
LoggerFactory
|
||||||
|
.getLogger(RankingDomainFetcherForSimilarityData.class)
|
||||||
|
.error("Failed to get count from EC_DOMAIN_NEIGHBORS_2", ex);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/**
 * Reports whether the domain similarity table held any rows when this fetcher was constructed.
 *
 * @return the value of the final {@code hasData} field, which the constructor sets from
 *         {@code isDomainNeighborTablePopulated(dataSource)}; it is not refreshed afterwards
 */
public boolean hasData() {
|
||||||
|
return hasData;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void eachDomainLink(DomainLinkConsumer consumer) {
|
public void eachDomainLink(DomainLinkConsumer consumer) {
|
||||||
@ -45,19 +72,32 @@ public class RankingDomainFetcherForSimilarityData extends RankingDomainFetcher
|
|||||||
// HAVING COUNT(SOURCE_DOMAIN_ID)>5
|
// HAVING COUNT(SOURCE_DOMAIN_ID)>5
|
||||||
// """;
|
// """;
|
||||||
|
|
||||||
String query =
|
String query;
|
||||||
"""
|
if (getNames) {
|
||||||
SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0)
|
query =
|
||||||
FROM EC_DOMAIN
|
"""
|
||||||
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0)
|
||||||
GROUP BY EC_DOMAIN.ID
|
FROM EC_DOMAIN
|
||||||
""";
|
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
||||||
|
GROUP BY EC_DOMAIN.ID
|
||||||
|
""";
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
query =
|
||||||
|
"""
|
||||||
|
SELECT EC_DOMAIN.ID,"",DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0)
|
||||||
|
FROM EC_DOMAIN
|
||||||
|
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
||||||
|
GROUP BY EC_DOMAIN.ID
|
||||||
|
""";
|
||||||
|
}
|
||||||
|
|
||||||
getDomains(query, consumer);
|
getDomains(query, consumer);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void getPeripheralDomains(Consumer<RankingDomainData> consumer) {
|
public void getPeripheralDomains(Consumer<RankingDomainData> consumer) {
|
||||||
|
// This is not relevant for this variant of pagerank since it is bidirectional
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -5,35 +5,56 @@ import com.google.inject.Singleton;
|
|||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import nu.marginalia.wmsa.edge.index.ranking.ReversePageRank;
|
import nu.marginalia.wmsa.edge.index.ranking.ReversePageRank;
|
||||||
import nu.marginalia.wmsa.edge.index.ranking.StandardPageRank;
|
import nu.marginalia.wmsa.edge.index.ranking.StandardPageRank;
|
||||||
|
import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultHashMapAccumulator;
|
||||||
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher;
|
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher;
|
||||||
import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultBitSetAccumulator;
|
import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultBitSetAccumulator;
|
||||||
import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultHashMapAccumulator;
|
|
||||||
import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
|
import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
|
||||||
import nu.marginalia.wmsa.edge.index.model.RankingSettings;
|
import nu.marginalia.wmsa.edge.index.model.RankingSettings;
|
||||||
import nu.marginalia.wmsa.edge.index.postings.DomainRankings;
|
import nu.marginalia.wmsa.edge.index.postings.DomainRankings;
|
||||||
|
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcherForSimilarityData;
|
||||||
import nu.marginalia.wmsa.edge.index.svc.searchset.RankingSearchSet;
|
import nu.marginalia.wmsa.edge.index.svc.searchset.RankingSearchSet;
|
||||||
import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSet;
|
import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSet;
|
||||||
import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetAny;
|
import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetAny;
|
||||||
import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetIdentifier;
|
import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetIdentifier;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
@Singleton
|
@Singleton
|
||||||
public class EdgeIndexSearchSetsService {
|
public class EdgeIndexSearchSetsService {
|
||||||
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
private final RankingDomainFetcher rankingDomains;
|
private final RankingDomainFetcher rankingDomains;
|
||||||
|
private final RankingDomainFetcher similarityDomains;
|
||||||
private final RankingSettings rankingSettings;
|
private final RankingSettings rankingSettings;
|
||||||
private final SearchSet anySet = new SearchSetAny();
|
|
||||||
|
|
||||||
|
// Below are binary indices that are used to constrain a search
|
||||||
private volatile RankingSearchSet retroSet;
|
private volatile RankingSearchSet retroSet;
|
||||||
private volatile RankingSearchSet smallWebSet;
|
private volatile RankingSearchSet smallWebSet;
|
||||||
private volatile RankingSearchSet academiaSet;
|
private volatile RankingSearchSet academiaSet;
|
||||||
|
private final SearchSet anySet = new SearchSetAny();
|
||||||
|
|
||||||
|
// The ranking value of the domains used in sorting the domains
|
||||||
private volatile DomainRankings domainRankings = new DomainRankings();
|
private volatile DomainRankings domainRankings = new DomainRankings();
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public EdgeIndexSearchSetsService(RankingDomainFetcher rankingDomains,
|
public EdgeIndexSearchSetsService(RankingDomainFetcher rankingDomains,
|
||||||
|
RankingDomainFetcherForSimilarityData similarityDomains,
|
||||||
RankingSettings rankingSettings,
|
RankingSettings rankingSettings,
|
||||||
IndexServicesFactory servicesFactory) throws IOException {
|
IndexServicesFactory servicesFactory) throws IOException {
|
||||||
|
|
||||||
this.rankingDomains = rankingDomains;
|
this.rankingDomains = rankingDomains;
|
||||||
|
|
||||||
|
if (similarityDomains.hasData()) {
|
||||||
|
this.similarityDomains = similarityDomains;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
// on test environments the cosine similarity graph may not be present
|
||||||
|
logger.info("Domain similarity is not present, falling back on link graph");
|
||||||
|
this.similarityDomains = rankingDomains;
|
||||||
|
}
|
||||||
|
|
||||||
this.rankingSettings = rankingSettings;
|
this.rankingSettings = rankingSettings;
|
||||||
|
|
||||||
smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, servicesFactory.getSearchSetsBase().resolve("small-web.dat"));
|
smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, servicesFactory.getSearchSetsBase().resolve("small-web.dat"));
|
||||||
@ -41,51 +62,6 @@ public class EdgeIndexSearchSetsService {
|
|||||||
retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, servicesFactory.getSearchSetsBase().resolve("retro.dat"));
|
retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, servicesFactory.getSearchSetsBase().resolve("retro.dat"));
|
||||||
}
|
}
|
||||||
|
|
||||||
public void recalculateAll() {
|
|
||||||
updateAcademiaDomains();
|
|
||||||
updateRetroDomains();
|
|
||||||
updateSmallWebDomains();
|
|
||||||
}
|
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
public void updateRetroDomains() {
|
|
||||||
var spr = new StandardPageRank(rankingDomains,rankingSettings.retro.toArray(String[]::new));
|
|
||||||
var data = spr.pageRankWithPeripheralNodes(spr.size() / 2, RankingResultBitSetAccumulator::new);
|
|
||||||
|
|
||||||
synchronized (this) {
|
|
||||||
retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, retroSet.source, data);
|
|
||||||
retroSet.write();
|
|
||||||
}
|
|
||||||
|
|
||||||
var ranks = spr.pageRankWithPeripheralNodes(spr.size() / 2, () -> new RankingResultHashMapAccumulator(100_000));
|
|
||||||
synchronized (this) {
|
|
||||||
domainRankings = new DomainRankings(ranks);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
public void updateSmallWebDomains() {
|
|
||||||
var rpr = new ReversePageRank(rankingDomains, rankingSettings.small.toArray(String[]::new));
|
|
||||||
rpr.setMaxKnownUrls(750);
|
|
||||||
var data = rpr.pageRankWithPeripheralNodes(rpr.size(), RankingResultBitSetAccumulator::new);
|
|
||||||
|
|
||||||
synchronized (this) {
|
|
||||||
smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, smallWebSet.source, data);
|
|
||||||
smallWebSet.write();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
public void updateAcademiaDomains() {
|
|
||||||
var spr = new StandardPageRank(rankingDomains, rankingSettings.academia.toArray(String[]::new));
|
|
||||||
var data = spr.pageRankWithPeripheralNodes(spr.size()/2, RankingResultBitSetAccumulator::new);
|
|
||||||
|
|
||||||
synchronized (this) {
|
|
||||||
academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, academiaSet.source, data);
|
|
||||||
academiaSet.write();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public DomainRankings getDomainRankings() {
|
public DomainRankings getDomainRankings() {
|
||||||
return domainRankings;
|
return domainRankings;
|
||||||
}
|
}
|
||||||
@ -101,4 +77,54 @@ public class EdgeIndexSearchSetsService {
|
|||||||
case SMALLWEB -> smallWebSet;
|
case SMALLWEB -> smallWebSet;
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void recalculateAll() {
|
||||||
|
updateAcademiaDomainsSet();
|
||||||
|
updateRetroDomainsSet();
|
||||||
|
updateSmallWebDomainsSet();
|
||||||
|
updateDomainRankings();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void updateDomainRankings() {
|
||||||
|
var spr = new StandardPageRank(similarityDomains, rankingSettings.retro.toArray(String[]::new));
|
||||||
|
|
||||||
|
var ranks = spr.pageRankWithPeripheralNodes(spr.size() / 2, () -> new RankingResultHashMapAccumulator(100_000));
|
||||||
|
synchronized (this) {
|
||||||
|
domainRankings = new DomainRankings(ranks);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
public void updateRetroDomainsSet() {
|
||||||
|
var spr = new StandardPageRank(similarityDomains, rankingSettings.retro.toArray(String[]::new));
|
||||||
|
var data = spr.pageRankWithPeripheralNodes(spr.size() / 2, RankingResultBitSetAccumulator::new);
|
||||||
|
|
||||||
|
synchronized (this) {
|
||||||
|
retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, retroSet.source, data);
|
||||||
|
retroSet.write();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
public void updateSmallWebDomainsSet() {
|
||||||
|
var rpr = new ReversePageRank(similarityDomains, rankingSettings.small.toArray(String[]::new));
|
||||||
|
rpr.setMaxKnownUrls(750);
|
||||||
|
var data = rpr.pageRankWithPeripheralNodes(rpr.size(), RankingResultBitSetAccumulator::new);
|
||||||
|
|
||||||
|
synchronized (this) {
|
||||||
|
smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, smallWebSet.source, data);
|
||||||
|
smallWebSet.write();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
public void updateAcademiaDomainsSet() {
|
||||||
|
var spr = new StandardPageRank(similarityDomains, rankingSettings.academia.toArray(String[]::new));
|
||||||
|
var data = spr.pageRankWithPeripheralNodes(spr.size()/2, RankingResultBitSetAccumulator::new);
|
||||||
|
|
||||||
|
synchronized (this) {
|
||||||
|
academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, academiaSet.source, data);
|
||||||
|
academiaSet.write();
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user