mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
WIP: Refactored ranking algorithms to separate database code from ranking code
This commit is contained in:
parent
026ba714b5
commit
5e472fe121
@ -1,15 +1,11 @@
|
||||
package nu.marginalia.util.ranking;
|
||||
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
public class BetterReversePageRank extends RankingAlgorithm {
|
||||
|
||||
|
||||
public BetterReversePageRank(HikariDataSource dataSource, String... origins) {
|
||||
super(dataSource, origins);
|
||||
public BetterReversePageRank(RankingDomainFetcher domains, String... origins) {
|
||||
super(domains, origins);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -1,14 +1,10 @@
|
||||
package nu.marginalia.util.ranking;
|
||||
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
public class BetterStandardPageRank extends RankingAlgorithm {
|
||||
|
||||
public BetterStandardPageRank(HikariDataSource dataSource, String... origins) {
|
||||
super(dataSource, origins);
|
||||
public BetterStandardPageRank(RankingDomainFetcher domains, String... origins) {
|
||||
super(domains, origins);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -1,15 +1,11 @@
|
||||
package nu.marginalia.util.ranking;
|
||||
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
public class BuggyReversePageRank extends RankingAlgorithm {
|
||||
|
||||
|
||||
public BuggyReversePageRank(HikariDataSource dataSource, String... origins) {
|
||||
super(dataSource, origins);
|
||||
public BuggyReversePageRank(RankingDomainFetcher domains, String... origins) {
|
||||
super(domains, origins);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -1,14 +1,10 @@
|
||||
package nu.marginalia.util.ranking;
|
||||
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
public class BuggyStandardPageRank extends RankingAlgorithm {
|
||||
|
||||
public BuggyStandardPageRank(HikariDataSource dataSource, String... origins) {
|
||||
super(dataSource, origins);
|
||||
public BuggyStandardPageRank(RankingDomainFetcher domains, String... origins) {
|
||||
super(domains, origins);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -1,35 +1,26 @@
|
||||
package nu.marginalia.util.ranking;
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import gnu.trove.list.TIntList;
|
||||
import gnu.trove.list.array.TIntArrayList;
|
||||
import gnu.trove.map.hash.TIntIntHashMap;
|
||||
import gnu.trove.map.hash.TIntObjectHashMap;
|
||||
import gnu.trove.set.hash.TIntHashSet;
|
||||
import it.unimi.dsi.fastutil.ints.IntComparator;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.sql.SQLException;
|
||||
import java.util.*;
|
||||
import java.util.function.IntToDoubleFunction;
|
||||
import java.util.stream.IntStream;
|
||||
import it.unimi.dsi.fastutil.ints.IntArrays;
|
||||
|
||||
public abstract class RankingAlgorithm {
|
||||
final TIntObjectHashMap<DomainData> domainsById = new TIntObjectHashMap<>();
|
||||
final TIntObjectHashMap<RankingDomainData> domainsById = new TIntObjectHashMap<>();
|
||||
final TIntIntHashMap domainIndexToId = new TIntIntHashMap();
|
||||
final TIntIntHashMap domainIdToIndex = new TIntIntHashMap();
|
||||
|
||||
private final TIntHashSet spamDomains;
|
||||
private final HikariDataSource dataSource;
|
||||
|
||||
TIntArrayList[] linkDataSrc2Dest;
|
||||
TIntArrayList[] linkDataDest2Src;
|
||||
|
||||
@ -41,10 +32,14 @@ public abstract class RankingAlgorithm {
|
||||
private static final boolean getNames = true;
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private RankingDomainFetcher domains;
|
||||
|
||||
public static void main(String... args) throws IOException {
|
||||
var rpr = new BuggyReversePageRank(new DatabaseModule().provideConnection(), "wiki.xxiivv.com");
|
||||
var spr = new BuggyStandardPageRank(new DatabaseModule().provideConnection(), "memex.marginalia.nu");
|
||||
var ds = new DatabaseModule().provideConnection();
|
||||
var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
|
||||
|
||||
var rpr = new BuggyReversePageRank(domains, "wiki.xxiivv.com");
|
||||
var spr = new BuggyStandardPageRank(domains, "memex.marginalia.nu");
|
||||
|
||||
var rankVector = spr.pageRankVector();
|
||||
var norm = rankVector.norm();
|
||||
@ -61,164 +56,97 @@ public abstract class RankingAlgorithm {
|
||||
return domainsById.get(id).peripheral;
|
||||
}
|
||||
|
||||
public RankingAlgorithm(HikariDataSource dataSource, String... origins) {
|
||||
this.dataSource = dataSource;
|
||||
var blacklist = new EdgeDomainBlacklistImpl(dataSource);
|
||||
public RankingAlgorithm(RankingDomainFetcher domains, String... origins) {
|
||||
this.domains = domains;
|
||||
|
||||
spamDomains = blacklist.getSpamDomains();
|
||||
originDomains.addAll(Arrays.asList(origins));
|
||||
|
||||
try (var conn = dataSource.getConnection()) {
|
||||
domains.getDomains(domainData -> {
|
||||
int id = domainData.id;
|
||||
|
||||
String s;
|
||||
if (getNames) {
|
||||
s = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
|
||||
domainsById.put(id, domainData);
|
||||
|
||||
domainIndexToId.put(domainIndexToId.size(), id);
|
||||
domainIdToIndex.put(id, domainIdToIndex.size());
|
||||
});
|
||||
|
||||
linkDataSrc2Dest = new TIntArrayList[domainIndexToId.size()];
|
||||
linkDataDest2Src = new TIntArrayList[domainIndexToId.size()];
|
||||
|
||||
domains.eachDomainLink((src, dst) -> {
|
||||
if (src == dst) return;
|
||||
|
||||
if (domainsById.contains(src) && domainsById.contains(dst)) {
|
||||
|
||||
int srcIdx = domainIdToIndex.get(src);
|
||||
int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias());
|
||||
|
||||
if (linkDataSrc2Dest[srcIdx] == null) {
|
||||
linkDataSrc2Dest[srcIdx] = new TIntArrayList();
|
||||
}
|
||||
linkDataSrc2Dest[srcIdx].add(dstIdx);
|
||||
|
||||
if (linkDataDest2Src[dstIdx] == null) {
|
||||
linkDataDest2Src[dstIdx] = new TIntArrayList();
|
||||
}
|
||||
linkDataDest2Src[dstIdx].add(srcIdx);
|
||||
}
|
||||
});
|
||||
|
||||
for (var namePattern : this.originDomains) {
|
||||
domains.domainsByPattern(namePattern, i -> {
|
||||
int ival = domainIdToIndex.get(i);
|
||||
if (ival != domainIdToIndex.getNoEntryValue() || domainIndexToId.get(0) == i) {
|
||||
originDomainIds.add(ival);
|
||||
}
|
||||
else {
|
||||
s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
|
||||
logger.debug("No value for {}", i);
|
||||
}
|
||||
try (var stmt = conn.prepareStatement(s)) {
|
||||
stmt.setFetchSize(10000);
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
int id = rsp.getInt(1);
|
||||
if (!spamDomains.contains(id)) {
|
||||
|
||||
domainsById.put(id, new DomainData(id, rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5), false));
|
||||
|
||||
domainIndexToId.put(domainIndexToId.size(), id);
|
||||
domainIdToIndex.put(id, domainIdToIndex.size());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
linkDataSrc2Dest = new TIntArrayList[domainIndexToId.size()];
|
||||
linkDataDest2Src = new TIntArrayList[domainIndexToId.size()];
|
||||
|
||||
try (var stmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK")) {
|
||||
stmt.setFetchSize(10000);
|
||||
|
||||
var rsp = stmt.executeQuery();
|
||||
|
||||
while (rsp.next()) {
|
||||
int src = rsp.getInt(1);
|
||||
int dst = rsp.getInt(2);
|
||||
|
||||
if (src == dst) continue;
|
||||
|
||||
if (domainsById.contains(src) && domainsById.contains(dst)) {
|
||||
|
||||
int srcIdx = domainIdToIndex.get(src);
|
||||
int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias());
|
||||
|
||||
if (linkDataSrc2Dest[srcIdx] == null) {
|
||||
linkDataSrc2Dest[srcIdx] = new TIntArrayList();
|
||||
}
|
||||
linkDataSrc2Dest[srcIdx].add(dstIdx);
|
||||
|
||||
if (linkDataDest2Src[dstIdx] == null) {
|
||||
linkDataDest2Src[dstIdx] = new TIntArrayList();
|
||||
}
|
||||
linkDataDest2Src[dstIdx].add(srcIdx);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME LIKE ?")) {
|
||||
for (var seed : this.originDomains) {
|
||||
stmt.setString(1, seed);
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
int i = rsp.getInt(1);
|
||||
int ival = domainIdToIndex.get(i);
|
||||
if (ival != domainIdToIndex.getNoEntryValue() || domainIndexToId.get(0) == i) {
|
||||
originDomainIds.add(ival);
|
||||
}
|
||||
else {
|
||||
logger.debug("No value for {}", i);
|
||||
}
|
||||
}
|
||||
logger.debug("{} -> {}", seed, originDomainIds.size());
|
||||
}
|
||||
}
|
||||
|
||||
logger.info("Origin Domains: {}", originDomainIds.size());
|
||||
|
||||
} catch (SQLException throwables) {
|
||||
logger.error("SQL error", throwables);
|
||||
});
|
||||
}
|
||||
logger.info("Origin Domains: {}", originDomainIds.size());
|
||||
}
|
||||
|
||||
public void addPeripheralNodes(boolean includeErrorStates) {
|
||||
public void addPeripheralNodes() {
|
||||
|
||||
int newNodesIdxCutoff = domainIdToIndex.size();
|
||||
|
||||
logger.info("Inserting peripheral nodes");
|
||||
|
||||
try (var conn = dataSource.getConnection()) {
|
||||
String s;
|
||||
if (getNames) {
|
||||
s = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
|
||||
domains.getPeripheralDomains(domainData -> {
|
||||
int id = domainData.id;
|
||||
|
||||
if (domainsById.put(id, domainData) == null) { // true if id was not already present
|
||||
domainIndexToId.put(domainIndexToId.size(), id);
|
||||
domainIdToIndex.put(id, domainIdToIndex.size());
|
||||
}
|
||||
else {
|
||||
s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
|
||||
}
|
||||
try (var stmt = conn.prepareStatement(s)) {
|
||||
stmt.setFetchSize(10000);
|
||||
var rsp = stmt.executeQuery();
|
||||
});
|
||||
|
||||
while (rsp.next()) {
|
||||
int id = rsp.getInt(1);
|
||||
linkDataSrc2Dest = Arrays.copyOf(linkDataSrc2Dest, domainIndexToId.size());
|
||||
linkDataDest2Src = Arrays.copyOf(linkDataDest2Src, domainIndexToId.size());
|
||||
|
||||
if (!spamDomains.contains(id)) {
|
||||
domainsById.put(id, new DomainData(id, rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5), true));
|
||||
domains.eachDomainLink((src, dst) -> {
|
||||
if (src == dst) return;
|
||||
|
||||
domainIndexToId.put(domainIndexToId.size(), id);
|
||||
domainIdToIndex.put(id, domainIdToIndex.size());
|
||||
}
|
||||
if (domainsById.contains(src) && domainsById.contains(dst)) {
|
||||
int srcIdx = domainIdToIndex.get(src);
|
||||
int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias());
|
||||
|
||||
// This looks like a bug, but it improves the results
|
||||
if (srcIdx < newNodesIdxCutoff || dstIdx < newNodesIdxCutoff)
|
||||
return;
|
||||
|
||||
if (linkDataSrc2Dest[srcIdx] == null) {
|
||||
linkDataSrc2Dest[srcIdx] = new TIntArrayList();
|
||||
}
|
||||
linkDataSrc2Dest[srcIdx].add(dstIdx);
|
||||
|
||||
}
|
||||
|
||||
linkDataSrc2Dest = Arrays.copyOf(linkDataSrc2Dest, domainIndexToId.size());
|
||||
linkDataDest2Src = Arrays.copyOf(linkDataDest2Src, domainIndexToId.size());
|
||||
|
||||
try (var stmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK")) {
|
||||
stmt.setFetchSize(10000);
|
||||
|
||||
var rsp = stmt.executeQuery();
|
||||
|
||||
while (rsp.next()) {
|
||||
int src = rsp.getInt(1);
|
||||
int dst = rsp.getInt(2);
|
||||
|
||||
if (src == dst) continue;
|
||||
|
||||
if (domainsById.contains(src) && domainsById.contains(dst)) {
|
||||
|
||||
int srcIdx = domainIdToIndex.get(src);
|
||||
int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias());
|
||||
|
||||
// This looks like a bug, but it improves the results
|
||||
if (srcIdx < newNodesIdxCutoff || dstIdx < newNodesIdxCutoff)
|
||||
continue;
|
||||
|
||||
if (linkDataSrc2Dest[srcIdx] == null) {
|
||||
linkDataSrc2Dest[srcIdx] = new TIntArrayList();
|
||||
}
|
||||
linkDataSrc2Dest[srcIdx].add(dstIdx);
|
||||
|
||||
if (linkDataDest2Src[dstIdx] == null) {
|
||||
linkDataDest2Src[dstIdx] = new TIntArrayList();
|
||||
}
|
||||
linkDataDest2Src[dstIdx].add(srcIdx);
|
||||
}
|
||||
if (linkDataDest2Src[dstIdx] == null) {
|
||||
linkDataDest2Src[dstIdx] = new TIntArrayList();
|
||||
}
|
||||
linkDataDest2Src[dstIdx].add(srcIdx);
|
||||
}
|
||||
} catch (SQLException throwables) {
|
||||
logger.error("SQL error", throwables);
|
||||
}
|
||||
});
|
||||
|
||||
logger.info("Peripheral nodes inserted {} -> {}", newNodesIdxCutoff, domainIdToIndex.size());
|
||||
}
|
||||
@ -271,14 +199,14 @@ public abstract class RankingAlgorithm {
|
||||
return rank.getRanking(resultCount);
|
||||
}
|
||||
|
||||
public TIntList pageRankWithPeripheralNodes(int resultCount, boolean includeErrorStates) {
|
||||
public TIntList pageRankWithPeripheralNodes(int resultCount) {
|
||||
RankVector rank = new RankVector(1.d / domainsById.size());
|
||||
|
||||
int iter_max = 100;
|
||||
|
||||
for (int i = 0; i < iter_max; i++) {
|
||||
if (i == iter_max-1) {
|
||||
addPeripheralNodes(includeErrorStates);
|
||||
addPeripheralNodes();
|
||||
}
|
||||
RankVector newRank = createNewRankVector(rank);
|
||||
|
||||
@ -323,7 +251,7 @@ public abstract class RankingAlgorithm {
|
||||
|
||||
abstract RankVector createNewRankVector(RankVector rank);
|
||||
|
||||
public boolean includeInRanking(DomainData data) {
|
||||
public boolean includeInRanking(RankingDomainData data) {
|
||||
if (data.isAlias())
|
||||
return false;
|
||||
if (data.isSpecial())
|
||||
@ -445,32 +373,4 @@ public abstract class RankingAlgorithm {
|
||||
}
|
||||
}
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
static class DomainData {
|
||||
public final int id;
|
||||
public final String name;
|
||||
private int alias;
|
||||
private EdgeDomainIndexingState state;
|
||||
public final int knownUrls;
|
||||
public boolean peripheral;
|
||||
|
||||
public int resolveAlias() {
|
||||
if (alias == 0) return id;
|
||||
return alias;
|
||||
}
|
||||
|
||||
public boolean isAlias() {
|
||||
return alias != 0;
|
||||
}
|
||||
|
||||
public boolean isSpecial() {
|
||||
return EdgeDomainIndexingState.SPECIAL == state;
|
||||
}
|
||||
|
||||
public boolean isSocialMedia() {
|
||||
return EdgeDomainIndexingState.SOCIAL_MEDIA == state;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -0,0 +1,33 @@
|
||||
package nu.marginalia.util.ranking;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
class RankingDomainData {
|
||||
public final int id;
|
||||
public final String name;
|
||||
private int alias;
|
||||
private EdgeDomainIndexingState state;
|
||||
public final int knownUrls;
|
||||
public boolean peripheral;
|
||||
|
||||
public int resolveAlias() {
|
||||
if (alias == 0) return id;
|
||||
return alias;
|
||||
}
|
||||
|
||||
public boolean isAlias() {
|
||||
return alias != 0;
|
||||
}
|
||||
|
||||
public boolean isSpecial() {
|
||||
return EdgeDomainIndexingState.SPECIAL == state;
|
||||
}
|
||||
|
||||
public boolean isSocialMedia() {
|
||||
return EdgeDomainIndexingState.SOCIAL_MEDIA == state;
|
||||
}
|
||||
}
|
@ -0,0 +1,105 @@
|
||||
package nu.marginalia.util.ranking;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.function.Consumer;
|
||||
import java.util.function.IntConsumer;
|
||||
|
||||
public class RankingDomainFetcher {
|
||||
private final HikariDataSource dataSource;
|
||||
private final EdgeDomainBlacklistImpl blacklist;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
private final boolean getNames = false;
|
||||
|
||||
@Inject
|
||||
public RankingDomainFetcher(HikariDataSource dataSource, EdgeDomainBlacklistImpl blacklist) {
|
||||
this.dataSource = dataSource;
|
||||
this.blacklist = blacklist;
|
||||
}
|
||||
|
||||
public void getDomains(Consumer<RankingDomainData> consumer) {
|
||||
String query;
|
||||
if (getNames) {
|
||||
query = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
|
||||
}
|
||||
else {
|
||||
query = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
|
||||
}
|
||||
|
||||
getDomains(query, consumer);
|
||||
}
|
||||
|
||||
|
||||
public void getPeripheralDomains(Consumer<RankingDomainData> consumer) {
|
||||
String query;
|
||||
if (getNames) {
|
||||
query = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
|
||||
}
|
||||
else {
|
||||
query = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
|
||||
}
|
||||
|
||||
getDomains(query, consumer);
|
||||
}
|
||||
|
||||
private void getDomains(String query, Consumer<RankingDomainData> consumer) {
|
||||
try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(query)) {
|
||||
stmt.setFetchSize(10000);
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
int id = rsp.getInt(1);
|
||||
if (!blacklist.isBlacklisted(id)) {
|
||||
consumer.accept(new RankingDomainData(id, rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5), false));
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.error("Failed to fetch domains", ex);
|
||||
}
|
||||
}
|
||||
|
||||
public void eachDomainLink(DomainLinkConsumer consumer) {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK"))
|
||||
{
|
||||
stmt.setFetchSize(10000);
|
||||
|
||||
var rsp = stmt.executeQuery();
|
||||
|
||||
while (rsp.next()) {
|
||||
int src = rsp.getInt(1);
|
||||
int dst = rsp.getInt(2);
|
||||
|
||||
consumer.accept(src, dst);
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.error("Failed to fetch domain links", ex);
|
||||
}
|
||||
}
|
||||
|
||||
public void domainsByPattern(String pattern, IntConsumer idConsumer) {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME LIKE ?")) {
|
||||
stmt.setString(1, pattern);
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
idConsumer.accept(rsp.getInt(1));
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.error("Failed to fetch domains by pattern", ex);
|
||||
}
|
||||
}
|
||||
|
||||
public interface DomainLinkConsumer {
|
||||
void accept(int from, int to);
|
||||
}
|
||||
}
|
@ -3,12 +3,13 @@ package nu.marginalia.util.ranking.tool;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.ranking.BuggyStandardPageRank;
|
||||
import nu.marginalia.util.ranking.RankingDomainFetcher;
|
||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
|
||||
import org.mariadb.jdbc.Driver;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.sql.SQLException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
@ -43,12 +44,14 @@ public class UpdateDomainRanksTool {
|
||||
var uploader = new Thread(() -> uploadThread(conn), "Uploader");
|
||||
|
||||
logger.info("Ranking");
|
||||
var spr = new BuggyStandardPageRank(new DatabaseModule().provideConnection(),"memex.marginalia.nu");
|
||||
var ds = new DatabaseModule().provideConnection();
|
||||
var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
|
||||
var spr = new BuggyStandardPageRank(domains, "memex.marginalia.nu");
|
||||
|
||||
rankMax = spr.size()*2;
|
||||
uploader.start();
|
||||
|
||||
spr.pageRankWithPeripheralNodes(rankMax, false).forEach(i -> {
|
||||
spr.pageRankWithPeripheralNodes(rankMax).forEach(i -> {
|
||||
try {
|
||||
uploadQueue.put(i);
|
||||
} catch (InterruptedException e) {
|
||||
|
@ -3,12 +3,13 @@ package nu.marginalia.util.ranking.tool;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.ranking.BetterReversePageRank;
|
||||
import nu.marginalia.util.ranking.RankingDomainFetcher;
|
||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
|
||||
import org.mariadb.jdbc.Driver;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.sql.SQLException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
@ -45,7 +46,9 @@ public class UpdateDomainRanksTool2 {
|
||||
logger.info("Ranking");
|
||||
// "memex.marginalia.nu", "wiki.xxiivv.com", "bikobatanari.art", "sadgrl.online", "lileks.com",
|
||||
// "www.rep.routledge.com", "www.personal.kent.edu", "xroads.virginia.edu", "classics.mit.edu", "faculty.washington.edu", "monadnock.net"
|
||||
var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
|
||||
var ds = new DatabaseModule().provideConnection();
|
||||
var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
|
||||
var rpr = new BetterReversePageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
|
||||
// var rpr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), "%edu");
|
||||
// var spr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), "memex.marginalia.nu");
|
||||
|
||||
@ -58,7 +61,7 @@ public class UpdateDomainRanksTool2 {
|
||||
rankMax = rpr.size();
|
||||
|
||||
|
||||
rpr.pageRankWithPeripheralNodes(rankMax, false).forEach(i -> {
|
||||
rpr.pageRankWithPeripheralNodes(rankMax).forEach(i -> {
|
||||
try {
|
||||
uploadQueue.put(i);
|
||||
} catch (InterruptedException e) {
|
||||
|
@ -6,6 +6,7 @@ import com.google.common.hash.Hashing;
|
||||
import com.google.gson.Gson;
|
||||
import com.google.gson.GsonBuilder;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.util.ranking.RankingDomainFetcher;
|
||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
|
||||
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
|
||||
@ -73,10 +74,12 @@ public class CrawlJobExtractorPageRankMain {
|
||||
|
||||
Gson gson = new GsonBuilder().create();
|
||||
|
||||
var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
|
||||
var ds = new DatabaseModule().provideConnection();
|
||||
var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
|
||||
var rpr = new BetterReversePageRank(domains, "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
|
||||
rpr.setMaxKnownUrls(750);
|
||||
|
||||
var targetDomainIds = rpr.pageRankWithPeripheralNodes(rpr.size(), false);
|
||||
var targetDomainIds = rpr.pageRankWithPeripheralNodes(rpr.size());
|
||||
|
||||
try (var out = new PrintWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outFile.toFile()))))) {
|
||||
final var extractor = new CrawlJobExtractorPageRankMain(new DatabaseModule().provideConnection());
|
||||
|
@ -13,33 +13,14 @@ import java.util.Optional;
|
||||
|
||||
@ImplementedBy(EdgeDataStoreDaoImpl.class)
|
||||
public interface EdgeDataStoreDao {
|
||||
boolean isBlacklisted(EdgeDomain domain);
|
||||
|
||||
EdgeId<EdgeDomain> getDomainId(EdgeDomain domain);
|
||||
|
||||
List<BrowseResult> getDomainNeighborsAdjacent(EdgeId<EdgeDomain> domainId, EdgeDomainBlacklist backlist, int count);
|
||||
|
||||
List<BrowseResult> getRandomDomains(int count, EdgeDomainBlacklist backlist);
|
||||
List<EdgeUrlDetails> getUrlDetailsMulti(List<EdgeId<EdgeUrl>> ids);
|
||||
|
||||
|
||||
EdgeDomain getDomain(EdgeId<EdgeDomain> id);
|
||||
|
||||
Optional<EdgeId<EdgeUrl>> resolveAmbiguousDomain(String name);
|
||||
|
||||
|
||||
int getPagesKnown(EdgeId<EdgeDomain> domainId);
|
||||
int getPagesVisited(EdgeId<EdgeDomain> domainId);
|
||||
int getPagesIndexed(EdgeId<EdgeDomain> domainId);
|
||||
|
||||
int getIncomingLinks(EdgeId<EdgeDomain> domainId);
|
||||
int getOutboundLinks(EdgeId<EdgeDomain> domainId);
|
||||
|
||||
double getDomainQuality(EdgeId<EdgeDomain> domainId);
|
||||
|
||||
EdgeDomainIndexingState getDomainState(EdgeId<EdgeDomain> domainId);
|
||||
|
||||
List<EdgeDomain> getLinkingDomains(EdgeId<EdgeDomain> domainId);
|
||||
|
||||
double getRank(EdgeId<EdgeDomain> domainId);
|
||||
|
||||
}
|
||||
|
@ -33,7 +33,6 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
private final Cache<EdgeUrl, EdgeId<EdgeUrl>> urlIdCache = CacheBuilder.newBuilder().maximumSize(100_000).build();
|
||||
private final Cache<EdgeDomain, EdgeId<EdgeDomain>> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
|
||||
|
||||
private static final String DEFAULT_PROTOCOL = "http";
|
||||
public static double QUALITY_LOWER_BOUND_CUTOFF = -15.;
|
||||
@Inject
|
||||
public EdgeDataStoreDaoImpl(HikariDataSource dataSource)
|
||||
@ -48,23 +47,6 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
domainIdCache.invalidateAll();
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public boolean isBlacklisted(EdgeDomain domain) {
|
||||
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN_BLACKLIST WHERE URL_DOMAIN=?")) {
|
||||
stmt.setString(1, domain.domain);
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public EdgeId<EdgeDomain> getDomainId(EdgeDomain domain) {
|
||||
@ -108,13 +90,12 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
|
||||
try (var stmt = connection.prepareStatement(
|
||||
"""
|
||||
SELECT ID, URL,
|
||||
SELECT ID, URL,
|
||||
TITLE, DESCRIPTION,
|
||||
WORDS_TOTAL, FORMAT, FEATURES,
|
||||
WORDS_TOTAL, FORMAT, FEATURES,
|
||||
IP, DOMAIN_STATE, DATA_HASH
|
||||
FROM EC_URL_VIEW WHERE ID IN
|
||||
""" + idString)) {
|
||||
// "SELECT ID,URL_PROTO,URL_DOMAIN,URL_PORT,URL_PATH,TITLE,DESCRIPTION,URL_QUALITY_MEASURE,DOMAIN_QUALITY_MEASURE,IFNULL(EC_DOMAIN_LINK_AGGREGATE.LINKS,1),WORDS_TOTAL,FORMAT,FEATURES,\"\",QUALITY_RAW,DOMAIN_STATE,DATA_HASH FROM EC_URL_VIEW LEFT JOIN EC_DOMAIN_LINK_AGGREGATE ON EC_DOMAIN_LINK_AGGREGATE.DOMAIN_ID=EC_URL_VIEW.DOMAIN_ID WHERE ID IN " + idString)) {
|
||||
stmt.setFetchSize(ids.size());
|
||||
|
||||
var rsp = stmt.executeQuery();
|
||||
@ -125,7 +106,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
rsp.getString(4), // description
|
||||
-5, // quality
|
||||
rsp.getInt(5), // wordsTotal
|
||||
rsp.getString(6), // foramt
|
||||
rsp.getString(6), // format
|
||||
rsp.getInt(7), // features
|
||||
rsp.getString(8), // ip
|
||||
EdgeDomainIndexingState.valueOf(rsp.getString(9)), // domainState
|
||||
@ -179,9 +160,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
String domain = rsp.getString(2);
|
||||
|
||||
if (!blacklist.isBlacklisted(id)) {
|
||||
var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/");
|
||||
|
||||
domains.add(new BrowseResult(url, id));
|
||||
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -210,9 +189,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
String domain = rsp.getString(2);
|
||||
|
||||
if (!blacklist.isBlacklisted(id)) {
|
||||
var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/");
|
||||
|
||||
domains.add(new BrowseResult(url, id));
|
||||
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -244,9 +221,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
String domain = rsp.getString(2);
|
||||
|
||||
if (!blacklist.isBlacklisted(id)) {
|
||||
var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/");
|
||||
|
||||
domains.add(new BrowseResult(url, id));
|
||||
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -262,7 +237,15 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
@Override
|
||||
public List<BrowseResult> getRandomDomains(int count, EdgeDomainBlacklist blacklist) {
|
||||
|
||||
final String q = "SELECT DOMAIN_ID,DOMAIN_NAME FROM EC_RANDOM_DOMAINS INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE STATE<2 AND DOMAIN_ALIAS IS NULL ORDER BY RAND() LIMIT ?";
|
||||
final String q = """
|
||||
SELECT DOMAIN_ID, DOMAIN_NAME
|
||||
FROM EC_RANDOM_DOMAINS
|
||||
INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID
|
||||
WHERE STATE<2
|
||||
AND DOMAIN_ALIAS IS NULL
|
||||
ORDER BY RAND()
|
||||
LIMIT ?
|
||||
""";
|
||||
List<BrowseResult> domains = new ArrayList<>(count);
|
||||
try (var conn = dataSource.getConnection()) {
|
||||
try (var stmt = conn.prepareStatement(q)) {
|
||||
@ -273,9 +256,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
String domain = rsp.getString(2);
|
||||
|
||||
if (!blacklist.isBlacklisted(id)) {
|
||||
var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/");
|
||||
|
||||
domains.add(new BrowseResult(url, id));
|
||||
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -302,223 +283,4 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Optional<EdgeId<EdgeUrl>> resolveAmbiguousDomain(String name) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
|
||||
stmt.setString(1, name);
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return Optional.of(new EdgeId<>(rsp.getInt(1)));
|
||||
}
|
||||
}
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
|
||||
stmt.setString(1, "https://"+name);
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return Optional.of(new EdgeId<>(rsp.getInt(1)));
|
||||
}
|
||||
}
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
|
||||
stmt.setString(1, "http://"+name);
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return Optional.of(new EdgeId<>(rsp.getInt(1)));
|
||||
}
|
||||
}
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
|
||||
stmt.setString(1, "https://www."+name);
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return Optional.of(new EdgeId<>(rsp.getInt(1)));
|
||||
}
|
||||
}
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
|
||||
stmt.setString(1, "http://www."+name);
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return Optional.of(new EdgeId<>(rsp.getInt(1)));
|
||||
}
|
||||
}
|
||||
|
||||
} catch (SQLException throwables) {
|
||||
logger.info("Could not resolve domain id for {}", name);
|
||||
}
|
||||
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public int getPagesKnown(EdgeId<EdgeDomain> domainId) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT KNOWN_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
|
||||
stmt.setInt(1, domainId.getId());
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return rsp.getInt(1);
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
logger.error("DB error", ex);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public int getPagesVisited(EdgeId<EdgeDomain> domainId) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT VISITED_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
|
||||
stmt.setInt(1, domainId.getId());
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return rsp.getInt(1);
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
logger.error("DB error", ex);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public int getPagesIndexed(EdgeId<EdgeDomain> domainId) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT GOOD_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
|
||||
stmt.setInt(1, domainId.getId());
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return rsp.getInt(1);
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
logger.error("DB error", ex);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public int getIncomingLinks(EdgeId<EdgeDomain> domainId) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE DEST_DOMAIN_ID=?")) {
|
||||
stmt.setInt(1, domainId.getId());
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return rsp.getInt(1);
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
logger.error("DB error", ex);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public int getOutboundLinks(EdgeId<EdgeDomain> domainId) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=?")) {
|
||||
stmt.setInt(1, domainId.getId());
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return rsp.getInt(1);
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
logger.error("DB error", ex);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public double getDomainQuality(EdgeId<EdgeDomain> domainId) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT QUALITY FROM EC_DOMAIN WHERE ID=?")) {
|
||||
stmt.setInt(1, domainId.getId());
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return rsp.getDouble(1);
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
logger.error("DB error", ex);
|
||||
}
|
||||
return -5;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public EdgeDomainIndexingState getDomainState(EdgeId<EdgeDomain> domainId) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT STATE FROM EC_DOMAIN WHERE ID=?")) {
|
||||
stmt.setInt(1, domainId.getId());
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return EdgeDomainIndexingState.fromCode(rsp.getInt(1));
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
logger.error("DB error", ex);
|
||||
}
|
||||
} catch (SQLException throwables) {
|
||||
throwables.printStackTrace();
|
||||
}
|
||||
return EdgeDomainIndexingState.ERROR;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<EdgeDomain> getLinkingDomains(EdgeId<EdgeDomain> domainId) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
List<EdgeDomain> results = new ArrayList<>(25);
|
||||
try (var stmt = connection.prepareStatement("SELECT SOURCE_URL FROM EC_RELATED_LINKS_VIEW WHERE DEST_DOMAIN_ID=? ORDER BY SOURCE_DOMAIN_ID LIMIT 25")) {
|
||||
stmt.setInt(1, domainId.getId());
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
results.add(new EdgeDomain(rsp.getString(1)));
|
||||
}
|
||||
return results;
|
||||
} catch (Exception ex) {
|
||||
logger.error("DB error", ex);
|
||||
}
|
||||
|
||||
} catch (SQLException throwables) {
|
||||
throwables.printStackTrace();
|
||||
}
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getRank(EdgeId<EdgeDomain> domainId) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT IFNULL(RANK, 1) FROM EC_DOMAIN WHERE ID=?")) {
|
||||
stmt.setInt(1, domainId.getId());
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return rsp.getDouble(1);
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
logger.error("DB error", ex);
|
||||
}
|
||||
} catch (SQLException throwables) {
|
||||
throwables.printStackTrace();
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
@ -10,7 +10,7 @@ import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.ranking.BetterReversePageRank;
|
||||
import nu.marginalia.util.ranking.BetterStandardPageRank;
|
||||
import nu.marginalia.util.ranking.BuggyStandardPageRank;
|
||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||
import nu.marginalia.util.ranking.RankingDomainFetcher;
|
||||
import nu.marginalia.wmsa.edge.index.model.RankingSettings;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@ -18,14 +18,17 @@ import org.slf4j.LoggerFactory;
|
||||
@Singleton
|
||||
public class SearchIndexDao {
|
||||
private final HikariDataSource dataSource;
|
||||
private RankingDomainFetcher rankingDomains;
|
||||
private final RankingSettings rankingSettings;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
@Inject
|
||||
public SearchIndexDao(HikariDataSource dataSource,
|
||||
RankingDomainFetcher rankingDomains,
|
||||
RankingSettings rankingSettings)
|
||||
{
|
||||
this.dataSource = dataSource;
|
||||
this.rankingDomains = rankingDomains;
|
||||
this.rankingSettings = rankingSettings;
|
||||
logger.info("SearchIndexDao ranking settings = {}", rankingSettings);
|
||||
}
|
||||
@ -63,36 +66,36 @@ public class SearchIndexDao {
|
||||
|
||||
@SneakyThrows
|
||||
public TIntList getRetroDomains() {
|
||||
var spr = new BetterStandardPageRank(dataSource,rankingSettings.retro.toArray(String[]::new));
|
||||
return spr.pageRankWithPeripheralNodes(spr.size()/2, false);
|
||||
var spr = new BetterStandardPageRank(rankingDomains,rankingSettings.retro.toArray(String[]::new));
|
||||
return spr.pageRankWithPeripheralNodes(spr.size()/2);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public TIntList getSmallWebDomains() {
|
||||
var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), rankingSettings.small.toArray(String[]::new));
|
||||
var rpr = new BetterReversePageRank(rankingDomains, rankingSettings.small.toArray(String[]::new));
|
||||
|
||||
rpr.setMaxKnownUrls(750);
|
||||
|
||||
return rpr.pageRankWithPeripheralNodes(rpr.size(), false);
|
||||
return rpr.pageRankWithPeripheralNodes(rpr.size());
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public TIntList getAcademiaDomains() {
|
||||
var spr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), rankingSettings.academia.toArray(String[]::new));
|
||||
return spr.pageRankWithPeripheralNodes(spr.size()/2, false);
|
||||
var spr = new BetterStandardPageRank(rankingDomains, rankingSettings.academia.toArray(String[]::new));
|
||||
return spr.pageRankWithPeripheralNodes(spr.size()/2);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public TIntList getStandardDomains() {
|
||||
var spr = new BuggyStandardPageRank(dataSource,rankingSettings.standard.toArray(String[]::new));
|
||||
return spr.pageRankWithPeripheralNodes(spr.size()/2, false);
|
||||
var spr = new BuggyStandardPageRank(rankingDomains,rankingSettings.standard.toArray(String[]::new));
|
||||
return spr.pageRankWithPeripheralNodes(spr.size()/2);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public TIntList getSpecialDomains() {
|
||||
TIntArrayList results = new TIntArrayList();
|
||||
try (var connection = dataSource.getConnection();
|
||||
var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE STATE=2")
|
||||
var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE STATE='SPECIAL'")
|
||||
) {
|
||||
var rs = stmt.executeQuery();
|
||||
while (rs.next()) {
|
||||
|
@ -54,8 +54,11 @@ public class EdgeDomain implements WideHashable {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public EdgeUrl toRootUrl() {
|
||||
// Set default protocol to http, as most https websites redirect http->https, but few http websites redirect https->http
|
||||
return new EdgeUrl("http", this, null, "/");
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
|
@ -1,24 +1,43 @@
|
||||
package nu.marginalia.wmsa.edge.search.siteinfo;
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao;
|
||||
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDaoImpl;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeId;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
||||
import nu.marginalia.wmsa.edge.search.model.DomainInformation;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.inject.Inject;
|
||||
import javax.inject.Singleton;
|
||||
import java.sql.SQLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
|
||||
/*
|
||||
TODO: This class needs to be refactored, a lot of
|
||||
these SQL queries are redundant and can be
|
||||
collapsed into one single query that fetches
|
||||
all the information
|
||||
*/
|
||||
@Singleton
|
||||
public class DomainInformationService {
|
||||
|
||||
private EdgeDataStoreDao dataStore;
|
||||
private EdgeDataStoreDaoImpl dataStoreDao;
|
||||
private HikariDataSource dataSource;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
@Inject
|
||||
public DomainInformationService(EdgeDataStoreDao dataStore) {
|
||||
this.dataStore = dataStore;
|
||||
public DomainInformationService(
|
||||
EdgeDataStoreDaoImpl dataStoreDao,
|
||||
HikariDataSource dataSource) {
|
||||
this.dataStoreDao = dataStoreDao;
|
||||
this.dataSource = dataSource;
|
||||
}
|
||||
|
||||
|
||||
@ -28,29 +47,29 @@ public class DomainInformationService {
|
||||
if (domainId == null) {
|
||||
return Optional.empty();
|
||||
}
|
||||
EdgeDomain domain = dataStore.getDomain(domainId);
|
||||
EdgeDomain domain = dataStoreDao.getDomain(domainId);
|
||||
|
||||
boolean blacklisted = dataStore.isBlacklisted(domain);
|
||||
int pagesKnown = dataStore.getPagesKnown(domainId);
|
||||
int pagesVisited = dataStore.getPagesVisited(domainId);
|
||||
int pagesIndexed = dataStore.getPagesIndexed(domainId);
|
||||
int incomingLinks = dataStore.getIncomingLinks(domainId);
|
||||
int outboundLinks = dataStore.getOutboundLinks(domainId);
|
||||
double rank = Math.round(10000.0*(1.0-dataStore.getRank(domainId)))/100;
|
||||
EdgeDomainIndexingState state = dataStore.getDomainState(domainId);
|
||||
double nominalQuality = Math.round(100*100*Math.exp(dataStore.getDomainQuality(domainId)))/100.;
|
||||
List<EdgeDomain> linkingDomains = dataStore.getLinkingDomains(domainId);
|
||||
boolean blacklisted = isBlacklisted(domain);
|
||||
int pagesKnown = getPagesKnown(domainId);
|
||||
int pagesVisited = getPagesVisited(domainId);
|
||||
int pagesIndexed = getPagesIndexed(domainId);
|
||||
int incomingLinks = getIncomingLinks(domainId);
|
||||
int outboundLinks = getOutboundLinks(domainId);
|
||||
double rank = Math.round(10000.0*(1.0-getRank(domainId)))/100;
|
||||
EdgeDomainIndexingState state = getDomainState(domainId);
|
||||
double nominalQuality = Math.round(100*100*Math.exp(getDomainQuality(domainId)))/100.;
|
||||
List<EdgeDomain> linkingDomains = getLinkingDomains(domainId);
|
||||
|
||||
return Optional.of(new DomainInformation(domain, blacklisted, pagesKnown, pagesVisited, pagesIndexed, incomingLinks, outboundLinks, nominalQuality, rank, state, linkingDomains));
|
||||
}
|
||||
|
||||
private EdgeId<EdgeDomain> getDomainFromPartial(String site) {
|
||||
try {
|
||||
return dataStore.getDomainId(new EdgeDomain(site));
|
||||
return dataStoreDao.getDomainId(new EdgeDomain(site));
|
||||
}
|
||||
catch (Exception ex) {
|
||||
try {
|
||||
return dataStore.getDomainId(new EdgeDomain(site));
|
||||
return dataStoreDao.getDomainId(new EdgeDomain(site));
|
||||
}
|
||||
catch (Exception ex2) {
|
||||
return null;
|
||||
@ -58,4 +77,178 @@ public class DomainInformationService {
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public boolean isBlacklisted(EdgeDomain domain) {
|
||||
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN_BLACKLIST WHERE URL_DOMAIN=?")) {
|
||||
stmt.setString(1, domain.domain);
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public int getPagesKnown(EdgeId<EdgeDomain> domainId) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT KNOWN_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
|
||||
stmt.setInt(1, domainId.getId());
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return rsp.getInt(1);
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
logger.error("DB error", ex);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public int getPagesVisited(EdgeId<EdgeDomain> domainId) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT VISITED_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
|
||||
stmt.setInt(1, domainId.getId());
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return rsp.getInt(1);
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
logger.error("DB error", ex);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public int getPagesIndexed(EdgeId<EdgeDomain> domainId) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT GOOD_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
|
||||
stmt.setInt(1, domainId.getId());
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return rsp.getInt(1);
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
logger.error("DB error", ex);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public int getIncomingLinks(EdgeId<EdgeDomain> domainId) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE DEST_DOMAIN_ID=?")) {
|
||||
stmt.setInt(1, domainId.getId());
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return rsp.getInt(1);
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
logger.error("DB error", ex);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
@SneakyThrows
|
||||
public int getOutboundLinks(EdgeId<EdgeDomain> domainId) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=?")) {
|
||||
stmt.setInt(1, domainId.getId());
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return rsp.getInt(1);
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
logger.error("DB error", ex);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public double getDomainQuality(EdgeId<EdgeDomain> domainId) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT QUALITY FROM EC_DOMAIN WHERE ID=?")) {
|
||||
stmt.setInt(1, domainId.getId());
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return rsp.getDouble(1);
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
logger.error("DB error", ex);
|
||||
}
|
||||
return -5;
|
||||
}
|
||||
}
|
||||
|
||||
public EdgeDomainIndexingState getDomainState(EdgeId<EdgeDomain> domainId) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT STATE FROM EC_DOMAIN WHERE ID=?")) {
|
||||
stmt.setInt(1, domainId.getId());
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return EdgeDomainIndexingState.fromCode(rsp.getInt(1));
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
logger.error("DB error", ex);
|
||||
}
|
||||
} catch (SQLException throwables) {
|
||||
throwables.printStackTrace();
|
||||
}
|
||||
return EdgeDomainIndexingState.ERROR;
|
||||
}
|
||||
|
||||
public List<EdgeDomain> getLinkingDomains(EdgeId<EdgeDomain> domainId) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
List<EdgeDomain> results = new ArrayList<>(25);
|
||||
try (var stmt = connection.prepareStatement("SELECT SOURCE_URL FROM EC_RELATED_LINKS_VIEW WHERE DEST_DOMAIN_ID=? ORDER BY SOURCE_DOMAIN_ID LIMIT 25")) {
|
||||
stmt.setInt(1, domainId.getId());
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
results.add(new EdgeDomain(rsp.getString(1)));
|
||||
}
|
||||
return results;
|
||||
} catch (Exception ex) {
|
||||
logger.error("DB error", ex);
|
||||
}
|
||||
|
||||
} catch (SQLException throwables) {
|
||||
throwables.printStackTrace();
|
||||
}
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
public double getRank(EdgeId<EdgeDomain> domainId) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT IFNULL(RANK, 1) FROM EC_DOMAIN WHERE ID=?")) {
|
||||
stmt.setInt(1, domainId.getId());
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return rsp.getDouble(1);
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
logger.error("DB error", ex);
|
||||
}
|
||||
} catch (SQLException throwables) {
|
||||
throwables.printStackTrace();
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
@ -3,6 +3,7 @@ package nu.marginalia.wmsa.edge.tools;
|
||||
import com.google.inject.Inject;
|
||||
import gnu.trove.set.hash.TIntHashSet;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.ranking.RankingDomainFetcher;
|
||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
|
||||
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
|
||||
@ -59,7 +60,9 @@ public class IndexMergerMain {
|
||||
}
|
||||
|
||||
var hikari = new DatabaseModule().provideConnection();
|
||||
var partitioner = new SearchIndexPartitioner(new SearchIndexDao(hikari, new RankingSettings()));
|
||||
var ds = new DatabaseModule().provideConnection();
|
||||
var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
|
||||
var partitioner = new SearchIndexPartitioner(new SearchIndexDao(hikari, domains, new RankingSettings()));
|
||||
var blacklist = new EdgeDomainBlacklistImpl(hikari);
|
||||
|
||||
new IndexMergerMain(file1, file2, outputFile, partitioner, blacklist);
|
||||
|
Loading…
Reference in New Issue
Block a user