mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
WIP: Refactored ranking algorithms to separate database code from ranking code
This commit is contained in:
parent
026ba714b5
commit
5e472fe121
@ -1,15 +1,11 @@
|
|||||||
package nu.marginalia.util.ranking;
|
package nu.marginalia.util.ranking;
|
||||||
|
|
||||||
|
|
||||||
import com.zaxxer.hikari.HikariDataSource;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
public class BetterReversePageRank extends RankingAlgorithm {
|
public class BetterReversePageRank extends RankingAlgorithm {
|
||||||
|
|
||||||
|
|
||||||
public BetterReversePageRank(HikariDataSource dataSource, String... origins) {
|
public BetterReversePageRank(RankingDomainFetcher domains, String... origins) {
|
||||||
super(dataSource, origins);
|
super(domains, origins);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -1,14 +1,10 @@
|
|||||||
package nu.marginalia.util.ranking;
|
package nu.marginalia.util.ranking;
|
||||||
|
|
||||||
|
|
||||||
import com.zaxxer.hikari.HikariDataSource;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
public class BetterStandardPageRank extends RankingAlgorithm {
|
public class BetterStandardPageRank extends RankingAlgorithm {
|
||||||
|
|
||||||
public BetterStandardPageRank(HikariDataSource dataSource, String... origins) {
|
public BetterStandardPageRank(RankingDomainFetcher domains, String... origins) {
|
||||||
super(dataSource, origins);
|
super(domains, origins);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -1,15 +1,11 @@
|
|||||||
package nu.marginalia.util.ranking;
|
package nu.marginalia.util.ranking;
|
||||||
|
|
||||||
|
|
||||||
import com.zaxxer.hikari.HikariDataSource;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
public class BuggyReversePageRank extends RankingAlgorithm {
|
public class BuggyReversePageRank extends RankingAlgorithm {
|
||||||
|
|
||||||
|
|
||||||
public BuggyReversePageRank(HikariDataSource dataSource, String... origins) {
|
public BuggyReversePageRank(RankingDomainFetcher domains, String... origins) {
|
||||||
super(dataSource, origins);
|
super(domains, origins);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -1,14 +1,10 @@
|
|||||||
package nu.marginalia.util.ranking;
|
package nu.marginalia.util.ranking;
|
||||||
|
|
||||||
|
|
||||||
import com.zaxxer.hikari.HikariDataSource;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
public class BuggyStandardPageRank extends RankingAlgorithm {
|
public class BuggyStandardPageRank extends RankingAlgorithm {
|
||||||
|
|
||||||
public BuggyStandardPageRank(HikariDataSource dataSource, String... origins) {
|
public BuggyStandardPageRank(RankingDomainFetcher domains, String... origins) {
|
||||||
super(dataSource, origins);
|
super(domains, origins);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -1,35 +1,26 @@
|
|||||||
package nu.marginalia.util.ranking;
|
package nu.marginalia.util.ranking;
|
||||||
|
|
||||||
import com.zaxxer.hikari.HikariDataSource;
|
|
||||||
import gnu.trove.list.TIntList;
|
import gnu.trove.list.TIntList;
|
||||||
import gnu.trove.list.array.TIntArrayList;
|
import gnu.trove.list.array.TIntArrayList;
|
||||||
import gnu.trove.map.hash.TIntIntHashMap;
|
import gnu.trove.map.hash.TIntIntHashMap;
|
||||||
import gnu.trove.map.hash.TIntObjectHashMap;
|
import gnu.trove.map.hash.TIntObjectHashMap;
|
||||||
import gnu.trove.set.hash.TIntHashSet;
|
|
||||||
import it.unimi.dsi.fastutil.ints.IntComparator;
|
import it.unimi.dsi.fastutil.ints.IntComparator;
|
||||||
import lombok.AllArgsConstructor;
|
|
||||||
import lombok.Data;
|
|
||||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||||
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
|
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
|
||||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.sql.SQLException;
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.function.IntToDoubleFunction;
|
import java.util.function.IntToDoubleFunction;
|
||||||
import java.util.stream.IntStream;
|
import java.util.stream.IntStream;
|
||||||
import it.unimi.dsi.fastutil.ints.IntArrays;
|
import it.unimi.dsi.fastutil.ints.IntArrays;
|
||||||
|
|
||||||
public abstract class RankingAlgorithm {
|
public abstract class RankingAlgorithm {
|
||||||
final TIntObjectHashMap<DomainData> domainsById = new TIntObjectHashMap<>();
|
final TIntObjectHashMap<RankingDomainData> domainsById = new TIntObjectHashMap<>();
|
||||||
final TIntIntHashMap domainIndexToId = new TIntIntHashMap();
|
final TIntIntHashMap domainIndexToId = new TIntIntHashMap();
|
||||||
final TIntIntHashMap domainIdToIndex = new TIntIntHashMap();
|
final TIntIntHashMap domainIdToIndex = new TIntIntHashMap();
|
||||||
|
|
||||||
private final TIntHashSet spamDomains;
|
|
||||||
private final HikariDataSource dataSource;
|
|
||||||
|
|
||||||
TIntArrayList[] linkDataSrc2Dest;
|
TIntArrayList[] linkDataSrc2Dest;
|
||||||
TIntArrayList[] linkDataDest2Src;
|
TIntArrayList[] linkDataDest2Src;
|
||||||
|
|
||||||
@ -41,10 +32,14 @@ public abstract class RankingAlgorithm {
|
|||||||
private static final boolean getNames = true;
|
private static final boolean getNames = true;
|
||||||
|
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
private RankingDomainFetcher domains;
|
||||||
|
|
||||||
public static void main(String... args) throws IOException {
|
public static void main(String... args) throws IOException {
|
||||||
var rpr = new BuggyReversePageRank(new DatabaseModule().provideConnection(), "wiki.xxiivv.com");
|
var ds = new DatabaseModule().provideConnection();
|
||||||
var spr = new BuggyStandardPageRank(new DatabaseModule().provideConnection(), "memex.marginalia.nu");
|
var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
|
||||||
|
|
||||||
|
var rpr = new BuggyReversePageRank(domains, "wiki.xxiivv.com");
|
||||||
|
var spr = new BuggyStandardPageRank(domains, "memex.marginalia.nu");
|
||||||
|
|
||||||
var rankVector = spr.pageRankVector();
|
var rankVector = spr.pageRankVector();
|
||||||
var norm = rankVector.norm();
|
var norm = rankVector.norm();
|
||||||
@ -61,164 +56,97 @@ public abstract class RankingAlgorithm {
|
|||||||
return domainsById.get(id).peripheral;
|
return domainsById.get(id).peripheral;
|
||||||
}
|
}
|
||||||
|
|
||||||
public RankingAlgorithm(HikariDataSource dataSource, String... origins) {
|
public RankingAlgorithm(RankingDomainFetcher domains, String... origins) {
|
||||||
this.dataSource = dataSource;
|
this.domains = domains;
|
||||||
var blacklist = new EdgeDomainBlacklistImpl(dataSource);
|
|
||||||
|
|
||||||
spamDomains = blacklist.getSpamDomains();
|
|
||||||
originDomains.addAll(Arrays.asList(origins));
|
originDomains.addAll(Arrays.asList(origins));
|
||||||
|
|
||||||
try (var conn = dataSource.getConnection()) {
|
domains.getDomains(domainData -> {
|
||||||
|
int id = domainData.id;
|
||||||
|
|
||||||
String s;
|
domainsById.put(id, domainData);
|
||||||
if (getNames) {
|
|
||||||
s = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
|
domainIndexToId.put(domainIndexToId.size(), id);
|
||||||
|
domainIdToIndex.put(id, domainIdToIndex.size());
|
||||||
|
});
|
||||||
|
|
||||||
|
linkDataSrc2Dest = new TIntArrayList[domainIndexToId.size()];
|
||||||
|
linkDataDest2Src = new TIntArrayList[domainIndexToId.size()];
|
||||||
|
|
||||||
|
domains.eachDomainLink((src, dst) -> {
|
||||||
|
if (src == dst) return;
|
||||||
|
|
||||||
|
if (domainsById.contains(src) && domainsById.contains(dst)) {
|
||||||
|
|
||||||
|
int srcIdx = domainIdToIndex.get(src);
|
||||||
|
int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias());
|
||||||
|
|
||||||
|
if (linkDataSrc2Dest[srcIdx] == null) {
|
||||||
|
linkDataSrc2Dest[srcIdx] = new TIntArrayList();
|
||||||
|
}
|
||||||
|
linkDataSrc2Dest[srcIdx].add(dstIdx);
|
||||||
|
|
||||||
|
if (linkDataDest2Src[dstIdx] == null) {
|
||||||
|
linkDataDest2Src[dstIdx] = new TIntArrayList();
|
||||||
|
}
|
||||||
|
linkDataDest2Src[dstIdx].add(srcIdx);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
for (var namePattern : this.originDomains) {
|
||||||
|
domains.domainsByPattern(namePattern, i -> {
|
||||||
|
int ival = domainIdToIndex.get(i);
|
||||||
|
if (ival != domainIdToIndex.getNoEntryValue() || domainIndexToId.get(0) == i) {
|
||||||
|
originDomainIds.add(ival);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
|
logger.debug("No value for {}", i);
|
||||||
}
|
}
|
||||||
try (var stmt = conn.prepareStatement(s)) {
|
});
|
||||||
stmt.setFetchSize(10000);
|
|
||||||
var rsp = stmt.executeQuery();
|
|
||||||
while (rsp.next()) {
|
|
||||||
int id = rsp.getInt(1);
|
|
||||||
if (!spamDomains.contains(id)) {
|
|
||||||
|
|
||||||
domainsById.put(id, new DomainData(id, rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5), false));
|
|
||||||
|
|
||||||
domainIndexToId.put(domainIndexToId.size(), id);
|
|
||||||
domainIdToIndex.put(id, domainIdToIndex.size());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
linkDataSrc2Dest = new TIntArrayList[domainIndexToId.size()];
|
|
||||||
linkDataDest2Src = new TIntArrayList[domainIndexToId.size()];
|
|
||||||
|
|
||||||
try (var stmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK")) {
|
|
||||||
stmt.setFetchSize(10000);
|
|
||||||
|
|
||||||
var rsp = stmt.executeQuery();
|
|
||||||
|
|
||||||
while (rsp.next()) {
|
|
||||||
int src = rsp.getInt(1);
|
|
||||||
int dst = rsp.getInt(2);
|
|
||||||
|
|
||||||
if (src == dst) continue;
|
|
||||||
|
|
||||||
if (domainsById.contains(src) && domainsById.contains(dst)) {
|
|
||||||
|
|
||||||
int srcIdx = domainIdToIndex.get(src);
|
|
||||||
int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias());
|
|
||||||
|
|
||||||
if (linkDataSrc2Dest[srcIdx] == null) {
|
|
||||||
linkDataSrc2Dest[srcIdx] = new TIntArrayList();
|
|
||||||
}
|
|
||||||
linkDataSrc2Dest[srcIdx].add(dstIdx);
|
|
||||||
|
|
||||||
if (linkDataDest2Src[dstIdx] == null) {
|
|
||||||
linkDataDest2Src[dstIdx] = new TIntArrayList();
|
|
||||||
}
|
|
||||||
linkDataDest2Src[dstIdx].add(srcIdx);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME LIKE ?")) {
|
|
||||||
for (var seed : this.originDomains) {
|
|
||||||
stmt.setString(1, seed);
|
|
||||||
var rsp = stmt.executeQuery();
|
|
||||||
while (rsp.next()) {
|
|
||||||
int i = rsp.getInt(1);
|
|
||||||
int ival = domainIdToIndex.get(i);
|
|
||||||
if (ival != domainIdToIndex.getNoEntryValue() || domainIndexToId.get(0) == i) {
|
|
||||||
originDomainIds.add(ival);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
logger.debug("No value for {}", i);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
logger.debug("{} -> {}", seed, originDomainIds.size());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.info("Origin Domains: {}", originDomainIds.size());
|
|
||||||
|
|
||||||
} catch (SQLException throwables) {
|
|
||||||
logger.error("SQL error", throwables);
|
|
||||||
}
|
}
|
||||||
|
logger.info("Origin Domains: {}", originDomainIds.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
public void addPeripheralNodes(boolean includeErrorStates) {
|
public void addPeripheralNodes() {
|
||||||
|
|
||||||
int newNodesIdxCutoff = domainIdToIndex.size();
|
int newNodesIdxCutoff = domainIdToIndex.size();
|
||||||
|
|
||||||
logger.info("Inserting peripheral nodes");
|
logger.info("Inserting peripheral nodes");
|
||||||
|
|
||||||
try (var conn = dataSource.getConnection()) {
|
domains.getPeripheralDomains(domainData -> {
|
||||||
String s;
|
int id = domainData.id;
|
||||||
if (getNames) {
|
|
||||||
s = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
|
if (domainsById.put(id, domainData) == null) { // true if id was not already present
|
||||||
|
domainIndexToId.put(domainIndexToId.size(), id);
|
||||||
|
domainIdToIndex.put(id, domainIdToIndex.size());
|
||||||
}
|
}
|
||||||
else {
|
});
|
||||||
s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
|
|
||||||
}
|
|
||||||
try (var stmt = conn.prepareStatement(s)) {
|
|
||||||
stmt.setFetchSize(10000);
|
|
||||||
var rsp = stmt.executeQuery();
|
|
||||||
|
|
||||||
while (rsp.next()) {
|
linkDataSrc2Dest = Arrays.copyOf(linkDataSrc2Dest, domainIndexToId.size());
|
||||||
int id = rsp.getInt(1);
|
linkDataDest2Src = Arrays.copyOf(linkDataDest2Src, domainIndexToId.size());
|
||||||
|
|
||||||
if (!spamDomains.contains(id)) {
|
domains.eachDomainLink((src, dst) -> {
|
||||||
domainsById.put(id, new DomainData(id, rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5), true));
|
if (src == dst) return;
|
||||||
|
|
||||||
domainIndexToId.put(domainIndexToId.size(), id);
|
if (domainsById.contains(src) && domainsById.contains(dst)) {
|
||||||
domainIdToIndex.put(id, domainIdToIndex.size());
|
int srcIdx = domainIdToIndex.get(src);
|
||||||
}
|
int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias());
|
||||||
|
|
||||||
|
// This looks like a bug, but it improves the results
|
||||||
|
if (srcIdx < newNodesIdxCutoff || dstIdx < newNodesIdxCutoff)
|
||||||
|
return;
|
||||||
|
|
||||||
|
if (linkDataSrc2Dest[srcIdx] == null) {
|
||||||
|
linkDataSrc2Dest[srcIdx] = new TIntArrayList();
|
||||||
}
|
}
|
||||||
|
linkDataSrc2Dest[srcIdx].add(dstIdx);
|
||||||
|
|
||||||
}
|
if (linkDataDest2Src[dstIdx] == null) {
|
||||||
|
linkDataDest2Src[dstIdx] = new TIntArrayList();
|
||||||
linkDataSrc2Dest = Arrays.copyOf(linkDataSrc2Dest, domainIndexToId.size());
|
|
||||||
linkDataDest2Src = Arrays.copyOf(linkDataDest2Src, domainIndexToId.size());
|
|
||||||
|
|
||||||
try (var stmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK")) {
|
|
||||||
stmt.setFetchSize(10000);
|
|
||||||
|
|
||||||
var rsp = stmt.executeQuery();
|
|
||||||
|
|
||||||
while (rsp.next()) {
|
|
||||||
int src = rsp.getInt(1);
|
|
||||||
int dst = rsp.getInt(2);
|
|
||||||
|
|
||||||
if (src == dst) continue;
|
|
||||||
|
|
||||||
if (domainsById.contains(src) && domainsById.contains(dst)) {
|
|
||||||
|
|
||||||
int srcIdx = domainIdToIndex.get(src);
|
|
||||||
int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias());
|
|
||||||
|
|
||||||
// This looks like a bug, but it improves the results
|
|
||||||
if (srcIdx < newNodesIdxCutoff || dstIdx < newNodesIdxCutoff)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
if (linkDataSrc2Dest[srcIdx] == null) {
|
|
||||||
linkDataSrc2Dest[srcIdx] = new TIntArrayList();
|
|
||||||
}
|
|
||||||
linkDataSrc2Dest[srcIdx].add(dstIdx);
|
|
||||||
|
|
||||||
if (linkDataDest2Src[dstIdx] == null) {
|
|
||||||
linkDataDest2Src[dstIdx] = new TIntArrayList();
|
|
||||||
}
|
|
||||||
linkDataDest2Src[dstIdx].add(srcIdx);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
linkDataDest2Src[dstIdx].add(srcIdx);
|
||||||
}
|
}
|
||||||
} catch (SQLException throwables) {
|
});
|
||||||
logger.error("SQL error", throwables);
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.info("Peripheral nodes inserted {} -> {}", newNodesIdxCutoff, domainIdToIndex.size());
|
logger.info("Peripheral nodes inserted {} -> {}", newNodesIdxCutoff, domainIdToIndex.size());
|
||||||
}
|
}
|
||||||
@ -271,14 +199,14 @@ public abstract class RankingAlgorithm {
|
|||||||
return rank.getRanking(resultCount);
|
return rank.getRanking(resultCount);
|
||||||
}
|
}
|
||||||
|
|
||||||
public TIntList pageRankWithPeripheralNodes(int resultCount, boolean includeErrorStates) {
|
public TIntList pageRankWithPeripheralNodes(int resultCount) {
|
||||||
RankVector rank = new RankVector(1.d / domainsById.size());
|
RankVector rank = new RankVector(1.d / domainsById.size());
|
||||||
|
|
||||||
int iter_max = 100;
|
int iter_max = 100;
|
||||||
|
|
||||||
for (int i = 0; i < iter_max; i++) {
|
for (int i = 0; i < iter_max; i++) {
|
||||||
if (i == iter_max-1) {
|
if (i == iter_max-1) {
|
||||||
addPeripheralNodes(includeErrorStates);
|
addPeripheralNodes();
|
||||||
}
|
}
|
||||||
RankVector newRank = createNewRankVector(rank);
|
RankVector newRank = createNewRankVector(rank);
|
||||||
|
|
||||||
@ -323,7 +251,7 @@ public abstract class RankingAlgorithm {
|
|||||||
|
|
||||||
abstract RankVector createNewRankVector(RankVector rank);
|
abstract RankVector createNewRankVector(RankVector rank);
|
||||||
|
|
||||||
public boolean includeInRanking(DomainData data) {
|
public boolean includeInRanking(RankingDomainData data) {
|
||||||
if (data.isAlias())
|
if (data.isAlias())
|
||||||
return false;
|
return false;
|
||||||
if (data.isSpecial())
|
if (data.isSpecial())
|
||||||
@ -445,32 +373,4 @@ public abstract class RankingAlgorithm {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Data
|
|
||||||
@AllArgsConstructor
|
|
||||||
static class DomainData {
|
|
||||||
public final int id;
|
|
||||||
public final String name;
|
|
||||||
private int alias;
|
|
||||||
private EdgeDomainIndexingState state;
|
|
||||||
public final int knownUrls;
|
|
||||||
public boolean peripheral;
|
|
||||||
|
|
||||||
public int resolveAlias() {
|
|
||||||
if (alias == 0) return id;
|
|
||||||
return alias;
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean isAlias() {
|
|
||||||
return alias != 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean isSpecial() {
|
|
||||||
return EdgeDomainIndexingState.SPECIAL == state;
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean isSocialMedia() {
|
|
||||||
return EdgeDomainIndexingState.SOCIAL_MEDIA == state;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,33 @@
|
|||||||
|
package nu.marginalia.util.ranking;
|
||||||
|
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Data;
|
||||||
|
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@AllArgsConstructor
|
||||||
|
class RankingDomainData {
|
||||||
|
public final int id;
|
||||||
|
public final String name;
|
||||||
|
private int alias;
|
||||||
|
private EdgeDomainIndexingState state;
|
||||||
|
public final int knownUrls;
|
||||||
|
public boolean peripheral;
|
||||||
|
|
||||||
|
public int resolveAlias() {
|
||||||
|
if (alias == 0) return id;
|
||||||
|
return alias;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isAlias() {
|
||||||
|
return alias != 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isSpecial() {
|
||||||
|
return EdgeDomainIndexingState.SPECIAL == state;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isSocialMedia() {
|
||||||
|
return EdgeDomainIndexingState.SOCIAL_MEDIA == state;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,105 @@
|
|||||||
|
package nu.marginalia.util.ranking;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
|
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
|
||||||
|
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.sql.SQLException;
|
||||||
|
import java.util.function.Consumer;
|
||||||
|
import java.util.function.IntConsumer;
|
||||||
|
|
||||||
|
public class RankingDomainFetcher {
|
||||||
|
private final HikariDataSource dataSource;
|
||||||
|
private final EdgeDomainBlacklistImpl blacklist;
|
||||||
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
|
||||||
|
private final boolean getNames = false;
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public RankingDomainFetcher(HikariDataSource dataSource, EdgeDomainBlacklistImpl blacklist) {
|
||||||
|
this.dataSource = dataSource;
|
||||||
|
this.blacklist = blacklist;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void getDomains(Consumer<RankingDomainData> consumer) {
|
||||||
|
String query;
|
||||||
|
if (getNames) {
|
||||||
|
query = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
query = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
|
||||||
|
}
|
||||||
|
|
||||||
|
getDomains(query, consumer);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void getPeripheralDomains(Consumer<RankingDomainData> consumer) {
|
||||||
|
String query;
|
||||||
|
if (getNames) {
|
||||||
|
query = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
query = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
|
||||||
|
}
|
||||||
|
|
||||||
|
getDomains(query, consumer);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void getDomains(String query, Consumer<RankingDomainData> consumer) {
|
||||||
|
try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(query)) {
|
||||||
|
stmt.setFetchSize(10000);
|
||||||
|
var rsp = stmt.executeQuery();
|
||||||
|
while (rsp.next()) {
|
||||||
|
int id = rsp.getInt(1);
|
||||||
|
if (!blacklist.isBlacklisted(id)) {
|
||||||
|
consumer.accept(new RankingDomainData(id, rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5), false));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (SQLException ex) {
|
||||||
|
logger.error("Failed to fetch domains", ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void eachDomainLink(DomainLinkConsumer consumer) {
|
||||||
|
try (var conn = dataSource.getConnection();
|
||||||
|
var stmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK"))
|
||||||
|
{
|
||||||
|
stmt.setFetchSize(10000);
|
||||||
|
|
||||||
|
var rsp = stmt.executeQuery();
|
||||||
|
|
||||||
|
while (rsp.next()) {
|
||||||
|
int src = rsp.getInt(1);
|
||||||
|
int dst = rsp.getInt(2);
|
||||||
|
|
||||||
|
consumer.accept(src, dst);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (SQLException ex) {
|
||||||
|
logger.error("Failed to fetch domain links", ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void domainsByPattern(String pattern, IntConsumer idConsumer) {
|
||||||
|
try (var conn = dataSource.getConnection();
|
||||||
|
var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME LIKE ?")) {
|
||||||
|
stmt.setString(1, pattern);
|
||||||
|
var rsp = stmt.executeQuery();
|
||||||
|
while (rsp.next()) {
|
||||||
|
idConsumer.accept(rsp.getInt(1));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (SQLException ex) {
|
||||||
|
logger.error("Failed to fetch domains by pattern", ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public interface DomainLinkConsumer {
|
||||||
|
void accept(int from, int to);
|
||||||
|
}
|
||||||
|
}
|
@ -3,12 +3,13 @@ package nu.marginalia.util.ranking.tool;
|
|||||||
import com.zaxxer.hikari.HikariDataSource;
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import nu.marginalia.util.ranking.BuggyStandardPageRank;
|
import nu.marginalia.util.ranking.BuggyStandardPageRank;
|
||||||
|
import nu.marginalia.util.ranking.RankingDomainFetcher;
|
||||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||||
|
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
|
||||||
import org.mariadb.jdbc.Driver;
|
import org.mariadb.jdbc.Driver;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.sql.SQLException;
|
import java.sql.SQLException;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
@ -43,12 +44,14 @@ public class UpdateDomainRanksTool {
|
|||||||
var uploader = new Thread(() -> uploadThread(conn), "Uploader");
|
var uploader = new Thread(() -> uploadThread(conn), "Uploader");
|
||||||
|
|
||||||
logger.info("Ranking");
|
logger.info("Ranking");
|
||||||
var spr = new BuggyStandardPageRank(new DatabaseModule().provideConnection(),"memex.marginalia.nu");
|
var ds = new DatabaseModule().provideConnection();
|
||||||
|
var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
|
||||||
|
var spr = new BuggyStandardPageRank(domains, "memex.marginalia.nu");
|
||||||
|
|
||||||
rankMax = spr.size()*2;
|
rankMax = spr.size()*2;
|
||||||
uploader.start();
|
uploader.start();
|
||||||
|
|
||||||
spr.pageRankWithPeripheralNodes(rankMax, false).forEach(i -> {
|
spr.pageRankWithPeripheralNodes(rankMax).forEach(i -> {
|
||||||
try {
|
try {
|
||||||
uploadQueue.put(i);
|
uploadQueue.put(i);
|
||||||
} catch (InterruptedException e) {
|
} catch (InterruptedException e) {
|
||||||
|
@ -3,12 +3,13 @@ package nu.marginalia.util.ranking.tool;
|
|||||||
import com.zaxxer.hikari.HikariDataSource;
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import nu.marginalia.util.ranking.BetterReversePageRank;
|
import nu.marginalia.util.ranking.BetterReversePageRank;
|
||||||
|
import nu.marginalia.util.ranking.RankingDomainFetcher;
|
||||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||||
|
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
|
||||||
import org.mariadb.jdbc.Driver;
|
import org.mariadb.jdbc.Driver;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.sql.SQLException;
|
import java.sql.SQLException;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
@ -45,7 +46,9 @@ public class UpdateDomainRanksTool2 {
|
|||||||
logger.info("Ranking");
|
logger.info("Ranking");
|
||||||
// "memex.marginalia.nu", "wiki.xxiivv.com", "bikobatanari.art", "sadgrl.online", "lileks.com",
|
// "memex.marginalia.nu", "wiki.xxiivv.com", "bikobatanari.art", "sadgrl.online", "lileks.com",
|
||||||
// "www.rep.routledge.com", "www.personal.kent.edu", "xroads.virginia.edu", "classics.mit.edu", "faculty.washington.edu", "monadnock.net"
|
// "www.rep.routledge.com", "www.personal.kent.edu", "xroads.virginia.edu", "classics.mit.edu", "faculty.washington.edu", "monadnock.net"
|
||||||
var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
|
var ds = new DatabaseModule().provideConnection();
|
||||||
|
var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
|
||||||
|
var rpr = new BetterReversePageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
|
||||||
// var rpr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), "%edu");
|
// var rpr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), "%edu");
|
||||||
// var spr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), "memex.marginalia.nu");
|
// var spr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), "memex.marginalia.nu");
|
||||||
|
|
||||||
@ -58,7 +61,7 @@ public class UpdateDomainRanksTool2 {
|
|||||||
rankMax = rpr.size();
|
rankMax = rpr.size();
|
||||||
|
|
||||||
|
|
||||||
rpr.pageRankWithPeripheralNodes(rankMax, false).forEach(i -> {
|
rpr.pageRankWithPeripheralNodes(rankMax).forEach(i -> {
|
||||||
try {
|
try {
|
||||||
uploadQueue.put(i);
|
uploadQueue.put(i);
|
||||||
} catch (InterruptedException e) {
|
} catch (InterruptedException e) {
|
||||||
|
@ -6,6 +6,7 @@ import com.google.common.hash.Hashing;
|
|||||||
import com.google.gson.Gson;
|
import com.google.gson.Gson;
|
||||||
import com.google.gson.GsonBuilder;
|
import com.google.gson.GsonBuilder;
|
||||||
import com.zaxxer.hikari.HikariDataSource;
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
|
import nu.marginalia.util.ranking.RankingDomainFetcher;
|
||||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||||
import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
|
import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
|
||||||
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
|
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
|
||||||
@ -73,10 +74,12 @@ public class CrawlJobExtractorPageRankMain {
|
|||||||
|
|
||||||
Gson gson = new GsonBuilder().create();
|
Gson gson = new GsonBuilder().create();
|
||||||
|
|
||||||
var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
|
var ds = new DatabaseModule().provideConnection();
|
||||||
|
var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
|
||||||
|
var rpr = new BetterReversePageRank(domains, "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
|
||||||
rpr.setMaxKnownUrls(750);
|
rpr.setMaxKnownUrls(750);
|
||||||
|
|
||||||
var targetDomainIds = rpr.pageRankWithPeripheralNodes(rpr.size(), false);
|
var targetDomainIds = rpr.pageRankWithPeripheralNodes(rpr.size());
|
||||||
|
|
||||||
try (var out = new PrintWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outFile.toFile()))))) {
|
try (var out = new PrintWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outFile.toFile()))))) {
|
||||||
final var extractor = new CrawlJobExtractorPageRankMain(new DatabaseModule().provideConnection());
|
final var extractor = new CrawlJobExtractorPageRankMain(new DatabaseModule().provideConnection());
|
||||||
|
@ -13,33 +13,14 @@ import java.util.Optional;
|
|||||||
|
|
||||||
@ImplementedBy(EdgeDataStoreDaoImpl.class)
|
@ImplementedBy(EdgeDataStoreDaoImpl.class)
|
||||||
public interface EdgeDataStoreDao {
|
public interface EdgeDataStoreDao {
|
||||||
boolean isBlacklisted(EdgeDomain domain);
|
|
||||||
|
|
||||||
EdgeId<EdgeDomain> getDomainId(EdgeDomain domain);
|
EdgeId<EdgeDomain> getDomainId(EdgeDomain domain);
|
||||||
|
|
||||||
List<BrowseResult> getDomainNeighborsAdjacent(EdgeId<EdgeDomain> domainId, EdgeDomainBlacklist backlist, int count);
|
List<BrowseResult> getDomainNeighborsAdjacent(EdgeId<EdgeDomain> domainId, EdgeDomainBlacklist backlist, int count);
|
||||||
|
|
||||||
List<BrowseResult> getRandomDomains(int count, EdgeDomainBlacklist backlist);
|
List<BrowseResult> getRandomDomains(int count, EdgeDomainBlacklist backlist);
|
||||||
List<EdgeUrlDetails> getUrlDetailsMulti(List<EdgeId<EdgeUrl>> ids);
|
List<EdgeUrlDetails> getUrlDetailsMulti(List<EdgeId<EdgeUrl>> ids);
|
||||||
|
|
||||||
|
|
||||||
EdgeDomain getDomain(EdgeId<EdgeDomain> id);
|
EdgeDomain getDomain(EdgeId<EdgeDomain> id);
|
||||||
|
|
||||||
Optional<EdgeId<EdgeUrl>> resolveAmbiguousDomain(String name);
|
|
||||||
|
|
||||||
|
|
||||||
int getPagesKnown(EdgeId<EdgeDomain> domainId);
|
|
||||||
int getPagesVisited(EdgeId<EdgeDomain> domainId);
|
|
||||||
int getPagesIndexed(EdgeId<EdgeDomain> domainId);
|
|
||||||
|
|
||||||
int getIncomingLinks(EdgeId<EdgeDomain> domainId);
|
|
||||||
int getOutboundLinks(EdgeId<EdgeDomain> domainId);
|
|
||||||
|
|
||||||
double getDomainQuality(EdgeId<EdgeDomain> domainId);
|
|
||||||
|
|
||||||
EdgeDomainIndexingState getDomainState(EdgeId<EdgeDomain> domainId);
|
|
||||||
|
|
||||||
List<EdgeDomain> getLinkingDomains(EdgeId<EdgeDomain> domainId);
|
|
||||||
|
|
||||||
double getRank(EdgeId<EdgeDomain> domainId);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -33,7 +33,6 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
|||||||
private final Cache<EdgeUrl, EdgeId<EdgeUrl>> urlIdCache = CacheBuilder.newBuilder().maximumSize(100_000).build();
|
private final Cache<EdgeUrl, EdgeId<EdgeUrl>> urlIdCache = CacheBuilder.newBuilder().maximumSize(100_000).build();
|
||||||
private final Cache<EdgeDomain, EdgeId<EdgeDomain>> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
|
private final Cache<EdgeDomain, EdgeId<EdgeDomain>> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
|
||||||
|
|
||||||
private static final String DEFAULT_PROTOCOL = "http";
|
|
||||||
public static double QUALITY_LOWER_BOUND_CUTOFF = -15.;
|
public static double QUALITY_LOWER_BOUND_CUTOFF = -15.;
|
||||||
@Inject
|
@Inject
|
||||||
public EdgeDataStoreDaoImpl(HikariDataSource dataSource)
|
public EdgeDataStoreDaoImpl(HikariDataSource dataSource)
|
||||||
@ -48,23 +47,6 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
|||||||
domainIdCache.invalidateAll();
|
domainIdCache.invalidateAll();
|
||||||
}
|
}
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
@Override
|
|
||||||
public boolean isBlacklisted(EdgeDomain domain) {
|
|
||||||
|
|
||||||
try (var connection = dataSource.getConnection()) {
|
|
||||||
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN_BLACKLIST WHERE URL_DOMAIN=?")) {
|
|
||||||
stmt.setString(1, domain.domain);
|
|
||||||
var rsp = stmt.executeQuery();
|
|
||||||
if (rsp.next()) {
|
|
||||||
return true;
|
|
||||||
} else {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
@Override
|
@Override
|
||||||
public EdgeId<EdgeDomain> getDomainId(EdgeDomain domain) {
|
public EdgeId<EdgeDomain> getDomainId(EdgeDomain domain) {
|
||||||
@ -108,13 +90,12 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
|||||||
|
|
||||||
try (var stmt = connection.prepareStatement(
|
try (var stmt = connection.prepareStatement(
|
||||||
"""
|
"""
|
||||||
SELECT ID, URL,
|
SELECT ID, URL,
|
||||||
TITLE, DESCRIPTION,
|
TITLE, DESCRIPTION,
|
||||||
WORDS_TOTAL, FORMAT, FEATURES,
|
WORDS_TOTAL, FORMAT, FEATURES,
|
||||||
IP, DOMAIN_STATE, DATA_HASH
|
IP, DOMAIN_STATE, DATA_HASH
|
||||||
FROM EC_URL_VIEW WHERE ID IN
|
FROM EC_URL_VIEW WHERE ID IN
|
||||||
""" + idString)) {
|
""" + idString)) {
|
||||||
// "SELECT ID,URL_PROTO,URL_DOMAIN,URL_PORT,URL_PATH,TITLE,DESCRIPTION,URL_QUALITY_MEASURE,DOMAIN_QUALITY_MEASURE,IFNULL(EC_DOMAIN_LINK_AGGREGATE.LINKS,1),WORDS_TOTAL,FORMAT,FEATURES,\"\",QUALITY_RAW,DOMAIN_STATE,DATA_HASH FROM EC_URL_VIEW LEFT JOIN EC_DOMAIN_LINK_AGGREGATE ON EC_DOMAIN_LINK_AGGREGATE.DOMAIN_ID=EC_URL_VIEW.DOMAIN_ID WHERE ID IN " + idString)) {
|
|
||||||
stmt.setFetchSize(ids.size());
|
stmt.setFetchSize(ids.size());
|
||||||
|
|
||||||
var rsp = stmt.executeQuery();
|
var rsp = stmt.executeQuery();
|
||||||
@ -125,7 +106,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
|||||||
rsp.getString(4), // description
|
rsp.getString(4), // description
|
||||||
-5, // quality
|
-5, // quality
|
||||||
rsp.getInt(5), // wordsTotal
|
rsp.getInt(5), // wordsTotal
|
||||||
rsp.getString(6), // foramt
|
rsp.getString(6), // format
|
||||||
rsp.getInt(7), // features
|
rsp.getInt(7), // features
|
||||||
rsp.getString(8), // ip
|
rsp.getString(8), // ip
|
||||||
EdgeDomainIndexingState.valueOf(rsp.getString(9)), // domainState
|
EdgeDomainIndexingState.valueOf(rsp.getString(9)), // domainState
|
||||||
@ -179,9 +160,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
|||||||
String domain = rsp.getString(2);
|
String domain = rsp.getString(2);
|
||||||
|
|
||||||
if (!blacklist.isBlacklisted(id)) {
|
if (!blacklist.isBlacklisted(id)) {
|
||||||
var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/");
|
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id));
|
||||||
|
|
||||||
domains.add(new BrowseResult(url, id));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -210,9 +189,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
|||||||
String domain = rsp.getString(2);
|
String domain = rsp.getString(2);
|
||||||
|
|
||||||
if (!blacklist.isBlacklisted(id)) {
|
if (!blacklist.isBlacklisted(id)) {
|
||||||
var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/");
|
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id));
|
||||||
|
|
||||||
domains.add(new BrowseResult(url, id));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -244,9 +221,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
|||||||
String domain = rsp.getString(2);
|
String domain = rsp.getString(2);
|
||||||
|
|
||||||
if (!blacklist.isBlacklisted(id)) {
|
if (!blacklist.isBlacklisted(id)) {
|
||||||
var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/");
|
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id));
|
||||||
|
|
||||||
domains.add(new BrowseResult(url, id));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -262,7 +237,15 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
|||||||
@Override
|
@Override
|
||||||
public List<BrowseResult> getRandomDomains(int count, EdgeDomainBlacklist blacklist) {
|
public List<BrowseResult> getRandomDomains(int count, EdgeDomainBlacklist blacklist) {
|
||||||
|
|
||||||
final String q = "SELECT DOMAIN_ID,DOMAIN_NAME FROM EC_RANDOM_DOMAINS INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE STATE<2 AND DOMAIN_ALIAS IS NULL ORDER BY RAND() LIMIT ?";
|
final String q = """
|
||||||
|
SELECT DOMAIN_ID, DOMAIN_NAME
|
||||||
|
FROM EC_RANDOM_DOMAINS
|
||||||
|
INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID
|
||||||
|
WHERE STATE<2
|
||||||
|
AND DOMAIN_ALIAS IS NULL
|
||||||
|
ORDER BY RAND()
|
||||||
|
LIMIT ?
|
||||||
|
""";
|
||||||
List<BrowseResult> domains = new ArrayList<>(count);
|
List<BrowseResult> domains = new ArrayList<>(count);
|
||||||
try (var conn = dataSource.getConnection()) {
|
try (var conn = dataSource.getConnection()) {
|
||||||
try (var stmt = conn.prepareStatement(q)) {
|
try (var stmt = conn.prepareStatement(q)) {
|
||||||
@ -273,9 +256,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
|||||||
String domain = rsp.getString(2);
|
String domain = rsp.getString(2);
|
||||||
|
|
||||||
if (!blacklist.isBlacklisted(id)) {
|
if (!blacklist.isBlacklisted(id)) {
|
||||||
var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/");
|
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id));
|
||||||
|
|
||||||
domains.add(new BrowseResult(url, id));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -302,223 +283,4 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Optional<EdgeId<EdgeUrl>> resolveAmbiguousDomain(String name) {
|
|
||||||
try (var connection = dataSource.getConnection()) {
|
|
||||||
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
|
|
||||||
stmt.setString(1, name);
|
|
||||||
var rsp = stmt.executeQuery();
|
|
||||||
if (rsp.next()) {
|
|
||||||
return Optional.of(new EdgeId<>(rsp.getInt(1)));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
|
|
||||||
stmt.setString(1, "https://"+name);
|
|
||||||
var rsp = stmt.executeQuery();
|
|
||||||
if (rsp.next()) {
|
|
||||||
return Optional.of(new EdgeId<>(rsp.getInt(1)));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
|
|
||||||
stmt.setString(1, "http://"+name);
|
|
||||||
var rsp = stmt.executeQuery();
|
|
||||||
if (rsp.next()) {
|
|
||||||
return Optional.of(new EdgeId<>(rsp.getInt(1)));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
|
|
||||||
stmt.setString(1, "https://www."+name);
|
|
||||||
var rsp = stmt.executeQuery();
|
|
||||||
if (rsp.next()) {
|
|
||||||
return Optional.of(new EdgeId<>(rsp.getInt(1)));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
|
|
||||||
stmt.setString(1, "http://www."+name);
|
|
||||||
var rsp = stmt.executeQuery();
|
|
||||||
if (rsp.next()) {
|
|
||||||
return Optional.of(new EdgeId<>(rsp.getInt(1)));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
} catch (SQLException throwables) {
|
|
||||||
logger.info("Could not resolve domain id for {}", name);
|
|
||||||
}
|
|
||||||
|
|
||||||
return Optional.empty();
|
|
||||||
}
|
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
@Override
|
|
||||||
public int getPagesKnown(EdgeId<EdgeDomain> domainId) {
|
|
||||||
try (var connection = dataSource.getConnection()) {
|
|
||||||
|
|
||||||
try (var stmt = connection.prepareStatement("SELECT KNOWN_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
|
|
||||||
stmt.setInt(1, domainId.getId());
|
|
||||||
var rsp = stmt.executeQuery();
|
|
||||||
if (rsp.next()) {
|
|
||||||
return rsp.getInt(1);
|
|
||||||
}
|
|
||||||
} catch (Exception ex) {
|
|
||||||
logger.error("DB error", ex);
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
@Override
|
|
||||||
public int getPagesVisited(EdgeId<EdgeDomain> domainId) {
|
|
||||||
try (var connection = dataSource.getConnection()) {
|
|
||||||
|
|
||||||
try (var stmt = connection.prepareStatement("SELECT VISITED_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
|
|
||||||
stmt.setInt(1, domainId.getId());
|
|
||||||
var rsp = stmt.executeQuery();
|
|
||||||
if (rsp.next()) {
|
|
||||||
return rsp.getInt(1);
|
|
||||||
}
|
|
||||||
} catch (Exception ex) {
|
|
||||||
logger.error("DB error", ex);
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
@Override
|
|
||||||
public int getPagesIndexed(EdgeId<EdgeDomain> domainId) {
|
|
||||||
try (var connection = dataSource.getConnection()) {
|
|
||||||
|
|
||||||
try (var stmt = connection.prepareStatement("SELECT GOOD_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
|
|
||||||
stmt.setInt(1, domainId.getId());
|
|
||||||
var rsp = stmt.executeQuery();
|
|
||||||
if (rsp.next()) {
|
|
||||||
return rsp.getInt(1);
|
|
||||||
}
|
|
||||||
} catch (Exception ex) {
|
|
||||||
logger.error("DB error", ex);
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
@Override
|
|
||||||
public int getIncomingLinks(EdgeId<EdgeDomain> domainId) {
|
|
||||||
try (var connection = dataSource.getConnection()) {
|
|
||||||
|
|
||||||
try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE DEST_DOMAIN_ID=?")) {
|
|
||||||
stmt.setInt(1, domainId.getId());
|
|
||||||
var rsp = stmt.executeQuery();
|
|
||||||
if (rsp.next()) {
|
|
||||||
return rsp.getInt(1);
|
|
||||||
}
|
|
||||||
} catch (Exception ex) {
|
|
||||||
logger.error("DB error", ex);
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@SneakyThrows
|
|
||||||
@Override
|
|
||||||
public int getOutboundLinks(EdgeId<EdgeDomain> domainId) {
|
|
||||||
try (var connection = dataSource.getConnection()) {
|
|
||||||
|
|
||||||
try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=?")) {
|
|
||||||
stmt.setInt(1, domainId.getId());
|
|
||||||
var rsp = stmt.executeQuery();
|
|
||||||
if (rsp.next()) {
|
|
||||||
return rsp.getInt(1);
|
|
||||||
}
|
|
||||||
} catch (Exception ex) {
|
|
||||||
logger.error("DB error", ex);
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
@Override
|
|
||||||
public double getDomainQuality(EdgeId<EdgeDomain> domainId) {
|
|
||||||
try (var connection = dataSource.getConnection()) {
|
|
||||||
|
|
||||||
try (var stmt = connection.prepareStatement("SELECT QUALITY FROM EC_DOMAIN WHERE ID=?")) {
|
|
||||||
stmt.setInt(1, domainId.getId());
|
|
||||||
var rsp = stmt.executeQuery();
|
|
||||||
if (rsp.next()) {
|
|
||||||
return rsp.getDouble(1);
|
|
||||||
}
|
|
||||||
} catch (Exception ex) {
|
|
||||||
logger.error("DB error", ex);
|
|
||||||
}
|
|
||||||
return -5;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public EdgeDomainIndexingState getDomainState(EdgeId<EdgeDomain> domainId) {
|
|
||||||
try (var connection = dataSource.getConnection()) {
|
|
||||||
|
|
||||||
try (var stmt = connection.prepareStatement("SELECT STATE FROM EC_DOMAIN WHERE ID=?")) {
|
|
||||||
stmt.setInt(1, domainId.getId());
|
|
||||||
var rsp = stmt.executeQuery();
|
|
||||||
if (rsp.next()) {
|
|
||||||
return EdgeDomainIndexingState.fromCode(rsp.getInt(1));
|
|
||||||
}
|
|
||||||
} catch (Exception ex) {
|
|
||||||
logger.error("DB error", ex);
|
|
||||||
}
|
|
||||||
} catch (SQLException throwables) {
|
|
||||||
throwables.printStackTrace();
|
|
||||||
}
|
|
||||||
return EdgeDomainIndexingState.ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public List<EdgeDomain> getLinkingDomains(EdgeId<EdgeDomain> domainId) {
|
|
||||||
try (var connection = dataSource.getConnection()) {
|
|
||||||
List<EdgeDomain> results = new ArrayList<>(25);
|
|
||||||
try (var stmt = connection.prepareStatement("SELECT SOURCE_URL FROM EC_RELATED_LINKS_VIEW WHERE DEST_DOMAIN_ID=? ORDER BY SOURCE_DOMAIN_ID LIMIT 25")) {
|
|
||||||
stmt.setInt(1, domainId.getId());
|
|
||||||
var rsp = stmt.executeQuery();
|
|
||||||
while (rsp.next()) {
|
|
||||||
results.add(new EdgeDomain(rsp.getString(1)));
|
|
||||||
}
|
|
||||||
return results;
|
|
||||||
} catch (Exception ex) {
|
|
||||||
logger.error("DB error", ex);
|
|
||||||
}
|
|
||||||
|
|
||||||
} catch (SQLException throwables) {
|
|
||||||
throwables.printStackTrace();
|
|
||||||
}
|
|
||||||
return Collections.emptyList();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public double getRank(EdgeId<EdgeDomain> domainId) {
|
|
||||||
try (var connection = dataSource.getConnection()) {
|
|
||||||
|
|
||||||
try (var stmt = connection.prepareStatement("SELECT IFNULL(RANK, 1) FROM EC_DOMAIN WHERE ID=?")) {
|
|
||||||
stmt.setInt(1, domainId.getId());
|
|
||||||
var rsp = stmt.executeQuery();
|
|
||||||
if (rsp.next()) {
|
|
||||||
return rsp.getDouble(1);
|
|
||||||
}
|
|
||||||
} catch (Exception ex) {
|
|
||||||
logger.error("DB error", ex);
|
|
||||||
}
|
|
||||||
} catch (SQLException throwables) {
|
|
||||||
throwables.printStackTrace();
|
|
||||||
}
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -10,7 +10,7 @@ import lombok.SneakyThrows;
|
|||||||
import nu.marginalia.util.ranking.BetterReversePageRank;
|
import nu.marginalia.util.ranking.BetterReversePageRank;
|
||||||
import nu.marginalia.util.ranking.BetterStandardPageRank;
|
import nu.marginalia.util.ranking.BetterStandardPageRank;
|
||||||
import nu.marginalia.util.ranking.BuggyStandardPageRank;
|
import nu.marginalia.util.ranking.BuggyStandardPageRank;
|
||||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
import nu.marginalia.util.ranking.RankingDomainFetcher;
|
||||||
import nu.marginalia.wmsa.edge.index.model.RankingSettings;
|
import nu.marginalia.wmsa.edge.index.model.RankingSettings;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
@ -18,14 +18,17 @@ import org.slf4j.LoggerFactory;
|
|||||||
@Singleton
|
@Singleton
|
||||||
public class SearchIndexDao {
|
public class SearchIndexDao {
|
||||||
private final HikariDataSource dataSource;
|
private final HikariDataSource dataSource;
|
||||||
|
private RankingDomainFetcher rankingDomains;
|
||||||
private final RankingSettings rankingSettings;
|
private final RankingSettings rankingSettings;
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public SearchIndexDao(HikariDataSource dataSource,
|
public SearchIndexDao(HikariDataSource dataSource,
|
||||||
|
RankingDomainFetcher rankingDomains,
|
||||||
RankingSettings rankingSettings)
|
RankingSettings rankingSettings)
|
||||||
{
|
{
|
||||||
this.dataSource = dataSource;
|
this.dataSource = dataSource;
|
||||||
|
this.rankingDomains = rankingDomains;
|
||||||
this.rankingSettings = rankingSettings;
|
this.rankingSettings = rankingSettings;
|
||||||
logger.info("SearchIndexDao ranking settings = {}", rankingSettings);
|
logger.info("SearchIndexDao ranking settings = {}", rankingSettings);
|
||||||
}
|
}
|
||||||
@ -63,36 +66,36 @@ public class SearchIndexDao {
|
|||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public TIntList getRetroDomains() {
|
public TIntList getRetroDomains() {
|
||||||
var spr = new BetterStandardPageRank(dataSource,rankingSettings.retro.toArray(String[]::new));
|
var spr = new BetterStandardPageRank(rankingDomains,rankingSettings.retro.toArray(String[]::new));
|
||||||
return spr.pageRankWithPeripheralNodes(spr.size()/2, false);
|
return spr.pageRankWithPeripheralNodes(spr.size()/2);
|
||||||
}
|
}
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public TIntList getSmallWebDomains() {
|
public TIntList getSmallWebDomains() {
|
||||||
var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), rankingSettings.small.toArray(String[]::new));
|
var rpr = new BetterReversePageRank(rankingDomains, rankingSettings.small.toArray(String[]::new));
|
||||||
|
|
||||||
rpr.setMaxKnownUrls(750);
|
rpr.setMaxKnownUrls(750);
|
||||||
|
|
||||||
return rpr.pageRankWithPeripheralNodes(rpr.size(), false);
|
return rpr.pageRankWithPeripheralNodes(rpr.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public TIntList getAcademiaDomains() {
|
public TIntList getAcademiaDomains() {
|
||||||
var spr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), rankingSettings.academia.toArray(String[]::new));
|
var spr = new BetterStandardPageRank(rankingDomains, rankingSettings.academia.toArray(String[]::new));
|
||||||
return spr.pageRankWithPeripheralNodes(spr.size()/2, false);
|
return spr.pageRankWithPeripheralNodes(spr.size()/2);
|
||||||
}
|
}
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public TIntList getStandardDomains() {
|
public TIntList getStandardDomains() {
|
||||||
var spr = new BuggyStandardPageRank(dataSource,rankingSettings.standard.toArray(String[]::new));
|
var spr = new BuggyStandardPageRank(rankingDomains,rankingSettings.standard.toArray(String[]::new));
|
||||||
return spr.pageRankWithPeripheralNodes(spr.size()/2, false);
|
return spr.pageRankWithPeripheralNodes(spr.size()/2);
|
||||||
}
|
}
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public TIntList getSpecialDomains() {
|
public TIntList getSpecialDomains() {
|
||||||
TIntArrayList results = new TIntArrayList();
|
TIntArrayList results = new TIntArrayList();
|
||||||
try (var connection = dataSource.getConnection();
|
try (var connection = dataSource.getConnection();
|
||||||
var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE STATE=2")
|
var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE STATE='SPECIAL'")
|
||||||
) {
|
) {
|
||||||
var rs = stmt.executeQuery();
|
var rs = stmt.executeQuery();
|
||||||
while (rs.next()) {
|
while (rs.next()) {
|
||||||
|
@ -54,8 +54,11 @@ public class EdgeDomain implements WideHashable {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public EdgeUrl toRootUrl() {
|
||||||
|
// Set default protocol to http, as most https websites redirect http->https, but few http websites redirect https->http
|
||||||
|
return new EdgeUrl("http", this, null, "/");
|
||||||
}
|
}
|
||||||
|
|
||||||
public String toString() {
|
public String toString() {
|
||||||
|
@ -1,24 +1,43 @@
|
|||||||
package nu.marginalia.wmsa.edge.search.siteinfo;
|
package nu.marginalia.wmsa.edge.search.siteinfo;
|
||||||
|
|
||||||
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
|
import lombok.SneakyThrows;
|
||||||
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao;
|
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao;
|
||||||
|
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDaoImpl;
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeId;
|
import nu.marginalia.wmsa.edge.model.EdgeId;
|
||||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
||||||
import nu.marginalia.wmsa.edge.search.model.DomainInformation;
|
import nu.marginalia.wmsa.edge.search.model.DomainInformation;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import javax.inject.Inject;
|
import javax.inject.Inject;
|
||||||
import javax.inject.Singleton;
|
import javax.inject.Singleton;
|
||||||
|
import java.sql.SQLException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collections;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
|
|
||||||
|
/*
|
||||||
|
TODO: This class needs to be refactored, a lot of
|
||||||
|
these SQL queries are redundant and can be
|
||||||
|
collapsed into one single query that fetches
|
||||||
|
all the information
|
||||||
|
*/
|
||||||
@Singleton
|
@Singleton
|
||||||
public class DomainInformationService {
|
public class DomainInformationService {
|
||||||
|
|
||||||
private EdgeDataStoreDao dataStore;
|
private EdgeDataStoreDaoImpl dataStoreDao;
|
||||||
|
private HikariDataSource dataSource;
|
||||||
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public DomainInformationService(EdgeDataStoreDao dataStore) {
|
public DomainInformationService(
|
||||||
this.dataStore = dataStore;
|
EdgeDataStoreDaoImpl dataStoreDao,
|
||||||
|
HikariDataSource dataSource) {
|
||||||
|
this.dataStoreDao = dataStoreDao;
|
||||||
|
this.dataSource = dataSource;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -28,29 +47,29 @@ public class DomainInformationService {
|
|||||||
if (domainId == null) {
|
if (domainId == null) {
|
||||||
return Optional.empty();
|
return Optional.empty();
|
||||||
}
|
}
|
||||||
EdgeDomain domain = dataStore.getDomain(domainId);
|
EdgeDomain domain = dataStoreDao.getDomain(domainId);
|
||||||
|
|
||||||
boolean blacklisted = dataStore.isBlacklisted(domain);
|
boolean blacklisted = isBlacklisted(domain);
|
||||||
int pagesKnown = dataStore.getPagesKnown(domainId);
|
int pagesKnown = getPagesKnown(domainId);
|
||||||
int pagesVisited = dataStore.getPagesVisited(domainId);
|
int pagesVisited = getPagesVisited(domainId);
|
||||||
int pagesIndexed = dataStore.getPagesIndexed(domainId);
|
int pagesIndexed = getPagesIndexed(domainId);
|
||||||
int incomingLinks = dataStore.getIncomingLinks(domainId);
|
int incomingLinks = getIncomingLinks(domainId);
|
||||||
int outboundLinks = dataStore.getOutboundLinks(domainId);
|
int outboundLinks = getOutboundLinks(domainId);
|
||||||
double rank = Math.round(10000.0*(1.0-dataStore.getRank(domainId)))/100;
|
double rank = Math.round(10000.0*(1.0-getRank(domainId)))/100;
|
||||||
EdgeDomainIndexingState state = dataStore.getDomainState(domainId);
|
EdgeDomainIndexingState state = getDomainState(domainId);
|
||||||
double nominalQuality = Math.round(100*100*Math.exp(dataStore.getDomainQuality(domainId)))/100.;
|
double nominalQuality = Math.round(100*100*Math.exp(getDomainQuality(domainId)))/100.;
|
||||||
List<EdgeDomain> linkingDomains = dataStore.getLinkingDomains(domainId);
|
List<EdgeDomain> linkingDomains = getLinkingDomains(domainId);
|
||||||
|
|
||||||
return Optional.of(new DomainInformation(domain, blacklisted, pagesKnown, pagesVisited, pagesIndexed, incomingLinks, outboundLinks, nominalQuality, rank, state, linkingDomains));
|
return Optional.of(new DomainInformation(domain, blacklisted, pagesKnown, pagesVisited, pagesIndexed, incomingLinks, outboundLinks, nominalQuality, rank, state, linkingDomains));
|
||||||
}
|
}
|
||||||
|
|
||||||
private EdgeId<EdgeDomain> getDomainFromPartial(String site) {
|
private EdgeId<EdgeDomain> getDomainFromPartial(String site) {
|
||||||
try {
|
try {
|
||||||
return dataStore.getDomainId(new EdgeDomain(site));
|
return dataStoreDao.getDomainId(new EdgeDomain(site));
|
||||||
}
|
}
|
||||||
catch (Exception ex) {
|
catch (Exception ex) {
|
||||||
try {
|
try {
|
||||||
return dataStore.getDomainId(new EdgeDomain(site));
|
return dataStoreDao.getDomainId(new EdgeDomain(site));
|
||||||
}
|
}
|
||||||
catch (Exception ex2) {
|
catch (Exception ex2) {
|
||||||
return null;
|
return null;
|
||||||
@ -58,4 +77,178 @@ public class DomainInformationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
public boolean isBlacklisted(EdgeDomain domain) {
|
||||||
|
|
||||||
|
try (var connection = dataSource.getConnection()) {
|
||||||
|
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN_BLACKLIST WHERE URL_DOMAIN=?")) {
|
||||||
|
stmt.setString(1, domain.domain);
|
||||||
|
var rsp = stmt.executeQuery();
|
||||||
|
if (rsp.next()) {
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
public int getPagesKnown(EdgeId<EdgeDomain> domainId) {
|
||||||
|
try (var connection = dataSource.getConnection()) {
|
||||||
|
|
||||||
|
try (var stmt = connection.prepareStatement("SELECT KNOWN_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
|
||||||
|
stmt.setInt(1, domainId.getId());
|
||||||
|
var rsp = stmt.executeQuery();
|
||||||
|
if (rsp.next()) {
|
||||||
|
return rsp.getInt(1);
|
||||||
|
}
|
||||||
|
} catch (Exception ex) {
|
||||||
|
logger.error("DB error", ex);
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
public int getPagesVisited(EdgeId<EdgeDomain> domainId) {
|
||||||
|
try (var connection = dataSource.getConnection()) {
|
||||||
|
|
||||||
|
try (var stmt = connection.prepareStatement("SELECT VISITED_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
|
||||||
|
stmt.setInt(1, domainId.getId());
|
||||||
|
var rsp = stmt.executeQuery();
|
||||||
|
if (rsp.next()) {
|
||||||
|
return rsp.getInt(1);
|
||||||
|
}
|
||||||
|
} catch (Exception ex) {
|
||||||
|
logger.error("DB error", ex);
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
public int getPagesIndexed(EdgeId<EdgeDomain> domainId) {
|
||||||
|
try (var connection = dataSource.getConnection()) {
|
||||||
|
|
||||||
|
try (var stmt = connection.prepareStatement("SELECT GOOD_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
|
||||||
|
stmt.setInt(1, domainId.getId());
|
||||||
|
var rsp = stmt.executeQuery();
|
||||||
|
if (rsp.next()) {
|
||||||
|
return rsp.getInt(1);
|
||||||
|
}
|
||||||
|
} catch (Exception ex) {
|
||||||
|
logger.error("DB error", ex);
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
public int getIncomingLinks(EdgeId<EdgeDomain> domainId) {
|
||||||
|
try (var connection = dataSource.getConnection()) {
|
||||||
|
|
||||||
|
try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE DEST_DOMAIN_ID=?")) {
|
||||||
|
stmt.setInt(1, domainId.getId());
|
||||||
|
var rsp = stmt.executeQuery();
|
||||||
|
if (rsp.next()) {
|
||||||
|
return rsp.getInt(1);
|
||||||
|
}
|
||||||
|
} catch (Exception ex) {
|
||||||
|
logger.error("DB error", ex);
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@SneakyThrows
|
||||||
|
public int getOutboundLinks(EdgeId<EdgeDomain> domainId) {
|
||||||
|
try (var connection = dataSource.getConnection()) {
|
||||||
|
|
||||||
|
try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=?")) {
|
||||||
|
stmt.setInt(1, domainId.getId());
|
||||||
|
var rsp = stmt.executeQuery();
|
||||||
|
if (rsp.next()) {
|
||||||
|
return rsp.getInt(1);
|
||||||
|
}
|
||||||
|
} catch (Exception ex) {
|
||||||
|
logger.error("DB error", ex);
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
public double getDomainQuality(EdgeId<EdgeDomain> domainId) {
|
||||||
|
try (var connection = dataSource.getConnection()) {
|
||||||
|
|
||||||
|
try (var stmt = connection.prepareStatement("SELECT QUALITY FROM EC_DOMAIN WHERE ID=?")) {
|
||||||
|
stmt.setInt(1, domainId.getId());
|
||||||
|
var rsp = stmt.executeQuery();
|
||||||
|
if (rsp.next()) {
|
||||||
|
return rsp.getDouble(1);
|
||||||
|
}
|
||||||
|
} catch (Exception ex) {
|
||||||
|
logger.error("DB error", ex);
|
||||||
|
}
|
||||||
|
return -5;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public EdgeDomainIndexingState getDomainState(EdgeId<EdgeDomain> domainId) {
|
||||||
|
try (var connection = dataSource.getConnection()) {
|
||||||
|
|
||||||
|
try (var stmt = connection.prepareStatement("SELECT STATE FROM EC_DOMAIN WHERE ID=?")) {
|
||||||
|
stmt.setInt(1, domainId.getId());
|
||||||
|
var rsp = stmt.executeQuery();
|
||||||
|
if (rsp.next()) {
|
||||||
|
return EdgeDomainIndexingState.fromCode(rsp.getInt(1));
|
||||||
|
}
|
||||||
|
} catch (Exception ex) {
|
||||||
|
logger.error("DB error", ex);
|
||||||
|
}
|
||||||
|
} catch (SQLException throwables) {
|
||||||
|
throwables.printStackTrace();
|
||||||
|
}
|
||||||
|
return EdgeDomainIndexingState.ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<EdgeDomain> getLinkingDomains(EdgeId<EdgeDomain> domainId) {
|
||||||
|
try (var connection = dataSource.getConnection()) {
|
||||||
|
List<EdgeDomain> results = new ArrayList<>(25);
|
||||||
|
try (var stmt = connection.prepareStatement("SELECT SOURCE_URL FROM EC_RELATED_LINKS_VIEW WHERE DEST_DOMAIN_ID=? ORDER BY SOURCE_DOMAIN_ID LIMIT 25")) {
|
||||||
|
stmt.setInt(1, domainId.getId());
|
||||||
|
var rsp = stmt.executeQuery();
|
||||||
|
while (rsp.next()) {
|
||||||
|
results.add(new EdgeDomain(rsp.getString(1)));
|
||||||
|
}
|
||||||
|
return results;
|
||||||
|
} catch (Exception ex) {
|
||||||
|
logger.error("DB error", ex);
|
||||||
|
}
|
||||||
|
|
||||||
|
} catch (SQLException throwables) {
|
||||||
|
throwables.printStackTrace();
|
||||||
|
}
|
||||||
|
return Collections.emptyList();
|
||||||
|
}
|
||||||
|
|
||||||
|
public double getRank(EdgeId<EdgeDomain> domainId) {
|
||||||
|
try (var connection = dataSource.getConnection()) {
|
||||||
|
|
||||||
|
try (var stmt = connection.prepareStatement("SELECT IFNULL(RANK, 1) FROM EC_DOMAIN WHERE ID=?")) {
|
||||||
|
stmt.setInt(1, domainId.getId());
|
||||||
|
var rsp = stmt.executeQuery();
|
||||||
|
if (rsp.next()) {
|
||||||
|
return rsp.getDouble(1);
|
||||||
|
}
|
||||||
|
} catch (Exception ex) {
|
||||||
|
logger.error("DB error", ex);
|
||||||
|
}
|
||||||
|
} catch (SQLException throwables) {
|
||||||
|
throwables.printStackTrace();
|
||||||
|
}
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -3,6 +3,7 @@ package nu.marginalia.wmsa.edge.tools;
|
|||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import gnu.trove.set.hash.TIntHashSet;
|
import gnu.trove.set.hash.TIntHashSet;
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
|
import nu.marginalia.util.ranking.RankingDomainFetcher;
|
||||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||||
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
|
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
|
||||||
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
|
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
|
||||||
@ -59,7 +60,9 @@ public class IndexMergerMain {
|
|||||||
}
|
}
|
||||||
|
|
||||||
var hikari = new DatabaseModule().provideConnection();
|
var hikari = new DatabaseModule().provideConnection();
|
||||||
var partitioner = new SearchIndexPartitioner(new SearchIndexDao(hikari, new RankingSettings()));
|
var ds = new DatabaseModule().provideConnection();
|
||||||
|
var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
|
||||||
|
var partitioner = new SearchIndexPartitioner(new SearchIndexDao(hikari, domains, new RankingSettings()));
|
||||||
var blacklist = new EdgeDomainBlacklistImpl(hikari);
|
var blacklist = new EdgeDomainBlacklistImpl(hikari);
|
||||||
|
|
||||||
new IndexMergerMain(file1, file2, outputFile, partitioner, blacklist);
|
new IndexMergerMain(file1, file2, outputFile, partitioner, blacklist);
|
||||||
|
Loading…
Reference in New Issue
Block a user