(search) Optimize related domains queries

In the future this logic probably needs to move into a separate
service, as it's still quite slow to load.  But this fixes response
times and DOS potential of previous version.
This commit is contained in:
Viktor Lofgren 2023-12-05 15:47:21 +01:00
parent 9301c47d93
commit 2e438847fc
2 changed files with 285 additions and 217 deletions

View File

@ -50,6 +50,7 @@ dependencies {
implementation libs.bundles.slf4j implementation libs.bundles.slf4j
implementation libs.roaringbitmap
implementation libs.prometheus implementation libs.prometheus
implementation libs.notnull implementation libs.notnull
implementation libs.guice implementation libs.guice

View File

@ -2,265 +2,332 @@ package nu.marginalia.search.svc;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource; import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.map.TIntDoubleMap; import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList;
import gnu.trove.map.hash.TIntDoubleHashMap; import gnu.trove.map.hash.TIntDoubleHashMap;
import gnu.trove.map.hash.TIntIntHashMap;
import gnu.trove.map.hash.TLongDoubleHashMap;
import gnu.trove.set.TIntSet; import gnu.trove.set.TIntSet;
import gnu.trove.set.TLongSet;
import gnu.trove.set.hash.TIntHashSet; import gnu.trove.set.hash.TIntHashSet;
import gnu.trove.set.hash.TLongHashSet;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.sql.ResultSet;
import java.sql.SQLException; import java.sql.SQLException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Comparator; import java.util.BitSet;
import java.util.List; import java.util.List;
import java.util.concurrent.Executors;
import java.util.stream.IntStream;
public class SimilarDomainsService { public class SimilarDomainsService {
private static final Logger logger = LoggerFactory.getLogger(SimilarDomainsService.class); private static final Logger logger = LoggerFactory.getLogger(SimilarDomainsService.class);
private final HikariDataSource dataSource; private final HikariDataSource dataSource;
private volatile TIntIntHashMap domainIdToIdx = new TIntIntHashMap(100_000);
private volatile int[] domainIdxToId;
public volatile TIntDoubleHashMap[] relatedDomains;
public volatile TIntList[] domainNeighbors = null;
public volatile TIntList[] linkStoD = null;
public volatile TIntList[] linkDtoS = null;
public volatile BitSet screenshotDomains = null;
public volatile BitSet activeDomains = null;
public volatile BitSet indexedDomains = null;
public volatile double[] domainRanks = null;
public volatile String[] domainNames = null;
@Inject @Inject
public SimilarDomainsService(HikariDataSource dataSource) { public SimilarDomainsService(HikariDataSource dataSource) {
this.dataSource = dataSource; this.dataSource = dataSource;
Executors.newSingleThreadExecutor().submit(this::init);
} }
public List<SimilarDomain> getSimilarDomains(int domainId, int count) { private void init() {
// Tell me you've worked in enterprise software without telling me you've worked in enterprise software
String q1 = """
SELECT
NEIGHBOR.ID AS ID,
NEIGHBOR.DOMAIN_NAME AS DOMAIN_NAME,
SCREENSHOT.DOMAIN_NAME IS NOT NULL AS HAS_SCREENSHOT,
NODE_AFFINITY > 0 AS INDEXED,
STATE='ACTIVE' AS ACTIVE,
RELATEDNESS,
RANK,
STOD.ID IS NOT NULL AS LINK_STOD,
DTOS.ID IS NOT NULL AS LINK_DTOS
FROM EC_DOMAIN_NEIGHBORS_2
INNER JOIN EC_DOMAIN AS NEIGHBOR ON EC_DOMAIN_NEIGHBORS_2.NEIGHBOR_ID = NEIGHBOR.ID
LEFT JOIN DATA_DOMAIN_SCREENSHOT AS SCREENSHOT ON NEIGHBOR.DOMAIN_NAME = SCREENSHOT.DOMAIN_NAME
LEFT JOIN EC_DOMAIN_LINK STOD ON STOD.SOURCE_DOMAIN_ID = NEIGHBOR.ID AND STOD.DEST_DOMAIN_ID = EC_DOMAIN_NEIGHBORS_2.DOMAIN_ID
LEFT JOIN EC_DOMAIN_LINK DTOS ON DTOS.DEST_DOMAIN_ID = NEIGHBOR.ID AND DTOS.SOURCE_DOMAIN_ID = EC_DOMAIN_NEIGHBORS_2.DOMAIN_ID
WHERE DOMAIN_ID = ?
ORDER BY RELATEDNESS DESC, RANK ASC
LIMIT ?
""";
String q2 = """
SELECT
NEIGHBOR.ID AS ID,
NEIGHBOR.DOMAIN_NAME AS DOMAIN_NAME,
SCREENSHOT.DOMAIN_NAME IS NOT NULL AS HAS_SCREENSHOT,
NODE_AFFINITY > 0 AS INDEXED,
STATE='ACTIVE' AS ACTIVE,
RELATEDNESS,
RANK,
STOD.ID IS NOT NULL AS LINK_STOD,
DTOS.ID IS NOT NULL AS LINK_DTOS
FROM EC_DOMAIN_NEIGHBORS_2
INNER JOIN EC_DOMAIN AS NEIGHBOR ON EC_DOMAIN_NEIGHBORS_2.DOMAIN_ID = NEIGHBOR.ID
LEFT JOIN DATA_DOMAIN_SCREENSHOT AS SCREENSHOT ON NEIGHBOR.DOMAIN_NAME = SCREENSHOT.DOMAIN_NAME
LEFT JOIN EC_DOMAIN_LINK STOD ON STOD.SOURCE_DOMAIN_ID = NEIGHBOR.ID AND STOD.DEST_DOMAIN_ID = EC_DOMAIN_NEIGHBORS_2.NEIGHBOR_ID
LEFT JOIN EC_DOMAIN_LINK DTOS ON DTOS.DEST_DOMAIN_ID = NEIGHBOR.ID AND DTOS.SOURCE_DOMAIN_ID = EC_DOMAIN_NEIGHBORS_2.NEIGHBOR_ID
WHERE NEIGHBOR_ID = ?
ORDER BY RELATEDNESS DESC, RANK ASC
LIMIT ?
""";
var domains = executeSimilarDomainsQueries(domainId, count, q1, q2); logger.info("Loading similar domains data... ");
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.createStatement(ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY)) {
stmt.setFetchSize(1000);
ResultSet rs;
rs = stmt.executeQuery("SELECT ID FROM EC_DOMAIN");
while (rs.next()) {
int id = rs.getInt(1);
domainIdToIdx.put(id, domainIdToIdx.size());
}
domainIdxToId = new int[domainIdToIdx.size()];
domainIdToIdx.forEachEntry((id, idx) -> {
domainIdxToId[idx] = id;
return true;
});
domainRanks = new double[domainIdToIdx.size()];
domainNames = new String[domainIdToIdx.size()];
domainNeighbors = new TIntList[domainIdToIdx.size()];
linkStoD = new TIntList[domainIdToIdx.size()];
linkDtoS = new TIntList[domainIdToIdx.size()];
screenshotDomains = new BitSet(domainIdToIdx.size());
activeDomains = new BitSet(domainIdToIdx.size());
indexedDomains = new BitSet(domainIdToIdx.size());
relatedDomains = new TIntDoubleHashMap[domainIdToIdx.size()];
logger.info("Loaded {} domain IDs", domainIdToIdx.size());
rs = stmt.executeQuery("""
SELECT DOMAIN_ID, NEIGHBOR_ID, RELATEDNESS FROM EC_DOMAIN_NEIGHBORS_2
""");
while (rs.next()) {
int did = rs.getInt(1);
int nid = rs.getInt(2);
int didx = domainIdToIdx.get(did);
int nidx = domainIdToIdx.get(nid);
int lowerIndex = Math.min(didx, nidx);
int higherIndex = Math.max(didx, nidx);
if (relatedDomains[lowerIndex] == null)
relatedDomains[lowerIndex] = new TIntDoubleHashMap(32);
relatedDomains[lowerIndex].put(higherIndex, Math.round(100 * rs.getDouble(3)));
if (domainNeighbors[didx] == null)
domainNeighbors[didx] = new TIntArrayList(32);
if (domainNeighbors[nidx] == null)
domainNeighbors[nidx] = new TIntArrayList(32);
domainNeighbors[didx].add(nidx);
domainNeighbors[nidx].add(didx);
}
logger.info("Loaded {} related domains", relatedDomains.length);
rs = stmt.executeQuery("""
SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK
""");
while (rs.next()) {
int source = rs.getInt(1);
int dest = rs.getInt(2);
int sourceIdx = domainIdToIdx.get(source);
int destIdx = domainIdToIdx.get(dest);
if (linkStoD[sourceIdx] == null)
linkStoD[sourceIdx] = new TIntArrayList(32);
if (linkDtoS[destIdx] == null)
linkDtoS[destIdx] = new TIntArrayList(32);
linkStoD[sourceIdx].add(destIdx);
linkDtoS[destIdx].add(sourceIdx);
}
logger.info("Loaded links...");
rs = stmt.executeQuery("""
SELECT EC_DOMAIN.ID,
RANK,
STATE='ACTIVE' AS ACTIVE,
NODE_AFFINITY > 0 AS INDEXED,
EC_DOMAIN.DOMAIN_NAME AS DOMAIN_NAME
FROM EC_DOMAIN
""");
while (rs.next()) {
final int id = rs.getInt("ID");
final int idx = domainIdToIdx.get(id);
domainRanks[idx] = Math.round(100 * (1. - rs.getDouble("RANK")));
domainNames[idx] = rs.getString("DOMAIN_NAME");
if (rs.getBoolean("INDEXED"))
indexedDomains.set(idx);
if (rs.getBoolean("ACTIVE"))
activeDomains.set(idx);
}
rs = stmt.executeQuery("""
SELECT EC_DOMAIN.ID
FROM EC_DOMAIN INNER JOIN DATA_DOMAIN_SCREENSHOT AS SCREENSHOT ON EC_DOMAIN.DOMAIN_NAME = SCREENSHOT.DOMAIN_NAME
""");
while (rs.next()) {
final int id = rs.getInt(1);
final int idx = domainIdToIdx.get(id);
screenshotDomains.set(idx);
}
logger.info("Loaded {} domains", domainRanks.length);
logger.info("All done!");
}
}
catch (SQLException throwables) {
logger.warn("Failed to get domain neighbors for domain", throwables);
}
}
double getRelatedness(int a, int b) {
int lowerIndex = Math.min(domainIdToIdx.get(a), domainIdToIdx.get(b));
int higherIndex = Math.max(domainIdToIdx.get(a), domainIdToIdx.get(b));
if (relatedDomains[lowerIndex] == null)
return 0;
return relatedDomains[lowerIndex].get(higherIndex);
}
public List<SimilarDomain> getSimilarDomains(int domainId, int count) {
int domainIdx = domainIdToIdx.get(domainId);
TIntList allIdsList = domainNeighbors[domainIdx];
if (allIdsList == null)
return List.of();
TIntList allIds = new TIntArrayList(new TIntHashSet(allIdsList));
TIntSet linkingIdsDtoS = getLinkingIdsDToS(domainIdx);
TIntSet linkingIdsStoD = getLinkingIdsSToD(domainIdx);
int[] idsArray = new int[allIds.size()];
int[] idxArray = new int[idsArray.length];
for (int i = 0; i < idsArray.length; i++) {
idxArray[i] = allIds.get(i);
idsArray[i] = domainIdxToId[allIds.get(i)];
}
double[] relatednessArray = new double[idsArray.length];
for (int i = 0; i < idsArray.length; i++) {
relatednessArray[i] = getRelatedness(domainId, idsArray[i]);
}
int[] resultIds = IntStream.range(0, idsArray.length)
.boxed()
.sorted((id1, id2) -> {
int diff = Double.compare(relatednessArray[id1], relatednessArray[id2]);
if (diff != 0)
return -diff;
return Integer.compare(idsArray[id1], idsArray[id2]);
})
.mapToInt(idx -> idxArray[idx])
.limit(count)
.toArray();
List<SimilarDomain> domains = new ArrayList<>();
for (int idx : resultIds) {
int id = domainIdxToId[idx];
domains.add(new SimilarDomain(
new EdgeDomain(domainNames[idx]).toRootUrl(),
id,
getRelatedness(domainId, id),
domainRanks[idx],
indexedDomains.get(idx),
activeDomains.get(idx),
screenshotDomains.get(idx),
LinkType.find(
linkingIdsStoD.contains(idx),
linkingIdsDtoS.contains(idx)
)
));
}
domains.removeIf(d -> d.url.domain.toString().length() > 32); domains.removeIf(d -> d.url.domain.toString().length() > 32);
domains.sort(Comparator.comparing(SimilarDomain::relatedness).reversed().thenComparing(SimilarDomain::domainId));
return domains; return domains;
} }
private TIntSet getLinkingIdsDToS(int domainId) { private TIntSet getLinkingIdsDToS(int domainIdx) {
String idQuery = """ var items = linkDtoS[domainIdx];
SELECT DEST_DOMAIN_ID AS ID FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=? if (items == null)
"""; return new TIntHashSet();
return new TIntHashSet(items);
TIntSet ids = new TIntHashSet();
try (var connection = dataSource.getConnection()) {
try (var stmt1 = connection.prepareStatement(idQuery)) {
stmt1.setInt(1, domainId);
var rsp = stmt1.executeQuery();
while (rsp.next()) {
ids.add(rsp.getInt(1));
}
}
}
catch (SQLException throwables) {
logger.warn("Failed to get domain neighbors for domain", throwables);
}
return ids;
}
private TIntSet getLinkingIdsSToD(int domainId) {
String idQuery = """
SELECT SOURCE_DOMAIN_ID AS ID FROM EC_DOMAIN_LINK WHERE DEST_DOMAIN_ID=?
""";
TIntSet ids = new TIntHashSet();
try (var connection = dataSource.getConnection()) {
try (var stmt1 = connection.prepareStatement(idQuery)) {
stmt1.setInt(1, domainId);
var rsp = stmt1.executeQuery();
while (rsp.next()) {
ids.add(rsp.getInt(1));
}
}
}
catch (SQLException throwables) {
logger.warn("Failed to get domain neighbors for domain", throwables);
}
return ids;
} }
private TIntDoubleMap getRelatedness(int selfId, TIntSet ids) { private TIntSet getLinkingIdsSToD(int domainIdx) {
String idQuery = """ var items = linkStoD[domainIdx];
SELECT RELATEDNESS FROM WMSA_prod.EC_DOMAIN_NEIGHBORS_2 WHERE DOMAIN_ID=? AND NEIGHBOR_ID=? if (items == null)
"""; return new TIntHashSet();
return new TIntHashSet(items);
TIntDoubleMap ret = new TIntDoubleHashMap(ids.size());
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement(idQuery)) {
for (var id : ids.toArray()) {
if (selfId > id) {
stmt.setInt(1, selfId);
stmt.setInt(2, id);
}
else {
stmt.setInt(1, id);
stmt.setInt(2, selfId);
}
var rsp = stmt.executeQuery();
if (rsp.next()) {
double relatedness = rsp.getDouble(1);
ret.put(id, relatedness);
}
}
}
}
catch (SQLException throwables) {
logger.warn("Failed to get domain neighbors for domain", throwables);
}
return ret;
} }
public List<SimilarDomain> getLinkingDomains(int domainId, int count) { public List<SimilarDomain> getLinkingDomains(int domainId, int count) {
TIntSet linkingIdsDtoS = getLinkingIdsDToS(domainId); int domainIdx = domainIdToIdx.get(domainId);
TIntSet linkingIdsStoD = getLinkingIdsSToD(domainId);
TIntSet allIds = new TIntHashSet(linkingIdsDtoS.size() + linkingIdsStoD.size()); TIntSet linkingIdsDtoS = getLinkingIdsDToS(domainIdx);
allIds.addAll(linkingIdsDtoS); TIntSet linkingIdsStoD = getLinkingIdsSToD(domainIdx);
allIds.addAll(linkingIdsStoD);
TIntDoubleMap relatedness = getRelatedness(domainId, allIds); TIntSet allIdx = new TIntHashSet(linkingIdsDtoS.size() + linkingIdsStoD.size());
allIdx.addAll(linkingIdsDtoS);
allIdx.addAll(linkingIdsStoD);
List<SimilarDomain> domains = new ArrayList(); int[] idxArray = allIdx.toArray();
int[] idsArray = new int[idxArray.length];
try (var connection = dataSource.getConnection()) { for (int i = 0; i < idsArray.length; i++) {
try (var stmt = connection.prepareStatement(""" idsArray[i] = domainIdxToId[idxArray[i]];
SELECT EC_DOMAIN.DOMAIN_NAME,
SCREENSHOT.DOMAIN_NAME IS NOT NULL AS HAS_SCREENSHOT,
NODE_AFFINITY > 0 AS INDEXED,
STATE='ACTIVE' AS ACTIVE,
RANK
FROM EC_DOMAIN
LEFT JOIN DATA_DOMAIN_SCREENSHOT AS SCREENSHOT
ON EC_DOMAIN.DOMAIN_NAME = SCREENSHOT.DOMAIN_NAME
WHERE ID=?
""")) {
for (int id : allIds.toArray()) {
stmt.setInt(1, id);
var rsp = stmt.executeQuery();
if (rsp.next()) {
domains.add(new SimilarDomain(
new EdgeDomain(rsp.getString("DOMAIN_NAME")).toRootUrl(),
id,
Math.round(100 * relatedness.get(id)),
Math.round(100 * (1. - rsp.getDouble("RANK"))),
rsp.getBoolean("INDEXED"),
rsp.getBoolean("ACTIVE"),
rsp.getBoolean("HAS_SCREENSHOT"),
LinkType.find(
linkingIdsStoD.contains(id),
linkingIdsDtoS.contains(id)
)
));
}
}
}
} }
catch (SQLException throwables) {
logger.warn("Failed to get domain neighbors for domain", throwables); double[] ranksArray = new double[idsArray.length];
for (int i = 0; i < idxArray.length; i++) {
ranksArray[i] = domainRanks[idxArray[i]];
}
double[] relatednessArray = new double[idsArray.length];
for (int i = 0; i < idsArray.length; i++) {
relatednessArray[i] = getRelatedness(domainId, idsArray[i]);
}
int[] linkinessArray = new int[idxArray.length];
for (int i = 0; i < idxArray.length; i++) {
linkinessArray[i] = (linkingIdsDtoS.contains(idxArray[i]) ? 1 : 0) + (linkingIdsStoD.contains(idxArray[i]) ? 1 : 0);
}
int[] resultIds = IntStream.range(0, idsArray.length)
.boxed()
.sorted((id1, id2) -> {
int diff = Double.compare(ranksArray[id1], ranksArray[id2]);
if (diff != 0)
return -diff;
diff = Double.compare(relatednessArray[id1], relatednessArray[id2]);
if (diff != 0)
return -diff;
diff = Integer.compare(linkinessArray[id1], linkinessArray[id2]);
if (diff != 0)
return -diff;
return Integer.compare(idsArray[id1], idsArray[id2]);
})
.mapToInt(idx -> idsArray[idx])
.limit(count)
.toArray();
List<SimilarDomain> domains = new ArrayList<>();
for (int id : resultIds) {
int idx = domainIdToIdx.get(id);
domains.add(new SimilarDomain(
new EdgeDomain(domainNames[idx]).toRootUrl(),
id,
getRelatedness(domainId, id),
domainRanks[idx],
indexedDomains.get(idx),
activeDomains.get(idx),
screenshotDomains.get(idx),
LinkType.find(
linkingIdsStoD.contains(idx),
linkingIdsDtoS.contains(idx)
)
));
} }
domains.removeIf(d -> d.url.domain.toString().length() > 32); domains.removeIf(d -> d.url.domain.toString().length() > 32);
domains.sort(Comparator.comparing(SimilarDomain::rank)
.thenComparing(SimilarDomain::relatedness)
.thenComparing(SimilarDomain::indexed).reversed()
.thenComparing(SimilarDomain::domainId));
if (domains.size() > count)
domains.subList(count, domains.size()).clear();
return domains;
}
private List<SimilarDomain> executeSimilarDomainsQueries(int domainId, int count, String... queries) {
List<SimilarDomain> domains = new ArrayList<>(count);
TIntHashSet seen = new TIntHashSet();
try (var connection = dataSource.getConnection()) {
for (var query : queries) {
try (var stmt = connection.prepareStatement(query)) {
stmt.setFetchSize(count);
stmt.setInt(1, domainId);
stmt.setInt(2, count);
var rsp = stmt.executeQuery();
while (rsp.next() && domains.size() < count * 2) {
int id = rsp.getInt("ID");
if (seen.add(id)) {
boolean linkStod = rsp.getBoolean("LINK_STOD");
boolean linkDtos = rsp.getBoolean("LINK_DTOS");
LinkType linkType = LinkType.find(linkStod, linkDtos);
domains.add(new SimilarDomain(
new EdgeDomain(rsp.getString("DOMAIN_NAME")).toRootUrl(),
id,
Math.round(100 * rsp.getDouble("RELATEDNESS")),
Math.round(100 * (1. - rsp.getDouble("RANK"))),
rsp.getBoolean("INDEXED"),
rsp.getBoolean("ACTIVE"),
rsp.getBoolean("HAS_SCREENSHOT"),
linkType
));
}
}
}
}
} catch (SQLException throwables) {
logger.warn("Failed to get domain neighbors for domain", throwables);
}
return domains; return domains;
} }