From 6231f525fdb6e58ae6c72fa44cb154326695a580 Mon Sep 17 00:00:00 2001
From: vlofgren
Date: Sun, 30 Oct 2022 10:31:34 +0100
Subject: [PATCH] Prefer cosine similarity relatedness for browse:-queries.

---
 .../edge/data/dao/EdgeDataStoreDaoImpl.java   | 56 +++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java
index 19752b51..13eb644a 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java
@@ -134,10 +134,66 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
     }
 
 
+    /**
+     * Finds neighbors of a domain in EC_NEIGHBORS_VIEW, most related first.
+     * Only neighbors that also have rows in DATA_DOMAIN_SCREENSHOT and EC_DOMAIN
+     * are considered (inner joins), and blacklisted ids are skipped.
+     *
+     * @param domainId  id of the domain whose neighbors are looked up
+     * @param blacklist neighbor ids it reports as blacklisted are left out
+     * @param count     maximum number of results to collect
+     * @return the collected results; if an SQLException occurs its stack trace
+     *         is printed and whatever was collected so far is returned
+     */
+    public List getDomainNeighborsAdjacentCosine(EdgeId domainId, EdgeDomainBlacklist blacklist, int count) {
+        List<BrowseResult> domains = new ArrayList<>(count);
+
+        // No LIMIT clause: blacklisted rows are dropped after they are fetched,
+        // so the cap on results is enforced by the read loop below instead.
+        String q = """
+                SELECT
+                    EC_DOMAIN.ID,
+                    NV.NEIGHBOR_NAME
+                FROM EC_NEIGHBORS_VIEW NV
+                INNER JOIN DATA_DOMAIN_SCREENSHOT ON DATA_DOMAIN_SCREENSHOT.DOMAIN_NAME=NV.NEIGHBOR_NAME
+                INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=NV.NEIGHBOR_ID
+                WHERE NV.DOMAIN_ID=?
+                GROUP BY NV.NEIGHBOR_ID
+                ORDER BY NV.RELATEDNESS DESC
+                """;
+
+        try (var connection = dataSource.getConnection()) {
+            try (var stmt = connection.prepareStatement(q)) {
+                stmt.setFetchSize(count);
+                // The query has a single placeholder, so only index 1 may be bound.
+                stmt.setInt(1, domainId.id());
+                var rsp = stmt.executeQuery();
+                while (rsp.next() && domains.size() < count) {
+                    int id = rsp.getInt(1);
+                    String domain = rsp.getString(2);
+
+                    if (!blacklist.isBlacklisted(id)) {
+                        domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id));
+                    }
+                }
+            }
+        } catch (SQLException throwables) {
+            throwables.printStackTrace();
+        }
+
+        return domains;
+    }
+
     @Override
     public List getDomainNeighborsAdjacent(EdgeId domainId, EdgeDomainBlacklist blacklist, int count) {
         final Set domains = new HashSet<>(count*3);
 
+        domains.addAll(getDomainNeighborsAdjacentCosine(domainId, blacklist, count));
+
+        if (domains.size() >= count) {
+            return new ArrayList<>(domains);
+        }
+
         final String q = """
                             SELECT EC_DOMAIN.ID AS NEIGHBOR_ID, DOMAIN_NAME, COUNT(*) AS CNT
                             FROM EC_DOMAIN_NEIGHBORS