diff --git a/code/features-search/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsRandom.java b/code/features-search/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsRandom.java index 44dbe744..385d8eb5 100644 --- a/code/features-search/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsRandom.java +++ b/code/features-search/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsRandom.java @@ -26,7 +26,7 @@ public class DbBrowseDomainsRandom { public List getRandomDomains(int count, DomainBlacklist blacklist, int set) { final String q = """ - SELECT DOMAIN_ID, DOMAIN_NAME + SELECT DOMAIN_ID, DOMAIN_NAME, INDEXED FROM EC_RANDOM_DOMAINS INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE STATE<2 @@ -44,9 +44,10 @@ public class DbBrowseDomainsRandom { while (rsp.next()) { int id = rsp.getInt(1); String domain = rsp.getString(2); + boolean indexed = rsp.getBoolean("INDEXED"); if (!blacklist.isBlacklisted(id)) { - domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0)); + domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0, indexed)); } } } diff --git a/code/features-search/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsSimilarCosine.java b/code/features-search/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsSimilarCosine.java index 36651cd0..63a276a2 100644 --- a/code/features-search/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsSimilarCosine.java +++ b/code/features-search/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsSimilarCosine.java @@ -3,6 +3,7 @@ package nu.marginalia.browse; import com.google.inject.Inject; import com.google.inject.Singleton; import com.zaxxer.hikari.HikariDataSource; +import gnu.trove.set.hash.TIntHashSet; import nu.marginalia.browse.model.BrowseResult; import nu.marginalia.model.EdgeDomain; import nu.marginalia.db.DomainBlacklist; @@ -23,14 +24,15 @@ public class 
DbBrowseDomainsSimilarCosine { this.dataSource = dataSource; } - public List getDomainNeighborsAdjacentCosine(int domainId, DomainBlacklist blacklist, int count) { + public List getDomainNeighborsAdjacentCosineRequireScreenshot(int domainId, DomainBlacklist blacklist, int count) { List domains = new ArrayList<>(count); String q = """ SELECT EC_DOMAIN.ID, NV.NEIGHBOR_NAME, - NV.RELATEDNESS + NV.RELATEDNESS, + EC_DOMAIN.INDEXED FROM EC_NEIGHBORS_VIEW NV INNER JOIN DATA_DOMAIN_SCREENSHOT ON DATA_DOMAIN_SCREENSHOT.DOMAIN_NAME=NV.NEIGHBOR_NAME INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=NV.NEIGHBOR_ID @@ -49,9 +51,10 @@ public class DbBrowseDomainsSimilarCosine { int id = rsp.getInt(1); String domain = rsp.getString(2); double relatedness = rsp.getDouble(3); + boolean indexed = rsp.getBoolean("INDEXED"); if (!blacklist.isBlacklisted(id)) { - domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, relatedness)); + domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, relatedness, indexed)); } } } diff --git a/code/features-search/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsSimilarOldAlgo.java b/code/features-search/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsSimilarOldAlgo.java index 923cc4fe..bf155040 100644 --- a/code/features-search/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsSimilarOldAlgo.java +++ b/code/features-search/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsSimilarOldAlgo.java @@ -27,7 +27,7 @@ public class DbBrowseDomainsSimilarOldAlgo { final Set domains = new HashSet<>(count*3); final String q = """ - SELECT EC_DOMAIN.ID AS NEIGHBOR_ID, DOMAIN_NAME, COUNT(*) AS CNT + SELECT EC_DOMAIN.ID AS NEIGHBOR_ID, DOMAIN_NAME, COUNT(*) AS CNT, INDEXED FROM EC_DOMAIN_NEIGHBORS INNER JOIN EC_DOMAIN ON NEIGHBOR_ID=EC_DOMAIN.ID INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID @@ -54,14 +54,14 @@ public class DbBrowseDomainsSimilarOldAlgo { String domain 
= rsp.getString(2); if (!blacklist.isBlacklisted(id)) { - domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0)); + domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0, rsp.getBoolean("INDEXED"))); } } } if (domains.size() < count/2) { final String q2 = """ - SELECT EC_DOMAIN.ID, DOMAIN_NAME + SELECT EC_DOMAIN.ID, DOMAIN_NAME, INDEXED FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK B ON DEST_DOMAIN_ID=EC_DOMAIN.ID @@ -83,7 +83,7 @@ public class DbBrowseDomainsSimilarOldAlgo { String domain = rsp.getString(2); if (!blacklist.isBlacklisted(id)) { - domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0)); + domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0, rsp.getBoolean("INDEXED"))); } } } @@ -91,7 +91,7 @@ public class DbBrowseDomainsSimilarOldAlgo { if (domains.size() < count/2) { final String q3 = """ - SELECT EC_DOMAIN.ID, DOMAIN_NAME + SELECT EC_DOMAIN.ID, DOMAIN_NAME, INDEXED FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK B ON B.SOURCE_DOMAIN_ID=EC_DOMAIN.ID @@ -115,7 +115,7 @@ public class DbBrowseDomainsSimilarOldAlgo { String domain = rsp.getString(2); if (!blacklist.isBlacklisted(id)) { - domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0)); + domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0, rsp.getBoolean("INDEXED"))); } } } @@ -128,38 +128,5 @@ public class DbBrowseDomainsSimilarOldAlgo { return new ArrayList<>(domains); } - public List getRandomDomains(int count, DomainBlacklist blacklist, int set) { - - final String q = """ - SELECT DOMAIN_ID, DOMAIN_NAME - FROM EC_RANDOM_DOMAINS - INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID - WHERE STATE<2 - AND DOMAIN_SET=? - AND DOMAIN_ALIAS IS NULL - ORDER BY RAND() - LIMIT ? 
- """; - List domains = new ArrayList<>(count); - try (var conn = dataSource.getConnection()) { - try (var stmt = conn.prepareStatement(q)) { - stmt.setInt(1, set);; - stmt.setInt(2, count); - var rsp = stmt.executeQuery(); - while (rsp.next()) { - int id = rsp.getInt(1); - String domain = rsp.getString(2); - - if (!blacklist.isBlacklisted(id)) { - domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0)); - } - } - } - } - catch (SQLException ex) { - logger.error("SQL error", ex); - } - return domains; - } } diff --git a/code/features-search/random-websites/src/main/java/nu/marginalia/browse/model/BrowseResult.java b/code/features-search/random-websites/src/main/java/nu/marginalia/browse/model/BrowseResult.java index e2e899d0..e4f5460b 100644 --- a/code/features-search/random-websites/src/main/java/nu/marginalia/browse/model/BrowseResult.java +++ b/code/features-search/random-websites/src/main/java/nu/marginalia/browse/model/BrowseResult.java @@ -2,7 +2,10 @@ package nu.marginalia.browse.model; import nu.marginalia.model.EdgeUrl; -public record BrowseResult (EdgeUrl url, int domainId, double relatedness) { +public record BrowseResult (EdgeUrl url, + int domainId, + double relatedness, + boolean indexed) { public String domainHash() { var domain = url.domain; diff --git a/code/services-application/dating-service/src/main/java/nu/marginalia/dating/DatingSessionObject.java b/code/services-application/dating-service/src/main/java/nu/marginalia/dating/DatingSessionObject.java index ff87e235..47f52afa 100644 --- a/code/services-application/dating-service/src/main/java/nu/marginalia/dating/DatingSessionObject.java +++ b/code/services-application/dating-service/src/main/java/nu/marginalia/dating/DatingSessionObject.java @@ -28,7 +28,7 @@ public class DatingSessionObject { } public BrowseResult nextSimilar(int domainId, DbBrowseDomainsSimilarCosine adjacent, DomainBlacklist blacklist) { - adjacent.getDomainNeighborsAdjacentCosine(domainId, blacklist, 
25).forEach(queue::addFirst); + adjacent.getDomainNeighborsAdjacentCosineRequireScreenshot(domainId, blacklist, 25).forEach(queue::addFirst); while (queue.size() > MAX_QUEUE_SIZE) { queue.removeLast(); diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchBrowseService.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchBrowseService.java index 1064eca1..08423a4d 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchBrowseService.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchBrowseService.java @@ -52,7 +52,7 @@ public class SearchBrowseService { public BrowseResultSet getRelatedEntries(String word) { var domain = domainQueries.getDomainId(new EdgeDomain(word)); - var neighbors = similarDomains.getDomainNeighborsAdjacentCosine(domain, blacklist, 256); + var neighbors = similarDomains.getDomainNeighborsAdjacentCosineRequireScreenshot(domain, blacklist, 256); neighbors.removeIf(browseResultCleaner.shouldRemoveResultPredicate()); // If the results are very few, supplement with the alternative shitty algorithm diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java index 752047b9..b9c9cff2 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java @@ -1,9 +1,9 @@ package nu.marginalia.search.svc; + import com.google.inject.Inject; -import nu.marginalia.browse.model.BrowseResult; -import nu.marginalia.browse.model.BrowseResultSet; import nu.marginalia.client.Context; import nu.marginalia.db.DbDomainQueries; +import nu.marginalia.db.DomainBlacklist; 
import nu.marginalia.model.EdgeDomain; import nu.marginalia.renderer.MustacheRenderer; import nu.marginalia.renderer.RendererFactory; @@ -12,38 +12,37 @@ import nu.marginalia.search.model.DomainInformation; import nu.marginalia.search.model.UrlDetails; import nu.marginalia.search.siteinfo.DomainInformationService; import nu.marginalia.search.svc.SearchFlagSiteService.FlagSiteFormData; -import spark.*; +import spark.Request; +import spark.Response; -import javax.annotation.Nullable; import java.io.IOException; import java.sql.SQLException; -import java.util.ArrayList; import java.util.List; import java.util.Map; -import java.util.OptionalInt; public class SearchSiteInfoService { private final SearchOperator searchOperator; + private final SimilarDomainsService similarDomains; private final DomainInformationService domainInformationService; private final SearchFlagSiteService flagSiteService; private final DbDomainQueries domainQueries; - private final SearchBrowseService browseService; private final MustacheRenderer renderer; @Inject public SearchSiteInfoService(SearchOperator searchOperator, + SimilarDomainsService similarDomains, DomainInformationService domainInformationService, RendererFactory rendererFactory, SearchFlagSiteService flagSiteService, - DbDomainQueries domainQueries, SearchBrowseService browseService) throws IOException { + DbDomainQueries domainQueries) throws IOException { this.searchOperator = searchOperator; + this.similarDomains = similarDomains; this.domainInformationService = domainInformationService; this.flagSiteService = flagSiteService; this.domainQueries = domainQueries; this.renderer = rendererFactory.renderer("search/site-info/site-info"); - this.browseService = browseService; } @@ -60,10 +59,9 @@ public class SearchSiteInfoService { var model = switch (view) { case "links" -> listLinks(ctx, domainName); case "docs" -> listDocs(ctx, domainName); - case "info" -> siteInfo(ctx, domainName); - case "similar" -> listSimilar(ctx, 
domainName); + case "info" -> listInfo(ctx, domainName); case "report" -> reportSite(ctx, domainName); - default -> siteInfo(ctx, domainName); + default -> listInfo(ctx, domainName); }; return renderer.renderInto(response, model); @@ -108,21 +106,6 @@ public class SearchSiteInfoService { false); } - private SiteInfo siteInfo(Context ctx, String domainName) { - OptionalInt id = domainQueries.tryGetDomainId(new EdgeDomain(domainName)); - - if (id.isEmpty()) { - return new SiteInfo(domainName, -1, null, dummyInformation(domainName)); - } - - String screenshotPath = "/screenshot/"+id.getAsInt(); - DomainInformation domainInfo = domainInformationService - .domainInfo(domainName) - .orElseGet(() -> dummyInformation(domainName)); - - return new SiteInfo(domainName, id.getAsInt(), screenshotPath, domainInfo); - } - private DomainInformation dummyInformation(String domainName) { return DomainInformation.builder() .domain(new EdgeDomain(domainName)) @@ -136,11 +119,25 @@ public class SearchSiteInfoService { domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1), searchOperator.doBacklinkSearch(ctx, domainName)); } - private SimilarSites listSimilar(Context ctx, String domainName) { - return new SimilarSites(domainName, - domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1), - browseService.getRelatedEntries(domainName)); + private SiteInfoWithContext listInfo(Context ctx, String domainName) { + + final int domainId = domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1); + + final DomainInformation domainInfo = domainInformationService.domainInfo(domainName) + .orElseGet(() -> dummyInformation(domainName)); + + final List similarSet = + similarDomains.getSimilarDomains(domainId, 100); + final List linkingDomains = + similarDomains.getLinkingDomains(domainId, 100); + + return new SiteInfoWithContext(domainName, + domainId, + domainInfo, + similarSet, + linkingDomains + ); } private Docs listDocs(Context ctx, String domainName) { 
return new Docs(domainName, @@ -148,51 +145,6 @@ public class SearchSiteInfoService { searchOperator.doSiteSearch(ctx, domainName)); } - public record SiteInfo(Map view, - Map domainState, - long domainId, - String domain, - @Nullable String screenshotUrl, - DomainInformation domainInformation) - { - public SiteInfo(String domain, - long domainId, - @Nullable String screenshotUrl, - DomainInformation domainInformation) - { - this(Map.of("info", true), - Map.of(domainInfoState(domainInformation), true), - domainId, - domain, - screenshotUrl, - domainInformation); - } - - private static String domainInfoState(DomainInformation info) { - if (info.isBlacklisted()) { - return "blacklisted"; - } - if (!info.isUnknownDomain() && info.isSuggestForCrawling()) { - return "suggestForCrawling"; - } - if (info.isInCrawlQueue()) { - return "inCrawlQueue"; - } - if (info.isUnknownDomain()) { - return "unknownDomain"; - } - else { - return "indexed"; - } - } - - public String query() { return "site:" + domain; } - - public boolean isKnown() { - return domainId > 0; - } - } - public record Docs(Map view, String domain, long domainId, @@ -222,12 +174,48 @@ public class SearchSiteInfoService { } } - public record SimilarSites(Map view, String domain, long domainId, List results) { - public SimilarSites(String domain, long domainId, BrowseResultSet results) { - this(Map.of("similar", true), domain, domainId, new ArrayList<>(results.results())); + public record SiteInfoWithContext(Map view, + Map domainState, + String domain, + long domainId, + DomainInformation domainInformation, + List similar, + List linking) { + public SiteInfoWithContext(String domain, + long domainId, + DomainInformation domainInformation, + List similar, + List linking + ) + { + this(Map.of("info", true), + Map.of(domainInfoState(domainInformation), true), + domain, + domainId, + domainInformation, + similar, + linking); } - public String query() { return "similar:" + domain; } + public String query() { return 
"site:" + domain; } + + private static String domainInfoState(DomainInformation info) { + if (info.isBlacklisted()) { + return "blacklisted"; + } + if (!info.isUnknownDomain() && info.isSuggestForCrawling()) { + return "suggestForCrawling"; + } + if (info.isInCrawlQueue()) { + return "inCrawlQueue"; + } + if (info.isUnknownDomain()) { + return "unknownDomain"; + } + else { + return "indexed"; + } + } public boolean isKnown() { return domainId > 0; diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SimilarDomainsService.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SimilarDomainsService.java new file mode 100644 index 00000000..92dcb4d4 --- /dev/null +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SimilarDomainsService.java @@ -0,0 +1,245 @@ +package nu.marginalia.search.svc; + +import com.google.inject.Inject; +import com.zaxxer.hikari.HikariDataSource; +import gnu.trove.set.hash.TIntHashSet; +import nu.marginalia.db.DomainBlacklist; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; + +public class SimilarDomainsService { + + private static final Logger logger = LoggerFactory.getLogger(SimilarDomainsService.class); + private final HikariDataSource dataSource; + + @Inject + public SimilarDomainsService(HikariDataSource dataSource) { + this.dataSource = dataSource; + } + + enum LinkType { + STOD, + DTOS, + BIDI, + NONE; + + public static LinkType find(boolean linkStod, boolean linkDtos) { + if (linkDtos && linkStod) + return BIDI; + if (linkDtos) + return DTOS; + if (linkStod) + return STOD; + + return NONE; + } + + public String toString() { + return switch (this) { + case DTOS -> "→"; + case STOD -> "←"; + case BIDI -> "⇆"; + case NONE -> "-"; + }; 
+ } + + public String getDescription() { + return switch (this) { + case STOD -> "Backward Link"; + case DTOS -> "Forward Link"; + case BIDI -> "Mutual Link"; + case NONE -> "No Link"; + }; + } + }; + + public record SimilarDomain(EdgeUrl url, + int domainId, + double relatedness, + double rank, + boolean indexed, + boolean active, + boolean screenshot, + LinkType linkType) + { + public String getRankSymbols() { + if (rank > 90) { + return "★★★★★"; + } + if (rank > 70) { + return "★★★★"; + } + if (rank > 50) { + return "★★★"; + } + if (rank > 30) { + return "★★"; + } + if (rank > 10) { + return "★"; + } + return ""; + } + } + + public record SimilarDomainsSet(List domains, String focusDomain) { + public SimilarDomainsSet(List domains) { + this(domains, ""); + } + } + + public List getSimilarDomains(int domainId, int count) { + // Tell me you've worked in enterprise software without telling me you've worked in enterprise software + String q1 = """ + SELECT + NEIGHBOR.ID AS ID, + NEIGHBOR.DOMAIN_NAME AS DOMAIN_NAME, + SCREENSHOT.DOMAIN_NAME IS NOT NULL AS HAS_SCREENSHOT, + NODE_AFFINITY > 0 AS INDEXED, + STATE='ACTIVE' AS ACTIVE, + RELATEDNESS, + RANK, + STOD.ID IS NOT NULL AS LINK_STOD, + DTOS.ID IS NOT NULL AS LINK_DTOS + FROM EC_DOMAIN_NEIGHBORS_2 + INNER JOIN EC_DOMAIN AS NEIGHBOR ON EC_DOMAIN_NEIGHBORS_2.NEIGHBOR_ID = NEIGHBOR.ID + LEFT JOIN DATA_DOMAIN_SCREENSHOT AS SCREENSHOT ON NEIGHBOR.DOMAIN_NAME = SCREENSHOT.DOMAIN_NAME + LEFT JOIN EC_DOMAIN_LINK STOD ON STOD.SOURCE_DOMAIN_ID = NEIGHBOR.ID AND STOD.DEST_DOMAIN_ID = EC_DOMAIN_NEIGHBORS_2.DOMAIN_ID + LEFT JOIN EC_DOMAIN_LINK DTOS ON DTOS.DEST_DOMAIN_ID = NEIGHBOR.ID AND DTOS.SOURCE_DOMAIN_ID = EC_DOMAIN_NEIGHBORS_2.DOMAIN_ID + WHERE DOMAIN_ID = ? + ORDER BY RELATEDNESS DESC + LIMIT ? 
+ """; + String q2 = """ + SELECT + NEIGHBOR.ID AS ID, + NEIGHBOR.DOMAIN_NAME AS DOMAIN_NAME, + SCREENSHOT.DOMAIN_NAME IS NOT NULL AS HAS_SCREENSHOT, + NODE_AFFINITY > 0 AS INDEXED, + STATE='ACTIVE' AS ACTIVE, + RELATEDNESS, + RANK, + STOD.ID IS NOT NULL AS LINK_STOD, + DTOS.ID IS NOT NULL AS LINK_DTOS + FROM EC_DOMAIN_NEIGHBORS_2 + INNER JOIN EC_DOMAIN AS NEIGHBOR ON EC_DOMAIN_NEIGHBORS_2.DOMAIN_ID = NEIGHBOR.ID + LEFT JOIN DATA_DOMAIN_SCREENSHOT AS SCREENSHOT ON NEIGHBOR.DOMAIN_NAME = SCREENSHOT.DOMAIN_NAME + LEFT JOIN EC_DOMAIN_LINK STOD ON STOD.SOURCE_DOMAIN_ID = NEIGHBOR.ID AND STOD.DEST_DOMAIN_ID = EC_DOMAIN_NEIGHBORS_2.NEIGHBOR_ID + LEFT JOIN EC_DOMAIN_LINK DTOS ON DTOS.DEST_DOMAIN_ID = NEIGHBOR.ID AND DTOS.SOURCE_DOMAIN_ID = EC_DOMAIN_NEIGHBORS_2.NEIGHBOR_ID + WHERE NEIGHBOR_ID = ? + ORDER BY RELATEDNESS DESC + LIMIT ? + """; + + var domains = executeSimilarDomainsQueries(domainId, count, q1, q2); + + domains.sort(Comparator.comparing(SimilarDomain::relatedness).reversed().thenComparing(SimilarDomain::domainId)); + + return domains; + } + + public List getLinkingDomains(int domainId, int count) { + // q1: domains that link to domainId (STOD); q2: domains that domainId links to (DTOS). NA/NB look up the pair's relatedness in either orientation of EC_DOMAIN_NEIGHBORS_2. + String q1 = """ + SELECT + NEIGHBOR.ID AS ID, + NEIGHBOR.DOMAIN_NAME AS DOMAIN_NAME, + SCREENSHOT.DOMAIN_NAME IS NOT NULL AS HAS_SCREENSHOT, + NODE_AFFINITY > 0 AS INDEXED, + STATE='ACTIVE' AS ACTIVE, + COALESCE(COALESCE(NA.RELATEDNESS, NB.RELATEDNESS), 0) AS RELATEDNESS, + RANK, + TRUE AS LINK_STOD, + DTOS.ID IS NOT NULL AS LINK_DTOS + FROM EC_DOMAIN_LINK STOD + INNER JOIN EC_DOMAIN AS NEIGHBOR ON STOD.SOURCE_DOMAIN_ID = NEIGHBOR.ID + LEFT JOIN EC_DOMAIN_NEIGHBORS_2 NA ON STOD.SOURCE_DOMAIN_ID = NA.DOMAIN_ID AND STOD.DEST_DOMAIN_ID = NA.NEIGHBOR_ID + LEFT JOIN EC_DOMAIN_NEIGHBORS_2 NB ON STOD.SOURCE_DOMAIN_ID = NB.NEIGHBOR_ID AND STOD.DEST_DOMAIN_ID = NB.DOMAIN_ID + LEFT JOIN DATA_DOMAIN_SCREENSHOT AS SCREENSHOT ON NEIGHBOR.DOMAIN_NAME = 
SCREENSHOT.DOMAIN_NAME + LEFT JOIN EC_DOMAIN_LINK DTOS ON DTOS.DEST_DOMAIN_ID = STOD.SOURCE_DOMAIN_ID AND DTOS.SOURCE_DOMAIN_ID = STOD.DEST_DOMAIN_ID + WHERE STOD.DEST_DOMAIN_ID = ? + GROUP BY NEIGHBOR.ID + ORDER BY RELATEDNESS DESC + LIMIT ? + """; + String q2 = """ + SELECT + NEIGHBOR.ID AS ID, + NEIGHBOR.DOMAIN_NAME AS DOMAIN_NAME, + SCREENSHOT.DOMAIN_NAME IS NOT NULL AS HAS_SCREENSHOT, + NODE_AFFINITY > 0 AS INDEXED, + STATE='ACTIVE' AS ACTIVE, + COALESCE(COALESCE(NA.RELATEDNESS, NB.RELATEDNESS), 0) AS RELATEDNESS, + RANK, + STOD.ID IS NOT NULL AS LINK_STOD, + TRUE AS LINK_DTOS + FROM EC_DOMAIN_LINK DTOS + INNER JOIN EC_DOMAIN AS NEIGHBOR ON DTOS.DEST_DOMAIN_ID = NEIGHBOR.ID + LEFT JOIN EC_DOMAIN_NEIGHBORS_2 NA ON DTOS.DEST_DOMAIN_ID = NA.DOMAIN_ID AND DTOS.SOURCE_DOMAIN_ID = NA.NEIGHBOR_ID + LEFT JOIN EC_DOMAIN_NEIGHBORS_2 NB ON DTOS.DEST_DOMAIN_ID = NB.NEIGHBOR_ID AND DTOS.SOURCE_DOMAIN_ID = NB.DOMAIN_ID + LEFT JOIN DATA_DOMAIN_SCREENSHOT AS SCREENSHOT ON NEIGHBOR.DOMAIN_NAME = SCREENSHOT.DOMAIN_NAME + LEFT JOIN EC_DOMAIN_LINK STOD ON STOD.DEST_DOMAIN_ID = DTOS.SOURCE_DOMAIN_ID AND STOD.SOURCE_DOMAIN_ID = DTOS.DEST_DOMAIN_ID + WHERE DTOS.SOURCE_DOMAIN_ID = ? + GROUP BY NEIGHBOR.ID + ORDER BY RELATEDNESS DESC + LIMIT ? + """; + + var domains = executeSimilarDomainsQueries(domainId, count, q1, q2); + + domains.sort(Comparator.comparing(SimilarDomain::rank) + .thenComparing(SimilarDomain::relatedness) + .thenComparing(SimilarDomain::indexed).reversed() + .thenComparing(SimilarDomain::domainId)); + + return domains; + } + + private List executeSimilarDomainsQueries(int domainId, int count, String... 
queries) { + List domains = new ArrayList<>(count); + TIntHashSet seen = new TIntHashSet(); + + try (var connection = dataSource.getConnection()) { + + for (var query : queries) { + try (var stmt = connection.prepareStatement(query)) { + stmt.setFetchSize(count); + stmt.setInt(1, domainId); + stmt.setInt(2, count); + var rsp = stmt.executeQuery(); + while (rsp.next() && domains.size() < count * 2) { + int id = rsp.getInt("ID"); + + if (seen.add(id)) { + boolean linkStod = rsp.getBoolean("LINK_STOD"); + boolean linkDtos = rsp.getBoolean("LINK_DTOS"); + LinkType linkType = LinkType.find(linkStod, linkDtos); + + domains.add(new SimilarDomain( + new EdgeDomain(rsp.getString("DOMAIN_NAME")).toRootUrl(), + id, + 100 * rsp.getDouble("RELATEDNESS"), + 100 * (1. - rsp.getDouble("RANK")), + rsp.getBoolean("INDEXED"), + rsp.getBoolean("ACTIVE"), + rsp.getBoolean("HAS_SCREENSHOT"), + linkType + )); + } + } + } + } + } catch (SQLException throwables) { + logger.warn("Failed to get domain neighbors for domain", throwables); + } + + return domains; + } +} diff --git a/code/services-application/search-service/src/main/resources/static/search/serp.scss b/code/services-application/search-service/src/main/resources/static/search/serp.scss index 03be8190..28c3cf3b 100644 --- a/code/services-application/search-service/src/main/resources/static/search/serp.scss +++ b/code/services-application/search-service/src/main/resources/static/search/serp.scss @@ -12,6 +12,15 @@ $visited: #fcc; * { box-sizing: border-box; } +h1 a, h2 a { + color: $fg-light; +} +h1 a:visited, h2 a:visited { + color: $visited; +} +progress { + width: 10ch; +} body { background-color: $nicotine-light; @@ -343,6 +352,49 @@ footer { align-items: start; } + +#similar-view { + display: grid; + grid-template-columns: 1fr 1fr; + grid-template-rows: auto 1fr; + grid-gap: 1ch; + align-content: start; + justify-content: start; + align-items: start; + table { + th { + text-align: left; + } + } + .screenshot { + width: 100%; + 
height: auto; + } +} + +#similar-info { + @extend .dialog; +} + +#similar-domains { + grid-row: span 2; + + @extend .dialog; +} + +#similar-links { + @extend .dialog; +} + +@media (max-device-width: 900px) { + #similar-view { + display: block; + * { + margin-bottom: 1ch; + } + } +} + #search-box { @extend .shadowbox; diff --git a/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-index-indexed.hdb b/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-index-indexed.hdb index e780d78b..5b6e40dd 100644 --- a/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-index-indexed.hdb +++ b/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-index-indexed.hdb @@ -7,10 +7,4 @@ Pages Crawled: {{pagesFetched}}
Pages Indexed: {{pagesIndexed}}
-
- -{{#if pagesFetched}} -

- If you've found a reason why this website should not be indexed, - you may use this form to file a report.

-{{/if}} +
\ No newline at end of file diff --git a/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-index.hdb b/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-index.hdb index e38830f0..43ed2450 100644 --- a/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-index.hdb +++ b/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-index.hdb @@ -1,4 +1,3 @@ -

Indexing Information

{{#if domainState.blacklisted}} {{>search/site-info/site-info-index-blacklisted}} @@ -21,5 +20,4 @@ It may take up to a month before it is indexed. {{#if domainState.indexed}} {{>search/site-info/site-info-index-indexed}} -{{/if}} -
\ No newline at end of file +{{/if}} \ No newline at end of file diff --git a/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-links.hdb b/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-links.hdb index 9ac4642d..fa869930 100644 --- a/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-links.hdb +++ b/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-links.hdb @@ -1,9 +1,7 @@ - \ No newline at end of file +

Links

+
+ Link Graph + Ranking: {{ranking}}%
+ Incoming Links: {{incomingLinks}}
+ Outbound Links: {{outboundLinks}}
+
\ No newline at end of file diff --git a/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info.hdb b/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info.hdb index f4b427b7..63f85b2c 100644 --- a/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info.hdb +++ b/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info.hdb @@ -21,22 +21,13 @@ {{/with}} -{{#if view.info}}{{#with domainInformation}} -
-{{> search/site-info/site-info-screenshot}} -{{> search/site-info/site-info-index}} -{{> search/site-info/site-info-links}} -
-{{/with}}{{/if}} - {{#if view.links}}
Showing search results with links to {{domain}}. @@ -56,11 +47,118 @@ {{>search/site-info/site-info-report}} {{/if}} -{{#if view.similar}} -
Showing domains similar to {{domain}}
-
- {{#each results}}{{>search/browse-result}}{{/each}} -
+{{#if view.info}} +
+ A visual exploration mode is also available. +
+ + +
+
+

🌎 {{domain}}

+ + + + Screenshot of {{domain}} + + {{#with domainInformation}} + {{> search/site-info/site-info-index}} + {{> search/site-info/site-info-links}} + {{/with}} +
+ + {{#if similar}} +
+

Similar Domains

+ + + + + + + + + {{#each similar}} + + + + + + + + + {{/each}} +
MetaRankDomainSimilarity
+ {{#if indexed}} + {{#if active}} + 👀 + {{/if}} + {{#unless active}} + 🔥 + {{/unless}} + {{/if}} + + {{#if screenshot}}📷{{/if}} + + {{{linkType}}} + + {{{rankSymbols}}} + + {{url.domain}} + {{relatedness}}
+ +
+

Note: The fact that two domains are considered similar does not always mean they're in + cahoots. Similarity is a measure of how often they appear in the same contexts, + which may be an association like peas and carrots, but some pairings are also defined by their + contrasting opposition, like Sparta and Athens.

+
+ {{/if}} + + {{#if linking}} + + {{/if}} +
{{/if}} {{>search/parts/search-footer}}