From a6a66c6d8a29447ac41d18645267da1976a9cf3a Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 27 Jun 2023 15:32:11 +0200 Subject: [PATCH] Improve site info for unknown domains: * Placeholder screenshot should work * Add a link to git-repo for submitting the site for crawling --- .../nu/marginalia/db/DbDomainQueries.java | 27 ++++++++++++++++ .../command/commands/SiteListCommand.java | 24 ++++++++++++-- .../search/model/DomainInformation.java | 1 + .../search/parts/site-info-index.hdb | 31 ++++++++++++++----- 4 files changed, 72 insertions(+), 11 deletions(-) diff --git a/code/common/db/src/main/java/nu/marginalia/db/DbDomainQueries.java b/code/common/db/src/main/java/nu/marginalia/db/DbDomainQueries.java index ae52cac9..d923a82a 100644 --- a/code/common/db/src/main/java/nu/marginalia/db/DbDomainQueries.java +++ b/code/common/db/src/main/java/nu/marginalia/db/DbDomainQueries.java @@ -47,6 +47,33 @@ public class DbDomainQueries { } } + @SneakyThrows + public Optional<EdgeId<EdgeDomain>> tryGetDomainId(EdgeDomain domain) { + + var maybe = Optional.ofNullable(domainIdCache.getIfPresent(domain)); + + if (maybe.isPresent()) + return maybe; + + try (var connection = dataSource.getConnection()) { + + try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) { + stmt.setString(1, domain.toString()); + var rsp = stmt.executeQuery(); + if (rsp.next()) { + var id = new EdgeId(rsp.getInt(1)); + + domainIdCache.put(domain, id); + return Optional.of(id); + } + } + return Optional.empty(); + } + catch (UncheckedExecutionException ex) { + return Optional.empty(); + } + } + @SneakyThrows public Optional getDomain(EdgeId id) { try (var connection = dataSource.getConnection()) { diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/command/commands/SiteListCommand.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/command/commands/SiteListCommand.java index 0f24126b..eac37841 100644 --- 
a/code/services-core/search-service/src/main/java/nu/marginalia/search/command/commands/SiteListCommand.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/command/commands/SiteListCommand.java @@ -2,6 +2,7 @@ package nu.marginalia.search.command.commands; import com.google.inject.Inject; import nu.marginalia.db.DbDomainQueries; +import nu.marginalia.model.EdgeDomain; import nu.marginalia.search.model.UrlDetails; import nu.marginalia.search.command.SearchCommandInterface; import nu.marginalia.search.command.SearchParameters; @@ -64,8 +65,15 @@ public class SiteListCommand implements SearchCommandInterface { if (null != domain) { var dumbQuery = queryFactory.createQuery(SearchProfile.CORPO, 100, 100, "site:"+domain); resultSet = searchQueryIndexService.executeQuery(ctx, dumbQuery); - domainId = domainQueries.getDomainId(domain).id(); - screenshotPath = Path.of("/screenshot/" + domainId); + var maybeId = domainQueries.tryGetDomainId(domain); + if (maybeId.isPresent()) { + domainId = maybeId.get().id(); + screenshotPath = Path.of("/screenshot/" + domainId); + } + else { + domainId = -1; + screenshotPath = Path.of("/screenshot/0"); + } } else { resultSet = Collections.emptyList(); @@ -90,7 +98,10 @@ public class SiteListCommand implements SearchCommandInterface { String word = humanQuery.substring(definePrefix.length()).toLowerCase(); logger.info("Fetching Site Info: {}", word); - var results = domainInformationService.domainInfo(word).orElseGet(DomainInformation::new); + + var results = domainInformationService + .domainInfo(word) + .orElseGet(() -> unknownSite(word)); logger.debug("Results = {}", results); @@ -98,4 +109,11 @@ public class SiteListCommand implements SearchCommandInterface { } + private DomainInformation unknownSite(String url) { + return DomainInformation.builder() + .domain(new EdgeDomain(url)) + .suggestForCrawling(true) + .unknownDomain(true) + .build(); + } } diff --git 
a/code/services-core/search-service/src/main/java/nu/marginalia/search/model/DomainInformation.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/model/DomainInformation.java index 85bb438c..6a7a6c3e 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/model/DomainInformation.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/model/DomainInformation.java @@ -20,6 +20,7 @@ public class DomainInformation { boolean suggestForCrawling; boolean inCrawlQueue; + boolean unknownDomain; String state; List linkingDomains; diff --git a/code/services-core/search-service/src/main/resources/templates/search/parts/site-info-index.hdb b/code/services-core/search-service/src/main/resources/templates/search/parts/site-info-index.hdb index 9acf8bed..346f4416 100644 --- a/code/services-core/search-service/src/main/resources/templates/search/parts/site-info-index.hdb +++ b/code/services-core/search-service/src/main/resources/templates/search/parts/site-info-index.hdb @@ -26,19 +26,34 @@ {{/if}} {{#if suggestForCrawling}} -
+ {{#if unknownDomain}} +
Crawling - This website is not queued for crawling. If you would like it to be crawled, - use the checkbox and button below.

- - -
-
- + This website is not known to the search engine. + + To submit the website for crawling, follow these instructions.

+ {{/if}} + {{#unless unknownDomain}} + +
+ Crawling + This website is not queued for crawling. If you would like it to be crawled, + use the checkbox and button below.

+ + +
+
+ +

+ {{/unless}} {{/if}} + {{#if pagesFetched}}

If you've found a reason why this website should not be indexed,