Improve site info for unknown domains:

* Show the placeholder screenshot when the domain is not in the database
* Add a link to the Git repository for submitting the site for crawling
This commit is contained in:
Viktor Lofgren 2023-06-27 15:32:11 +02:00
parent d167ad2017
commit a6a66c6d8a
4 changed files with 72 additions and 11 deletions

View File

@ -47,6 +47,33 @@ public class DbDomainQueries {
}
}
/**
 * Looks up the id of a domain, reporting an unknown domain as an empty
 * {@link Optional}.
 *
 * <p>The id cache is consulted first. On a cache miss, EC_DOMAIN is queried
 * by DOMAIN_NAME; an id found that way is stored in the cache before it is
 * returned.
 *
 * @param domain the domain whose id is wanted
 * @return the id, or an empty Optional when no row matches or an
 *         UncheckedExecutionException is raised during the lookup
 */
@SneakyThrows
public Optional<EdgeId<EdgeDomain>> tryGetDomainId(EdgeDomain domain) {
    // Fast path: the id was already resolved earlier.
    var cachedId = domainIdCache.getIfPresent(domain);
    if (cachedId != null) {
        return Optional.of(cachedId);
    }

    try (var connection = dataSource.getConnection();
         var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
        stmt.setString(1, domain.toString());

        var rsp = stmt.executeQuery();
        if (!rsp.next()) {
            // No such domain in the table.
            return Optional.empty();
        }

        var id = new EdgeId<EdgeDomain>(rsp.getInt(1));
        domainIdCache.put(domain, id);
        return Optional.of(id);
    }
    catch (UncheckedExecutionException ex) {
        return Optional.empty();
    }
}
@SneakyThrows
public Optional<EdgeDomain> getDomain(EdgeId<EdgeDomain> id) {
try (var connection = dataSource.getConnection()) {

View File

@ -2,6 +2,7 @@ package nu.marginalia.search.command.commands;
import com.google.inject.Inject;
import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.search.model.UrlDetails;
import nu.marginalia.search.command.SearchCommandInterface;
import nu.marginalia.search.command.SearchParameters;
@ -64,8 +65,15 @@ public class SiteListCommand implements SearchCommandInterface {
if (null != domain) {
var dumbQuery = queryFactory.createQuery(SearchProfile.CORPO, 100, 100, "site:"+domain);
resultSet = searchQueryIndexService.executeQuery(ctx, dumbQuery);
domainId = domainQueries.getDomainId(domain).id();
screenshotPath = Path.of("/screenshot/" + domainId);
var maybeId = domainQueries.tryGetDomainId(domain);
if (maybeId.isPresent()) {
domainId = maybeId.get().id();
screenshotPath = Path.of("/screenshot/" + domainId);
}
else {
domainId = -1;
screenshotPath = Path.of("/screenshot/0");
}
}
else {
resultSet = Collections.emptyList();
@ -90,7 +98,10 @@ public class SiteListCommand implements SearchCommandInterface {
String word = humanQuery.substring(definePrefix.length()).toLowerCase();
logger.info("Fetching Site Info: {}", word);
var results = domainInformationService.domainInfo(word).orElseGet(DomainInformation::new);
var results = domainInformationService
.domainInfo(word)
.orElseGet(() -> unknownSite(word));
logger.debug("Results = {}", results);
@ -98,4 +109,11 @@ public class SiteListCommand implements SearchCommandInterface {
}
/**
 * Builds the site information used when no stored information is available
 * for the requested site: the result is flagged as an unknown domain and as
 * a suggestion for crawling.
 *
 * @param url text handed to the {@code EdgeDomain} constructor
 * @return a DomainInformation with only the domain, suggestForCrawling and
 *         unknownDomain properties set explicitly
 */
private DomainInformation unknownSite(String url) {
    final EdgeDomain domain = new EdgeDomain(url);

    return DomainInformation.builder()
            .unknownDomain(true)
            .suggestForCrawling(true)
            .domain(domain)
            .build();
}
}

View File

@ -20,6 +20,7 @@ public class DomainInformation {
boolean suggestForCrawling;
boolean inCrawlQueue;
boolean unknownDomain;
String state;
List<EdgeDomain> linkingDomains;

View File

@ -26,19 +26,34 @@
{{/if}}
{{#if suggestForCrawling}}
<form method="POST" action="/site/suggest/">
{{#if unknownDomain}}
<fieldset>
<legend>Crawling</legend>
This website is not queued for crawling. If you would like it to be crawled,
use the checkbox and button below.<p/>
<input type="hidden" name="id" value="{{domainId}}" />
<input type="checkbox" id="nomisclick" name="nomisclick" /> <label for="nomisclick"> This is not a mis-click </label>
<br/>
<br/>
<input type="submit" value="Add {{domain}} to queue" />
This website is not known to the search engine.
To submit the website for crawling, follow <a
rel="noopener noreferrer"
target="_blank"
href="https://github.com/MarginaliaSearch/submit-site-to-marginalia-search">these instructions</a>.
</fieldset>
{{/if}}
{{#unless unknownDomain}}
<form method="POST" action="/site/suggest/">
<fieldset>
<legend>Crawling</legend>
This website is not queued for crawling. If you would like it to be crawled,
use the checkbox and button below.<p/>
<input type="hidden" name="id" value="{{domainId}}" />
<input type="checkbox" id="nomisclick" name="nomisclick" /> <label for="nomisclick"> This is not a mis-click </label>
<br/>
<br/>
<input type="submit" value="Add {{domain}} to queue" />
</fieldset>
</form>
{{/unless}}
{{/if}}
{{#if pagesFetched}}
<p>
If you've found a reason why this website should not be indexed,