mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
Improve site info for unknown domains:
* Placeholder screenshot should work
* Add a link to git-repo for submitting the site for crawling
This commit is contained in:
parent
d167ad2017
commit
a6a66c6d8a
@ -47,6 +47,33 @@ public class DbDomainQueries {
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public Optional<EdgeId<EdgeDomain>> tryGetDomainId(EdgeDomain domain) {
|
||||
|
||||
var maybe = Optional.ofNullable(domainIdCache.getIfPresent(domain));
|
||||
|
||||
if (maybe.isPresent())
|
||||
return maybe;
|
||||
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
|
||||
stmt.setString(1, domain.toString());
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
var id = new EdgeId<EdgeDomain>(rsp.getInt(1));
|
||||
|
||||
domainIdCache.put(domain, id);
|
||||
return Optional.of(id);
|
||||
}
|
||||
}
|
||||
return Optional.empty();
|
||||
}
|
||||
catch (UncheckedExecutionException ex) {
|
||||
return Optional.empty();
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public Optional<EdgeDomain> getDomain(EdgeId<EdgeDomain> id) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
@ -2,6 +2,7 @@ package nu.marginalia.search.command.commands;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.search.model.UrlDetails;
|
||||
import nu.marginalia.search.command.SearchCommandInterface;
|
||||
import nu.marginalia.search.command.SearchParameters;
|
||||
@ -64,9 +65,16 @@ public class SiteListCommand implements SearchCommandInterface {
|
||||
if (null != domain) {
|
||||
var dumbQuery = queryFactory.createQuery(SearchProfile.CORPO, 100, 100, "site:"+domain);
|
||||
resultSet = searchQueryIndexService.executeQuery(ctx, dumbQuery);
|
||||
domainId = domainQueries.getDomainId(domain).id();
|
||||
var maybeId = domainQueries.tryGetDomainId(domain);
|
||||
if (maybeId.isPresent()) {
|
||||
domainId = maybeId.get().id();
|
||||
screenshotPath = Path.of("/screenshot/" + domainId);
|
||||
}
|
||||
else {
|
||||
domainId = -1;
|
||||
screenshotPath = Path.of("/screenshot/0");
|
||||
}
|
||||
}
|
||||
else {
|
||||
resultSet = Collections.emptyList();
|
||||
}
|
||||
@ -90,7 +98,10 @@ public class SiteListCommand implements SearchCommandInterface {
|
||||
String word = humanQuery.substring(definePrefix.length()).toLowerCase();
|
||||
|
||||
logger.info("Fetching Site Info: {}", word);
|
||||
var results = domainInformationService.domainInfo(word).orElseGet(DomainInformation::new);
|
||||
|
||||
var results = domainInformationService
|
||||
.domainInfo(word)
|
||||
.orElseGet(() -> unknownSite(word));
|
||||
|
||||
logger.debug("Results = {}", results);
|
||||
|
||||
@ -98,4 +109,11 @@ public class SiteListCommand implements SearchCommandInterface {
|
||||
|
||||
}
|
||||
|
||||
private DomainInformation unknownSite(String url) {
|
||||
return DomainInformation.builder()
|
||||
.domain(new EdgeDomain(url))
|
||||
.suggestForCrawling(true)
|
||||
.unknownDomain(true)
|
||||
.build();
|
||||
}
|
||||
}
|
||||
|
@ -20,6 +20,7 @@ public class DomainInformation {
|
||||
|
||||
boolean suggestForCrawling;
|
||||
boolean inCrawlQueue;
|
||||
boolean unknownDomain;
|
||||
|
||||
String state;
|
||||
List<EdgeDomain> linkingDomains;
|
||||
|
@ -26,6 +26,19 @@
|
||||
{{/if}}
|
||||
|
||||
{{#if suggestForCrawling}}
|
||||
{{#if unknownDomain}}
|
||||
|
||||
<fieldset>
|
||||
<legend>Crawling</legend>
|
||||
This website is not known to the search engine.
|
||||
|
||||
To submit the website for crawling, follow <a
|
||||
rel="noopener noreferrer"
|
||||
target="_blank"
|
||||
href="https://github.com/MarginaliaSearch/submit-site-to-marginalia-search">these instructions</a>.
|
||||
</fieldset>
|
||||
{{/if}}
|
||||
{{#unless unknownDomain}}
|
||||
<form method="POST" action="/site/suggest/">
|
||||
<fieldset>
|
||||
<legend>Crawling</legend>
|
||||
@ -38,7 +51,9 @@
|
||||
<input type="submit" value="Add {{domain}} to queue" />
|
||||
</fieldset>
|
||||
</form>
|
||||
{{/unless}}
|
||||
{{/if}}
|
||||
|
||||
{{#if pagesFetched}}
|
||||
<p>
|
||||
If you've found a reason why this website should not be indexed,
|
||||
|
Loading…
Reference in New Issue
Block a user