mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
Improve site info for unknown domains:
* Placeholder screenshot should work
* Add a link to git-repo for submitting the site for crawling
This commit is contained in:
parent
d167ad2017
commit
a6a66c6d8a
@ -47,6 +47,33 @@ public class DbDomainQueries {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Looks up the id of a domain, reporting an unknown domain as an empty
 * Optional.
 *
 * The in-memory domainIdCache is consulted first via getIfPresent; on a miss
 * the EC_DOMAIN table is queried by DOMAIN_NAME using domain.toString(). An id
 * found in the database is stored in the cache before being returned.
 *
 * @param domain the domain whose id is wanted
 * @return the cached or freshly loaded id, or Optional.empty() when the query
 *         returns no row
 */
@SneakyThrows
|
||||||
|
public Optional<EdgeId<EdgeDomain>> tryGetDomainId(EdgeDomain domain) {
|
||||||
|
|
||||||
|
var maybe = Optional.ofNullable(domainIdCache.getIfPresent(domain));
|
||||||
|
|
||||||
|
if (maybe.isPresent())
|
||||||
|
return maybe;
|
||||||
|
|
||||||
|
try (var connection = dataSource.getConnection()) {
|
||||||
|
|
||||||
|
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
|
||||||
|
stmt.setString(1, domain.toString());
|
||||||
|
var rsp = stmt.executeQuery();
|
||||||
|
if (rsp.next()) {
|
||||||
|
var id = new EdgeId<EdgeDomain>(rsp.getInt(1));
|
||||||
|
|
||||||
|
domainIdCache.put(domain, id);
|
||||||
|
return Optional.of(id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
// NOTE(review): nothing visible inside the try block above obviously throws
|
// UncheckedExecutionException (the cache is read with getIfPresent, outside
|
// the try) -- confirm whether this handler is still needed.
catch (UncheckedExecutionException ex) {
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public Optional<EdgeDomain> getDomain(EdgeId<EdgeDomain> id) {
|
public Optional<EdgeDomain> getDomain(EdgeId<EdgeDomain> id) {
|
||||||
try (var connection = dataSource.getConnection()) {
|
try (var connection = dataSource.getConnection()) {
|
||||||
|
@ -2,6 +2,7 @@ package nu.marginalia.search.command.commands;
|
|||||||
|
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import nu.marginalia.db.DbDomainQueries;
|
import nu.marginalia.db.DbDomainQueries;
|
||||||
|
import nu.marginalia.model.EdgeDomain;
|
||||||
import nu.marginalia.search.model.UrlDetails;
|
import nu.marginalia.search.model.UrlDetails;
|
||||||
import nu.marginalia.search.command.SearchCommandInterface;
|
import nu.marginalia.search.command.SearchCommandInterface;
|
||||||
import nu.marginalia.search.command.SearchParameters;
|
import nu.marginalia.search.command.SearchParameters;
|
||||||
@ -64,8 +65,15 @@ public class SiteListCommand implements SearchCommandInterface {
|
|||||||
if (null != domain) {
|
if (null != domain) {
|
||||||
var dumbQuery = queryFactory.createQuery(SearchProfile.CORPO, 100, 100, "site:"+domain);
|
var dumbQuery = queryFactory.createQuery(SearchProfile.CORPO, 100, 100, "site:"+domain);
|
||||||
resultSet = searchQueryIndexService.executeQuery(ctx, dumbQuery);
|
resultSet = searchQueryIndexService.executeQuery(ctx, dumbQuery);
|
||||||
domainId = domainQueries.getDomainId(domain).id();
|
var maybeId = domainQueries.tryGetDomainId(domain);
|
||||||
screenshotPath = Path.of("/screenshot/" + domainId);
|
if (maybeId.isPresent()) {
|
||||||
|
domainId = maybeId.get().id();
|
||||||
|
screenshotPath = Path.of("/screenshot/" + domainId);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
domainId = -1;
|
||||||
|
screenshotPath = Path.of("/screenshot/0");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
resultSet = Collections.emptyList();
|
resultSet = Collections.emptyList();
|
||||||
@ -90,7 +98,10 @@ public class SiteListCommand implements SearchCommandInterface {
|
|||||||
String word = humanQuery.substring(definePrefix.length()).toLowerCase();
|
String word = humanQuery.substring(definePrefix.length()).toLowerCase();
|
||||||
|
|
||||||
logger.info("Fetching Site Info: {}", word);
|
logger.info("Fetching Site Info: {}", word);
|
||||||
var results = domainInformationService.domainInfo(word).orElseGet(DomainInformation::new);
|
|
||||||
|
var results = domainInformationService
|
||||||
|
.domainInfo(word)
|
||||||
|
.orElseGet(() -> unknownSite(word));
|
||||||
|
|
||||||
logger.debug("Results = {}", results);
|
logger.debug("Results = {}", results);
|
||||||
|
|
||||||
@ -98,4 +109,11 @@ public class SiteListCommand implements SearchCommandInterface {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Builds a DomainInformation for a site with no stored domain information.
 *
 * Only three properties are set explicitly: the domain (constructed from the
 * given string), suggestForCrawling = true and unknownDomain = true. All other
 * fields keep whatever the builder leaves them at.
 *
 * @param url string handed to the EdgeDomain constructor
 *            (NOTE(review): despite the name, the caller visible in this diff
 *            passes the lower-cased remainder of the query -- confirm it is
 *            always a bare domain name)
 * @return the built DomainInformation
 */
private DomainInformation unknownSite(String url) {
|
||||||
|
return DomainInformation.builder()
|
||||||
|
.domain(new EdgeDomain(url))
|
||||||
|
.suggestForCrawling(true)
|
||||||
|
.unknownDomain(true)
|
||||||
|
.build();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -20,6 +20,7 @@ public class DomainInformation {
|
|||||||
|
|
||||||
boolean suggestForCrawling;
|
boolean suggestForCrawling;
|
||||||
boolean inCrawlQueue;
|
boolean inCrawlQueue;
|
||||||
|
boolean unknownDomain;
|
||||||
|
|
||||||
String state;
|
String state;
|
||||||
List<EdgeDomain> linkingDomains;
|
List<EdgeDomain> linkingDomains;
|
||||||
|
@ -26,19 +26,34 @@
|
|||||||
{{/if}}
|
{{/if}}
|
||||||
|
|
||||||
{{#if suggestForCrawling}}
|
{{#if suggestForCrawling}}
|
||||||
<form method="POST" action="/site/suggest/">
|
{{#if unknownDomain}}
|
||||||
|
|
||||||
<fieldset>
|
<fieldset>
|
||||||
<legend>Crawling</legend>
|
<legend>Crawling</legend>
|
||||||
This website is not queued for crawling. If you would like it to be crawled,
|
This website is not known to the search engine.
|
||||||
use the checkbox and button below.<p/>
|
|
||||||
<input type="hidden" name="id" value="{{domainId}}" />
|
To submit the website for crawling, follow <a
|
||||||
<input type="checkbox" id="nomisclick" name="nomisclick" /> <label for="nomisclick"> This is not a mis-click </label>
|
rel="noopener noreferrer"
|
||||||
<br/>
|
target="_blank"
|
||||||
<br/>
|
href="https://github.com/MarginaliaSearch/submit-site-to-marginalia-search">these instructions</a>.
|
||||||
<input type="submit" value="Add {{domain}} to queue" />
|
|
||||||
</fieldset>
|
</fieldset>
|
||||||
|
{{/if}}
|
||||||
|
{{#unless unknownDomain}}
|
||||||
|
<form method="POST" action="/site/suggest/">
|
||||||
|
<fieldset>
|
||||||
|
<legend>Crawling</legend>
|
||||||
|
This website is not queued for crawling. If you would like it to be crawled,
|
||||||
|
use the checkbox and button below.<p/>
|
||||||
|
<input type="hidden" name="id" value="{{domainId}}" />
|
||||||
|
<input type="checkbox" id="nomisclick" name="nomisclick" /> <label for="nomisclick"> This is not a mis-click </label>
|
||||||
|
<br/>
|
||||||
|
<br/>
|
||||||
|
<input type="submit" value="Add {{domain}} to queue" />
|
||||||
|
</fieldset>
|
||||||
</form>
|
</form>
|
||||||
|
{{/unless}}
|
||||||
{{/if}}
|
{{/if}}
|
||||||
|
|
||||||
{{#if pagesFetched}}
|
{{#if pagesFetched}}
|
||||||
<p>
|
<p>
|
||||||
If you've found a reason why this website should not be indexed,
|
If you've found a reason why this website should not be indexed,
|
||||||
|
Loading…
Reference in New Issue
Block a user