mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(control) Add utility for adding domains from an external URL
This commit is contained in:
parent
185b79f2a5
commit
aeeb1d0cb7
@ -54,6 +54,7 @@ dependencies {
|
||||
implementation libs.handlebars
|
||||
|
||||
implementation libs.duckdb
|
||||
implementation libs.jsoup
|
||||
|
||||
implementation libs.trove
|
||||
implementation dependencies.create(libs.spark.get()) {
|
||||
|
@ -8,6 +8,8 @@ import nu.marginalia.control.app.model.DomainModel;
|
||||
import nu.marginalia.control.app.model.DomainSearchResultModel;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.nodecfg.NodeConfigurationService;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Element;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
import spark.Spark;
|
||||
@ -16,6 +18,9 @@ import java.io.IOException;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URI;
|
||||
import java.net.URISyntaxException;
|
||||
import java.net.http.HttpClient;
|
||||
import java.net.http.HttpRequest;
|
||||
import java.net.http.HttpResponse;
|
||||
import java.sql.SQLException;
|
||||
import java.util.*;
|
||||
|
||||
@ -38,17 +43,20 @@ public class DomainsManagementService {
|
||||
public void register() throws IOException {
|
||||
|
||||
var domainsViewRenderer = rendererFactory.renderer("control/app/domains");
|
||||
var addDomainsViewRenderer = rendererFactory.renderer("control/app/domains-new");
|
||||
var addDomainsTxtViewRenderer = rendererFactory.renderer("control/app/domains-new");
|
||||
var addDomainsUrlViewRenderer = rendererFactory.renderer("control/app/domains-new-url");
|
||||
var addDomainsAfterReportRenderer = rendererFactory.renderer("control/app/domains-new-report");
|
||||
|
||||
Spark.get("/domain", this::getDomains, domainsViewRenderer::render);
|
||||
Spark.get("/domain/new", this::addDomains, addDomainsViewRenderer::render);
|
||||
Spark.post("/domain/new", this::addDomains, addDomainsAfterReportRenderer::render);
|
||||
Spark.get("/domain/new", this::addDomainsTextfield, addDomainsTxtViewRenderer::render);
|
||||
Spark.post("/domain/new", this::addDomainsTextfield, addDomainsAfterReportRenderer::render);
|
||||
Spark.get("/domain/new-url", this::addDomainsFromDownload, addDomainsUrlViewRenderer::render);
|
||||
Spark.post("/domain/new-url", this::addDomainsFromDownload, addDomainsAfterReportRenderer::render);
|
||||
Spark.post("/domain/:id/assign/:node", this::assignDomain, new Redirects.HtmlRedirect("/domain"));
|
||||
|
||||
}
|
||||
|
||||
private Object addDomains(Request request, Response response) throws SQLException {
|
||||
private Object addDomainsTextfield(Request request, Response response) throws SQLException {
|
||||
if ("GET".equals(request.requestMethod())) {
|
||||
return "";
|
||||
}
|
||||
@ -57,49 +65,16 @@ public class DomainsManagementService {
|
||||
String domainsStr = request.queryParams("domains");
|
||||
|
||||
int node = Integer.parseInt(nodeStr);
|
||||
String[] domains = domainsStr.split("\n+");
|
||||
|
||||
List<EdgeDomain> validDomains = new ArrayList<>();
|
||||
List<String> invalidDomains = new ArrayList<>();
|
||||
List<EdgeDomain> validDomains;
|
||||
List<String> invalidDomains;
|
||||
|
||||
for (String domain : domains) {
|
||||
domain = domain.trim();
|
||||
if (domain.isBlank()) continue;
|
||||
if (domain.length() > 255) {
|
||||
invalidDomains.add(domain);
|
||||
continue;
|
||||
}
|
||||
if (domain.startsWith("#")) {
|
||||
continue;
|
||||
}
|
||||
Map.Entry<List<EdgeDomain>, List<String>> domainsList = parseDomainsList(domainsStr);
|
||||
|
||||
// Run through the URI parser to check for bad domains
|
||||
try {
|
||||
if (domain.contains(":")) {
|
||||
domain = new URI(domain ).toURL().getHost();
|
||||
}
|
||||
else {
|
||||
domain = new URI("https://" + domain + "/").toURL().getHost();
|
||||
}
|
||||
} catch (URISyntaxException | MalformedURLException e) {
|
||||
invalidDomains.add(domain);
|
||||
continue;
|
||||
}
|
||||
validDomains = domainsList.getKey();
|
||||
invalidDomains = domainsList.getValue();
|
||||
|
||||
validDomains.add(new EdgeDomain(domain));
|
||||
}
|
||||
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("INSERT IGNORE INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) VALUES (?, ?, ?)"))
|
||||
{
|
||||
for (var domain : validDomains) {
|
||||
stmt.setString(1, domain.toString());
|
||||
stmt.setString(2, domain.getTopDomain());
|
||||
stmt.setInt(3, node);
|
||||
stmt.addBatch();
|
||||
}
|
||||
stmt.executeBatch();
|
||||
}
|
||||
insertDomains(validDomains, node);
|
||||
|
||||
return Map.of("validDomains", validDomains,
|
||||
"invalidDomains", invalidDomains);
|
||||
@ -107,6 +82,123 @@ public class DomainsManagementService {
|
||||
return "";
|
||||
}
|
||||
|
||||
private Map.Entry<List<EdgeDomain>, List<String>> parseDomainsList(String domainsStr) {
|
||||
List<EdgeDomain> validDomains = new ArrayList<>();
|
||||
List<String> invalidDomains = new ArrayList<>();
|
||||
|
||||
for (String domain : domainsStr.split("\n+")) {
|
||||
domain = domain.trim();
|
||||
if (domain.isBlank()) continue;
|
||||
if (domain.length() > 255) {
|
||||
invalidDomains.add(domain);
|
||||
continue;
|
||||
}
|
||||
if (domain.startsWith("#")) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Run through the URI parser to check for bad domains
|
||||
try {
|
||||
if (domain.contains(":")) {
|
||||
domain = new URI(domain ).toURL().getHost();
|
||||
}
|
||||
else {
|
||||
domain = new URI("https://" + domain + "/").toURL().getHost();
|
||||
}
|
||||
} catch (URISyntaxException | MalformedURLException e) {
|
||||
invalidDomains.add(domain);
|
||||
continue;
|
||||
}
|
||||
|
||||
validDomains.add(new EdgeDomain(domain));
|
||||
}
|
||||
|
||||
return Map.entry(validDomains, invalidDomains);
|
||||
}
|
||||
|
||||
private Object addDomainsFromDownload(Request request, Response response) throws SQLException, URISyntaxException, IOException, InterruptedException {
|
||||
if ("GET".equals(request.requestMethod())) {
|
||||
return "";
|
||||
}
|
||||
else if ("POST".equals(request.requestMethod())) {
|
||||
String nodeStr = request.queryParams("node");
|
||||
URI domainsUrl = new URI(request.queryParams("url"));
|
||||
|
||||
int node = Integer.parseInt(nodeStr);
|
||||
|
||||
HttpClient client = HttpClient.newBuilder().build();
|
||||
var httpReq = HttpRequest.newBuilder(domainsUrl).GET().build();
|
||||
|
||||
|
||||
HttpResponse<String> result = client.send(httpReq, HttpResponse.BodyHandlers.ofString());
|
||||
if (result.statusCode() != 200) {
|
||||
return Map.of("error", "Failed to download domains");
|
||||
}
|
||||
Optional<String> ct = result.headers().firstValue("Content-Type");
|
||||
if (ct.isEmpty()) {
|
||||
return Map.of("error", "No content type");
|
||||
}
|
||||
|
||||
List<EdgeDomain> validDomains = new ArrayList<>();
|
||||
List<String> invalidDomains = new ArrayList<>();
|
||||
|
||||
String contentType = ct.get().toLowerCase();
|
||||
|
||||
if (contentType.startsWith("text/plain")) {
|
||||
var parsedDomains = parseDomainsList(result.body());
|
||||
validDomains = parsedDomains.getKey();
|
||||
invalidDomains = parsedDomains.getValue();
|
||||
}
|
||||
else {
|
||||
for (Element e : Jsoup.parse(result.body()).select("a")) {
|
||||
String s = e.attr("href");
|
||||
if (s.isBlank()) continue;
|
||||
if (!s.contains("://")) continue;
|
||||
|
||||
URI uri = URI.create(s);
|
||||
String scheme = uri.getScheme();
|
||||
String host = uri.getHost();
|
||||
|
||||
if (scheme == null || host == null)
|
||||
continue;
|
||||
if (!scheme.equalsIgnoreCase("http") && !scheme.equalsIgnoreCase("https"))
|
||||
continue;
|
||||
|
||||
validDomains.add(new EdgeDomain(host));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
insertDomains(validDomains, node);
|
||||
|
||||
|
||||
return Map.of("validDomains", validDomains,
|
||||
"invalidDomains", invalidDomains);
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
private void insertDomains(List<EdgeDomain> domains, int node) throws SQLException {
|
||||
|
||||
// Insert the domains into the database, updating the node affinity if the domain already exists and the affinity is not already set to a node
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("""
|
||||
INSERT INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY)
|
||||
VALUES (?, ?, ?)
|
||||
ON DUPLICATE KEY UPDATE NODE_AFFINITY = IF(NODE_AFFINITY<=0, VALUES(NODE_AFFINITY), NODE_AFFINITY)
|
||||
"""))
|
||||
{
|
||||
for (var domain : domains) {
|
||||
stmt.setString(1, domain.toString());
|
||||
stmt.setString(2, domain.getTopDomain());
|
||||
stmt.setInt(3, node);
|
||||
stmt.addBatch();
|
||||
}
|
||||
stmt.executeBatch();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private Object assignDomain(Request request, Response response) throws SQLException {
|
||||
|
||||
String idStr = request.params(":id");
|
||||
|
@ -10,14 +10,29 @@
|
||||
<h1 class="my-3">Add Domains Report</h1>
|
||||
|
||||
<p></p>
|
||||
{{#unless invalidDomains}}
|
||||
<p>All domains were added successfully!</p>
|
||||
{{#if error}}
|
||||
<p class="alert alert-danger">{{error}}</p>
|
||||
{{/if}}
|
||||
{{#unless error}}
|
||||
{{#unless invalidDomains}}
|
||||
<p>All domains were added successfully!</p>
|
||||
{{/unless}}
|
||||
{{/unless}}
|
||||
{{#if invalidDomains}}
|
||||
<p>Some domains were invalid and could not be added:</p>
|
||||
<textarea class="form-control" rows="10" disabled>
|
||||
{{#each invalidDomains}}{{.}}{{/each}}
|
||||
</textarea>
|
||||
<textarea class="form-control" rows="10" disabled>
|
||||
{{#each invalidDomains}}
|
||||
{{.}}
|
||||
{{/each}}
|
||||
</textarea>
|
||||
{{/if}}
|
||||
{{#if validDomains}}
|
||||
<p>If they were not already in the database, these domains were added:</p>
|
||||
<textarea class="form-control" rows="10" disabled>
|
||||
{{#each validDomains}}
|
||||
{{.}}
|
||||
{{/each}}
|
||||
</textarea>
|
||||
{{/if}}
|
||||
<p></p>
|
||||
</div>
|
||||
|
@ -0,0 +1,48 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en-US">
|
||||
<head>
|
||||
<title>Control Service</title>
|
||||
{{> control/partials/head-includes }}
|
||||
</head>
|
||||
<body>
|
||||
{{> control/partials/nav}}
|
||||
<div class="container">
|
||||
<h1 class="my-3">Add Domains (URL)</h1>
|
||||
|
||||
<div class="my-3 p-3 border bg-light">
|
||||
<p>This utility lets you add domains to be crawled via an external URL.</p>
|
||||
<a href="/domain/new">It's also possible to add domains directly via a text area</a>
|
||||
</div>
|
||||
|
||||
<form method="post">
|
||||
<div class="form-group my-3">
|
||||
<label for="url" class="form-label">Domains to add</label>
|
||||
<input type="text" class="form-control" id="url" name="url"/>
|
||||
<span class="text-muted">
|
||||
Enter the URL to the file or page that contains the domains to add. If the URL leads to a text file,
|
||||
the domains will be parsed from the file, one per line. If it leads to a HTML page, the HTML
|
||||
will be parsed and all the links will be extracted and added as domains.
|
||||
</span>
|
||||
</div>
|
||||
|
||||
<div class="form-group my-3">
|
||||
<label for="node" class="form-label">Node</label>
|
||||
<select id="node" name="node" class="form-select">
|
||||
<option value="-1">Unassigned</option>
|
||||
<option value="0" selected>Auto</option>
|
||||
{{#each global-context.nodes}}
|
||||
<option value="{{this}}">Node {{id}}</option>
|
||||
{{/each}}
|
||||
|
||||
</select>
|
||||
<span class="text-muted">
|
||||
Select the node to assign the domains to, this is the index node that will "own" the domain, crawl its documents
|
||||
and index them. If you select "Auto", the system will assign the domains to the next node that performs a crawl.
|
||||
</span>
|
||||
</div>
|
||||
<button type="submit" class="btn btn-primary">Add</button>
|
||||
</form>
|
||||
</div>
|
||||
</body>
|
||||
{{> control/partials/foot-includes }}
|
||||
</html>
|
@ -9,6 +9,11 @@
|
||||
<div class="container">
|
||||
<h1 class="my-3">Add Domains</h1>
|
||||
|
||||
<div class="my-3 p-3 border bg-light">
|
||||
<p>This utility lets you add domains to be crawled via a text area.</p>
|
||||
<a href="/domain/new-url">It's also possible to add domains via an external URL</a>
|
||||
</div>
|
||||
|
||||
<form method="post">
|
||||
<div class="form-group my-3">
|
||||
<label for="domains" class="form-label">Domains to add</label>
|
||||
|
Loading…
Reference in New Issue
Block a user