(control) Add utility for adding domains from an external URL

This commit is contained in:
Viktor Lofgren 2024-09-01 12:14:21 +02:00
parent 185b79f2a5
commit aeeb1d0cb7
5 changed files with 209 additions and 48 deletions

View File

@ -54,6 +54,7 @@ dependencies {
implementation libs.handlebars implementation libs.handlebars
implementation libs.duckdb implementation libs.duckdb
implementation libs.jsoup
implementation libs.trove implementation libs.trove
implementation dependencies.create(libs.spark.get()) { implementation dependencies.create(libs.spark.get()) {

View File

@ -8,6 +8,8 @@ import nu.marginalia.control.app.model.DomainModel;
import nu.marginalia.control.app.model.DomainSearchResultModel; import nu.marginalia.control.app.model.DomainSearchResultModel;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;
import nu.marginalia.nodecfg.NodeConfigurationService; import nu.marginalia.nodecfg.NodeConfigurationService;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import spark.Request; import spark.Request;
import spark.Response; import spark.Response;
import spark.Spark; import spark.Spark;
@ -16,6 +18,9 @@ import java.io.IOException;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.net.URI; import java.net.URI;
import java.net.URISyntaxException; import java.net.URISyntaxException;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.sql.SQLException; import java.sql.SQLException;
import java.util.*; import java.util.*;
@ -38,17 +43,20 @@ public class DomainsManagementService {
public void register() throws IOException { public void register() throws IOException {
var domainsViewRenderer = rendererFactory.renderer("control/app/domains"); var domainsViewRenderer = rendererFactory.renderer("control/app/domains");
var addDomainsViewRenderer = rendererFactory.renderer("control/app/domains-new"); var addDomainsTxtViewRenderer = rendererFactory.renderer("control/app/domains-new");
var addDomainsUrlViewRenderer = rendererFactory.renderer("control/app/domains-new-url");
var addDomainsAfterReportRenderer = rendererFactory.renderer("control/app/domains-new-report"); var addDomainsAfterReportRenderer = rendererFactory.renderer("control/app/domains-new-report");
Spark.get("/domain", this::getDomains, domainsViewRenderer::render); Spark.get("/domain", this::getDomains, domainsViewRenderer::render);
Spark.get("/domain/new", this::addDomains, addDomainsViewRenderer::render); Spark.get("/domain/new", this::addDomainsTextfield, addDomainsTxtViewRenderer::render);
Spark.post("/domain/new", this::addDomains, addDomainsAfterReportRenderer::render); Spark.post("/domain/new", this::addDomainsTextfield, addDomainsAfterReportRenderer::render);
Spark.get("/domain/new-url", this::addDomainsFromDownload, addDomainsUrlViewRenderer::render);
Spark.post("/domain/new-url", this::addDomainsFromDownload, addDomainsAfterReportRenderer::render);
Spark.post("/domain/:id/assign/:node", this::assignDomain, new Redirects.HtmlRedirect("/domain")); Spark.post("/domain/:id/assign/:node", this::assignDomain, new Redirects.HtmlRedirect("/domain"));
} }
private Object addDomains(Request request, Response response) throws SQLException { private Object addDomainsTextfield(Request request, Response response) throws SQLException {
if ("GET".equals(request.requestMethod())) { if ("GET".equals(request.requestMethod())) {
return ""; return "";
} }
@ -57,12 +65,28 @@ public class DomainsManagementService {
String domainsStr = request.queryParams("domains"); String domainsStr = request.queryParams("domains");
int node = Integer.parseInt(nodeStr); int node = Integer.parseInt(nodeStr);
String[] domains = domainsStr.split("\n+");
List<EdgeDomain> validDomains;
List<String> invalidDomains;
Map.Entry<List<EdgeDomain>, List<String>> domainsList = parseDomainsList(domainsStr);
validDomains = domainsList.getKey();
invalidDomains = domainsList.getValue();
insertDomains(validDomains, node);
return Map.of("validDomains", validDomains,
"invalidDomains", invalidDomains);
}
return "";
}
private Map.Entry<List<EdgeDomain>, List<String>> parseDomainsList(String domainsStr) {
List<EdgeDomain> validDomains = new ArrayList<>(); List<EdgeDomain> validDomains = new ArrayList<>();
List<String> invalidDomains = new ArrayList<>(); List<String> invalidDomains = new ArrayList<>();
for (String domain : domains) { for (String domain : domainsStr.split("\n+")) {
domain = domain.trim(); domain = domain.trim();
if (domain.isBlank()) continue; if (domain.isBlank()) continue;
if (domain.length() > 255) { if (domain.length() > 255) {
@ -89,10 +113,82 @@ public class DomainsManagementService {
validDomains.add(new EdgeDomain(domain)); validDomains.add(new EdgeDomain(domain));
} }
return Map.entry(validDomains, invalidDomains);
}
/**
 * Handles the "add domains from URL" form.
 *
 * <p>On GET an empty model is returned so the form can be rendered. On POST the
 * document at the {@code url} query parameter is downloaded and domains are
 * extracted from it:
 * <ul>
 *   <li>a {@code text/plain} response is handed to {@code parseDomainsList};</li>
 *   <li>any other content type is parsed as HTML, and the host of every absolute
 *       http/https {@code <a href>} is collected (each host once).</li>
 * </ul>
 * The collected domains are passed to {@code insertDomains} together with the
 * {@code node} query parameter.
 *
 * @param request  the Spark request; reads the {@code node} and {@code url} query parameters
 * @param response the Spark response (unused)
 * @return an empty string for GET and unsupported methods; for POST either a map with
 *         an {@code error} message, or a map with {@code validDomains} and
 *         {@code invalidDomains} for the report view
 * @throws SQLException         if inserting the domains fails
 * @throws URISyntaxException   if the {@code url} parameter is not a syntactically valid URI
 * @throws IOException          if the download fails or times out
 * @throws InterruptedException if the thread is interrupted while downloading
 */
private Object addDomainsFromDownload(Request request, Response response) throws SQLException, URISyntaxException, IOException, InterruptedException {
    if ("GET".equals(request.requestMethod())) {
        return "";
    }
    else if ("POST".equals(request.requestMethod())) {
        String nodeStr = request.queryParams("node");
        String urlStr = request.queryParams("url");

        // A missing form field would otherwise surface as a NullPointerException from new URI(null)
        if (urlStr == null || urlStr.isBlank()) {
            return Map.of("error", "No URL provided");
        }

        URI domainsUrl = new URI(urlStr.trim());

        // HttpRequest.newBuilder(URI) rejects anything but absolute http/https URIs with an
        // IllegalArgumentException; report that to the user instead of failing the request
        String urlScheme = domainsUrl.getScheme();
        if (urlScheme == null
                || domainsUrl.getHost() == null
                || !(urlScheme.equalsIgnoreCase("http") || urlScheme.equalsIgnoreCase("https")))
        {
            return Map.of("error", "The URL must be an absolute http or https URL");
        }

        int node = Integer.parseInt(nodeStr);

        // Follow ordinary redirects (the default policy is NEVER) and bound the time spent
        // waiting on the remote server so a stalled download cannot hang the request thread
        HttpClient client = HttpClient.newBuilder()
                .followRedirects(HttpClient.Redirect.NORMAL)
                .connectTimeout(java.time.Duration.ofSeconds(30))
                .build();
        var httpReq = HttpRequest.newBuilder(domainsUrl)
                .timeout(java.time.Duration.ofSeconds(60))
                .GET()
                .build();

        HttpResponse<String> result = client.send(httpReq, HttpResponse.BodyHandlers.ofString());
        if (result.statusCode() != 200) {
            return Map.of("error", "Failed to download domains");
        }

        Optional<String> ct = result.headers().firstValue("Content-Type");
        if (ct.isEmpty()) {
            return Map.of("error", "No content type");
        }

        List<EdgeDomain> validDomains = new ArrayList<>();
        List<String> invalidDomains = new ArrayList<>();

        // Locale.ROOT keeps the comparison independent of the JVM's default locale
        String contentType = ct.get().toLowerCase(Locale.ROOT);

        if (contentType.startsWith("text/plain")) {
            var parsedDomains = parseDomainsList(result.body());
            validDomains = parsedDomains.getKey();
            invalidDomains = parsedDomains.getValue();
        }
        else {
            // Hosts already collected, lower-cased, so each domain is inserted and reported once
            Set<String> seenHosts = new HashSet<>();

            for (Element e : Jsoup.parse(result.body()).select("a")) {
                String s = e.attr("href").trim();

                if (s.isBlank()) continue;
                if (!s.contains("://")) continue;

                URI uri;
                try {
                    uri = URI.create(s);
                }
                catch (IllegalArgumentException ex) {
                    // A single malformed link must not abort the whole import
                    invalidDomains.add(s);
                    continue;
                }

                String scheme = uri.getScheme();
                String host = uri.getHost();

                if (scheme == null || host == null)
                    continue;
                if (!scheme.equalsIgnoreCase("http") && !scheme.equalsIgnoreCase("https"))
                    continue;
                if (!seenHosts.add(host.toLowerCase(Locale.ROOT)))
                    continue;

                validDomains.add(new EdgeDomain(host));
            }
        }

        insertDomains(validDomains, node);

        return Map.of("validDomains", validDomains,
                "invalidDomains", invalidDomains);
    }

    return "";
}
private void insertDomains(List<EdgeDomain> domains, int node) throws SQLException {
// Insert the domains into the database, updating the node affinity if the domain already exists and the affinity is not already set to a node
try (var conn = dataSource.getConnection(); try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("INSERT IGNORE INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) VALUES (?, ?, ?)")) var stmt = conn.prepareStatement("""
INSERT INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY)
VALUES (?, ?, ?)
ON DUPLICATE KEY UPDATE NODE_AFFINITY = IF(NODE_AFFINITY<=0, VALUES(NODE_AFFINITY), NODE_AFFINITY)
"""))
{ {
for (var domain : validDomains) { for (var domain : domains) {
stmt.setString(1, domain.toString()); stmt.setString(1, domain.toString());
stmt.setString(2, domain.getTopDomain()); stmt.setString(2, domain.getTopDomain());
stmt.setInt(3, node); stmt.setInt(3, node);
@ -100,12 +196,8 @@ public class DomainsManagementService {
} }
stmt.executeBatch(); stmt.executeBatch();
} }
}
return Map.of("validDomains", validDomains,
"invalidDomains", invalidDomains);
}
return "";
}
private Object assignDomain(Request request, Response response) throws SQLException { private Object assignDomain(Request request, Response response) throws SQLException {

View File

@ -10,13 +10,28 @@
<h1 class="my-3">Add Domains Report</h1> <h1 class="my-3">Add Domains Report</h1>
<p></p> <p></p>
{{#if error}}
<p class="alert alert-danger">{{error}}</p>
{{/if}}
{{#unless error}}
{{#unless invalidDomains}} {{#unless invalidDomains}}
<p>All domains were added successfully!</p> <p>All domains were added successfully!</p>
{{/unless}} {{/unless}}
{{/unless}}
{{#if invalidDomains}} {{#if invalidDomains}}
<p>Some domains were invalid and could not be added:</p> <p>Some domains were invalid and could not be added:</p>
<textarea class="form-control" rows="10" disabled> <textarea class="form-control" rows="10" disabled>
{{#each invalidDomains}}{{.}}{{/each}} {{#each invalidDomains}}
{{.}}
{{/each}}
</textarea>
{{/if}}
{{#if validDomains}}
<p>If they were not already in the database, these domains were added:</p>
<textarea class="form-control" rows="10" disabled>
{{#each validDomains}}
{{.}}
{{/each}}
</textarea> </textarea>
{{/if}} {{/if}}
<p></p> <p></p>

View File

@ -0,0 +1,48 @@
<!DOCTYPE html>
<html lang="en-US">
<head>
<title>Control Service</title>
{{> control/partials/head-includes }}
</head>
<body>
{{> control/partials/nav}}
<div class="container">
<h1 class="my-3">Add Domains (URL)</h1>
<div class="my-3 p-3 border bg-light">
<p>This utility lets you add domains to be crawled via an external URL.</p>
<a href="/domain/new">It's also possible to add domains directly via a text area</a>
</div>
<form method="post">
<div class="form-group my-3">
<label for="url" class="form-label">Domains to add</label>
<input type="text" class="form-control" name="url"/>
<span class="text-muted">
Enter the URL to the file or page that contains the domains to add. If the URL leads to a text file,
the domains will be parsed from the file, one per line. If it leads to an HTML page, the HTML
will be parsed and all the links will be extracted and added as domains.
</span>
</div>
<div class="form-group my-3">
<label for="node" class="form-label">Node</label>
<select name="node" class="form-select">
<option value="-1">Unassigned</option>
<option value="0" selected>Auto</option>
{{#each global-context.nodes}}
<option value="{{this}}">Node {{id}}</option>
{{/each}}
</select>
<span class="text-muted">
Select the node to assign the domains to, this is the index node that will "own" the domain, crawl its documents
and index them. If you select "Auto", the system will assign the domains to the next node that performs a crawl.
</span>
</div>
<button type="submit" class="btn btn-primary">Add</button>
</form>
</div>
</body>
{{> control/partials/foot-includes }}
</html>

View File

@ -9,6 +9,11 @@
<div class="container"> <div class="container">
<h1 class="my-3">Add Domains</h1> <h1 class="my-3">Add Domains</h1>
<div class="my-3 p-3 border bg-light">
<p>This utility lets you add domains to be crawled via a text area.</p>
<a href="/domain/new-url">It's also possible to add domains via an external URL</a>
</div>
<form method="post"> <form method="post">
<div class="form-group my-3"> <div class="form-group my-3">
<label for="domains" class="form-label">Domains to add</label> <label for="domains" class="form-label">Domains to add</label>