Merge branch 'master' into term-positions

# Conflicts:
#	code/index/java/nu/marginalia/index/results/model/ids/TermIdList.java
#	code/processes/converting-process/java/nu/marginalia/converting/ConverterMain.java
#	code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
#	code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java
#	code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/CrawledDomainReader.java
#	code/processes/crawling-process/test/nu/marginalia/crawling/HttpFetcherTest.java
#	code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java
#	code/services-application/search-service/java/nu/marginalia/search/svc/SearchQueryIndexService.java
Viktor Lofgren 2024-09-08 10:12:53 +02:00
commit 8f367d96f8
26 changed files with 835 additions and 108 deletions

View File

@ -34,7 +34,6 @@ import org.apache.logging.log4j.util.Strings;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
@ -201,7 +200,7 @@ public class ConverterMain extends ProcessMainClass {
try {
return Optional.of(CrawledDomainReader.createDataStream(path));
}
catch (IOException ex) {
catch (Exception ex) {
return Optional.empty();
}
}

View File

@ -151,9 +151,9 @@ public class RedditSideloader implements SideloadSource {
var doc = sideloaderProcessing
.processDocument(fullUrl,
fullHtml,
List.of("encyclopedia", "wiki"),
List.of("reddit"),
domainLinks,
GeneratorType.WIKI,
GeneratorType.FORUM,
DocumentClass.SIDELOAD,
anchorTextKeywords.getAnchorTextKeywords(domainLinks, urls),
pubYear,

View File

@ -9,6 +9,9 @@ import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.retreival.revisit.CrawlerRevisitor;
import nu.marginalia.crawl.retreival.revisit.DocumentWithReference;
import nu.marginalia.crawl.retreival.sitemap.SitemapFetcher;
import nu.marginalia.crawling.body.HttpFetchResult;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.crawling.model.CrawlerDomainStatus;
import nu.marginalia.ip_blocklist.UrlBlocklist;
import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.model.EdgeDomain;
@ -28,6 +31,7 @@ import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.concurrent.TimeUnit;
public class CrawlerRetreiver implements AutoCloseable {
@ -88,17 +92,8 @@ public class CrawlerRetreiver implements AutoCloseable {
}
public int fetch(DomainLinks domainLinks, CrawlDataReference oldCrawlData) {
final DomainProber.ProbeResult probeResult = domainProber.probeDomain(
fetcher,
domain,
new EdgeUrl("http", new EdgeDomain(domain), null, "/", null));
try {
// Sleep a bit to avoid hammering the server with requests, we just probed it
TimeUnit.SECONDS.sleep(1);
// Fetch the domain
return crawlDomain(oldCrawlData, probeResult, domainLinks);
return crawlDomain(oldCrawlData, domainLinks);
}
catch (Exception ex) {
logger.error("Error crawling domain {}", domain, ex);
@ -112,25 +107,33 @@ public class CrawlerRetreiver implements AutoCloseable {
resync.run(warcFile);
}
private int crawlDomain(CrawlDataReference oldCrawlData, DomainProber.ProbeResult probeResult, DomainLinks domainLinks) throws IOException, InterruptedException {
String ip = findIp(domain);
EdgeUrl rootUrl;
private DomainProber.ProbeResult probeRootUrl(String ip) throws IOException {
// Construct a URL to the root of the domain; we don't know the scheme yet, so we'll
// start with http and then try https if that fails
var httpUrl = new EdgeUrl("http", new EdgeDomain(domain), null, "/", null);
final DomainProber.ProbeResult probeResult = domainProber.probeDomain(fetcher, domain, httpUrl);
warcRecorder.writeWarcinfoHeader(ip, new EdgeDomain(domain), probeResult);
if (!(probeResult instanceof DomainProber.ProbeResultOk ok)) {
return 1;
}
else {
rootUrl = ok.probedUrl();
}
return probeResult;
}
private int crawlDomain(CrawlDataReference oldCrawlData, DomainLinks domainLinks) throws IOException, InterruptedException {
String ip = findIp(domain);
EdgeUrl rootUrl;
if (probeRootUrl(ip) instanceof DomainProber.ProbeResultOk ok) rootUrl = ok.probedUrl();
else return 1;
// Sleep after the initial probe, we don't have access to the robots.txt yet
// so we don't know the crawl delay
TimeUnit.SECONDS.sleep(1);
final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(rootUrl.domain, warcRecorder);
final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay());
delayTimer.waitFetchDelay(0); // initial delay after robots.txt
sniffRootDocument(rootUrl, delayTimer);
delayTimer.waitFetchDelay(0); // delay after sniffing
// Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
int recrawled = crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer);
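The revisit machinery itself is not shown in this diff. As a rough, hypothetical sketch of the mechanism the comment above describes, where the class and method names are illustrative and not the project's actual CrawlerRevisitor or ContentTags API, a recrawl request can reuse the previous crawl's validators as conditional headers so unchanged documents come back as 304 responses:
import java.net.URI;
import java.net.http.HttpRequest;
// Hypothetical sketch only; not the actual Marginalia revisit code.
final class ConditionalRecrawlSketch {
    /** Build a GET that lets the server answer 304 if the stored validators still match. */
    static HttpRequest conditionalGet(URI url, String etag, String lastModified) {
        var builder = HttpRequest.newBuilder(url).GET();
        if (etag != null && !etag.isBlank())
            builder.header("If-None-Match", etag);            // compare against the stored ETag
        if (lastModified != null && !lastModified.isBlank())
            builder.header("If-Modified-Since", lastModified); // compare against the stored date
        return builder.build();
    }
}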
@ -188,7 +191,7 @@ public class CrawlerRetreiver implements AutoCloseable {
try {
if (fetchWriteAndSleep(top, delayTimer, DocumentWithReference.empty()).isOk()) {
if (fetchContentWithReference(top, delayTimer, DocumentWithReference.empty()).isOk()) {
fetchedCount++;
}
}
@ -209,21 +212,8 @@ public class CrawlerRetreiver implements AutoCloseable {
var url = rootUrl.withPathAndParam("/", null);
HttpFetchResult result = null;
for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
try {
result = fetcher.fetchContent(url, warcRecorder, ContentTags.empty());
break;
}
catch (RateLimitException ex) {
timer.waitRetryDelay(ex);
}
catch (Exception ex) {
logger.warn("Failed to fetch {}", url, ex);
result = new HttpFetchResult.ResultException(ex);
}
}
HttpFetchResult result = fetchWithRetry(url, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
timer.waitFetchDelay(0);
if (!(result instanceof HttpFetchResult.ResultOk ok))
return;
@ -236,24 +226,40 @@ public class CrawlerRetreiver implements AutoCloseable {
var doc = optDoc.get();
crawlFrontier.setLinkFilter(linkFilterSelector.selectFilter(doc));
EdgeUrl faviconUrl = url.withPathAndParam("/favicon.ico", null);
Optional<EdgeUrl> sitemapUrl = Optional.empty();
for (var link : doc.getElementsByTag("link")) {
String rel = link.attr("rel");
String type = link.attr("type");
if (!rel.equalsIgnoreCase("alternate"))
continue;
if (rel.equals("icon") || rel.equals("shortcut icon")) {
String href = link.attr("href");
if (!(type.equalsIgnoreCase("application/atom+xml")
|| type.equalsIgnoreCase("application/rss+xml")))
continue;
faviconUrl = linkParser.parseLink(url, href)
.filter(crawlFrontier::isSameDomain)
.orElse(faviconUrl);
}
String href = link.attr("href");
// Grab the RSS/Atom as a sitemap if it exists
if (rel.equalsIgnoreCase("alternate")
&& (type.equalsIgnoreCase("application/atom+xml") || type.equalsIgnoreCase("application/atomsvc+xml"))) {
String href = link.attr("href");
linkParser.parseLink(url, href)
.filter(crawlFrontier::isSameDomain)
.map(List::of)
.ifPresent(sitemapFetcher::downloadSitemaps);
sitemapUrl = linkParser.parseLink(url, href)
.filter(crawlFrontier::isSameDomain);
}
}
// Download the sitemap if one exists
if (sitemapUrl.isPresent()) {
sitemapFetcher.downloadSitemaps(List.of(sitemapUrl.get()));
timer.waitFetchDelay(0);
}
// Grab the favicon if it exists
fetchWithRetry(faviconUrl, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
timer.waitFetchDelay(0);
}
catch (Exception ex) {
logger.error("Error configuring link filter", ex);
@ -263,31 +269,16 @@ public class CrawlerRetreiver implements AutoCloseable {
}
}
public HttpFetchResult fetchWriteAndSleep(EdgeUrl top,
CrawlDelayTimer timer,
DocumentWithReference reference) throws InterruptedException
public HttpFetchResult fetchContentWithReference(EdgeUrl top,
CrawlDelayTimer timer,
DocumentWithReference reference) throws InterruptedException
{
logger.debug("Fetching {}", top);
HttpFetchResult fetchedDoc = new HttpFetchResult.ResultNone();
long startTime = System.currentTimeMillis();
var contentTags = reference.getContentTags();
// Fetch the document, retrying if we get a rate limit exception
for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
try {
fetchedDoc = fetcher.fetchContent(top, warcRecorder, contentTags);
break;
}
catch (RateLimitException ex) {
timer.waitRetryDelay(ex);
}
catch (Exception ex) {
logger.warn("Failed to fetch {}", top, ex);
fetchedDoc = new HttpFetchResult.ResultException(ex);
}
}
HttpFetchResult fetchedDoc = fetchWithRetry(top, timer, HttpFetcher.ProbeType.FULL, contentTags);
// Parse the document and enqueue links
try {
@ -329,6 +320,27 @@ public class CrawlerRetreiver implements AutoCloseable {
return fetchedDoc;
}
/** Fetch a document and retry on 429s */
private HttpFetchResult fetchWithRetry(EdgeUrl url,
CrawlDelayTimer timer,
HttpFetcher.ProbeType probeType,
ContentTags contentTags) throws InterruptedException {
for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
try {
return fetcher.fetchContent(url, warcRecorder, contentTags, probeType);
}
catch (RateLimitException ex) {
timer.waitRetryDelay(ex);
}
catch (Exception ex) {
logger.warn("Failed to fetch {}", url, ex);
return new HttpFetchResult.ResultException(ex);
}
}
return new HttpFetchResult.ResultNone();
}
private boolean isAllowedProtocol(String proto) {
return proto.equalsIgnoreCase("http")
|| proto.equalsIgnoreCase("https");

View File

@ -4,6 +4,7 @@ import com.google.inject.ImplementedBy;
import crawlercommons.robots.SimpleRobotRules;
import nu.marginalia.crawl.retreival.RateLimitException;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawling.body.HttpFetchResult;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.HttpFetchResult;
@ -19,9 +20,18 @@ public interface HttpFetcher {
FetchResult probeDomain(EdgeUrl url);
HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, ContentTags tags) throws RateLimitException;
HttpFetchResult fetchContent(EdgeUrl url,
WarcRecorder recorder,
ContentTags tags,
ProbeType probeType) throws RateLimitException;
SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder);
SitemapRetriever createSitemapRetriever();
enum ProbeType {
DISABLED,
FULL,
IF_MODIFIED_SINCE
}
}

View File

@ -12,6 +12,9 @@ import nu.marginalia.crawl.retreival.fetcher.socket.FastTerminatingSocketFactory
import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.crawl.retreival.fetcher.socket.NoSecuritySSL;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawling.body.ContentTypeLogic;
import nu.marginalia.crawling.body.DocumentBodyExtractor;
import nu.marginalia.crawling.body.HttpFetchResult;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.ContentTypeLogic;
@ -145,12 +148,13 @@ public class HttpFetcherImpl implements HttpFetcher {
@SneakyThrows
public HttpFetchResult fetchContent(EdgeUrl url,
WarcRecorder warcRecorder,
ContentTags contentTags)
ContentTags contentTags,
ProbeType probeType)
{
// We don't want to waste time and resources on URLs that are not HTML, so if the file ending
// looks like it might be something else, we perform a HEAD first to check the content type
if (contentTags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url))
if (probeType == ProbeType.FULL && contentTags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url))
{
ContentTypeProbeResult probeResult = contentTypeProber.probeContentType(url);
if (probeResult instanceof ContentTypeProbeResult.Ok ok) {
@ -174,7 +178,9 @@ public class HttpFetcherImpl implements HttpFetcher {
else {
// Possibly do a soft probe to see if the URL has been modified since the last time we crawled it
// if we have reason to suspect ETags are not supported by the server.
if (softIfModifiedSinceProber.probeModificationTime(url, contentTags)) {
if (probeType == ProbeType.IF_MODIFIED_SINCE
&& softIfModifiedSinceProber.probeModificationTime(url, contentTags))
{
return new HttpFetchResult.Result304Raw();
}
}
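The soft probe referenced in the comment above is not included in this diff. As a hedged sketch of the general idea, with names assumed for illustration rather than taken from the actual SoftIfModifiedSinceProber, a soft modification-time probe can send a HEAD request with If-Modified-Since and treat a 304 as "unchanged":
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
// Hypothetical sketch only; not the project's SoftIfModifiedSinceProber.
final class SoftIfModifiedSinceSketch {
    private final HttpClient client = HttpClient.newHttpClient();
    /** Returns true if the server reports the URL as unmodified since lastModified. */
    boolean probeModificationTime(URI url, String lastModified) throws Exception {
        if (lastModified == null || lastModified.isBlank())
            return false; // nothing to compare against; fall back to a full fetch
        var request = HttpRequest.newBuilder(url)
                .header("If-Modified-Since", lastModified)
                .method("HEAD", HttpRequest.BodyPublishers.noBody())
                .build();
        var response = client.send(request, HttpResponse.BodyHandlers.discarding());
        return response.statusCode() == 304;
    }
}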

View File

@ -137,7 +137,7 @@ public class CrawlerRevisitor {
DocumentWithReference reference = new DocumentWithReference(doc, oldCrawlData);
var result = crawlerRetreiver.fetchWriteAndSleep(url, delayTimer, reference);
var result = crawlerRetreiver.fetchContentWithReference(url, delayTimer, reference);
if (reference.isSame(result)) {
retained++;

View File

@ -46,22 +46,35 @@ public class DbCrawlSpecProvider implements CrawlSpecProvider {
blacklist.waitUntilLoaded();
List<Integer> domainIds = new ArrayList<>(10_000);
try (var conn = dataSource.getConnection();
var assignFreeDomains = conn.prepareStatement("UPDATE EC_DOMAIN SET NODE_AFFINITY=? WHERE NODE_AFFINITY=0");
var query = conn.prepareStatement("""
SELECT DOMAIN_NAME, COALESCE(KNOWN_URLS, 0), EC_DOMAIN.ID
FROM EC_DOMAIN
LEFT JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
WHERE NODE_AFFINITY=?
"""))
WHERE NODE_AFFINITY=? OR NODE_AFFINITY=0
""")
)
{
// Assign any domains with node_affinity=0 to this node. We must do this now, before we start crawling
// to avoid race conditions with other crawl runs. We don't want multiple crawlers to crawl the same domain.
assignFreeDomains.setInt(1, processConfiguration.node());
assignFreeDomains.executeUpdate();
// Fetch the domains to be crawled
query.setInt(1, processConfiguration.node());
query.setFetchSize(10_000);
var rs = query.executeQuery();
while (rs.next()) {
// Skip blacklisted domains
if (blacklist.isBlacklisted(rs.getInt(3)))
int id = rs.getInt(3);
if (blacklist.isBlacklisted(id))
continue;
domainIds.add(id);
int urls = rs.getInt(2);
double growthFactor;
@ -83,6 +96,7 @@ public class DbCrawlSpecProvider implements CrawlSpecProvider {
domains.add(record);
}
}
logger.info("Loaded {} domains", domains.size());

View File

@ -1,6 +1,9 @@
package nu.marginalia.io.crawldata;
import nu.marginalia.io.crawldata.format.ParquetSerializableCrawlDataStream;
import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.FileNotFoundException;
import java.io.IOException;
@ -8,16 +11,23 @@ import java.nio.file.Files;
import java.nio.file.Path;
public class CrawledDomainReader {
private static final Logger logger = LoggerFactory.getLogger(CrawledDomainReader.class);
/** Iterator-like access to domain data. This must be closed, otherwise it will leak off-heap memory! */
public static SerializableCrawlDataStream createDataStream(Path fullPath) throws IOException
{
String fileName = fullPath.getFileName().toString();
if (fileName.endsWith(".parquet")) {
return new ParquetSerializableCrawlDataStream(fullPath);
}
else {
throw new IllegalArgumentException("Unknown file type: " + fullPath);
try {
return new ParquetSerializableCrawlDataStream(fullPath);
} catch (Exception ex) {
logger.error("Error reading domain data from " + fullPath, ex);
return SerializableCrawlDataStream.empty();
}
} else {
logger.error("Unknown file type: {}", fullPath);
return SerializableCrawlDataStream.empty();
}
}

View File

@ -12,13 +12,10 @@
# Cloud Yuqu LLC
172.247.0.0/16
107.151.64.0/18
# Google Cloud
# 35.208.0.0/12
# 35.224.0.0/12
# 35.240.0.0/13
# 1Blu
178.254.10.0/23
178.254.10.0/23
# Domain parking spam
199.59.243.0/24

View File

@ -3,12 +3,13 @@ package nu.marginalia.crawling;
import lombok.SneakyThrows;
import nu.marginalia.crawl.retreival.RateLimitException;
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawling.body.ContentTypeLogic;
import nu.marginalia.crawling.body.DocumentBodyExtractor;
import nu.marginalia.crawling.body.DocumentBodyResult;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.ContentTypeLogic;
import nu.marginalia.model.body.DocumentBodyExtractor;
import nu.marginalia.model.body.DocumentBodyResult;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
@ -35,7 +36,7 @@ class HttpFetcherTest {
void fetchUTF8() throws URISyntaxException, RateLimitException, IOException {
var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
try (var recorder = new WarcRecorder()) {
var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, ContentTags.empty());
var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, ContentTags.empty(), HttpFetcher.ProbeType.FULL);
if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) {
System.out.println(bodyOk.contentType());
}
@ -47,7 +48,7 @@ class HttpFetcherTest {
var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
try (var recorder = new WarcRecorder()) {
var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), recorder, ContentTags.empty());
var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), recorder, ContentTags.empty(), HttpFetcher.ProbeType.FULL);
if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) {
System.out.println(bodyOk.contentType());
}

View File

@ -122,7 +122,7 @@ public class CrawlerMockFetcherTest {
@SneakyThrows
@Override
public HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, ContentTags tags) {
public HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, ContentTags tags, ProbeType probeType) {
logger.info("Fetching {}", url);
if (mockData.containsKey(url)) {
byte[] bodyBytes = mockData.get(url).documentBody.getBytes();

View File

@ -261,6 +261,7 @@ class CrawlerRetreiverTest {
.collect(Collectors.toSet());
assertEquals(Set.of("https://www.marginalia.nu/",
"https://www.marginalia.nu/favicon.ico",
"https://www.marginalia.nu/log/06-optimization.gmi/"),
fetchedUrls);

View File

@ -12,6 +12,7 @@ import nu.marginalia.bbpc.BrailleBlockPunchCards;
import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.index.query.limit.QueryLimits;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.search.command.SearchParameters;
import nu.marginalia.search.model.ClusteredUrlDetails;
@ -162,7 +163,7 @@ public class SearchOperator {
return new UrlDetails(
item.documentId(),
item.domainId(),
item.url,
cleanUrl(item.url),
item.title,
item.description,
item.format,
@ -177,6 +178,31 @@ public class SearchOperator {
);
}
/** Replace nuisance domains with replacements where available */
private static EdgeUrl cleanUrl(EdgeUrl url) {
String topdomain = url.domain.topDomain;
String subdomain = url.domain.subDomain;
String path = url.path;
if (topdomain.equals("fandom.com")) {
int wikiIndex = path.indexOf("/wiki/");
if (wikiIndex >= 0) {
return new EdgeUrl("https", new EdgeDomain("breezewiki.com"), null, "/" + subdomain + path.substring(wikiIndex), null);
}
}
else if (topdomain.equals("medium.com")) {
if (!subdomain.isBlank()) {
return new EdgeUrl("https", new EdgeDomain("scribe.rip"), null, path, null);
}
else {
String article = path.substring(path.indexOf("/", 1));
return new EdgeUrl("https", new EdgeDomain("scribe.rip"), null, article, null);
}
}
return url;
}
@SneakyThrows
private List<String> getProblems(String evalResult, List<UrlDetails> queryResults, QueryResponse response) {

View File

@ -54,6 +54,7 @@ dependencies {
implementation libs.handlebars
implementation libs.duckdb
implementation libs.jsoup
implementation libs.trove
implementation dependencies.create(libs.spark.get()) {

View File

@ -2,16 +2,18 @@ package nu.marginalia.control;
import com.google.gson.Gson;
import com.google.inject.Inject;
import nu.marginalia.service.ServiceMonitors;
import nu.marginalia.control.actor.ControlActorService;
import nu.marginalia.control.app.svc.*;
import nu.marginalia.control.node.svc.ControlNodeActionsService;
import nu.marginalia.control.node.svc.ControlFileStorageService;
import nu.marginalia.control.node.svc.ControlNodeActionsService;
import nu.marginalia.control.node.svc.ControlNodeService;
import nu.marginalia.control.sys.svc.*;
import nu.marginalia.model.gson.GsonFactory;
import nu.marginalia.screenshot.ScreenshotService;
import nu.marginalia.service.server.*;
import nu.marginalia.service.ServiceMonitors;
import nu.marginalia.service.server.BaseServiceParams;
import nu.marginalia.service.server.Service;
import nu.marginalia.service.server.StaticResources;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Request;
@ -19,7 +21,7 @@ import spark.Response;
import spark.Spark;
import java.io.IOException;
import java.util.*;
import java.util.Map;
public class ControlService extends Service {
@ -56,6 +58,7 @@ public class ControlService extends Service {
ControlDomainRankingSetsService controlDomainRankingSetsService,
ControlActorService controlActorService,
AbortedProcessService abortedProcessService,
DomainsManagementService domainsManagementService,
ControlErrorHandler errorHandler
) throws IOException {
@ -84,6 +87,7 @@ public class ControlService extends Service {
apiKeyService.register();
domainComplaintService.register();
randomExplorationService.register();
domainsManagementService.register();
errorHandler.register();

View File

@ -0,0 +1,40 @@
package nu.marginalia.control.app.model;
public record DomainModel(int id,
String name,
String ip,
int nodeAffinity,
double rank,
boolean blacklisted) {
public boolean isUnassigned() {
return nodeAffinity < 0;
}
public DomainAffinityState getAffinityState() {
if (nodeAffinity < 0) {
return DomainAffinityState.Known;
}
else if (nodeAffinity == 0) {
return DomainAffinityState.Scheduled;
}
else {
return DomainAffinityState.Assigned;
}
}
public enum DomainAffinityState {
Assigned("The domain has been assigned to a node."),
Scheduled("The domain will be assigned to the next crawling node."),
Known("The domain is known but not yet scheduled for crawling.");
private final String desc;
DomainAffinityState(String desc) {
this.desc = desc;
}
public String getDesc() {
return desc;
}
}
}

View File

@ -0,0 +1,26 @@
package nu.marginalia.control.app.model;
import java.util.List;
import java.util.Map;
public record DomainSearchResultModel(String query,
String affinity,
String field,
Map<String, Boolean> selectedAffinity,
Map<String, Boolean> selectedField,
int page,
boolean hasNext,
boolean hasPrevious,
List<Integer> nodes,
List<DomainModel> results)
{
public Integer getNextPage() {
if (!hasNext) return null;
return page + 1;
}
public Integer getPreviousPage() {
if (!hasPrevious) return null;
return page - 1;
}
}

View File

@ -0,0 +1,310 @@
package nu.marginalia.control.app.svc;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.control.ControlRendererFactory;
import nu.marginalia.control.Redirects;
import nu.marginalia.control.app.model.DomainModel;
import nu.marginalia.control.app.model.DomainSearchResultModel;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.nodecfg.NodeConfigurationService;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import spark.Request;
import spark.Response;
import spark.Spark;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.sql.SQLException;
import java.util.*;
public class DomainsManagementService {
private final HikariDataSource dataSource;
private final NodeConfigurationService nodeConfigurationService;
private final ControlRendererFactory rendererFactory;
@Inject
public DomainsManagementService(HikariDataSource dataSource,
NodeConfigurationService nodeConfigurationService,
ControlRendererFactory rendererFactory
) {
this.dataSource = dataSource;
this.nodeConfigurationService = nodeConfigurationService;
this.rendererFactory = rendererFactory;
}
public void register() throws IOException {
var domainsViewRenderer = rendererFactory.renderer("control/app/domains");
var addDomainsTxtViewRenderer = rendererFactory.renderer("control/app/domains-new");
var addDomainsUrlViewRenderer = rendererFactory.renderer("control/app/domains-new-url");
var addDomainsAfterReportRenderer = rendererFactory.renderer("control/app/domains-new-report");
Spark.get("/domain", this::getDomains, domainsViewRenderer::render);
Spark.get("/domain/new", this::addDomainsTextfield, addDomainsTxtViewRenderer::render);
Spark.post("/domain/new", this::addDomainsTextfield, addDomainsAfterReportRenderer::render);
Spark.get("/domain/new-url", this::addDomainsFromDownload, addDomainsUrlViewRenderer::render);
Spark.post("/domain/new-url", this::addDomainsFromDownload, addDomainsAfterReportRenderer::render);
Spark.post("/domain/:id/assign/:node", this::assignDomain, new Redirects.HtmlRedirect("/domain"));
}
private Object addDomainsTextfield(Request request, Response response) throws SQLException {
if ("GET".equals(request.requestMethod())) {
return "";
}
else if ("POST".equals(request.requestMethod())) {
String nodeStr = request.queryParams("node");
String domainsStr = request.queryParams("domains");
int node = Integer.parseInt(nodeStr);
List<EdgeDomain> validDomains;
List<String> invalidDomains;
Map.Entry<List<EdgeDomain>, List<String>> domainsList = parseDomainsList(domainsStr);
validDomains = domainsList.getKey();
invalidDomains = domainsList.getValue();
insertDomains(validDomains, node);
return Map.of("validDomains", validDomains,
"invalidDomains", invalidDomains);
}
return "";
}
private Map.Entry<List<EdgeDomain>, List<String>> parseDomainsList(String domainsStr) {
List<EdgeDomain> validDomains = new ArrayList<>();
List<String> invalidDomains = new ArrayList<>();
for (String domain : domainsStr.split("\n+")) {
domain = domain.trim();
if (domain.isBlank()) continue;
if (domain.length() > 255) {
invalidDomains.add(domain);
continue;
}
if (domain.startsWith("#")) {
continue;
}
// Run through the URI parser to check for bad domains
try {
if (domain.contains(":")) {
domain = new URI(domain ).toURL().getHost();
}
else {
domain = new URI("https://" + domain + "/").toURL().getHost();
}
} catch (URISyntaxException | MalformedURLException e) {
invalidDomains.add(domain);
continue;
}
validDomains.add(new EdgeDomain(domain));
}
return Map.entry(validDomains, invalidDomains);
}
private Object addDomainsFromDownload(Request request, Response response) throws SQLException, URISyntaxException, IOException, InterruptedException {
if ("GET".equals(request.requestMethod())) {
return "";
}
else if ("POST".equals(request.requestMethod())) {
String nodeStr = request.queryParams("node");
URI domainsUrl = new URI(request.queryParams("url"));
int node = Integer.parseInt(nodeStr);
HttpClient client = HttpClient.newBuilder().build();
var httpReq = HttpRequest.newBuilder(domainsUrl).GET().build();
HttpResponse<String> result = client.send(httpReq, HttpResponse.BodyHandlers.ofString());
if (result.statusCode() != 200) {
return Map.of("error", "Failed to download domains");
}
Optional<String> ct = result.headers().firstValue("Content-Type");
if (ct.isEmpty()) {
return Map.of("error", "No content type");
}
List<EdgeDomain> validDomains = new ArrayList<>();
List<String> invalidDomains = new ArrayList<>();
String contentType = ct.get().toLowerCase();
if (contentType.startsWith("text/plain")) {
var parsedDomains = parseDomainsList(result.body());
validDomains = parsedDomains.getKey();
invalidDomains = parsedDomains.getValue();
}
else {
for (Element e : Jsoup.parse(result.body()).select("a")) {
String s = e.attr("href");
if (s.isBlank()) continue;
if (!s.contains("://")) continue;
URI uri = URI.create(s);
String scheme = uri.getScheme();
String host = uri.getHost();
if (scheme == null || host == null)
continue;
if (!scheme.equalsIgnoreCase("http") && !scheme.equalsIgnoreCase("https"))
continue;
validDomains.add(new EdgeDomain(host));
}
}
insertDomains(validDomains, node);
return Map.of("validDomains", validDomains,
"invalidDomains", invalidDomains);
}
return "";
}
private void insertDomains(List<EdgeDomain> domains, int node) throws SQLException {
// Insert the domains into the database, updating the node affinity if the domain already exists and the affinity is not already set to a node
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("""
INSERT INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY)
VALUES (?, ?, ?)
ON DUPLICATE KEY UPDATE NODE_AFFINITY = IF(NODE_AFFINITY<=0, VALUES(NODE_AFFINITY), NODE_AFFINITY)
"""))
{
for (var domain : domains) {
stmt.setString(1, domain.toString());
stmt.setString(2, domain.getTopDomain());
stmt.setInt(3, node);
stmt.addBatch();
}
stmt.executeBatch();
}
}
private Object assignDomain(Request request, Response response) throws SQLException {
String idStr = request.params(":id");
String nodeStr = request.params(":node");
int id = Integer.parseInt(idStr);
int node = Integer.parseInt(nodeStr);
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET NODE_AFFINITY = ? WHERE ID = ?"))
{
stmt.setInt(1, node);
stmt.setInt(2, id);
stmt.executeUpdate();
}
return "";
}
private DomainSearchResultModel getDomains(Request request, Response response) throws SQLException {
List<DomainModel> ret = new ArrayList<>();
String filterRaw = Objects.requireNonNullElse(request.queryParams("filter"), "*");
String filter;
if (filterRaw.isBlank()) filter = "%";
else filter = filterRaw.replace('*', '%');
int page = Integer.parseInt(Objects.requireNonNullElse(request.queryParams("page"), "0"));
boolean hasMore = false;
int count = 10;
String field = Objects.requireNonNullElse(request.queryParams("field"), "domain");
Map<String, Boolean> selectedField = Map.of(field, true);
String affinity = Objects.requireNonNullElse(request.queryParams("affinity"), "all");
Map<String, Boolean> selectedAffinity = Map.of(affinity, true);
StringJoiner queryJoiner = new StringJoiner(" ");
queryJoiner.add("""
SELECT EC_DOMAIN.ID,
DOMAIN_NAME,
NODE_AFFINITY,
`RANK`,
IP,
EC_DOMAIN_BLACKLIST.URL_DOMAIN IS NOT NULL AS BLACKLISTED
FROM WMSA_prod.EC_DOMAIN
LEFT JOIN WMSA_prod.EC_DOMAIN_BLACKLIST ON DOMAIN_NAME = EC_DOMAIN_BLACKLIST.URL_DOMAIN
""")
.add((switch (field) {
case "domain" -> "WHERE DOMAIN_NAME LIKE ?";
case "ip" -> "WHERE IP LIKE ?";
case "id" -> "WHERE EC_DOMAIN.ID = ?";
default -> "WHERE DOMAIN_NAME LIKE ?";
}))
.add((switch (affinity) {
case "assigned" -> "AND NODE_AFFINITY > 0";
case "scheduled" -> "AND NODE_AFFINITY = 0";
case "unassigned" -> "AND NODE_AFFINITY < 0";
default -> "";
}))
.add("LIMIT ?")
.add("OFFSET ?");
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement(queryJoiner.toString()))
{
stmt.setString(1, filter);
stmt.setInt(2, count + 1);
stmt.setInt(3, count * page);
try (var rs = stmt.executeQuery()) {
while (rs.next()) {
if (ret.size() == count) {
hasMore = true;
break;
}
ret.add(new DomainModel(
rs.getInt("ID"),
rs.getString("DOMAIN_NAME"),
rs.getString("IP"),
rs.getInt("NODE_AFFINITY"),
Math.round(100 * rs.getDouble("RANK"))/100.,
rs.getBoolean("BLACKLISTED")
));
}
}
}
List<Integer> nodes = new ArrayList<>();
for (var node : nodeConfigurationService.getAll()) {
nodes.add(node.node());
}
return new DomainSearchResultModel(filterRaw,
affinity,
field,
selectedAffinity,
selectedField,
page,
hasMore,
page > 0,
nodes,
ret);
}
}

View File

@ -0,0 +1,41 @@
<!DOCTYPE html>
<html lang="en-US">
<head>
<title>Control Service</title>
{{> control/partials/head-includes }}
</head>
<body>
{{> control/partials/nav}}
<div class="container">
<h1 class="my-3">Add Domains Report</h1>
<p></p>
{{#if error}}
<p class="alert alert-danger">{{error}}</p>
{{/if}}
{{#unless error}}
{{#unless invalidDomains}}
<p>All domains were added successfully!</p>
{{/unless}}
{{/unless}}
{{#if invalidDomains}}
<p>Some domains were invalid and could not be added:</p>
<textarea class="form-control" rows="10" disabled>
{{#each invalidDomains}}
{{.}}
{{/each}}
</textarea>
{{/if}}
{{#if validDomains}}
<p>If they were not already in the database, these domains were added:</p>
<textarea class="form-control" rows="10" disabled>
{{#each validDomains}}
{{.}}
{{/each}}
</textarea>
{{/if}}
<p></p>
</div>
</body>
{{> control/partials/foot-includes }}
</html>

View File

@ -0,0 +1,48 @@
<!DOCTYPE html>
<html lang="en-US">
<head>
<title>Control Service</title>
{{> control/partials/head-includes }}
</head>
<body>
{{> control/partials/nav}}
<div class="container">
<h1 class="my-3">Add Domains (URL)</h1>
<div class="my-3 p-3 border bg-light">
<p>This utility lets you add domains to be crawled via an external URL.</p>
<a href="/domain/new">It's also possible to add domains directly via a text area</a>
</div>
<form method="post">
<div class="form-group my-3">
<label for="url" class="form-label">Domains to add</label>
<input type="text" class="form-control" name="url"/>
<span class="text-muted">
Enter the URL to the file or page that contains the domains to add. If the URL leads to a text file,
the domains will be parsed from the file, one per line. If it leads to an HTML page, the HTML
will be parsed and all the links will be extracted and added as domains.
</span>
</div>
<div class="form-group my-3">
<label for="node" class="form-label">Node</label>
<select name="node" class="form-select">
<option value="-1">Unassigned</option>
<option value="0" selected>Auto</option>
{{#each global-context.nodes}}
<option value="{{id}}">Node {{id}}</option>
{{/each}}
</select>
<span class="text-muted">
Select the node to assign the domains to; this is the index node that will "own" the domain, crawl its documents,
and index them. If you select "Auto", the system will assign the domains to the next node that performs a crawl.
</span>
</div>
<button type="submit" class="btn btn-primary">Add</button>
</form>
</div>
</body>
{{> control/partials/foot-includes }}
</html>

View File

@ -0,0 +1,47 @@
<!DOCTYPE html>
<html lang="en-US">
<head>
<title>Control Service</title>
{{> control/partials/head-includes }}
</head>
<body>
{{> control/partials/nav}}
<div class="container">
<h1 class="my-3">Add Domains</h1>
<div class="my-3 p-3 border bg-light">
<p>This utility lets you add domains to be crawled via a text area.</p>
<a href="/domain/new-url">It's also possible to add domains via an external URL</a>
</div>
<form method="post">
<div class="form-group my-3">
<label for="domains" class="form-label">Domains to add</label>
<textarea name="domains" class="form-control" rows="10"></textarea>
<span class="text-muted">
Enter a list of domains to add, one per line. The system will check if the domain is already in the database and
will not add duplicates. Spaces and empty lines are ignored.
</span>
</div>
<div class="form-group my-3">
<label for="node" class="form-label">Node</label>
<select name="node" class="form-select">
<option value="-1">Unassigned</option>
<option value="0" selected>Auto</option>
{{#each global-context.nodes}}
<option value="{{id}}">Node {{id}}</option>
{{/each}}
</select>
<span class="text-muted">
Select the node to assign the domains to; this is the index node that will "own" the domain, crawl its documents,
and index them. If you select "Auto", the system will assign the domains to the next node that performs a crawl.
</span>
</div>
<button type="submit" class="btn btn-primary">Add</button>
</form>
</div>
</body>
{{> control/partials/foot-includes }}
</html>

View File

@ -0,0 +1,109 @@
<!DOCTYPE html>
<html lang="en-US">
<head>
<title>Control Service</title>
{{> control/partials/head-includes }}
</head>
<body>
{{> control/partials/nav}}
<div class="container">
<h1 class="my-3">Domains</h1>
<table class="table">
<form method="get">
<tr>
<td>
<select name="field" class="form-select" aria-label="Select Field">
<option value="domain" {{#if selectedField.domain}}selected{{/if}}>Domain Name</option>
<option value="id" {{#if selectedField.id}}selected{{/if}}>Domain ID</option>
<option value="ip" {{#if selectedField.ip}}selected{{/if}}>IP</option>
</select>
</td>
<td colspan="3"><input type="text" name="filter" class="form-control" placeholder="Domain" value="{{query}}"></td>
<td>
<select name="affinity" class="form-select" aria-label="Select Node Affinity">
<option value="all" {{#if selectedAffinity.all}}selected{{/if}}>-</option>
<option value="unassigned" {{#if selectedAffinity.unassigned}}selected{{/if}}>Unassigned</option>
<option value="scheduled" {{#if selectedAffinity.scheduled}}selected{{/if}}>Scheduled</option>
<option value="assigned" {{#if selectedAffinity.assigned}}selected{{/if}}>Assigned</option>
</select>
</td>
<td><button type="submit" class="btn btn-primary">Search</button></td>
</tr>
</form>
<tr>
<th>Domain</th>
<th>ID</th>
<th title="Which, if any, index node owns a domain and will crawl and index it">Node Affinity</th>
<th>Rank</th>
<th>IP</th>
<th>Blacklisted</th>
</tr>
{{#each results}}
<tr>
<td>{{name}}</td>
<td>{{id}}</td>
<td title="{{affinityState.desc}}">{{#unless unassigned}}{{affinityState}} {{#if nodeAffinity}}{{nodeAffinity}}{{/if}} {{/unless}}
{{#if unassigned}}
<div class="dropdown">
<button title="Assign to a node" class="btn btn-secondary dropdown-toggle" type="button" id="dropdownMenuButton1" data-bs-toggle="dropdown" aria-expanded="false">
Unassigned
</button>
<ul class="dropdown-menu" aria-labelledby="dropdownMenuButton1">
<form method="post">
<input type="hidden" name="node" value="0">
<li>
<button
class="dropdown-item"
title="Assign to the next node that performs a crawl"
formaction="/domain/{{id}}/assign/0"
type="submit">
Any
</button>
</li>
{{#each nodes}}
<input type="hidden" name="node" value="{{.}}">
<li>
<button
class="dropdown-item"
title="Assign to node {{.}}"
formaction="/domain/{{id}}/assign/{{.}}"
type="submit">
Node {{.}}
</button>
</li>
{{/each}}
</form>
</ul>
</div>
{{/if}}
</td>
<td>{{rank}}</td>
<td>{{ip}}</td>
<td>{{#if blacklisted}}&check;{{/if}}</td>
</tr>
{{/each}}
{{#unless results}}
<tr>
<td colspan="5">No results found</td>
</tr>
{{/unless}}
<tr>
<td>
{{#if hasPrevious}}
<a href="?page={{previousPage}}&filter={{query}}&field={{field}}&affinity={{affinity}}">Previous</a>
{{/if}}
</td>
<td colspan="4"></td>
<td>
{{#if hasNext}}
<a href="?page={{nextPage}}&filter={{query}}&field={{field}}&affinity={{affinity}}">Next</a>
{{/if}}
</td>
</tr>
</table>
</div>
</body>
{{> control/partials/foot-includes }}
</html>

View File

@ -1,5 +1,4 @@
<script src="https://cdn.jsdelivr.net/npm/@popperjs/core@2.11.8/dist/umd/popper.min.js" integrity="sha384-I7E8VVD/ismYTF4hNIPjVp/Zjvgyol6VFvRkX/vR+Vc4jQkC+hVqc2pM8ODewa9r" crossorigin="anonymous"></script>
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/js/bootstrap.min.js" integrity="sha384-BBtl+eGJRgqQAUMxJ7pMwbEyER4l1g+O15P+16Ep7Q9Q+zqX6gSbd85u4mG4QzX+" crossorigin="anonymous"></script>
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.0.2/dist/js/bootstrap.bundle.min.js" integrity="sha384-MrcW6ZMFYlzcLA8Nl+NtUVF0sA7MsXsP1UyJoMp4YLEuNSfAP+JcXn/tWtIaxVXM" crossorigin="anonymous"></script>
<script src="https://ajax.googleapis.com/ajax/libs/jquery/2.1.1/jquery.min.js"></script>
<script src="/refresh.js"></script>
<script type="javascript">

View File

@ -16,13 +16,21 @@
<a href="#" class="nav-link dropdown-toggle" data-bs-toggle="dropdown" role="button" aria-expanded="false">Application</a>
<ul class="dropdown-menu">
<li><a class="dropdown-item" href="/api-keys" title="Create or remove API keys">API Keys</a></li>
<li><a class="dropdown-item" href="/blacklist" title="Add or remove website sanctions">Blacklist</a></li>
<li><a class="dropdown-item" href="/search-to-ban" title="Search function for easy blacklisting">Blacklist Search</a></li>
<li><a class="dropdown-item" href="/complaints" title="View and act on user complaints">Complaints</a></li>
<li><a class="dropdown-item" href="/review-random-domains" title="Review random domains list">Random Exploration</a></li>
</ul>
</li>
{{/unless}}
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" data-bs-toggle="dropdown" role="button" aria-expanded="false">Domains</a>
<ul class="dropdown-menu">
<li><a class="dropdown-item" href="/domain/new" title="Add New Domains">Add Domains</a></li>
<li><a class="dropdown-item" href="/domain" title="List Domains">Manage Domains</a></li>
<li><hr class="dropdown-divider"></li>
<li><a class="dropdown-item" href="/blacklist" title="Add or remove website sanctions">Blacklist</a></li>
<li><a class="dropdown-item" href="/search-to-ban" title="Search function for easy blacklisting">Blacklist Search</a></li>
</ul>
</li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" data-bs-toggle="dropdown" role="button" aria-expanded="false">Index Nodes</a>
<ul class="dropdown-menu">

View File

@ -16,6 +16,7 @@ import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.time.Duration;
import java.util.ArrayList;
@ -34,7 +35,7 @@ public class ScreenshotCaptureToolMain {
System.setProperty(ChromeDriverService.CHROME_DRIVER_SILENT_OUTPUT_PROPERTY, "true");
List<EdgeDomain> crawlQueue = fetchCrawlQueue(ds, 1000);
List<EdgeDomain> crawlQueue = fetchCrawlQueue(ds, 10_000);
HttpClient httpClient = HttpClient.newBuilder()
.version(HttpClient.Version.HTTP_1_1)
@ -137,16 +138,33 @@ public class ScreenshotCaptureToolMain {
List<EdgeDomain> ret = new ArrayList<>(queueSize);
try (var conn = ds.getConnection(); var stmt = conn.createStatement()) {
var rsp = stmt.executeQuery(
int newCount = queueSize / 4;
int oldCount = queueSize - newCount;
ResultSet rst = stmt.executeQuery(
"""
SELECT EC_DOMAIN.DOMAIN_NAME FROM EC_DOMAIN
LEFT JOIN DATA_DOMAIN_HISTORY ON EC_DOMAIN.DOMAIN_NAME=DATA_DOMAIN_HISTORY.DOMAIN_NAME
ORDER BY SCREENSHOT_DATE IS NULL DESC, SCREENSHOT_DATE, INDEXED DESC
LIMIT
""" + queueSize);
while (rsp.next()) {
ret.add(new EdgeDomain(rsp.getString(1)));
""" + newCount);
while (rst.next()) {
ret.add(new EdgeDomain(rst.getString(1)));
}
rst = stmt.executeQuery("""
SELECT DATA_DOMAIN_HISTORY.DOMAIN_NAME FROM DATA_DOMAIN_HISTORY
INNER JOIN DATA_DOMAIN_SCREENSHOT ON DATA_DOMAIN_SCREENSHOT.DOMAIN_NAME = DATA_DOMAIN_HISTORY.DOMAIN_NAME
WHERE SCREENSHOT_DATE IS NOT NULL
ORDER BY SCREENSHOT_DATE ASC
LIMIT
""" + oldCount);
while (rst.next()) {
ret.add(new EdgeDomain(rst.getString(1)));
}
}
catch (Exception ex) {
logger.warn("Exception in fetching queue", ex);