Merge branch 'master' into term-positions

# Conflicts:
#	code/index/java/nu/marginalia/index/results/model/ids/TermIdList.java
#	code/processes/converting-process/java/nu/marginalia/converting/ConverterMain.java
#	code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
#	code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java
#	code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/CrawledDomainReader.java
#	code/processes/crawling-process/test/nu/marginalia/crawling/HttpFetcherTest.java
#	code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java
#	code/services-application/search-service/java/nu/marginalia/search/svc/SearchQueryIndexService.java
Viktor Lofgren 2024-09-08 10:12:53 +02:00
commit 8f367d96f8
26 changed files with 835 additions and 108 deletions

View File

@@ -34,7 +34,6 @@ import org.apache.logging.log4j.util.Strings;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.sql.SQLException;
@@ -201,7 +200,7 @@ public class ConverterMain extends ProcessMainClass {
         try {
             return Optional.of(CrawledDomainReader.createDataStream(path));
         }
-        catch (IOException ex) {
+        catch (Exception ex) {
             return Optional.empty();
         }
     }

View File

@@ -151,9 +151,9 @@ public class RedditSideloader implements SideloadSource {
         var doc = sideloaderProcessing
                 .processDocument(fullUrl,
                         fullHtml,
-                        List.of("encyclopedia", "wiki"),
+                        List.of("reddit"),
                         domainLinks,
-                        GeneratorType.WIKI,
+                        GeneratorType.FORUM,
                         DocumentClass.SIDELOAD,
                         anchorTextKeywords.getAnchorTextKeywords(domainLinks, urls),
                         pubYear,

View File

@@ -9,6 +9,9 @@ import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.retreival.revisit.CrawlerRevisitor;
 import nu.marginalia.crawl.retreival.revisit.DocumentWithReference;
 import nu.marginalia.crawl.retreival.sitemap.SitemapFetcher;
+import nu.marginalia.crawling.body.HttpFetchResult;
+import nu.marginalia.crawling.model.CrawledDomain;
+import nu.marginalia.crawling.model.CrawlerDomainStatus;
 import nu.marginalia.ip_blocklist.UrlBlocklist;
 import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.model.EdgeDomain;
@@ -28,6 +31,7 @@ import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Objects;
+import java.util.Optional;
 import java.util.concurrent.TimeUnit;
 
 public class CrawlerRetreiver implements AutoCloseable {
@@ -88,17 +92,8 @@ public class CrawlerRetreiver implements AutoCloseable {
     }
 
     public int fetch(DomainLinks domainLinks, CrawlDataReference oldCrawlData) {
-        final DomainProber.ProbeResult probeResult = domainProber.probeDomain(
-                fetcher,
-                domain,
-                new EdgeUrl("http", new EdgeDomain(domain), null, "/", null));
-
         try {
-            // Sleep a bit to avoid hammering the server with requests, we just probed it
-            TimeUnit.SECONDS.sleep(1);
-
-            // Fetch the domain
-            return crawlDomain(oldCrawlData, probeResult, domainLinks);
+            return crawlDomain(oldCrawlData, domainLinks);
         }
         catch (Exception ex) {
             logger.error("Error crawling domain {}", domain, ex);
@@ -112,25 +107,33 @@ public class CrawlerRetreiver implements AutoCloseable {
         resync.run(warcFile);
     }
 
-    private int crawlDomain(CrawlDataReference oldCrawlData, DomainProber.ProbeResult probeResult, DomainLinks domainLinks) throws IOException, InterruptedException {
-        String ip = findIp(domain);
-        EdgeUrl rootUrl;
+    private DomainProber.ProbeResult probeRootUrl(String ip) throws IOException {
+        // Construct an URL to the root of the domain, we don't know the schema yet so we'll
+        // start with http and then try https if that fails
+        var httpUrl = new EdgeUrl("http", new EdgeDomain(domain), null, "/", null);
+        final DomainProber.ProbeResult probeResult = domainProber.probeDomain(fetcher, domain, httpUrl);
 
         warcRecorder.writeWarcinfoHeader(ip, new EdgeDomain(domain), probeResult);
 
-        if (!(probeResult instanceof DomainProber.ProbeResultOk ok)) {
-            return 1;
-        }
-        else {
-            rootUrl = ok.probedUrl();
-        }
+        return probeResult;
+    }
+
+    private int crawlDomain(CrawlDataReference oldCrawlData, DomainLinks domainLinks) throws IOException, InterruptedException {
+        String ip = findIp(domain);
+        EdgeUrl rootUrl;
+
+        if (probeRootUrl(ip) instanceof DomainProber.ProbeResultOk ok) rootUrl = ok.probedUrl();
+        else return 1;
+
+        // Sleep after the initial probe, we don't have access to the robots.txt yet
+        // so we don't know the crawl delay
+        TimeUnit.SECONDS.sleep(1);
 
         final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(rootUrl.domain, warcRecorder);
         final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay());
 
         delayTimer.waitFetchDelay(0); // initial delay after robots.txt
         sniffRootDocument(rootUrl, delayTimer);
-        delayTimer.waitFetchDelay(0); // delay after sniffing
 
         // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
         int recrawled = crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer);
@@ -188,7 +191,7 @@ public class CrawlerRetreiver implements AutoCloseable {
             try {
-                if (fetchWriteAndSleep(top, delayTimer, DocumentWithReference.empty()).isOk()) {
+                if (fetchContentWithReference(top, delayTimer, DocumentWithReference.empty()).isOk()) {
                     fetchedCount++;
                 }
             }
@@ -209,21 +212,8 @@ public class CrawlerRetreiver implements AutoCloseable {
         var url = rootUrl.withPathAndParam("/", null);
 
-        HttpFetchResult result = null;
-
-        for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
-            try {
-                result = fetcher.fetchContent(url, warcRecorder, ContentTags.empty());
-                break;
-            }
-            catch (RateLimitException ex) {
-                timer.waitRetryDelay(ex);
-            }
-            catch (Exception ex) {
-                logger.warn("Failed to fetch {}", url, ex);
-                result = new HttpFetchResult.ResultException(ex);
-            }
-        }
+        HttpFetchResult result = fetchWithRetry(url, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
+        timer.waitFetchDelay(0);
 
         if (!(result instanceof HttpFetchResult.ResultOk ok))
             return;
@@ -236,24 +226,40 @@ public class CrawlerRetreiver implements AutoCloseable {
             var doc = optDoc.get();
             crawlFrontier.setLinkFilter(linkFilterSelector.selectFilter(doc));
 
+            EdgeUrl faviconUrl = url.withPathAndParam("/favicon.ico", null);
+            Optional<EdgeUrl> sitemapUrl = Optional.empty();
+
             for (var link : doc.getElementsByTag("link")) {
                 String rel = link.attr("rel");
                 String type = link.attr("type");
 
-                if (!rel.equalsIgnoreCase("alternate"))
-                    continue;
-
-                if (!(type.equalsIgnoreCase("application/atom+xml")
-                        || type.equalsIgnoreCase("application/rss+xml")))
-                    continue;
-
-                String href = link.attr("href");
-
-                linkParser.parseLink(url, href)
-                        .filter(crawlFrontier::isSameDomain)
-                        .map(List::of)
-                        .ifPresent(sitemapFetcher::downloadSitemaps);
+                if (rel.equals("icon") || rel.equals("shortcut icon")) {
+                    String href = link.attr("href");
+
+                    faviconUrl = linkParser.parseLink(url, href)
+                            .filter(crawlFrontier::isSameDomain)
+                            .orElse(faviconUrl);
+                }
+
+                // Grab the RSS/Atom as a sitemap if it exists
+                if (rel.equalsIgnoreCase("alternate")
+                        && (type.equalsIgnoreCase("application/atom+xml") || type.equalsIgnoreCase("application/atomsvc+xml"))) {
+                    String href = link.attr("href");
+
+                    sitemapUrl = linkParser.parseLink(url, href)
+                            .filter(crawlFrontier::isSameDomain);
+                }
             }
+
+            // Download the sitemap if available exists
+            if (sitemapUrl.isPresent()) {
+                sitemapFetcher.downloadSitemaps(List.of(sitemapUrl.get()));
+                timer.waitFetchDelay(0);
+            }
+
+            // Grab the favicon if it exists
+            fetchWithRetry(faviconUrl, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
+            timer.waitFetchDelay(0);
         }
         catch (Exception ex) {
             logger.error("Error configuring link filter", ex);
@@ -263,31 +269,16 @@ public class CrawlerRetreiver implements AutoCloseable {
         }
     }
 
-    public HttpFetchResult fetchWriteAndSleep(EdgeUrl top,
+    public HttpFetchResult fetchContentWithReference(EdgeUrl top,
                                               CrawlDelayTimer timer,
                                               DocumentWithReference reference) throws InterruptedException
     {
         logger.debug("Fetching {}", top);
 
-        HttpFetchResult fetchedDoc = new HttpFetchResult.ResultNone();
         long startTime = System.currentTimeMillis();
         var contentTags = reference.getContentTags();
 
-        // Fetch the document, retrying if we get a rate limit exception
-        for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
-            try {
-                fetchedDoc = fetcher.fetchContent(top, warcRecorder, contentTags);
-                break;
-            }
-            catch (RateLimitException ex) {
-                timer.waitRetryDelay(ex);
-            }
-            catch (Exception ex) {
-                logger.warn("Failed to fetch {}", top, ex);
-                fetchedDoc = new HttpFetchResult.ResultException(ex);
-            }
-        }
+        HttpFetchResult fetchedDoc = fetchWithRetry(top, timer, HttpFetcher.ProbeType.FULL, contentTags);
 
         // Parse the document and enqueue links
         try {
@@ -329,6 +320,27 @@ public class CrawlerRetreiver implements AutoCloseable {
         return fetchedDoc;
     }
 
+    /** Fetch a document and retry on 429s */
+    private HttpFetchResult fetchWithRetry(EdgeUrl url,
+                                           CrawlDelayTimer timer,
+                                           HttpFetcher.ProbeType probeType,
+                                           ContentTags contentTags) throws InterruptedException {
+        for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
+            try {
+                return fetcher.fetchContent(url, warcRecorder, contentTags, probeType);
+            }
+            catch (RateLimitException ex) {
+                timer.waitRetryDelay(ex);
+            }
+            catch (Exception ex) {
+                logger.warn("Failed to fetch {}", url, ex);
+                return new HttpFetchResult.ResultException(ex);
+            }
+        }
+
+        return new HttpFetchResult.ResultNone();
+    }
+
     private boolean isAllowedProtocol(String proto) {
         return proto.equalsIgnoreCase("http")
                 || proto.equalsIgnoreCase("https");

View File

@@ -4,6 +4,7 @@ import com.google.inject.ImplementedBy;
 import crawlercommons.robots.SimpleRobotRules;
 import nu.marginalia.crawl.retreival.RateLimitException;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
+import nu.marginalia.crawling.body.HttpFetchResult;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.HttpFetchResult;
@@ -19,9 +20,18 @@ public interface HttpFetcher {
     FetchResult probeDomain(EdgeUrl url);
 
-    HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, ContentTags tags) throws RateLimitException;
+    HttpFetchResult fetchContent(EdgeUrl url,
+                                 WarcRecorder recorder,
+                                 ContentTags tags,
+                                 ProbeType probeType) throws RateLimitException;
 
     SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder);
 
     SitemapRetriever createSitemapRetriever();
+
+    enum ProbeType {
+        DISABLED,
+        FULL,
+        IF_MODIFIED_SINCE
+    }
 }
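
A minimal sketch of a call site using the widened interface above (hypothetical variables fetcher, recorder and url; not code from this commit):

    // ProbeType.FULL              - normal fetch; may HEAD-probe binary-looking URLs first
    // ProbeType.IF_MODIFIED_SINCE - allows a soft modification-time probe when revisiting
    // ProbeType.DISABLED          - plain fetch with no probing (used e.g. for favicons and root sniffing)
    HttpFetchResult result = fetcher.fetchContent(url, recorder, ContentTags.empty(), HttpFetcher.ProbeType.FULL);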

View File

@@ -12,6 +12,9 @@ import nu.marginalia.crawl.retreival.fetcher.socket.FastTerminatingSocketFactory
 import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
 import nu.marginalia.crawl.retreival.fetcher.socket.NoSecuritySSL;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
+import nu.marginalia.crawling.body.ContentTypeLogic;
+import nu.marginalia.crawling.body.DocumentBodyExtractor;
+import nu.marginalia.crawling.body.HttpFetchResult;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.ContentTypeLogic;
@@ -145,12 +148,13 @@ public class HttpFetcherImpl implements HttpFetcher {
     @SneakyThrows
     public HttpFetchResult fetchContent(EdgeUrl url,
                                         WarcRecorder warcRecorder,
-                                        ContentTags contentTags)
+                                        ContentTags contentTags,
+                                        ProbeType probeType)
     {
         // We don't want to waste time and resources on URLs that are not HTML, so if the file ending
         // looks like it might be something else, we perform a HEAD first to check the content type
-        if (contentTags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url))
+        if (probeType == ProbeType.FULL && contentTags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url))
         {
             ContentTypeProbeResult probeResult = contentTypeProber.probeContentType(url);
             if (probeResult instanceof ContentTypeProbeResult.Ok ok) {
@@ -174,7 +178,9 @@ public class HttpFetcherImpl implements HttpFetcher {
         else {
             // Possibly do a soft probe to see if the URL has been modified since the last time we crawled it
             // if we have reason to suspect ETags are not supported by the server.
-            if (softIfModifiedSinceProber.probeModificationTime(url, contentTags)) {
+            if (probeType == ProbeType.IF_MODIFIED_SINCE
+                && softIfModifiedSinceProber.probeModificationTime(url, contentTags))
+            {
                 return new HttpFetchResult.Result304Raw();
             }
         }

View File

@@ -137,7 +137,7 @@ public class CrawlerRevisitor {
                 DocumentWithReference reference = new DocumentWithReference(doc, oldCrawlData);
-                var result = crawlerRetreiver.fetchWriteAndSleep(url, delayTimer, reference);
+                var result = crawlerRetreiver.fetchContentWithReference(url, delayTimer, reference);
 
                 if (reference.isSame(result)) {
                     retained++;

View File

@@ -46,22 +46,35 @@ public class DbCrawlSpecProvider implements CrawlSpecProvider {
         blacklist.waitUntilLoaded();
 
+        List<Integer> domainIds = new ArrayList<>(10_000);
+
         try (var conn = dataSource.getConnection();
+             var assignFreeDomains = conn.prepareStatement("UPDATE EC_DOMAIN SET NODE_AFFINITY=? WHERE NODE_AFFINITY=0");
             var query = conn.prepareStatement("""
                     SELECT DOMAIN_NAME, COALESCE(KNOWN_URLS, 0), EC_DOMAIN.ID
                     FROM EC_DOMAIN
                     LEFT JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
-                    WHERE NODE_AFFINITY=?
-                    """))
+                    WHERE NODE_AFFINITY=? OR NODE_AFFINITY=0
+                    """)
+        )
         {
+            // Assign any domains with node_affinity=0 to this node. We must do this now, before we start crawling
+            // to avoid race conditions with other crawl runs. We don't want multiple crawlers to crawl the same domain.
+            assignFreeDomains.setInt(1, processConfiguration.node());
+            assignFreeDomains.executeUpdate();
+
+            // Fetch the domains to be crawled
             query.setInt(1, processConfiguration.node());
             query.setFetchSize(10_000);
 
             var rs = query.executeQuery();
 
             while (rs.next()) {
                 // Skip blacklisted domains
-                if (blacklist.isBlacklisted(rs.getInt(3)))
+                int id = rs.getInt(3);
+                if (blacklist.isBlacklisted(id))
                     continue;
+                domainIds.add(id);
 
                 int urls = rs.getInt(2);
                 double growthFactor;
@@ -83,6 +96,7 @@ public class DbCrawlSpecProvider implements CrawlSpecProvider {
                 domains.add(record);
             }
         }
 
         logger.info("Loaded {} domains", domains.size());

View File

@@ -1,6 +1,9 @@
 package nu.marginalia.io.crawldata;
 
-import nu.marginalia.io.crawldata.format.ParquetSerializableCrawlDataStream;
+import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.io.FileNotFoundException;
 import java.io.IOException;
@@ -8,16 +11,23 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 
 public class CrawledDomainReader {
+    private static final Logger logger = LoggerFactory.getLogger(CrawledDomainReader.class);
 
     /** An iterator-like access to domain data. This must be closed otherwise it will leak off-heap memory! */
     public static SerializableCrawlDataStream createDataStream(Path fullPath) throws IOException
     {
         String fileName = fullPath.getFileName().toString();
 
         if (fileName.endsWith(".parquet")) {
+            try {
                 return new ParquetSerializableCrawlDataStream(fullPath);
-        }
-        else {
-            throw new IllegalArgumentException("Unknown file type: " + fullPath);
+            } catch (Exception ex) {
+                logger.error("Error reading domain data from " + fullPath, ex);
+                return SerializableCrawlDataStream.empty();
+            }
+        } else {
+            logger.error("Unknown file type: {}", fullPath);
+            return SerializableCrawlDataStream.empty();
         }
     }
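
Since createDataStream now logs and returns an empty stream instead of throwing on bad data, a caller mainly has to remember to close the stream. A usage sketch (assuming SerializableCrawlDataStream keeps the AutoCloseable, iterator-style contract the comment above hints at):

    // Hypothetical consumer; process(...) stands in for whatever handles each record
    try (SerializableCrawlDataStream stream = CrawledDomainReader.createDataStream(path)) {
        while (stream.hasNext()) {
            process(stream.next());
        }
    }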

View File

@@ -12,13 +12,10 @@
 # Cloud Yuqu LLC
 172.247.0.0/16
 107.151.64.0/18
 
-# Google Cloud
-# 35.208.0.0/12
-# 35.224.0.0/12
-# 35.240.0.0/13
-
 # 1Blu
 178.254.10.0/23
+
+# Domain parking spam
+199.59.243.0/24

View File

@@ -3,12 +3,13 @@ package nu.marginalia.crawling;
 import lombok.SneakyThrows;
 import nu.marginalia.crawl.retreival.RateLimitException;
 import nu.marginalia.crawl.retreival.fetcher.ContentTags;
+import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
+import nu.marginalia.crawling.body.ContentTypeLogic;
+import nu.marginalia.crawling.body.DocumentBodyExtractor;
+import nu.marginalia.crawling.body.DocumentBodyResult;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.model.body.ContentTypeLogic;
-import nu.marginalia.model.body.DocumentBodyExtractor;
-import nu.marginalia.model.body.DocumentBodyResult;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Test;
@@ -35,7 +36,7 @@ class HttpFetcherTest {
     void fetchUTF8() throws URISyntaxException, RateLimitException, IOException {
         var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
         try (var recorder = new WarcRecorder()) {
-            var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, ContentTags.empty());
+            var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, ContentTags.empty(), HttpFetcher.ProbeType.FULL);
             if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) {
                 System.out.println(bodyOk.contentType());
             }
@@ -47,7 +48,7 @@ class HttpFetcherTest {
         var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
         try (var recorder = new WarcRecorder()) {
-            var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), recorder, ContentTags.empty());
+            var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), recorder, ContentTags.empty(), HttpFetcher.ProbeType.FULL);
             if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) {
                 System.out.println(bodyOk.contentType());
             }

View File

@@ -122,7 +122,7 @@ public class CrawlerMockFetcherTest {
         @SneakyThrows
         @Override
-        public HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, ContentTags tags) {
+        public HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, ContentTags tags, ProbeType probeType) {
             logger.info("Fetching {}", url);
             if (mockData.containsKey(url)) {
                 byte[] bodyBytes = mockData.get(url).documentBody.getBytes();

View File

@@ -261,6 +261,7 @@ class CrawlerRetreiverTest {
                 .collect(Collectors.toSet());
 
         assertEquals(Set.of("https://www.marginalia.nu/",
+                        "https://www.marginalia.nu/favicon.ico",
                         "https://www.marginalia.nu/log/06-optimization.gmi/"),
                 fetchedUrls);

View File

@@ -12,6 +12,7 @@ import nu.marginalia.bbpc.BrailleBlockPunchCards;
 import nu.marginalia.db.DbDomainQueries;
 import nu.marginalia.index.query.limit.QueryLimits;
 import nu.marginalia.model.EdgeDomain;
+import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.DomainIndexingState;
 import nu.marginalia.search.command.SearchParameters;
 import nu.marginalia.search.model.ClusteredUrlDetails;
@@ -162,7 +163,7 @@ public class SearchOperator {
         return new UrlDetails(
                 item.documentId(),
                 item.domainId(),
-                item.url,
+                cleanUrl(item.url),
                 item.title,
                 item.description,
                 item.format,
@@ -177,6 +178,31 @@ public class SearchOperator {
         );
     }
 
+    /** Replace nuisance domains with replacements where available */
+    private static EdgeUrl cleanUrl(EdgeUrl url) {
+        String topdomain = url.domain.topDomain;
+        String subdomain = url.domain.subDomain;
+        String path = url.path;
+
+        if (topdomain.equals("fandom.com")) {
+            int wikiIndex = path.indexOf("/wiki/");
+            if (wikiIndex >= 0) {
+                return new EdgeUrl("https", new EdgeDomain("breezewiki.com"), null, "/" + subdomain + path.substring(wikiIndex), null);
+            }
+        }
+        else if (topdomain.equals("medium.com")) {
+            if (!subdomain.isBlank()) {
+                return new EdgeUrl("https", new EdgeDomain("scribe.rip"), null, path, null);
+            }
+            else {
+                String article = path.substring(path.indexOf("/", 1));
+                return new EdgeUrl("https", new EdgeDomain("scribe.rip"), null, article, null);
+            }
+        }
+        return url;
+    }
+
     @SneakyThrows
     private List<String> getProblems(String evalResult, List<UrlDetails> queryResults, QueryResponse response) {
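
For illustration, roughly what cleanUrl above does with typical inputs (worked examples, assuming EdgeDomain splits a host into subDomain and topDomain as the field names suggest):

    // https://starwars.fandom.com/wiki/Yoda    ->  https://breezewiki.com/starwars/wiki/Yoda
    // https://someauthor.medium.com/some-post  ->  https://scribe.rip/some-post
    // https://example.com/anything             ->  returned unchanged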

View File

@@ -54,6 +54,7 @@ dependencies {
     implementation libs.handlebars
 
     implementation libs.duckdb
+    implementation libs.jsoup
     implementation libs.trove
 
     implementation dependencies.create(libs.spark.get()) {

View File

@@ -2,16 +2,18 @@ package nu.marginalia.control;
 
 import com.google.gson.Gson;
 import com.google.inject.Inject;
-import nu.marginalia.service.ServiceMonitors;
 import nu.marginalia.control.actor.ControlActorService;
 import nu.marginalia.control.app.svc.*;
-import nu.marginalia.control.node.svc.ControlNodeActionsService;
 import nu.marginalia.control.node.svc.ControlFileStorageService;
+import nu.marginalia.control.node.svc.ControlNodeActionsService;
 import nu.marginalia.control.node.svc.ControlNodeService;
 import nu.marginalia.control.sys.svc.*;
 import nu.marginalia.model.gson.GsonFactory;
 import nu.marginalia.screenshot.ScreenshotService;
-import nu.marginalia.service.server.*;
+import nu.marginalia.service.ServiceMonitors;
+import nu.marginalia.service.server.BaseServiceParams;
+import nu.marginalia.service.server.Service;
+import nu.marginalia.service.server.StaticResources;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import spark.Request;
@@ -19,7 +21,7 @@ import spark.Response;
 import spark.Spark;
 
 import java.io.IOException;
-import java.util.*;
+import java.util.Map;
 
 public class ControlService extends Service {
@@ -56,6 +58,7 @@ public class ControlService extends Service {
                           ControlDomainRankingSetsService controlDomainRankingSetsService,
                           ControlActorService controlActorService,
                           AbortedProcessService abortedProcessService,
+                          DomainsManagementService domainsManagementService,
                           ControlErrorHandler errorHandler
                           ) throws IOException {
@@ -84,6 +87,7 @@ public class ControlService extends Service {
         apiKeyService.register();
         domainComplaintService.register();
         randomExplorationService.register();
+        domainsManagementService.register();
 
         errorHandler.register();

View File

@@ -0,0 +1,40 @@
package nu.marginalia.control.app.model;
public record DomainModel(int id,
String name,
String ip,
int nodeAffinity,
double rank,
boolean blacklisted) {
public boolean isUnassigned() {
return nodeAffinity < 0;
}
public DomainAffinityState getAffinityState() {
if (nodeAffinity < 0) {
return DomainAffinityState.Known;
}
else if (nodeAffinity == 0) {
return DomainAffinityState.Scheduled;
}
else {
return DomainAffinityState.Assigned;
}
}
public enum DomainAffinityState {
Assigned("The domain has been assigned to a node."),
Scheduled("The domain will be assigned to the next crawling node."),
Known("The domain is known but not yet scheduled for crawling.");
private final String desc;
DomainAffinityState(String desc) {
this.desc = desc;
}
public String getDesc() {
return desc;
}
}
}

View File

@@ -0,0 +1,26 @@
package nu.marginalia.control.app.model;
import java.util.List;
import java.util.Map;
public record DomainSearchResultModel(String query,
String affinity,
String field,
Map<String, Boolean> selectedAffinity,
Map<String, Boolean> selectedField,
int page,
boolean hasNext,
boolean hasPrevious,
List<Integer> nodes,
List<DomainModel> results)
{
public Integer getNextPage() {
if (!hasNext) return null;
return page + 1;
}
public Integer getPreviousPage() {
if (!hasPrevious) return null;
return page - 1;
}
}

View File

@@ -0,0 +1,310 @@
package nu.marginalia.control.app.svc;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.control.ControlRendererFactory;
import nu.marginalia.control.Redirects;
import nu.marginalia.control.app.model.DomainModel;
import nu.marginalia.control.app.model.DomainSearchResultModel;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.nodecfg.NodeConfigurationService;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import spark.Request;
import spark.Response;
import spark.Spark;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.sql.SQLException;
import java.util.*;
public class DomainsManagementService {
private final HikariDataSource dataSource;
private final NodeConfigurationService nodeConfigurationService;
private final ControlRendererFactory rendererFactory;
@Inject
public DomainsManagementService(HikariDataSource dataSource,
NodeConfigurationService nodeConfigurationService,
ControlRendererFactory rendererFactory
) {
this.dataSource = dataSource;
this.nodeConfigurationService = nodeConfigurationService;
this.rendererFactory = rendererFactory;
}
public void register() throws IOException {
var domainsViewRenderer = rendererFactory.renderer("control/app/domains");
var addDomainsTxtViewRenderer = rendererFactory.renderer("control/app/domains-new");
var addDomainsUrlViewRenderer = rendererFactory.renderer("control/app/domains-new-url");
var addDomainsAfterReportRenderer = rendererFactory.renderer("control/app/domains-new-report");
Spark.get("/domain", this::getDomains, domainsViewRenderer::render);
Spark.get("/domain/new", this::addDomainsTextfield, addDomainsTxtViewRenderer::render);
Spark.post("/domain/new", this::addDomainsTextfield, addDomainsAfterReportRenderer::render);
Spark.get("/domain/new-url", this::addDomainsFromDownload, addDomainsUrlViewRenderer::render);
Spark.post("/domain/new-url", this::addDomainsFromDownload, addDomainsAfterReportRenderer::render);
Spark.post("/domain/:id/assign/:node", this::assignDomain, new Redirects.HtmlRedirect("/domain"));
}
private Object addDomainsTextfield(Request request, Response response) throws SQLException {
if ("GET".equals(request.requestMethod())) {
return "";
}
else if ("POST".equals(request.requestMethod())) {
String nodeStr = request.queryParams("node");
String domainsStr = request.queryParams("domains");
int node = Integer.parseInt(nodeStr);
List<EdgeDomain> validDomains;
List<String> invalidDomains;
Map.Entry<List<EdgeDomain>, List<String>> domainsList = parseDomainsList(domainsStr);
validDomains = domainsList.getKey();
invalidDomains = domainsList.getValue();
insertDomains(validDomains, node);
return Map.of("validDomains", validDomains,
"invalidDomains", invalidDomains);
}
return "";
}
private Map.Entry<List<EdgeDomain>, List<String>> parseDomainsList(String domainsStr) {
List<EdgeDomain> validDomains = new ArrayList<>();
List<String> invalidDomains = new ArrayList<>();
for (String domain : domainsStr.split("\n+")) {
domain = domain.trim();
if (domain.isBlank()) continue;
if (domain.length() > 255) {
invalidDomains.add(domain);
continue;
}
if (domain.startsWith("#")) {
continue;
}
// Run through the URI parser to check for bad domains
try {
if (domain.contains(":")) {
domain = new URI(domain ).toURL().getHost();
}
else {
domain = new URI("https://" + domain + "/").toURL().getHost();
}
} catch (URISyntaxException | MalformedURLException e) {
invalidDomains.add(domain);
continue;
}
validDomains.add(new EdgeDomain(domain));
}
return Map.entry(validDomains, invalidDomains);
}
private Object addDomainsFromDownload(Request request, Response response) throws SQLException, URISyntaxException, IOException, InterruptedException {
if ("GET".equals(request.requestMethod())) {
return "";
}
else if ("POST".equals(request.requestMethod())) {
String nodeStr = request.queryParams("node");
URI domainsUrl = new URI(request.queryParams("url"));
int node = Integer.parseInt(nodeStr);
HttpClient client = HttpClient.newBuilder().build();
var httpReq = HttpRequest.newBuilder(domainsUrl).GET().build();
HttpResponse<String> result = client.send(httpReq, HttpResponse.BodyHandlers.ofString());
if (result.statusCode() != 200) {
return Map.of("error", "Failed to download domains");
}
Optional<String> ct = result.headers().firstValue("Content-Type");
if (ct.isEmpty()) {
return Map.of("error", "No content type");
}
List<EdgeDomain> validDomains = new ArrayList<>();
List<String> invalidDomains = new ArrayList<>();
String contentType = ct.get().toLowerCase();
if (contentType.startsWith("text/plain")) {
var parsedDomains = parseDomainsList(result.body());
validDomains = parsedDomains.getKey();
invalidDomains = parsedDomains.getValue();
}
else {
for (Element e : Jsoup.parse(result.body()).select("a")) {
String s = e.attr("href");
if (s.isBlank()) continue;
if (!s.contains("://")) continue;
URI uri = URI.create(s);
String scheme = uri.getScheme();
String host = uri.getHost();
if (scheme == null || host == null)
continue;
if (!scheme.equalsIgnoreCase("http") && !scheme.equalsIgnoreCase("https"))
continue;
validDomains.add(new EdgeDomain(host));
}
}
insertDomains(validDomains, node);
return Map.of("validDomains", validDomains,
"invalidDomains", invalidDomains);
}
return "";
}
private void insertDomains(List<EdgeDomain> domains, int node) throws SQLException {
// Insert the domains into the database, updating the node affinity if the domain already exists and the affinity is not already set to a node
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("""
INSERT INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY)
VALUES (?, ?, ?)
ON DUPLICATE KEY UPDATE NODE_AFFINITY = IF(NODE_AFFINITY<=0, VALUES(NODE_AFFINITY), NODE_AFFINITY)
"""))
{
for (var domain : domains) {
stmt.setString(1, domain.toString());
stmt.setString(2, domain.getTopDomain());
stmt.setInt(3, node);
stmt.addBatch();
}
stmt.executeBatch();
}
}
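    // Worked example of the upsert above (a sketch of MySQL ON DUPLICATE KEY UPDATE semantics, not part of this commit):
    // inserting ("example.com", "example.com", 2) when a row already exists:
    //   NODE_AFFINITY = -1  ->  IF(-1 <= 0, 2, -1)  ->  becomes 2
    //   NODE_AFFINITY =  0  ->  IF( 0 <= 0, 2,  0)  ->  becomes 2
    //   NODE_AFFINITY =  3  ->  IF( 3 <= 0, 2,  3)  ->  stays 3 (already owned by a node)
    // A brand new domain is simply inserted with NODE_AFFINITY = 2.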
private Object assignDomain(Request request, Response response) throws SQLException {
String idStr = request.params(":id");
String nodeStr = request.params(":node");
int id = Integer.parseInt(idStr);
int node = Integer.parseInt(nodeStr);
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET NODE_AFFINITY = ? WHERE ID = ?"))
{
stmt.setInt(1, node);
stmt.setInt(2, id);
stmt.executeUpdate();
}
return "";
}
private DomainSearchResultModel getDomains(Request request, Response response) throws SQLException {
List<DomainModel> ret = new ArrayList<>();
String filterRaw = Objects.requireNonNullElse(request.queryParams("filter"), "*");
String filter;
if (filterRaw.isBlank()) filter = "%";
else filter = filterRaw.replace('*', '%');
int page = Integer.parseInt(Objects.requireNonNullElse(request.queryParams("page"), "0"));
boolean hasMore = false;
int count = 10;
String field = Objects.requireNonNullElse(request.queryParams("field"), "domain");
Map<String, Boolean> selectedField = Map.of(field, true);
String affinity = Objects.requireNonNullElse(request.queryParams("affinity"), "all");
Map<String, Boolean> selectedAffinity = Map.of(affinity, true);
StringJoiner queryJoiner = new StringJoiner(" ");
queryJoiner.add("""
SELECT EC_DOMAIN.ID,
DOMAIN_NAME,
NODE_AFFINITY,
`RANK`,
IP,
EC_DOMAIN_BLACKLIST.URL_DOMAIN IS NOT NULL AS BLACKLISTED
FROM WMSA_prod.EC_DOMAIN
LEFT JOIN WMSA_prod.EC_DOMAIN_BLACKLIST ON DOMAIN_NAME = EC_DOMAIN_BLACKLIST.URL_DOMAIN
""")
.add((switch (field) {
case "domain" -> "WHERE DOMAIN_NAME LIKE ?";
case "ip" -> "WHERE IP LIKE ?";
case "id" -> "WHERE EC_DOMAIN.ID = ?";
default -> "WHERE DOMAIN_NAME LIKE ?";
}))
.add((switch (affinity) {
case "assigned" -> "AND NODE_AFFINITY > 0";
case "scheduled" -> "AND NODE_AFFINITY = 0";
case "unassigned" -> "AND NODE_AFFINITY < 0";
default -> "";
}))
.add("LIMIT ?")
.add("OFFSET ?");
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement(queryJoiner.toString()))
{
stmt.setString(1, filter);
stmt.setInt(2, count + 1);
stmt.setInt(3, count * page);
try (var rs = stmt.executeQuery()) {
while (rs.next()) {
if (ret.size() == count) {
hasMore = true;
break;
}
ret.add(new DomainModel(
rs.getInt("ID"),
rs.getString("DOMAIN_NAME"),
rs.getString("IP"),
rs.getInt("NODE_AFFINITY"),
Math.round(100 * rs.getDouble("RANK"))/100.,
rs.getBoolean("BLACKLISTED")
));
}
}
}
List<Integer> nodes = new ArrayList<>();
for (var node : nodeConfigurationService.getAll()) {
nodes.add(node.node());
}
return new DomainSearchResultModel(filterRaw,
affinity,
field,
selectedAffinity,
selectedField,
page,
hasMore,
page > 0,
nodes,
ret);
}
}

View File

@@ -0,0 +1,41 @@
<!DOCTYPE html>
<html lang="en-US">
<head>
<title>Control Service</title>
{{> control/partials/head-includes }}
</head>
<body>
{{> control/partials/nav}}
<div class="container">
<h1 class="my-3">Add Domains Report</h1>
<p></p>
{{#if error}}
<p class="alert alert-danger">{{error}}</p>
{{/if}}
{{#unless error}}
{{#unless invalidDomains}}
<p>All domains were added successfully!</p>
{{/unless}}
{{/unless}}
{{#if invalidDomains}}
<p>Some domains were invalid and could not be added:</p>
<textarea class="form-control" rows="10" disabled>
{{#each invalidDomains}}
{{.}}
{{/each}}
</textarea>
{{/if}}
{{#if validDomains}}
<p>If they were not already in the database, these domains were added:</p>
<textarea class="form-control" rows="10" disabled>
{{#each validDomains}}
{{.}}
{{/each}}
</textarea>
{{/if}}
<p></p>
</div>
</body>
{{> control/partials/foot-includes }}
</html>

View File

@@ -0,0 +1,48 @@
<!DOCTYPE html>
<html lang="en-US">
<head>
<title>Control Service</title>
{{> control/partials/head-includes }}
</head>
<body>
{{> control/partials/nav}}
<div class="container">
<h1 class="my-3">Add Domains (URL)</h1>
<div class="my-3 p-3 border bg-light">
<p>This utility lets you add domains to be crawled via an external URL.</p>
<a href="/domain/new">It's also possible to add domains directly via a text area</a>
</div>
<form method="post">
<div class="form-group my-3">
<label for="url" class="form-label">Domains to add</label>
<input type="text" class="form-control" name="url"/>
<span class="text-muted">
Enter the URL to the file or page that contains the domains to add. If the URL leads to a text file,
the domains will be parsed from the file, one per line. If it leads to a HTML page, the HTML
will be parsed and all the links will be extracted and added as domains.
</span>
</div>
<div class="form-group my-3">
<label for="node" class="form-label">Node</label>
<select name="node" class="form-select">
<option value="-1">Unassigned</option>
<option value="0" selected>Auto</option>
{{#each global-context.nodes}}
<option value="{{id}}">Node {{id}}</option>
{{/each}}
</select>
<span class="text-muted">
Select the node to assign the domains to, this is the index node that will "own" the domain, crawl its documents
and index them. If you select "Auto", the system will assign the domains to the next node that performs a crawl.
</span>
</div>
<button type="submit" class="btn btn-primary">Add</button>
</form>
</div>
</body>
{{> control/partials/foot-includes }}
</html>

View File

@@ -0,0 +1,47 @@
<!DOCTYPE html>
<html lang="en-US">
<head>
<title>Control Service</title>
{{> control/partials/head-includes }}
</head>
<body>
{{> control/partials/nav}}
<div class="container">
<h1 class="my-3">Add Domains</h1>
<div class="my-3 p-3 border bg-light">
<p>This utility lets you add domains to be crawled via a text area.</p>
<a href="/domain/new-url">It's also possible to add domains via an external URL</a>
</div>
<form method="post">
<div class="form-group my-3">
<label for="domains" class="form-label">Domains to add</label>
<textarea name="domains" class="form-control" rows="10"></textarea>
<span class="text-muted">
Enter a list of domains to add, one per line. The system will check if the domain is already in the database and
will not add duplicates. Spaces and empty lines are ignored.
</span>
</div>
<div class="form-group my-3">
<label for="node" class="form-label">Node</label>
<select name="node" class="form-select">
<option value="-1">Unassigned</option>
<option value="0" selected>Auto</option>
{{#each global-context.nodes}}
<option value="{{id}}">Node {{id}}</option>
{{/each}}
</select>
<span class="text-muted">
Select the node to assign the domains to, this is the index node that will "own" the domain, crawl its documents
and index them. If you select "Auto", the system will assign the domains to the next node that performs a crawl.
</span>
</div>
<button type="submit" class="btn btn-primary">Add</button>
</form>
</div>
</body>
{{> control/partials/foot-includes }}
</html>

View File

@@ -0,0 +1,109 @@
<!DOCTYPE html>
<html lang="en-US">
<head>
<title>Control Service</title>
{{> control/partials/head-includes }}
</head>
<body>
{{> control/partials/nav}}
<div class="container">
<h1 class="my-3">Domains</h1>
<table class="table">
<form method="get">
<tr>
<td>
<select name="field" class="form-select" aria-label="Select Field">
<option value="domain" {{#if selectedField.domain}}selected{{/if}}>Domain Name</option>
<option value="id" {{#if selectedField.id}}selected{{/if}}>Domain ID</option>
<option value="ip" {{#if selectedField.ip}}selected{{/if}}>IP</option>
</select>
</td>
<td colspan="3"><input type="text" name="filter" class="form-control" placeholder="Domain" value="{{query}}"></td>
<td>
<select name="affinity" class="form-select" aria-label="Select Node Affinity">
<option value="all" {{#if selectedAffinity.all}}selected{{/if}}>-</option>
<option value="unassigned" {{#if selectedAffinity.unassigned}}selected{{/if}}>Unassigned</option>
<option value="scheduled" {{#if selectedAffinity.scheduled}}selected{{/if}}>Scheduled</option>
<option value="assigned" {{#if selectedAffinity.assigned}}selected{{/if}}>Assigned</option>
</select>
</td>
<td><button type="submit" class="btn btn-primary">Search</button></td>
</tr>
</form>
<tr>
<th>Domain</th>
<th>ID</th>
<th title="Which, if any, index node owns a domain and will crawl and index it">Node Affinity</th>
<th>Rank</th>
<th>IP</th>
<th>Blacklisted</th>
</tr>
{{#each results}}
<tr>
<td>{{name}}</td>
<td>{{id}}</td>
<td title="{{affinityState.desc}}">{{#unless unassigned}}{{affinityState}} {{#if nodeAffinity}}{{nodeAffinity}}{{/if}} {{/unless}}
{{#if unassigned}}
<div class="dropdown">
<button title="Assign to a node" class="btn btn-secondary dropdown-toggle" type="button" id="dropdownMenuButton1" data-bs-toggle="dropdown" aria-expanded="false">
Unassigned
</button>
<ul class="dropdown-menu" aria-labelledby="dropdownMenuButton1">
<form method="post">
<input type="hidden" name="node" value="0">
<li>
<button
class="dropdown-item"
title="Assign to the next node that performs a crawl"
formaction="/domain/{{id}}/assign/0"
type="submit">
Any
</button>
</li>
{{#each nodes}}
<input type="hidden" name="node" value="{{.}}">
<li>
<button
class="dropdown-item"
title="Assign to node {{.}}"
formaction="/domain/{{id}}/assign/{{.}}"
type="submit">
Node {{.}}
</button>
</li>
{{/each}}
</form>
</ul>
</div>
{{/if}}
</td>
<td>{{rank}}</td>
<td>{{ip}}</td>
<td>{{#if blacklisted}}&check;{{/if}}</td>
</tr>
{{/each}}
{{#unless results}}
<tr>
<td colspan="5">No results found</td>
</tr>
{{/unless}}
<tr>
<td>
{{#if hasPrevious}}
<a href="?page={{previousPage}}&filter={{query}}&field={{field}}&affinity={{affinity}}">Previous</a>
{{/if}}
</td>
<td colspan="4"></td>
<td>
{{#if hasNext}}
<a href="?page={{nextPage}}&filter={{query}}&field={{field}}&affinity={{affinity}}">Next</a>
{{/if}}
</td>
</tr>
</table>
</div>
</body>
{{> control/partials/foot-includes }}
</html>

View File

@@ -1,5 +1,4 @@
-<script src="https://cdn.jsdelivr.net/npm/@popperjs/core@2.11.8/dist/umd/popper.min.js" integrity="sha384-I7E8VVD/ismYTF4hNIPjVp/Zjvgyol6VFvRkX/vR+Vc4jQkC+hVqc2pM8ODewa9r" crossorigin="anonymous"></script>
-<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/js/bootstrap.min.js" integrity="sha384-BBtl+eGJRgqQAUMxJ7pMwbEyER4l1g+O15P+16Ep7Q9Q+zqX6gSbd85u4mG4QzX+" crossorigin="anonymous"></script>
+<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.0.2/dist/js/bootstrap.bundle.min.js" integrity="sha384-MrcW6ZMFYlzcLA8Nl+NtUVF0sA7MsXsP1UyJoMp4YLEuNSfAP+JcXn/tWtIaxVXM" crossorigin="anonymous"></script>
 <script src="https://ajax.googleapis.com/ajax/libs/jquery/2.1.1/jquery.min.js"></script>
 <script src="/refresh.js"></script>
 <script type="javascript">

View File

@@ -16,13 +16,21 @@
             <a href="#" class="nav-link dropdown-toggle" data-bs-toggle="dropdown" role="button" aria-expanded="false">Application</a>
             <ul class="dropdown-menu">
                 <li><a class="dropdown-item" href="/api-keys" title="Create or remove API keys">API Keys</a></li>
-                <li><a class="dropdown-item" href="/blacklist" title="Add or remove website sanctions">Blacklist</a></li>
-                <li><a class="dropdown-item" href="/search-to-ban" title="Search function for easy blacklisting">Blacklist Search</a></li>
                 <li><a class="dropdown-item" href="/complaints" title="View and act on user complaints">Complaints</a></li>
                 <li><a class="dropdown-item" href="/review-random-domains" title="Review random domains list">Random Exploration</a></li>
             </ul>
         </li>
         {{/unless}}
+        <li class="nav-item dropdown">
+            <a href="#" class="nav-link dropdown-toggle" data-bs-toggle="dropdown" role="button" aria-expanded="false">Domains</a>
+            <ul class="dropdown-menu">
+                <li><a class="dropdown-item" href="/domain/new" title="Add New Domains">Add Domains</a></li>
+                <li><a class="dropdown-item" href="/domain" title="List Domains">Manage Domains</a></li>
+                <li><hr class="dropdown-divider"></li>
+                <li><a class="dropdown-item" href="/blacklist" title="Add or remove website sanctions">Blacklist</a></li>
+                <li><a class="dropdown-item" href="/search-to-ban" title="Search function for easy blacklisting">Blacklist Search</a></li>
+            </ul>
+        </li>
         <li class="nav-item dropdown">
             <a href="#" class="nav-link dropdown-toggle" data-bs-toggle="dropdown" role="button" aria-expanded="false">Index Nodes</a>
             <ul class="dropdown-menu">

View File

@@ -16,6 +16,7 @@ import java.net.http.HttpClient;
 import java.net.http.HttpRequest;
 import java.net.http.HttpResponse;
 import java.sql.Connection;
+import java.sql.ResultSet;
 import java.sql.SQLException;
 import java.time.Duration;
 import java.util.ArrayList;
@@ -34,7 +35,7 @@ public class ScreenshotCaptureToolMain {
         System.setProperty(ChromeDriverService.CHROME_DRIVER_SILENT_OUTPUT_PROPERTY, "true");
 
-        List<EdgeDomain> crawlQueue = fetchCrawlQueue(ds, 1000);
+        List<EdgeDomain> crawlQueue = fetchCrawlQueue(ds, 10_000);
 
         HttpClient httpClient = HttpClient.newBuilder()
                 .version(HttpClient.Version.HTTP_1_1)
@@ -137,16 +138,33 @@ public class ScreenshotCaptureToolMain {
         List<EdgeDomain> ret = new ArrayList<>(queueSize);
 
         try (var conn = ds.getConnection(); var stmt = conn.createStatement()) {
-            var rsp = stmt.executeQuery(
+            int newCount = queueSize / 4;
+            int oldCount = queueSize - newCount;
+
+            ResultSet rst = stmt.executeQuery(
                     """
                     SELECT EC_DOMAIN.DOMAIN_NAME FROM EC_DOMAIN
                     LEFT JOIN DATA_DOMAIN_HISTORY ON EC_DOMAIN.DOMAIN_NAME=DATA_DOMAIN_HISTORY.DOMAIN_NAME
                     ORDER BY SCREENSHOT_DATE IS NULL DESC, SCREENSHOT_DATE, INDEXED DESC
                     LIMIT
-                    """ + queueSize);
-            while (rsp.next()) {
-                ret.add(new EdgeDomain(rsp.getString(1)));
+                    """ + newCount);
+            while (rst.next()) {
+                ret.add(new EdgeDomain(rst.getString(1)));
             }
+
+            rst = stmt.executeQuery("""
+                    SELECT DATA_DOMAIN_HISTORY.DOMAIN_NAME FROM DATA_DOMAIN_HISTORY
+                    INNER JOIN DATA_DOMAIN_SCREENSHOT ON DATA_DOMAIN_SCREENSHOT.DOMAIN_NAME = DATA_DOMAIN_HISTORY.DOMAIN_NAME
+                    WHERE SCREENSHOT_DATE IS NOT NULL
+                    ORDER BY SCREENSHOT_DATE ASC
+                    LIMIT
+                    """ + oldCount);
+            while (rst.next()) {
+                ret.add(new EdgeDomain(rst.getString(1)));
+            }
         }
         catch (Exception ex) {
             logger.warn("Exception in fetching queue", ex);