Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-23 21:18:58 +00:00)
Merge branch 'master' into term-positions

# Conflicts:
#	code/index/java/nu/marginalia/index/results/model/ids/TermIdList.java
#	code/processes/converting-process/java/nu/marginalia/converting/ConverterMain.java
#	code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
#	code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java
#	code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/CrawledDomainReader.java
#	code/processes/crawling-process/test/nu/marginalia/crawling/HttpFetcherTest.java
#	code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java
#	code/services-application/search-service/java/nu/marginalia/search/svc/SearchQueryIndexService.java
Commit 8f367d96f8
@@ -34,7 +34,6 @@ import org.apache.logging.log4j.util.Strings;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
@@ -201,7 +200,7 @@ public class ConverterMain extends ProcessMainClass {
try {
return Optional.of(CrawledDomainReader.createDataStream(path));
}
catch (IOException ex) {
catch (Exception ex) {
return Optional.empty();
}
}
@@ -151,9 +151,9 @@ public class RedditSideloader implements SideloadSource {
var doc = sideloaderProcessing
.processDocument(fullUrl,
fullHtml,
List.of("encyclopedia", "wiki"),
List.of("reddit"),
domainLinks,
GeneratorType.WIKI,
GeneratorType.FORUM,
DocumentClass.SIDELOAD,
anchorTextKeywords.getAnchorTextKeywords(domainLinks, urls),
pubYear,
@@ -9,6 +9,9 @@ import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.retreival.revisit.CrawlerRevisitor;
import nu.marginalia.crawl.retreival.revisit.DocumentWithReference;
import nu.marginalia.crawl.retreival.sitemap.SitemapFetcher;
import nu.marginalia.crawling.body.HttpFetchResult;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.crawling.model.CrawlerDomainStatus;
import nu.marginalia.ip_blocklist.UrlBlocklist;
import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.model.EdgeDomain;
@@ -28,6 +31,7 @@ import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.concurrent.TimeUnit;

public class CrawlerRetreiver implements AutoCloseable {
@@ -88,17 +92,8 @@ public class CrawlerRetreiver implements AutoCloseable {
}

public int fetch(DomainLinks domainLinks, CrawlDataReference oldCrawlData) {
final DomainProber.ProbeResult probeResult = domainProber.probeDomain(
fetcher,
domain,
new EdgeUrl("http", new EdgeDomain(domain), null, "/", null));

try {
// Sleep a bit to avoid hammering the server with requests, we just probed it
TimeUnit.SECONDS.sleep(1);

// Fetch the domain
return crawlDomain(oldCrawlData, probeResult, domainLinks);
return crawlDomain(oldCrawlData, domainLinks);
}
catch (Exception ex) {
logger.error("Error crawling domain {}", domain, ex);
@@ -112,25 +107,33 @@ public class CrawlerRetreiver implements AutoCloseable {
resync.run(warcFile);
}

private int crawlDomain(CrawlDataReference oldCrawlData, DomainProber.ProbeResult probeResult, DomainLinks domainLinks) throws IOException, InterruptedException {
String ip = findIp(domain);
EdgeUrl rootUrl;
private DomainProber.ProbeResult probeRootUrl(String ip) throws IOException {
// Construct an URL to the root of the domain, we don't know the schema yet so we'll
// start with http and then try https if that fails
var httpUrl = new EdgeUrl("http", new EdgeDomain(domain), null, "/", null);
final DomainProber.ProbeResult probeResult = domainProber.probeDomain(fetcher, domain, httpUrl);

warcRecorder.writeWarcinfoHeader(ip, new EdgeDomain(domain), probeResult);

if (!(probeResult instanceof DomainProber.ProbeResultOk ok)) {
return 1;
}
else {
rootUrl = ok.probedUrl();
}
return probeResult;
}

private int crawlDomain(CrawlDataReference oldCrawlData, DomainLinks domainLinks) throws IOException, InterruptedException {
String ip = findIp(domain);
EdgeUrl rootUrl;

if (probeRootUrl(ip) instanceof DomainProber.ProbeResultOk ok) rootUrl = ok.probedUrl();
else return 1;

// Sleep after the initial probe, we don't have access to the robots.txt yet
// so we don't know the crawl delay
TimeUnit.SECONDS.sleep(1);

final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(rootUrl.domain, warcRecorder);
final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay());

delayTimer.waitFetchDelay(0); // initial delay after robots.txt
sniffRootDocument(rootUrl, delayTimer);
delayTimer.waitFetchDelay(0); // delay after sniffing

// Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
int recrawled = crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer);
@@ -188,7 +191,7 @@ public class CrawlerRetreiver implements AutoCloseable {

try {
if (fetchWriteAndSleep(top, delayTimer, DocumentWithReference.empty()).isOk()) {
if (fetchContentWithReference(top, delayTimer, DocumentWithReference.empty()).isOk()) {
fetchedCount++;
}
}
@@ -209,21 +212,8 @@ public class CrawlerRetreiver implements AutoCloseable {

var url = rootUrl.withPathAndParam("/", null);

HttpFetchResult result = null;

for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
try {
result = fetcher.fetchContent(url, warcRecorder, ContentTags.empty());
break;
}
catch (RateLimitException ex) {
timer.waitRetryDelay(ex);
}
catch (Exception ex) {
logger.warn("Failed to fetch {}", url, ex);
result = new HttpFetchResult.ResultException(ex);
}
}
HttpFetchResult result = fetchWithRetry(url, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
timer.waitFetchDelay(0);

if (!(result instanceof HttpFetchResult.ResultOk ok))
return;
@@ -236,24 +226,40 @@ public class CrawlerRetreiver implements AutoCloseable {
var doc = optDoc.get();
crawlFrontier.setLinkFilter(linkFilterSelector.selectFilter(doc));

EdgeUrl faviconUrl = url.withPathAndParam("/favicon.ico", null);
Optional<EdgeUrl> sitemapUrl = Optional.empty();

for (var link : doc.getElementsByTag("link")) {
String rel = link.attr("rel");
String type = link.attr("type");

if (!rel.equalsIgnoreCase("alternate"))
continue;
if (rel.equals("icon") || rel.equals("shortcut icon")) {
String href = link.attr("href");

if (!(type.equalsIgnoreCase("application/atom+xml")
|| type.equalsIgnoreCase("application/rss+xml")))
continue;
faviconUrl = linkParser.parseLink(url, href)
.filter(crawlFrontier::isSameDomain)
.orElse(faviconUrl);
}

String href = link.attr("href");
// Grab the RSS/Atom as a sitemap if it exists
if (rel.equalsIgnoreCase("alternate")
&& (type.equalsIgnoreCase("application/atom+xml") || type.equalsIgnoreCase("application/atomsvc+xml"))) {
String href = link.attr("href");

linkParser.parseLink(url, href)
.filter(crawlFrontier::isSameDomain)
.map(List::of)
.ifPresent(sitemapFetcher::downloadSitemaps);
sitemapUrl = linkParser.parseLink(url, href)
.filter(crawlFrontier::isSameDomain);
}
}

// Download the sitemap if available exists
if (sitemapUrl.isPresent()) {
sitemapFetcher.downloadSitemaps(List.of(sitemapUrl.get()));
timer.waitFetchDelay(0);
}

// Grab the favicon if it exists
fetchWithRetry(faviconUrl, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
timer.waitFetchDelay(0);
}
catch (Exception ex) {
logger.error("Error configuring link filter", ex);
@@ -263,31 +269,16 @@ public class CrawlerRetreiver implements AutoCloseable {
}
}

public HttpFetchResult fetchWriteAndSleep(EdgeUrl top,
CrawlDelayTimer timer,
DocumentWithReference reference) throws InterruptedException
public HttpFetchResult fetchContentWithReference(EdgeUrl top,
CrawlDelayTimer timer,
DocumentWithReference reference) throws InterruptedException
{
logger.debug("Fetching {}", top);

HttpFetchResult fetchedDoc = new HttpFetchResult.ResultNone();

long startTime = System.currentTimeMillis();
var contentTags = reference.getContentTags();

// Fetch the document, retrying if we get a rate limit exception
for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
try {
fetchedDoc = fetcher.fetchContent(top, warcRecorder, contentTags);
break;
}
catch (RateLimitException ex) {
timer.waitRetryDelay(ex);
}
catch (Exception ex) {
logger.warn("Failed to fetch {}", top, ex);
fetchedDoc = new HttpFetchResult.ResultException(ex);
}
}
HttpFetchResult fetchedDoc = fetchWithRetry(top, timer, HttpFetcher.ProbeType.FULL, contentTags);

// Parse the document and enqueue links
try {
@@ -329,6 +320,27 @@ public class CrawlerRetreiver implements AutoCloseable {
return fetchedDoc;
}

/** Fetch a document and retry on 429s */
private HttpFetchResult fetchWithRetry(EdgeUrl url,
CrawlDelayTimer timer,
HttpFetcher.ProbeType probeType,
ContentTags contentTags) throws InterruptedException {
for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
try {
return fetcher.fetchContent(url, warcRecorder, contentTags, probeType);
}
catch (RateLimitException ex) {
timer.waitRetryDelay(ex);
}
catch (Exception ex) {
logger.warn("Failed to fetch {}", url, ex);
return new HttpFetchResult.ResultException(ex);
}
}

return new HttpFetchResult.ResultNone();
}

private boolean isAllowedProtocol(String proto) {
return proto.equalsIgnoreCase("http")
|| proto.equalsIgnoreCase("https");
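The refactor above folds the two inlined HTTP 429 retry loops into the single fetchWithRetry helper. A minimal usage sketch, restating the call sites already visible in this diff (illustrative only, not additional commit content):

    // Root-document sniffing and the favicon fetch skip the content-type probe
    HttpFetchResult result = fetchWithRetry(url, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());

    // Ordinary document fetches keep full probing and pass the reference's content tags
    HttpFetchResult fetchedDoc = fetchWithRetry(top, timer, HttpFetcher.ProbeType.FULL, contentTags);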
@@ -4,6 +4,7 @@ import com.google.inject.ImplementedBy;
import crawlercommons.robots.SimpleRobotRules;
import nu.marginalia.crawl.retreival.RateLimitException;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawling.body.HttpFetchResult;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.HttpFetchResult;
@@ -19,9 +20,18 @@ public interface HttpFetcher {

FetchResult probeDomain(EdgeUrl url);

HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, ContentTags tags) throws RateLimitException;
HttpFetchResult fetchContent(EdgeUrl url,
WarcRecorder recorder,
ContentTags tags,
ProbeType probeType) throws RateLimitException;

SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder);

SitemapRetriever createSitemapRetriever();

enum ProbeType {
DISABLED,
FULL,
IF_MODIFIED_SINCE
}
}
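With the extra ProbeType parameter, every fetchContent caller now chooses the probing behaviour explicitly. A minimal sketch of the updated call, mirroring the test changes further down in this diff (illustrative only):

    try (var recorder = new WarcRecorder()) {
        // ProbeType.FULL keeps the previous behaviour: HEAD-probe binary-looking URLs before fetching
        var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, ContentTags.empty(), HttpFetcher.ProbeType.FULL);
    }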
@@ -12,6 +12,9 @@ import nu.marginalia.crawl.retreival.fetcher.socket.FastTerminatingSocketFactory
import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.crawl.retreival.fetcher.socket.NoSecuritySSL;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawling.body.ContentTypeLogic;
import nu.marginalia.crawling.body.DocumentBodyExtractor;
import nu.marginalia.crawling.body.HttpFetchResult;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.ContentTypeLogic;
@@ -145,12 +148,13 @@ public class HttpFetcherImpl implements HttpFetcher {
@SneakyThrows
public HttpFetchResult fetchContent(EdgeUrl url,
WarcRecorder warcRecorder,
ContentTags contentTags)
ContentTags contentTags,
ProbeType probeType)
{

// We don't want to waste time and resources on URLs that are not HTML, so if the file ending
// looks like it might be something else, we perform a HEAD first to check the content type
if (contentTags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url))
if (probeType == ProbeType.FULL && contentTags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url))
{
ContentTypeProbeResult probeResult = contentTypeProber.probeContentType(url);
if (probeResult instanceof ContentTypeProbeResult.Ok ok) {
@@ -174,7 +178,9 @@ public class HttpFetcherImpl implements HttpFetcher {
else {
// Possibly do a soft probe to see if the URL has been modified since the last time we crawled it
// if we have reason to suspect ETags are not supported by the server.
if (softIfModifiedSinceProber.probeModificationTime(url, contentTags)) {
if (probeType == ProbeType.IF_MODIFIED_SINCE
&& softIfModifiedSinceProber.probeModificationTime(url, contentTags))
{
return new HttpFetchResult.Result304Raw();
}
}
@@ -137,7 +137,7 @@ public class CrawlerRevisitor {

DocumentWithReference reference = new DocumentWithReference(doc, oldCrawlData);

var result = crawlerRetreiver.fetchWriteAndSleep(url, delayTimer, reference);
var result = crawlerRetreiver.fetchContentWithReference(url, delayTimer, reference);

if (reference.isSame(result)) {
retained++;
@@ -46,22 +46,35 @@ public class DbCrawlSpecProvider implements CrawlSpecProvider {

blacklist.waitUntilLoaded();

List<Integer> domainIds = new ArrayList<>(10_000);

try (var conn = dataSource.getConnection();
var assignFreeDomains = conn.prepareStatement("UPDATE EC_DOMAIN SET NODE_AFFINITY=? WHERE NODE_AFFINITY=0");
var query = conn.prepareStatement("""
SELECT DOMAIN_NAME, COALESCE(KNOWN_URLS, 0), EC_DOMAIN.ID
FROM EC_DOMAIN
LEFT JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
WHERE NODE_AFFINITY=?
"""))
WHERE NODE_AFFINITY=? OR NODE_AFFINITY=0
""")
)
{

// Assign any domains with node_affinity=0 to this node. We must do this now, before we start crawling
// to avoid race conditions with other crawl runs. We don't want multiple crawlers to crawl the same domain.
assignFreeDomains.setInt(1, processConfiguration.node());
assignFreeDomains.executeUpdate();

// Fetch the domains to be crawled
query.setInt(1, processConfiguration.node());
query.setFetchSize(10_000);
var rs = query.executeQuery();

while (rs.next()) {
// Skip blacklisted domains
if (blacklist.isBlacklisted(rs.getInt(3)))
int id = rs.getInt(3);
if (blacklist.isBlacklisted(id))
continue;
domainIds.add(id);

int urls = rs.getInt(2);
double growthFactor;
@@ -83,6 +96,7 @@ public class DbCrawlSpecProvider implements CrawlSpecProvider {

domains.add(record);
}

}

logger.info("Loaded {} domains", domains.size());
@@ -1,6 +1,9 @@
package nu.marginalia.io.crawldata;

import nu.marginalia.io.crawldata.format.ParquetSerializableCrawlDataStream;
import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.FileNotFoundException;
import java.io.IOException;
@@ -8,16 +11,23 @@ import java.nio.file.Files;
import java.nio.file.Path;

public class CrawledDomainReader {
private static final Logger logger = LoggerFactory.getLogger(CrawledDomainReader.class);

/** An iterator-like access to domain data This must be closed otherwise it will leak off-heap memory! */
public static SerializableCrawlDataStream createDataStream(Path fullPath) throws IOException
{

String fileName = fullPath.getFileName().toString();
if (fileName.endsWith(".parquet")) {
return new ParquetSerializableCrawlDataStream(fullPath);
}
else {
throw new IllegalArgumentException("Unknown file type: " + fullPath);
try {
return new ParquetSerializableCrawlDataStream(fullPath);
} catch (Exception ex) {
logger.error("Error reading domain data from " + fullPath, ex);
return SerializableCrawlDataStream.empty();
}
} else {
logger.error("Unknown file type: {}", fullPath);
return SerializableCrawlDataStream.empty();
}
}

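Since the comment above warns that the stream must be closed to avoid leaking off-heap memory, callers would normally wrap it in try-with-resources. A minimal sketch, assuming SerializableCrawlDataStream is AutoCloseable (the iteration API itself is not shown in this diff):

    try (var stream = CrawledDomainReader.createDataStream(path)) {
        // consume the crawl data here; with the new error handling an unreadable
        // parquet file now yields SerializableCrawlDataStream.empty() instead of an exception
    }
    catch (Exception ex) {
        // createDataStream still declares IOException; close() may throw as well
    }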
@@ -12,13 +12,10 @@

# Cloud Yuqu LLC
172.247.0.0/16

107.151.64.0/18

# Google Cloud
# 35.208.0.0/12
# 35.224.0.0/12
# 35.240.0.0/13

# 1Blu
178.254.10.0/23
178.254.10.0/23

# Domain parking spam
199.59.243.0/24
@@ -3,12 +3,13 @@ package nu.marginalia.crawling;
import lombok.SneakyThrows;
import nu.marginalia.crawl.retreival.RateLimitException;
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawling.body.ContentTypeLogic;
import nu.marginalia.crawling.body.DocumentBodyExtractor;
import nu.marginalia.crawling.body.DocumentBodyResult;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.ContentTypeLogic;
import nu.marginalia.model.body.DocumentBodyExtractor;
import nu.marginalia.model.body.DocumentBodyResult;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;

@@ -35,7 +36,7 @@ class HttpFetcherTest {
void fetchUTF8() throws URISyntaxException, RateLimitException, IOException {
var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
try (var recorder = new WarcRecorder()) {
var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, ContentTags.empty());
var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, ContentTags.empty(), HttpFetcher.ProbeType.FULL);
if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) {
System.out.println(bodyOk.contentType());
}
@@ -47,7 +48,7 @@
var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");

try (var recorder = new WarcRecorder()) {
var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), recorder, ContentTags.empty());
var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), recorder, ContentTags.empty(), HttpFetcher.ProbeType.FULL);
if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) {
System.out.println(bodyOk.contentType());
}
|
@@ -122,7 +122,7 @@ public class CrawlerMockFetcherTest {

@SneakyThrows
@Override
public HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, ContentTags tags) {
public HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, ContentTags tags, ProbeType probeType) {
logger.info("Fetching {}", url);
if (mockData.containsKey(url)) {
byte[] bodyBytes = mockData.get(url).documentBody.getBytes();
|
@@ -261,6 +261,7 @@ class CrawlerRetreiverTest {
.collect(Collectors.toSet());

assertEquals(Set.of("https://www.marginalia.nu/",
"https://www.marginalia.nu/favicon.ico",
"https://www.marginalia.nu/log/06-optimization.gmi/"),
fetchedUrls);
|
@@ -12,6 +12,7 @@ import nu.marginalia.bbpc.BrailleBlockPunchCards;
import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.index.query.limit.QueryLimits;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.search.command.SearchParameters;
import nu.marginalia.search.model.ClusteredUrlDetails;
@@ -162,7 +163,7 @@ public class SearchOperator {
return new UrlDetails(
item.documentId(),
item.domainId(),
item.url,
cleanUrl(item.url),
item.title,
item.description,
item.format,
@@ -177,6 +178,31 @@
);
}

/** Replace nuisance domains with replacements where available */
private static EdgeUrl cleanUrl(EdgeUrl url) {
String topdomain = url.domain.topDomain;
String subdomain = url.domain.subDomain;
String path = url.path;

if (topdomain.equals("fandom.com")) {
int wikiIndex = path.indexOf("/wiki/");
if (wikiIndex >= 0) {
return new EdgeUrl("https", new EdgeDomain("breezewiki.com"), null, "/" + subdomain + path.substring(wikiIndex), null);
}
}
else if (topdomain.equals("medium.com")) {
if (!subdomain.isBlank()) {
return new EdgeUrl("https", new EdgeDomain("scribe.rip"), null, path, null);
}
else {
String article = path.substring(path.indexOf("/", 1));
return new EdgeUrl("https", new EdgeDomain("scribe.rip"), null, article, null);
}

}
return url;
}

@SneakyThrows
private List<String> getProblems(String evalResult, List<UrlDetails> queryResults, QueryResponse response) {
|
@@ -54,6 +54,7 @@ dependencies {
implementation libs.handlebars

implementation libs.duckdb
implementation libs.jsoup

implementation libs.trove
implementation dependencies.create(libs.spark.get()) {
|
@@ -2,16 +2,18 @@ package nu.marginalia.control;

import com.google.gson.Gson;
import com.google.inject.Inject;
import nu.marginalia.service.ServiceMonitors;
import nu.marginalia.control.actor.ControlActorService;
import nu.marginalia.control.app.svc.*;
import nu.marginalia.control.node.svc.ControlNodeActionsService;
import nu.marginalia.control.node.svc.ControlFileStorageService;
import nu.marginalia.control.node.svc.ControlNodeActionsService;
import nu.marginalia.control.node.svc.ControlNodeService;
import nu.marginalia.control.sys.svc.*;
import nu.marginalia.model.gson.GsonFactory;
import nu.marginalia.screenshot.ScreenshotService;
import nu.marginalia.service.server.*;
import nu.marginalia.service.ServiceMonitors;
import nu.marginalia.service.server.BaseServiceParams;
import nu.marginalia.service.server.Service;
import nu.marginalia.service.server.StaticResources;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Request;
@@ -19,7 +21,7 @@ import spark.Response;
import spark.Spark;

import java.io.IOException;
import java.util.*;
import java.util.Map;

public class ControlService extends Service {

@@ -56,6 +58,7 @@ public class ControlService extends Service {
ControlDomainRankingSetsService controlDomainRankingSetsService,
ControlActorService controlActorService,
AbortedProcessService abortedProcessService,
DomainsManagementService domainsManagementService,
ControlErrorHandler errorHandler
) throws IOException {

@@ -84,6 +87,7 @@ public class ControlService extends Service {
apiKeyService.register();
domainComplaintService.register();
randomExplorationService.register();
domainsManagementService.register();

errorHandler.register();

@@ -0,0 +1,40 @@
package nu.marginalia.control.app.model;

public record DomainModel(int id,
String name,
String ip,
int nodeAffinity,
double rank,
boolean blacklisted) {

public boolean isUnassigned() {
return nodeAffinity < 0;
}

public DomainAffinityState getAffinityState() {
if (nodeAffinity < 0) {
return DomainAffinityState.Known;
}
else if (nodeAffinity == 0) {
return DomainAffinityState.Scheduled;
}
else {
return DomainAffinityState.Assigned;
}
}

public enum DomainAffinityState {
Assigned("The domain has been assigned to a node."),
Scheduled("The domain will be assigned to the next crawling node."),
Known("The domain is known but not yet scheduled for crawling.");

private final String desc;
DomainAffinityState(String desc) {
this.desc = desc;
}

public String getDesc() {
return desc;
}
}
}
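For reference, the nodeAffinity convention the record encodes: negative values mean the domain is merely known, zero means it is scheduled for the next crawl, and positive values name the owning node. A small illustrative sketch (hypothetical values, not part of the commit):

    var unassigned = new DomainModel(1, "example.com", "127.0.0.1", -1, 0.5, false);
    // unassigned.isUnassigned() == true
    // unassigned.getAffinityState() == DomainModel.DomainAffinityState.Known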
@@ -0,0 +1,26 @@
package nu.marginalia.control.app.model;

import java.util.List;
import java.util.Map;

public record DomainSearchResultModel(String query,
String affinity,
String field,
Map<String, Boolean> selectedAffinity,
Map<String, Boolean> selectedField,
int page,
boolean hasNext,
boolean hasPrevious,
List<Integer> nodes,
List<DomainModel> results)
{
public Integer getNextPage() {
if (!hasNext) return null;
return page + 1;
}

public Integer getPreviousPage() {
if (!hasPrevious) return null;
return page - 1;
}
}
@@ -0,0 +1,310 @@
package nu.marginalia.control.app.svc;

import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.control.ControlRendererFactory;
import nu.marginalia.control.Redirects;
import nu.marginalia.control.app.model.DomainModel;
import nu.marginalia.control.app.model.DomainSearchResultModel;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.nodecfg.NodeConfigurationService;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import spark.Request;
import spark.Response;
import spark.Spark;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.sql.SQLException;
import java.util.*;

public class DomainsManagementService {

private final HikariDataSource dataSource;
private final NodeConfigurationService nodeConfigurationService;
private final ControlRendererFactory rendererFactory;

@Inject
public DomainsManagementService(HikariDataSource dataSource,
NodeConfigurationService nodeConfigurationService,
ControlRendererFactory rendererFactory
) {
this.dataSource = dataSource;
this.nodeConfigurationService = nodeConfigurationService;
this.rendererFactory = rendererFactory;
}

public void register() throws IOException {

var domainsViewRenderer = rendererFactory.renderer("control/app/domains");
var addDomainsTxtViewRenderer = rendererFactory.renderer("control/app/domains-new");
var addDomainsUrlViewRenderer = rendererFactory.renderer("control/app/domains-new-url");
var addDomainsAfterReportRenderer = rendererFactory.renderer("control/app/domains-new-report");

Spark.get("/domain", this::getDomains, domainsViewRenderer::render);
Spark.get("/domain/new", this::addDomainsTextfield, addDomainsTxtViewRenderer::render);
Spark.post("/domain/new", this::addDomainsTextfield, addDomainsAfterReportRenderer::render);
Spark.get("/domain/new-url", this::addDomainsFromDownload, addDomainsUrlViewRenderer::render);
Spark.post("/domain/new-url", this::addDomainsFromDownload, addDomainsAfterReportRenderer::render);
Spark.post("/domain/:id/assign/:node", this::assignDomain, new Redirects.HtmlRedirect("/domain"));

}

private Object addDomainsTextfield(Request request, Response response) throws SQLException {
if ("GET".equals(request.requestMethod())) {
return "";
}
else if ("POST".equals(request.requestMethod())) {
String nodeStr = request.queryParams("node");
String domainsStr = request.queryParams("domains");

int node = Integer.parseInt(nodeStr);

List<EdgeDomain> validDomains;
List<String> invalidDomains;

Map.Entry<List<EdgeDomain>, List<String>> domainsList = parseDomainsList(domainsStr);

validDomains = domainsList.getKey();
invalidDomains = domainsList.getValue();

insertDomains(validDomains, node);

return Map.of("validDomains", validDomains,
"invalidDomains", invalidDomains);
}
return "";
}

private Map.Entry<List<EdgeDomain>, List<String>> parseDomainsList(String domainsStr) {
List<EdgeDomain> validDomains = new ArrayList<>();
List<String> invalidDomains = new ArrayList<>();

for (String domain : domainsStr.split("\n+")) {
domain = domain.trim();
if (domain.isBlank()) continue;
if (domain.length() > 255) {
invalidDomains.add(domain);
continue;
}
if (domain.startsWith("#")) {
continue;
}

// Run through the URI parser to check for bad domains
try {
if (domain.contains(":")) {
domain = new URI(domain ).toURL().getHost();
}
else {
domain = new URI("https://" + domain + "/").toURL().getHost();
}
} catch (URISyntaxException | MalformedURLException e) {
invalidDomains.add(domain);
continue;
}

validDomains.add(new EdgeDomain(domain));
}

return Map.entry(validDomains, invalidDomains);
}

private Object addDomainsFromDownload(Request request, Response response) throws SQLException, URISyntaxException, IOException, InterruptedException {
if ("GET".equals(request.requestMethod())) {
return "";
}
else if ("POST".equals(request.requestMethod())) {
String nodeStr = request.queryParams("node");
URI domainsUrl = new URI(request.queryParams("url"));

int node = Integer.parseInt(nodeStr);

HttpClient client = HttpClient.newBuilder().build();
var httpReq = HttpRequest.newBuilder(domainsUrl).GET().build();


HttpResponse<String> result = client.send(httpReq, HttpResponse.BodyHandlers.ofString());
if (result.statusCode() != 200) {
return Map.of("error", "Failed to download domains");
}
Optional<String> ct = result.headers().firstValue("Content-Type");
if (ct.isEmpty()) {
return Map.of("error", "No content type");
}

List<EdgeDomain> validDomains = new ArrayList<>();
List<String> invalidDomains = new ArrayList<>();

String contentType = ct.get().toLowerCase();

if (contentType.startsWith("text/plain")) {
var parsedDomains = parseDomainsList(result.body());
validDomains = parsedDomains.getKey();
invalidDomains = parsedDomains.getValue();
}
else {
for (Element e : Jsoup.parse(result.body()).select("a")) {
String s = e.attr("href");
if (s.isBlank()) continue;
if (!s.contains("://")) continue;

URI uri = URI.create(s);
String scheme = uri.getScheme();
String host = uri.getHost();

if (scheme == null || host == null)
continue;
if (!scheme.equalsIgnoreCase("http") && !scheme.equalsIgnoreCase("https"))
continue;

validDomains.add(new EdgeDomain(host));
}
}


insertDomains(validDomains, node);


return Map.of("validDomains", validDomains,
"invalidDomains", invalidDomains);
}
return "";
}

private void insertDomains(List<EdgeDomain> domains, int node) throws SQLException {

// Insert the domains into the database, updating the node affinity if the domain already exists and the affinity is not already set to a node
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("""
INSERT INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY)
VALUES (?, ?, ?)
ON DUPLICATE KEY UPDATE NODE_AFFINITY = IF(NODE_AFFINITY<=0, VALUES(NODE_AFFINITY), NODE_AFFINITY)
"""))
{
for (var domain : domains) {
stmt.setString(1, domain.toString());
stmt.setString(2, domain.getTopDomain());
stmt.setInt(3, node);
stmt.addBatch();
}
stmt.executeBatch();
}
}


private Object assignDomain(Request request, Response response) throws SQLException {

String idStr = request.params(":id");
String nodeStr = request.params(":node");

int id = Integer.parseInt(idStr);
int node = Integer.parseInt(nodeStr);

try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET NODE_AFFINITY = ? WHERE ID = ?"))
{
stmt.setInt(1, node);
stmt.setInt(2, id);
stmt.executeUpdate();
}

return "";
}

private DomainSearchResultModel getDomains(Request request, Response response) throws SQLException {
List<DomainModel> ret = new ArrayList<>();

String filterRaw = Objects.requireNonNullElse(request.queryParams("filter"), "*");

String filter;
if (filterRaw.isBlank()) filter = "%";
else filter = filterRaw.replace('*', '%');

int page = Integer.parseInt(Objects.requireNonNullElse(request.queryParams("page"), "0"));
boolean hasMore = false;
int count = 10;

String field = Objects.requireNonNullElse(request.queryParams("field"), "domain");
Map<String, Boolean> selectedField = Map.of(field, true);

String affinity = Objects.requireNonNullElse(request.queryParams("affinity"), "all");
Map<String, Boolean> selectedAffinity = Map.of(affinity, true);

StringJoiner queryJoiner = new StringJoiner(" ");
queryJoiner.add("""
SELECT EC_DOMAIN.ID,
DOMAIN_NAME,
NODE_AFFINITY,
`RANK`,
IP,
EC_DOMAIN_BLACKLIST.URL_DOMAIN IS NOT NULL AS BLACKLISTED
FROM WMSA_prod.EC_DOMAIN
LEFT JOIN WMSA_prod.EC_DOMAIN_BLACKLIST ON DOMAIN_NAME = EC_DOMAIN_BLACKLIST.URL_DOMAIN
""")
.add((switch (field) {
case "domain" -> "WHERE DOMAIN_NAME LIKE ?";
case "ip" -> "WHERE IP LIKE ?";
case "id" -> "WHERE EC_DOMAIN.ID = ?";
default -> "WHERE DOMAIN_NAME LIKE ?";
}))
.add((switch (affinity) {
case "assigned" -> "AND NODE_AFFINITY > 0";
case "scheduled" -> "AND NODE_AFFINITY = 0";
case "unassigned" -> "AND NODE_AFFINITY < 0";
default -> "";
}))
.add("LIMIT ?")
.add("OFFSET ?");


try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement(queryJoiner.toString()))
{
stmt.setString(1, filter);
stmt.setInt(2, count + 1);
stmt.setInt(3, count * page);

try (var rs = stmt.executeQuery()) {
while (rs.next()) {
if (ret.size() == count) {
hasMore = true;
break;
}
ret.add(new DomainModel(
rs.getInt("ID"),
rs.getString("DOMAIN_NAME"),
rs.getString("IP"),
rs.getInt("NODE_AFFINITY"),
Math.round(100 * rs.getDouble("RANK"))/100.,
rs.getBoolean("BLACKLISTED")
));
}
}
}

List<Integer> nodes = new ArrayList<>();

for (var node : nodeConfigurationService.getAll()) {
nodes.add(node.node());
}

return new DomainSearchResultModel(filterRaw,
affinity,
field,
selectedAffinity,
selectedField,
page,
hasMore,
page > 0,
nodes,
ret);
}

}
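For a concrete sense of what getDomains builds, with field = "domain" and affinity = "assigned" the StringJoiner above assembles roughly the following statement (illustrative, whitespace normalised):

    String sql = """
            SELECT EC_DOMAIN.ID, DOMAIN_NAME, NODE_AFFINITY, `RANK`, IP,
                   EC_DOMAIN_BLACKLIST.URL_DOMAIN IS NOT NULL AS BLACKLISTED
            FROM WMSA_prod.EC_DOMAIN
            LEFT JOIN WMSA_prod.EC_DOMAIN_BLACKLIST ON DOMAIN_NAME = EC_DOMAIN_BLACKLIST.URL_DOMAIN
            WHERE DOMAIN_NAME LIKE ? AND NODE_AFFINITY > 0 LIMIT ? OFFSET ?
            """;
    // The three parameters are then bound as the filter pattern, count + 1, and count * page.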
@@ -0,0 +1,41 @@
<!DOCTYPE html>
<html lang="en-US">
<head>
<title>Control Service</title>
{{> control/partials/head-includes }}
</head>
<body>
{{> control/partials/nav}}
<div class="container">
<h1 class="my-3">Add Domains Report</h1>

<p></p>
{{#if error}}
<p class="alert alert-danger">{{error}}</p>
{{/if}}
{{#unless errror}}
{{#unless invalidDomains}}
<p>All domains were added successfully!</p>
{{/unless}}
{{/unless}}
{{#if invalidDomains}}
<p>Some domains were invalid and could not be added:</p>
<textarea class="form-control" rows="10" disabled>
{{#each invalidDomains}}
{{.}}
{{/each}}
</textarea>
{{/if}}
{{#if validDomains}}
<p>If they were not already in the database, these domains were added:</p>
<textarea class="form-control" rows="10" disabled>
{{#each validDomains}}
{{.}}
{{/each}}
</textarea>
{{/if}}
<p></p>
</div>
</body>
{{> control/partials/foot-includes }}
</html>
@@ -0,0 +1,48 @@
<!DOCTYPE html>
<html lang="en-US">
<head>
<title>Control Service</title>
{{> control/partials/head-includes }}
</head>
<body>
{{> control/partials/nav}}
<div class="container">
<h1 class="my-3">Add Domains (URL)</h1>

<div class="my-3 p-3 border bg-light">
<p>This utility lets you add domains to be crawled via an external URL.</p>
<a href="/domain/new">It's also possible to add domains directly via a text area</a>
</div>

<form method="post">
<div class="form-group my-3">
<label for="url" class="form-label">Domains to add</label>
<input type="text" class="form-control" name="url"/>
<span class="text-muted">
Enter the URL to the file or page that contains the domains to add. If the URL leads to a text file,
the domains will be parsed from the file, one per line. If it leads to a HTML page, the HTML
will be parsed and all the links will be extracted and added as domains.
</span>
</div>

<div class="form-group my-3">
<label for="node" class="form-label">Node</label>
<select name="node" class="form-select">
<option value="-1">Unassigned</option>
<option value="0" selected>Auto</option>
{{#each global-context.nodes}}
<option value="{{id}}">Node {{id}}</option>
{{/each}}

</select>
<span class="text-muted">
Select the node to assign the domains to, this is the index node that will "own" the domain, crawl its documents
and index dem. If you select "Auto", the system will assign the domains to the next node that performs a crawl.
</span>
</div>
<button type="submit" class="btn btn-primary">Add</button>
</form>
</div>
</body>
{{> control/partials/foot-includes }}
</html>
@@ -0,0 +1,47 @@
<!DOCTYPE html>
<html lang="en-US">
<head>
<title>Control Service</title>
{{> control/partials/head-includes }}
</head>
<body>
{{> control/partials/nav}}
<div class="container">
<h1 class="my-3">Add Domains</h1>

<div class="my-3 p-3 border bg-light">
<p>This utility lets you add domains to be crawled via a text area.</p>
<a href="/domain/new-url">It's also possible to add domains via an external URL</a>
</div>

<form method="post">
<div class="form-group my-3">
<label for="domains" class="form-label">Domains to add</label>
<textarea name="domains" class="form-control" rows="10"></textarea>
<span class="text-muted">
Enter a list of domains to add, one per line. The system will check if the domain is already in the database and
will not add duplicates. Spaces and empty lines are ignored.
</span>
</div>

<div class="form-group my-3">
<label for="node" class="form-label">Node</label>
<select name="node" class="form-select">
<option value="-1">Unassigned</option>
<option value="0" selected>Auto</option>
{{#each global-context.nodes}}
<option value="{{id}}">Node {{id}}</option>
{{/each}}

</select>
<span class="text-muted">
Select the node to assign the domains to, this is the index node that will "own" the domain, crawl its documents
and index dem. If you select "Auto", the system will assign the domains to the next node that performs a crawl.
</span>
</div>
<button type="submit" class="btn btn-primary">Add</button>
</form>
</div>
</body>
{{> control/partials/foot-includes }}
</html>
@@ -0,0 +1,109 @@
<!DOCTYPE html>
<html lang="en-US">
<head>
<title>Control Service</title>
{{> control/partials/head-includes }}
</head>
<body>
{{> control/partials/nav}}
<div class="container">
<h1 class="my-3">Domains</h1>

<table class="table">
<form method="get">
<tr>
<td>
<select name="field" class="form-select" aria-label="Select Field">
<option value="domain" {{#if selectedField.domain}}selected{{/if}}>Domain Name</option>
<option value="id" {{#if selectedField.id}}selected{{/if}}>Domain ID</option>
<option value="ip" {{#if selectedField.ip}}selected{{/if}}>IP</option>
</select>
</td>
<td colspan="3"><input type="text" name="filter" class="form-control" placeholder="Domain" value="{{query}}"></td>
<td>
<select name="affinity" class="form-select" aria-label="Select Node Affinity">
<option value="all" {{#if selectedAffinity.all}}selected{{/if}}>-</option>
<option value="unassigned" {{#if selectedAffinity.unassigned}}selected{{/if}}>Unassigned</option>
<option value="scheduled" {{#if selectedAffinity.scheduled}}selected{{/if}}>Scheduled</option>
<option value="assigned" {{#if selectedAffinity.assigned}}selected{{/if}}>Assigned</option>
</select>
</td>
<td><button type="submit" class="btn btn-primary">Search</button></td>
</tr>
</form>
<tr>
<th>Domain</th>
<th>ID</th>
<th title="Which, if any, index node owns a domain and will crawl and index it">Node Affinity</th>
<th>Rank</th>
<th>IP</th>
<th>Blacklisted</th>
</tr>
{{#each results}}
<tr>
<td>{{name}}</td>
<td>{{id}}</td>
<td title="{{affinityState.desc}}">{{#unless unassigned}}{{affinityState}} {{#if nodeAffinity}}{{nodeAffinity}}{{/if}} {{/unless}}
{{#if unassigned}}
<div class="dropdown">
<button title="Assign to a node" class="btn btn-secondary dropdown-toggle" type="button" id="dropdownMenuButton1" data-bs-toggle="dropdown" aria-expanded="false">
Unassigned
</button>
<ul class="dropdown-menu" aria-labelledby="dropdownMenuButton1">
<form method="post">
<input type="hidden" name="node" value="0">
<li>
<button
class="dropdown-item"
title="Assign to the next node that performs a crawl"
formaction="/domain/{{id}}/assign/0"
type="submit">
Any
</button>
</li>

{{#each nodes}}
<input type="hidden" name="node" value="{{.}}">
<li>
<button
class="dropdown-item"
title="Assign to node {{.}}"
formaction="/domain/{{id}}/assign/{{.}}"
type="submit">
Node {{.}}
</button>
</li>
{{/each}}
</form>
</ul>
</div>
{{/if}}
</td>
<td>{{rank}}</td>
<td>{{ip}}</td>
<td>{{#if blacklisted}}✓{{/if}}</td>
</tr>
{{/each}}
{{#unless results}}
<tr>
<td colspan="5">No results found</td>
</tr>
{{/unless}}
<tr>
<td>
{{#if hasPrevious}}
<a href="?page={{previousPage}}&filter={{query}}&field={{field}}&affinity={{affinity}}">Previous</a>
{{/if}}
</td>
<td colspan="4"></td>
<td>
{{#if hasNext}}
<a href="?page={{nextPage}}&filter={{query}}&field={{field}}&affinity={{affinity}}">Next</a>
{{/if}}
</td>
</tr>
</table>
</div>
</body>
{{> control/partials/foot-includes }}
</html>
@@ -1,5 +1,4 @@
<script src="https://cdn.jsdelivr.net/npm/@popperjs/core@2.11.8/dist/umd/popper.min.js" integrity="sha384-I7E8VVD/ismYTF4hNIPjVp/Zjvgyol6VFvRkX/vR+Vc4jQkC+hVqc2pM8ODewa9r" crossorigin="anonymous"></script>
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/js/bootstrap.min.js" integrity="sha384-BBtl+eGJRgqQAUMxJ7pMwbEyER4l1g+O15P+16Ep7Q9Q+zqX6gSbd85u4mG4QzX+" crossorigin="anonymous"></script>
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.0.2/dist/js/bootstrap.bundle.min.js" integrity="sha384-MrcW6ZMFYlzcLA8Nl+NtUVF0sA7MsXsP1UyJoMp4YLEuNSfAP+JcXn/tWtIaxVXM" crossorigin="anonymous"></script>
<script src="https://ajax.googleapis.com/ajax/libs/jquery/2.1.1/jquery.min.js"></script>
<script src="/refresh.js"></script>
<script type="javascript">
@@ -16,13 +16,21 @@
<a href="#" class="nav-link dropdown-toggle" data-bs-toggle="dropdown" role="button" aria-expanded="false">Application</a>
<ul class="dropdown-menu">
<li><a class="dropdown-item" href="/api-keys" title="Create or remove API keys">API Keys</a></li>
<li><a class="dropdown-item" href="/blacklist" title="Add or remove website sanctions">Blacklist</a></li>
<li><a class="dropdown-item" href="/search-to-ban" title="Search function for easy blacklisting">Blacklist Search</a></li>
<li><a class="dropdown-item" href="/complaints" title="View and act on user complaints">Complaints</a></li>
<li><a class="dropdown-item" href="/review-random-domains" title="Review random domains list">Random Exploration</a></li>
</ul>
</li>
{{/unless}}
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" data-bs-toggle="dropdown" role="button" aria-expanded="false">Domains</a>
<ul class="dropdown-menu">
<li><a class="dropdown-item" href="/domain/new" title="Add New Domains">Add Domains</a></li>
<li><a class="dropdown-item" href="/domain" title="List Domains">Manage Domains</a></li>
<li><hr class="dropdown-divider"></li>
<li><a class="dropdown-item" href="/blacklist" title="Add or remove website sanctions">Blacklist</a></li>
<li><a class="dropdown-item" href="/search-to-ban" title="Search function for easy blacklisting">Blacklist Search</a></li>
</ul>
</li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" data-bs-toggle="dropdown" role="button" aria-expanded="false">Index Nodes</a>
<ul class="dropdown-menu">
@@ -16,6 +16,7 @@ import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.time.Duration;
import java.util.ArrayList;
@@ -34,7 +35,7 @@ public class ScreenshotCaptureToolMain {

System.setProperty(ChromeDriverService.CHROME_DRIVER_SILENT_OUTPUT_PROPERTY, "true");

List<EdgeDomain> crawlQueue = fetchCrawlQueue(ds, 1000);
List<EdgeDomain> crawlQueue = fetchCrawlQueue(ds, 10_000);

HttpClient httpClient = HttpClient.newBuilder()
.version(HttpClient.Version.HTTP_1_1)
@@ -137,16 +138,33 @@ public class ScreenshotCaptureToolMain {
List<EdgeDomain> ret = new ArrayList<>(queueSize);

try (var conn = ds.getConnection(); var stmt = conn.createStatement()) {
var rsp = stmt.executeQuery(
int newCount = queueSize / 4;
int oldCount = queueSize - newCount;

ResultSet rst = stmt.executeQuery(
"""
SELECT EC_DOMAIN.DOMAIN_NAME FROM EC_DOMAIN
LEFT JOIN DATA_DOMAIN_HISTORY ON EC_DOMAIN.DOMAIN_NAME=DATA_DOMAIN_HISTORY.DOMAIN_NAME
ORDER BY SCREENSHOT_DATE IS NULL DESC, SCREENSHOT_DATE, INDEXED DESC
LIMIT
""" + queueSize);
while (rsp.next()) {
ret.add(new EdgeDomain(rsp.getString(1)));
""" + newCount);
while (rst.next()) {
ret.add(new EdgeDomain(rst.getString(1)));
}

rst = stmt.executeQuery("""
SELECT DATA_DOMAIN_HISTORY.DOMAIN_NAME FROM DATA_DOMAIN_HISTORY
INNER JOIN DATA_DOMAIN_SCREENSHOT ON DATA_DOMAIN_SCREENSHOT.DOMAIN_NAME = DATA_DOMAIN_HISTORY.DOMAIN_NAME
WHERE SCREENSHOT_DATE IS NOT NULL
ORDER BY SCREENSHOT_DATE ASC
LIMIT
""" + oldCount);

while (rst.next()) {
ret.add(new EdgeDomain(rst.getString(1)));
}


}
catch (Exception ex) {
logger.warn("Exception in fetching queue", ex);
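The first query's ORDER BY puts never-screenshotted domains first, so the split above reserves roughly a quarter of the queue for those and the remainder for refreshing the oldest existing screenshots. A quick illustrative check of the arithmetic, using the new 10_000 queue size:

    int queueSize = 10_000;
    int newCount = queueSize / 4;          // 2_500 domains from the first query
    int oldCount = queueSize - newCount;   // 7_500 domains with existing screenshots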