diff --git a/code/processes/converting-process/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/java/nu/marginalia/converting/ConverterMain.java index 3a978972..c45967bc 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/ConverterMain.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/ConverterMain.java @@ -34,7 +34,6 @@ import org.apache.logging.log4j.util.Strings; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.sql.SQLException; @@ -201,7 +200,7 @@ public class ConverterMain extends ProcessMainClass { try { return Optional.of(CrawledDomainReader.createDataStream(path)); } - catch (IOException ex) { + catch (Exception ex) { return Optional.empty(); } } diff --git a/code/processes/converting-process/java/nu/marginalia/converting/sideload/reddit/RedditSideloader.java b/code/processes/converting-process/java/nu/marginalia/converting/sideload/reddit/RedditSideloader.java index e46d8d4d..c2536f77 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/sideload/reddit/RedditSideloader.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/sideload/reddit/RedditSideloader.java @@ -151,9 +151,9 @@ public class RedditSideloader implements SideloadSource { var doc = sideloaderProcessing .processDocument(fullUrl, fullHtml, - List.of("encyclopedia", "wiki"), + List.of("reddit"), domainLinks, - GeneratorType.WIKI, + GeneratorType.FORUM, DocumentClass.SIDELOAD, anchorTextKeywords.getAnchorTextKeywords(domainLinks, urls), pubYear, diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index 7a4ebdc8..09a82367 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -9,6 +9,9 @@ import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.crawl.retreival.revisit.CrawlerRevisitor; import nu.marginalia.crawl.retreival.revisit.DocumentWithReference; import nu.marginalia.crawl.retreival.sitemap.SitemapFetcher; +import nu.marginalia.crawling.body.HttpFetchResult; +import nu.marginalia.crawling.model.CrawledDomain; +import nu.marginalia.crawling.model.CrawlerDomainStatus; import nu.marginalia.ip_blocklist.UrlBlocklist; import nu.marginalia.link_parser.LinkParser; import nu.marginalia.model.EdgeDomain; @@ -28,6 +31,7 @@ import java.nio.file.Path; import java.util.ArrayList; import java.util.List; import java.util.Objects; +import java.util.Optional; import java.util.concurrent.TimeUnit; public class CrawlerRetreiver implements AutoCloseable { @@ -88,17 +92,8 @@ public class CrawlerRetreiver implements AutoCloseable { } public int fetch(DomainLinks domainLinks, CrawlDataReference oldCrawlData) { - final DomainProber.ProbeResult probeResult = domainProber.probeDomain( - fetcher, - domain, - new EdgeUrl("http", new EdgeDomain(domain), null, "/", null)); - try { - // Sleep a bit to avoid hammering the server with requests, we just probed it - TimeUnit.SECONDS.sleep(1); - - // Fetch the domain - return crawlDomain(oldCrawlData, probeResult, domainLinks); + return crawlDomain(oldCrawlData, domainLinks); } catch (Exception ex) { logger.error("Error crawling domain {}", domain, ex); @@ 
-112,25 +107,33 @@ public class CrawlerRetreiver implements AutoCloseable { resync.run(warcFile); } - private int crawlDomain(CrawlDataReference oldCrawlData, DomainProber.ProbeResult probeResult, DomainLinks domainLinks) throws IOException, InterruptedException { - String ip = findIp(domain); - EdgeUrl rootUrl; + private DomainProber.ProbeResult probeRootUrl(String ip) throws IOException { + // Construct an URL to the root of the domain, we don't know the schema yet so we'll + // start with http and then try https if that fails + var httpUrl = new EdgeUrl("http", new EdgeDomain(domain), null, "/", null); + final DomainProber.ProbeResult probeResult = domainProber.probeDomain(fetcher, domain, httpUrl); warcRecorder.writeWarcinfoHeader(ip, new EdgeDomain(domain), probeResult); - if (!(probeResult instanceof DomainProber.ProbeResultOk ok)) { - return 1; - } - else { - rootUrl = ok.probedUrl(); - } + return probeResult; + } + + private int crawlDomain(CrawlDataReference oldCrawlData, DomainLinks domainLinks) throws IOException, InterruptedException { + String ip = findIp(domain); + EdgeUrl rootUrl; + + if (probeRootUrl(ip) instanceof DomainProber.ProbeResultOk ok) rootUrl = ok.probedUrl(); + else return 1; + + // Sleep after the initial probe, we don't have access to the robots.txt yet + // so we don't know the crawl delay + TimeUnit.SECONDS.sleep(1); final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(rootUrl.domain, warcRecorder); final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay()); delayTimer.waitFetchDelay(0); // initial delay after robots.txt sniffRootDocument(rootUrl, delayTimer); - delayTimer.waitFetchDelay(0); // delay after sniffing // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified int recrawled = crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer); @@ -188,7 +191,7 @@ public class CrawlerRetreiver implements AutoCloseable { try { - if (fetchWriteAndSleep(top, delayTimer, DocumentWithReference.empty()).isOk()) { + if (fetchContentWithReference(top, delayTimer, DocumentWithReference.empty()).isOk()) { fetchedCount++; } } @@ -209,21 +212,8 @@ public class CrawlerRetreiver implements AutoCloseable { var url = rootUrl.withPathAndParam("/", null); - HttpFetchResult result = null; - - for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) { - try { - result = fetcher.fetchContent(url, warcRecorder, ContentTags.empty()); - break; - } - catch (RateLimitException ex) { - timer.waitRetryDelay(ex); - } - catch (Exception ex) { - logger.warn("Failed to fetch {}", url, ex); - result = new HttpFetchResult.ResultException(ex); - } - } + HttpFetchResult result = fetchWithRetry(url, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty()); + timer.waitFetchDelay(0); if (!(result instanceof HttpFetchResult.ResultOk ok)) return; @@ -236,24 +226,40 @@ public class CrawlerRetreiver implements AutoCloseable { var doc = optDoc.get(); crawlFrontier.setLinkFilter(linkFilterSelector.selectFilter(doc)); + EdgeUrl faviconUrl = url.withPathAndParam("/favicon.ico", null); + Optional sitemapUrl = Optional.empty(); + for (var link : doc.getElementsByTag("link")) { String rel = link.attr("rel"); String type = link.attr("type"); - if (!rel.equalsIgnoreCase("alternate")) - continue; + if (rel.equals("icon") || rel.equals("shortcut icon")) { + String href = link.attr("href"); - if (!(type.equalsIgnoreCase("application/atom+xml") - || type.equalsIgnoreCase("application/rss+xml"))) - continue; + faviconUrl = 
linkParser.parseLink(url, href) + .filter(crawlFrontier::isSameDomain) + .orElse(faviconUrl); + } - String href = link.attr("href"); + // Grab the RSS/Atom as a sitemap if it exists + if (rel.equalsIgnoreCase("alternate") + && (type.equalsIgnoreCase("application/atom+xml") || type.equalsIgnoreCase("application/atomsvc+xml"))) { + String href = link.attr("href"); - linkParser.parseLink(url, href) - .filter(crawlFrontier::isSameDomain) - .map(List::of) - .ifPresent(sitemapFetcher::downloadSitemaps); + sitemapUrl = linkParser.parseLink(url, href) + .filter(crawlFrontier::isSameDomain); + } } + + // Download the sitemap if available exists + if (sitemapUrl.isPresent()) { + sitemapFetcher.downloadSitemaps(List.of(sitemapUrl.get())); + timer.waitFetchDelay(0); + } + + // Grab the favicon if it exists + fetchWithRetry(faviconUrl, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty()); + timer.waitFetchDelay(0); } catch (Exception ex) { logger.error("Error configuring link filter", ex); @@ -263,31 +269,16 @@ public class CrawlerRetreiver implements AutoCloseable { } } - public HttpFetchResult fetchWriteAndSleep(EdgeUrl top, - CrawlDelayTimer timer, - DocumentWithReference reference) throws InterruptedException + public HttpFetchResult fetchContentWithReference(EdgeUrl top, + CrawlDelayTimer timer, + DocumentWithReference reference) throws InterruptedException { logger.debug("Fetching {}", top); - HttpFetchResult fetchedDoc = new HttpFetchResult.ResultNone(); - long startTime = System.currentTimeMillis(); var contentTags = reference.getContentTags(); - // Fetch the document, retrying if we get a rate limit exception - for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) { - try { - fetchedDoc = fetcher.fetchContent(top, warcRecorder, contentTags); - break; - } - catch (RateLimitException ex) { - timer.waitRetryDelay(ex); - } - catch (Exception ex) { - logger.warn("Failed to fetch {}", top, ex); - fetchedDoc = new HttpFetchResult.ResultException(ex); - } - } + HttpFetchResult fetchedDoc = fetchWithRetry(top, timer, HttpFetcher.ProbeType.FULL, contentTags); // Parse the document and enqueue links try { @@ -329,6 +320,27 @@ public class CrawlerRetreiver implements AutoCloseable { return fetchedDoc; } + /** Fetch a document and retry on 429s */ + private HttpFetchResult fetchWithRetry(EdgeUrl url, + CrawlDelayTimer timer, + HttpFetcher.ProbeType probeType, + ContentTags contentTags) throws InterruptedException { + for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) { + try { + return fetcher.fetchContent(url, warcRecorder, contentTags, probeType); + } + catch (RateLimitException ex) { + timer.waitRetryDelay(ex); + } + catch (Exception ex) { + logger.warn("Failed to fetch {}", url, ex); + return new HttpFetchResult.ResultException(ex); + } + } + + return new HttpFetchResult.ResultNone(); + } + private boolean isAllowedProtocol(String proto) { return proto.equalsIgnoreCase("http") || proto.equalsIgnoreCase("https"); diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java index a2015e8f..d0a8b075 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java @@ -4,6 +4,7 @@ import com.google.inject.ImplementedBy; import crawlercommons.robots.SimpleRobotRules; import nu.marginalia.crawl.retreival.RateLimitException; import 
nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; +import nu.marginalia.crawling.body.HttpFetchResult; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.body.HttpFetchResult; @@ -19,9 +20,18 @@ public interface HttpFetcher { FetchResult probeDomain(EdgeUrl url); - HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, ContentTags tags) throws RateLimitException; + HttpFetchResult fetchContent(EdgeUrl url, + WarcRecorder recorder, + ContentTags tags, + ProbeType probeType) throws RateLimitException; SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder); SitemapRetriever createSitemapRetriever(); + + enum ProbeType { + DISABLED, + FULL, + IF_MODIFIED_SINCE + } } diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java index f4be6b7f..49ac03bc 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java @@ -12,6 +12,9 @@ import nu.marginalia.crawl.retreival.fetcher.socket.FastTerminatingSocketFactory import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor; import nu.marginalia.crawl.retreival.fetcher.socket.NoSecuritySSL; import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; +import nu.marginalia.crawling.body.ContentTypeLogic; +import nu.marginalia.crawling.body.DocumentBodyExtractor; +import nu.marginalia.crawling.body.HttpFetchResult; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.body.ContentTypeLogic; @@ -145,12 +148,13 @@ public class HttpFetcherImpl implements HttpFetcher { @SneakyThrows public HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder warcRecorder, - ContentTags contentTags) + ContentTags contentTags, + ProbeType probeType) { // We don't want to waste time and resources on URLs that are not HTML, so if the file ending // looks like it might be something else, we perform a HEAD first to check the content type - if (contentTags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url)) + if (probeType == ProbeType.FULL && contentTags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url)) { ContentTypeProbeResult probeResult = contentTypeProber.probeContentType(url); if (probeResult instanceof ContentTypeProbeResult.Ok ok) { @@ -174,7 +178,9 @@ public class HttpFetcherImpl implements HttpFetcher { else { // Possibly do a soft probe to see if the URL has been modified since the last time we crawled it // if we have reason to suspect ETags are not supported by the server. 
- if (softIfModifiedSinceProber.probeModificationTime(url, contentTags)) { + if (probeType == ProbeType.IF_MODIFIED_SINCE + && softIfModifiedSinceProber.probeModificationTime(url, contentTags)) + { return new HttpFetchResult.Result304Raw(); } } diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java index f9310028..f5bb863e 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java @@ -137,7 +137,7 @@ public class CrawlerRevisitor { DocumentWithReference reference = new DocumentWithReference(doc, oldCrawlData); - var result = crawlerRetreiver.fetchWriteAndSleep(url, delayTimer, reference); + var result = crawlerRetreiver.fetchContentWithReference(url, delayTimer, reference); if (reference.isSame(result)) { retained++; diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/spec/DbCrawlSpecProvider.java b/code/processes/crawling-process/java/nu/marginalia/crawl/spec/DbCrawlSpecProvider.java index 6681df1a..b482182f 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/spec/DbCrawlSpecProvider.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/spec/DbCrawlSpecProvider.java @@ -46,22 +46,35 @@ public class DbCrawlSpecProvider implements CrawlSpecProvider { blacklist.waitUntilLoaded(); + List domainIds = new ArrayList<>(10_000); + try (var conn = dataSource.getConnection(); + var assignFreeDomains = conn.prepareStatement("UPDATE EC_DOMAIN SET NODE_AFFINITY=? WHERE NODE_AFFINITY=0"); var query = conn.prepareStatement(""" SELECT DOMAIN_NAME, COALESCE(KNOWN_URLS, 0), EC_DOMAIN.ID FROM EC_DOMAIN LEFT JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID - WHERE NODE_AFFINITY=? - """)) + WHERE NODE_AFFINITY=? OR NODE_AFFINITY=0 + """) + ) { + + // Assign any domains with node_affinity=0 to this node. We must do this now, before we start crawling + // to avoid race conditions with other crawl runs. We don't want multiple crawlers to crawl the same domain. 
+ assignFreeDomains.setInt(1, processConfiguration.node()); + assignFreeDomains.executeUpdate(); + + // Fetch the domains to be crawled query.setInt(1, processConfiguration.node()); query.setFetchSize(10_000); var rs = query.executeQuery(); while (rs.next()) { // Skip blacklisted domains - if (blacklist.isBlacklisted(rs.getInt(3))) + int id = rs.getInt(3); + if (blacklist.isBlacklisted(id)) continue; + domainIds.add(id); int urls = rs.getInt(2); double growthFactor; @@ -83,6 +96,7 @@ public class DbCrawlSpecProvider implements CrawlSpecProvider { domains.add(record); } + } logger.info("Loaded {} domains", domains.size()); diff --git a/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/CrawledDomainReader.java b/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/CrawledDomainReader.java index 272ebf3b..7588bbaa 100644 --- a/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/CrawledDomainReader.java +++ b/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/CrawledDomainReader.java @@ -1,6 +1,9 @@ package nu.marginalia.io.crawldata; import nu.marginalia.io.crawldata.format.ParquetSerializableCrawlDataStream; +import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.FileNotFoundException; import java.io.IOException; @@ -8,16 +11,23 @@ import java.nio.file.Files; import java.nio.file.Path; public class CrawledDomainReader { + private static final Logger logger = LoggerFactory.getLogger(CrawledDomainReader.class); /** An iterator-like access to domain data This must be closed otherwise it will leak off-heap memory! */ public static SerializableCrawlDataStream createDataStream(Path fullPath) throws IOException { + String fileName = fullPath.getFileName().toString(); if (fileName.endsWith(".parquet")) { - return new ParquetSerializableCrawlDataStream(fullPath); - } - else { - throw new IllegalArgumentException("Unknown file type: " + fullPath); + try { + return new ParquetSerializableCrawlDataStream(fullPath); + } catch (Exception ex) { + logger.error("Error reading domain data from " + fullPath, ex); + return SerializableCrawlDataStream.empty(); + } + } else { + logger.error("Unknown file type: {}", fullPath); + return SerializableCrawlDataStream.empty(); } } diff --git a/code/processes/crawling-process/resources/ip-banned-cidr.txt b/code/processes/crawling-process/resources/ip-banned-cidr.txt index cd25413a..12418411 100644 --- a/code/processes/crawling-process/resources/ip-banned-cidr.txt +++ b/code/processes/crawling-process/resources/ip-banned-cidr.txt @@ -12,13 +12,10 @@ # Cloud Yuqu LLC 172.247.0.0/16 - 107.151.64.0/18 -# Google Cloud -# 35.208.0.0/12 -# 35.224.0.0/12 -# 35.240.0.0/13 - # 1Blu -178.254.10.0/23 \ No newline at end of file +178.254.10.0/23 + +# Domain parking spam +199.59.243.0/24 \ No newline at end of file diff --git a/code/processes/crawling-process/test/nu/marginalia/crawling/HttpFetcherTest.java b/code/processes/crawling-process/test/nu/marginalia/crawling/HttpFetcherTest.java index 63d5aa27..af196da7 100644 --- a/code/processes/crawling-process/test/nu/marginalia/crawling/HttpFetcherTest.java +++ b/code/processes/crawling-process/test/nu/marginalia/crawling/HttpFetcherTest.java @@ -3,12 +3,13 @@ package nu.marginalia.crawling; import lombok.SneakyThrows; import nu.marginalia.crawl.retreival.RateLimitException; import nu.marginalia.crawl.retreival.fetcher.ContentTags; +import 
nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; +import nu.marginalia.crawling.body.ContentTypeLogic; +import nu.marginalia.crawling.body.DocumentBodyExtractor; +import nu.marginalia.crawling.body.DocumentBodyResult; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.model.body.ContentTypeLogic; -import nu.marginalia.model.body.DocumentBodyExtractor; -import nu.marginalia.model.body.DocumentBodyResult; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; @@ -35,7 +36,7 @@ class HttpFetcherTest { void fetchUTF8() throws URISyntaxException, RateLimitException, IOException { var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler"); try (var recorder = new WarcRecorder()) { - var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, ContentTags.empty()); + var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, ContentTags.empty(), HttpFetcher.ProbeType.FULL); if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) { System.out.println(bodyOk.contentType()); } @@ -47,7 +48,7 @@ class HttpFetcherTest { var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler"); try (var recorder = new WarcRecorder()) { - var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), recorder, ContentTags.empty()); + var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), recorder, ContentTags.empty(), HttpFetcher.ProbeType.FULL); if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) { System.out.println(bodyOk.contentType()); } diff --git a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java index 43040313..45986bbc 100644 --- a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java +++ b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java @@ -122,7 +122,7 @@ public class CrawlerMockFetcherTest { @SneakyThrows @Override - public HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, ContentTags tags) { + public HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, ContentTags tags, ProbeType probeType) { logger.info("Fetching {}", url); if (mockData.containsKey(url)) { byte[] bodyBytes = mockData.get(url).documentBody.getBytes(); diff --git a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java index 803ba983..1b541b63 100644 --- a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java +++ b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java @@ -261,6 +261,7 @@ class CrawlerRetreiverTest { .collect(Collectors.toSet()); assertEquals(Set.of("https://www.marginalia.nu/", + "https://www.marginalia.nu/favicon.ico", "https://www.marginalia.nu/log/06-optimization.gmi/"), fetchedUrls); diff --git a/code/services-application/search-service/java/nu/marginalia/search/SearchOperator.java b/code/services-application/search-service/java/nu/marginalia/search/SearchOperator.java index 9b78e970..80c6aa9f 100644 --- 
a/code/services-application/search-service/java/nu/marginalia/search/SearchOperator.java +++ b/code/services-application/search-service/java/nu/marginalia/search/SearchOperator.java @@ -12,6 +12,7 @@ import nu.marginalia.bbpc.BrailleBlockPunchCards; import nu.marginalia.db.DbDomainQueries; import nu.marginalia.index.query.limit.QueryLimits; import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.search.command.SearchParameters; import nu.marginalia.search.model.ClusteredUrlDetails; @@ -162,7 +163,7 @@ public class SearchOperator { return new UrlDetails( item.documentId(), item.domainId(), - item.url, + cleanUrl(item.url), item.title, item.description, item.format, @@ -177,6 +178,31 @@ public class SearchOperator { ); } + /** Replace nuisance domains with replacements where available */ + private static EdgeUrl cleanUrl(EdgeUrl url) { + String topdomain = url.domain.topDomain; + String subdomain = url.domain.subDomain; + String path = url.path; + + if (topdomain.equals("fandom.com")) { + int wikiIndex = path.indexOf("/wiki/"); + if (wikiIndex >= 0) { + return new EdgeUrl("https", new EdgeDomain("breezewiki.com"), null, "/" + subdomain + path.substring(wikiIndex), null); + } + } + else if (topdomain.equals("medium.com")) { + if (!subdomain.isBlank()) { + return new EdgeUrl("https", new EdgeDomain("scribe.rip"), null, path, null); + } + else { + String article = path.substring(path.indexOf("/", 1)); + return new EdgeUrl("https", new EdgeDomain("scribe.rip"), null, article, null); + } + + } + return url; + } + @SneakyThrows private List getProblems(String evalResult, List queryResults, QueryResponse response) { diff --git a/code/services-application/search-service/java/nu/marginalia/search/svc/SearchQueryIndexService.java b/code/services-application/search-service/java/nu/marginalia/search/svc/SearchQueryIndexService.java new file mode 100644 index 00000000..e69de29b diff --git a/code/services-core/control-service/build.gradle b/code/services-core/control-service/build.gradle index 3c7c5956..20db2bed 100644 --- a/code/services-core/control-service/build.gradle +++ b/code/services-core/control-service/build.gradle @@ -54,6 +54,7 @@ dependencies { implementation libs.handlebars implementation libs.duckdb + implementation libs.jsoup implementation libs.trove implementation dependencies.create(libs.spark.get()) { diff --git a/code/services-core/control-service/java/nu/marginalia/control/ControlService.java b/code/services-core/control-service/java/nu/marginalia/control/ControlService.java index 5c0a014e..8a509a88 100644 --- a/code/services-core/control-service/java/nu/marginalia/control/ControlService.java +++ b/code/services-core/control-service/java/nu/marginalia/control/ControlService.java @@ -2,16 +2,18 @@ package nu.marginalia.control; import com.google.gson.Gson; import com.google.inject.Inject; -import nu.marginalia.service.ServiceMonitors; import nu.marginalia.control.actor.ControlActorService; import nu.marginalia.control.app.svc.*; -import nu.marginalia.control.node.svc.ControlNodeActionsService; import nu.marginalia.control.node.svc.ControlFileStorageService; +import nu.marginalia.control.node.svc.ControlNodeActionsService; import nu.marginalia.control.node.svc.ControlNodeService; import nu.marginalia.control.sys.svc.*; import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.screenshot.ScreenshotService; -import nu.marginalia.service.server.*; +import 
nu.marginalia.service.ServiceMonitors; +import nu.marginalia.service.server.BaseServiceParams; +import nu.marginalia.service.server.Service; +import nu.marginalia.service.server.StaticResources; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import spark.Request; @@ -19,7 +21,7 @@ import spark.Response; import spark.Spark; import java.io.IOException; -import java.util.*; +import java.util.Map; public class ControlService extends Service { @@ -56,6 +58,7 @@ public class ControlService extends Service { ControlDomainRankingSetsService controlDomainRankingSetsService, ControlActorService controlActorService, AbortedProcessService abortedProcessService, + DomainsManagementService domainsManagementService, ControlErrorHandler errorHandler ) throws IOException { @@ -84,6 +87,7 @@ public class ControlService extends Service { apiKeyService.register(); domainComplaintService.register(); randomExplorationService.register(); + domainsManagementService.register(); errorHandler.register(); diff --git a/code/services-core/control-service/java/nu/marginalia/control/app/model/DomainModel.java b/code/services-core/control-service/java/nu/marginalia/control/app/model/DomainModel.java new file mode 100644 index 00000000..4c7b9754 --- /dev/null +++ b/code/services-core/control-service/java/nu/marginalia/control/app/model/DomainModel.java @@ -0,0 +1,40 @@ +package nu.marginalia.control.app.model; + +public record DomainModel(int id, + String name, + String ip, + int nodeAffinity, + double rank, + boolean blacklisted) { + + public boolean isUnassigned() { + return nodeAffinity < 0; + } + + public DomainAffinityState getAffinityState() { + if (nodeAffinity < 0) { + return DomainAffinityState.Known; + } + else if (nodeAffinity == 0) { + return DomainAffinityState.Scheduled; + } + else { + return DomainAffinityState.Assigned; + } + } + + public enum DomainAffinityState { + Assigned("The domain has been assigned to a node."), + Scheduled("The domain will be assigned to the next crawling node."), + Known("The domain is known but not yet scheduled for crawling."); + + private final String desc; + DomainAffinityState(String desc) { + this.desc = desc; + } + + public String getDesc() { + return desc; + } + } +} diff --git a/code/services-core/control-service/java/nu/marginalia/control/app/model/DomainSearchResultModel.java b/code/services-core/control-service/java/nu/marginalia/control/app/model/DomainSearchResultModel.java new file mode 100644 index 00000000..13f69466 --- /dev/null +++ b/code/services-core/control-service/java/nu/marginalia/control/app/model/DomainSearchResultModel.java @@ -0,0 +1,26 @@ +package nu.marginalia.control.app.model; + +import java.util.List; +import java.util.Map; + +public record DomainSearchResultModel(String query, + String affinity, + String field, + Map selectedAffinity, + Map selectedField, + int page, + boolean hasNext, + boolean hasPrevious, + List nodes, + List results) +{ + public Integer getNextPage() { + if (!hasNext) return null; + return page + 1; + } + + public Integer getPreviousPage() { + if (!hasPrevious) return null; + return page - 1; + } +} diff --git a/code/services-core/control-service/java/nu/marginalia/control/app/svc/DomainsManagementService.java b/code/services-core/control-service/java/nu/marginalia/control/app/svc/DomainsManagementService.java new file mode 100644 index 00000000..11c8499a --- /dev/null +++ b/code/services-core/control-service/java/nu/marginalia/control/app/svc/DomainsManagementService.java @@ -0,0 +1,310 @@ +package 
nu.marginalia.control.app.svc; + +import com.google.inject.Inject; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.control.ControlRendererFactory; +import nu.marginalia.control.Redirects; +import nu.marginalia.control.app.model.DomainModel; +import nu.marginalia.control.app.model.DomainSearchResultModel; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.nodecfg.NodeConfigurationService; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Element; +import spark.Request; +import spark.Response; +import spark.Spark; + +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; +import java.net.http.HttpClient; +import java.net.http.HttpRequest; +import java.net.http.HttpResponse; +import java.sql.SQLException; +import java.util.*; + +public class DomainsManagementService { + + private final HikariDataSource dataSource; + private final NodeConfigurationService nodeConfigurationService; + private final ControlRendererFactory rendererFactory; + + @Inject + public DomainsManagementService(HikariDataSource dataSource, + NodeConfigurationService nodeConfigurationService, + ControlRendererFactory rendererFactory + ) { + this.dataSource = dataSource; + this.nodeConfigurationService = nodeConfigurationService; + this.rendererFactory = rendererFactory; + } + + public void register() throws IOException { + + var domainsViewRenderer = rendererFactory.renderer("control/app/domains"); + var addDomainsTxtViewRenderer = rendererFactory.renderer("control/app/domains-new"); + var addDomainsUrlViewRenderer = rendererFactory.renderer("control/app/domains-new-url"); + var addDomainsAfterReportRenderer = rendererFactory.renderer("control/app/domains-new-report"); + + Spark.get("/domain", this::getDomains, domainsViewRenderer::render); + Spark.get("/domain/new", this::addDomainsTextfield, addDomainsTxtViewRenderer::render); + Spark.post("/domain/new", this::addDomainsTextfield, addDomainsAfterReportRenderer::render); + Spark.get("/domain/new-url", this::addDomainsFromDownload, addDomainsUrlViewRenderer::render); + Spark.post("/domain/new-url", this::addDomainsFromDownload, addDomainsAfterReportRenderer::render); + Spark.post("/domain/:id/assign/:node", this::assignDomain, new Redirects.HtmlRedirect("/domain")); + + } + + private Object addDomainsTextfield(Request request, Response response) throws SQLException { + if ("GET".equals(request.requestMethod())) { + return ""; + } + else if ("POST".equals(request.requestMethod())) { + String nodeStr = request.queryParams("node"); + String domainsStr = request.queryParams("domains"); + + int node = Integer.parseInt(nodeStr); + + List validDomains; + List invalidDomains; + + Map.Entry, List> domainsList = parseDomainsList(domainsStr); + + validDomains = domainsList.getKey(); + invalidDomains = domainsList.getValue(); + + insertDomains(validDomains, node); + + return Map.of("validDomains", validDomains, + "invalidDomains", invalidDomains); + } + return ""; + } + + private Map.Entry, List> parseDomainsList(String domainsStr) { + List validDomains = new ArrayList<>(); + List invalidDomains = new ArrayList<>(); + + for (String domain : domainsStr.split("\n+")) { + domain = domain.trim(); + if (domain.isBlank()) continue; + if (domain.length() > 255) { + invalidDomains.add(domain); + continue; + } + if (domain.startsWith("#")) { + continue; + } + + // Run through the URI parser to check for bad domains + try { + if (domain.contains(":")) { + domain = new URI(domain 
).toURL().getHost(); + } + else { + domain = new URI("https://" + domain + "/").toURL().getHost(); + } + } catch (URISyntaxException | MalformedURLException e) { + invalidDomains.add(domain); + continue; + } + + validDomains.add(new EdgeDomain(domain)); + } + + return Map.entry(validDomains, invalidDomains); + } + + private Object addDomainsFromDownload(Request request, Response response) throws SQLException, URISyntaxException, IOException, InterruptedException { + if ("GET".equals(request.requestMethod())) { + return ""; + } + else if ("POST".equals(request.requestMethod())) { + String nodeStr = request.queryParams("node"); + URI domainsUrl = new URI(request.queryParams("url")); + + int node = Integer.parseInt(nodeStr); + + HttpClient client = HttpClient.newBuilder().build(); + var httpReq = HttpRequest.newBuilder(domainsUrl).GET().build(); + + + HttpResponse result = client.send(httpReq, HttpResponse.BodyHandlers.ofString()); + if (result.statusCode() != 200) { + return Map.of("error", "Failed to download domains"); + } + Optional ct = result.headers().firstValue("Content-Type"); + if (ct.isEmpty()) { + return Map.of("error", "No content type"); + } + + List validDomains = new ArrayList<>(); + List invalidDomains = new ArrayList<>(); + + String contentType = ct.get().toLowerCase(); + + if (contentType.startsWith("text/plain")) { + var parsedDomains = parseDomainsList(result.body()); + validDomains = parsedDomains.getKey(); + invalidDomains = parsedDomains.getValue(); + } + else { + for (Element e : Jsoup.parse(result.body()).select("a")) { + String s = e.attr("href"); + if (s.isBlank()) continue; + if (!s.contains("://")) continue; + + URI uri = URI.create(s); + String scheme = uri.getScheme(); + String host = uri.getHost(); + + if (scheme == null || host == null) + continue; + if (!scheme.equalsIgnoreCase("http") && !scheme.equalsIgnoreCase("https")) + continue; + + validDomains.add(new EdgeDomain(host)); + } + } + + + insertDomains(validDomains, node); + + + return Map.of("validDomains", validDomains, + "invalidDomains", invalidDomains); + } + return ""; + } + + private void insertDomains(List domains, int node) throws SQLException { + + // Insert the domains into the database, updating the node affinity if the domain already exists and the affinity is not already set to a node + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + INSERT INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) + VALUES (?, ?, ?) + ON DUPLICATE KEY UPDATE NODE_AFFINITY = IF(NODE_AFFINITY<=0, VALUES(NODE_AFFINITY), NODE_AFFINITY) + """)) + { + for (var domain : domains) { + stmt.setString(1, domain.toString()); + stmt.setString(2, domain.getTopDomain()); + stmt.setInt(3, node); + stmt.addBatch(); + } + stmt.executeBatch(); + } + } + + + private Object assignDomain(Request request, Response response) throws SQLException { + + String idStr = request.params(":id"); + String nodeStr = request.params(":node"); + + int id = Integer.parseInt(idStr); + int node = Integer.parseInt(nodeStr); + + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET NODE_AFFINITY = ? 
WHERE ID = ?")) + { + stmt.setInt(1, node); + stmt.setInt(2, id); + stmt.executeUpdate(); + } + + return ""; + } + + private DomainSearchResultModel getDomains(Request request, Response response) throws SQLException { + List ret = new ArrayList<>(); + + String filterRaw = Objects.requireNonNullElse(request.queryParams("filter"), "*"); + + String filter; + if (filterRaw.isBlank()) filter = "%"; + else filter = filterRaw.replace('*', '%'); + + int page = Integer.parseInt(Objects.requireNonNullElse(request.queryParams("page"), "0")); + boolean hasMore = false; + int count = 10; + + String field = Objects.requireNonNullElse(request.queryParams("field"), "domain"); + Map selectedField = Map.of(field, true); + + String affinity = Objects.requireNonNullElse(request.queryParams("affinity"), "all"); + Map selectedAffinity = Map.of(affinity, true); + + StringJoiner queryJoiner = new StringJoiner(" "); + queryJoiner.add(""" + SELECT EC_DOMAIN.ID, + DOMAIN_NAME, + NODE_AFFINITY, + `RANK`, + IP, + EC_DOMAIN_BLACKLIST.URL_DOMAIN IS NOT NULL AS BLACKLISTED + FROM WMSA_prod.EC_DOMAIN + LEFT JOIN WMSA_prod.EC_DOMAIN_BLACKLIST ON DOMAIN_NAME = EC_DOMAIN_BLACKLIST.URL_DOMAIN + """) + .add((switch (field) { + case "domain" -> "WHERE DOMAIN_NAME LIKE ?"; + case "ip" -> "WHERE IP LIKE ?"; + case "id" -> "WHERE EC_DOMAIN.ID = ?"; + default -> "WHERE DOMAIN_NAME LIKE ?"; + })) + .add((switch (affinity) { + case "assigned" -> "AND NODE_AFFINITY > 0"; + case "scheduled" -> "AND NODE_AFFINITY = 0"; + case "unassigned" -> "AND NODE_AFFINITY < 0"; + default -> ""; + })) + .add("LIMIT ?") + .add("OFFSET ?"); + + + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(queryJoiner.toString())) + { + stmt.setString(1, filter); + stmt.setInt(2, count + 1); + stmt.setInt(3, count * page); + + try (var rs = stmt.executeQuery()) { + while (rs.next()) { + if (ret.size() == count) { + hasMore = true; + break; + } + ret.add(new DomainModel( + rs.getInt("ID"), + rs.getString("DOMAIN_NAME"), + rs.getString("IP"), + rs.getInt("NODE_AFFINITY"), + Math.round(100 * rs.getDouble("RANK"))/100., + rs.getBoolean("BLACKLISTED") + )); + } + } + } + + List nodes = new ArrayList<>(); + + for (var node : nodeConfigurationService.getAll()) { + nodes.add(node.node()); + } + + return new DomainSearchResultModel(filterRaw, + affinity, + field, + selectedAffinity, + selectedField, + page, + hasMore, + page > 0, + nodes, + ret); + } + +} diff --git a/code/services-core/control-service/resources/templates/control/app/domains-new-report.hdb b/code/services-core/control-service/resources/templates/control/app/domains-new-report.hdb new file mode 100644 index 00000000..b0a4eb91 --- /dev/null +++ b/code/services-core/control-service/resources/templates/control/app/domains-new-report.hdb @@ -0,0 +1,41 @@ + + + + Control Service + {{> control/partials/head-includes }} + + +{{> control/partials/nav}} +
+

Add Domains Report

+ +

+ {{#if error}} +

{{error}}

+ {{/if}} + {{#unless error}} + {{#unless invalidDomains}} +

All domains were added successfully!

+ {{/unless}} + {{/unless}} + {{#if invalidDomains}} +

Some domains were invalid and could not be added:

+ + {{/if}} + {{#if validDomains}} +

If they were not already in the database, these domains were added:

+ + {{/if}} +

+
+ +{{> control/partials/foot-includes }} + \ No newline at end of file diff --git a/code/services-core/control-service/resources/templates/control/app/domains-new-url.hdb b/code/services-core/control-service/resources/templates/control/app/domains-new-url.hdb new file mode 100644 index 00000000..219d563c --- /dev/null +++ b/code/services-core/control-service/resources/templates/control/app/domains-new-url.hdb @@ -0,0 +1,48 @@ + + + + Control Service + {{> control/partials/head-includes }} + + +{{> control/partials/nav}} +
+

Add Domains (URL)

+ +
+

This utility lets you add domains to be crawled via an external URL.

+ It's also possible to add domains directly via a text area +
+ +
+
+ + + + Enter the URL to the file or page that contains the domains to add. If the URL leads to a text file, + the domains will be parsed from the file, one per line. If it leads to an HTML page, the HTML + will be parsed and all the links will be extracted and added as domains. +
+ +
+ + + + Select the node to assign the domains to; this is the index node that will "own" the domain, crawl its documents + and index them. If you select "Auto", the system will assign the domains to the next node that performs a crawl. +
+ +
+
+ +{{> control/partials/foot-includes }} + \ No newline at end of file diff --git a/code/services-core/control-service/resources/templates/control/app/domains-new.hdb b/code/services-core/control-service/resources/templates/control/app/domains-new.hdb new file mode 100644 index 00000000..f456a3a3 --- /dev/null +++ b/code/services-core/control-service/resources/templates/control/app/domains-new.hdb @@ -0,0 +1,47 @@ + + + + Control Service + {{> control/partials/head-includes }} + + +{{> control/partials/nav}} +
+

Add Domains

+ +
+

This utility lets you add domains to be crawled via a text area.

+ It's also possible to add domains via an external URL +
+ +
+
+ + + + Enter a list of domains to add, one per line. The system will check if the domain is already in the database and + will not add duplicates. Spaces and empty lines are ignored. + +
+ +
+ + + + Select the node to assign the domains to; this is the index node that will "own" the domain, crawl its documents + and index them. If you select "Auto", the system will assign the domains to the next node that performs a crawl. +
+ +
+
+ +{{> control/partials/foot-includes }} + \ No newline at end of file diff --git a/code/services-core/control-service/resources/templates/control/app/domains.hdb b/code/services-core/control-service/resources/templates/control/app/domains.hdb new file mode 100644 index 00000000..339b9664 --- /dev/null +++ b/code/services-core/control-service/resources/templates/control/app/domains.hdb @@ -0,0 +1,109 @@ + + + + Control Service + {{> control/partials/head-includes }} + + +{{> control/partials/nav}} +
+

Domains

+ + + + + + + + + + + + + + + + + + + {{#each results}} + + + + + + + + + {{/each}} + {{#unless results}} + + + + {{/unless}} + + + + + +
+ + + +
Domain ID Node Affinity Rank IP Blacklisted
{{name}}{{id}}{{#unless unassigned}}{{affinityState}} {{#if nodeAffinity}}{{nodeAffinity}}{{/if}} {{/unless}} + {{#if unassigned}} + + {{/if}} + {{rank}}{{ip}}{{#if blacklisted}}✓{{/if}}
No results found
+ {{#if hasPrevious}} + Previous + {{/if}} + + {{#if hasNext}} + Next + {{/if}} +
+
+ +{{> control/partials/foot-includes }} + \ No newline at end of file diff --git a/code/services-core/control-service/resources/templates/control/partials/foot-includes.hdb b/code/services-core/control-service/resources/templates/control/partials/foot-includes.hdb index 1cb72fcd..3cfadd75 100644 --- a/code/services-core/control-service/resources/templates/control/partials/foot-includes.hdb +++ b/code/services-core/control-service/resources/templates/control/partials/foot-includes.hdb @@ -1,5 +1,4 @@ - - +
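
To illustrate a few of the changes above: the CrawlerRetreiver diff replaces two copy-pasted HTTP 429 retry loops (in sniffRootDocument and the old fetchWriteAndSleep) with a single fetchWithRetry helper. The snippet below is a minimal, self-contained sketch of that pattern; RateLimitException, Fetch and the retry limit value are stand-ins for the real nu.marginalia types and constants, not the project's actual API.

    import java.util.Optional;
    import java.util.concurrent.TimeUnit;

    class RetrySketch {
        // Stand-in for CrawlerRetreiver.HTTP_429_RETRY_LIMIT; the real value may differ
        private static final int HTTP_429_RETRY_LIMIT = 1;

        /** Stand-in for nu.marginalia's RateLimitException, carrying a Retry-After hint. */
        static class RateLimitException extends Exception {
            final int retryAfterSeconds;
            RateLimitException(int retryAfterSeconds) { this.retryAfterSeconds = retryAfterSeconds; }
        }

        interface Fetch { String run() throws Exception; }

        /** Run a fetch, retrying on rate limiting and mapping any other failure to empty.
         *  Usage: fetchWithRetry(() -> httpClient.send(request, bodyHandler).body()) */
        static Optional<String> fetchWithRetry(Fetch fetch) throws InterruptedException {
            for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
                try {
                    return Optional.of(fetch.run());
                }
                catch (RateLimitException ex) {
                    // Wait out the rate limit before the next attempt
                    TimeUnit.SECONDS.sleep(ex.retryAfterSeconds);
                }
                catch (Exception ex) {
                    // Any other failure is terminal for this URL
                    return Optional.empty();
                }
            }
            return Optional.empty(); // retries exhausted
        }
    }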
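sniffRootDocument now also pulls a favicon URL and an Atom "alternate" link out of the root page, fetching the former and handing the latter to the sitemap fetcher. Below is a rough Jsoup-only sketch of that extraction; it mirrors the rel/type checks from the diff but uses plain strings and Jsoup's abs:href resolution in place of EdgeUrl, LinkParser and the same-domain filter.

    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;

    import java.util.Optional;

    class LinkSniffSketch {
        record SniffedLinks(String faviconUrl, Optional<String> feedUrl) {}

        /** Extract the favicon and any Atom alternate link from a parsed root document. */
        static SniffedLinks sniff(String html, String baseUrl) {
            Document doc = Jsoup.parse(html, baseUrl);

            String favicon = "/favicon.ico";   // default guess, matching the diff's fallback
            Optional<String> feed = Optional.empty();

            for (var link : doc.getElementsByTag("link")) {
                String rel = link.attr("rel");
                String type = link.attr("type");
                String href = link.attr("abs:href");   // resolved against baseUrl

                if (href.isBlank())
                    continue;

                if (rel.equals("icon") || rel.equals("shortcut icon")) {
                    favicon = href;
                }

                // Treat an Atom feed advertised via rel=alternate as a sitemap candidate, as the diff does
                if (rel.equalsIgnoreCase("alternate")
                        && (type.equalsIgnoreCase("application/atom+xml")
                         || type.equalsIgnoreCase("application/atomsvc+xml"))) {
                    feed = Optional.of(href);
                }
            }
            return new SniffedLinks(favicon, feed);
        }
    }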
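Taken together, the DomainsManagementService and DbCrawlSpecProvider changes implement a small work-claiming scheme: domains added through the control UI with the "Auto" option are stored with NODE_AFFINITY=0 ("scheduled"), and the next crawler run claims them by setting the affinity to its own node id before loading its domain list. The JDBC sketch below shows just that claim-then-load step, reusing the table and column names from the diff but leaving out the blacklist filtering, growth-factor logic and the ON DUPLICATE KEY insert.

    import javax.sql.DataSource;
    import java.sql.SQLException;
    import java.util.ArrayList;
    import java.util.List;

    class CrawlSpecClaimSketch {
        private final DataSource dataSource;
        private final int nodeId;

        CrawlSpecClaimSketch(DataSource dataSource, int nodeId) {
            this.dataSource = dataSource;
            this.nodeId = nodeId;
        }

        /** Claim scheduled domains for this node, then load everything assigned to it. */
        List<String> loadDomainNames() throws SQLException {
            List<String> domains = new ArrayList<>();

            try (var conn = dataSource.getConnection();
                 var claim = conn.prepareStatement(
                         "UPDATE EC_DOMAIN SET NODE_AFFINITY=? WHERE NODE_AFFINITY=0");
                 var select = conn.prepareStatement(
                         "SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE NODE_AFFINITY=? OR NODE_AFFINITY=0"))
            {
                // Claim the scheduled domains up front so concurrently starting crawler nodes
                // don't both pick up the same NODE_AFFINITY=0 rows
                claim.setInt(1, nodeId);
                claim.executeUpdate();

                select.setInt(1, nodeId);
                select.setFetchSize(10_000);
                try (var rs = select.executeQuery()) {
                    while (rs.next()) {
                        domains.add(rs.getString(1));
                    }
                }
            }

            return domains;
        }
    }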
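Finally, the SearchOperator change rewrites result URLs for two nuisance domains onto mirror frontends: subdomain.fandom.com/wiki/Page becomes breezewiki.com/subdomain/wiki/Page, and Medium articles are pointed at scribe.rip. The sketch below reproduces that mapping with plain java.net.URI instead of EdgeUrl/EdgeDomain; the host parsing and the path handling for bare medium.com are simplified assumptions rather than the exact production logic.

    import java.net.URI;
    import java.net.URISyntaxException;

    class UrlCleanerSketch {
        /** Rewrite fandom.com and medium.com result URLs onto their mirror frontends. */
        static URI cleanUrl(URI url) throws URISyntaxException {
            String host = url.getHost();   // e.g. "minecraft.fandom.com"
            String path = url.getPath();   // e.g. "/wiki/Creeper"

            if (host.endsWith(".fandom.com")) {
                String subdomain = host.substring(0, host.length() - ".fandom.com".length());
                int wikiIndex = path.indexOf("/wiki/");
                if (wikiIndex >= 0) {
                    return new URI("https", "breezewiki.com", "/" + subdomain + path.substring(wikiIndex), null);
                }
            }
            else if (host.equals("medium.com") || host.endsWith(".medium.com")) {
                // scribe.rip serves Medium articles under an equivalent path
                return new URI("https", "scribe.rip", path, null);
            }

            return url;
        }

        public static void main(String[] args) throws URISyntaxException {
            System.out.println(cleanUrl(new URI("https://minecraft.fandom.com/wiki/Creeper")));
            // -> https://breezewiki.com/minecraft/wiki/Creeper
            System.out.println(cleanUrl(new URI("https://example.medium.com/some-article-1a2b3c")));
            // -> https://scribe.rip/some-article-1a2b3c
        }
    }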