diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
index a64360f7..67e661eb 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
@@ -10,7 +10,8 @@ import nu.marginalia.crawl.retreival.revisit.CrawlerRevisitor;
 import nu.marginalia.crawl.retreival.revisit.DocumentWithReference;
 import nu.marginalia.crawl.retreival.sitemap.SitemapFetcher;
 import nu.marginalia.crawling.body.HttpFetchResult;
-import nu.marginalia.crawling.model.*;
+import nu.marginalia.crawling.model.CrawledDomain;
+import nu.marginalia.crawling.model.CrawlerDomainStatus;
 import nu.marginalia.ip_blocklist.UrlBlocklist;
 import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.model.EdgeDomain;
@@ -87,17 +88,8 @@ public class CrawlerRetreiver implements AutoCloseable {
     }
 
     public int fetch(DomainLinks domainLinks, CrawlDataReference oldCrawlData) {
-        final DomainProber.ProbeResult probeResult = domainProber.probeDomain(
-                fetcher,
-                domain,
-                new EdgeUrl("http", new EdgeDomain(domain), null, "/", null));
-
         try {
-            // Sleep a bit to avoid hammering the server with requests, we just probed it
-            TimeUnit.SECONDS.sleep(1);
-
-            // Fetch the domain
-            return crawlDomain(oldCrawlData, probeResult, domainLinks);
+            return crawlDomain(oldCrawlData, domainLinks);
         }
         catch (Exception ex) {
             logger.error("Error crawling domain {}", domain, ex);
@@ -111,25 +103,33 @@ public class CrawlerRetreiver implements AutoCloseable {
         resync.run(warcFile);
     }
 
-    private int crawlDomain(CrawlDataReference oldCrawlData, DomainProber.ProbeResult probeResult, DomainLinks domainLinks) throws IOException, InterruptedException {
-        String ip = findIp(domain);
-        EdgeUrl rootUrl;
+    private DomainProber.ProbeResult probeRootUrl(String ip) throws IOException {
+        // Construct an URL to the root of the domain, we don't know the schema yet so we'll
+        // start with http and then try https if that fails
+        var httpUrl = new EdgeUrl("http", new EdgeDomain(domain), null, "/", null);
+        final DomainProber.ProbeResult probeResult = domainProber.probeDomain(fetcher, domain, httpUrl);
 
         warcRecorder.writeWarcinfoHeader(ip, new EdgeDomain(domain), probeResult);
 
-        if (!(probeResult instanceof DomainProber.ProbeResultOk ok)) {
-            return 1;
-        }
-        else {
-            rootUrl = ok.probedUrl();
-        }
+        return probeResult;
+    }
+
+    private int crawlDomain(CrawlDataReference oldCrawlData, DomainLinks domainLinks) throws IOException, InterruptedException {
+        String ip = findIp(domain);
+        EdgeUrl rootUrl;
+
+        if (probeRootUrl(ip) instanceof DomainProber.ProbeResultOk ok) rootUrl = ok.probedUrl();
+        else return 1;
+
+        // Sleep after the initial probe, we don't have access to the robots.txt yet
+        // so we don't know the crawl delay
+        TimeUnit.SECONDS.sleep(1);
 
         final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(rootUrl.domain, warcRecorder);
         final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay());
 
         delayTimer.waitFetchDelay(0); // initial delay after robots.txt
 
         sniffRootDocument(rootUrl, delayTimer);
-        delayTimer.waitFetchDelay(0); // delay after sniffing
 
         // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
         int recrawled = crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer);
@@ -187,7 +187,7 @@ public class CrawlerRetreiver implements AutoCloseable {
             try {
-                if (fetchWriteAndSleep(top, delayTimer, DocumentWithReference.empty()).isOk()) {
+                if (fetchContentWithReference(top, delayTimer, DocumentWithReference.empty()).isOk()) {
                     fetchedCount++;
                 }
             }
@@ -208,21 +208,8 @@ public class CrawlerRetreiver implements AutoCloseable {
 
         var url = rootUrl.withPathAndParam("/", null);
 
-        HttpFetchResult result = null;
-
-        for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
-            try {
-                result = fetcher.fetchContent(url, warcRecorder, ContentTags.empty());
-                break;
-            }
-            catch (RateLimitException ex) {
-                timer.waitRetryDelay(ex);
-            }
-            catch (Exception ex) {
-                logger.warn("Failed to fetch {}", url, ex);
-                result = new HttpFetchResult.ResultException(ex);
-            }
-        }
+        HttpFetchResult result = fetchWithRetry(url, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
+        timer.waitFetchDelay(0);
 
         if (!(result instanceof HttpFetchResult.ResultOk ok))
             return;
@@ -235,24 +222,39 @@ public class CrawlerRetreiver implements AutoCloseable {
             var doc = optDoc.get();
             crawlFrontier.setLinkFilter(linkFilterSelector.selectFilter(doc));
 
+            EdgeUrl faviconUrl = url.withPathAndParam("/favicon.ico", null);
+            EdgeUrl sitemapUrl = url.withPathAndParam("/sitemap.xml", null);
+
             for (var link : doc.getElementsByTag("link")) {
                 String rel = link.attr("rel");
                 String type = link.attr("type");
 
-                if (!rel.equalsIgnoreCase("alternate"))
-                    continue;
+                if (rel.equals("icon") || rel.equals("shortcut icon")) {
+                    String href = link.attr("href");
 
-                if (!(type.equalsIgnoreCase("application/atom+xml")
-                    || type.equalsIgnoreCase("application/rss+xml")))
-                    continue;
+                    faviconUrl = linkParser.parseLink(url, href)
+                            .filter(crawlFrontier::isSameDomain)
+                            .orElse(faviconUrl);
+                }
 
-                String href = link.attr("href");
+                // Grab the RSS/Atom as a sitemap if it exists
+                if (rel.equalsIgnoreCase("alternate")
+                        && (type.equalsIgnoreCase("application/atom+xml") || type.equalsIgnoreCase("application/atomsvc+xml"))) {
+                    String href = link.attr("href");
 
-                linkParser.parseLink(url, href)
-                        .filter(crawlFrontier::isSameDomain)
-                        .map(List::of)
-                        .ifPresent(sitemapFetcher::downloadSitemaps);
+                    sitemapUrl = linkParser.parseLink(url, href)
+                            .filter(crawlFrontier::isSameDomain)
+                            .orElse(sitemapUrl);
+                }
             }
+
+            // Download the sitemap if it exists
+            sitemapFetcher.downloadSitemaps(List.of(sitemapUrl));
+            timer.waitFetchDelay(0);
+
+            // Grab the favicon if it exists
+            fetchWithRetry(faviconUrl, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
+            timer.waitFetchDelay(0);
         }
         catch (Exception ex) {
             logger.error("Error configuring link filter", ex);
@@ -262,31 +264,16 @@ public class CrawlerRetreiver implements AutoCloseable {
         }
     }
 
-    public HttpFetchResult fetchWriteAndSleep(EdgeUrl top,
-                                              CrawlDelayTimer timer,
-                                              DocumentWithReference reference) throws InterruptedException
+    public HttpFetchResult fetchContentWithReference(EdgeUrl top,
+                                                     CrawlDelayTimer timer,
+                                                     DocumentWithReference reference) throws InterruptedException
     {
         logger.debug("Fetching {}", top);
 
-        HttpFetchResult fetchedDoc = new HttpFetchResult.ResultNone();
-
         long startTime = System.currentTimeMillis();
         var contentTags = reference.getContentTags();
 
-        // Fetch the document, retrying if we get a rate limit exception
-        for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
-            try {
-                fetchedDoc = fetcher.fetchContent(top, warcRecorder, contentTags);
-                break;
-            }
-            catch (RateLimitException ex) {
-                timer.waitRetryDelay(ex);
-            }
-            catch (Exception ex) {
-                logger.warn("Failed to fetch {}", top, ex);
-                fetchedDoc = new HttpFetchResult.ResultException(ex);
-            }
-        }
+        HttpFetchResult fetchedDoc = fetchWithRetry(top, timer, HttpFetcher.ProbeType.FULL, contentTags);
 
         // Parse the document and enqueue links
         try {
@@ -328,6 +315,27 @@ public class CrawlerRetreiver implements AutoCloseable {
         return fetchedDoc;
     }
 
+    /** Fetch a document and retry on 429s */
+    private HttpFetchResult fetchWithRetry(EdgeUrl url,
+                                           CrawlDelayTimer timer,
+                                           HttpFetcher.ProbeType probeType,
+                                           ContentTags contentTags) throws InterruptedException {
+        for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
+            try {
+                return fetcher.fetchContent(url, warcRecorder, contentTags, probeType);
+            }
+            catch (RateLimitException ex) {
+                timer.waitRetryDelay(ex);
+            }
+            catch (Exception ex) {
+                logger.warn("Failed to fetch {}", url, ex);
+                return new HttpFetchResult.ResultException(ex);
+            }
+        }
+
+        return new HttpFetchResult.ResultNone();
+    }
+
     private boolean isAllowedProtocol(String proto) {
         return proto.equalsIgnoreCase("http")
             || proto.equalsIgnoreCase("https");
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java
index 70576510..fd3dd0dd 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java
@@ -3,8 +3,8 @@ package nu.marginalia.crawl.retreival.fetcher;
 import com.google.inject.ImplementedBy;
 import crawlercommons.robots.SimpleRobotRules;
 import nu.marginalia.crawl.retreival.RateLimitException;
-import nu.marginalia.crawling.body.HttpFetchResult;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
+import nu.marginalia.crawling.body.HttpFetchResult;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 
@@ -19,9 +19,18 @@ public interface HttpFetcher {
 
     FetchResult probeDomain(EdgeUrl url);
 
-    HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, ContentTags tags) throws RateLimitException;
+    HttpFetchResult fetchContent(EdgeUrl url,
+                                 WarcRecorder recorder,
+                                 ContentTags tags,
+                                 ProbeType probeType) throws RateLimitException;
 
     SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder);
 
     SitemapRetriever createSitemapRetriever();
+
+    enum ProbeType {
+        DISABLED,
+        FULL,
+        IF_MODIFIED_SINCE
+    }
 }
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java
index 1df0301b..6ec3cd73 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java
@@ -11,10 +11,10 @@ import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeR
 import nu.marginalia.crawl.retreival.fetcher.socket.FastTerminatingSocketFactory;
 import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
 import nu.marginalia.crawl.retreival.fetcher.socket.NoSecuritySSL;
-import nu.marginalia.crawling.body.DocumentBodyExtractor;
-import nu.marginalia.crawling.body.HttpFetchResult;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawling.body.ContentTypeLogic;
+import nu.marginalia.crawling.body.DocumentBodyExtractor;
+import nu.marginalia.crawling.body.HttpFetchResult;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import okhttp3.ConnectionPool;
@@ -145,12 +145,13 @@ public class HttpFetcherImpl implements HttpFetcher {
     @SneakyThrows
     public HttpFetchResult fetchContent(EdgeUrl url,
                                         WarcRecorder warcRecorder,
-                                        ContentTags contentTags)
+                                        ContentTags contentTags,
+                                        ProbeType probeType)
     {
 
         // We don't want to waste time and resources on URLs that are not HTML, so if the file ending
         // looks like it might be something else, we perform a HEAD first to check the content type
-        if (contentTags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url))
+        if (probeType == ProbeType.FULL && contentTags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url))
         {
             ContentTypeProbeResult probeResult = contentTypeProber.probeContentType(url);
             if (probeResult instanceof ContentTypeProbeResult.Ok ok) {
@@ -174,7 +175,9 @@ public class HttpFetcherImpl implements HttpFetcher {
         else {
             // Possibly do a soft probe to see if the URL has been modified since the last time we crawled it
             // if we have reason to suspect ETags are not supported by the server.
-            if (softIfModifiedSinceProber.probeModificationTime(url, contentTags)) {
+            if (probeType == ProbeType.IF_MODIFIED_SINCE
+                && softIfModifiedSinceProber.probeModificationTime(url, contentTags))
+            {
                 return new HttpFetchResult.Result304Raw();
             }
         }
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java
index af4a743f..6b32317d 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java
@@ -137,7 +137,7 @@ public class CrawlerRevisitor {
 
             DocumentWithReference reference = new DocumentWithReference(doc, oldCrawlData);
 
-            var result = crawlerRetreiver.fetchWriteAndSleep(url, delayTimer, reference);
+            var result = crawlerRetreiver.fetchContentWithReference(url, delayTimer, reference);
 
             if (reference.isSame(result)) {
                 retained++;
diff --git a/code/processes/crawling-process/test/nu/marginalia/crawling/HttpFetcherTest.java b/code/processes/crawling-process/test/nu/marginalia/crawling/HttpFetcherTest.java
index 0873924f..af196da7 100644
--- a/code/processes/crawling-process/test/nu/marginalia/crawling/HttpFetcherTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawling/HttpFetcherTest.java
@@ -3,11 +3,12 @@ package nu.marginalia.crawling;
 import lombok.SneakyThrows;
 import nu.marginalia.crawl.retreival.RateLimitException;
 import nu.marginalia.crawl.retreival.fetcher.ContentTags;
+import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
-import nu.marginalia.crawling.body.DocumentBodyExtractor;
-import nu.marginalia.crawling.body.DocumentBodyResult;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawling.body.ContentTypeLogic;
+import nu.marginalia.crawling.body.DocumentBodyExtractor;
+import nu.marginalia.crawling.body.DocumentBodyResult;
 import nu.marginalia.model.EdgeUrl;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Test;
@@ -35,7 +36,7 @@ class HttpFetcherTest {
     void fetchUTF8() throws URISyntaxException, RateLimitException, IOException {
         var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
         try (var recorder = new WarcRecorder()) {
-            var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, ContentTags.empty());
EdgeUrl("https://www.marginalia.nu"), recorder, ContentTags.empty()); + var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, ContentTags.empty(), HttpFetcher.ProbeType.FULL); if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) { System.out.println(bodyOk.contentType()); } @@ -47,7 +48,7 @@ class HttpFetcherTest { var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler"); try (var recorder = new WarcRecorder()) { - var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), recorder, ContentTags.empty()); + var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), recorder, ContentTags.empty(), HttpFetcher.ProbeType.FULL); if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) { System.out.println(bodyOk.contentType()); } diff --git a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java index 749b821c..01534385 100644 --- a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java +++ b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java @@ -5,8 +5,8 @@ import lombok.SneakyThrows; import nu.marginalia.crawl.retreival.CrawlerRetreiver; import nu.marginalia.crawl.retreival.DomainProber; import nu.marginalia.crawl.retreival.fetcher.*; -import nu.marginalia.crawling.body.HttpFetchResult; import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; +import nu.marginalia.crawling.body.HttpFetchResult; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawlerDocumentStatus; import nu.marginalia.crawling.model.SerializableCrawlData; @@ -23,7 +23,10 @@ import org.slf4j.LoggerFactory; import java.io.IOException; import java.net.URISyntaxException; -import java.util.*; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; public class CrawlerMockFetcherTest { @@ -119,7 +122,7 @@ public class CrawlerMockFetcherTest { @SneakyThrows @Override - public HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, ContentTags tags) { + public HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, ContentTags tags, ProbeType probeType) { logger.info("Fetching {}", url); if (mockData.containsKey(url)) { byte[] bodyBytes = mockData.get(url).documentBody.getBytes(); diff --git a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java index aa1f00e7..a6df0791 100644 --- a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java +++ b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java @@ -261,6 +261,7 @@ class CrawlerRetreiverTest { .collect(Collectors.toSet()); assertEquals(Set.of("https://www.marginalia.nu/", + "https://www.marginalia.nu/favicon.ico", "https://www.marginalia.nu/log/06-optimization.gmi/"), fetchedUrls);