From 58f2f86ea8607db9a01b1b581cd0da017e9fcf74 Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Fri, 21 Jul 2023 19:47:52 +0200
Subject: [PATCH] (crawler) Don't read all the data into RAM when doing a refresh-crawl

---
 .../crawling/io/CrawledDomainReader.java      |  41 ++++
 .../nu/marginalia/crawl/CrawlLimiter.java     |  45 +---
 .../java/nu/marginalia/crawl/CrawlerMain.java |  24 +-
 .../crawl/retreival/CrawlerRetreiver.java     | 227 ++++++++++--------
 .../crawl/retreival/fetcher/ContentTags.java  |  24 ++
 .../crawl/retreival/fetcher/HttpFetcher.java  |   2 +-
 .../retreival/fetcher/HttpFetcherImpl.java    |  31 ++-
 .../marginalia/crawling/HttpFetcherTest.java  |   5 +-
 .../retreival/CrawlerMockFetcherTest.java     |   7 +-
 .../retreival/CrawlerRetreiverTest.java       |  18 +-
 10 files changed, 249 insertions(+), 175 deletions(-)
 create mode 100644 code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/ContentTags.java

diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java
index 67b95484..abc524ac 100644
--- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java
+++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java
@@ -2,8 +2,10 @@ package nu.marginalia.crawling.io;
 
 import com.github.luben.zstd.ZstdInputStream;
 import com.google.gson.Gson;
+import lombok.SneakyThrows;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawledDomain;
+import nu.marginalia.crawling.model.SerializableCrawlData;
 import nu.marginalia.model.gson.GsonFactory;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -14,6 +16,7 @@ import java.io.IOException;
 import java.io.InputStreamReader;
 import java.nio.file.Path;
 import java.util.ArrayList;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Optional;
 import java.util.concurrent.ForkJoinPool;
@@ -27,6 +30,44 @@ public class CrawledDomainReader {
     public CrawledDomainReader() {
     }
 
+    public Iterator<SerializableCrawlData> createIterator(Path path) throws IOException {
+        BufferedReader br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(path.toFile()))));
+
+        return new Iterator<>() {
+            SerializableCrawlData next;
+
+            @Override
+            @SneakyThrows
+            public boolean hasNext() {
+                String identifier = br.readLine();
+                if (identifier == null) {
+                    br.close();
+                    return false;
+                }
+                String data = br.readLine();
+                if (data == null) {
+                    br.close();
+                    return false;
+                }
+
+                if (identifier.equals(CrawledDomain.SERIAL_IDENTIFIER)) {
+                    next = gson.fromJson(data, CrawledDomain.class);
+                } else if (identifier.equals(CrawledDocument.SERIAL_IDENTIFIER)) {
+                    next = gson.fromJson(data, CrawledDocument.class);
+                }
+                else {
+                    throw new IllegalStateException("Unknown identifier: " + identifier);
+                }
+                return true;
+            }
+
+            @Override
+            public SerializableCrawlData next() {
+                return next;
+            }
+        };
+    }
+
     public CrawledDomain read(Path path) throws IOException {
         DomainDataAssembler domainData = new DomainDataAssembler();
 
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java
index 29f02e4f..7285b0c5 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java
@@ -8,65 +8,22 @@ import java.util.concurrent.Semaphore;
 
 public class CrawlLimiter {
     public static final int maxPoolSize = Integer.getInteger("crawler.pool-size", 512);
 
-    // We'll round up to this size when we're crawling a new domain to prevent
-    // too many concurrent connections
-    public static final int minCrawlDataSizeKb = 128; // 100 Kb
-
-    // The largest size on disk where we'll permit a refresh crawl
-    // (these files easily grow into the gigabytes, we don't want that in RAM)
-    public static final int maxRefreshableCrawlDataSizeKBytes = 1024*128; // 128 Mb
-
-    // This limits how many concurrent crawl tasks we can have running at once
-    // based on their size on disk. The on-disk size is compressed, and the
-    // in-ram size is partially compressed (i.e. only the document body); so
-    // maybe a fair estimate is something like 2-4x this figure for RAM usage
-    //
-    public static final int maxConcurrentCrawlTaskSizeKb = 512*1024; // 512 Mb
-
-    static {
-        // Sanity check; if this is false we'll get a deadlock on taskSemRAM
-        assert maxConcurrentCrawlTaskSizeKb >= maxRefreshableCrawlDataSizeKBytes
-                : "maxConcurrentCrawlTaskSizeKb must be larger than maxRefreshableCrawlDataSizeKBytes";
-    }
-
     public record CrawlTaskLimits(Path refreshPath, boolean isRefreshable, int taskSize) {}
 
-    // We use two semaphores to keep track of the number of concurrent crawls;
-    // first a RAM sempahore to limit the amount of RAM used by refresh crawls.
-    // then a count semaphore to limit the number of concurrent threads (this keeps the connection count manageable)
-    private final Semaphore taskSemRAM = new Semaphore(maxConcurrentCrawlTaskSizeKb);
     private final Semaphore taskSemCount = new Semaphore(maxPoolSize);
 
     public CrawlTaskLimits getTaskLimits(Path fileName) {
-        long size;
-
-        try {
-            size = Math.max(minCrawlDataSizeKb, Files.size(fileName) / 1024);
-        } catch (IOException ex) {
-            // If we can't read the file, we'll assume it's small since we won't be able to read it later for the refresh either
-            return new CrawlTaskLimits(null,false, minCrawlDataSizeKb);
-        }
-
-        // We'll only permit refresh crawls if the file is small enough
-        boolean isRefreshable = size < maxRefreshableCrawlDataSizeKBytes;
-
-        // We'll truncate this down to maxRefreshableCrawlDataSizeKBytes to ensure
-        // it's possible to acquire the RAM semaphore
-        int effectiveSize = (int) Math.min(maxRefreshableCrawlDataSizeKBytes, size);
-
-        return new CrawlTaskLimits(fileName, isRefreshable, effectiveSize);
+        return new CrawlTaskLimits(fileName, true, 1);
     }
 
     public void acquire(CrawlTaskLimits properties) throws InterruptedException {
         // It's very important that we acquire the RAM semaphore first to avoid a deadlock
-        taskSemRAM.acquire(properties.taskSize);
         taskSemCount.acquire(1);
     }
 
     public void release(CrawlTaskLimits properties) {
         taskSemCount.release(1);
-        taskSemRAM.release(properties.taskSize);
     }
 }
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java
index 3dd096cb..6fafb128 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java
@@ -10,6 +10,7 @@ import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawling.io.CrawledDomainReader;
 import nu.marginalia.crawling.io.CrawlerOutputFile;
 import nu.marginalia.crawling.model.CrawledDomain;
+import nu.marginalia.crawling.model.SerializableCrawlData;
 import nu.marginalia.db.storage.FileStorageService;
 import nu.marginalia.mq.MessageQueueFactory;
 import nu.marginalia.mq.MqMessage;
@@ -32,10 +33,7 @@ import org.slf4j.LoggerFactory;
 import java.io.IOException;
 import java.nio.file.Path;
 import java.sql.SQLException;
-import java.util.HashSet;
-import java.util.Optional;
-import java.util.Set;
-import java.util.UUID;
+import java.util.*;
 import java.util.concurrent.*;
 import java.util.concurrent.atomic.AtomicInteger;
 
@@ -201,19 +199,23 @@ public class CrawlerMain implements AutoCloseable {
 
             HttpFetcher fetcher = new HttpFetcherImpl(userAgent.uaString(), dispatcher, connectionPool);
 
-            // Read the previous crawl's data for this domain, if it exists and has a reasonable size
-            Optional<CrawledDomain> domain;
-            if (limits.isRefreshable()) {
-                domain = reader.readOptionally(limits.refreshPath());
-                if (domain.isPresent()) {
-                    specification = specification.withOldData(domain.get());
+            Iterator<SerializableCrawlData> iterator;
+            try {
+                if (limits.isRefreshable()) {
+                    iterator = reader.createIterator(limits.refreshPath());
                 }
+                else {
+                    iterator = Collections.emptyIterator();
+                }
+            } catch (IOException e) {
+                logger.warn("Failed to read previous crawl data for {}", specification.domain);
+                iterator = Collections.emptyIterator();
             }
 
             try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id)) {
                 var retreiver = new CrawlerRetreiver(fetcher, specification, writer::accept);
 
-                int size = retreiver.fetch();
+                int size = retreiver.fetch(iterator);
 
                 workLog.setJobToFinished(specification.id, writer.getOutputFile().toString(), size);
 
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
index 52927f38..8091dac8 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
@@ -4,6 +4,7 @@ import com.google.common.hash.HashFunction;
 import com.google.common.hash.Hashing;
 import crawlercommons.robots.SimpleRobotRules;
 import lombok.SneakyThrows;
+import nu.marginalia.crawl.retreival.fetcher.ContentTags;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
 import nu.marginalia.crawl.retreival.fetcher.SitemapRetriever;
 import nu.marginalia.crawling.model.spec.CrawlingSpecification;
@@ -18,6 +19,7 @@ import org.jsoup.nodes.Document;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import javax.annotation.Nullable;
 import java.net.InetAddress;
 import java.net.UnknownHostException;
 import java.time.LocalDateTime;
@@ -58,15 +60,13 @@ public class CrawlerRetreiver {
     private final SitemapRetriever sitemapRetriever;
     private final DomainCrawlFrontier crawlFrontier;
 
-    private final CrawlDataReference oldCrawlData;
-
     int errorCount = 0;
 
+    private String retainedTag = "RETAINED/304";
 
     public CrawlerRetreiver(HttpFetcher fetcher,
                             CrawlingSpecification specs,
                             Consumer<SerializableCrawlData> writer) {
         this.fetcher = fetcher;
-        this.oldCrawlData = new CrawlDataReference(specs.oldData);
 
         id = specs.id;
         domain = specs.domain;
@@ -97,10 +97,14 @@
     }
 
     public int fetch() {
+        return fetch(Collections.emptyIterator());
+    }
+
+    public int fetch(Iterator<SerializableCrawlData> oldCrawlData) {
         final DomainProber.ProbeResult probeResult = domainProber.probeDomain(fetcher, domain, crawlFrontier.peek());
 
         if (probeResult instanceof DomainProber.ProbeResultOk) {
-            return crawlDomain();
+            return crawlDomain(oldCrawlData);
         }
 
         // handle error cases for probe
@@ -137,44 +141,29 @@
         throw new IllegalStateException("Unknown probe result: " + probeResult);
     };
 
-    private int crawlDomain() {
+    private int crawlDomain(Iterator<SerializableCrawlData> oldCrawlData) {
         String ip = findIp(domain);
 
         assert !crawlFrontier.isEmpty();
 
         var robotsRules = fetcher.fetchRobotRules(crawlFrontier.peek().domain);
+        long crawlDelay = robotsRules.getCrawlDelay();
 
-        CrawlDataComparison comparison = compareWithOldData(robotsRules);
-        logger.info("Comparison result for {} : {}", domain, comparison);
+        sniffRootDocument();
 
-        // If we have reference data, we will always grow the crawl depth a bit
-        if (oldCrawlData.size() > 0) {
+        // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
+        int recrawled = recrawl(oldCrawlData, robotsRules, crawlDelay);
+
+        if (recrawled > 0) {
+            // If we have reference data, we will always grow the crawl depth a bit
             crawlFrontier.increaseDepth(1.5);
         }
 
-        // When the reference data doesn't appear to have changed, we'll forego
-        // re-fetching it and just use the old data
-        if (comparison == CrawlDataComparison.NO_CHANGES) {
-            oldCrawlData.allDocuments().forEach((url, doc) -> {
-                if (crawlFrontier.addVisited(url)) {
-                    doc.recrawlState = "RETAINED";
-                    crawledDomainWriter.accept(doc);
-                }
-            });
-
-            // We don't need to hold onto this in RAM anymore
-            oldCrawlData.evict();
-        }
-
-        downloadSitemaps(robotsRules);
-        sniffRootDocument();
-
-        long crawlDelay = robotsRules.getCrawlDelay();
         CrawledDomain ret = new CrawledDomain(id, domain, null, CrawlerDomainStatus.OK.name(), null, ip, new ArrayList<>(), null);
 
-        int fetchedCount = 0;
+        int fetchedCount = recrawled;
 
         while (!crawlFrontier.isEmpty()
             && !crawlFrontier.isCrawlDepthReached()
@@ -187,11 +176,6 @@
                 continue;
             }
 
-            // Don't re-fetch links that were previously found dead as it's very unlikely that a
-            // 404:ing link will suddenly start working at a later point
-            if (oldCrawlData.isPreviouslyDead(top))
-                continue;
-
             // Check the link filter if the endpoint should be fetched based on site-type
             if (!crawlFrontier.filterLink(top))
                 continue;
@@ -211,7 +195,7 @@
                 continue;
 
-            if (fetchDocument(top, crawlDelay).isPresent()) {
+            if (fetchDocument(top, null, crawlDelay).isPresent()) {
                 fetchedCount++;
             }
         }
@@ -223,63 +207,69 @@
         return fetchedCount;
     }
 
-    private CrawlDataComparison compareWithOldData(SimpleRobotRules robotsRules) {
+    private int recrawl(Iterator<SerializableCrawlData> oldCrawlData,
+                        SimpleRobotRules robotsRules,
+                        long crawlDelay) {
+        int recrawled = 0;
+        int retained = 0;
 
-        int numGoodDocuments = oldCrawlData.size();
+        while (oldCrawlData.hasNext()) {
+            if (!(oldCrawlData.next() instanceof CrawledDocument doc)) continue;
 
-        if (numGoodDocuments == 0)
-            return CrawlDataComparison.NO_OLD_DATA;
+            // This Shouldn't Happen (TM)
+            var urlMaybe = EdgeUrl.parse(doc.url);
+            if (urlMaybe.isEmpty()) continue;
+            var url = urlMaybe.get();
 
-        if (numGoodDocuments < 10)
-            return CrawlDataComparison.SMALL_SAMPLE;
-
-        // We fetch a sample of the data to assess how much it has changed
-        int sampleSize = (int) Math.min(20, 0.25 * numGoodDocuments);
-        Map<EdgeUrl, CrawledDocument> referenceUrls = oldCrawlData.sample(sampleSize);
-
-        int differences = 0;
-
-        long crawlDelay = robotsRules.getCrawlDelay();
-        for (var url : referenceUrls.keySet()) {
-
-            var docMaybe = fetchDocument(url, crawlDelay);
-            if (docMaybe.isEmpty()) {
-                differences++;
+            // If we've previously 404:d on this URL, we'll refrain from trying to fetch it again
+            if (doc.httpStatus == 404) {
+                crawlFrontier.addVisited(url);
                 continue;
             }
 
-            var newDoc = docMaybe.get();
-            var referenceDoc = referenceUrls.get(url);
+            if (doc.httpStatus != 200) continue;
 
-            // This looks like a bug but it is not, we want to compare references
-            // to detect if the page has bounced off etag or last-modified headers
-            // to avoid having to do a full content comparison
-            if (newDoc == referenceDoc)
+            if (!robotsRules.isAllowed(url.toString())) {
+                crawledDomainWriter.accept(createRobotsError(url));
+                continue;
+            }
+            if (!crawlFrontier.filterLink(url))
+                continue;
+            if (!crawlFrontier.addVisited(url))
                 continue;
 
-            if (newDoc.httpStatus != referenceDoc.httpStatus) {
-                differences++;
+
+            if (recrawled > 10
+             && retained > 0.9 * recrawled
+             && Math.random() < 0.75)
+            {
+                logger.info("Direct-loading {}", url);
+
+                // Since it looks like most of these documents haven't changed,
+                // we'll load the documents directly; but we do this in a random
+                // fashion to make sure we eventually catch changes over time
+
+                crawledDomainWriter.accept(doc);
+                crawlFrontier.addVisited(url);
                 continue;
             }
-            if (newDoc.documentBody == null) {
-                differences++;
-                continue;
+
+            // GET the document with the stored document as a reference
+            // providing etag and last-modified headers, so we can recycle the
+            // document if it hasn't changed without actually downloading it
+
+            var fetchedDocOpt = fetchDocument(url, doc, crawlDelay);
+            if (fetchedDocOpt.isEmpty()) continue;
+
+            if (Objects.equals(fetchedDocOpt.get().recrawlState, retainedTag)) {
+                retained ++;
             }
 
-            long referenceLsh = hashDoc(referenceDoc);
-            long newLsh = hashDoc(newDoc);
-
-            if (EasyLSH.hammingDistance(referenceLsh, newLsh) > 5) {
-                differences++;
-            }
-        }
-
-        if (differences > sampleSize/4) {
-            return CrawlDataComparison.CHANGES_FOUND;
-        }
-        else {
-            return CrawlDataComparison.NO_CHANGES;
+            recrawled ++;
         }
+
+        return recrawled;
     }
 
     private static final HashFunction hasher = Hashing.murmur3_128(0);
@@ -346,7 +336,7 @@
 
         var url = crawlFrontier.peek().withPathAndParam("/", null);
 
-        var maybeSample = fetchUrl(url).filter(sample -> sample.httpStatus == 200);
+        var maybeSample = fetchUrl(url, null).filter(sample -> sample.httpStatus == 200);
         if (maybeSample.isEmpty())
             return;
 
         var sample = maybeSample.get();
@@ -382,23 +372,21 @@
         }
     }
 
-    private Optional<CrawledDocument> fetchDocument(EdgeUrl top, long crawlDelay) {
+    private Optional<CrawledDocument> fetchDocument(EdgeUrl top,
+                                                    @Nullable CrawledDocument reference,
+                                                    long crawlDelay) {
         logger.debug("Fetching {}", top);
 
         long startTime = System.currentTimeMillis();
 
-        var doc = fetchUrl(top);
+        var doc = fetchUrl(top, reference);
         if (doc.isPresent()) {
             var d = doc.get();
             crawledDomainWriter.accept(d);
-            oldCrawlData.dispose(top);
 
             if (d.url != null) {
                 // We may have redirected to a different path
-                EdgeUrl.parse(d.url).ifPresent(url -> {
-                    crawlFrontier.addVisited(url);
-                    oldCrawlData.dispose(url);
-                });
+                EdgeUrl.parse(d.url).ifPresent(crawlFrontier::addVisited);
             }
 
             if ("ERROR".equals(d.crawlerStatus) && d.httpStatus != 404) {
@@ -418,14 +406,31 @@
                 || proto.equalsIgnoreCase("https");
     }
 
-    private Optional<CrawledDocument> fetchUrl(EdgeUrl top) {
+    private Optional<CrawledDocument> fetchUrl(EdgeUrl top, @Nullable CrawledDocument reference) {
         try {
-            var doc = fetchContent(top);
+            var contentTags = getContentTags(reference);
+            var fetchedDoc = fetchContent(top, contentTags);
+            CrawledDocument doc;
+
+            // HTTP status 304 is NOT MODIFIED, which means the document is the same as it was when
+            // we fetched it last time. We can recycle the reference document.
+            if (reference != null
+             && fetchedDoc.httpStatus == 304)
+            {
+                doc = reference;
+                doc.recrawlState = retainedTag;
+                doc.timestamp = LocalDateTime.now().toString();
+            }
+            else {
+                doc = fetchedDoc;
+            }
 
             if (doc.documentBody != null) {
-                doc.documentBodyHash = createHash(doc.documentBody.decode());
+                var decoded = doc.documentBody.decode();
 
-                Optional<Document> parsedDoc = parseDoc(doc);
+                doc.documentBodyHash = createHash(decoded);
+
+                Optional<Document> parsedDoc = parseDoc(decoded);
                 EdgeUrl url = new EdgeUrl(doc.url);
 
                 parsedDoc.ifPresent(parsed -> findLinks(url, parsed));
@@ -443,23 +448,37 @@
 
     }
 
+    private ContentTags getContentTags(@Nullable CrawledDocument reference) {
+        if (null == reference)
+            return ContentTags.empty();
+
+        String headers = reference.headers;
+        if (headers == null)
+            return ContentTags.empty();
+
+        String[] headersLines = headers.split("\n");
+
+        String lastmod = null;
+        String etag = null;
+
+        for (String line : headersLines) {
+            if (line.toLowerCase().startsWith("etag:")) {
+                etag = line.substring(5).trim();
+            }
+            if (line.toLowerCase().startsWith("last-modified:")) {
+                lastmod = line.substring(14).trim();
+            }
+        }
+
+        return new ContentTags(etag, lastmod);
+    }
+
     @SneakyThrows
-    private CrawledDocument fetchContent(EdgeUrl top) {
+    private CrawledDocument fetchContent(EdgeUrl top, ContentTags tags) {
         for (int i = 0; i < 2; i++) {
             try {
-                var doc = fetcher.fetchContent(top, oldCrawlData.getEtag(top), oldCrawlData.getLastModified(top));
-
+                var doc = fetcher.fetchContent(top, tags);
                 doc.recrawlState = "NEW";
-
-                if (doc.httpStatus == 304) {
-                    var referenceData = oldCrawlData.getDoc(top);
-                    if (referenceData != null) {
-                        referenceData.recrawlState = "304/UNCHANGED";
-                        return referenceData;
-                    }
-                }
-
                 return doc;
             }
             catch (RateLimitException ex) {
@@ -478,10 +497,8 @@
         return hashMethod.hashUnencodedChars(documentBodyHash).toString();
     }
 
-    private Optional<Document> parseDoc(CrawledDocument doc) {
-        if (doc.documentBody == null)
-            return Optional.empty();
-        return Optional.of(Jsoup.parse(doc.documentBody.decode()));
+    private Optional<Document> parseDoc(String decoded) {
+        return Optional.of(Jsoup.parse(decoded));
     }
 
     private void findLinks(EdgeUrl baseUrl, Document parsed) {
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/ContentTags.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/ContentTags.java
new file mode 100644
index 00000000..e1df86c8
--- /dev/null
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/ContentTags.java
@@ -0,0 +1,24 @@
+package nu.marginalia.crawl.retreival.fetcher;
+
+import okhttp3.Request;
+
+/** Encapsulates request modifiers; the ETag and Last-Modified tags for a resource */
+public record ContentTags(String etag, String lastMod) {
+    public static ContentTags empty() {
+        return new ContentTags(null, null);
+    }
+
+    public boolean isPresent() {
+        return etag != null || lastMod != null;
+    }
+
+    public boolean isEmpty() {
+        return etag == null && lastMod == null;
+    }
+
+    /** Paints the tags onto the request builder. */
+    public void paint(Request.Builder getBuilder) {
+        if (etag != null) getBuilder.addHeader("If-None-Match", etag);
+        if (lastMod != null) getBuilder.addHeader("If-Modified-Since", lastMod);
+    }
+}
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java
index 7f588783..11ad272e 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java
@@ -18,7 +18,7 @@ public interface HttpFetcher {
 
     FetchResult probeDomain(EdgeUrl url);
 
-    CrawledDocument fetchContent(EdgeUrl url, String etag, String lastMod) throws RateLimitException;
+    CrawledDocument fetchContent(EdgeUrl url, ContentTags tags) throws RateLimitException;
 
     SimpleRobotRules fetchRobotRules(EdgeDomain domain);
 
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java
index 36c8bd34..be6a6a06 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java
@@ -128,9 +128,15 @@ public class HttpFetcherImpl implements HttpFetcher {
 
     @Override
     @SneakyThrows
-    public CrawledDocument fetchContent(EdgeUrl url, String etag, String lastMod) throws RateLimitException {
+    public CrawledDocument fetchContent(EdgeUrl url,
+                                        ContentTags contentTags)
+            throws RateLimitException
+    {
 
-        if (contentTypeLogic.isUrlLikeBinary(url)) {
+        // We don't want to waste time and resources on URLs that are not HTML, so if the file ending
+        // looks like it might be something else, we perform a HEAD first to check the content type
+        if (contentTags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url))
+        {
             logger.debug("Probing suspected binary {}", url);
 
             var headBuilder = new Request.Builder().head()
@@ -146,6 +152,21 @@
             if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) {
                 return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, "Early probe failed");
             }
+
+            // Update the URL to the final URL of the HEAD request, otherwise we might end up doing
+
+            //  HEAD 301 url1 -> url2
+            //  HEAD 200 url2
+            //  GET 301 url1 -> url2
+            //  GET 200 url2
+
+            // which is not what we want. Overall we want to do as few requests as possible to not raise
+            // too many eyebrows when looking at the logs on the target server. Overall it's probably desirable
+            // that it looks like the traffic makes sense, as opposed to looking like a broken bot.
+
+            var redirectUrl = new EdgeUrl(rsp.request().url().toString());
+            if (Objects.equals(redirectUrl.domain, url.domain))
+                url = redirectUrl;
         }
         catch (SocketTimeoutException ex) {
             return createTimeoutErrorRsp(url, ex);
@@ -157,12 +178,12 @@
         }
 
         var getBuilder = new Request.Builder().get();
+
         getBuilder.addHeader("User-agent", userAgent)
                 .url(url.toString())
                 .addHeader("Accept-Encoding", "gzip");
 
-        if (etag != null) getBuilder.addHeader("If-None-Match", etag);
-        if (lastMod != null) getBuilder.addHeader("If-Modified-Since", lastMod);
+        contentTags.paint(getBuilder);
 
         var get = getBuilder.build();
         var call = client.newCall(get);
@@ -314,7 +335,7 @@
    private Optional<SimpleRobotRules> fetchRobotsForProto(String proto, EdgeDomain domain) {
         try {
             var url = new EdgeUrl(proto, domain, null, "/robots.txt", null);
-            return Optional.of(parseRobotsTxt(fetchContent(url, null, null)));
+            return Optional.of(parseRobotsTxt(fetchContent(url, ContentTags.empty())));
         }
         catch (Exception ex) {
             return Optional.empty();
diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java
index 2ea9c763..5893910f 100644
--- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java
+++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java
@@ -2,6 +2,7 @@ package nu.marginalia.crawling;
 
 import lombok.SneakyThrows;
 import nu.marginalia.crawl.retreival.RateLimitException;
+import nu.marginalia.crawl.retreival.fetcher.ContentTags;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.retreival.logic.ContentTypeLogic;
 import nu.marginalia.model.EdgeUrl;
@@ -29,14 +30,14 @@ class HttpFetcherTest {
     @Test
     void fetchUTF8() throws URISyntaxException, RateLimitException {
         var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
-        var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), null, null);
+        var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), ContentTags.empty());
         System.out.println(str.contentType);
     }
 
     @Test
     void fetchText() throws URISyntaxException, RateLimitException {
         var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
-        var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), null, null);
+        var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), ContentTags.empty());
         System.out.println(str);
     }
 }
\ No newline at end of file
diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java
index f580a123..59e3c45e 100644
--- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java
+++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java
@@ -4,10 +4,7 @@ import crawlercommons.robots.SimpleRobotRules;
 import lombok.SneakyThrows;
 import nu.marginalia.bigstring.BigString;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
-import nu.marginalia.crawl.retreival.fetcher.FetchResult;
-import nu.marginalia.crawl.retreival.fetcher.FetchResultState;
-import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
-import nu.marginalia.crawl.retreival.fetcher.SitemapRetriever;
+import nu.marginalia.crawl.retreival.fetcher.*;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawlerDocumentStatus;
 import nu.marginalia.crawling.model.SerializableCrawlData;
@@ -126,7 +123,7 @@ public class CrawlerMockFetcherTest {
         }
 
         @Override
-        public CrawledDocument fetchContent(EdgeUrl url, String etag, String lastModified) {
+        public CrawledDocument fetchContent(EdgeUrl url, ContentTags tags) {
             logger.info("Fetching {}", url);
             if (mockData.containsKey(url)) {
                 return mockData.get(url);
diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
index 741c8704..009e9084 100644
--- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
+++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
@@ -5,12 +5,18 @@ import nu.marginalia.WmsaHome;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
+import nu.marginalia.crawling.io.CrawledDomainReader;
+import nu.marginalia.crawling.io.CrawledDomainWriter;
+import nu.marginalia.crawling.io.CrawlerOutputFile;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawledDomain;
 import nu.marginalia.crawling.model.spec.CrawlingSpecification;
 import nu.marginalia.crawling.model.SerializableCrawlData;
 import org.junit.jupiter.api.*;
 
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
@@ -99,7 +105,7 @@ class CrawlerRetreiverTest {
     }
 
     @Test
-    public void testRecrawl() {
+    public void testRecrawl() throws IOException {
 
         var specs = CrawlingSpecification
                 .builder()
@@ -110,6 +116,8 @@
                 .build();
 
+        Path out = Files.createTempDirectory("crawling-process");
+        var writer = new CrawledDomainWriter(out, "test", "123456");
         Map<Class<? extends SerializableCrawlData>, List<SerializableCrawlData>> data = new HashMap<>();
 
         new CrawlerRetreiver(httpFetcher, specs, d -> {
@@ -117,7 +125,12 @@
             if (d instanceof CrawledDocument doc) {
                 System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus);
             }
+            writer.accept(d);
         }).fetch();
+        writer.close();
+
+        var reader = new CrawledDomainReader();
+        var iter = reader.createIterator(CrawlerOutputFile.getOutputFile(out, "123456", "test"));
 
         CrawledDomain domain = (CrawledDomain) data.get(CrawledDomain.class).get(0);
         domain.doc = data.get(CrawledDocument.class).stream().map(CrawledDocument.class::cast).collect(Collectors.toList());
@@ -128,6 +141,7 @@
             if (d instanceof CrawledDocument doc) {
                 System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus);
             }
-        }).fetch();
+        }).fetch(iter);
+
     }
 }
\ No newline at end of file
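
Supplementary sketch: streaming the previous crawl instead of loading it into RAM. The patch replaces the read-everything path with CrawledDomainReader.createIterator(), which decompresses the zstd stream and yields one SerializableCrawlData record at a time. Below is a minimal sketch of the intended consumption pattern; the reader and model classes come from the patch, while the ReplaySketch class, its main method and the argument handling are hypothetical.

package nu.marginalia.sketch; // hypothetical package, illustration only

import nu.marginalia.crawling.io.CrawledDomainReader;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.crawling.model.SerializableCrawlData;

import java.io.IOException;
import java.nio.file.Path;
import java.util.Iterator;

public class ReplaySketch {
    public static void main(String[] args) throws IOException {
        // Path to a previous crawl's output file, passed on the command line
        Path oldCrawlFile = Path.of(args[0]);

        var reader = new CrawledDomainReader();
        Iterator<SerializableCrawlData> it = reader.createIterator(oldCrawlFile);

        // hasNext() reads the identifier and data lines from the stream and closes it at EOF,
        // so each hasNext() call should be followed by exactly one next() call, as here
        // and as in CrawlerRetreiver.recrawl().
        while (it.hasNext()) {
            SerializableCrawlData record = it.next();

            if (record instanceof CrawledDocument doc) {
                System.out.println(doc.url + " -> " + doc.httpStatus);
            }
            else if (record instanceof CrawledDomain domain) {
                System.out.println("domain: " + domain.domain);
            }
        }
    }
}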
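
Supplementary sketch: what ContentTags does on the wire. paint() adds If-None-Match and If-Modified-Since headers built from the stored document's ETag and Last-Modified values, and a 304 Not Modified response then lets CrawlerRetreiver.fetchUrl() recycle the stored document under the RETAINED/304 tag instead of downloading the body again. The sketch below shows the bare HTTP exchange with OkHttp, independent of the crawler classes; the URL and header values are placeholders.

import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;

import java.io.IOException;

class ConditionalGetSketch {
    static void demo() throws IOException {
        OkHttpClient client = new OkHttpClient();

        // Values that would normally be parsed out of the stored CrawledDocument's headers
        String etag = "\"abc123\"";                            // placeholder
        String lastModified = "Fri, 21 Jul 2023 00:00:00 GMT"; // placeholder

        var getBuilder = new Request.Builder().get()
                .url("https://www.example.com/")               // placeholder URL
                .addHeader("Accept-Encoding", "gzip");

        // Equivalent to ContentTags.paint(getBuilder) in the patch
        if (etag != null) getBuilder.addHeader("If-None-Match", etag);
        if (lastModified != null) getBuilder.addHeader("If-Modified-Since", lastModified);

        try (Response rsp = client.newCall(getBuilder.build()).execute()) {
            if (rsp.code() == 304) {
                // Not modified: recycle the stored document and tag it RETAINED/304
            }
            else {
                // Modified (or no conditional support): treat rsp.body() as a fresh fetch
            }
        }
    }
}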
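
Supplementary sketch: the direct-load heuristic in recrawl(). Once more than ten old documents have been replayed and over 90% of the conditional fetches came back unchanged, the patch writes the remaining stored documents straight to the output with 75% probability, so roughly a quarter of them are still re-validated against the server and content drift is eventually caught. Restated as a standalone predicate, with the thresholds copied from the patch; the class and method names are illustrative.

class DirectLoadHeuristicSketch {
    /** True when a stored document may be emitted without re-fetching it;
     *  the thresholds (10 documents, 90% retained, 75% probability) mirror recrawl(). */
    static boolean mayDirectLoad(int recrawled, int retained) {
        return recrawled > 10
                && retained > 0.9 * recrawled
                && Math.random() < 0.75;
    }
}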