From 69f333c0bf8bf76da9f7bc1a91be285c6284886a Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Sun, 23 Jul 2023 18:59:14 +0200
Subject: [PATCH] (crawler) Clean up and refactor the code a bit

---
 .../crawl/retreival/CrawlDataReference.java   |  20 +-
 .../crawl/retreival/CrawlDelayTimer.java      |  57 ++++
 .../crawl/retreival/CrawlerRetreiver.java     | 265 ++++++++----------
 .../crawl/retreival/DomainCrawlFrontier.java  |   2 +-
 .../retreival/CrawlerMockFetcherTest.java     |   3 -
 .../retreival/CrawlerRetreiverTest.java       |   5 +-
 6 files changed, 190 insertions(+), 162 deletions(-)
 create mode 100644 code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java

diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java
index 4c4a33d8..8f331a65 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java
@@ -1,24 +1,19 @@
 package nu.marginalia.crawl.retreival;

-import com.google.common.hash.HashCode;
 import com.google.common.hash.HashFunction;
 import com.google.common.hash.Hashing;
 import nu.marginalia.bigstring.BigString;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.SerializableCrawlData;
 import nu.marginalia.lsh.EasyLSH;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;

 import javax.annotation.Nullable;
 import java.util.*;

 /** A reference to a domain that has been crawled before. */
 public class CrawlDataReference {
-    private final Logger logger = LoggerFactory.getLogger(CrawlDataReference.class);

     private final Iterator<SerializableCrawlData> data;
-    private final HashFunction hashFunction = Hashing.murmur3_128();

     public CrawlDataReference(Iterator<SerializableCrawlData> data) {
         this.data = data;
@@ -38,7 +33,7 @@ public class CrawlDataReference {
         return null;
     }

-    public boolean isContentSame(CrawledDocument one, CrawledDocument other) {
+    public boolean isContentBodySame(CrawledDocument one, CrawledDocument other) {
         assert one.documentBody != null;
         assert other.documentBody != null;

@@ -48,13 +43,15 @@ public class CrawlDataReference {
         return EasyLSH.hammingDistance(contentHashOne, contentHashOther) < 4;
     }

-
     private long contentHash(BigString documentBody) {
         String content = documentBody.decode();
         EasyLSH hash = new EasyLSH();
         int next = 0;

         boolean isInTag = false;
+
+        // In a naive best-effort fashion, extract the text
+        // content of the document and feed it into the LSH
         for (int i = 0; i < content.length(); i++) {
             char c = content.charAt(i);
             if (c == '<') {
@@ -62,12 +59,17 @@ public class CrawlDataReference {
             } else if (c == '>') {
                 isInTag = false;
             } else if (!isInTag) {
-                next = (next << 8) | (byte) c;
-                hash.addHashUnordered(hashFunction.hashInt(next).asInt());
+                next = (next << 8) | (c & 0xff);
+                hash.addHashUnordered(hashInt(next));
             }
         }

         return hash.get();
     }

+    private final HashFunction hashFunction = Hashing.murmur3_128();
+    private int hashInt(int v) {
+        return hashFunction.hashInt(v).asInt();
+    }
+
 }
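Note on the contentHash() change above: casting the char to byte sign-extends it when it is promoted back to int, so any character at or above 0x80 would OR a value like 0xffffffe5 into the rolling accumulator and wipe out the previously shifted-in characters; masking with 0xff keeps only the low byte. A standalone illustration (editor's sketch, not part of the patch):

    class SignExtensionSketch {
        public static void main(String[] args) {
            char c = 'å';                 // 0xe5, i.e. any character >= 0x80
            int signExtended = (byte) c;  // promoted to int as 0xffffffe5
            int masked = c & 0xff;        // 0x000000e5, only the low byte survives

            // ORing the sign-extended value into (next << 8) sets every high bit,
            // erasing the window of previously accumulated characters.
            System.out.printf("%08x vs %08x%n", signExtended, masked);
        }
    }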
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java
new file mode 100644
index 00000000..ca2494dc
--- /dev/null
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java
@@ -0,0 +1,57 @@
+package nu.marginalia.crawl.retreival;
+
+import lombok.SneakyThrows;
+
+import static java.lang.Math.max;
+import static java.lang.Math.min;
+
+public class CrawlDelayTimer {
+
+    // When no crawl delay is specified, lean toward twice the fetch+process time, within these limits:
+    private static final long DEFAULT_CRAWL_DELAY_MIN_MS = Long.getLong("defaultCrawlDelay", 1000);
+    private static final long DEFAULT_CRAWL_DELAY_MAX_MS = Long.getLong("defaultCrawlDelaySlow", 2500);
+
+    /** Flag to indicate that the crawler should slow down, e.g. from 429s */
+    private boolean slowDown = false;
+
+    private final long delayTime;
+
+    public CrawlDelayTimer(long delayTime) {
+        this.delayTime = delayTime;
+    }
+
+    @SneakyThrows
+    public void delay(long spentTime) {
+        long sleepTime = delayTime;
+
+        if (sleepTime >= 1) {
+            if (spentTime > sleepTime)
+                return;
+
+            Thread.sleep(min(sleepTime - spentTime, 5000));
+        }
+        else if (slowDown) {
+            // Additional delay when the server is signalling it wants slower requests
+            Thread.sleep( DEFAULT_CRAWL_DELAY_MIN_MS);
+        }
+        else {
+            // When no crawl delay is specified, lean toward twice the fetch+process time,
+            // within sane limits. This means slower servers get slower crawling, and faster
+            // servers get faster crawling.
+
+            sleepTime = spentTime * 2;
+            sleepTime = min(sleepTime, DEFAULT_CRAWL_DELAY_MAX_MS);
+            sleepTime = max(sleepTime, DEFAULT_CRAWL_DELAY_MIN_MS);
+
+            if (spentTime > sleepTime)
+                return;
+
+            Thread.sleep(sleepTime - spentTime);
+        }
+    }
+
+    /** Increase the delay between requests if the server is signalling it wants slower requests with HTTP 429 */
+    public void slowDown() {
+        slowDown = true;
+    }
+}
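Note: CrawlDelayTimer is consumed by CrawlerRetreiver in the next file of this patch. A minimal usage sketch (illustration only, not part of the patch; fetchOneDocument() is a hypothetical stand-in for the real fetch-and-process step):

    class CrawlDelayTimerUsageSketch {
        static void crawlOne(CrawlDelayTimer timer, String url) {
            long startTime = System.currentTimeMillis();

            fetchOneDocument(url);  // hypothetical: fetch and process a single document

            // Sleeps according to the robots.txt crawl delay when one is given,
            // otherwise roughly twice the time just spent, clamped to the limits above.
            timer.delay(System.currentTimeMillis() - startTime);
        }

        static void fetchOneDocument(String url) { /* stand-in */ }
    }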
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
index 48587bdb..ebdbd4f0 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
@@ -11,7 +11,6 @@ import nu.marginalia.crawling.model.spec.CrawlingSpecification;
 import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.crawling.model.*;
 import nu.marginalia.ip_blocklist.UrlBlocklist;
-import nu.marginalia.lsh.EasyLSH;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import org.jsoup.Jsoup;
@@ -26,25 +25,12 @@ import java.time.LocalDateTime;
 import java.util.*;
 import java.util.function.Consumer;

-import static java.lang.Math.max;
-import static java.lang.Math.min;
-
 public class CrawlerRetreiver {
-    private static final long DEFAULT_CRAWL_DELAY_MIN_MS = Long.getLong("defaultCrawlDelay", 1000);
-    private static final long DEFAULT_CRAWL_DELAY_MAX_MS = Long.getLong("defaultCrawlDelaySlow", 2500);

     private static final int MAX_ERRORS = 20;

     private final HttpFetcher fetcher;
-
-    /** Flag to indicate that the crawler should slow down, e.g. from 429s */
-    private boolean slowDown = false;
-
-
-    /** Testing flag to disable crawl delay (otherwise crawler tests take several minutes) */
-    private boolean testFlagIgnoreDelay = false;
-
     private final String id;
     private final String domain;
     private final Consumer<SerializableCrawlData> crawledDomainWriter;

@@ -61,7 +47,12 @@ public class CrawlerRetreiver {
     private final DomainCrawlFrontier crawlFrontier;

     int errorCount = 0;
-    private String retainedTag = "RETAINED/304";
+
+    /** recrawlState tag for documents that had a HTTP status 304 */
+    private static final String documentWasRetainedTag = "RETAINED/304";
+
+    /** recrawlState tag for documents that had a 200 status but were identical to a previous version */
+    private static final String documentWasSameTag = "SAME-BY-COMPARISON";

     public CrawlerRetreiver(HttpFetcher fetcher,
                             CrawlingSpecification specs,
@@ -91,11 +82,6 @@ public class CrawlerRetreiver {
         }
     }

-    public CrawlerRetreiver withNoDelay() {
-        testFlagIgnoreDelay = true;
-        return this;
-    }
-
     public int fetch() {
         return fetch(new CrawlDataReference());
     }
@@ -146,13 +132,13 @@ public class CrawlerRetreiver {

         assert !crawlFrontier.isEmpty();

-        var robotsRules = fetcher.fetchRobotRules(crawlFrontier.peek().domain);
-        long crawlDelay = robotsRules.getCrawlDelay();
+        final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(crawlFrontier.peek().domain);
+        final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay());

-        sniffRootDocument();
+        sniffRootDocument(delayTimer);

         // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
-        int recrawled = recrawl(oldCrawlData, robotsRules, crawlDelay);
+        int recrawled = recrawl(oldCrawlData, robotsRules, delayTimer);

         if (recrawled > 0) {
             // If we have reference data, we will always grow the crawl depth a bit
@@ -195,7 +181,7 @@ public class CrawlerRetreiver {

                 continue;

-            if (fetchDocument(top, null, crawlDelay).isPresent()) {
+            if (fetchWriteAndSleep(top, delayTimer, DocumentWithReference.empty()).isPresent()) {
                 fetchedCount++;
             }
         }
@@ -207,9 +193,10 @@ public class CrawlerRetreiver {
         return fetchedCount;
     }

+    /** Performs a re-crawl of old documents, comparing etags and last-modified */
     private int recrawl(CrawlDataReference oldCrawlData,
                         SimpleRobotRules robotsRules,
-                        long crawlDelay) {
+                        CrawlDelayTimer delayTimer) {
         int recrawled = 0;
         int retained = 0;

@@ -247,8 +234,6 @@ public class CrawlerRetreiver {
                     && retained > 0.9 * recrawled
                     && Math.random() < 0.75)
             {
-                logger.info("Direct-loading {}", url);
-
                 // Since it looks like most of these documents haven't changed,
                 // we'll load the documents directly; but we do this in a random
                 // fashion to make sure we eventually catch changes over time
@@ -263,15 +248,13 @@ public class CrawlerRetreiver {
             // providing etag and last-modified headers, so we can recycle the
             // document if it hasn't changed without actually downloading it

-            var fetchedDocOpt = fetchDocument(url, doc, crawlDelay);
+            var fetchedDocOpt = fetchWriteAndSleep(url,
+                    delayTimer,
+                    new DocumentWithReference(doc, oldCrawlData));
             if (fetchedDocOpt.isEmpty()) continue;

-            if (Objects.equals(fetchedDocOpt.get().recrawlState, retainedTag)) {
-                retained ++;
-            }
-            else if (oldCrawlData.isContentSame(doc, fetchedDocOpt.get())) {
-                retained ++;
-            }
+            if (documentWasRetainedTag.equals(fetchedDocOpt.get().recrawlState)) retained ++;
+            else if (documentWasSameTag.equals(fetchedDocOpt.get().recrawlState)) retained ++;

             recrawled ++;
         }
@@ -279,18 +262,6 @@ public class CrawlerRetreiver {
         return recrawled;
     }

-    private static final HashFunction hasher = Hashing.murmur3_128(0);
-    private long hashDoc(CrawledDocument doc) {
-        var hash = new EasyLSH();
-        long val = 0;
-        for (var b : doc.documentBody.decode().getBytes()) {
-            val = val << 8 | (b & 0xFF);
-            hash.addUnordered(hasher.hashLong(val).asLong());
-        }
-        return hash.get();
-    }
-
-
     private void downloadSitemaps(SimpleRobotRules robotsRules) {
         List<String> sitemaps = robotsRules.getSitemaps();
         if (sitemaps.isEmpty()) {
@@ -337,13 +308,13 @@
         logger.debug("Queue is now {}", crawlFrontier.queueSize());
     }

-    private void sniffRootDocument() {
+    private void sniffRootDocument(CrawlDelayTimer delayTimer) {
         try {
             logger.debug("Configuring link filter");

             var url = crawlFrontier.peek().withPathAndParam("/", null);

-            var maybeSample = fetchUrl(url, null).filter(sample -> sample.httpStatus == 200);
+            var maybeSample = fetchUrl(url, delayTimer, DocumentWithReference.empty()).filter(sample -> sample.httpStatus == 200);
             if (maybeSample.isEmpty())
                 return;
             var sample = maybeSample.get();
@@ -379,33 +350,41 @@
         }
     }

-    private Optional<CrawledDocument> fetchDocument(EdgeUrl top,
-                                                    @Nullable CrawledDocument reference,
-                                                    long crawlDelay) {
+    private Optional<CrawledDocument> fetchWriteAndSleep(EdgeUrl top,
+                                                         CrawlDelayTimer timer,
+                                                         DocumentWithReference reference) {
         logger.debug("Fetching {}", top);

         long startTime = System.currentTimeMillis();

-        var doc = fetchUrl(top, reference);
-        if (doc.isPresent()) {
-            var d = doc.get();
-            crawledDomainWriter.accept(d);
+        var docOpt = fetchUrl(top, timer, reference);

-            if (d.url != null) {
-                // We may have redirected to a different path
-                EdgeUrl.parse(d.url).ifPresent(crawlFrontier::addVisited);
+        if (docOpt.isPresent()) {
+            var doc = docOpt.get();
+
+            if (!Objects.equals(doc.recrawlState, documentWasRetainedTag)
+                && reference.isContentBodySame(doc))
+            {
+                // The document didn't change since the last time
+                doc.recrawlState = documentWasSameTag;
             }

-            if ("ERROR".equals(d.crawlerStatus) && d.httpStatus != 404) {
+            crawledDomainWriter.accept(doc);
+
+            if (doc.url != null) {
+                // We may have redirected to a different path
+                EdgeUrl.parse(doc.url).ifPresent(crawlFrontier::addVisited);
+            }
+
+            if ("ERROR".equals(doc.crawlerStatus) && doc.httpStatus != 404) {
                 errorCount++;
             }

         }

-        long crawledTime = System.currentTimeMillis() - startTime;
-        delay(crawlDelay, crawledTime);
+        timer.delay(System.currentTimeMillis() - startTime);

-        return doc;
+        return docOpt;
     }

     private boolean isAllowedProtocol(String proto) {
@@ -413,35 +392,23 @@
                 || proto.equalsIgnoreCase("https");
     }

-    private Optional<CrawledDocument> fetchUrl(EdgeUrl top, @Nullable CrawledDocument reference) {
+    private Optional<CrawledDocument> fetchUrl(EdgeUrl top, CrawlDelayTimer timer, DocumentWithReference reference) {
         try {
-            var contentTags = getContentTags(reference);
-            var fetchedDoc = fetchContent(top, contentTags);
-            CrawledDocument doc;
+            var contentTags = reference.getContentTags();
+            var fetchedDoc = tryDownload(top, timer, contentTags);

-            // HTTP status 304 is NOT MODIFIED, which means the document is the same as it was when
-            // we fetched it last time. We can recycle the reference document.
-            if (reference != null
-                    && fetchedDoc.httpStatus == 304)
-            {
-                doc = reference;
-                doc.recrawlState = retainedTag;
-                doc.timestamp = LocalDateTime.now().toString();
-            }
-            else {
-                doc = fetchedDoc;
-            }
+            CrawledDocument doc = reference.replaceOn304(fetchedDoc);

             if (doc.documentBody != null) {
                 var decoded = doc.documentBody.decode();

                 doc.documentBodyHash = createHash(decoded);

-                Optional<Document> parsedDoc = parseDoc(decoded);
+                var parsedDoc = Jsoup.parse(decoded);
                 EdgeUrl url = new EdgeUrl(doc.url);

-                parsedDoc.ifPresent(parsed -> findLinks(url, parsed));
-                parsedDoc.flatMap(parsed -> findCanonicalUrl(url, parsed))
+                findLinks(url, parsedDoc);
+                findCanonicalUrl(url, parsedDoc)
                         .ifPresent(canonicalLink -> doc.canonicalUrl = canonicalLink.toString());
             }

@@ -455,33 +422,9 @@
         }

-    private ContentTags getContentTags(@Nullable CrawledDocument reference) {
-        if (null == reference)
-            return ContentTags.empty();
-
-        String headers = reference.headers;
-        if (headers == null)
-            return ContentTags.empty();
-
-        String[] headersLines = headers.split("\n");
-
-        String lastmod = null;
-        String etag = null;
-
-        for (String line : headersLines) {
-            if (line.toLowerCase().startsWith("etag:")) {
-                etag = line.substring(5).trim();
-            }
-            if (line.toLowerCase().startsWith("last-modified:")) {
-                lastmod = line.substring(14).trim();
-            }
-        }
-
-        return new ContentTags(etag, lastmod);
-    }

     @SneakyThrows
-    private CrawledDocument fetchContent(EdgeUrl top, ContentTags tags) {
+    private CrawledDocument tryDownload(EdgeUrl top, CrawlDelayTimer timer, ContentTags tags) {
         for (int i = 0; i < 2; i++) {
             try {
                 var doc = fetcher.fetchContent(top, tags);
@@ -489,7 +432,8 @@
                 return doc;
             }
             catch (RateLimitException ex) {
-                slowDown = true;
+                timer.slowDown();
+
                 int delay = ex.retryAfter();
                 if (delay > 0 && delay < 5000) {
                     Thread.sleep(delay);
@@ -504,10 +448,6 @@
         return hashMethod.hashUnencodedChars(documentBodyHash).toString();
     }

-    private Optional<Document> parseDoc(String decoded) {
-        return Optional.of(Jsoup.parse(decoded));
-    }
-
     private void findLinks(EdgeUrl baseUrl, Document parsed) {
         baseUrl = linkParser.getBaseLink(parsed, baseUrl);

@@ -547,36 +487,6 @@
         }
     }

-    @SneakyThrows
-    private void delay(long sleepTime, long spentTime) {
-        if (testFlagIgnoreDelay)
-            return;
-
-        if (sleepTime >= 1) {
-            if (spentTime > sleepTime)
-                return;
-
-            Thread.sleep(min(sleepTime - spentTime, 5000));
-        }
-        else if (slowDown) {
-            Thread.sleep( 1000);
-        }
-        else {
-            // When no crawl delay is specified, lean toward twice the fetch+process time,
-            // within sane limits. This means slower servers get slower crawling, and faster
-            // servers get faster crawling.
-
-            sleepTime = spentTime * 2;
-            sleepTime = min(sleepTime, DEFAULT_CRAWL_DELAY_MAX_MS);
-            sleepTime = max(sleepTime, DEFAULT_CRAWL_DELAY_MIN_MS);
-
-            if (spentTime > sleepTime)
-                return;
-
-            Thread.sleep(sleepTime - spentTime);
-        }
-    }
-
     private CrawledDocument createRobotsError(EdgeUrl url) {
         return CrawledDocument.builder()
                 .url(url.toString())
@@ -594,12 +504,71 @@
             .build();
     }

+    private record DocumentWithReference(
+            @Nullable CrawledDocument doc,
+            @Nullable CrawlDataReference reference) {

-    enum CrawlDataComparison {
-        NO_OLD_DATA,
-        SMALL_SAMPLE,
-        CHANGES_FOUND,
-        NO_CHANGES
-    };
+        private static final DocumentWithReference emptyInstance = new DocumentWithReference(null, null);
+        public static DocumentWithReference empty() {
+            return emptyInstance;
+        }
+
+        public boolean isContentBodySame(CrawledDocument newDoc) {
+            if (reference == null)
+                return false;
+            if (doc == null)
+                return false;
+
+            return reference.isContentBodySame(doc, newDoc);
+        }
+
+        private ContentTags getContentTags() {
+            if (null == doc)
+                return ContentTags.empty();
+
+            String headers = doc.headers;
+            if (headers == null)
+                return ContentTags.empty();
+
+            String[] headersLines = headers.split("\n");
+
+            String lastmod = null;
+            String etag = null;
+
+            for (String line : headersLines) {
+                if (line.toLowerCase().startsWith("etag:")) {
+                    etag = line.substring(5).trim();
+                }
+                if (line.toLowerCase().startsWith("last-modified:")) {
+                    lastmod = line.substring(14).trim();
+                }
+            }
+
+            return new ContentTags(etag, lastmod);
+        }
+
+        public boolean isEmpty() {
+            return doc == null || reference == null;
+        }
+
+        /** If the provided document has HTTP status 304, and the reference document is provided,
+         * return the reference document; otherwise return the provided document.
+         */
+        public CrawledDocument replaceOn304(CrawledDocument fetchedDoc) {
+
+            if (doc == null)
+                return fetchedDoc;
+
+            // HTTP status 304 is NOT MODIFIED, which means the document is the same as it was when
+            // we fetched it last time. We can recycle the reference document.
+            if (fetchedDoc.httpStatus != 304)
+                return fetchedDoc;
+
+            var ret = doc;
+            ret.recrawlState = documentWasRetainedTag;
+            ret.timestamp = LocalDateTime.now().toString();
+            return ret;
+        }
+    }

 }
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java
index 7d5fc214..4b9cc265 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java
@@ -80,7 +80,7 @@ public class DomainCrawlFrontier {
             return;

         // reduce memory usage by not growing queue huge when crawling large sites
-        if (queue.size() + visited.size() >= depth + 100)
+        if (queue.size() + visited.size() >= depth + 1000)
             return;

         if (visited.contains(url.toString()))
diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java
index ea1dd08e..ae8e4679 100644
--- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java
+++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java
@@ -71,7 +71,6 @@ public class CrawlerMockFetcherTest {
         registerUrlClasspathData(new EdgeUrl("https://startrek.website/post/108995"), "mock-crawl-data/lemmy/108995.html");

         new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 10, "startrek.website", new ArrayList<>()), out::add)
-                .withNoDelay()
                 .fetch();

         out.forEach(System.out::println);
@@ -84,7 +83,6 @@ public class CrawlerMockFetcherTest {
         registerUrlClasspathData(new EdgeUrl("https://en.wikipedia.org/"), "mock-crawl-data/mediawiki/index.html");

         new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 10, "en.wikipedia.org", new ArrayList<>()), out::add)
-                .withNoDelay()
                 .fetch();

         out.forEach(System.out::println);
@@ -99,7 +97,6 @@ public class CrawlerMockFetcherTest {
         registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/combined-mode-but-grid/4489"), "mock-crawl-data/discourse/grid.html");

         new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 100, "community.tt-rss.org", new ArrayList<>()), out::add)
-                .withNoDelay()
                 .fetch();

         out.forEach(System.out::println);
diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
index 2a37707f..bb4dd6f4 100644
--- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
+++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
@@ -118,13 +118,16 @@ class CrawlerRetreiverTest {
         Path out = Files.createTempDirectory("crawling-process");

-        var writer = new CrawledDomainWriter(out, "www.marginalia.nu", "123456");
+        var writer = new CrawledDomainWriter(out, specs.domain, specs.id);

         Map<Class<? extends SerializableCrawlData>, List<SerializableCrawlData>> data = new HashMap<>();

         new CrawlerRetreiver(httpFetcher, specs, d -> {
             data.computeIfAbsent(d.getClass(), k->new ArrayList<>()).add(d);
             if (d instanceof CrawledDocument doc) {
                 System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus);
+                if (Math.random() > 0.5) {
""; + } } writer.accept(d); }).fetch();