Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git
(crawler) Clean up and refactor the code a bit

commit 69f333c0bf (parent c069c8c182)
CrawlDataReference.java

@@ -1,24 +1,19 @@
 package nu.marginalia.crawl.retreival;
 
-import com.google.common.hash.HashCode;
 import com.google.common.hash.HashFunction;
 import com.google.common.hash.Hashing;
 import nu.marginalia.bigstring.BigString;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.SerializableCrawlData;
 import nu.marginalia.lsh.EasyLSH;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 
 import javax.annotation.Nullable;
 import java.util.*;
 
 /** A reference to a domain that has been crawled before. */
 public class CrawlDataReference {
-    private final Logger logger = LoggerFactory.getLogger(CrawlDataReference.class);
 
     private final Iterator<SerializableCrawlData> data;
-    private final HashFunction hashFunction = Hashing.murmur3_128();
 
     public CrawlDataReference(Iterator<SerializableCrawlData> data) {
         this.data = data;
@@ -38,7 +33,7 @@ public class CrawlDataReference {
         return null;
     }
 
-    public boolean isContentSame(CrawledDocument one, CrawledDocument other) {
+    public boolean isContentBodySame(CrawledDocument one, CrawledDocument other) {
         assert one.documentBody != null;
         assert other.documentBody != null;
 
@@ -48,13 +43,15 @@ public class CrawlDataReference {
         return EasyLSH.hammingDistance(contentHashOne, contentHashOther) < 4;
     }
 
     private long contentHash(BigString documentBody) {
         String content = documentBody.decode();
         EasyLSH hash = new EasyLSH();
         int next = 0;
 
         boolean isInTag = false;
 
+        // In a naive best-effort fashion, extract the text
+        // content of the document and feed it into the LSH
         for (int i = 0; i < content.length(); i++) {
             char c = content.charAt(i);
             if (c == '<') {
@@ -62,12 +59,17 @@ public class CrawlDataReference {
             } else if (c == '>') {
                 isInTag = false;
             } else if (!isInTag) {
-                next = (next << 8) | (byte) c;
-                hash.addHashUnordered(hashFunction.hashInt(next).asInt());
+                next = (next << 8) | (c & 0xff);
+                hash.addHashUnordered(hashInt(next));
             }
         }
 
         return hash.get();
     }
 
+    private final HashFunction hashFunction = Hashing.murmur3_128();
+    private int hashInt(int v) {
+        return hashFunction.hashInt(v).asInt();
+    }
+
 }
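Note: the reworked contentHash above strips markup in a single pass, slides a 4-byte window over the remaining text, and feeds a hash of each window into the LSH; isContentBodySame then treats two documents as the same when the Hamming distance between their fingerprints is below 4. EasyLSH's internals are not part of this commit, so the following is only a rough illustration of the idea: a minimal, self-contained SimHash-style sketch in plain Java. The class and helper names are made up for illustration and are not the project's API.

/** Minimal SimHash-style sketch: order-insensitive accumulation of feature
 *  hashes, compared by Hamming distance. Illustrative stand-in for the role
 *  EasyLSH plays above; not the project's implementation. */
class SimHashSketch {
    private final int[] votes = new int[64];

    /** Accumulate a feature hash; the order of calls does not matter. */
    void addHashUnordered(long featureHash) {
        for (int bit = 0; bit < 64; bit++) {
            if (((featureHash >>> bit) & 1L) == 1L) votes[bit]++;
            else votes[bit]--;
        }
    }

    /** Collapse the votes into a 64-bit fingerprint (majority vote per bit). */
    long get() {
        long fingerprint = 0;
        for (int bit = 0; bit < 64; bit++) {
            if (votes[bit] > 0) fingerprint |= 1L << bit;
        }
        return fingerprint;
    }

    /** Number of differing bits; a small distance means similar documents. */
    static int hammingDistance(long a, long b) {
        return Long.bitCount(a ^ b);
    }

    /** Same rolling 4-byte window as contentHash above, minus the tag stripping. */
    static long fingerprint(String content) {
        var lsh = new SimHashSketch();
        int next = 0;
        for (int i = 0; i < content.length(); i++) {
            next = (next << 8) | (content.charAt(i) & 0xff);
            lsh.addHashUnordered(mix64(next));
        }
        return lsh.get();
    }

    /** MurmurHash3 finalizer, standing in for murmur3_128 over the window. */
    static long mix64(long z) {
        z = (z ^ (z >>> 33)) * 0xff51afd7ed558ccdL;
        z = (z ^ (z >>> 33)) * 0xc4ceb9fe1a85ec53L;
        return z ^ (z >>> 33);
    }

    public static void main(String[] args) {
        long a = fingerprint("the quick brown fox jumps over the lazy dog");
        long b = fingerprint("the quick brown fox jumped over the lazy dog");
        // Similar inputs tend to land close together; CrawlDataReference treats distance < 4 as "same"
        System.out.println("hamming distance: " + hammingDistance(a, b));
    }
}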
CrawlDelayTimer.java (new file)

@@ -0,0 +1,57 @@
+package nu.marginalia.crawl.retreival;
+
+import lombok.SneakyThrows;
+
+import static java.lang.Math.max;
+import static java.lang.Math.min;
+
+public class CrawlDelayTimer {
+
+    // When no crawl delay is specified, lean toward twice the fetch+process time, within these limits:
+    private static final long DEFAULT_CRAWL_DELAY_MIN_MS = Long.getLong("defaultCrawlDelay", 1000);
+    private static final long DEFAULT_CRAWL_DELAY_MAX_MS = Long.getLong("defaultCrawlDelaySlow", 2500);
+
+    /** Flag to indicate that the crawler should slow down, e.g. from 429s */
+    private boolean slowDown = false;
+
+    private final long delayTime;
+
+    public CrawlDelayTimer(long delayTime) {
+        this.delayTime = delayTime;
+    }
+
+    @SneakyThrows
+    public void delay(long spentTime) {
+        long sleepTime = delayTime;
+
+        if (sleepTime >= 1) {
+            if (spentTime > sleepTime)
+                return;
+
+            Thread.sleep(min(sleepTime - spentTime, 5000));
+        }
+        else if (slowDown) {
+            // Additional delay when the server is signalling it wants slower requests
+            Thread.sleep( DEFAULT_CRAWL_DELAY_MIN_MS);
+        }
+        else {
+            // When no crawl delay is specified, lean toward twice the fetch+process time,
+            // within sane limits. This means slower servers get slower crawling, and faster
+            // servers get faster crawling.
+
+            sleepTime = spentTime * 2;
+            sleepTime = min(sleepTime, DEFAULT_CRAWL_DELAY_MAX_MS);
+            sleepTime = max(sleepTime, DEFAULT_CRAWL_DELAY_MIN_MS);
+
+            if (spentTime > sleepTime)
+                return;
+
+            Thread.sleep(sleepTime - spentTime);
+        }
+    }
+
+    /** Increase the delay between requests if the server is signalling it wants slower requests with HTTP 429 */
+    public void slowDown() {
+        slowDown = true;
+    }
+}
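Note: CrawlDelayTimer folds the crawler's pacing rules behind a single delay(spentTime) call: honour an explicit robots.txt crawl delay (sleeping at most 5 seconds), fall back to the default minimum delay once a 429 has been seen, and otherwise sleep roughly twice the fetch-and-process time within the configured bounds. Below is a sketch of the calling pattern, mirroring how CrawlerRetreiver uses it later in this commit; the URL list and fetchAndWrite() are placeholders, and the constructor argument is assumed to be in milliseconds since that is how delay() treats it.

package nu.marginalia.crawl.retreival;

import java.util.List;

/** Hypothetical call-site sketch for CrawlDelayTimer; the real call site is
 *  CrawlerRetreiver.fetchWriteAndSleep in this same commit. */
class CrawlDelayTimerUsageSketch {
    public static void main(String[] args) throws InterruptedException {
        var timer = new CrawlDelayTimer(1000); // e.g. a one-second crawl delay, assumed to be milliseconds

        for (String url : List.of("https://example.com/a", "https://example.com/b")) {
            long startTime = System.currentTimeMillis();

            fetchAndWrite(url); // placeholder for fetch + compare + write

            // Sleep just long enough to honour the delay, net of the time already spent working
            timer.delay(System.currentTimeMillis() - startTime);
        }
    }

    private static void fetchAndWrite(String url) throws InterruptedException {
        Thread.sleep(250); // stand-in for real fetch and processing time
    }
}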
CrawlerRetreiver.java

@@ -11,7 +11,6 @@ import nu.marginalia.crawling.model.spec.CrawlingSpecification;
 import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.crawling.model.*;
 import nu.marginalia.ip_blocklist.UrlBlocklist;
-import nu.marginalia.lsh.EasyLSH;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import org.jsoup.Jsoup;
@@ -26,25 +25,12 @@ import java.time.LocalDateTime;
 import java.util.*;
 import java.util.function.Consumer;
 
-import static java.lang.Math.max;
-import static java.lang.Math.min;
-
 public class CrawlerRetreiver {
-    private static final long DEFAULT_CRAWL_DELAY_MIN_MS = Long.getLong("defaultCrawlDelay", 1000);
-    private static final long DEFAULT_CRAWL_DELAY_MAX_MS = Long.getLong("defaultCrawlDelaySlow", 2500);
-
     private static final int MAX_ERRORS = 20;
 
     private final HttpFetcher fetcher;
 
-
-    /** Flag to indicate that the crawler should slow down, e.g. from 429s */
-    private boolean slowDown = false;
-
-    /** Testing flag to disable crawl delay (otherwise crawler tests take several minutes) */
-    private boolean testFlagIgnoreDelay = false;
-
     private final String id;
     private final String domain;
     private final Consumer<SerializableCrawlData> crawledDomainWriter;
@@ -61,7 +47,12 @@ public class CrawlerRetreiver {
     private final DomainCrawlFrontier crawlFrontier;
 
     int errorCount = 0;
-    private String retainedTag = "RETAINED/304";
+
+    /** recrawlState tag for documents that had a HTTP status 304 */
+    private static final String documentWasRetainedTag = "RETAINED/304";
+
+    /** recrawlState tag for documents that had a 200 status but were identical to a previous version */
+    private static final String documentWasSameTag = "SAME-BY-COMPARISON";
 
     public CrawlerRetreiver(HttpFetcher fetcher,
                             CrawlingSpecification specs,
@@ -91,11 +82,6 @@ public class CrawlerRetreiver {
         }
     }
 
-    public CrawlerRetreiver withNoDelay() {
-        testFlagIgnoreDelay = true;
-        return this;
-    }
-
     public int fetch() {
         return fetch(new CrawlDataReference());
     }
@@ -146,13 +132,13 @@ public class CrawlerRetreiver {
 
         assert !crawlFrontier.isEmpty();
 
-        var robotsRules = fetcher.fetchRobotRules(crawlFrontier.peek().domain);
-        long crawlDelay = robotsRules.getCrawlDelay();
+        final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(crawlFrontier.peek().domain);
+        final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay());
 
-        sniffRootDocument();
+        sniffRootDocument(delayTimer);
 
         // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
-        int recrawled = recrawl(oldCrawlData, robotsRules, crawlDelay);
+        int recrawled = recrawl(oldCrawlData, robotsRules, delayTimer);
 
         if (recrawled > 0) {
             // If we have reference data, we will always grow the crawl depth a bit
@@ -195,7 +181,7 @@ public class CrawlerRetreiver {
                 continue;
 
 
-            if (fetchDocument(top, null, crawlDelay).isPresent()) {
+            if (fetchWriteAndSleep(top, delayTimer, DocumentWithReference.empty()).isPresent()) {
                 fetchedCount++;
             }
         }
@@ -207,9 +193,10 @@ public class CrawlerRetreiver {
         return fetchedCount;
     }
 
+    /** Performs a re-crawl of old documents, comparing etags and last-modified */
     private int recrawl(CrawlDataReference oldCrawlData,
                         SimpleRobotRules robotsRules,
-                        long crawlDelay) {
+                        CrawlDelayTimer delayTimer) {
         int recrawled = 0;
         int retained = 0;
 
@@ -247,8 +234,6 @@ public class CrawlerRetreiver {
                     && retained > 0.9 * recrawled
                     && Math.random() < 0.75)
             {
-                logger.info("Direct-loading {}", url);
-
                 // Since it looks like most of these documents haven't changed,
                 // we'll load the documents directly; but we do this in a random
                 // fashion to make sure we eventually catch changes over time
@@ -263,15 +248,13 @@ public class CrawlerRetreiver {
             // providing etag and last-modified headers, so we can recycle the
             // document if it hasn't changed without actually downloading it
 
-            var fetchedDocOpt = fetchDocument(url, doc, crawlDelay);
+            var fetchedDocOpt = fetchWriteAndSleep(url,
+                    delayTimer,
+                    new DocumentWithReference(doc, oldCrawlData));
             if (fetchedDocOpt.isEmpty()) continue;
 
-            if (Objects.equals(fetchedDocOpt.get().recrawlState, retainedTag)) {
-                retained ++;
-            }
-            else if (oldCrawlData.isContentSame(doc, fetchedDocOpt.get())) {
-                retained ++;
-            }
+            if (documentWasRetainedTag.equals(fetchedDocOpt.get().recrawlState)) retained ++;
+            else if (documentWasSameTag.equals(fetchedDocOpt.get().recrawlState)) retained ++;
 
             recrawled ++;
         }
@@ -279,18 +262,6 @@ public class CrawlerRetreiver {
         return recrawled;
     }
 
-    private static final HashFunction hasher = Hashing.murmur3_128(0);
-    private long hashDoc(CrawledDocument doc) {
-        var hash = new EasyLSH();
-        long val = 0;
-        for (var b : doc.documentBody.decode().getBytes()) {
-            val = val << 8 | (b & 0xFF);
-            hash.addUnordered(hasher.hashLong(val).asLong());
-        }
-        return hash.get();
-    }
-
-
     private void downloadSitemaps(SimpleRobotRules robotsRules) {
         List<String> sitemaps = robotsRules.getSitemaps();
         if (sitemaps.isEmpty()) {
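Note: the two retained-counting branches above feed the sampling heuristic visible at the top of this hunk group. Once the great majority of re-fetched documents come back unchanged, whether via a 304 or by body comparison, most of the remaining old documents are replayed directly instead of re-fetched, with a random factor left in so that real changes are still picked up over time. A condensed, illustrative sketch of that decision follows; the method name and the minimum-sample guard are assumptions, since the full surrounding condition is not visible in this hunk.

/** Condensed sketch of the recrawl sampling policy; illustrative only. */
class DirectLoadPolicySketch {
    /** retained = documents unchanged so far (304 or identical body),
     *  recrawled = documents actually re-fetched so far */
    static boolean shouldDirectLoad(int retained, int recrawled, int minSample) {
        return recrawled > minSample            // assumed guard: need a big enough sample first
            && retained > 0.9 * recrawled       // nearly everything has come back unchanged
            && Math.random() < 0.75;            // but still fall through to a real fetch about a quarter of the time
    }
}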
@@ -337,13 +308,13 @@ public class CrawlerRetreiver {
         logger.debug("Queue is now {}", crawlFrontier.queueSize());
     }
 
-    private void sniffRootDocument() {
+    private void sniffRootDocument(CrawlDelayTimer delayTimer) {
         try {
             logger.debug("Configuring link filter");
 
             var url = crawlFrontier.peek().withPathAndParam("/", null);
 
-            var maybeSample = fetchUrl(url, null).filter(sample -> sample.httpStatus == 200);
+            var maybeSample = fetchUrl(url, delayTimer, DocumentWithReference.empty()).filter(sample -> sample.httpStatus == 200);
             if (maybeSample.isEmpty())
                 return;
             var sample = maybeSample.get();
@@ -379,33 +350,41 @@ public class CrawlerRetreiver {
         }
     }
 
-    private Optional<CrawledDocument> fetchDocument(EdgeUrl top,
-                                                    @Nullable CrawledDocument reference,
-                                                    long crawlDelay) {
+    private Optional<CrawledDocument> fetchWriteAndSleep(EdgeUrl top,
+                                                         CrawlDelayTimer timer,
+                                                         DocumentWithReference reference) {
         logger.debug("Fetching {}", top);
 
         long startTime = System.currentTimeMillis();
 
-        var doc = fetchUrl(top, reference);
-        if (doc.isPresent()) {
-            var d = doc.get();
-            crawledDomainWriter.accept(d);
+        var docOpt = fetchUrl(top, timer, reference);
 
-            if (d.url != null) {
-                // We may have redirected to a different path
-                EdgeUrl.parse(d.url).ifPresent(crawlFrontier::addVisited);
+        if (docOpt.isPresent()) {
+            var doc = docOpt.get();
+
+            if (!Objects.equals(doc.recrawlState, documentWasRetainedTag)
+                    && reference.isContentBodySame(doc))
+            {
+                // The document didn't change since the last time
+                doc.recrawlState = documentWasSameTag;
             }
 
-            if ("ERROR".equals(d.crawlerStatus) && d.httpStatus != 404) {
+            crawledDomainWriter.accept(doc);
+
+            if (doc.url != null) {
+                // We may have redirected to a different path
+                EdgeUrl.parse(doc.url).ifPresent(crawlFrontier::addVisited);
+            }
+
+            if ("ERROR".equals(doc.crawlerStatus) && doc.httpStatus != 404) {
                 errorCount++;
             }
 
         }
 
-        long crawledTime = System.currentTimeMillis() - startTime;
-        delay(crawlDelay, crawledTime);
+        timer.delay(System.currentTimeMillis() - startTime);
 
-        return doc;
+        return docOpt;
     }
 
     private boolean isAllowedProtocol(String proto) {
@@ -413,35 +392,23 @@ public class CrawlerRetreiver {
                 || proto.equalsIgnoreCase("https");
     }
 
-    private Optional<CrawledDocument> fetchUrl(EdgeUrl top, @Nullable CrawledDocument reference) {
+    private Optional<CrawledDocument> fetchUrl(EdgeUrl top, CrawlDelayTimer timer, DocumentWithReference reference) {
         try {
-            var contentTags = getContentTags(reference);
-            var fetchedDoc = fetchContent(top, contentTags);
-            CrawledDocument doc;
-
-            // HTTP status 304 is NOT MODIFIED, which means the document is the same as it was when
-            // we fetched it last time. We can recycle the reference document.
-            if (reference != null
-                    && fetchedDoc.httpStatus == 304)
-            {
-                doc = reference;
-                doc.recrawlState = retainedTag;
-                doc.timestamp = LocalDateTime.now().toString();
-            }
-            else {
-                doc = fetchedDoc;
-            }
+            var contentTags = reference.getContentTags();
+            var fetchedDoc = tryDownload(top, timer, contentTags);
+
+            CrawledDocument doc = reference.replaceOn304(fetchedDoc);
 
             if (doc.documentBody != null) {
                 var decoded = doc.documentBody.decode();
 
                 doc.documentBodyHash = createHash(decoded);
 
-                Optional<Document> parsedDoc = parseDoc(decoded);
+                var parsedDoc = Jsoup.parse(decoded);
                 EdgeUrl url = new EdgeUrl(doc.url);
 
-                parsedDoc.ifPresent(parsed -> findLinks(url, parsed));
-                parsedDoc.flatMap(parsed -> findCanonicalUrl(url, parsed))
+                findLinks(url, parsedDoc);
+                findCanonicalUrl(url, parsedDoc)
                         .ifPresent(canonicalLink -> doc.canonicalUrl = canonicalLink.toString());
             }
 
@@ -455,33 +422,9 @@ public class CrawlerRetreiver {
 
     }
 
-    private ContentTags getContentTags(@Nullable CrawledDocument reference) {
-        if (null == reference)
-            return ContentTags.empty();
-
-        String headers = reference.headers;
-        if (headers == null)
-            return ContentTags.empty();
-
-        String[] headersLines = headers.split("\n");
-
-        String lastmod = null;
-        String etag = null;
-
-        for (String line : headersLines) {
-            if (line.toLowerCase().startsWith("etag:")) {
-                etag = line.substring(5).trim();
-            }
-            if (line.toLowerCase().startsWith("last-modified:")) {
-                lastmod = line.substring(14).trim();
-            }
-        }
-
-        return new ContentTags(etag, lastmod);
-    }
-
     @SneakyThrows
-    private CrawledDocument fetchContent(EdgeUrl top, ContentTags tags) {
+    private CrawledDocument tryDownload(EdgeUrl top, CrawlDelayTimer timer, ContentTags tags) {
         for (int i = 0; i < 2; i++) {
             try {
                 var doc = fetcher.fetchContent(top, tags);
@@ -489,7 +432,8 @@ public class CrawlerRetreiver {
                 return doc;
             }
             catch (RateLimitException ex) {
-                slowDown = true;
+                timer.slowDown();
+
                 int delay = ex.retryAfter();
                 if (delay > 0 && delay < 5000) {
                     Thread.sleep(delay);
@@ -504,10 +448,6 @@ public class CrawlerRetreiver {
         return hashMethod.hashUnencodedChars(documentBodyHash).toString();
     }
 
-    private Optional<Document> parseDoc(String decoded) {
-        return Optional.of(Jsoup.parse(decoded));
-    }
-
     private void findLinks(EdgeUrl baseUrl, Document parsed) {
         baseUrl = linkParser.getBaseLink(parsed, baseUrl);
 
@@ -547,36 +487,6 @@ public class CrawlerRetreiver {
         }
     }
 
-    @SneakyThrows
-    private void delay(long sleepTime, long spentTime) {
-        if (testFlagIgnoreDelay)
-            return;
-
-        if (sleepTime >= 1) {
-            if (spentTime > sleepTime)
-                return;
-
-            Thread.sleep(min(sleepTime - spentTime, 5000));
-        }
-        else if (slowDown) {
-            Thread.sleep( 1000);
-        }
-        else {
-            // When no crawl delay is specified, lean toward twice the fetch+process time,
-            // within sane limits. This means slower servers get slower crawling, and faster
-            // servers get faster crawling.
-
-            sleepTime = spentTime * 2;
-            sleepTime = min(sleepTime, DEFAULT_CRAWL_DELAY_MAX_MS);
-            sleepTime = max(sleepTime, DEFAULT_CRAWL_DELAY_MIN_MS);
-
-            if (spentTime > sleepTime)
-                return;
-
-            Thread.sleep(sleepTime - spentTime);
-        }
-    }
-
     private CrawledDocument createRobotsError(EdgeUrl url) {
         return CrawledDocument.builder()
                 .url(url.toString())
@@ -594,12 +504,71 @@ public class CrawlerRetreiver {
                 .build();
     }
 
-    enum CrawlDataComparison {
-        NO_OLD_DATA,
-        SMALL_SAMPLE,
-        CHANGES_FOUND,
-        NO_CHANGES
-    };
+    private record DocumentWithReference(
+            @Nullable CrawledDocument doc,
+            @Nullable CrawlDataReference reference) {
+
+        private static final DocumentWithReference emptyInstance = new DocumentWithReference(null, null);
+        public static DocumentWithReference empty() {
+            return emptyInstance;
+        }
+
+        public boolean isContentBodySame(CrawledDocument newDoc) {
+            if (reference == null)
+                return false;
+            if (doc == null)
+                return false;
+
+            return reference.isContentBodySame(doc, newDoc);
+        }
+
+        private ContentTags getContentTags() {
+            if (null == doc)
+                return ContentTags.empty();
+
+            String headers = doc.headers;
+            if (headers == null)
+                return ContentTags.empty();
+
+            String[] headersLines = headers.split("\n");
+
+            String lastmod = null;
+            String etag = null;
+
+            for (String line : headersLines) {
+                if (line.toLowerCase().startsWith("etag:")) {
+                    etag = line.substring(5).trim();
+                }
+                if (line.toLowerCase().startsWith("last-modified:")) {
+                    lastmod = line.substring(14).trim();
+                }
+            }
+
+            return new ContentTags(etag, lastmod);
+        }
+
+        public boolean isEmpty() {
+            return doc == null || reference == null;
+        }
+
+        /** If the provided document has HTTP status 304, and the reference document is provided,
+         *  return the reference document; otherwise return the provided document.
+         */
+        public CrawledDocument replaceOn304(CrawledDocument fetchedDoc) {
+
+            if (doc == null)
+                return fetchedDoc;
+
+            // HTTP status 304 is NOT MODIFIED, which means the document is the same as it was when
+            // we fetched it last time. We can recycle the reference document.
+            if (fetchedDoc.httpStatus != 304)
+                return fetchedDoc;
+
+            var ret = doc;
+            ret.recrawlState = documentWasRetainedTag;
+            ret.timestamp = LocalDateTime.now().toString();
+            return ret;
+        }
+    }
+
 }
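Note: DocumentWithReference ties the recrawl path together. getContentTags() lifts the stored ETag and Last-Modified headers from the previous crawl so tryDownload can issue a conditional request, and replaceOn304() swaps a body-less 304 response back for the cached document, tagged RETAINED/304. For reference, this is the plain HTTP mechanism involved; the sketch below uses java.net.http rather than the project's HttpFetcher, and the header values are made up.

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

/** Standalone illustration of the conditional GET that the etag/last-modified
 *  recycling above relies on; not the project's HttpFetcher. */
class ConditionalGetSketch {
    public static void main(String[] args) throws Exception {
        var client = HttpClient.newHttpClient();

        // Values that would come from the previous crawl's stored headers
        String etag = "\"abc123\"";
        String lastModified = "Sat, 01 Jul 2023 00:00:00 GMT";

        var request = HttpRequest.newBuilder(URI.create("https://www.marginalia.nu/"))
                .header("If-None-Match", etag)
                .header("If-Modified-Since", lastModified)
                .GET()
                .build();

        HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());

        if (response.statusCode() == 304) {
            // Not Modified: no body was sent, so the previously stored document can be reused
            System.out.println("Unchanged since last crawl; recycling the cached body");
        } else {
            System.out.println("Fetched " + response.body().length() + " chars, status " + response.statusCode());
        }
    }
}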
DomainCrawlFrontier.java

@@ -80,7 +80,7 @@ public class DomainCrawlFrontier {
             return;
 
         // reduce memory usage by not growing queue huge when crawling large sites
-        if (queue.size() + visited.size() >= depth + 100)
+        if (queue.size() + visited.size() >= depth + 1000)
             return;
 
         if (visited.contains(url.toString()))
CrawlerMockFetcherTest.java

@@ -71,7 +71,6 @@ public class CrawlerMockFetcherTest {
         registerUrlClasspathData(new EdgeUrl("https://startrek.website/post/108995"), "mock-crawl-data/lemmy/108995.html");
 
         new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 10, "startrek.website", new ArrayList<>()), out::add)
-                .withNoDelay()
                 .fetch();
 
         out.forEach(System.out::println);
@@ -84,7 +83,6 @@ public class CrawlerMockFetcherTest {
         registerUrlClasspathData(new EdgeUrl("https://en.wikipedia.org/"), "mock-crawl-data/mediawiki/index.html");
 
         new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 10, "en.wikipedia.org", new ArrayList<>()), out::add)
-                .withNoDelay()
                 .fetch();
 
         out.forEach(System.out::println);
@@ -99,7 +97,6 @@ public class CrawlerMockFetcherTest {
         registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/combined-mode-but-grid/4489"), "mock-crawl-data/discourse/grid.html");
 
         new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 100, "community.tt-rss.org", new ArrayList<>()), out::add)
-                .withNoDelay()
                 .fetch();
 
         out.forEach(System.out::println);
CrawlerRetreiverTest.java

@@ -118,13 +118,16 @@ class CrawlerRetreiverTest {
 
 
         Path out = Files.createTempDirectory("crawling-process");
-        var writer = new CrawledDomainWriter(out, "www.marginalia.nu", "123456");
+        var writer = new CrawledDomainWriter(out, specs.domain, specs.id);
         Map<Class<? extends SerializableCrawlData>, List<SerializableCrawlData>> data = new HashMap<>();
 
         new CrawlerRetreiver(httpFetcher, specs, d -> {
             data.computeIfAbsent(d.getClass(), k->new ArrayList<>()).add(d);
             if (d instanceof CrawledDocument doc) {
                 System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus);
+                if (Math.random() > 0.5) {
+                    doc.headers = "";
+                }
             }
             writer.accept(d);
         }).fetch();