(crawler) Fast detection and bail-out for crawler traps

Improve logging and exclude robots.txt from this logic.
Viktor Lofgren 2025-01-18 15:28:54 +01:00
parent 4342e42722
commit 567e4e1237


@@ -90,6 +90,7 @@ public class WarcRecorder implements AutoCloseable {
         var call = client.newCall(request);
         cookieInformation.update(client, request.url());
         try (var response = call.execute();
@@ -169,7 +170,8 @@ public class WarcRecorder implements AutoCloseable {
             writer.write(warcRequest);
             if (Duration.between(date, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
-                    && inputBuffer.size() < 2048)
+                    && inputBuffer.size() < 2048
+                    && !request.url().encodedPath().endsWith("robots.txt")) // don't bail on robots.txt
             {
                 // Fast detection and mitigation of crawler traps that respond with slow
                 // small responses, with a high branching factor
@@ -177,7 +179,12 @@ public class WarcRecorder implements AutoCloseable {
                 // Note we bail *after* writing the warc records, this will effectively only
                 // prevent link extraction from the document.
-                logger.warn("URL {} took too long to fetch and was too small for the effort", requestUri);
+                logger.warn("URL {} took too long to fetch ({}s) and was too small for the effort ({}b)",
+                        requestUri,
+                        Duration.between(date, Instant.now()).getSeconds(),
+                        inputBuffer.size()
+                );
                 return new HttpFetchResult.ResultException(new IOException("Likely crawler trap"));
             }
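
For context outside the diff, here is a minimal, self-contained sketch of the heuristic this commit adjusts: a response is treated as a likely crawler trap when the fetch was both slow (more than about nine seconds) and small (fewer than 2048 buffered bytes), unless the request targets robots.txt. The class and method names below (TrapHeuristic, shouldBailOut) are hypothetical illustrations rather than part of the WarcRecorder code; the thresholds are copied from the condition shown in the diff.

import java.time.Duration;
import java.time.Instant;

// Hypothetical standalone illustration of the bail-out heuristic above:
// a fetch is flagged as a likely crawler trap when it was both slow and
// small, unless the URL is a robots.txt file, which is legitimately
// small and must never be skipped.
class TrapHeuristic {
    private static final Duration SLOW_THRESHOLD = Duration.ofSeconds(9);
    private static final int SMALL_THRESHOLD_BYTES = 2048;

    static boolean shouldBailOut(Instant fetchStart, int bufferedBytes, String encodedPath) {
        boolean slow = Duration.between(fetchStart, Instant.now()).compareTo(SLOW_THRESHOLD) > 0;
        boolean small = bufferedBytes < SMALL_THRESHOLD_BYTES;
        boolean isRobotsTxt = encodedPath.endsWith("robots.txt");
        return slow && small && !isRobotsTxt;
    }

    public static void main(String[] args) {
        // A fetch that started 12 seconds ago and yielded only 500 bytes is
        // flagged, unless it is robots.txt.
        Instant start = Instant.now().minusSeconds(12);
        System.out.println(shouldBailOut(start, 500, "/some/deep/page")); // true
        System.out.println(shouldBailOut(start, 500, "/robots.txt"));     // false
    }
}

As the in-code comment in the diff notes, the real implementation only bails after the WARC records have been written, so the response itself is still recorded and only link extraction from the slow, small document is skipped.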