(crawler) Fast detection and bail-out for crawler traps
Nepenthes has been doing the rounds on social media; this adds an easy detection and mitigation mechanism for this type of trap, as sadly not all webmasters set up their robots.txt correctly. Out-of-the-box crawl limits will also deal with this type of attack, but this fix is faster.
parent bc818056e6
commit 4342e42722
@@ -22,6 +22,7 @@ import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.security.NoSuchAlgorithmException;
+import java.time.Duration;
 import java.time.Instant;
 import java.util.*;
 
@@ -167,6 +168,19 @@ public class WarcRecorder implements AutoCloseable {
         warcRequest.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
         writer.write(warcRequest);
 
+        if (Duration.between(date, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
+                && inputBuffer.size() < 2048)
+        {
+            // Fast detection and mitigation of crawler traps that respond with slow
+            // small responses, with a high branching factor
+
+            // Note we bail *after* writing the warc records, this will effectively only
+            // prevent link extraction from the document.
+
+            logger.warn("URL {} took too long to fetch and was too small for the effort", requestUri);
+            return new HttpFetchResult.ResultException(new IOException("Likely crawler trap"));
+        }
+
         return new HttpFetchResult.ResultOk(responseUri,
                 response.code(),
                 inputBuffer.headers(),