From 4342e427220d884d73a7e434b75097ac6da04a0b Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 17 Jan 2025 13:02:57 +0100 Subject: [PATCH] (crawler) Fast detection and bail-out for crawler traps Nepenthes has been doing the rounds in social media; this adds an easy detection and mitigation mechanism for this type of trap, as sadly not all webmasters set up their robots.txt correctly. Out of the box crawl limits will also deal with this type of attack, but this fix is faster. --- .../crawl/fetcher/warc/WarcRecorder.java | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcRecorder.java b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcRecorder.java index 06ba3719..b1f3c6b0 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcRecorder.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcRecorder.java @@ -22,6 +22,7 @@ import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.security.NoSuchAlgorithmException; +import java.time.Duration; import java.time.Instant; import java.util.*; @@ -167,6 +168,19 @@ public class WarcRecorder implements AutoCloseable { warcRequest.http(); // force HTTP header to be parsed before body is consumed so that caller can use it writer.write(warcRequest); + if (Duration.between(date, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0 + && inputBuffer.size() < 2048) + { + // Fast detection and mitigation of crawler traps that respond with slow + // small responses, with a high branching factor + + // Note we bail *after* writing the warc records, this will effectively only + // prevent link extraction from the document. 
+ + logger.warn("URL {} took too long to fetch and was too small for the effort", requestUri); + return new HttpFetchResult.ResultException(new IOException("Likely crawler trap")); + } + return new HttpFetchResult.ResultOk(responseUri, response.code(), inputBuffer.headers(),