(crawler) Fast detection and bail-out for crawler traps
Nepenthes has been doing the rounds on social media; this adds an easy detection and mitigation mechanism for this type of trap, as sadly not all webmasters set up their robots.txt correctly. Out-of-the-box crawl limits will also deal with this type of attack, but this fix is faster.
parent bc818056e6
commit 4342e42722
@@ -22,6 +22,7 @@ import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.security.NoSuchAlgorithmException;
+import java.time.Duration;
 import java.time.Instant;
 import java.util.*;
 
@@ -167,6 +168,19 @@ public class WarcRecorder implements AutoCloseable {
         warcRequest.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
         writer.write(warcRequest);
 
+        if (Duration.between(date, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
+                && inputBuffer.size() < 2048)
+        {
+            // Fast detection and mitigation of crawler traps that respond with slow
+            // small responses, with a high branching factor
+
+            // Note we bail *after* writing the warc records, this will effectively only
+            // prevent link extraction from the document.
+
+            logger.warn("URL {} took too long to fetch and was too small for the effort", requestUri);
+            return new HttpFetchResult.ResultException(new IOException("Likely crawler trap"));
+        }
+
         return new HttpFetchResult.ResultOk(responseUri,
                 response.code(),
                 inputBuffer.headers(),