From e1c93133968e667063b4d6054fd567f80fdebb67 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 22 Apr 2024 17:26:31 +0200 Subject: [PATCH] (crawler) Emulate if-modified-since for domains that don't support the header This will help reduce the strain on some server software, in particular Discourse. --- .../retreival/fetcher/HttpFetcherImpl.java | 10 ++++ .../fetcher/SoftIfModifiedSinceProber.java | 49 +++++++++++++++++++ 2 files changed, 59 insertions(+) create mode 100644 code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/SoftIfModifiedSinceProber.java diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java index 77dc6463..da7ddd3e 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java @@ -42,6 +42,7 @@ public class HttpFetcherImpl implements HttpFetcher { private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic(); private final ContentTypeProber contentTypeProber; + private final SoftIfModifiedSinceProber softIfModifiedSinceProber; @Override public void setAllowAllContentTypes(boolean allowAllContentTypes) { @@ -93,6 +94,7 @@ public class HttpFetcherImpl implements HttpFetcher { this.userAgentString = userAgent.uaString(); this.userAgentIdentifier = userAgent.uaIdentifier(); this.contentTypeProber = new ContentTypeProber(userAgentString, client); + this.softIfModifiedSinceProber = new SoftIfModifiedSinceProber(userAgentString, client); } public HttpFetcherImpl(String userAgent) { @@ -100,6 +102,7 @@ public class HttpFetcherImpl implements HttpFetcher { this.userAgentString = userAgent; this.userAgentIdentifier = userAgent; this.contentTypeProber = new ContentTypeProber(userAgent, client); + 
this.softIfModifiedSinceProber = new SoftIfModifiedSinceProber(userAgent, client); } /** @@ -166,6 +169,13 @@ public class HttpFetcherImpl implements HttpFetcher { return new HttpFetchResult.ResultNone(); } } + else { + // Possibly do a soft probe to see if the URL has been modified since the last time we crawled it + // if we have reason to suspect ETags are not supported by the server. + if (softIfModifiedSinceProber.probeModificationTime(url, contentTags)) { + return new HttpFetchResult.Result304Raw(); + } + } var getBuilder = new Request.Builder().get(); diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/SoftIfModifiedSinceProber.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/SoftIfModifiedSinceProber.java new file mode 100644 index 00000000..238e8944 --- /dev/null +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/SoftIfModifiedSinceProber.java @@ -0,0 +1,49 @@ +package nu.marginalia.crawl.retreival.fetcher; + +import com.google.common.base.Strings; +import nu.marginalia.model.EdgeUrl; +import okhttp3.OkHttpClient; +import okhttp3.Request; + +import java.io.IOException; +import java.util.Objects; + +public class SoftIfModifiedSinceProber { + + private final String userAgentString; + private final OkHttpClient client; + + public SoftIfModifiedSinceProber(String userAgentString, OkHttpClient httpClient) { + this.userAgentString = userAgentString; + this.client = httpClient; + } + + /** Implement a soft probe of the last modified time of the given URL with a HEAD request. + * This is used to detect if the URL has been modified since the last time we crawled it. 
+     *
+     * @return true only if the HEAD response is a 200 whose Last-Modified header equals {@code tags.lastMod()}
+     * @throws IOException if executing the HEAD request fails
+     */
+    public boolean probeModificationTime(EdgeUrl url, ContentTags tags) throws IOException {
+        // Only applicable when we have a last-modified time but no ETag; checked before building the request.
+        if (Strings.isNullOrEmpty(tags.lastMod()) || !Strings.isNullOrEmpty(tags.etag())) {
+            return false;
+        }
+
+        var head = new Request.Builder().head()
+                .addHeader("User-agent", userAgentString)
+                .addHeader("Accept-Encoding", "gzip")
+                .url(url.toString())
+                .build();
+
+        try (var rsp = client.newCall(head).execute()) {
+            if (rsp.code() != 200) {
+                return false;
+            }
+
+            // A missing header yields null, which never equals the (non-empty) stored value.
+            var lastModifiedHeader = rsp.header("Last-Modified");
+            return Objects.equals(lastModifiedHeader, tags.lastMod());
+        }
+    }
+}