(crawler) Emulate if-modified-since for domains that don't support the header

This will help reduce the strain on some server software, in particular Discourse.
This commit is contained in:
Viktor Lofgren 2024-04-22 17:26:31 +02:00
parent f430a084e8
commit e1c9313396
2 changed files with 59 additions and 0 deletions

View File

@ -42,6 +42,7 @@ public class HttpFetcherImpl implements HttpFetcher {
private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic(); private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
private final ContentTypeProber contentTypeProber; private final ContentTypeProber contentTypeProber;
private final SoftIfModifiedSinceProber softIfModifiedSinceProber;
@Override @Override
public void setAllowAllContentTypes(boolean allowAllContentTypes) { public void setAllowAllContentTypes(boolean allowAllContentTypes) {
@ -93,6 +94,7 @@ public class HttpFetcherImpl implements HttpFetcher {
this.userAgentString = userAgent.uaString(); this.userAgentString = userAgent.uaString();
this.userAgentIdentifier = userAgent.uaIdentifier(); this.userAgentIdentifier = userAgent.uaIdentifier();
this.contentTypeProber = new ContentTypeProber(userAgentString, client); this.contentTypeProber = new ContentTypeProber(userAgentString, client);
this.softIfModifiedSinceProber = new SoftIfModifiedSinceProber(userAgentString, client);
} }
public HttpFetcherImpl(String userAgent) { public HttpFetcherImpl(String userAgent) {
@ -100,6 +102,7 @@ public class HttpFetcherImpl implements HttpFetcher {
this.userAgentString = userAgent; this.userAgentString = userAgent;
this.userAgentIdentifier = userAgent; this.userAgentIdentifier = userAgent;
this.contentTypeProber = new ContentTypeProber(userAgent, client); this.contentTypeProber = new ContentTypeProber(userAgent, client);
this.softIfModifiedSinceProber = new SoftIfModifiedSinceProber(userAgent, client);
} }
/** /**
@ -166,6 +169,13 @@ public class HttpFetcherImpl implements HttpFetcher {
return new HttpFetchResult.ResultNone(); return new HttpFetchResult.ResultNone();
} }
} }
else {
// Possibly do a soft probe to see if the URL has been modified since the last time we crawled it
// if we have reason to suspect ETags are not supported by the server.
if (softIfModifiedSinceProber.probeModificationTime(url, contentTags)) {
return new HttpFetchResult.Result304Raw();
}
}
var getBuilder = new Request.Builder().get(); var getBuilder = new Request.Builder().get();

View File

@ -0,0 +1,49 @@
package nu.marginalia.crawl.retreival.fetcher;
import com.google.common.base.Strings;
import nu.marginalia.model.EdgeUrl;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import java.io.IOException;
import java.util.Objects;
/**
 * Probes whether a previously crawled URL still reports the same {@code Last-Modified}
 * value, using a HEAD request instead of a conditional GET.
 *
 * <p>The probe is only attempted when a last-modified value is known from an earlier
 * fetch and no ETag is available; in every other case it reports "not confirmed
 * unchanged" without touching the network.
 */
public class SoftIfModifiedSinceProber {

    private final String userAgentString;
    private final OkHttpClient client;

    /**
     * @param userAgentString value sent in the {@code User-agent} header of each probe
     * @param httpClient      client used to execute the HEAD requests
     */
    public SoftIfModifiedSinceProber(String userAgentString, OkHttpClient httpClient) {
        this.userAgentString = userAgentString;
        this.client = httpClient;
    }

    /**
     * Implement a soft probe of the last modified time of the given URL with a HEAD request.
     * This is used to detect if the URL has been modified since the last time we crawled it.
     *
     * @param url  the URL to probe
     * @param tags the content tags recorded for the URL; the probe only runs when
     *             {@code lastMod()} is non-empty and {@code etag()} is empty
     * @return {@code true} if the server answered 200 and its {@code Last-Modified} header
     *         is exactly equal to {@code tags.lastMod()}; {@code false} if the probe is not
     *         applicable, the status is not 200, or the header is absent or different
     * @throws IOException if executing the HEAD request fails
     */
    public boolean probeModificationTime(EdgeUrl url, ContentTags tags) throws IOException {
        // This logic is only applicable if we only have a last-modified time, but no ETag.
        // Check this before building the request so that an inapplicable probe does no work
        // (and cannot fail on URL parsing).
        if (Strings.isNullOrEmpty(tags.lastMod()) || !Strings.isNullOrEmpty(tags.etag())) {
            return false;
        }

        var headRequest = new Request.Builder().head()
                .addHeader("User-agent", userAgentString)
                .addHeader("Accept-Encoding", "gzip")
                .url(url.toString())
                .build();

        // The response is closed by try-with-resources on every path.
        try (var rsp = client.newCall(headRequest).execute()) {
            if (rsp.code() != 200) {
                return false;
            }

            // Exact string comparison: any difference in the header text counts as modified.
            var lastModifiedHeader = rsp.header("Last-Modified");
            return Objects.equals(lastModifiedHeader, tags.lastMod());
        }
    }
}