mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
(crawler) Emulate if-modified-since for domains that don't support the header
This will help reduce the strain on some server software, in particular Discourse.
This commit is contained in:
parent
f430a084e8
commit
e1c9313396
@ -42,6 +42,7 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
|
|
||||||
private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
|
private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
|
||||||
private final ContentTypeProber contentTypeProber;
|
private final ContentTypeProber contentTypeProber;
|
||||||
|
private final SoftIfModifiedSinceProber softIfModifiedSinceProber;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void setAllowAllContentTypes(boolean allowAllContentTypes) {
|
public void setAllowAllContentTypes(boolean allowAllContentTypes) {
|
||||||
@ -93,6 +94,7 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
this.userAgentString = userAgent.uaString();
|
this.userAgentString = userAgent.uaString();
|
||||||
this.userAgentIdentifier = userAgent.uaIdentifier();
|
this.userAgentIdentifier = userAgent.uaIdentifier();
|
||||||
this.contentTypeProber = new ContentTypeProber(userAgentString, client);
|
this.contentTypeProber = new ContentTypeProber(userAgentString, client);
|
||||||
|
this.softIfModifiedSinceProber = new SoftIfModifiedSinceProber(userAgentString, client);
|
||||||
}
|
}
|
||||||
|
|
||||||
public HttpFetcherImpl(String userAgent) {
|
public HttpFetcherImpl(String userAgent) {
|
||||||
@ -100,6 +102,7 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
this.userAgentString = userAgent;
|
this.userAgentString = userAgent;
|
||||||
this.userAgentIdentifier = userAgent;
|
this.userAgentIdentifier = userAgent;
|
||||||
this.contentTypeProber = new ContentTypeProber(userAgent, client);
|
this.contentTypeProber = new ContentTypeProber(userAgent, client);
|
||||||
|
this.softIfModifiedSinceProber = new SoftIfModifiedSinceProber(userAgent, client);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -166,6 +169,13 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
return new HttpFetchResult.ResultNone();
|
return new HttpFetchResult.ResultNone();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
else {
|
||||||
|
// Possibly do a soft probe to see if the URL has been modified since the last time we crawled it
|
||||||
|
// if we have reason to suspect ETags are not supported by the server.
|
||||||
|
if (softIfModifiedSinceProber.probeModificationTime(url, contentTags)) {
|
||||||
|
return new HttpFetchResult.Result304Raw();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
var getBuilder = new Request.Builder().get();
|
var getBuilder = new Request.Builder().get();
|
||||||
|
|
||||||
|
@ -0,0 +1,49 @@
|
|||||||
|
package nu.marginalia.crawl.retreival.fetcher;
|
||||||
|
|
||||||
|
import com.google.common.base.Strings;
|
||||||
|
import nu.marginalia.model.EdgeUrl;
|
||||||
|
import okhttp3.OkHttpClient;
|
||||||
|
import okhttp3.Request;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Objects;
|
||||||
|
|
||||||
|
public class SoftIfModifiedSinceProber {
|
||||||
|
|
||||||
|
private final String userAgentString;
|
||||||
|
private final OkHttpClient client;
|
||||||
|
|
||||||
|
public SoftIfModifiedSinceProber(String userAgentString, OkHttpClient httpClient) {
|
||||||
|
this.userAgentString = userAgentString;
|
||||||
|
this.client = httpClient;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Implement a soft probe of the last modified time of the given URL with a HEAD request.
|
||||||
|
* This is used to detect if the URL has been modified since the last time we crawled it.
|
||||||
|
*/
|
||||||
|
public boolean probeModificationTime(EdgeUrl url, ContentTags tags) throws IOException {
|
||||||
|
var headBuilder = new Request.Builder().head()
|
||||||
|
.addHeader("User-agent", userAgentString)
|
||||||
|
.addHeader("Accept-Encoding", "gzip")
|
||||||
|
.url(url.toString());
|
||||||
|
|
||||||
|
// This logic is only applicable if we only have a last-modified time, but no ETag.
|
||||||
|
if (Strings.isNullOrEmpty(tags.lastMod()))
|
||||||
|
return false;
|
||||||
|
if (!Strings.isNullOrEmpty(tags.etag()))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
var head = headBuilder.build();
|
||||||
|
var call = client.newCall(head);
|
||||||
|
|
||||||
|
try (var rsp = call.execute()) {
|
||||||
|
if (rsp.code() != 200) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
var contentTypeHeader = rsp.header("Last-Modified");
|
||||||
|
return Objects.equals(contentTypeHeader, tags.lastMod());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user