(crawler) Correct content type probing to only run on URLs that are suspected to be binary

This commit is contained in:
Viktor Lofgren 2024-12-26 14:12:09 +01:00
parent 69ad6287b1
commit e4a41f7dd1

View File

@@ -139,7 +139,7 @@ public class HttpFetcherImpl implements HttpFetcher {
     public ContentTypeProbeResult probeContentType(EdgeUrl url,
                                                    WarcRecorder warcRecorder,
                                                    ContentTags tags) throws RateLimitException {
-        if (tags.isEmpty()) {
+        if (tags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url)) {
             var headBuilder = new Request.Builder().head()
                     .addHeader("User-agent", userAgentString)
                     .addHeader("Accept-Encoding", "gzip")