From e4a41f7dd179f21bfa93e69094728a59bf9fe4c8 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 26 Dec 2024 14:12:09 +0100 Subject: [PATCH] (crawler) Correct content type probing to only run on URLs that are suspected to be binary --- .../java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java index 62560d83..5ac9cf21 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java @@ -139,7 +139,7 @@ public class HttpFetcherImpl implements HttpFetcher { public ContentTypeProbeResult probeContentType(EdgeUrl url, WarcRecorder warcRecorder, ContentTags tags) throws RateLimitException { - if (tags.isEmpty()) { + if (tags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url)) { var headBuilder = new Request.Builder().head() .addHeader("User-agent", userAgentString) .addHeader("Accept-Encoding", "gzip")