(crawler) Correct content type probing to only run on URLs that are suspected to be binary

2025-02-23 13:09:00 +00:00 · 2024-12-26 14:12:09 +01:00 · 2024-12-26 14:12:09 +01:00 · e4a41f7dd1
commit e4a41f7dd1
parent 69ad6287b1
1 changed file with 1 addition and 1 deletion
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java
@@ -139,7 +139,7 @@ public class HttpFetcherImpl implements HttpFetcher {
    public ContentTypeProbeResult probeContentType(EdgeUrl url,
                                                   WarcRecorder warcRecorder,
                                                   ContentTags tags) throws RateLimitException {
-        if (tags.isEmpty()) {
+        if (tags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url)) {
            var headBuilder = new Request.Builder().head()
                    .addHeader("User-agent", userAgentString)
                    .addHeader("Accept-Encoding", "gzip")