From e4a41f7dd179f21bfa93e69094728a59bf9fe4c8 Mon Sep 17 00:00:00 2001
From: Viktor Lofgren <vlofgren@marginalia.nu>
Date: Thu, 26 Dec 2024 14:12:09 +0100
Subject: [PATCH] (crawler) Correct content type probing to only run on URLs
 that are suspected to be binary

---
 .../java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java
index 62560d83..5ac9cf21 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java
@@ -139,7 +139,7 @@ public class HttpFetcherImpl implements HttpFetcher {
     public ContentTypeProbeResult probeContentType(EdgeUrl url,
                                                    WarcRecorder warcRecorder,
                                                    ContentTags tags) throws RateLimitException {
-        if (tags.isEmpty()) {
+        if (tags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url)) {
             var headBuilder = new Request.Builder().head()
                     .addHeader("User-agent", userAgentString)
                     .addHeader("Accept-Encoding", "gzip")