mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
(crawler) Correct content type probing to only run on URLs that are suspected to be binary
This commit is contained in:
parent
69ad6287b1
commit
e4a41f7dd1
@ -139,7 +139,7 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
public ContentTypeProbeResult probeContentType(EdgeUrl url,
|
public ContentTypeProbeResult probeContentType(EdgeUrl url,
|
||||||
WarcRecorder warcRecorder,
|
WarcRecorder warcRecorder,
|
||||||
ContentTags tags) throws RateLimitException {
|
ContentTags tags) throws RateLimitException {
|
||||||
if (tags.isEmpty()) {
|
if (tags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url)) {
|
||||||
var headBuilder = new Request.Builder().head()
|
var headBuilder = new Request.Builder().head()
|
||||||
.addHeader("User-agent", userAgentString)
|
.addHeader("User-agent", userAgentString)
|
||||||
.addHeader("Accept-Encoding", "gzip")
|
.addHeader("Accept-Encoding", "gzip")
|
||||||
|
Loading…
Reference in New Issue
Block a user