(crawler) Be more lenient when performing a domain probe

Viktor Lofgren 2025-01-28 15:24:30 +01:00
parent 5354e034bf
commit eee73ab16c


@@ -45,6 +45,7 @@ public class HttpFetcherImpl implements HttpFetcher {
private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
private final Duration requestTimeout = Duration.ofSeconds(10);
private final Duration probeTimeout = Duration.ofSeconds(30);
@Override
public void setAllowAllContentTypes(boolean allowAllContentTypes) {
@@ -107,12 +108,13 @@ public class HttpFetcherImpl implements HttpFetcher {
.HEAD()
.uri(url.asURI())
.header("User-agent", userAgentString)
.timeout(requestTimeout)
.timeout(probeTimeout)
.build();
} catch (URISyntaxException e) {
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Invalid URL");
}
for (int tries = 0;; tries++) {
try {
var rsp = client.send(head, HttpResponse.BodyHandlers.discarding());
EdgeUrl rspUri = new EdgeUrl(rsp.uri());
@@ -121,10 +123,13 @@ public class HttpFetcherImpl implements HttpFetcher {
return new DomainProbeResult.Redirect(rspUri.domain);
}
return new DomainProbeResult.Ok(rspUri);
}
catch (Exception ex) {
} catch (Exception ex) {
if (tries > 3) {
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, ex.getMessage());
}
// else try again ...
}
}
}
/** Perform a HEAD request to fetch the content type of a URL.
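
Read together, the hunks make two changes to the domain probe: the HEAD request now uses a dedicated 30-second probeTimeout instead of the 10-second requestTimeout, and the probe is wrapped in a retry loop that only gives up after several consecutive failures. Below is a minimal sketch of how the probe reads after the commit, assembled from the hunks above; the method signature, the redirect check, and anything outside the shown hunks are assumptions, not the actual file contents.

    public DomainProbeResult probeDomain(EdgeUrl url) {
        HttpRequest head;
        try {
            head = HttpRequest.newBuilder()
                    .HEAD()
                    .uri(url.asURI())
                    .header("User-agent", userAgentString)
                    .timeout(probeTimeout) // 30 s probe timeout instead of the 10 s request timeout
                    .build();
        } catch (URISyntaxException e) {
            return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Invalid URL");
        }

        for (int tries = 0;; tries++) {
            try {
                var rsp = client.send(head, HttpResponse.BodyHandlers.discarding());
                EdgeUrl rspUri = new EdgeUrl(rsp.uri());

                // Redirect check assumed from the surrounding context (not shown in the hunks):
                // a probe that lands on a different domain is reported as a redirect.
                if (!Objects.equals(rspUri.domain, url.domain)) {
                    return new DomainProbeResult.Redirect(rspUri.domain);
                }
                return new DomainProbeResult.Ok(rspUri);
            } catch (Exception ex) {
                if (tries > 3) { // give up only after several failed attempts
                    return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, ex.getMessage());
                }
                // else try again ...
            }
        }
    }

The net effect is the leniency the commit message describes: a slow or flaky server gets a longer window per attempt and a handful of attempts before the domain is written off as an error.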