(crawler) Be more lenient when performing a domain probe

This commit is contained in:
Viktor Lofgren 2025-01-28 15:24:30 +01:00
parent 5354e034bf
commit eee73ab16c

View File

@@ -45,6 +45,7 @@ public class HttpFetcherImpl implements HttpFetcher {
private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic(); private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
private final Duration requestTimeout = Duration.ofSeconds(10); private final Duration requestTimeout = Duration.ofSeconds(10);
private final Duration probeTimeout = Duration.ofSeconds(30);
@Override @Override
public void setAllowAllContentTypes(boolean allowAllContentTypes) { public void setAllowAllContentTypes(boolean allowAllContentTypes) {
@@ -107,12 +108,13 @@ public class HttpFetcherImpl implements HttpFetcher {
.HEAD() .HEAD()
.uri(url.asURI()) .uri(url.asURI())
.header("User-agent", userAgentString) .header("User-agent", userAgentString)
.timeout(requestTimeout) .timeout(probeTimeout)
.build(); .build();
} catch (URISyntaxException e) { } catch (URISyntaxException e) {
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Invalid URL"); return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Invalid URL");
} }
for (int tries = 0;; tries++) {
try { try {
var rsp = client.send(head, HttpResponse.BodyHandlers.discarding()); var rsp = client.send(head, HttpResponse.BodyHandlers.discarding());
EdgeUrl rspUri = new EdgeUrl(rsp.uri()); EdgeUrl rspUri = new EdgeUrl(rsp.uri());
@@ -121,10 +123,13 @@ public class HttpFetcherImpl implements HttpFetcher {
return new DomainProbeResult.Redirect(rspUri.domain); return new DomainProbeResult.Redirect(rspUri.domain);
} }
return new DomainProbeResult.Ok(rspUri); return new DomainProbeResult.Ok(rspUri);
} } catch (Exception ex) {
catch (Exception ex) { if (tries > 3) {
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, ex.getMessage()); return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, ex.getMessage());
} }
// else try again ...
}
}
} }
/** Perform a HEAD request to fetch the content type of a URL. /** Perform a HEAD request to fetch the content type of a URL.