(crawler) Be more lenient when performing a domain probe

This commit is contained in:
Viktor Lofgren 2025-01-28 15:24:30 +01:00
parent 5354e034bf
commit eee73ab16c

View File

@ -45,6 +45,7 @@ public class HttpFetcherImpl implements HttpFetcher {
private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic(); private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
private final Duration requestTimeout = Duration.ofSeconds(10); private final Duration requestTimeout = Duration.ofSeconds(10);
private final Duration probeTimeout = Duration.ofSeconds(30);
@Override @Override
public void setAllowAllContentTypes(boolean allowAllContentTypes) { public void setAllowAllContentTypes(boolean allowAllContentTypes) {
@ -107,23 +108,27 @@ public class HttpFetcherImpl implements HttpFetcher {
.HEAD() .HEAD()
.uri(url.asURI()) .uri(url.asURI())
.header("User-agent", userAgentString) .header("User-agent", userAgentString)
.timeout(requestTimeout) .timeout(probeTimeout)
.build(); .build();
} catch (URISyntaxException e) { } catch (URISyntaxException e) {
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Invalid URL"); return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Invalid URL");
} }
try { for (int tries = 0;; tries++) {
var rsp = client.send(head, HttpResponse.BodyHandlers.discarding()); try {
EdgeUrl rspUri = new EdgeUrl(rsp.uri()); var rsp = client.send(head, HttpResponse.BodyHandlers.discarding());
EdgeUrl rspUri = new EdgeUrl(rsp.uri());
if (!Objects.equals(rspUri.domain, url.domain)) { if (!Objects.equals(rspUri.domain, url.domain)) {
return new DomainProbeResult.Redirect(rspUri.domain); return new DomainProbeResult.Redirect(rspUri.domain);
}
return new DomainProbeResult.Ok(rspUri);
} catch (Exception ex) {
if (tries > 3) {
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, ex.getMessage());
}
// else try again ...
} }
return new DomainProbeResult.Ok(rspUri);
}
catch (Exception ex) {
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, ex.getMessage());
} }
} }