mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
(crawler) Be more lenient when performing a domain probe
This commit is contained in:
parent
5354e034bf
commit
eee73ab16c
@ -45,6 +45,7 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
|
private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
|
||||||
|
|
||||||
private final Duration requestTimeout = Duration.ofSeconds(10);
|
private final Duration requestTimeout = Duration.ofSeconds(10);
|
||||||
|
private final Duration probeTimeout = Duration.ofSeconds(30);
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void setAllowAllContentTypes(boolean allowAllContentTypes) {
|
public void setAllowAllContentTypes(boolean allowAllContentTypes) {
|
||||||
@ -107,12 +108,13 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
.HEAD()
|
.HEAD()
|
||||||
.uri(url.asURI())
|
.uri(url.asURI())
|
||||||
.header("User-agent", userAgentString)
|
.header("User-agent", userAgentString)
|
||||||
.timeout(requestTimeout)
|
.timeout(probeTimeout)
|
||||||
.build();
|
.build();
|
||||||
} catch (URISyntaxException e) {
|
} catch (URISyntaxException e) {
|
||||||
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Invalid URL");
|
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Invalid URL");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for (int tries = 0;; tries++) {
|
||||||
try {
|
try {
|
||||||
var rsp = client.send(head, HttpResponse.BodyHandlers.discarding());
|
var rsp = client.send(head, HttpResponse.BodyHandlers.discarding());
|
||||||
EdgeUrl rspUri = new EdgeUrl(rsp.uri());
|
EdgeUrl rspUri = new EdgeUrl(rsp.uri());
|
||||||
@ -121,10 +123,13 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
return new DomainProbeResult.Redirect(rspUri.domain);
|
return new DomainProbeResult.Redirect(rspUri.domain);
|
||||||
}
|
}
|
||||||
return new DomainProbeResult.Ok(rspUri);
|
return new DomainProbeResult.Ok(rspUri);
|
||||||
}
|
} catch (Exception ex) {
|
||||||
catch (Exception ex) {
|
if (tries > 3) {
|
||||||
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, ex.getMessage());
|
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, ex.getMessage());
|
||||||
}
|
}
|
||||||
|
// else try again ...
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Perform a HEAD request to fetch the content type of a URL.
|
/** Perform a HEAD request to fetch the content type of a URL.
|
||||||
|
Loading…
Reference in New Issue
Block a user