mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 04:58:59 +00:00
(crawler) Be more lenient when performing a domain probe
This commit is contained in:
parent
5354e034bf
commit
eee73ab16c
@ -45,6 +45,7 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
|
private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
|
||||||
|
|
||||||
private final Duration requestTimeout = Duration.ofSeconds(10);
|
private final Duration requestTimeout = Duration.ofSeconds(10);
|
||||||
|
private final Duration probeTimeout = Duration.ofSeconds(30);
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void setAllowAllContentTypes(boolean allowAllContentTypes) {
|
public void setAllowAllContentTypes(boolean allowAllContentTypes) {
|
||||||
@ -107,23 +108,27 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
.HEAD()
|
.HEAD()
|
||||||
.uri(url.asURI())
|
.uri(url.asURI())
|
||||||
.header("User-agent", userAgentString)
|
.header("User-agent", userAgentString)
|
||||||
.timeout(requestTimeout)
|
.timeout(probeTimeout)
|
||||||
.build();
|
.build();
|
||||||
} catch (URISyntaxException e) {
|
} catch (URISyntaxException e) {
|
||||||
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Invalid URL");
|
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Invalid URL");
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
for (int tries = 0;; tries++) {
|
||||||
var rsp = client.send(head, HttpResponse.BodyHandlers.discarding());
|
try {
|
||||||
EdgeUrl rspUri = new EdgeUrl(rsp.uri());
|
var rsp = client.send(head, HttpResponse.BodyHandlers.discarding());
|
||||||
|
EdgeUrl rspUri = new EdgeUrl(rsp.uri());
|
||||||
|
|
||||||
if (!Objects.equals(rspUri.domain, url.domain)) {
|
if (!Objects.equals(rspUri.domain, url.domain)) {
|
||||||
return new DomainProbeResult.Redirect(rspUri.domain);
|
return new DomainProbeResult.Redirect(rspUri.domain);
|
||||||
|
}
|
||||||
|
return new DomainProbeResult.Ok(rspUri);
|
||||||
|
} catch (Exception ex) {
|
||||||
|
if (tries > 3) {
|
||||||
|
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, ex.getMessage());
|
||||||
|
}
|
||||||
|
// else try again ...
|
||||||
}
|
}
|
||||||
return new DomainProbeResult.Ok(rspUri);
|
|
||||||
}
|
|
||||||
catch (Exception ex) {
|
|
||||||
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, ex.getMessage());
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user