(crawler) Be more lenient when performing a domain probe

Viktor Lofgren 2025-01-28 15:24:30 +01:00
parent 5354e034bf
commit eee73ab16c


@@ -45,6 +45,7 @@ public class HttpFetcherImpl implements HttpFetcher {
private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
private final Duration requestTimeout = Duration.ofSeconds(10);
private final Duration probeTimeout = Duration.ofSeconds(30);
@Override
public void setAllowAllContentTypes(boolean allowAllContentTypes) {
@@ -107,12 +108,13 @@ public class HttpFetcherImpl implements HttpFetcher {
.HEAD()
.uri(url.asURI())
.header("User-agent", userAgentString)
.timeout(requestTimeout)
.timeout(probeTimeout)
.build();
} catch (URISyntaxException e) {
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Invalid URL");
}
for (int tries = 0;; tries++) {
try {
var rsp = client.send(head, HttpResponse.BodyHandlers.discarding());
EdgeUrl rspUri = new EdgeUrl(rsp.uri());
@@ -121,10 +123,13 @@ public class HttpFetcherImpl implements HttpFetcher {
return new DomainProbeResult.Redirect(rspUri.domain);
}
return new DomainProbeResult.Ok(rspUri);
}
catch (Exception ex) {
} catch (Exception ex) {
if (tries > 3) {
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, ex.getMessage());
}
// else try again ...
}
}
}
/** Perform a HEAD request to fetch the content type of a URL.
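
Read together, the hunks make two changes to the domain probe: the HEAD request now uses a dedicated 30-second probeTimeout instead of the 10-second requestTimeout, and the probe is wrapped in a retry loop that only gives up after several consecutive failures. Below is a minimal sketch of how the probe reads after the commit, assembled from the hunks above; the method signature, the redirect check, and anything outside the shown hunks are assumptions, not the actual file contents.

    public DomainProbeResult probeDomain(EdgeUrl url) {
        HttpRequest head;
        try {
            head = HttpRequest.newBuilder()
                    .HEAD()
                    .uri(url.asURI())
                    .header("User-agent", userAgentString)
                    .timeout(probeTimeout) // 30 s probe timeout instead of the 10 s request timeout
                    .build();
        } catch (URISyntaxException e) {
            return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Invalid URL");
        }

        for (int tries = 0;; tries++) {
            try {
                var rsp = client.send(head, HttpResponse.BodyHandlers.discarding());
                EdgeUrl rspUri = new EdgeUrl(rsp.uri());

                // Redirect check assumed from the surrounding context (not shown in the hunks):
                // a probe that lands on a different domain is reported as a redirect.
                if (!Objects.equals(rspUri.domain, url.domain)) {
                    return new DomainProbeResult.Redirect(rspUri.domain);
                }
                return new DomainProbeResult.Ok(rspUri);
            } catch (Exception ex) {
                if (tries > 3) { // give up only after several failed attempts
                    return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, ex.getMessage());
                }
                // else try again ...
            }
        }
    }

The net effect is the leniency the commit message describes: a slow or flaky server gets a longer window per attempt and a handful of attempts before the domain is written off as an error.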