(crawler) Content type prober should not swallow exceptions

This commit is contained in:
Viktor Lofgren 2024-04-25 17:54:07 +02:00
parent 4d71c776fc
commit 70e2e41955
3 changed files with 8 additions and 6 deletions

View File

@ -291,7 +291,7 @@ public class CrawlerRetreiver implements AutoCloseable {
crawlFrontier.addVisited(top);
}
}
else if (fetchedDoc instanceof HttpFetchResult.ResultException ex) {
else if (fetchedDoc instanceof HttpFetchResult.ResultException) {
errorCount ++;
}
}

View File

@ -69,7 +69,7 @@ public class ContentTypeProber {
return new ContentTypeProbeResult.Ok(ret);
} catch (SocketTimeoutException ex) {
return new ContentTypeProbeResult.Timeout();
return new ContentTypeProbeResult.Timeout(ex);
} catch (Exception ex) {
logger.error("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage());
@ -80,7 +80,7 @@ public class ContentTypeProber {
/**
 * Closed set of outcomes produced by the content type probe.
 * Each outcome is a record, so callers can distinguish them with
 * {@code instanceof} pattern matching and read the attached data.
 */
public sealed interface ContentTypeProbeResult {
// Successful probe; carries the URL the probe ended up with.
record Ok(EdgeUrl resolvedUrl) implements ContentTypeProbeResult { }
// Carries the reported content type string and the HTTP status code.
// NOTE(review): presumably produced when the content type is not acceptable -- confirm in the prober.
record BadContentType(String contentType, int statusCode) implements ContentTypeProbeResult { }
// NOTE(review): this component-less Timeout and the one right below are the before/after
// pair of this change; a type cannot declare both, only the variant carrying `ex` is meant to remain.
record Timeout() implements ContentTypeProbeResult { }
// Timed-out probe; `ex` is the exception that was caught (a SocketTimeoutException in the prober),
// kept so callers can report the cause instead of dropping it.
record Timeout(java.lang.Exception ex) implements ContentTypeProbeResult { }
// Failure carrying the exception as `ex`.
// java.lang.Exception is written fully qualified because this record is itself named Exception
// and would otherwise shadow it inside this interface.
record Exception(java.lang.Exception ex) implements ContentTypeProbeResult { }
}
}

View File

@ -162,11 +162,13 @@ public class HttpFetcherImpl implements HttpFetcher {
}
else if (probeResult instanceof ContentTypeProbeResult.BadContentType.Timeout timeout) {
warcRecorder.flagAsTimeout(url);
return new HttpFetchResult.ResultNone();
return new HttpFetchResult.ResultException(timeout.ex());
}
else if (probeResult instanceof ContentTypeProbeResult.Exception exception) {
warcRecorder.flagAsError(url, exception.ex());
return new HttpFetchResult.ResultNone();
return new HttpFetchResult.ResultException(exception.ex());
}
}
else {
@ -200,7 +202,7 @@ public class HttpFetcherImpl implements HttpFetcher {
}
}
return new HttpFetchResult.ResultNone();
return result;
}
@Override