diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java index dbb2dcfc..cec26ca2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java @@ -33,7 +33,7 @@ public class CrawlerMain implements AutoCloseable { private final UserAgent userAgent; private final ThreadPoolExecutor pool; - final int poolSize = 512; + final int poolSize = Integer.getInteger("crawler.pool-size", 512); final int poolQueueSize = 32; public CrawlerMain(EdgeCrawlPlan plan) throws Exception { @@ -72,8 +72,7 @@ public class CrawlerMain implements AutoCloseable { HttpFetcher fetcher = new HttpFetcher(userAgent.uaString(), dispatcher, connectionPool); - try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id)) - { + try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id)) { var retreiver = new CrawlerRetreiver(fetcher, specification, writer); int size = retreiver.fetch(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java index e7ec01b2..3b7239c0 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java @@ -13,6 +13,7 @@ import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeLogic; import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeParser; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.wmsa.edge.model.crawl.EdgeContentType; import okhttp3.*; import org.apache.commons.io.input.BOMInputStream; 
import org.slf4j.Logger; @@ -20,9 +21,12 @@ import org.slf4j.LoggerFactory; import javax.net.ssl.X509TrustManager; import java.io.IOException; +import java.net.SocketTimeoutException; import java.net.URISyntaxException; import java.nio.charset.Charset; +import java.nio.charset.IllegalCharsetNameException; import java.nio.charset.StandardCharsets; +import java.nio.charset.UnsupportedCharsetException; import java.time.LocalDateTime; import java.util.List; import java.util.Objects; @@ -122,6 +126,7 @@ public class HttpFetcher { return new FetchResult(FetchResultState.OK, requestDomain); } catch (Exception ex) { + logger.debug("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage()); return new FetchResult(FetchResultState.ERROR, url.domain); } } @@ -156,7 +161,11 @@ public class HttpFetcher { return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, "Early probe failed"); } } + catch (SocketTimeoutException ex) { + return createTimeoutErrorRsp(url, ex); + } catch (Exception ex) { + logger.error("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage()); return createHardErrorRsp(url, ex); } } @@ -167,7 +176,17 @@ public class HttpFetcher { try (var rsp = call.execute()) { return extractBody(url, rsp); } + catch (RateLimitException rle) { + throw rle; + } + catch (SocketTimeoutException ex) { + return createTimeoutErrorRsp(url, ex); + } + catch (IllegalCharsetNameException ex) { + return createHardErrorRsp(url, ex); + } catch (Exception ex) { + logger.error("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage()); return createHardErrorRsp(url, ex); } } @@ -180,7 +199,14 @@ public class HttpFetcher { .url(url.toString()) .build(); } - + private CrawledDocument createTimeoutErrorRsp(EdgeUrl url, Exception why) { + return CrawledDocument.builder() + .crawlerStatus("Timeout") + .crawlerStatusDesc(why.getMessage()) + .timestamp(LocalDateTime.now().toString()) + .url(url.toString()) + 
.build(); + } private CrawledDocument createErrorResponse(EdgeUrl url, Response rsp, CrawlerDocumentStatus status, String why) { return CrawledDocument.builder() .crawlerStatus(status.toString()) @@ -234,7 +260,7 @@ public class HttpFetcher { return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CHARSET, ""); } - var strData = new String(data, Charset.forName(contentType.charset)); + var strData = getStringData(data, contentType); var canonical = rsp.header("rel=canonical", ""); return CrawledDocument.builder() @@ -249,6 +275,24 @@ public class HttpFetcher { .build(); } + private String getStringData(byte[] data, EdgeContentType contentType) { + Charset charset; + try { + charset = Charset.forName(contentType.charset); + } + catch (IllegalCharsetNameException ex) { + charset = StandardCharsets.UTF_8; + } + catch (UnsupportedCharsetException ex) { + // This is usually like Macintosh Latin + // (https://en.wikipedia.org/wiki/Macintosh_Latin_encoding) + // + // It's close enough to 8859-1 to serve + charset = StandardCharsets.ISO_8859_1; + } + return new String(data, charset); + } + private CrawledDocument createRedirectResponse(EdgeUrl url, Response rsp, EdgeUrl responseUrl) { return CrawledDocument.builder() diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpRedirectResolver.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpRedirectResolver.java index 4079db66..5857c19b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpRedirectResolver.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpRedirectResolver.java @@ -8,7 +8,6 @@ import lombok.SneakyThrows; import nu.marginalia.wmsa.client.exception.NetworkException; import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser; import nu.marginalia.wmsa.edge.model.EdgeUrl; -import okhttp3.Call; import okhttp3.OkHttpClient; import okhttp3.Request; import 
/* Pill-shaped button look for links inside .extra: pastel gradient on a
   grey fallback colour, raised (outset) border, never wrapped. */
.extra a {
    background: #ccc linear-gradient(45deg, rgba(255,220,220,1) 0%, rgba(219,255,196,1) 50%, rgba(212,216,255,1) 100%);
    color: #000;
    padding: 0.5ch;
    border-radius: 0.5ch;
    text-decoration: none;
    border: 3px outset #000;
    /* "none" is not a valid word-break value and the declaration was being
       dropped by browsers; "normal" is valid, and white-space: nowrap below
       is what actually keeps the label on one line. */
    word-break: normal;
    white-space: nowrap;
}

/* Pressed state: flip the border to inset so the button appears pushed in. */
.extra a:active {
    border: 3px inset #000;
}