From e9854f194c162f9aedb84fde03815a923850e539 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 23 Sep 2024 17:51:07 +0200 Subject: [PATCH] (crawler) Refactor * Restructure the code to make a bit more sense * Store full headers in crawl data * Fix bug in retry-after header that assumed the timeout was in milliseconds, and then clamped it to a lower bound of 500ms, meaning this was almost always handled wrong --- ...CrawlingThenConvertingIntegrationTest.java | 8 +- .../java/nu/marginalia/crawl/CrawlerMain.java | 8 +- .../{retreival => }/fetcher/ContentTags.java | 2 +- .../crawl/{retreival => fetcher}/Cookies.java | 2 +- .../{retreival => }/fetcher/HttpFetcher.java | 9 +- .../fetcher/HttpFetcherImpl.java | 68 ++++++++-- .../fetcher/SitemapRetriever.java | 2 +- .../socket/FastTerminatingSocketFactory.java | 2 +- .../IpInterceptingNetworkInterceptor.java | 2 +- .../fetcher/socket/NoSecuritySSL.java | 2 +- .../fetcher/warc/WarcDigestBuilder.java | 2 +- .../fetcher/warc/WarcInputBuffer.java | 2 +- .../warc/WarcProtocolReconstructor.java | 7 +- .../fetcher/warc/WarcRecorder.java | 63 +++++---- .../fetcher => logic}/ContentTypeProber.java | 6 +- .../{retreival => logic}/DomainLocks.java | 2 +- .../LinkFilterSelector.java | 16 ++- .../SoftIfModifiedSinceProber.java | 3 +- .../crawl/retreival/CrawlDelayTimer.java | 18 ++- .../retreival/CrawledDocumentFactory.java | 91 ------------- .../crawl/retreival/CrawlerRetreiver.java | 76 +++++------ .../retreival/CrawlerWarcResynchronizer.java | 13 +- .../crawl/retreival/DomainCrawlFrontier.java | 7 +- .../crawl/retreival/DomainProber.java | 37 +----- .../crawl/retreival/RateLimitException.java | 21 --- .../crawl/retreival/fetcher/FetchResult.java | 24 ---- .../retreival/fetcher/FetchResultState.java | 7 - .../retreival/revisit/CrawlerRevisitor.java | 5 +- .../revisit/DocumentWithReference.java | 2 +- .../retreival/sitemap/SitemapFetcher.java | 2 +- .../ParquetSerializableCrawlDataStream.java | 2 +- .../model/crawldata/CrawledDocument.java | 1 - .../CrawledDocumentParquetRecord.java | 11 +- ...rawledDocumentParquetRecordFileWriter.java | 13 ++ .../crawl/logic/ContentTypeProberTest.java | 124 ++++++++++++++++++ .../CrawlerWarcResynchronizerTest.java | 7 +- .../fetcher/ContentTypeProberTest.java | 7 +- .../retreival/fetcher/WarcRecorderTest.java | 10 +- .../revisit/DocumentWithReferenceTest.java | 2 +- .../marginalia/crawling/HttpFetcherTest.java | 13 +- .../retreival/CrawlerMockFetcherTest.java | 13 +- .../retreival/CrawlerRetreiverTest.java | 10 +- .../test/nu/marginalia/IntegrationTest.java | 11 +- .../screenshot/ScreenshotCaptureToolMain.java | 2 +- 44 files changed, 398 insertions(+), 337 deletions(-) rename code/processes/crawling-process/java/nu/marginalia/crawl/{retreival => }/fetcher/ContentTags.java (97%) rename code/processes/crawling-process/java/nu/marginalia/crawl/{retreival => fetcher}/Cookies.java (96%) rename code/processes/crawling-process/java/nu/marginalia/crawl/{retreival => }/fetcher/HttpFetcher.java (73%) rename code/processes/crawling-process/java/nu/marginalia/crawl/{retreival => }/fetcher/HttpFetcherImpl.java (78%) rename code/processes/crawling-process/java/nu/marginalia/crawl/{retreival => }/fetcher/SitemapRetriever.java (98%) rename code/processes/crawling-process/java/nu/marginalia/crawl/{retreival => }/fetcher/socket/FastTerminatingSocketFactory.java (96%) rename code/processes/crawling-process/java/nu/marginalia/crawl/{retreival => }/fetcher/socket/IpInterceptingNetworkInterceptor.java (94%) rename 
code/processes/crawling-process/java/nu/marginalia/crawl/{retreival => }/fetcher/socket/NoSecuritySSL.java (97%) rename code/processes/crawling-process/java/nu/marginalia/crawl/{retreival => }/fetcher/warc/WarcDigestBuilder.java (93%) rename code/processes/crawling-process/java/nu/marginalia/crawl/{retreival => }/fetcher/warc/WarcInputBuffer.java (99%) rename code/processes/crawling-process/java/nu/marginalia/crawl/{retreival => }/fetcher/warc/WarcProtocolReconstructor.java (96%) rename code/processes/crawling-process/java/nu/marginalia/crawl/{retreival => }/fetcher/warc/WarcRecorder.java (84%) rename code/processes/crawling-process/java/nu/marginalia/crawl/{retreival/fetcher => logic}/ContentTypeProber.java (96%) rename code/processes/crawling-process/java/nu/marginalia/crawl/{retreival => logic}/DomainLocks.java (97%) rename code/processes/crawling-process/java/nu/marginalia/crawl/{retreival => logic}/LinkFilterSelector.java (77%) rename code/processes/crawling-process/java/nu/marginalia/crawl/{retreival/fetcher => logic}/SoftIfModifiedSinceProber.java (95%) delete mode 100644 code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawledDocumentFactory.java delete mode 100644 code/processes/crawling-process/java/nu/marginalia/crawl/retreival/RateLimitException.java delete mode 100644 code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/FetchResult.java delete mode 100644 code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/FetchResultState.java create mode 100644 code/processes/crawling-process/test/nu/marginalia/crawl/logic/ContentTypeProberTest.java diff --git a/code/processes/converting-process/test/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java b/code/processes/converting-process/test/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java index 0e935276..b433a276 100644 --- a/code/processes/converting-process/test/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java +++ b/code/processes/converting-process/test/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java @@ -7,11 +7,11 @@ import nu.marginalia.UserAgent; import nu.marginalia.WmsaHome; import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.converting.processor.DomainProcessor; +import nu.marginalia.crawl.fetcher.HttpFetcher; +import nu.marginalia.crawl.fetcher.HttpFetcherImpl; +import nu.marginalia.crawl.fetcher.warc.WarcRecorder; import nu.marginalia.crawl.retreival.CrawlerRetreiver; import nu.marginalia.crawl.retreival.DomainProber; -import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; -import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; -import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.io.crawldata.format.ParquetSerializableCrawlDataStream; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.crawl.DomainIndexingState; @@ -266,7 +266,7 @@ public class CrawlingThenConvertingIntegrationTest { List data = new ArrayList<>(); try (var recorder = new WarcRecorder(fileName)) { - new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, recorder).fetch(); + new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, recorder).crawlDomain(); } CrawledDocumentParquetRecordFileWriter.convertWarc(specs.domain, diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java index d8d49f3b..9577f2c1 100644 
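Note on the Retry-After fix called out in the commit message: the old code parsed the header value as an integer, clamped it, and slept that many milliseconds, but Retry-After is specified in whole seconds, so a compliant "Retry-After: 30" was read as 30 ms and clamped up to the floor while large values were silently capped. A minimal sketch of the two interpretations, using the constants visible in the diffs below (helper names are illustrative, not part of the patch):

import java.time.Duration;

class RetryAfterSketch {
    // Old interpretation: the value was taken as milliseconds.
    // "Retry-After: 30" -> 30 -> clamped to 100 ms instead of 30 s.
    static long oldDelayMillis(String headerValue) {
        int delay;
        try { delay = Integer.parseInt(headerValue); }
        catch (NumberFormatException e) { delay = 1000; }
        return Math.clamp(delay, 100, 5000);
    }

    // New interpretation: the value is whole seconds, carried as a Duration,
    // later clamped to [1 s, 5 s] by CrawlDelayTimer before sleeping.
    static Duration newDelay(String headerValue) {
        try { return Duration.ofSeconds(Integer.parseInt(headerValue)); }
        catch (NumberFormatException e) { return Duration.ofSeconds(1); }
    }
}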
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java @@ -10,12 +10,12 @@ import nu.marginalia.UserAgent; import nu.marginalia.WmsaHome; import nu.marginalia.atags.source.AnchorTagsSource; import nu.marginalia.atags.source.AnchorTagsSourceFactory; +import nu.marginalia.crawl.fetcher.HttpFetcherImpl; +import nu.marginalia.crawl.fetcher.warc.WarcRecorder; +import nu.marginalia.crawl.logic.DomainLocks; import nu.marginalia.crawl.retreival.CrawlDataReference; import nu.marginalia.crawl.retreival.CrawlerRetreiver; -import nu.marginalia.crawl.retreival.DomainLocks; import nu.marginalia.crawl.retreival.DomainProber; -import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; -import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.crawl.spec.CrawlSpecProvider; import nu.marginalia.crawl.spec.DbCrawlSpecProvider; import nu.marginalia.crawl.spec.ParquetCrawlSpecProvider; @@ -294,7 +294,7 @@ public class CrawlerMain extends ProcessMainClass { Files.delete(tempFile); } - int size = retriever.fetch(domainLinks, reference); + int size = retriever.crawlDomain(domainLinks, reference); // Delete the reference crawl data if it's not the same as the new one // (mostly a case when migrating from legacy->warc) diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/ContentTags.java b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/ContentTags.java similarity index 97% rename from code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/ContentTags.java rename to code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/ContentTags.java index a3b3a2bc..c8cddc4e 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/ContentTags.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/ContentTags.java @@ -1,4 +1,4 @@ -package nu.marginalia.crawl.retreival.fetcher; +package nu.marginalia.crawl.fetcher; import okhttp3.Request; diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/Cookies.java b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/Cookies.java similarity index 96% rename from code/processes/crawling-process/java/nu/marginalia/crawl/retreival/Cookies.java rename to code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/Cookies.java index 7b43321e..10cf2bc7 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/Cookies.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/Cookies.java @@ -1,4 +1,4 @@ -package nu.marginalia.crawl.retreival; +package nu.marginalia.crawl.fetcher; import okhttp3.Cookie; import okhttp3.CookieJar; diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcher.java similarity index 73% rename from code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java rename to code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcher.java index 42723d5c..a5bc005f 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcher.java @@ -1,9 +1,8 @@ -package nu.marginalia.crawl.retreival.fetcher; +package nu.marginalia.crawl.fetcher; import 
com.google.inject.ImplementedBy; import crawlercommons.robots.SimpleRobotRules; -import nu.marginalia.crawl.retreival.RateLimitException; -import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; +import nu.marginalia.crawl.fetcher.warc.WarcRecorder; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.body.HttpFetchResult; @@ -17,12 +16,12 @@ public interface HttpFetcher { List getCookies(); void clearCookies(); - FetchResult probeDomain(EdgeUrl url); + HttpFetcherImpl.ProbeResult probeDomain(EdgeUrl url); HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, ContentTags tags, - ProbeType probeType) throws RateLimitException; + ProbeType probeType) throws HttpFetcherImpl.RateLimitException; SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder); diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java similarity index 78% rename from code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java rename to code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java index 40725b0f..567d9d2d 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java @@ -1,22 +1,23 @@ -package nu.marginalia.crawl.retreival.fetcher; +package nu.marginalia.crawl.fetcher; import com.google.inject.Inject; import crawlercommons.robots.SimpleRobotRules; import crawlercommons.robots.SimpleRobotRulesParser; import lombok.SneakyThrows; import nu.marginalia.UserAgent; -import nu.marginalia.crawl.retreival.Cookies; -import nu.marginalia.crawl.retreival.RateLimitException; -import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeResult; -import nu.marginalia.crawl.retreival.fetcher.socket.FastTerminatingSocketFactory; -import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor; -import nu.marginalia.crawl.retreival.fetcher.socket.NoSecuritySSL; -import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; +import nu.marginalia.crawl.fetcher.socket.FastTerminatingSocketFactory; +import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor; +import nu.marginalia.crawl.fetcher.socket.NoSecuritySSL; +import nu.marginalia.crawl.fetcher.warc.WarcRecorder; +import nu.marginalia.crawl.logic.ContentTypeProber; +import nu.marginalia.crawl.logic.ContentTypeProber.ContentTypeProbeResult; +import nu.marginalia.crawl.logic.SoftIfModifiedSinceProber; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.body.ContentTypeLogic; import nu.marginalia.model.body.DocumentBodyExtractor; import nu.marginalia.model.body.HttpFetchResult; +import nu.marginalia.model.crawldata.CrawlerDomainStatus; import okhttp3.ConnectionPool; import okhttp3.Dispatcher; import okhttp3.OkHttpClient; @@ -25,6 +26,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import javax.net.ssl.X509TrustManager; +import java.time.Duration; import java.util.List; import java.util.Objects; import java.util.Optional; @@ -114,7 +116,7 @@ public class HttpFetcherImpl implements HttpFetcher { */ @Override @SneakyThrows - public FetchResult probeDomain(EdgeUrl url) { + public ProbeResult probeDomain(EdgeUrl url) { var head = new 
Request.Builder().head().addHeader("User-agent", userAgentString) .url(url.toString()) .build(); @@ -125,9 +127,9 @@ public class HttpFetcherImpl implements HttpFetcher { EdgeUrl requestUrl = new EdgeUrl(rsp.request().url().toString()); if (!Objects.equals(requestUrl.domain, url.domain)) { - return new FetchResult(FetchResultState.REDIRECT, requestUrl); + return new ProbeResultRedirect(url.domain); } - return new FetchResult(FetchResultState.OK, requestUrl); + return new ProbeResultOk(requestUrl); } catch (Exception ex) { @@ -136,7 +138,7 @@ public class HttpFetcherImpl implements HttpFetcher { } logger.info("Error during fetching {}", ex.getMessage()); - return new FetchResult(FetchResultState.ERROR, url); + return new ProbeResultError(CrawlerDomainStatus.ERROR, ex.getMessage()); } } @@ -196,8 +198,7 @@ public class HttpFetcherImpl implements HttpFetcher { if (result instanceof HttpFetchResult.ResultOk ok) { if (ok.statusCode() == 429) { - String retryAfter = Objects.requireNonNullElse(ok.header("Retry-After"), "1000"); - throw new RateLimitException(retryAfter); + throw new RateLimitException(Objects.requireNonNullElse(ok.header("Retry-After"), "1")); } if (ok.statusCode() == 304) { return new HttpFetchResult.Result304Raw(); @@ -249,5 +250,44 @@ public class HttpFetcherImpl implements HttpFetcher { } + public sealed interface ProbeResult permits ProbeResultError, ProbeResultRedirect, ProbeResultOk {} + + /** The probing failed for one reason or another + * @param status Machine readable status + * @param desc Human-readable description of the error + */ + public record ProbeResultError(CrawlerDomainStatus status, String desc) implements ProbeResult {} + + /** This domain redirects to another domain */ + public record ProbeResultRedirect(EdgeDomain domain) implements ProbeResult {} + + /** If the retrieval of the probed url was successful, return the url as it was fetched + * (which may be different from the url we probed, if we attempted another URL schema). 
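+ *
+ * Since ProbeResult is sealed, call sites can pattern-match over the three
+ * variants exhaustively, e.g. (an illustrative consumer, not part of this patch):
+ *
+ *   String status = switch (probeResult) {
+ *       case HttpFetcherImpl.ProbeResultOk ok -> "OK: " + ok.probedUrl();
+ *       case HttpFetcherImpl.ProbeResultRedirect redirect -> "REDIRECT;" + redirect.domain();
+ *       case HttpFetcherImpl.ProbeResultError error -> error.status() + ";" + error.desc();
+ *   };
+ *
+ * Unlike the old FetchResultState enum, adding a new variant becomes a
+ * compile error at every such switch.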
+ * + * @param probedUrl The url we successfully probed + */ + public record ProbeResultOk(EdgeUrl probedUrl) implements ProbeResult {} + + + /** Exception thrown when the server signals the rate limit is exceeded */ + public static class RateLimitException extends Exception { + private final String retryAfter; + + public RateLimitException(String retryAfterHeader) { + this.retryAfter = retryAfterHeader; + } + + @Override + public StackTraceElement[] getStackTrace() { return new StackTraceElement[0]; } + + public Duration retryAfter() { + try { + return Duration.ofSeconds(Integer.parseInt(retryAfter)); + } + catch (NumberFormatException ex) { + return Duration.ofSeconds(1); + } + } + } } diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/SitemapRetriever.java b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/SitemapRetriever.java similarity index 98% rename from code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/SitemapRetriever.java rename to code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/SitemapRetriever.java index 90b26a88..031aad03 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/SitemapRetriever.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/SitemapRetriever.java @@ -1,4 +1,4 @@ -package nu.marginalia.crawl.retreival.fetcher; +package nu.marginalia.crawl.fetcher; import crawlercommons.sitemaps.*; import nu.marginalia.model.EdgeUrl; diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/socket/FastTerminatingSocketFactory.java b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/socket/FastTerminatingSocketFactory.java similarity index 96% rename from code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/socket/FastTerminatingSocketFactory.java rename to code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/socket/FastTerminatingSocketFactory.java index ffb29b33..1aa7b5aa 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/socket/FastTerminatingSocketFactory.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/socket/FastTerminatingSocketFactory.java @@ -1,4 +1,4 @@ -package nu.marginalia.crawl.retreival.fetcher.socket; +package nu.marginalia.crawl.fetcher.socket; import javax.net.SocketFactory; import java.io.IOException; diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/socket/IpInterceptingNetworkInterceptor.java b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/socket/IpInterceptingNetworkInterceptor.java similarity index 94% rename from code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/socket/IpInterceptingNetworkInterceptor.java rename to code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/socket/IpInterceptingNetworkInterceptor.java index 90f43e5c..9287b6f0 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/socket/IpInterceptingNetworkInterceptor.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/socket/IpInterceptingNetworkInterceptor.java @@ -1,4 +1,4 @@ -package nu.marginalia.crawl.retreival.fetcher.socket; +package nu.marginalia.crawl.fetcher.socket; import okhttp3.Interceptor; import okhttp3.Response; diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/socket/NoSecuritySSL.java 
b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/socket/NoSecuritySSL.java similarity index 97% rename from code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/socket/NoSecuritySSL.java rename to code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/socket/NoSecuritySSL.java index 4833c6e7..f8278287 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/socket/NoSecuritySSL.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/socket/NoSecuritySSL.java @@ -1,4 +1,4 @@ -package nu.marginalia.crawl.retreival.fetcher.socket; +package nu.marginalia.crawl.fetcher.socket; import lombok.SneakyThrows; diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcDigestBuilder.java b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcDigestBuilder.java similarity index 93% rename from code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcDigestBuilder.java rename to code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcDigestBuilder.java index 79b4c86a..52403fa9 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcDigestBuilder.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcDigestBuilder.java @@ -1,4 +1,4 @@ -package nu.marginalia.crawl.retreival.fetcher.warc; +package nu.marginalia.crawl.fetcher.warc; import org.netpreserve.jwarc.WarcDigest; diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcInputBuffer.java b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcInputBuffer.java similarity index 99% rename from code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcInputBuffer.java rename to code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcInputBuffer.java index d5864071..641a9f6e 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcInputBuffer.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcInputBuffer.java @@ -1,4 +1,4 @@ -package nu.marginalia.crawl.retreival.fetcher.warc; +package nu.marginalia.crawl.fetcher.warc; import okhttp3.Headers; import okhttp3.Response; diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcProtocolReconstructor.java similarity index 96% rename from code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java rename to code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcProtocolReconstructor.java index b75589ee..bcbcd7e3 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcProtocolReconstructor.java @@ -1,7 +1,6 @@ -package nu.marginalia.crawl.retreival.fetcher.warc; +package nu.marginalia.crawl.fetcher.warc; import okhttp3.Protocol; -import okhttp3.Request; import okhttp3.Response; import org.apache.commons.lang3.StringUtils; @@ -73,7 +72,7 @@ public class WarcProtocolReconstructor { String headerString = getHeadersAsString(headersAsString); - return STR."HTTP/\{version} \{statusCode} 
\{statusMessage}\r\n\{headerString}\r\n\r\n"; + return "HTTP/" + version + " " + statusCode + " " + statusMessage + "\r\n" + headerString + "\r\n\r\n"; } static String getResponseHeader(Response response, long size) { @@ -84,7 +83,7 @@ public class WarcProtocolReconstructor { String headerString = getHeadersAsString(response, size); - return STR."HTTP/\{version} \{statusCode} \{statusMessage}\r\n\{headerString}\r\n\r\n"; + return "HTTP/" + version + " " + statusCode + " " + statusMessage + "\r\n" + headerString + "\r\n\r\n"; } private static final Map STATUS_CODE_MAP = Map.ofEntries( diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcRecorder.java similarity index 84% rename from code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java rename to code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcRecorder.java index 1d4a4372..3fa5a903 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcRecorder.java @@ -1,13 +1,14 @@ -package nu.marginalia.crawl.retreival.fetcher.warc; +package nu.marginalia.crawl.fetcher.warc; -import nu.marginalia.crawl.retreival.DomainProber; -import nu.marginalia.crawl.retreival.fetcher.ContentTags; -import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor; +import nu.marginalia.crawl.fetcher.ContentTags; +import nu.marginalia.crawl.fetcher.HttpFetcherImpl; +import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.body.HttpFetchResult; import okhttp3.OkHttpClient; import okhttp3.Request; +import org.jetbrains.annotations.Nullable; import org.netpreserve.jwarc.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -183,7 +184,7 @@ public class WarcRecorder implements AutoCloseable { writer.write(item); } - private void saveOldResponse(EdgeUrl url, String contentType, int statusCode, String documentBody, ContentTags contentTags) { + private void saveOldResponse(EdgeUrl url, String contentType, int statusCode, String documentBody, @Nullable String headers, ContentTags contentTags) { try { WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder(); WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder(); @@ -192,24 +193,42 @@ public class WarcRecorder implements AutoCloseable { if (documentBody == null) { bytes = new byte[0]; - } - else { + } else { bytes = documentBody.getBytes(); } - StringJoiner fakeHeadersBuilder = new StringJoiner("\n"); + // Create a synthesis of custom headers and the original headers + // to create a new set of headers that will be written to the WARC file. 
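+ //
+ // Illustrative example (values invented for the sake of the comment): stored
+ // headers "Content-Type: text/html; charset=iso-8859-1", "X-Powered-By: PHP/8.1"
+ // and an ETag matching contentTags, with contentType = "text/html" and
+ // bytes.length = 1024, become:
+ //
+ //   Content-Type: text/html
+ //   Content-Length: 1024
+ //   ETag: <the etag from contentTags>
+ //   X-Powered-By: PHP/8.1
+ //
+ // i.e. the synthesized values win for Content-Type and Content-Length (and for
+ // ETag/Last-Modified when contentTags carries them); everything else passes through.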
- fakeHeadersBuilder.add(STR."Content-Type: \{contentType}"); - fakeHeadersBuilder.add(STR."Content-Length: \{bytes.length}"); + StringJoiner syntheticHeadersBuilder = new StringJoiner("\n"); + + syntheticHeadersBuilder.add("Content-Type: " + contentType); + syntheticHeadersBuilder.add("Content-Length: " + bytes.length); if (contentTags.etag() != null) { - fakeHeadersBuilder.add(STR."ETag: \{contentTags.etag()}"); + syntheticHeadersBuilder.add("ETag: " + contentTags.etag()); } if (contentTags.lastMod() != null) { - fakeHeadersBuilder.add(STR."Last-Modified: \{contentTags.lastMod()}"); + syntheticHeadersBuilder.add("Last-Modified: " + contentTags.lastMod()); + } + + // Grab the headers from the original response and add them to the synthetic headers if they are not + // Content-Type, Content-Length, ETag, or Last-Modified + for (String headerLine : Objects.requireNonNullElse(headers, "").split("\n")) { + if (headerLine.isBlank()) continue; + + var lowerCase = headerLine.toLowerCase(); + + if (lowerCase.startsWith("content-type:")) continue; + if (lowerCase.startsWith("content-length:")) continue; + + if (contentTags.etag() != null && lowerCase.startsWith("etag:")) continue; + if (contentTags.lastMod() != null && lowerCase.startsWith("last-modified:")) continue; + + syntheticHeadersBuilder.add(headerLine); } byte[] header = WarcProtocolReconstructor - .getResponseHeader(fakeHeadersBuilder.toString(), statusCode) + .getResponseHeader(syntheticHeadersBuilder.toString(), statusCode) .getBytes(StandardCharsets.UTF_8); ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(bytes.length + header.length); responseDataBuffer.put(header); @@ -244,25 +263,25 @@ * an E-Tag or Last-Modified header, and the server responds with a 304 Not Modified. In this * scenario we want to record the data as it was in the previous crawl, but not re-fetch it. 
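 *
 * Roughly, the revisit path that ends up here looks like this (a sketch using
 * the signatures in this patch; doc is the document from the previous crawl):
 *
 *   var tags = new ContentTags(doc.etagMaybe, doc.lastModifiedMaybe);
 *   var result = fetcher.fetchContent(url, warcRecorder, tags, probeType);
 *   if (result instanceof HttpFetchResult.Result304Raw) {
 *       warcRecorder.writeReferenceCopy(url, doc.contentType, doc.httpStatus,
 *                                       doc.documentBody, doc.headers, tags);
 *   }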
*/ - public void writeReferenceCopy(EdgeUrl url, String contentType, int statusCode, String documentBody, ContentTags ctags) { - saveOldResponse(url, contentType, statusCode, documentBody, ctags); + public void writeReferenceCopy(EdgeUrl url, String contentType, int statusCode, String documentBody, @Nullable String headers, ContentTags ctags) { + saveOldResponse(url, contentType, statusCode, documentBody, headers, ctags); } - public void writeWarcinfoHeader(String ip, EdgeDomain domain, DomainProber.ProbeResult result) throws IOException { + public void writeWarcinfoHeader(String ip, EdgeDomain domain, HttpFetcherImpl.ProbeResult result) throws IOException { Map> fields = new HashMap<>(); fields.put("ip", List.of(ip)); - fields.put("software", List.of(STR."search.marginalia.nu/\{warcRecorderVersion}")); + fields.put("software", List.of("search.marginalia.nu/" + warcRecorderVersion)); fields.put("domain", List.of(domain.toString())); switch (result) { - case DomainProber.ProbeResultRedirect redirectDomain: - fields.put("X-WARC-Probe-Status", List.of(STR."REDIRECT;\{redirectDomain.domain()}")); + case HttpFetcherImpl.ProbeResultRedirect redirectDomain: + fields.put("X-WARC-Probe-Status", List.of("REDIRECT;" + redirectDomain.domain())); break; - case DomainProber.ProbeResultError error: - fields.put("X-WARC-Probe-Status", List.of(STR."\{error.status().toString()};\{error.desc()}")); + case HttpFetcherImpl.ProbeResultError error: + fields.put("X-WARC-Probe-Status", List.of(error.status().toString() + ";" + error.desc())); break; - case DomainProber.ProbeResultOk ok: + case HttpFetcherImpl.ProbeResultOk ok: fields.put("X-WARC-Probe-Status", List.of("OK")); break; } diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/ContentTypeProber.java b/code/processes/crawling-process/java/nu/marginalia/crawl/logic/ContentTypeProber.java similarity index 96% rename from code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/ContentTypeProber.java rename to code/processes/crawling-process/java/nu/marginalia/crawl/logic/ContentTypeProber.java index c9997017..93d15579 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/ContentTypeProber.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/logic/ContentTypeProber.java @@ -1,4 +1,4 @@ -package nu.marginalia.crawl.retreival.fetcher; +package nu.marginalia.crawl.logic; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.body.ContentTypeLogic; @@ -7,7 +7,7 @@ import okhttp3.Request; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.net.SocketTimeoutException; +import java.io.InterruptedIOException; import java.util.Objects; public class ContentTypeProber { @@ -68,7 +68,7 @@ public class ContentTypeProber { return new ContentTypeProbeResult.Ok(ret); - } catch (SocketTimeoutException ex) { + } catch (InterruptedIOException ex) { return new ContentTypeProbeResult.Timeout(ex); } catch (Exception ex) { logger.error("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage()); diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/DomainLocks.java b/code/processes/crawling-process/java/nu/marginalia/crawl/logic/DomainLocks.java similarity index 97% rename from code/processes/crawling-process/java/nu/marginalia/crawl/retreival/DomainLocks.java rename to code/processes/crawling-process/java/nu/marginalia/crawl/logic/DomainLocks.java index 3b061d93..856fa387 100644 --- 
a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/DomainLocks.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/logic/DomainLocks.java @@ -1,4 +1,4 @@ -package nu.marginalia.crawl.retreival; +package nu.marginalia.crawl.logic; import nu.marginalia.model.EdgeDomain; diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/LinkFilterSelector.java b/code/processes/crawling-process/java/nu/marginalia/crawl/logic/LinkFilterSelector.java similarity index 77% rename from code/processes/crawling-process/java/nu/marginalia/crawl/retreival/LinkFilterSelector.java rename to code/processes/crawling-process/java/nu/marginalia/crawl/logic/LinkFilterSelector.java index e3b5f998..4ac23ccb 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/LinkFilterSelector.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/logic/LinkFilterSelector.java @@ -1,4 +1,4 @@ -package nu.marginalia.crawl.retreival; +package nu.marginalia.crawl.logic; import nu.marginalia.model.EdgeUrl; import org.jsoup.nodes.Document; @@ -26,6 +26,20 @@ public class LinkFilterSelector { if (isDiscourse(head)) { return url -> url.path.startsWith("/t/") || url.path.contains("/latest"); } + if (isMediawiki(head)) { + return url -> { + if (url.path.endsWith(".php")) { + return false; + } + if (url.path.contains("Special:")) { + return false; + } + if (url.path.contains("Talk:")) { + return false; + } + return true; + }; + } return LinkFilterSelector::defaultFilter; } diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/SoftIfModifiedSinceProber.java b/code/processes/crawling-process/java/nu/marginalia/crawl/logic/SoftIfModifiedSinceProber.java similarity index 95% rename from code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/SoftIfModifiedSinceProber.java rename to code/processes/crawling-process/java/nu/marginalia/crawl/logic/SoftIfModifiedSinceProber.java index 7b6071e9..6f059ff5 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/SoftIfModifiedSinceProber.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/logic/SoftIfModifiedSinceProber.java @@ -1,6 +1,7 @@ -package nu.marginalia.crawl.retreival.fetcher; +package nu.marginalia.crawl.logic; import com.google.common.base.Strings; +import nu.marginalia.crawl.fetcher.ContentTags; import nu.marginalia.model.EdgeUrl; import okhttp3.OkHttpClient; import okhttp3.Request; diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java index e52b73b6..e1e6d24a 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java @@ -1,6 +1,9 @@ package nu.marginalia.crawl.retreival; import lombok.SneakyThrows; +import nu.marginalia.crawl.fetcher.HttpFetcherImpl; + +import java.time.Duration; import static java.lang.Math.max; import static java.lang.Math.min; @@ -22,12 +25,21 @@ public class CrawlDelayTimer { /** Call when we've gotten an HTTP 429 response. This will wait a moment, and then * set a flag that slows down the main crawl delay as well. 
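 *
 * The Retry-After delay is clamped to the window [1s, 5s] before sleeping:
 * "Retry-After: 0" is raised to one second, "Retry-After: 2" is honored as-is,
 * "Retry-After: 3600" is capped at five seconds, and a malformed value falls
 * back to the one-second default from RateLimitException.retryAfter().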
*/ - public void waitRetryDelay(RateLimitException ex) throws InterruptedException { + public void waitRetryDelay(HttpFetcherImpl.RateLimitException ex) throws InterruptedException { slowDown = true; - int delay = ex.retryAfter(); + Duration delay = ex.retryAfter(); - Thread.sleep(Math.clamp(delay, 100, 5000)); + if (delay.compareTo(Duration.ofSeconds(1)) < 0) { + // If the server wants us to retry in less than a second, wait at least a second + delay = Duration.ofSeconds(1); + } + else if (delay.compareTo(Duration.ofSeconds(5)) > 0) { + // If the server wants us to retry in more than five seconds, cap the wait at five seconds + delay = Duration.ofSeconds(5); + } + + Thread.sleep(delay.toMillis()); } @SneakyThrows diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawledDocumentFactory.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawledDocumentFactory.java deleted file mode 100644 index c7fee792..00000000 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawledDocumentFactory.java +++ /dev/null @@ -1,91 +0,0 @@ -package nu.marginalia.crawl.retreival; - -import nu.marginalia.model.EdgeUrl; - -import nu.marginalia.model.body.HttpFetchResult; -import nu.marginalia.model.crawldata.CrawledDocument; -import nu.marginalia.model.crawldata.CrawlerDocumentStatus; - -import java.time.LocalDateTime; -import java.util.Objects; - -public class CrawledDocumentFactory { - - public static CrawledDocument createHardErrorRsp(EdgeUrl url, Exception why) { - return CrawledDocument.builder() - .crawlerStatus(CrawlerDocumentStatus.ERROR.toString()) - .crawlerStatusDesc(why.getClass().getSimpleName() + ": " + why.getMessage()) - .timestamp(LocalDateTime.now().toString()) - .url(url.toString()) - .build(); - } - - public static CrawledDocument createUnknownHostError(EdgeUrl url) { - return CrawledDocument.builder() - .crawlerStatus(CrawlerDocumentStatus.ERROR.toString()) - .crawlerStatusDesc("Unknown Host") - .timestamp(LocalDateTime.now().toString()) - .url(url.toString()) - .build(); - } - - public static CrawledDocument createTimeoutErrorRsp(EdgeUrl url) { - return CrawledDocument.builder() - .crawlerStatus("Timeout") - .timestamp(LocalDateTime.now().toString()) - .url(url.toString()) - .build(); - } - - public static CrawledDocument createErrorResponse(EdgeUrl url, HttpFetchResult.ResultOk rsp, CrawlerDocumentStatus status, String why) { - return CrawledDocument.builder() - .crawlerStatus(status.toString()) - .crawlerStatusDesc(why) - .headers(rsp.headers().toString()) - .contentType(Objects.requireNonNullElse(rsp.headers().get("Content-Type"), "")) - .timestamp(LocalDateTime.now().toString()) - .httpStatus(rsp.statusCode()) - .url(url.toString()) - .build(); - } - public static CrawledDocument createErrorResponse(EdgeUrl url, String contentType, int statusCode, CrawlerDocumentStatus status, String why) { - return CrawledDocument.builder() - .crawlerStatus(status.toString()) - .crawlerStatusDesc(why) - .headers("") - .contentType(contentType) - .timestamp(LocalDateTime.now().toString()) - .httpStatus(statusCode) - .url(url.toString()) - .build(); - } - - public static CrawledDocument createRedirectResponse(EdgeUrl url, HttpFetchResult.ResultOk rsp, EdgeUrl responseUrl) { - - return CrawledDocument.builder() - .crawlerStatus(CrawlerDocumentStatus.REDIRECT.name()) - .redirectUrl(responseUrl.toString()) - .headers(rsp.headers().toString()) - .contentType(Objects.requireNonNullElse(rsp.headers().get("Content-Type"), "")) - 
.timestamp(LocalDateTime.now().toString()) - .httpStatus(rsp.statusCode()) - .url(url.toString()) - .build(); - } - - public static CrawledDocument createRobotsError(EdgeUrl url) { - return CrawledDocument.builder() - .url(url.toString()) - .timestamp(LocalDateTime.now().toString()) - .httpStatus(-1) - .crawlerStatus(CrawlerDocumentStatus.ROBOTS_TXT.name()) - .build(); - } - public static CrawledDocument createRetryError(EdgeUrl url) { - return CrawledDocument.builder() - .url(url.toString()) - .timestamp(LocalDateTime.now().toString()) - .httpStatus(429) - .crawlerStatus(CrawlerDocumentStatus.ERROR.name()) - .build(); - } -} diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index 457c524c..7b563e42 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -3,9 +3,11 @@ package nu.marginalia.crawl.retreival; import crawlercommons.robots.SimpleRobotRules; import nu.marginalia.atags.model.DomainLinks; import nu.marginalia.contenttype.ContentType; -import nu.marginalia.crawl.retreival.fetcher.ContentTags; -import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; -import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; +import nu.marginalia.crawl.fetcher.ContentTags; +import nu.marginalia.crawl.fetcher.HttpFetcher; +import nu.marginalia.crawl.fetcher.HttpFetcherImpl; +import nu.marginalia.crawl.fetcher.warc.WarcRecorder; +import nu.marginalia.crawl.logic.LinkFilterSelector; import nu.marginalia.crawl.retreival.revisit.CrawlerRevisitor; import nu.marginalia.crawl.retreival.revisit.DocumentWithReference; import nu.marginalia.crawl.retreival.sitemap.SitemapFetcher; @@ -14,8 +16,6 @@ import nu.marginalia.link_parser.LinkParser; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.body.HttpFetchResult; -import nu.marginalia.model.crawldata.CrawledDomain; -import nu.marginalia.model.crawldata.CrawlerDomainStatus; import nu.marginalia.model.crawlspec.CrawlSpecRecord; import org.jsoup.Jsoup; import org.slf4j.Logger; @@ -25,7 +25,6 @@ import java.io.IOException; import java.net.InetAddress; import java.net.UnknownHostException; import java.nio.file.Path; -import java.util.ArrayList; import java.util.List; import java.util.Objects; import java.util.Optional; @@ -84,11 +83,11 @@ public class CrawlerRetreiver implements AutoCloseable { return crawlFrontier; } - public int fetch() { - return fetch(new DomainLinks(), new CrawlDataReference()); + public int crawlDomain() { + return crawlDomain(new DomainLinks(), new CrawlDataReference()); } - public int fetch(DomainLinks domainLinks, CrawlDataReference oldCrawlData) { + public int crawlDomain(DomainLinks domainLinks, CrawlDataReference oldCrawlData) { try { return crawlDomain(oldCrawlData, domainLinks); } @@ -98,28 +97,11 @@ public class CrawlerRetreiver implements AutoCloseable { } } - public void syncAbortedRun(Path warcFile) { - var resync = new CrawlerWarcResynchronizer(crawlFrontier, warcRecorder); - - resync.run(warcFile); - } - - private DomainProber.ProbeResult probeRootUrl(String ip) throws IOException { - // Construct an URL to the root of the domain, we don't know the schema yet so we'll - // start with http and then try https if that fails - var httpUrl = new EdgeUrl("http", new EdgeDomain(domain), null, "/", 
null); - final DomainProber.ProbeResult probeResult = domainProber.probeDomain(fetcher, domain, httpUrl); - - warcRecorder.writeWarcinfoHeader(ip, new EdgeDomain(domain), probeResult); - - return probeResult; - } - private int crawlDomain(CrawlDataReference oldCrawlData, DomainLinks domainLinks) throws IOException, InterruptedException { String ip = findIp(domain); EdgeUrl rootUrl; - if (probeRootUrl(ip) instanceof DomainProber.ProbeResultOk ok) rootUrl = ok.probedUrl(); + if (probeRootUrl(ip) instanceof HttpFetcherImpl.ProbeResultOk ok) rootUrl = ok.probedUrl(); else return 1; // Sleep after the initial probe, we don't have access to the robots.txt yet @@ -130,12 +112,13 @@ public class CrawlerRetreiver implements AutoCloseable { final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay()); delayTimer.waitFetchDelay(0); // initial delay after robots.txt + sniffRootDocument(rootUrl, delayTimer); // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified - int recrawled = crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer); + int fetchedCount = crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer); - if (recrawled > 0) { + if (fetchedCount > 0) { // If we have reference data, we will always grow the crawl depth a bit crawlFrontier.increaseDepth(1.5, 2500); } @@ -146,15 +129,6 @@ public class CrawlerRetreiver implements AutoCloseable { // Add links from the sitemap to the crawl frontier sitemapFetcher.downloadSitemaps(robotsRules, rootUrl); - CrawledDomain ret = new CrawledDomain(domain, - null, - CrawlerDomainStatus.OK.name(), - null, - ip, - new ArrayList<>(), - null); - - int fetchedCount = recrawled; while (!crawlFrontier.isEmpty() && !crawlFrontier.isCrawlDepthReached() @@ -186,7 +160,6 @@ public class CrawlerRetreiver implements AutoCloseable { if (!crawlFrontier.addVisited(top)) continue; - try { if (fetchContentWithReference(top, delayTimer, DocumentWithReference.empty()).isOk()) { fetchedCount++; @@ -198,15 +171,28 @@ public class CrawlerRetreiver implements AutoCloseable { } } - ret.cookies = fetcher.getCookies(); - return fetchedCount; } + public void syncAbortedRun(Path warcFile) { + var resync = new CrawlerWarcResynchronizer(crawlFrontier, warcRecorder); + + resync.run(warcFile); + } + + private HttpFetcherImpl.ProbeResult probeRootUrl(String ip) throws IOException { + // Construct an URL to the root of the domain, we don't know the schema yet so we'll + // start with http and then try https if that fails + var httpUrl = new EdgeUrl("http", new EdgeDomain(domain), null, "/", null); + final HttpFetcherImpl.ProbeResult probeResult = domainProber.probeDomain(fetcher, domain, httpUrl); + + warcRecorder.writeWarcinfoHeader(ip, new EdgeDomain(domain), probeResult); + + return probeResult; + } + private void sniffRootDocument(EdgeUrl rootUrl, CrawlDelayTimer timer) { try { - logger.debug("Configuring link filter"); - var url = rootUrl.withPathAndParam("/", null); HttpFetchResult result = fetchWithRetry(url, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty()); @@ -291,7 +277,7 @@ public class CrawlerRetreiver implements AutoCloseable { else if (fetchedDoc instanceof HttpFetchResult.Result304Raw && reference.doc() != null) { var doc = reference.doc(); - warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBody, contentTags); + warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBody, doc.headers, contentTags); fetchedDoc = new 
HttpFetchResult.Result304ReplacedWithReference(doc.url, new ContentType(doc.contentType, "UTF-8"), @@ -326,7 +312,7 @@ public class CrawlerRetreiver implements AutoCloseable { try { return fetcher.fetchContent(url, warcRecorder, contentTags, probeType); } - catch (RateLimitException ex) { + catch (HttpFetcherImpl.RateLimitException ex) { timer.waitRetryDelay(ex); } catch (Exception ex) { diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java index 1468d6ed..1b8e178d 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java @@ -1,6 +1,6 @@ package nu.marginalia.crawl.retreival; -import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; +import nu.marginalia.crawl.fetcher.warc.WarcRecorder; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.body.DocumentBodyExtractor; import nu.marginalia.model.body.HttpFetchResult; @@ -38,7 +38,7 @@ public class CrawlerWarcResynchronizer { accept(item); } } catch (Exception e) { - logger.info(STR."(Expected) Failed read full warc file \{tempFile}: \{e.getClass().getSimpleName()} \{e.getMessage()}"); + logger.info("(Expected) Failed read full warc file " + tempFile + ": " + e.getClass().getSimpleName() + " " + e.getMessage()); } // Second pass, copy records to the new warc file @@ -47,7 +47,7 @@ public class CrawlerWarcResynchronizer { recorder.resync(item); } } catch (Exception e) { - logger.info(STR."(Expected) Failed read full warc file \{tempFile}: \{e.getClass().getSimpleName()} \{e.getMessage()}"); + logger.info("(Expected) Failed read full warc file " + tempFile + ": " + e.getClass().getSimpleName() + " " + e.getMessage()); } } @@ -63,7 +63,7 @@ public class CrawlerWarcResynchronizer { } catch (Exception ex) { - logger.info(STR."Failed to process warc record \{item}", ex); + logger.info("Failed to process warc record " + item, ex); } } @@ -78,7 +78,8 @@ public class CrawlerWarcResynchronizer { } private void request(WarcRequest request) { - EdgeUrl.parse(request.target()).ifPresent(crawlFrontier::addVisited); + var url = new EdgeUrl(request.targetURI()); + crawlFrontier.addVisited(url); } private void response(WarcResponse rsp) { @@ -97,7 +98,7 @@ public class CrawlerWarcResynchronizer { }); } catch (Exception e) { - logger.info(STR."Failed to parse response body for \{url}", e); + logger.info("Failed to parse response body for " + url, e); } } diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java index 88fea00a..6b364313 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java @@ -9,9 +9,14 @@ import nu.marginalia.model.EdgeUrl; import org.jsoup.nodes.Document; import java.net.URISyntaxException; -import java.util.*; +import java.util.ArrayDeque; +import java.util.Collection; +import java.util.Objects; import java.util.function.Predicate; +/** Encapsulates the crawl frontier for a single domain, + * that is information about known and visited URLs + */ public class DomainCrawlFrontier { private static final LinkParser linkParser = new 
LinkParser(); diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/DomainProber.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/DomainProber.java index 3ec9b8da..503bc7cf 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/DomainProber.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/DomainProber.java @@ -2,8 +2,8 @@ package nu.marginalia.crawl.retreival; import com.google.inject.Inject; import com.google.inject.Singleton; -import nu.marginalia.crawl.retreival.fetcher.FetchResultState; -import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; +import nu.marginalia.crawl.fetcher.HttpFetcher; +import nu.marginalia.crawl.fetcher.HttpFetcherImpl; import nu.marginalia.ip_blocklist.IpBlockList; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; @@ -34,43 +34,18 @@ public class DomainProber { * doesn't immediately redirect to another domain (which should be crawled separately, not under the name * of this domain). */ - public ProbeResult probeDomain(HttpFetcher fetcher, String domain, @Nullable EdgeUrl firstUrlInQueue) { + public HttpFetcherImpl.ProbeResult probeDomain(HttpFetcher fetcher, String domain, @Nullable EdgeUrl firstUrlInQueue) { if (firstUrlInQueue == null) { logger.warn("No valid URLs for domain {}", domain); - return new ProbeResultError(CrawlerDomainStatus.ERROR, "No known URLs"); + return new HttpFetcherImpl.ProbeResultError(CrawlerDomainStatus.ERROR, "No known URLs"); } if (!domainBlacklist.test(firstUrlInQueue.domain)) - return new ProbeResultError(CrawlerDomainStatus.BLOCKED, "IP not allowed"); + return new HttpFetcherImpl.ProbeResultError(CrawlerDomainStatus.BLOCKED, "IP not allowed"); - var fetchResult = fetcher.probeDomain(firstUrlInQueue.withPathAndParam("/", null)); - - if (fetchResult.ok()) - return new ProbeResultOk(fetchResult.url); - - if (fetchResult.state == FetchResultState.REDIRECT) - return new ProbeResultRedirect(fetchResult.domain); - - return new ProbeResultError(CrawlerDomainStatus.ERROR, "Bad status"); + return fetcher.probeDomain(firstUrlInQueue.withPathAndParam("/", null)); } - public sealed interface ProbeResult permits ProbeResultError, ProbeResultRedirect, ProbeResultOk {} - - /** The probing failed for one reason or another - * @param status Machine readable status - * @param desc Human-readable description of the error - */ - public record ProbeResultError(CrawlerDomainStatus status, String desc) implements ProbeResult {} - - /** This domain redirects to another domain */ - public record ProbeResultRedirect(EdgeDomain domain) implements ProbeResult {} - - /** If the retrieval of the probed url was successful, return the url as it was fetched - * (which may be different from the url we probed, if we attempted another URL schema). 
- * - * @param probedUrl The url we successfully probed - */ - public record ProbeResultOk(EdgeUrl probedUrl) implements ProbeResult {} } diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/RateLimitException.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/RateLimitException.java deleted file mode 100644 index 5e0c57dd..00000000 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/RateLimitException.java +++ /dev/null @@ -1,21 +0,0 @@ -package nu.marginalia.crawl.retreival; - -public class RateLimitException extends Exception { - private final String retryAfter; - - public RateLimitException(String retryAfter) { - this.retryAfter = retryAfter; - } - - @Override - public StackTraceElement[] getStackTrace() { return new StackTraceElement[0]; } - - public int retryAfter() { - try { - return Integer.parseInt(retryAfter); - } - catch (NumberFormatException ex) { - return 1000; - } - } -} diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/FetchResult.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/FetchResult.java deleted file mode 100644 index ee4c734f..00000000 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/FetchResult.java +++ /dev/null @@ -1,24 +0,0 @@ -package nu.marginalia.crawl.retreival.fetcher; - -import lombok.AllArgsConstructor; -import lombok.ToString; -import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.EdgeUrl; - -@AllArgsConstructor -@ToString -public class FetchResult { - public final FetchResultState state; - public final EdgeUrl url; - public final EdgeDomain domain; - - public FetchResult(FetchResultState state, EdgeUrl url) { - this.state = state; - this.url = url; - this.domain = url.domain; - } - - public boolean ok() { - return state == FetchResultState.OK; - } -} diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/FetchResultState.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/FetchResultState.java deleted file mode 100644 index 846e2b4f..00000000 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/FetchResultState.java +++ /dev/null @@ -1,7 +0,0 @@ -package nu.marginalia.crawl.retreival.fetcher; - -public enum FetchResultState { - OK, - REDIRECT, - ERROR -} diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java index f5bb863e..7f3c6c22 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java @@ -2,12 +2,12 @@ package nu.marginalia.crawl.retreival.revisit; import com.google.common.base.Strings; import crawlercommons.robots.SimpleRobotRules; +import nu.marginalia.crawl.fetcher.ContentTags; +import nu.marginalia.crawl.fetcher.warc.WarcRecorder; import nu.marginalia.crawl.retreival.CrawlDataReference; import nu.marginalia.crawl.retreival.CrawlDelayTimer; import nu.marginalia.crawl.retreival.CrawlerRetreiver; import nu.marginalia.crawl.retreival.DomainCrawlFrontier; -import nu.marginalia.crawl.retreival.fetcher.ContentTags; -import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.body.HttpFetchResult; import 
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java
index f5bb863e..7f3c6c22 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java
@@ -2,12 +2,12 @@ package nu.marginalia.crawl.retreival.revisit;
 
 import com.google.common.base.Strings;
 import crawlercommons.robots.SimpleRobotRules;
+import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.retreival.CrawlDataReference;
 import nu.marginalia.crawl.retreival.CrawlDelayTimer;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
 import nu.marginalia.crawl.retreival.DomainCrawlFrontier;
-import nu.marginalia.crawl.retreival.fetcher.ContentTags;
-import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.HttpFetchResult;
 import nu.marginalia.model.crawldata.CrawledDocument;
@@ -125,6 +125,7 @@ public class CrawlerRevisitor {
                     doc.contentType,
                     doc.httpStatus,
                     doc.documentBody,
+                    doc.headers,
                     new ContentTags(doc.etagMaybe, doc.lastModifiedMaybe)
             );
 
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java
index b5589401..1f1854b5 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java
@@ -1,7 +1,7 @@
 package nu.marginalia.crawl.retreival.revisit;
 
+import nu.marginalia.crawl.fetcher.ContentTags;
 import nu.marginalia.crawl.retreival.CrawlDataReference;
-import nu.marginalia.crawl.retreival.fetcher.ContentTags;
 import nu.marginalia.model.body.DocumentBodyExtractor;
 import nu.marginalia.model.body.DocumentBodyResult;
 import nu.marginalia.model.body.HttpFetchResult;
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/sitemap/SitemapFetcher.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/sitemap/SitemapFetcher.java
index 3ce33d64..2cf0ab33 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/sitemap/SitemapFetcher.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/sitemap/SitemapFetcher.java
@@ -1,8 +1,8 @@
 package nu.marginalia.crawl.retreival.sitemap;
 
 import crawlercommons.robots.SimpleRobotRules;
+import nu.marginalia.crawl.fetcher.SitemapRetriever;
 import nu.marginalia.crawl.retreival.DomainCrawlFrontier;
-import nu.marginalia.crawl.retreival.fetcher.SitemapRetriever;
 import nu.marginalia.model.EdgeUrl;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
diff --git a/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/ParquetSerializableCrawlDataStream.java b/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/ParquetSerializableCrawlDataStream.java
index 55c5ce8e..dba7a326 100644
--- a/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/ParquetSerializableCrawlDataStream.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/ParquetSerializableCrawlDataStream.java
@@ -144,7 +144,7 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
                 nextRecord.httpStatus,
                 status.toString(),
                 "",
-                "",
+                nextRecord.headers,
                 bodyString,
                 Long.toHexString(hash.hashNearlyASCII(bodyString)), // this field isn't actually used, maybe we can skip calculating it?
                 nextRecord.url,
diff --git a/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDocument.java b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDocument.java
index f43433b9..2ba90bbc 100644
--- a/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDocument.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDocument.java
@@ -23,7 +23,6 @@ public class CrawledDocument implements SerializableCrawlData {
     public String crawlerStatusDesc;
 
     @Nullable
-    @Deprecated // use getETag() or getLastModified() instead
     public String headers;
 
     public String documentBody;
diff --git a/code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecord.java b/code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecord.java
index e4ce7ad9..16e2f6a0 100644
--- a/code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecord.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecord.java
@@ -29,7 +29,11 @@ public class CrawledDocumentParquetRecord {
     public String contentType;
     public byte[] body;
 
+    public String headers;
+
+    @Deprecated // will be replaced with the full headers field in the future
     public String etagHeader;
+    @Deprecated // will be replaced with the full headers field in the future
     public String lastModifiedHeader;
 
     public static Hydrator newHydrator() {
@@ -51,7 +55,8 @@
             Types.required(BINARY).as(stringType()).named("contentType"),
             Types.required(BINARY).named("body"),
             Types.optional(BINARY).as(stringType()).named("etagHeader"),
-            Types.optional(BINARY).as(stringType()).named("lastModifiedHeader")
+            Types.optional(BINARY).as(stringType()).named("lastModifiedHeader"),
+            Types.optional(BINARY).as(stringType()).named("headers")
     );
@@ -67,6 +72,7 @@
             case "epochSeconds" -> timestamp = Instant.ofEpochSecond((Long) value);
             case "etagHeader" -> etagHeader = (String) value;
             case "lastModifiedHeader" -> lastModifiedHeader = (String) value;
+            case "headers" -> headers = (String) value;
             default -> throw new UnsupportedOperationException("Unknown heading '" + heading + '"');
         }
@@ -82,6 +88,9 @@
         valueWriter.write("cookies", cookies);
         valueWriter.write("contentType", contentType);
         valueWriter.write("body", body);
+        if (headers != null) {
+            valueWriter.write("headers", headers);
+        }
         if (etagHeader != null) {
             valueWriter.write("etagHeader", etagHeader);
         }
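Note: the new column is declared Types.optional and the dehydrator above only writes it when non-null, so parquet files written before this change remain readable; their rows simply hydrate with headers == null. A small illustrative sketch of that compatibility contract, using a plain map as a stand-in for a hydrated row (this is not the parquet-floor API itself):

    import java.util.HashMap;
    import java.util.Map;

    class OptionalColumnDemo {
        public static void main(String[] args) {
            // A "row" written by a pre-patch crawler: no headers column at all.
            Map<String, Object> prePatchRow = new HashMap<>();
            prePatchRow.put("etagHeader", "\"abc123\"");

            // Absent optional columns never invoke the hydrator's setter,
            // so the field keeps its default null.
            String headers = (String) prePatchRow.get("headers"); // null for old data
            if (headers == null) {
                // consumers fall back to the deprecated per-header fields
                System.out.println("etag only: " + prePatchRow.get("etagHeader"));
            }
        }
    }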
diff --git a/code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecordFileWriter.java b/code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecordFileWriter.java
index 36a58673..7055039c 100644
--- a/code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecordFileWriter.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecordFileWriter.java
@@ -16,6 +16,7 @@ import java.nio.file.Path;
 import java.time.Instant;
 import java.util.List;
 import java.util.Objects;
+import java.util.StringJoiner;
 
 public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
     private final ParquetWriter<CrawledDocumentParquetRecord> writer;
@@ -150,6 +151,14 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
             contentType = "";
         }
 
+        String headersStr = null;
+        StringJoiner headersStrBuilder = new StringJoiner("\n");
+        for (var header : headers) {
+            headersStrBuilder.add(header.getFirst() + ": " + header.getSecond());
+        }
+        headersStr = headersStrBuilder.toString();
+
+
         write(new CrawledDocumentParquetRecord(
                 domain,
                 response.target(),
@@ -159,6 +168,7 @@
                 response.date(),
                 contentType,
                 bodyBytes,
+                headersStr,
                 headers.get("ETag"),
                 headers.get("Last-Modified"))
         );
@@ -179,6 +189,7 @@
                 "x-marginalia/advisory;state=redirect",
                 new byte[0],
                 null,
+                null,
                 null
         );
     }
@@ -192,6 +203,7 @@
                 "x-marginalia/advisory;state=error",
                 errorStatus.getBytes(),
                 null,
+                null,
                 null
         );
     }
@@ -206,6 +218,7 @@
                 errorStatus,
                 new byte[0],
                 null,
+                null,
                 null
         );
     }
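Note: the writer above flattens the header list into "Name: value" lines joined with '\n'. A hedged sketch of the inverse operation; parseHeaders() is a hypothetical helper, not part of this patch, and like the StringJoiner flattening it assumes header values contain no embedded newlines:

    import java.util.LinkedHashMap;
    import java.util.Map;

    class HeaderFlattenDemo {
        static Map<String, String> parseHeaders(String flattened) {
            Map<String, String> headers = new LinkedHashMap<>();
            for (String line : flattened.split("\n")) {
                int sep = line.indexOf(": ");
                if (sep > 0) {
                    headers.put(line.substring(0, sep), line.substring(sep + 2));
                }
            }
            return headers;
        }

        public static void main(String[] args) {
            String flattened = "ETag: \"abc123\"\nLast-Modified: Fri, 31 Dec 1999 23:59:59 GMT";
            System.out.println(parseHeaders(flattened).get("ETag")); // prints "abc123"
        }
    }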
diff --git a/code/processes/crawling-process/test/nu/marginalia/crawl/logic/ContentTypeProberTest.java b/code/processes/crawling-process/test/nu/marginalia/crawl/logic/ContentTypeProberTest.java
new file mode 100644
index 00000000..40a92e87
--- /dev/null
+++ b/code/processes/crawling-process/test/nu/marginalia/crawl/logic/ContentTypeProberTest.java
@@ -0,0 +1,124 @@
+package nu.marginalia.crawl.logic;
+
+import com.sun.net.httpserver.HttpServer;
+import nu.marginalia.model.EdgeUrl;
+import okhttp3.OkHttpClient;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import java.io.IOException;
+import java.net.InetSocketAddress;
+import java.util.Random;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertInstanceOf;
+
+class ContentTypeProberTest {
+
+    private static int port;
+    private static HttpServer server;
+    private static OkHttpClient client;
+
+    static EdgeUrl htmlEndpoint;
+    static EdgeUrl htmlRedirEndpoint;
+    static EdgeUrl binaryEndpoint;
+    static EdgeUrl timeoutEndpoint;
+
+    @BeforeEach
+    void setUp() throws IOException {
+        Random r = new Random();
+        port = r.nextInt(10000) + 8000;
+        server = HttpServer.create(new InetSocketAddress("127.0.0.1", port), 10);
+
+        server.createContext("/html", exchange -> {
+            exchange.getResponseHeaders().add("Content-Type", "text/html");
+            exchange.sendResponseHeaders(200, -1);
+            exchange.close();
+        });
+        server.createContext("/redir", exchange -> {
+            exchange.getResponseHeaders().add("Location", "/html");
+            exchange.sendResponseHeaders(301, -1);
+            exchange.close();
+        });
+
+        server.createContext("/bin", exchange -> {
+            exchange.getResponseHeaders().add("Content-Type", "application/binary");
+            exchange.sendResponseHeaders(200, -1);
+            exchange.close();
+        });
+
+        server.createContext("/timeout", exchange -> {
+            try {
+                Thread.sleep(2000);
+            } catch (InterruptedException e) {
+                throw new RuntimeException(e);
+            }
+
+            exchange.getResponseHeaders().add("Content-Type", "application/binary");
+            exchange.sendResponseHeaders(200, -1);
+            exchange.close();
+        });
+
+        server.start();
+
+        htmlEndpoint = EdgeUrl.parse("http://localhost:" + port + "/html").get();
+        binaryEndpoint = EdgeUrl.parse("http://localhost:" + port + "/bin").get();
+        timeoutEndpoint = EdgeUrl.parse("http://localhost:" + port + "/timeout").get();
+        htmlRedirEndpoint = EdgeUrl.parse("http://localhost:" + port + "/redir").get();
+
+        client = new OkHttpClient.Builder()
+                .readTimeout(1, java.util.concurrent.TimeUnit.SECONDS)
+                .connectTimeout(1, java.util.concurrent.TimeUnit.SECONDS)
+                .callTimeout(1, java.util.concurrent.TimeUnit.SECONDS)
+                .writeTimeout(1, java.util.concurrent.TimeUnit.SECONDS)
+                .build();
+    }
+
+    @AfterEach
+    void tearDown() {
+        server.stop(0);
+        client.dispatcher().executorService().shutdown();
+        client.connectionPool().evictAll();
+    }
+
+    @Test
+    void probeContentTypeOk() {
+        ContentTypeProber.ContentTypeProbeResult result = new ContentTypeProber("test", client)
+                .probeContentType(htmlEndpoint);
+
+        System.out.println(result);
+
+        assertEquals(result, new ContentTypeProber.ContentTypeProbeResult.Ok(htmlEndpoint));
+    }
+
+    @Test
+    void probeContentTypeRedir() {
+        ContentTypeProber.ContentTypeProbeResult result = new ContentTypeProber("test", client)
+                .probeContentType(htmlRedirEndpoint);
+
+        System.out.println(result);
+
+        assertEquals(result, new ContentTypeProber.ContentTypeProbeResult.Ok(htmlEndpoint));
+    }
+
+    @Test
+    void probeContentTypeBad() {
+        ContentTypeProber.ContentTypeProbeResult result = new ContentTypeProber("test", client)
+                .probeContentType(binaryEndpoint);
+
+        System.out.println(result);
+
+        assertInstanceOf(ContentTypeProber.ContentTypeProbeResult.BadContentType.class, result);
+    }
+
+    @Test
+    void probeContentTypeTimeout() {
+        ContentTypeProber.ContentTypeProbeResult result = new ContentTypeProber("test", client)
+                .probeContentType(timeoutEndpoint);
+
+        System.out.println(result);
+
+        assertInstanceOf(ContentTypeProber.ContentTypeProbeResult.Timeout.class, result);
+    }
+}
\ No newline at end of file
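Note: the new test picks a random port in [8000, 18000), which can collide with a busy port and fail spuriously. A sketch of an alternative using an OS-assigned ephemeral port; this is a suggestion, not what the patch does:

    import com.sun.net.httpserver.HttpServer;
    import java.io.IOException;
    import java.net.InetSocketAddress;

    class EphemeralPortDemo {
        public static void main(String[] args) throws IOException {
            // Binding to port 0 lets the OS pick a free port; read it back
            // from the bound address instead of guessing.
            HttpServer server = HttpServer.create(new InetSocketAddress("127.0.0.1", 0), 10);
            int port = server.getAddress().getPort();
            server.start();
            System.out.println("test server on http://127.0.0.1:" + port);
            server.stop(0);
        }
    }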
diff --git a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizerTest.java b/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizerTest.java
index e3720941..feda711b 100644
--- a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizerTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizerTest.java
@@ -1,7 +1,7 @@
 package nu.marginalia.crawl.retreival;
 
-import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
-import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
+import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
+import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import okhttp3.OkHttpClient;
@@ -21,7 +21,8 @@ import java.security.NoSuchAlgorithmException;
 import java.util.List;
 import java.util.zip.GZIPInputStream;
 
-import static org.junit.jupiter.api.Assertions.*;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.junit.jupiter.api.Assertions.fail;
 
 class CrawlerWarcResynchronizerTest {
     Path fileName;
diff --git a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/ContentTypeProberTest.java b/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/ContentTypeProberTest.java
index 4a015fb9..92d05b12 100644
--- a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/ContentTypeProberTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/ContentTypeProberTest.java
@@ -1,7 +1,8 @@
 package nu.marginalia.crawl.retreival.fetcher;
 
-import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeResult.BadContentType;
-import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeResult.Ok;
+import nu.marginalia.crawl.logic.ContentTypeProber;
+import nu.marginalia.crawl.logic.ContentTypeProber.ContentTypeProbeResult.BadContentType;
+import nu.marginalia.crawl.logic.ContentTypeProber.ContentTypeProbeResult.Ok;
 import nu.marginalia.model.EdgeUrl;
 import okhttp3.ConnectionPool;
 import okhttp3.Dispatcher;
@@ -13,7 +14,7 @@ import java.net.URISyntaxException;
 import java.util.concurrent.Executors;
 import java.util.concurrent.TimeUnit;
 
-import static org.junit.jupiter.api.Assertions.*;
+import static org.junit.jupiter.api.Assertions.assertEquals;
 
 class ContentTypeProberTest {
diff --git a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java b/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java
index a9df80ac..d6d407bf 100644
--- a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java
@@ -1,8 +1,9 @@
 package nu.marginalia.crawl.retreival.fetcher;
 
 import nu.marginalia.UserAgent;
-import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
-import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
+import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
+import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
 import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
@@ -80,6 +81,7 @@ class WarcRecorderTest {
                 "text/html",
                 200,
                 "test",
+                null,
                 ContentTags.empty());
     }
 
@@ -103,7 +105,7 @@ class WarcRecorderTest {
                     "text/html",
                     200,
                     null,
-                    ContentTags.empty());
+                    null, ContentTags.empty());
         }
     }
 
@@ -115,7 +117,7 @@ class WarcRecorderTest {
                     "text/html",
                     200,
                     "test",
-                    ContentTags.empty());
+                    null, ContentTags.empty());
         }
 
         try (var reader = new WarcReader(fileNameWarc)) {
diff --git a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/revisit/DocumentWithReferenceTest.java b/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/revisit/DocumentWithReferenceTest.java
index 9d46ec75..0ed6117f 100644
--- a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/revisit/DocumentWithReferenceTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/revisit/DocumentWithReferenceTest.java
@@ -1,7 +1,7 @@
 package nu.marginalia.crawl.retreival.revisit;
 
+import nu.marginalia.crawl.fetcher.ContentTags;
 import nu.marginalia.crawl.retreival.CrawlDataReference;
-import nu.marginalia.crawl.retreival.fetcher.ContentTags;
 import nu.marginalia.model.crawldata.CrawledDocument;
 import org.junit.jupiter.api.Test;
diff --git a/code/processes/crawling-process/test/nu/marginalia/crawling/HttpFetcherTest.java b/code/processes/crawling-process/test/nu/marginalia/crawling/HttpFetcherTest.java
index 611cc8c2..1f7fe338 100644
--- a/code/processes/crawling-process/test/nu/marginalia/crawling/HttpFetcherTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawling/HttpFetcherTest.java
@@ -1,11 +1,10 @@
 package nu.marginalia.crawling;
 
 import lombok.SneakyThrows;
-import nu.marginalia.crawl.retreival.RateLimitException;
-import nu.marginalia.crawl.retreival.fetcher.ContentTags;
-import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
-import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
-import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
+import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.HttpFetcher;
+import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
+import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.ContentTypeLogic;
 import nu.marginalia.model.body.DocumentBodyExtractor;
@@ -33,7 +32,7 @@ class HttpFetcherTest {
     }
 
     @Test
-    void fetchUTF8() throws URISyntaxException, RateLimitException, IOException {
+    void fetchUTF8() throws URISyntaxException, HttpFetcherImpl.RateLimitException, IOException {
         var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
         try (var recorder = new WarcRecorder()) {
             var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, ContentTags.empty(), HttpFetcher.ProbeType.FULL);
@@ -44,7 +43,7 @@ class HttpFetcherTest {
     }
 
     @Test
-    void fetchText() throws URISyntaxException, RateLimitException, IOException {
+    void fetchText() throws URISyntaxException, HttpFetcherImpl.RateLimitException, IOException {
         var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
 
         try (var recorder = new WarcRecorder()) {
diff --git a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java
index 45986bbc..2d0a24eb 100644
--- a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java
@@ -2,10 +2,13 @@ package nu.marginalia.crawling.retreival;
 
 import crawlercommons.robots.SimpleRobotRules;
 import lombok.SneakyThrows;
+import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.HttpFetcher;
+import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
+import nu.marginalia.crawl.fetcher.SitemapRetriever;
+import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
 import nu.marginalia.crawl.retreival.DomainProber;
-import nu.marginalia.crawl.retreival.fetcher.*;
-import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.HttpFetchResult;
@@ -68,7 +71,7 @@ public class CrawlerMockFetcherTest {
     void crawl(CrawlSpecRecord spec) throws IOException {
         try (var recorder = new WarcRecorder()) {
             new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), spec, recorder)
-                    .fetch();
+                    .crawlDomain();
         }
     }
 
@@ -115,9 +118,9 @@ public class CrawlerMockFetcherTest {
         public void clearCookies() {}
 
         @Override
-        public FetchResult probeDomain(EdgeUrl url) {
+        public HttpFetcherImpl.ProbeResult probeDomain(EdgeUrl url) {
             logger.info("Probing {}", url);
-            return new FetchResult(FetchResultState.OK, url);
+            return new HttpFetcherImpl.ProbeResultOk(url);
         }
 
         @SneakyThrows
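Note: probeDomain() now returns the HttpFetcherImpl.ProbeResult hierarchy where it used to return FetchResult with a FetchResultState enum. A sketch of how a caller might branch on the result; only the ProbeResultOk variant is visible in this patch, so the else branch stands in for whichever other variants ProbeResult defines:

    import nu.marginalia.crawl.fetcher.HttpFetcher;
    import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
    import nu.marginalia.model.EdgeUrl;

    class ProbeResultDemo {
        static void probe(HttpFetcher fetcher, EdgeUrl url) {
            HttpFetcherImpl.ProbeResult result = fetcher.probeDomain(url);
            if (result instanceof HttpFetcherImpl.ProbeResultOk ok) {
                System.out.println("probed ok: " + ok.probedUrl());
            }
            else {
                System.out.println("probe unsuccessful: " + result);
            }
        }
    }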
diff --git a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
index 1b541b63..df0df41a 100644
--- a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
@@ -4,10 +4,10 @@ import lombok.SneakyThrows;
 import nu.marginalia.UserAgent;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.atags.model.DomainLinks;
+import nu.marginalia.crawl.fetcher.HttpFetcher;
+import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
+import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.retreival.*;
-import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
-import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
-import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
 import nu.marginalia.io.crawldata.CrawledDomainReader;
 import nu.marginalia.io.crawldata.SerializableCrawlDataStream;
 import nu.marginalia.model.EdgeDomain;
@@ -468,7 +468,7 @@ class CrawlerRetreiverTest {
     private void doCrawlWithReferenceStream(CrawlSpecRecord specs, SerializableCrawlDataStream stream) {
         try (var recorder = new WarcRecorder(tempFileWarc2)) {
-            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch(new DomainLinks(),
+            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).crawlDomain(new DomainLinks(),
                     new CrawlDataReference(stream));
         }
         catch (IOException ex) {
@@ -480,7 +480,7 @@ class CrawlerRetreiverTest {
     private DomainCrawlFrontier doCrawl(Path tempFileWarc1, CrawlSpecRecord specs) {
         try (var recorder = new WarcRecorder(tempFileWarc1)) {
             var crawler = new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder);
-            crawler.fetch();
+            crawler.crawlDomain();
             return crawler.getCrawlFrontier();
         } catch (IOException ex) {
             Assertions.fail(ex);
diff --git a/code/tools/integration-test/test/nu/marginalia/IntegrationTest.java b/code/tools/integration-test/test/nu/marginalia/IntegrationTest.java
index 7fbcdefc..ec039edd 100644
--- a/code/tools/integration-test/test/nu/marginalia/IntegrationTest.java
+++ b/code/tools/integration-test/test/nu/marginalia/IntegrationTest.java
@@ -8,9 +8,9 @@ import nu.marginalia.api.searchquery.RpcQueryLimits;
 import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
 import nu.marginalia.converting.processor.DomainProcessor;
 import nu.marginalia.converting.writer.ConverterBatchWriter;
-import nu.marginalia.crawl.retreival.DomainProber;
-import nu.marginalia.crawl.retreival.fetcher.ContentTags;
-import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
+import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
+import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.functions.searchquery.QueryFactory;
 import nu.marginalia.index.IndexGrpcService;
 import nu.marginalia.index.ReverseIndexFullFileNames;
@@ -120,7 +120,7 @@ public class IntegrationTest {
         /** CREATE WARC */
         try (WarcRecorder warcRecorder = new WarcRecorder(warcData)) {
             warcRecorder.writeWarcinfoHeader("127.0.0.1", new EdgeDomain("www.example.com"),
-                    new DomainProber.ProbeResultOk(new EdgeUrl("https://www.example.com/")));
+                    new HttpFetcherImpl.ProbeResultOk(new EdgeUrl("https://www.example.com/")));
 
             warcRecorder.writeReferenceCopy(new EdgeUrl("https://www.example.com/"),
                     "text/html", 200,
@@ -134,6 +134,7 @@ public class IntegrationTest {
                             """,
+                    "",
                     ContentTags.empty()
             );
         }
@@ -204,7 +205,7 @@ public class IntegrationTest {
                         .setFetchSize(1000)
                         .build())
                 .setQueryStrategy("AUTO")
-                .setHumanQuery("\"This is how thinking works\"")
+                .setHumanQuery("\"is that there is\"")
                 .build();
 
         var params = QueryProtobufCodec.convertRequest(request);
diff --git a/code/tools/screenshot-capture-tool/java/nu/marginalia/screenshot/ScreenshotCaptureToolMain.java b/code/tools/screenshot-capture-tool/java/nu/marginalia/screenshot/ScreenshotCaptureToolMain.java
index 8432b80e..169df4dc 100644
--- a/code/tools/screenshot-capture-tool/java/nu/marginalia/screenshot/ScreenshotCaptureToolMain.java
+++ b/code/tools/screenshot-capture-tool/java/nu/marginalia/screenshot/ScreenshotCaptureToolMain.java
@@ -96,7 +96,7 @@ public class ScreenshotCaptureToolMain {
     private static byte[] fetchDomain(HttpClient client, EdgeDomain domain) {
         try {
             Map<String, Object> requestData = Map.of(
-                    "url", domain.toRootUrl().toString(),
+                    "url", domain.toRootUrlHttp().toString(),
                     "options", Map.of("fullPage", false,
                             "type", "png"),