diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcSerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcSerializableCrawlDataStream.java
index a766a58d..02aefb6d 100644
--- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcSerializableCrawlDataStream.java
+++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcSerializableCrawlDataStream.java
@@ -28,6 +28,7 @@ public class WarcSerializableCrawlDataStream implements AutoCloseable, Serializa
         path = file;
         reader = new WarcReader(file);
         WarcXResponseReference.register(reader);
+        WarcXEntityRefused.register(reader);
 
         backingIterator = reader.iterator();
     }
diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java
index 60e0178e..5a993fda 100644
--- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java
+++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java
@@ -9,29 +9,34 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
+import java.net.URI;
 import java.nio.file.Path;
 
 public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
     private final ParquetWriter writer;
     private static final Logger logger = LoggerFactory.getLogger(CrawledDocumentParquetRecordFileWriter.class);
 
-    public static void convertWarc(String domain, Path warcInputFile, Path parquetOutputFile) throws IOException {
+    public static void convertWarc(String domain, Path warcInputFile, Path parquetOutputFile) {
         try (var warcReader = new WarcReader(warcInputFile);
              var parquetWriter = new CrawledDocumentParquetRecordFileWriter(parquetOutputFile)
         ) {
             WarcXResponseReference.register(warcReader);
+            WarcXEntityRefused.register(warcReader);
 
             for (var record : warcReader) {
                 if (record instanceof WarcResponse response) {
+                    // this also captures WarcXResponseReference, which inherits from WarcResponse
+                    // and is used to store old responses from previous crawls; in this part of the logic
+                    // we treat them the same as a normal response
+
                     parquetWriter.write(domain, response);
                 }
+                else if (record instanceof WarcXEntityRefused refused) {
+                    parquetWriter.write(domain, refused);
+                }
                 else if (record instanceof Warcinfo warcinfo) {
-                    parquetWriter.write(domain, warcinfo);
+                    parquetWriter.write(warcinfo);
                 }
-                else {
-                    logger.warn("Skipping record of type {}", record.type());
-                }
-
             }
         }
         catch (Exception ex) {
@@ -39,31 +44,40 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
         }
     }
 
-    private void write(String domain, Warcinfo warcinfo) throws IOException {
+    private void write(String domain, WarcXEntityRefused refused) throws IOException {
+        URI profile = refused.profile();
+
+        String meta;
+        if (profile.equals(WarcXEntityRefused.documentRobotsTxtSkippedURN)) {
+            meta = "x-marginalia/advisory;state=robots-txt-skipped";
+        }
+        else if (profile.equals(WarcXEntityRefused.documentBadContentTypeURN)) {
+            meta = "x-marginalia/advisory;state=content-type-failed-probe";
+        }
+        else if (profile.equals(WarcXEntityRefused.documentProbeTimeout)) {
+            meta = "x-marginalia/advisory;state=timeout-probe";
+        }
+        else if (profile.equals(WarcXEntityRefused.documentUnspecifiedError)) {
+            meta = "x-marginalia/advisory;state=doc-error";
+        }
+        else {
+            meta = "x-marginalia/advisory;state=unknown";
+        }
+
+        write(forDocError(domain, refused.target(), meta));
+    }
+
+    private void write(Warcinfo warcinfo) throws IOException {
         String selfDomain = warcinfo.fields().first("domain").orElse("");
         String ip = warcinfo.fields().first("ip").orElse("");
         String probeStatus = warcinfo.fields().first("X-WARC-Probe-Status").orElse("");
 
         if (probeStatus.startsWith("REDIRECT")) {
             String redirectDomain = probeStatus.substring("REDIRECT;".length());
-            write(new CrawledDocumentParquetRecord(selfDomain,
-                    STR."https://\{redirectDomain}/",
-                    ip,
-                    false,
-                    0,
-                    "x-marginalia/advisory;state=redirect",
-                    new byte[0]
-            ));
+            write(forDomainRedirect(selfDomain, redirectDomain));
         }
         else if (!"OK".equals(probeStatus)) {
-            write(new CrawledDocumentParquetRecord(selfDomain,
-                    STR."https://\{domain}/",
-                    ip,
-                    false,
-                    0,
-                    "x-marginalia/advisory;state=error",
-                    probeStatus.getBytes()
-            ));
+            write(forDomainError(selfDomain, ip, probeStatus));
         }
     }
 
@@ -83,6 +97,15 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
             return;
         }
 
+        // We don't want to store robots.txt files, as they are not
+        // interesting for the analysis we want to do. This is important
+        // since txt-files in general are interesting, and we don't want to
+        // exclude them as a class.
+
+        if (fetchOk.uri().getPath().equals("/robots.txt")) {
+            return;
+        }
+
         byte[] bodyBytes;
         String contentType;
 
@@ -112,4 +135,36 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
     public void close() throws IOException {
         writer.close();
     }
+
+    private CrawledDocumentParquetRecord forDomainRedirect(String domain, String redirectDomain) {
+        return new CrawledDocumentParquetRecord(domain,
+                STR."https://\{redirectDomain}/",
+                "",
+                false,
+                0,
+                "x-marginalia/advisory;state=redirect",
+                new byte[0]
+        );
+    }
+    private CrawledDocumentParquetRecord forDomainError(String domain, String ip, String errorStatus) {
+        return new CrawledDocumentParquetRecord(domain,
+                STR."https://\{domain}/",
+                ip,
+                false,
+                0,
+                "x-marginalia/advisory;state=error",
+                errorStatus.getBytes()
+        );
+    }
+
+    private CrawledDocumentParquetRecord forDocError(String domain, String url, String errorStatus) {
+        return new CrawledDocumentParquetRecord(domain,
+                url,
+                "",
+                false,
+                0,
+                "x-marginalia/advisory;state=error",
+                errorStatus.getBytes()
+        );
+    }
 }
diff --git a/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXEntityRefused.java b/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXEntityRefused.java
new file mode 100644
index 00000000..4480115e
--- /dev/null
+++ b/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXEntityRefused.java
@@ -0,0 +1,45 @@
+package org.netpreserve.jwarc;
+
+import java.io.IOException;
+import java.net.URI;
+
+/** This defines a non-standard extension to WARC for recording documents that the crawler
+ * refused to fetch or process, e.g. due to robots.txt, a failed content type probe, or a timeout
+ */
+public class WarcXEntityRefused extends WarcRevisit {
+    private static final String TYPE_NAME = "x-entity-refused";
+
+    public static final URI documentRobotsTxtSkippedURN = URI.create("urn:marginalia/meta/doc/robots-txt-skipped");
+    public static final URI documentBadContentTypeURN = URI.create("urn:marginalia/meta/doc/content-type-failed-probe");
+    public static final URI documentProbeTimeout = URI.create("urn:marginalia/meta/doc/timeout-probe");
+    public static final URI documentUnspecifiedError = URI.create("urn:marginalia/meta/doc/error");
+
+    WarcXEntityRefused(MessageVersion version, MessageHeaders headers, MessageBody body) {
+        super(version, headers, body);
+    }
+
+    public static void register(WarcReader reader) {
+        reader.registerType(TYPE_NAME, WarcXEntityRefused::new);
+    }
+
+    public static class Builder extends AbstractBuilder {
+        public Builder(URI targetURI, URI profile) {
+            this(targetURI.toString(), profile.toString());
+        }
+
+        public Builder(String targetURI, String profileURI) {
+            super(TYPE_NAME);
+            setHeader("WARC-Target-URI", targetURI);
+            setHeader("WARC-Profile", profileURI);
+        }
+
+        public Builder body(HttpResponse httpResponse) throws IOException {
+            return body(MediaType.HTTP_RESPONSE, httpResponse);
+        }
+
+        @Override
+        public WarcXEntityRefused build() {
+            return build(WarcXEntityRefused::new);
+        }
+    }
+}
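(Aside: a minimal, illustrative sketch of how a consumer reads these records back — it mirrors the register() calls and instanceof dispatch used in convertWarc() above; the class and method names in the sketch are hypothetical, not part of the patch.)

// Illustrative sketch only: reading back the custom record types with jwarc's WarcReader.
import org.netpreserve.jwarc.WarcReader;
import org.netpreserve.jwarc.WarcXEntityRefused;
import org.netpreserve.jwarc.WarcXResponseReference;

import java.io.IOException;
import java.nio.file.Path;

class RefusalReportExample { // hypothetical name
    static void printRefusals(Path warcFile) throws IOException {
        try (var reader = new WarcReader(warcFile)) {
            // register() teaches the reader how to materialize the custom
            // x-response-reference and x-entity-refused record types
            WarcXResponseReference.register(reader);
            WarcXEntityRefused.register(reader);

            for (var record : reader) {
                if (record instanceof WarcXEntityRefused refused) {
                    // WARC-Profile carries the refusal reason as one of the URNs
                    // defined on WarcXEntityRefused (robots-txt-skipped, etc.)
                    System.out.println(refused.target() + " -> " + refused.profile());
                }
            }
        }
    }
}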
diff --git a/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXResponseReference.java b/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXResponseReference.java
index 7e02d936..19a5a00f 100644
--- a/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXResponseReference.java
+++ b/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXResponseReference.java
@@ -4,9 +4,7 @@ import java.io.IOException;
 import java.net.URI;
 
 /** This defines a non-standard extension to WARC for storing old HTTP responses,
- * essentially a 'revisit' with a full body, which is not something that is
- * expected by the jwarc parser, and goes against the semantics of the revisit
- * records a fair bit.
+ * essentially a 'response' with different semantics.
  *
  * An x-response-reference record is a response record with a full body, where
 * the data is a reconstructed HTTP response from a previous crawl.
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java
index 9088ebb4..65e1529b 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java
@@ -5,6 +5,8 @@ import com.google.common.hash.Hashing;
 import nu.marginalia.crawling.io.SerializableCrawlDataStream;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.lsh.EasyLSH;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import javax.annotation.Nullable;
 import java.io.IOException;
@@ -15,6 +17,7 @@ import java.nio.file.Path;
 
 public class CrawlDataReference implements AutoCloseable {
     private final SerializableCrawlDataStream data;
+    private static final Logger logger = LoggerFactory.getLogger(CrawlDataReference.class);
 
     public CrawlDataReference(SerializableCrawlDataStream data) {
         this.data = data;
@@ -43,8 +46,9 @@ public class CrawlDataReference implements AutoCloseable {
             }
         }
         catch (IOException ex) {
-            ex.printStackTrace();
+            logger.error("Failed to read next document", ex);
         }
+
         return null;
     }
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java
index ca2494dc..e52b73b6 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java
@@ -20,8 +20,18 @@ public class CrawlDelayTimer {
         this.delayTime = delayTime;
     }
 
+    /** Call when we've gotten an HTTP 429 response. This will wait a moment, and then
+     * set a flag that slows down the main crawl delay as well. */
+    public void waitRetryDelay(RateLimitException ex) throws InterruptedException {
+        slowDown = true;
+
+        int delay = ex.retryAfter();
+
+        Thread.sleep(Math.clamp(delay, 100, 5000));
+    }
+
     @SneakyThrows
-    public void delay(long spentTime) {
+    public void waitFetchDelay(long spentTime) {
         long sleepTime = delayTime;
 
         if (sleepTime >= 1) {
@@ -30,10 +40,6 @@ public class CrawlDelayTimer {
 
             Thread.sleep(min(sleepTime - spentTime, 5000));
         }
-        else if (slowDown) {
-            // Additional delay when the server is signalling it wants slower requests
-            Thread.sleep( DEFAULT_CRAWL_DELAY_MIN_MS);
-        }
         else {
             // When no crawl delay is specified, lean toward twice the fetch+process time,
             // within sane limits. This means slower servers get slower crawling, and faster
@@ -48,10 +54,10 @@ public class CrawlDelayTimer {
 
             Thread.sleep(sleepTime - spentTime);
         }
-    }
 
-    /** Increase the delay between requests if the server is signalling it wants slower requests with HTTP 429 */
-    public void slowDown() {
-        slowDown = true;
+        if (slowDown) {
+            // Additional delay when the server is signalling it wants slower requests
+            Thread.sleep( DEFAULT_CRAWL_DELAY_MIN_MS);
+        }
     }
 }
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
index 668f597a..35f5bcd0 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
@@ -3,7 +3,6 @@ package nu.marginalia.crawl.retreival;
 import com.google.common.hash.HashFunction;
 import com.google.common.hash.Hashing;
 import crawlercommons.robots.SimpleRobotRules;
-import lombok.SneakyThrows;
 import nu.marginalia.atags.model.DomainLinks;
 import nu.marginalia.contenttype.ContentType;
 import nu.marginalia.crawl.retreival.fetcher.ContentTags;
@@ -19,6 +18,7 @@ import nu.marginalia.ip_blocklist.UrlBlocklist;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawlspec.CrawlSpecRecord;
+import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -32,6 +32,7 @@ import java.util.*;
 public class CrawlerRetreiver implements AutoCloseable {
 
     private static final int MAX_ERRORS = 20;
+    private static final int HTTP_429_RETRY_LIMIT = 1; // Retry 429s once
 
     private final HttpFetcher fetcher;
 
@@ -40,7 +41,6 @@ public class CrawlerRetreiver implements AutoCloseable {
     private static final LinkParser linkParser = new LinkParser();
     private static final Logger logger = LoggerFactory.getLogger(CrawlerRetreiver.class);
 
-    private static final HashFunction hashMethod = Hashing.murmur3_128(0);
     private static final UrlBlocklist urlBlocklist = new UrlBlocklist();
     private static final LinkFilterSelector linkFilterSelector = new LinkFilterSelector();
 
@@ -104,7 +104,7 @@ public class CrawlerRetreiver implements AutoCloseable {
             resync.run(warcFile);
     }
 
-    private int crawlDomain(CrawlDataReference oldCrawlData, DomainProber.ProbeResult probeResult, DomainLinks domainLinks) throws IOException {
+    private int crawlDomain(CrawlDataReference oldCrawlData, DomainProber.ProbeResult probeResult, DomainLinks domainLinks) throws IOException, InterruptedException {
         String ip = findIp(domain);
 
         EdgeUrl rootUrl;
 
@@ -124,7 +124,7 @@
         final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(crawlFrontier.peek().domain, warcRecorder);
         final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay());
 
-        sniffRootDocument(delayTimer, rootUrl);
+        sniffRootDocument(rootUrl);
 
         // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
         int recrawled = recrawl(oldCrawlData, robotsRules, delayTimer);
@@ -181,8 +181,14 @@
                 continue;
 
-            if (fetchWriteAndSleep(top, delayTimer, DocumentWithReference.empty()).isOk()) {
-                fetchedCount++;
+            try {
+                if (fetchWriteAndSleep(top, delayTimer, DocumentWithReference.empty()).isOk()) {
+                    fetchedCount++;
+                }
+            }
+            catch (InterruptedException ex) {
+                Thread.currentThread().interrupt();
+                break;
             }
         }
 
@@ -192,17 +198,17 @@
     }
 
     /** Using the old crawl data, fetch the documents comparing etags and last-modified */
-    private int recrawl(CrawlDataReference oldCrawlData, SimpleRobotRules robotsRules, CrawlDelayTimer delayTimer) {
+    private int recrawl(CrawlDataReference oldCrawlData, SimpleRobotRules robotsRules, CrawlDelayTimer delayTimer) throws InterruptedException {
         return crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer);
     }
 
-    private void sniffRootDocument(CrawlDelayTimer delayTimer, EdgeUrl rootUrl) {
+    private void sniffRootDocument(EdgeUrl rootUrl) {
         try {
             logger.debug("Configuring link filter");
 
             var url = rootUrl.withPathAndParam("/", null);
 
-            var result = tryDownload(url, delayTimer, ContentTags.empty());
+            var result = fetcher.fetchContent(url, warcRecorder, ContentTags.empty());
             if (!(result instanceof HttpFetchResult.ResultOk ok))
                 return;
 
@@ -239,22 +245,28 @@
     }
 
     public HttpFetchResult fetchWriteAndSleep(EdgeUrl top,
-                                               CrawlDelayTimer timer,
-                                               DocumentWithReference reference) {
+                                              CrawlDelayTimer timer,
+                                              DocumentWithReference reference) throws InterruptedException
+    {
         logger.debug("Fetching {}", top);
 
+        HttpFetchResult fetchedDoc = new HttpFetchResult.ResultNone();
+
         long startTime = System.currentTimeMillis();
-
         var contentTags = reference.getContentTags();
-        var fetchedDoc = tryDownload(top, timer, contentTags);
-
-        if (fetchedDoc instanceof HttpFetchResult.Result304Raw) {
-            var doc = reference.doc();
-            if (doc != null) {
-                warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBody);
-                fetchedDoc = new HttpFetchResult.Result304ReplacedWithReference(doc.url,
-                        new ContentType(doc.contentType, "UTF-8"),
-                        doc.documentBody);
+        // Fetch the document, retrying if we get a rate limit exception
+        for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
+            try {
+                fetchedDoc = fetcher.fetchContent(top, warcRecorder, contentTags);
+                break;
+            }
+            catch (RateLimitException ex) {
+                timer.waitRetryDelay(ex);
+            }
+            catch (Exception ex) {
+                logger.warn("Failed to fetch {}", top, ex);
+                fetchedDoc = new HttpFetchResult.ResultException(ex);
             }
         }
 
@@ -268,14 +280,19 @@
                 crawlFrontier.addVisited(new EdgeUrl(ok.uri()));
             }
         }
-        else if (fetchedDoc instanceof HttpFetchResult.Result304ReplacedWithReference retained) {
-            var docOpt = retained.parseDocument();
-            if (docOpt.isPresent()) {
-                var doc = docOpt.get();
+        else if (fetchedDoc instanceof HttpFetchResult.Result304Raw && reference.doc() != null) {
+            var doc = reference.doc();
 
-                crawlFrontier.enqueueLinksFromDocument(top, doc);
-                EdgeUrl.parse(retained.url()).ifPresent(crawlFrontier::addVisited);
-            }
+            warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBody);
+
+            fetchedDoc = new HttpFetchResult.Result304ReplacedWithReference(doc.url,
+                    new ContentType(doc.contentType, "UTF-8"),
+                    doc.documentBody);
+
+            var parsed = Jsoup.parse(doc.documentBody);
+
+            crawlFrontier.enqueueLinksFromDocument(top, parsed);
+            crawlFrontier.addVisited(top);
         }
         else if (fetchedDoc instanceof HttpFetchResult.ResultException ex) {
             errorCount ++;
@@ -285,7 +302,7 @@
             logger.error("Error parsing document {}", top, ex);
         }
 
-        timer.delay(System.currentTimeMillis() - startTime);
+        timer.waitFetchDelay(System.currentTimeMillis() - startTime);
 
         return fetchedDoc;
     }
@@ -295,33 +312,6 @@
                 || proto.equalsIgnoreCase("https");
     }
 
-    @SneakyThrows
-    private HttpFetchResult tryDownload(EdgeUrl top, CrawlDelayTimer timer, ContentTags tags) {
-        for (int i = 0; i < 2; i++) {
-            try {
-                return fetcher.fetchContent(top, warcRecorder, tags);
-            }
-            catch (RateLimitException ex) {
-                timer.slowDown();
-
-                int delay = ex.retryAfter();
-                if (delay > 0 && delay < 5000) {
-                    Thread.sleep(delay);
-                }
-            }
-            catch (Exception ex) {
-                logger.warn("Failed to fetch {}", top, ex);
-                return new HttpFetchResult.ResultException(ex);
-            }
-        }
-
-        return new HttpFetchResult.ResultNone();
-    }
-
-    private String createHash(String documentBodyHash) {
-        return hashMethod.hashUnencodedChars(documentBodyHash).toString();
-    }
-
     // FIXME this does not belong in the crawler
     private Optional findCanonicalUrl(EdgeUrl baseUrl, Document parsed) {
         baseUrl = baseUrl.domain.toRootUrl();
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java
index 47b5b2d8..52ebe2f3 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java
@@ -34,6 +34,7 @@ public class CrawlerWarcResynchronizer {
         // First pass, enqueue links
         try (var reader = new WarcReader(tempFile)) {
             WarcXResponseReference.register(reader);
+            WarcXEntityRefused.register(reader);
 
             for (var item : reader) {
                 accept(item);
@@ -58,13 +59,26 @@ public class CrawlerWarcResynchronizer {
                 response(rsp);
             } else if (item instanceof WarcRequest req) {
                 request(req);
+            } else if (item instanceof WarcXEntityRefused refused) {
+                refused(refused);
             }
+
         }
         catch (Exception ex) {
             logger.info(STR."Failed to process warc record \{item}", ex);
         }
     }
 
+    private void refused(WarcXEntityRefused refused) {
+        // In general, we don't want to re-crawl urls that were refused,
+        // but to permit circumstances to change over time, we'll
+        // allow for a small chance of re-probing these entries
+
+        if (Math.random() > 0.1) {
+            crawlFrontier.addVisited(new EdgeUrl(refused.targetURI()));
+        }
+    }
+
     private void request(WarcRequest request) {
         EdgeUrl.parse(request.target()).ifPresent(crawlFrontier::addVisited);
     }
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java
index 6d868fdf..46446fee 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java
@@ -50,9 +50,14 @@ public class DomainCrawlFrontier {
         }
     }
 
+    /** Increase the depth of the crawl by a factor. If the current depth is smaller
+     * than the number of already visited documents, the base depth will be adjusted
+     * to the visited count first.
+     */
     public void increaseDepth(double depthIncreaseFactor) {
-        depth = (int)(depth * depthIncreaseFactor);
+        depth = (int)(Math.max(visited.size(), depth) * depthIncreaseFactor);
     }
+
     public void setLinkFilter(Predicate linkFilter) {
         this.linkFilter = linkFilter;
     }
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java
index 2ceb076d..ad29056f 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java
@@ -20,7 +20,10 @@ public class WarcProtocolReconstructor {
 
     static String getHttpRequestString(Request request, URI uri) {
         StringBuilder requestStringBuilder = new StringBuilder();
-        requestStringBuilder.append(request.method()).append(" ").append(URLEncoder.encode(uri.getPath(), StandardCharsets.UTF_8));
+
+        final String encodedURL = encodeURLKeepSlashes(uri.getPath());
+
+        requestStringBuilder.append(request.method()).append(" ").append(encodedURL);
 
         if (uri.getQuery() != null) {
             requestStringBuilder.append("?").append(uri.getQuery());
@@ -37,6 +40,19 @@ public class WarcProtocolReconstructor {
         return requestStringBuilder.toString();
     }
 
+    /** Java's URLEncoder will URLEncode slashes, which is not desirable
+     * when sanitizing a URL for HTTP protocol purposes
+     */
+
+    private static String encodeURLKeepSlashes(String URL) {
+        String[] parts = StringUtils.split(URL,"/");
+        StringJoiner joiner = new StringJoiner("/");
+        for (String part : parts) {
+            joiner.add(URLEncoder.encode(part, StandardCharsets.UTF_8));
+        }
+        return joiner.toString();
+    }
+
     static String getResponseHeader(String headersAsString, int code) {
         String version = "1.1";
 
@@ -131,6 +147,11 @@ public class WarcProtocolReconstructor {
         if (headerCapitalized.startsWith("X-Marginalia"))
             return;
 
+        // Omit Transfer-Encoding header, as we'll be using Content-Length
+        // instead in the warc file, despite what the server says
+        if (headerCapitalized.startsWith("Transfer-Encoding"))
+            return;
+
         for (var value : values) {
             joiner.add(headerCapitalized + ": " + value);
         }
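(Aside: a small, self-contained illustration of the segment-wise encoding idea above — not the project's code. It assumes commons-lang3, which the patch already uses via StringUtils. Note that URLEncoder applies form-encoding, so a space becomes '+', and that StringUtils.split drops empty segments, so the leading '/' of an absolute path is not reproduced by the split/join itself.)

// Illustrative re-implementation of segment-wise URL encoding that preserves slashes.
import org.apache.commons.lang3.StringUtils;

import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.StringJoiner;

class EncodeKeepSlashesExample { // hypothetical name
    static String encodeKeepSlashes(String path) {
        StringJoiner joiner = new StringJoiner("/");
        for (String part : StringUtils.split(path, "/")) {
            // URLEncoder form-encodes each path segment, leaving the '/' separators alone
            joiner.add(URLEncoder.encode(part, StandardCharsets.UTF_8));
        }
        return joiner.toString();
    }

    public static void main(String[] args) {
        // prints "some+path/%C3%A4" -- the leading '/' is not carried through,
        // so a caller that needs the absolute-path form must prepend it again
        System.out.println(encodeKeepSlashes("/some path/ä"));
    }
}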
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java
index a1335eb8..5ccfacb5 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java
@@ -29,11 +29,6 @@ import java.util.*;
  * be reconstructed.
  */
 public class WarcRecorder implements AutoCloseable {
-    public static final URI documentRobotsTxtSkippedURN = URI.create("urn:marginalia/meta/doc/robots-txt-skipped");
-    public static final URI documentBadContentTypeURN = URI.create("urn:marginalia/meta/doc/content-type-failed-probe");
-    public static final URI documentProbeTimeout = URI.create("urn:marginalia/meta/doc/timeout-probe");
-    public static final URI documentUnspecifiedError = URI.create("urn:marginalia/meta/doc/error");
-
     private static final int MAX_TIME = 30_000;
     private static final int MAX_SIZE = 1024 * 1024 * 10;
     private final WarcWriter writer;
@@ -91,6 +86,8 @@
 
         ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer();
 
+        boolean hasCookies = !client.cookieJar().loadForRequest(request.url()).isEmpty();
+
         try (var response = call.execute()) {
             var body = response.body();
             InputStream inputStream;
@@ -143,6 +140,7 @@
 
             WarcResponse.Builder responseBuilder = new WarcResponse.Builder(responseUri)
                     .blockDigest(responseDigestBuilder.build())
+                    .addHeader("X-Has-Cookies", hasCookies ? "1" : "0")
                     .date(date)
                     .body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());
 
@@ -280,11 +278,11 @@ public class WarcRecorder implements AutoCloseable {
 
     public void flagAsRobotsTxtError(EdgeUrl top) {
         try {
-            WarcRevisit revisit = new WarcRevisit.Builder(top.asURI(), documentRobotsTxtSkippedURN)
+            WarcXEntityRefused refusal = new WarcXEntityRefused.Builder(top.asURI(), WarcXEntityRefused.documentRobotsTxtSkippedURN)
                     .date(Instant.now())
                     .build();
 
-            writer.write(revisit);
+            writer.write(refusal);
         } catch (URISyntaxException | IOException e) {
             throw new RuntimeException(e);
         }
@@ -292,13 +290,13 @@ public class WarcRecorder implements AutoCloseable {
 
     public void flagAsFailedContentTypeProbe(EdgeUrl url, String contentType, int status) {
         try {
-            WarcRevisit revisit = new WarcRevisit.Builder(url.asURI(), documentBadContentTypeURN)
+            WarcXEntityRefused refusal = new WarcXEntityRefused.Builder(url.asURI(), WarcXEntityRefused.documentBadContentTypeURN)
                     .date(Instant.now())
                     .addHeader("Rejected-Content-Type", contentType)
                     .addHeader("Http-Status", Integer.toString(status))
                     .build();
 
-            writer.write(revisit);
+            writer.write(refusal);
         } catch (URISyntaxException | IOException e) {
             throw new RuntimeException(e);
         }
@@ -306,13 +304,13 @@ public class WarcRecorder implements AutoCloseable {
 
     public void flagAsError(EdgeUrl url, Exception ex) {
         try {
-            WarcRevisit revisit = new WarcRevisit.Builder(url.asURI(), documentUnspecifiedError)
+            WarcXEntityRefused refusal = new WarcXEntityRefused.Builder(url.asURI(), WarcXEntityRefused.documentUnspecifiedError)
                     .date(Instant.now())
                     .addHeader("Exception", ex.getClass().getSimpleName())
                     .addHeader("ErrorMessage", Objects.requireNonNullElse(ex.getMessage(), ""))
                     .build();
 
-            writer.write(revisit);
+            writer.write(refusal);
         } catch (URISyntaxException | IOException e) {
             throw new RuntimeException(e);
         }
@@ -320,11 +318,11 @@ public class WarcRecorder implements AutoCloseable {
 
     public void flagAsTimeout(EdgeUrl url) {
         try {
-            WarcRevisit revisit = new WarcRevisit.Builder(url.asURI(), documentProbeTimeout)
+            WarcXEntityRefused refusal = new WarcXEntityRefused.Builder(url.asURI(), WarcXEntityRefused.documentProbeTimeout)
                     .date(Instant.now())
                     .build();
 
-            writer.write(revisit);
+            writer.write(refusal);
         } catch (URISyntaxException | IOException e) {
             throw new RuntimeException(e);
        }
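(Aside: a minimal sketch of the Builder usage that the flagAs* methods above rely on — constructing a refusal record with a profile URN and writing it to a jwarc WarcWriter; class and method names here are illustrative.)

// Illustrative sketch, mirroring flagAsRobotsTxtError() above.
import org.netpreserve.jwarc.WarcWriter;
import org.netpreserve.jwarc.WarcXEntityRefused;

import java.io.IOException;
import java.net.URI;
import java.time.Instant;

class RefusalWriteExample { // hypothetical name
    static void writeRobotsRefusal(WarcWriter writer, URI url) throws IOException {
        // The WARC-Profile URN encodes the refusal reason; the parquet converter
        // earlier in this patch maps it to an "x-marginalia/advisory" content type.
        var refusal = new WarcXEntityRefused.Builder(url, WarcXEntityRefused.documentRobotsTxtSkippedURN)
                .date(Instant.now())
                .build();

        writer.write(refusal);
    }
}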
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java
index 70a98310..91c21d65 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java
@@ -15,13 +15,6 @@ import org.jsoup.Jsoup;
  * E-Tag and Last-Modified headers.
  */
 public class CrawlerRevisitor {
-    /** recrawlState tag for documents that had a HTTP status 304 */
-    public static final String documentWasRetainedTag = "RETAINED/304";
-
-    /** recrawlState tag for documents that had a 200 status but were identical to a previous version */
-    public static final String documentWasSameTag = "SAME-BY-COMPARISON";
-
-
     private final DomainCrawlFrontier crawlFrontier;
     private final CrawlerRetreiver crawlerRetreiver;
     private final WarcRecorder warcRecorder;
@@ -37,7 +30,8 @@ public class CrawlerRevisitor {
     /** Performs a re-crawl of old documents, comparing etags and last-modified */
     public int recrawl(CrawlDataReference oldCrawlData,
                        SimpleRobotRules robotsRules,
-                       CrawlDelayTimer delayTimer) {
+                       CrawlDelayTimer delayTimer)
+            throws InterruptedException {
         int recrawled = 0;
         int retained = 0;