From d816f048f5baf2804cd7ad77609e80607b460637 Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Mon, 22 Apr 2024 14:14:24 +0200
Subject: [PATCH] (crawler) Ensure all appropriate headers are recorded on the request

---
 .../warc/WarcProtocolReconstructor.java      | 24 ++++++++++++++-----
 .../retreival/fetcher/warc/WarcRecorder.java |  6 ++++-
 2 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java
index 6f977e44..b75589ee 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java
@@ -8,9 +8,7 @@ import org.apache.commons.lang3.StringUtils;
 import java.net.URI;
 import java.net.URLEncoder;
 import java.nio.charset.StandardCharsets;
-import java.util.Arrays;
-import java.util.Map;
-import java.util.StringJoiner;
+import java.util.*;
 import java.util.stream.Collectors;
 
 /** We don't have access to the raw HTTP request and response, so we need to reconstruct them
@@ -18,12 +16,15 @@ import java.util.stream.Collectors;
  */
 public class WarcProtocolReconstructor {
 
-    static String getHttpRequestString(Request request, URI uri) {
+    static String getHttpRequestString(String method,
+                                       Map<String, List<String>> mainHeaders,
+                                       Map<String, List<String>> extraHeaders,
+                                       URI uri) {
         StringBuilder requestStringBuilder = new StringBuilder();
 
         final String encodedURL = encodeURLKeepSlashes(uri.getPath());
 
-        requestStringBuilder.append(request.method()).append(" ").append(encodedURL);
+        requestStringBuilder.append(method).append(" ").append(encodedURL);
 
         if (uri.getQuery() != null) {
             requestStringBuilder.append("?").append(URLEncoder.encode(uri.getQuery(), StandardCharsets.UTF_8));
@@ -31,12 +32,23 @@ public class WarcProtocolReconstructor {
         requestStringBuilder.append(" HTTP/1.1\r\n");
         requestStringBuilder.append("Host: ").append(uri.getHost()).append("\r\n");
 
-        request.headers().toMultimap().forEach((k, values) -> {
+        Set<String> addedHeaders = new HashSet<>();
+
+        mainHeaders.forEach((k, values) -> {
             for (var value : values) {
+                addedHeaders.add(k);
                 requestStringBuilder.append(capitalizeHeader(k)).append(": ").append(value).append("\r\n");
             }
         });
 
+        extraHeaders.forEach((k, values) -> {
+            if (!addedHeaders.contains(k)) {
+                for (var value : values) {
+                    requestStringBuilder.append(capitalizeHeader(k)).append(": ").append(value).append("\r\n");
+                }
+            }
+        });
+
         return requestStringBuilder.toString();
     }
 
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java
index 23ab4766..180811cf 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java
@@ -146,7 +146,11 @@ public class WarcRecorder implements AutoCloseable {
             WarcDigestBuilder requestDigestBuilder = new WarcDigestBuilder();
 
             byte[] httpRequestString = WarcProtocolReconstructor
-                    .getHttpRequestString(response.request(), requestUri)
+                    .getHttpRequestString(
+                            response.request().method(),
+                            response.request().headers().toMultimap(),
+                            request.headers().toMultimap(),
+                            requestUri)
                     .getBytes();
 
             requestDigestBuilder.update(httpRequestString);