mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 13:19:02 +00:00
(crawler) Ensure all appropriate headers are recorded on the request
This commit is contained in:
parent
b09ddd0036
commit
d816f048f5
@ -8,9 +8,7 @@ import org.apache.commons.lang3.StringUtils;
|
|||||||
import java.net.URI;
|
import java.net.URI;
|
||||||
import java.net.URLEncoder;
|
import java.net.URLEncoder;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.util.Arrays;
|
import java.util.*;
|
||||||
import java.util.Map;
|
|
||||||
import java.util.StringJoiner;
|
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
/** We don't have access to the raw HTTP request and response, so we need to reconstruct them
|
/** We don't have access to the raw HTTP request and response, so we need to reconstruct them
|
||||||
@ -18,12 +16,15 @@ import java.util.stream.Collectors;
|
|||||||
*/
|
*/
|
||||||
public class WarcProtocolReconstructor {
|
public class WarcProtocolReconstructor {
|
||||||
|
|
||||||
static String getHttpRequestString(Request request, URI uri) {
|
static String getHttpRequestString(String method,
|
||||||
|
Map<String, List<String>> mainHeaders,
|
||||||
|
Map<String, List<String>> extraHeaders,
|
||||||
|
URI uri) {
|
||||||
StringBuilder requestStringBuilder = new StringBuilder();
|
StringBuilder requestStringBuilder = new StringBuilder();
|
||||||
|
|
||||||
final String encodedURL = encodeURLKeepSlashes(uri.getPath());
|
final String encodedURL = encodeURLKeepSlashes(uri.getPath());
|
||||||
|
|
||||||
requestStringBuilder.append(request.method()).append(" ").append(encodedURL);
|
requestStringBuilder.append(method).append(" ").append(encodedURL);
|
||||||
|
|
||||||
if (uri.getQuery() != null) {
|
if (uri.getQuery() != null) {
|
||||||
requestStringBuilder.append("?").append(URLEncoder.encode(uri.getQuery(), StandardCharsets.UTF_8));
|
requestStringBuilder.append("?").append(URLEncoder.encode(uri.getQuery(), StandardCharsets.UTF_8));
|
||||||
@ -31,12 +32,23 @@ public class WarcProtocolReconstructor {
|
|||||||
requestStringBuilder.append(" HTTP/1.1\r\n");
|
requestStringBuilder.append(" HTTP/1.1\r\n");
|
||||||
requestStringBuilder.append("Host: ").append(uri.getHost()).append("\r\n");
|
requestStringBuilder.append("Host: ").append(uri.getHost()).append("\r\n");
|
||||||
|
|
||||||
request.headers().toMultimap().forEach((k, values) -> {
|
Set<String> addedHeaders = new HashSet<>();
|
||||||
|
|
||||||
|
mainHeaders.forEach((k, values) -> {
|
||||||
for (var value : values) {
|
for (var value : values) {
|
||||||
|
addedHeaders.add(k);
|
||||||
requestStringBuilder.append(capitalizeHeader(k)).append(": ").append(value).append("\r\n");
|
requestStringBuilder.append(capitalizeHeader(k)).append(": ").append(value).append("\r\n");
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
extraHeaders.forEach((k, values) -> {
|
||||||
|
if (!addedHeaders.contains(k)) {
|
||||||
|
for (var value : values) {
|
||||||
|
requestStringBuilder.append(capitalizeHeader(k)).append(": ").append(value).append("\r\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
return requestStringBuilder.toString();
|
return requestStringBuilder.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -146,7 +146,11 @@ public class WarcRecorder implements AutoCloseable {
|
|||||||
WarcDigestBuilder requestDigestBuilder = new WarcDigestBuilder();
|
WarcDigestBuilder requestDigestBuilder = new WarcDigestBuilder();
|
||||||
|
|
||||||
byte[] httpRequestString = WarcProtocolReconstructor
|
byte[] httpRequestString = WarcProtocolReconstructor
|
||||||
.getHttpRequestString(response.request(), requestUri)
|
.getHttpRequestString(
|
||||||
|
response.request().method(),
|
||||||
|
response.request().headers().toMultimap(),
|
||||||
|
request.headers().toMultimap(),
|
||||||
|
requestUri)
|
||||||
.getBytes();
|
.getBytes();
|
||||||
|
|
||||||
requestDigestBuilder.update(httpRequestString);
|
requestDigestBuilder.update(httpRequestString);
|
||||||
|
Loading…
Reference in New Issue
Block a user