From b8581b0f569c4606db9b6f916b78d90ead6d0275 Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Fri, 31 Jan 2025 12:47:42 +0100
Subject: [PATCH] (crawler) Safe sanitization of headers during warc->slop conversion

The warc->slop converter was rejecting some items because they had
headers that were representable in the Warc code's MessageHeaders map
implementation, but illegal in the HttpHeaders' implementation.

This is fixed by manually filtering those headers out.  Ostensibly the
HttpHeaders.of() factory has a filtering predicate, but this annoyingly
runs too late and fails to prevent the problem.
---
 .../model/body/HttpFetchResult.java | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/code/processes/crawling-process/model/java/nu/marginalia/model/body/HttpFetchResult.java b/code/processes/crawling-process/model/java/nu/marginalia/model/body/HttpFetchResult.java
index b5948dcb..92659b6c 100644
--- a/code/processes/crawling-process/model/java/nu/marginalia/model/body/HttpFetchResult.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/model/body/HttpFetchResult.java
@@ -12,8 +12,7 @@ import java.io.InputStream;
 import java.net.InetAddress;
 import java.net.URI;
 import java.net.http.HttpHeaders;
-import java.util.Arrays;
-import java.util.Optional;
+import java.util.*;
 
 /* FIXME: This interface has a very unfortunate name that is not very descriptive.
*/ @@ -65,7 +64,21 @@ public sealed interface HttpFetchResult { ) implements HttpFetchResult { public ResultOk(URI uri, int status, MessageHeaders headers, String ipAddress, byte[] bytes, int bytesStart, int length) { - this(uri, status, HttpHeaders.of(headers.map(), (k,v) -> true), ipAddress, bytes, bytesStart, length); + this(uri, status, convertHeaders(headers), ipAddress, bytes, bytesStart, length); + } + + private static HttpHeaders convertHeaders(MessageHeaders messageHeaders) { + Map> inputMap = messageHeaders.map(); + Map> filteredMap = new HashMap<>(Math.max(4, inputMap.size())); + + inputMap.forEach((k, v) -> { + if (k.isBlank()) return; + if (!Character.isAlphabetic(k.charAt(0))) return; + + filteredMap.put(k, v); + }); + + return HttpHeaders.of(filteredMap, (k,v) -> true); } public boolean isOk() {