mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-22 12:48:58 +00:00
(crawler) Safe sanitization of headers during warc->slop conversion
The warc->slop converter was rejecting some items because they carried headers that are representable in the WARC library's MessageHeaders map implementation but illegal in HttpHeaders. This is fixed by manually filtering those headers out before the HttpHeaders object is built. Ostensibly the HttpHeaders.of factory accepts a filtering predicate, but annoyingly the predicate is only applied to header values after the header names have already been validated, so it runs too late to prevent the problem.
This commit is contained in:
parent
2ea34767d8
commit
b8581b0f56
@ -12,8 +12,7 @@ import java.io.InputStream;
|
||||
import java.net.InetAddress;
|
||||
import java.net.URI;
|
||||
import java.net.http.HttpHeaders;
|
||||
import java.util.Arrays;
|
||||
import java.util.Optional;
|
||||
import java.util.*;
|
||||
|
||||
/* FIXME: This interface has a very unfortunate name that is not very descriptive.
|
||||
*/
|
||||
@ -65,7 +64,21 @@ public sealed interface HttpFetchResult {
|
||||
) implements HttpFetchResult {
|
||||
|
||||
/* Secondary constructor: accepts the response headers as MessageHeaders and delegates to
 * another ResultOk constructor, passing the headers along as java.net.http.HttpHeaders.
 *
 * NOTE(review): the two this(...) lines below appear to be the removed and the added side
 * of the diff rather than two consecutive statements -- the first builds the HttpHeaders
 * directly with HttpHeaders.of(headers.map(), ...), the second goes through
 * convertHeaders(headers). Confirm against the actual source file.
 */
public ResultOk(URI uri, int status, MessageHeaders headers, String ipAddress, byte[] bytes, int bytesStart, int length) {
|
||||
this(uri, status, HttpHeaders.of(headers.map(), (k,v) -> true), ipAddress, bytes, bytesStart, length);
|
||||
this(uri, status, convertHeaders(headers), ipAddress, bytes, bytesStart, length);
|
||||
}
|
||||
|
||||
private static HttpHeaders convertHeaders(MessageHeaders messageHeaders) {
|
||||
Map<String, List<String>> inputMap = messageHeaders.map();
|
||||
Map<String, List<String>> filteredMap = new HashMap<>(Math.max(4, inputMap.size()));
|
||||
|
||||
inputMap.forEach((k, v) -> {
|
||||
if (k.isBlank()) return;
|
||||
if (!Character.isAlphabetic(k.charAt(0))) return;
|
||||
|
||||
filteredMap.put(k, v);
|
||||
});
|
||||
|
||||
return HttpHeaders.of(filteredMap, (k,v) -> true);
|
||||
}
|
||||
|
||||
public boolean isOk() {
|
||||
|
Loading…
Reference in New Issue
Block a user