From b4eac2516eaf5a646dfd27a9755aca6db23db893 Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Sun, 2 Jun 2024 16:30:34 +0200
Subject: [PATCH] (crawler) Send "Accept"-headers when fetching documents, also indicate we prefer English results

---
 .../nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java
index 7980f3a7..1df0301b 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java
@@ -183,6 +183,8 @@ public class HttpFetcherImpl implements HttpFetcher {
 
         getBuilder.url(url.toString())
                 .addHeader("Accept-Encoding", "gzip")
+                .addHeader("Accept-Language", "en,*;q=0.5")
+                .addHeader("Accept", "text/html, application/xhtml+xml, */*;q=0.8")
                 .addHeader("User-agent", userAgentString);
 
         contentTags.paint(getBuilder);
@@ -225,6 +227,7 @@ public class HttpFetcherImpl implements HttpFetcher {
 
         getBuilder.url(url.toString())
                 .addHeader("Accept-Encoding", "gzip")
+                .addHeader("Accept", "text/*, */*;q=0.9")
                 .addHeader("User-agent", userAgentString);
 
         HttpFetchResult result = recorder.fetch(client, getBuilder.build());