(crawler) Send "Accept" headers when fetching documents, and indicate that we prefer English results

This commit is contained in:
Viktor Lofgren 2024-06-02 16:30:34 +02:00
parent 206a7ce6c1
commit b4eac2516e

View File

@@ -183,6 +183,8 @@ public class HttpFetcherImpl implements HttpFetcher {
getBuilder.url(url.toString()) getBuilder.url(url.toString())
.addHeader("Accept-Encoding", "gzip") .addHeader("Accept-Encoding", "gzip")
.addHeader("Accept-Language", "en,*;q=0.5")
.addHeader("Accept", "text/html, application/xhtml+xml, */*;q=0.8")
.addHeader("User-agent", userAgentString); .addHeader("User-agent", userAgentString);
contentTags.paint(getBuilder); contentTags.paint(getBuilder);
@@ -225,6 +227,7 @@ public class HttpFetcherImpl implements HttpFetcher {
getBuilder.url(url.toString()) getBuilder.url(url.toString())
.addHeader("Accept-Encoding", "gzip") .addHeader("Accept-Encoding", "gzip")
.addHeader("Accept", "text/*, */*;q=0.9")
.addHeader("User-agent", userAgentString); .addHeader("User-agent", userAgentString);
HttpFetchResult result = recorder.fetch(client, getBuilder.build()); HttpFetchResult result = recorder.fetch(client, getBuilder.build());