diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java index f4b111e2..03075e91 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java @@ -218,7 +218,7 @@ public class HttpFetcherImpl implements HttpFetcher { getBuilder.url(url.toString()) .addHeader("Accept-Encoding", "gzip") .addHeader("Accept-Language", "en,*;q=0.5") - .addHeader("Accept", "text/html, application/xhtml+xml, */*;q=0.8") + .addHeader("Accept", "text/html, application/xhtml+xml, text/*;q=0.8") .addHeader("User-agent", userAgentString); contentTags.paint(getBuilder); diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index 4666aaa9..ab6456de 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -321,6 +321,9 @@ public class CrawlerRetreiver implements AutoCloseable { long probeStart = System.currentTimeMillis(); + /* + probing is on probation for now while we evaluate how much the added delays slow down the crawler + if (probeType == HttpFetcher.ProbeType.FULL) { for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) { try { @@ -348,9 +351,10 @@ public class CrawlerRetreiver implements AutoCloseable { return new HttpFetchResult.ResultException(ex); } } - } timer.waitFetchDelay(System.currentTimeMillis() - probeStart); + }*/ + for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) { try {