From ecb5eedeaebc3c8c8ba27c9b1d8ac49716d55eb4 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 30 Sep 2024 14:53:01 +0200 Subject: [PATCH] (crawler, EXPERIMENT) Disable content type probing and use Accept header instead There's reason to think this may speed up crawling quite significantly, and the benefits of the probing aren't quite there. --- .../java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java | 2 +- .../nu/marginalia/crawl/retreival/CrawlerRetreiver.java | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java index f4b111e2..03075e91 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java @@ -218,7 +218,7 @@ public class HttpFetcherImpl implements HttpFetcher { getBuilder.url(url.toString()) .addHeader("Accept-Encoding", "gzip") .addHeader("Accept-Language", "en,*;q=0.5") - .addHeader("Accept", "text/html, application/xhtml+xml, */*;q=0.8") + .addHeader("Accept", "text/html, application/xhtml+xml, text/*;q=0.8") .addHeader("User-agent", userAgentString); contentTags.paint(getBuilder); diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index 4666aaa9..ab6456de 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -321,6 +321,9 @@ public class CrawlerRetreiver implements AutoCloseable { long probeStart = System.currentTimeMillis(); + /* + probing is on probation for now while we evaluate how much the added delay slows down the crawler + if 
(probeType == HttpFetcher.ProbeType.FULL) { for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) { try { @@ -348,9 +351,10 @@ public class CrawlerRetreiver implements AutoCloseable { return new HttpFetchResult.ResultException(ex); } } - } timer.waitFetchDelay(System.currentTimeMillis() - probeStart); + }*/ + for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) { try {