(crawler, EXPERIMENT) Disable content type probing and use Accept header instead

There's reason to think this may speed up crawling quite significantly, and the benefits of probing aren't quite there.
Viktor Lofgren 2024-09-30 14:53:01 +02:00
parent 90a2d4ae38
commit ecb5eedeae
2 changed files with 6 additions and 2 deletions
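For context on what's being disabled: a full content-type probe costs an extra HEAD request per URL before the real GET, and polite crawl delays apply to that request as well, so dropping it can roughly halve the time spent per document. A minimal sketch of the probe pattern, using OkHttp (which the diffs below appear to use); the class and method names are illustrative, not the project's actual code:

    import okhttp3.OkHttpClient;
    import okhttp3.Request;
    import okhttp3.Response;

    import java.io.IOException;

    class ContentTypeProbe {
        private final OkHttpClient client = new OkHttpClient();

        // One extra round-trip (plus one extra crawl delay) per document:
        // this is the per-URL cost the commit removes.
        boolean probeLooksLikeDocument(String url) throws IOException {
            Request head = new Request.Builder().url(url).head().build();
            try (Response rsp = client.newCall(head).execute()) {
                String ct = rsp.header("Content-Type");
                return ct != null && (ct.startsWith("text/html")
                                   || ct.startsWith("application/xhtml+xml"));
            }
        }
    }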


@@ -218,7 +218,7 @@ public class HttpFetcherImpl implements HttpFetcher {
         getBuilder.url(url.toString())
                 .addHeader("Accept-Encoding", "gzip")
                 .addHeader("Accept-Language", "en,*;q=0.5")
-                .addHeader("Accept", "text/html, application/xhtml+xml, */*;q=0.8")
+                .addHeader("Accept", "text/html, application/xhtml+xml, text/*;q=0.8")
                 .addHeader("User-agent", userAgentString);
         contentTags.paint(getBuilder);
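The change above narrows the Accept wildcard from */*;q=0.8 to text/*;q=0.8: the crawler still states a preference for HTML and XHTML, but no longer advertises that it will take arbitrary non-text types. Servers are free to ignore Accept, though, so with the probe gone the response's Content-Type still has to be vetted before parsing. A hypothetical helper (not from the repository) showing the kind of check this implies:

    // Servers may ignore Accept, so the response Content-Type must still
    // be checked before the body is parsed as a document.
    static boolean isAcceptableContentType(String contentType) {
        if (contentType == null)
            return false;
        // Strip parameters such as "; charset=utf-8" before matching.
        String mime = contentType.split(";", 2)[0].trim().toLowerCase();
        return mime.equals("application/xhtml+xml") || mime.startsWith("text/");
    }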


@@ -321,6 +321,9 @@ public class CrawlerRetreiver implements AutoCloseable {
         long probeStart = System.currentTimeMillis();
+        /*
+        probing is on probation for now while we evaluate how much the added delays slows down the crawler
         if (probeType == HttpFetcher.ProbeType.FULL) {
             for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
                 try {
@@ -348,9 +351,10 @@ public class CrawlerRetreiver implements AutoCloseable {
                     return new HttpFetchResult.ResultException(ex);
                 }
             }
-        }
         timer.waitFetchDelay(System.currentTimeMillis() - probeStart);
+        }*/
         for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
             try {
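The retry loop that remains active above runs the actual fetch up to HTTP_429_RETRY_LIMIT times; its body falls outside this hunk. A self-contained sketch of the bounded-retry pattern such a loop typically implements, again with OkHttp; the limit value and the backoff policy here are assumptions, not the project's actual behavior:

    import okhttp3.OkHttpClient;
    import okhttp3.Request;
    import okhttp3.Response;

    import java.io.IOException;

    class RetryingFetcher {
        // Assumed value; the real constant lives in CrawlerRetreiver.
        static final int HTTP_429_RETRY_LIMIT = 3;

        private final OkHttpClient client = new OkHttpClient();

        Response fetchWithRetry(String url) throws IOException, InterruptedException {
            Request request = new Request.Builder().url(url).build();
            for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
                Response rsp = client.newCall(request).execute();
                if (rsp.code() != 429)
                    return rsp; // caller is responsible for closing
                // Honor Retry-After (delta-seconds form) when present,
                // otherwise back off for one second before retrying.
                String retryAfter = rsp.header("Retry-After");
                rsp.close();
                long waitMs = (retryAfter != null && retryAfter.matches("\\d+"))
                        ? Long.parseLong(retryAfter) * 1_000L
                        : 1_000L;
                Thread.sleep(waitMs);
            }
            throw new IOException("still rate-limited after " + HTTP_429_RETRY_LIMIT + " retries");
        }
    }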