(crawler, EXPERIMENT) Disable content type probing and use Accept header instead

There's reason to think this may speed up crawling quite significantly, and the benefits of probing aren't quite there.
Viktor Lofgren 2024-09-30 14:53:01 +02:00
parent 90a2d4ae38
commit ecb5eedeae
2 changed files with 6 additions and 2 deletions
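For context on what's being disabled: a full content-type probe costs an extra HEAD request per URL before the real GET, and polite crawl delays apply to that request as well, so dropping it can roughly halve the time spent per document. A minimal sketch of the probe pattern, using OkHttp (which the diffs below appear to use); the class and method names are illustrative, not the project's actual code:

    import okhttp3.OkHttpClient;
    import okhttp3.Request;
    import okhttp3.Response;

    import java.io.IOException;

    class ContentTypeProbe {
        private final OkHttpClient client = new OkHttpClient();

        // One extra round-trip (plus one extra crawl delay) per document:
        // this is the per-URL cost the commit removes.
        boolean probeLooksLikeDocument(String url) throws IOException {
            Request head = new Request.Builder().url(url).head().build();
            try (Response rsp = client.newCall(head).execute()) {
                String ct = rsp.header("Content-Type");
                return ct != null && (ct.startsWith("text/html")
                                   || ct.startsWith("application/xhtml+xml"));
            }
        }
    }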


@@ -218,7 +218,7 @@ public class HttpFetcherImpl implements HttpFetcher {
         getBuilder.url(url.toString())
                 .addHeader("Accept-Encoding", "gzip")
                 .addHeader("Accept-Language", "en,*;q=0.5")
-                .addHeader("Accept", "text/html, application/xhtml+xml, */*;q=0.8")
+                .addHeader("Accept", "text/html, application/xhtml+xml, text/*;q=0.8")
                 .addHeader("User-agent", userAgentString);
         contentTags.paint(getBuilder);
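The change above narrows the Accept wildcard from */*;q=0.8 to text/*;q=0.8: the crawler still states a preference for HTML and XHTML, but no longer advertises that it will take arbitrary non-text types. Servers are free to ignore Accept, though, so with the probe gone the response's Content-Type still has to be vetted before parsing. A hypothetical helper (not from the repository) showing the kind of check this implies:

    // Servers may ignore Accept, so the response Content-Type must still
    // be checked before the body is parsed as a document.
    static boolean isAcceptableContentType(String contentType) {
        if (contentType == null)
            return false;
        // Strip parameters such as "; charset=utf-8" before matching.
        String mime = contentType.split(";", 2)[0].trim().toLowerCase();
        return mime.equals("application/xhtml+xml") || mime.startsWith("text/");
    }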


@@ -321,6 +321,9 @@ public class CrawlerRetreiver implements AutoCloseable {
         long probeStart = System.currentTimeMillis();
+        /*
+        probing is on probation for now while we evaluate how much the added delays slows down the crawler
         if (probeType == HttpFetcher.ProbeType.FULL) {
             for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
                 try {
@@ -348,9 +351,10 @@ public class CrawlerRetreiver implements AutoCloseable {
                     return new HttpFetchResult.ResultException(ex);
                 }
             }
-        }
         timer.waitFetchDelay(System.currentTimeMillis() - probeStart);
+        }*/
         for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
             try {
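The retry loop that remains active above runs the actual fetch up to HTTP_429_RETRY_LIMIT times; its body falls outside this hunk. A self-contained sketch of the bounded-retry pattern such a loop typically implements, again with OkHttp; the limit value and the backoff policy here are assumptions, not the project's actual behavior:

    import okhttp3.OkHttpClient;
    import okhttp3.Request;
    import okhttp3.Response;

    import java.io.IOException;

    class RetryingFetcher {
        // Assumed value; the real constant lives in CrawlerRetreiver.
        static final int HTTP_429_RETRY_LIMIT = 3;

        private final OkHttpClient client = new OkHttpClient();

        Response fetchWithRetry(String url) throws IOException, InterruptedException {
            Request request = new Request.Builder().url(url).build();
            for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
                Response rsp = client.newCall(request).execute();
                if (rsp.code() != 429)
                    return rsp; // caller is responsible for closing
                // Honor Retry-After (delta-seconds form) when present,
                // otherwise back off for one second before retrying.
                String retryAfter = rsp.header("Retry-After");
                rsp.close();
                long waitMs = (retryAfter != null && retryAfter.matches("\\d+"))
                        ? Long.parseLong(retryAfter) * 1_000L
                        : 1_000L;
                Thread.sleep(waitMs);
            }
            throw new IOException("still rate-limited after " + HTTP_429_RETRY_LIMIT + " retries");
        }
    }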