mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(crawler, EXPERIMENT) Disable content type probing and use Accept header instead
There's reason to think this may speed up crawling quite significantly, and the benefits of the probing aren't quite there.
This commit is contained in:
parent
90a2d4ae38
commit
ecb5eedeae
@ -218,7 +218,7 @@ public class HttpFetcherImpl implements HttpFetcher {
|
||||
getBuilder.url(url.toString())
|
||||
.addHeader("Accept-Encoding", "gzip")
|
||||
.addHeader("Accept-Language", "en,*;q=0.5")
|
||||
.addHeader("Accept", "text/html, application/xhtml+xml, */*;q=0.8")
|
||||
.addHeader("Accept", "text/html, application/xhtml+xml, text/*;q=0.8")
|
||||
.addHeader("User-agent", userAgentString);
|
||||
|
||||
contentTags.paint(getBuilder);
|
||||
|
@ -321,6 +321,9 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
|
||||
long probeStart = System.currentTimeMillis();
|
||||
|
||||
/*
|
||||
probing is on probation for now while we evaluate how much the added delay slows down the crawler
|
||||
|
||||
if (probeType == HttpFetcher.ProbeType.FULL) {
|
||||
for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
|
||||
try {
|
||||
@ -348,9 +351,10 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
return new HttpFetchResult.ResultException(ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
timer.waitFetchDelay(System.currentTimeMillis() - probeStart);
|
||||
}*/
|
||||
|
||||
|
||||
for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
|
||||
try {
|
||||
|
Loading…
Reference in New Issue
Block a user