(crawler) Adjust revisit logic

The revisit logic did not sufficiently dampen the recrawl rate for websites whose content has largely not changed.

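Modified it to be more reactive to the degree to which the content has changed: the probability of skipping a re-fetch now follows the fraction of recrawled documents that were retained unchanged, while applying upper and lower limits depending on the size of the crawl set (the number of documents skipped or recrawled so far). Before any document has been recrawled, a fixed low skip probability is used so that recrawling starts out more aggressively.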
This commit is contained in:
Viktor Lofgren 2024-07-16 14:51:49 +02:00
parent 4d29581ea4
commit f4d79c203d

View File

@ -38,6 +38,7 @@ public class CrawlerRevisitor {
int recrawled = 0; int recrawled = 0;
int retained = 0; int retained = 0;
int errors = 0; int errors = 0;
int skipped = 0;
for (;;) { for (;;) {
if (errors > 20) { if (errors > 20) {
@ -84,9 +85,32 @@ public class CrawlerRevisitor {
} }
if (recrawled > 5 double skipProb;
&& retained > 0.9 * recrawled
&& Math.random() < 0.9) // calculate the probability of skipping this document based on the
// fraction of documents that haven't changed
if (recrawled > 0) {
skipProb = (double) retained / recrawled;
// If we've crawled a lot of documents, we'll be more conservative
// in trying to recrawl documents, to avoid hammering the server too much;
// in the case of a large change, we'll eventually catch it anyway
if (skipped + recrawled > 10_000) {
skipProb = Math.clamp(skipProb, 0.75, 0.99);
} else if (skipped + recrawled > 1000) {
skipProb = Math.clamp(skipProb, 0.5, 0.99);
} else {
skipProb = Math.clamp(skipProb, 0, 0.95);
}
} else {
// If we haven't recrawled anything yet, we'll be more aggressive
// in trying to recrawl documents
skipProb = 0.25;
}
if (Math.random() < skipProb) //
{ {
// Since it looks like most of these documents haven't changed, // Since it looks like most of these documents haven't changed,
// we'll load the documents directly; but we do this in a random // we'll load the documents directly; but we do this in a random
@ -103,6 +127,8 @@ public class CrawlerRevisitor {
doc.documentBody, doc.documentBody,
new ContentTags(doc.etagMaybe, doc.lastModifiedMaybe) new ContentTags(doc.etagMaybe, doc.lastModifiedMaybe)
); );
skipped++;
} }
else { else {
// GET the document with the stored document as a reference // GET the document with the stored document as a reference