diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java index e88ee454..af4a743f 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java @@ -38,6 +38,7 @@ public class CrawlerRevisitor { int recrawled = 0; int retained = 0; int errors = 0; + int skipped = 0; for (;;) { if (errors > 20) { @@ -84,9 +85,32 @@ public class CrawlerRevisitor { } - if (recrawled > 5 - && retained > 0.9 * recrawled - && Math.random() < 0.9) + double skipProb; + + // calculate the probability of skipping this document based on the + // fraction of documents that haven't changed + if (recrawled > 0) { + skipProb = (double) retained / recrawled; + + // If we've crawled a lot of documents, we'll be more conservative + // in trying to recrawl documents, to avoid hammering the server too much; + // in the case of a large change, we'll eventually catch it anyway + + if (skipped + recrawled > 10_000) { + skipProb = Math.clamp(skipProb, 0.75, 0.99); + } else if (skipped + recrawled > 1000) { + skipProb = Math.clamp(skipProb, 0.5, 0.99); + } else { + skipProb = Math.clamp(skipProb, 0, 0.95); + } + + } else { + // If we haven't recrawled anything yet, we'll be more aggressive + // in trying to recrawl documents + skipProb = 0.25; + } + + if (Math.random() < skipProb) // { // Since it looks like most of these documents haven't changed, // we'll load the documents directly; but we do this in a random @@ -103,6 +127,8 @@ public class CrawlerRevisitor { doc.documentBody, new ContentTags(doc.etagMaybe, doc.lastModifiedMaybe) ); + + skipped++; } else { // GET the document with the stored document as a reference