mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(crawler) Adjust revisit logic
The revisit logic wasn't sufficiently dampening the recrawl rate for websites that largely have not changed. Modified it to be more reactive to the degree to which the content has changed, while applying upper and lower limits depending on the size of the crawl set.
This commit is contained in:
parent
4d29581ea4
commit
f4d79c203d
@ -38,6 +38,7 @@ public class CrawlerRevisitor {
|
||||
int recrawled = 0;
|
||||
int retained = 0;
|
||||
int errors = 0;
|
||||
int skipped = 0;
|
||||
|
||||
for (;;) {
|
||||
if (errors > 20) {
|
||||
@ -84,9 +85,32 @@ public class CrawlerRevisitor {
|
||||
}
|
||||
|
||||
|
||||
if (recrawled > 5
|
||||
&& retained > 0.9 * recrawled
|
||||
&& Math.random() < 0.9)
|
||||
double skipProb;
|
||||
|
||||
// calculate the probability of skipping this document based on the
|
||||
// fraction of documents that haven't changed
|
||||
if (recrawled > 0) {
|
||||
skipProb = (double) retained / recrawled;
|
||||
|
||||
// If we've crawled a lot of documents, we'll be more conservative
|
||||
// in trying to recrawl documents, to avoid hammering the server too much;
|
||||
// in the case of a large change, we'll eventually catch it anyway
|
||||
|
||||
if (skipped + recrawled > 10_000) {
|
||||
skipProb = Math.clamp(skipProb, 0.75, 0.99);
|
||||
} else if (skipped + recrawled > 1000) {
|
||||
skipProb = Math.clamp(skipProb, 0.5, 0.99);
|
||||
} else {
|
||||
skipProb = Math.clamp(skipProb, 0, 0.95);
|
||||
}
|
||||
|
||||
} else {
|
||||
// If we haven't recrawled anything yet, we'll be more aggressive
|
||||
// in trying to recrawl documents
|
||||
skipProb = 0.25;
|
||||
}
|
||||
|
||||
if (Math.random() < skipProb) //
|
||||
{
|
||||
// Since it looks like most of these documents haven't changed,
|
||||
// we'll load the documents directly; but we do this in a random
|
||||
@ -103,6 +127,8 @@ public class CrawlerRevisitor {
|
||||
doc.documentBody,
|
||||
new ContentTags(doc.etagMaybe, doc.lastModifiedMaybe)
|
||||
);
|
||||
|
||||
skipped++;
|
||||
}
|
||||
else {
|
||||
// GET the document with the stored document as a reference
|
||||
|
Loading…
Reference in New Issue
Block a user