mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(crawler) Make DomainCrawlFrontier less aggressive about discarding excess links as it approaches its capacity.
Also pre-allocate the queue and sets based on the crawl depth rather than on the number of provided URLs, which was always zero outside of tests.
This commit is contained in:
parent
4b16022556
commit
481f999b70
@ -46,9 +46,9 @@ public class DomainCrawlFrontier {
|
|||||||
this.urlBlocklist = new UrlBlocklist();
|
this.urlBlocklist = new UrlBlocklist();
|
||||||
this.depth = depth;
|
this.depth = depth;
|
||||||
|
|
||||||
queue = new ArrayDeque<>(10 + (int) (urls.size()*1.2));
|
queue = new ArrayDeque<>(depth);
|
||||||
visited = new LongOpenHashSet(10 + (int)(urls.size() * 1.5));
|
visited = new LongOpenHashSet(depth);
|
||||||
known = new LongOpenHashSet(10 + urls.size() * 2);
|
known = new LongOpenHashSet(depth);
|
||||||
|
|
||||||
for (String urlStr : urls) {
|
for (String urlStr : urls) {
|
||||||
EdgeUrl.parse(urlStr).ifPresent(this::addToQueue);
|
EdgeUrl.parse(urlStr).ifPresent(this::addToQueue);
|
||||||
@ -63,7 +63,6 @@ public class DomainCrawlFrontier {
|
|||||||
int maxDepthIncreaseAbsolute
|
int maxDepthIncreaseAbsolute
|
||||||
) {
|
) {
|
||||||
int base = Math.max(visited.size(), depth);
|
int base = Math.max(visited.size(), depth);
|
||||||
|
|
||||||
int scaledUp = (int)(base * depthIncreaseFactor);
|
int scaledUp = (int)(base * depthIncreaseFactor);
|
||||||
|
|
||||||
depth = Math.min(base + maxDepthIncreaseAbsolute, scaledUp);
|
depth = Math.min(base + maxDepthIncreaseAbsolute, scaledUp);
|
||||||
@ -142,7 +141,7 @@ public class DomainCrawlFrontier {
|
|||||||
return;
|
return;
|
||||||
|
|
||||||
// reduce memory usage by not growing queue huge when crawling large sites
|
// reduce memory usage by not growing queue huge when crawling large sites
|
||||||
if (queue.size() + visited.size() >= depth + 200)
|
if (queue.size() + visited.size() >= depth + 10_000)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
if (isVisited(url))
|
if (isVisited(url))
|
||||||
|
Loading…
Reference in New Issue
Block a user