mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(crawler) Make SitemapRetriever abort on too large sitemaps.
This commit is contained in:
parent
d2b6b2044c
commit
05ba3bab96
@ -5,7 +5,6 @@ import nu.marginalia.model.EdgeUrl;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.inject.Singleton;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
@ -44,14 +43,19 @@ public class SitemapRetriever {
|
||||
final List<EdgeUrl> urlsList = new ArrayList<>(10000);
|
||||
final Set<EdgeUrl> seenUrls = new HashSet<>();
|
||||
|
||||
final LinkedList<AbstractSiteMap> maps = new LinkedList<>();
|
||||
final ArrayDeque<AbstractSiteMap> maps = new ArrayDeque<>();
|
||||
|
||||
maps.add(map);
|
||||
|
||||
while (!maps.isEmpty()) {
|
||||
while (!maps.isEmpty() && seenSiteMapUrls.size() > 2) {
|
||||
if (urlsList.size() >= 10000)
|
||||
break;
|
||||
|
||||
// This is some weird site that has too many sitemaps
|
||||
// ... it's causing us to run out of memory
|
||||
if (seenSiteMapUrls.size() > 25)
|
||||
break;
|
||||
|
||||
var firstMap = maps.removeFirst();
|
||||
|
||||
if (!seenSiteMapUrls.add(firstMap.getUrl().toString())) {
|
||||
@ -74,7 +78,12 @@ public class SitemapRetriever {
|
||||
}
|
||||
else if (map instanceof SiteMapIndex index) {
|
||||
var sitemaps = index.getSitemaps(false);
|
||||
maps.addAll(sitemaps);
|
||||
for (var sitemap : sitemaps) {
|
||||
// Limit how many sitemaps we can add to the queue
|
||||
if (maps.size() < 25) {
|
||||
maps.add(sitemap);
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
logger.warn("Unknown sitemap type: {}", map.getClass());
|
||||
|
Loading…
Reference in New Issue
Block a user