(crawler) Make SitemapRetriever abort on too large sitemaps.

Viktor Lofgren 2023-07-29 19:17:19 +02:00
parent d2b6b2044c
commit 05ba3bab96


@@ -5,7 +5,6 @@ import nu.marginalia.model.EdgeUrl;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import javax.inject.Singleton;
-import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.util.*;
@@ -44,14 +43,19 @@ public class SitemapRetriever {
         final List<EdgeUrl> urlsList = new ArrayList<>(10000);
         final Set<EdgeUrl> seenUrls = new HashSet<>();
-        final LinkedList<AbstractSiteMap> maps = new LinkedList<>();
+        final ArrayDeque<AbstractSiteMap> maps = new ArrayDeque<>();
         maps.add(map);
-        while (!maps.isEmpty()) {
+        while (!maps.isEmpty() && seenSiteMapUrls.size() > 2) {
             if (urlsList.size() >= 10000)
                 break;
+            // This is some weird site that has too many sitemaps
+            // ... it's causing us to run out of memory
+            if (seenSiteMapUrls.size() > 25)
+                break;
             var firstMap = maps.removeFirst();
             if (!seenSiteMapUrls.add(firstMap.getUrl().toString())) {
@@ -74,7 +78,12 @@ public class SitemapRetriever {
             }
             else if (map instanceof SiteMapIndex index) {
                 var sitemaps = index.getSitemaps(false);
-                maps.addAll(sitemaps);
+                for (var sitemap : sitemaps) {
+                    // Limit how many sitemaps we can add to the queue
+                    if (maps.size() < 25) {
+                        maps.add(sitemap);
+                    }
+                }
             }
             else {
                 logger.warn("Unknown sitemap type: {}", map.getClass());
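
Taken together, the two hunks bound the sitemap traversal in three ways: the result list is capped at 10,000 URLs, the walk aborts once more than 25 distinct sitemap URLs have been visited, and at most 25 nested sitemaps are ever queued from a sitemap index. Below is a minimal, self-contained sketch of that bounded breadth-first walk in isolation. It is not part of the commit: the SitemapNode type and its url()/children()/urls() accessors are hypothetical placeholders standing in for the sitemap classes used in the real code, and the sketch folds the limits into named constants checked at the top of the loop.

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/** Hypothetical stand-in for a parsed sitemap or sitemap index. */
interface SitemapNode {
    String url();
    List<SitemapNode> children(); // nested sitemaps; empty for a leaf sitemap
    List<String> urls();          // page URLs; empty for a sitemap index
}

class BoundedSitemapWalker {
    private static final int MAX_URLS = 10_000;
    private static final int MAX_SITEMAPS = 25;

    List<String> walk(SitemapNode root) {
        final List<String> urls = new ArrayList<>(MAX_URLS);
        final Set<String> seenSitemapUrls = new HashSet<>();

        // ArrayDeque instead of LinkedList: same FIFO behaviour, less overhead
        final ArrayDeque<SitemapNode> queue = new ArrayDeque<>();
        queue.add(root);

        while (!queue.isEmpty()) {
            if (urls.size() >= MAX_URLS)
                break;

            // Abort on sites with pathologically many sitemaps,
            // which would otherwise risk exhausting memory
            if (seenSitemapUrls.size() > MAX_SITEMAPS)
                break;

            var node = queue.removeFirst();
            if (!seenSitemapUrls.add(node.url()))
                continue; // already visited this sitemap

            // Collect page URLs, respecting the overall cap
            for (String pageUrl : node.urls()) {
                if (urls.size() >= MAX_URLS)
                    break;
                urls.add(pageUrl);
            }

            // Enqueue nested sitemaps, but cap the queue so a huge
            // sitemap index can't balloon memory on its own
            for (var child : node.children()) {
                if (queue.size() < MAX_SITEMAPS) {
                    queue.add(child);
                }
            }
        }

        return urls;
    }
}

Reusing the dedup set as the visit counter keeps the abort check O(1), and capping the queue separately from the visit count means a single oversized sitemap index cannot blow up memory before the visit limit is ever reached.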