(crawler) Make SitemapRetriever abort on too large sitemaps.

This commit is contained in:
Viktor Lofgren 2023-07-29 19:17:19 +02:00
parent d2b6b2044c
commit 05ba3bab96

View File

@ -5,7 +5,6 @@ import nu.marginalia.model.EdgeUrl;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import javax.inject.Singleton;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
import java.io.IOException; import java.io.IOException;
import java.util.*; import java.util.*;
@ -44,14 +43,19 @@ public class SitemapRetriever {
final List<EdgeUrl> urlsList = new ArrayList<>(10000); final List<EdgeUrl> urlsList = new ArrayList<>(10000);
final Set<EdgeUrl> seenUrls = new HashSet<>(); final Set<EdgeUrl> seenUrls = new HashSet<>();
final LinkedList<AbstractSiteMap> maps = new LinkedList<>(); final ArrayDeque<AbstractSiteMap> maps = new ArrayDeque<>();
maps.add(map); maps.add(map);
while (!maps.isEmpty()) { while (!maps.isEmpty() && seenSiteMapUrls.size() > 2) {
if (urlsList.size() >= 10000) if (urlsList.size() >= 10000)
break; break;
// This is some weird site that has too many sitemaps
// ... it's causing us to run out of memory
if (seenSiteMapUrls.size() > 25)
break;
var firstMap = maps.removeFirst(); var firstMap = maps.removeFirst();
if (!seenSiteMapUrls.add(firstMap.getUrl().toString())) { if (!seenSiteMapUrls.add(firstMap.getUrl().toString())) {
@ -74,7 +78,12 @@ public class SitemapRetriever {
} }
else if (map instanceof SiteMapIndex index) { else if (map instanceof SiteMapIndex index) {
var sitemaps = index.getSitemaps(false); var sitemaps = index.getSitemaps(false);
maps.addAll(sitemaps); for (var sitemap : sitemaps) {
// Limit how many sitemaps we can add to the queue
if (maps.size() < 25) {
maps.add(sitemap);
}
}
} }
else { else {
logger.warn("Unknown sitemap type: {}", map.getClass()); logger.warn("Unknown sitemap type: {}", map.getClass());