mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 13:19:02 +00:00
(crawler) Make SitemapRetriever abort on too large sitemaps.
This commit is contained in:
parent
d2b6b2044c
commit
05ba3bab96
@ -5,7 +5,6 @@ import nu.marginalia.model.EdgeUrl;
|
|||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import javax.inject.Singleton;
|
|
||||||
import java.io.FileNotFoundException;
|
import java.io.FileNotFoundException;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
@ -44,14 +43,19 @@ public class SitemapRetriever {
|
|||||||
final List<EdgeUrl> urlsList = new ArrayList<>(10000);
|
final List<EdgeUrl> urlsList = new ArrayList<>(10000);
|
||||||
final Set<EdgeUrl> seenUrls = new HashSet<>();
|
final Set<EdgeUrl> seenUrls = new HashSet<>();
|
||||||
|
|
||||||
final LinkedList<AbstractSiteMap> maps = new LinkedList<>();
|
final ArrayDeque<AbstractSiteMap> maps = new ArrayDeque<>();
|
||||||
|
|
||||||
maps.add(map);
|
maps.add(map);
|
||||||
|
|
||||||
while (!maps.isEmpty()) {
|
while (!maps.isEmpty() && seenSiteMapUrls.size() > 2) {
|
||||||
if (urlsList.size() >= 10000)
|
if (urlsList.size() >= 10000)
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
// This is some weird site that has too many sitemaps
|
||||||
|
// ... it's causing us to run out of memory
|
||||||
|
if (seenSiteMapUrls.size() > 25)
|
||||||
|
break;
|
||||||
|
|
||||||
var firstMap = maps.removeFirst();
|
var firstMap = maps.removeFirst();
|
||||||
|
|
||||||
if (!seenSiteMapUrls.add(firstMap.getUrl().toString())) {
|
if (!seenSiteMapUrls.add(firstMap.getUrl().toString())) {
|
||||||
@ -74,7 +78,12 @@ public class SitemapRetriever {
|
|||||||
}
|
}
|
||||||
else if (map instanceof SiteMapIndex index) {
|
else if (map instanceof SiteMapIndex index) {
|
||||||
var sitemaps = index.getSitemaps(false);
|
var sitemaps = index.getSitemaps(false);
|
||||||
maps.addAll(sitemaps);
|
for (var sitemap : sitemaps) {
|
||||||
|
// Limit how many sitemaps we can add to the queue
|
||||||
|
if (maps.size() < 25) {
|
||||||
|
maps.add(sitemap);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
logger.warn("Unknown sitemap type: {}", map.getClass());
|
logger.warn("Unknown sitemap type: {}", map.getClass());
|
||||||
|
Loading…
Reference in New Issue
Block a user