(live-crawl) Flag URLs that fail the robots.txt check as bad, so we don't keep re-fetching robots.txt every day just to produce an empty link list

This commit is contained in:
Viktor Lofgren 2024-11-23 17:07:16 +01:00
parent 923ebbac81
commit 88caca60f9

View File

@ -92,6 +92,7 @@ public class SimpleLinkScraper implements AutoCloseable {
EdgeUrl parsedUrl = optParsedUrl.get();
if (!rules.isAllowed(url)) {
maybeFlagAsBad(parsedUrl);
continue;
}