From 88caca60f9408d7ca27c0d44cae61da4709afae3 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 23 Nov 2024 17:07:16 +0100 Subject: [PATCH] (live-crawl) Flag URLs that don't pass robots.txt as bad so we don't keep fetching robots.txt every day for an empty link list --- .../java/nu/marginalia/livecrawler/SimpleLinkScraper.java | 1 + 1 file changed, 1 insertion(+) diff --git a/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/SimpleLinkScraper.java b/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/SimpleLinkScraper.java index b1ad42e5..89f5f338 100644 --- a/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/SimpleLinkScraper.java +++ b/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/SimpleLinkScraper.java @@ -92,6 +92,7 @@ public class SimpleLinkScraper implements AutoCloseable { EdgeUrl parsedUrl = optParsedUrl.get(); if (!rules.isAllowed(url)) { + maybeFlagAsBad(parsedUrl); continue; }