Don't try to fetch text/css and text/javascript files. Refactor fetcher to separate content type sniffing logic. Clean up crawler a smidge.

2025-02-23 13:09:00 +00:00 · 2022-08-18 18:40:34 +02:00 · 2022-08-18 18:40:34 +02:00 · 340d80f6c7
commit 340d80f6c7
parent 6b6cd56e3a
1 changed files with 3 additions and 3 deletions
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java
@@ -20,7 +20,7 @@ import java.net.InetAddress;
 import java.net.URISyntaxException;
 import java.net.UnknownHostException;
 import java.time.LocalDateTime;
-import java.util.Collections;
+import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.LinkedList;
 import java.util.Optional;
@@ -129,7 +129,7 @@ public class CrawlerRetreiver {
        var robotsRules = fetcher.fetchRobotRules(queue.peek().domain);
        long crawlDelay = robotsRules.getCrawlDelay();

-        CrawledDomain ret = new CrawledDomain(id, domain, null, CrawlerDomainStatus.OK.name(), null, ip, Collections.emptyList(), null);
+        CrawledDomain ret = new CrawledDomain(id, domain, null, CrawlerDomainStatus.OK.name(), null, ip, new ArrayList<>(), null);

        int fetchedCount = 0;

@@ -137,7 +137,7 @@ public class CrawlerRetreiver {
            var top = queue.removeFirst();

            if (!robotsRules.isAllowed(top.toString())) {
-                ret.doc.add(createRobotsError(top));
+                crawledDomainWriter.accept(createRobotsError(top));
                continue;
            }