From fbdedf53de3012205de10c21314d793f5ff15e51 Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Tue, 27 Jun 2023 15:50:38 +0200
Subject: [PATCH] Fix bug in CrawlerRetreiver

... where the root URL wasn't always added properly to the front of the
crawl queue.
---
 .../model/spec/CrawlingSpecification.java     |  3 +-
 .../crawl/retreival/CrawlerRetreiver.java     |  4 +-
 .../crawl/retreival/DomainCrawlFrontier.java  |  8 +-
 .../retreival/CrawlerRetreiverTest.java       | 80 +++++++++++++++----
 4 files changed, 74 insertions(+), 21 deletions(-)

diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/spec/CrawlingSpecification.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/spec/CrawlingSpecification.java
index fea0f867..47ecf921 100644
--- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/spec/CrawlingSpecification.java
+++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/spec/CrawlingSpecification.java
@@ -1,11 +1,12 @@
 package nu.marginalia.crawling.model.spec;
 
 import lombok.AllArgsConstructor;
+import lombok.Builder;
 import lombok.NoArgsConstructor;
 
 import java.util.List;
 
-@AllArgsConstructor @NoArgsConstructor
+@AllArgsConstructor @NoArgsConstructor @Builder
 public class CrawlingSpecification {
 
     public String id;
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
index 78514a27..b81ea431 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
@@ -77,8 +77,8 @@ public class CrawlerRetreiver {
 
             // Ensure the index page is always crawled
             var root = fst.withPathAndParam("/", null);
-            if (crawlFrontier.addKnown(root))
-                crawlFrontier.addFirst(root);
+
+            crawlFrontier.addFirst(root);
         }
         else {
             // We know nothing about this domain, so we'll start with the index, trying both HTTP and HTTPS
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java
index e72d1f33..b6e23f0c 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java
@@ -43,11 +43,11 @@ public class DomainCrawlFrontier {
     public boolean isEmpty() {
         return queue.isEmpty();
     }
-    public boolean addKnown(EdgeUrl url) {
-        return known.contains(url.toString());
-    }
+
     public void addFirst(EdgeUrl url) {
-        queue.addFirst(url);
+        if (known.add(url.toString())) {
+            queue.addFirst(url);
+        }
     }
 
     public EdgeUrl takeNextUrl() {
diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
index 19666f13..1e309c71 100644
--- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
+++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
@@ -1,5 +1,6 @@
 package nu.marginalia.crawling.retreival;
 
+import lombok.SneakyThrows;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
@@ -7,42 +8,93 @@ import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.spec.CrawlingSpecification;
 import nu.marginalia.crawling.model.SerializableCrawlData;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Tag;
 import org.junit.jupiter.api.Test;
 
-import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.stream.Collectors;
+
+import static org.junit.jupiter.api.Assertions.assertTrue;
 
 @Tag("slow")
 class CrawlerRetreiverTest {
+    private HttpFetcher httpFetcher;
+
+    @BeforeEach
+    public void setUp() {
+        httpFetcher = new HttpFetcherImpl("search.marginalia.nu; testing a bit :D");
+    }
+
+    @SneakyThrows
+    public static void setUpAll() {
+        // this must be done to avoid java inserting its own user agent for the sitemap requests
+        System.setProperty("http.agent", WmsaHome.getUserAgent().uaString());
+    }
 
     @Test
-    public void testEmptySet() throws IOException {
-        System.setProperty("http.agent", WmsaHome.getUserAgent().uaString());
-        // Tests the case when there are no URLs provided in the crawl set and the
-        // crawler needs to guess the protocol
-
-        var specs = new CrawlingSpecification("1", 5, "www.marginalia.nu", new ArrayList<>());
-
-        HttpFetcher fetcher = new HttpFetcherImpl("search.marginalia.nu; testing a bit :D");
+    public void testWithKnownDomains() {
+        var specs = CrawlingSpecification
+                .builder()
+                .id("whatever")
+                .crawlDepth(5)
+                .domain("www.marginalia.nu")
+                .urls(List.of("https://www.marginalia.nu/misc/debian-laptop-install-log/"))
+                .build();
 
         List<SerializableCrawlData> data = new ArrayList<>();
 
-        new CrawlerRetreiver(fetcher, specs, data::add).fetch();
+        new CrawlerRetreiver(httpFetcher, specs, data::add).fetch();
+
+        var fetchedUrls =
+                data.stream().filter(CrawledDocument.class::isInstance)
+                        .map(CrawledDocument.class::cast)
+                        .map(doc -> doc.url)
+                        .collect(Collectors.toSet());
+
+        assertTrue(fetchedUrls.contains("https://www.marginalia.nu/"));
+        assertTrue(fetchedUrls.contains("https://www.marginalia.nu/misc/debian-laptop-install-log/"));
 
         data.stream().filter(CrawledDocument.class::isInstance)
                 .map(CrawledDocument.class::cast)
                 .forEach(doc -> System.out.println(doc.url + "\t" + doc.crawlerStatus + "\t" + doc.httpStatus));
 
-        /*
+    }
+
+    @Test
+    public void testEmptySet() {
+
+        var specs = CrawlingSpecification
+                .builder()
+                .id("whatever")
+                .crawlDepth(5)
+                .domain("www.marginalia.nu")
+                .urls(List.of())
+                .build();
+
+        List<SerializableCrawlData> data = new ArrayList<>();
+
+        new CrawlerRetreiver(httpFetcher, specs, data::add).fetch();
+
+        data.stream().filter(CrawledDocument.class::isInstance)
+                .map(CrawledDocument.class::cast)
+                .forEach(doc -> System.out.println(doc.url + "\t" + doc.crawlerStatus + "\t" + doc.httpStatus));
+
+        var fetchedUrls =
+                data.stream().filter(CrawledDocument.class::isInstance)
+                        .map(CrawledDocument.class::cast)
+                        .map(doc -> doc.url)
+                        .collect(Collectors.toSet());
+
+        assertTrue(fetchedUrls.contains("https://www.marginalia.nu/"));
+
         Assertions.assertTrue(
                 data.stream().filter(CrawledDocument.class::isInstance)
                         .map(CrawledDocument.class::cast)
-                        .filter(doc -> "OK".equals(doc.crawlerStatus))
-                        .count() > 1
+                        .anyMatch(doc -> "OK".equals(doc.crawlerStatus))
         );
-        */
     }
 
 }
\ No newline at end of file
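
For readers skimming the diff: the old addKnown() was a pure membership test
(known.contains(...)), so the guard in CrawlerRetreiver only queued the root
when it had already been seen, and a fresh root URL was never queued at all.
The fix moves deduplication into addFirst() itself. Below is a minimal,
self-contained sketch of the before/after behaviour; FrontierSketch is a
hypothetical stand-in for DomainCrawlFrontier that uses plain Strings in
place of EdgeUrl.

    import java.util.ArrayDeque;
    import java.util.Deque;
    import java.util.HashSet;
    import java.util.Set;

    class FrontierSketch {
        private final Set<String> known = new HashSet<>();
        private final Deque<String> queue = new ArrayDeque<>();

        // Old behaviour: a membership test that never records the URL.
        // The caller's guard
        //     if (addKnown(root)) addFirst(root);
        // was therefore inverted: a never-seen root failed the check
        // and was never enqueued.
        boolean addKnownOld(String url) {
            return known.contains(url);
        }

        // New behaviour: Set.add() returns true only on first insertion,
        // so deduplication happens here and callers enqueue unconditionally.
        void addFirst(String url) {
            if (known.add(url)) {
                queue.addFirst(url);
            }
        }

        public static void main(String[] args) {
            var f = new FrontierSketch();
            f.addFirst("https://www.marginalia.nu/");
            f.addFirst("https://www.marginalia.nu/");  // deduplicated, not queued twice
            System.out.println(f.queue);               // [https://www.marginalia.nu/]
        }
    }

With Set.add() doing double duty as both test and insert, addFirst() is
idempotent on its own, which is why the separate addKnown() method can be
removed and the call site reduced to an unconditional addFirst(root).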