diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawledDomainWriter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawledDomainWriter.java index d1b5439c..40dc1f8c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawledDomainWriter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawledDomainWriter.java @@ -2,6 +2,7 @@ package nu.marginalia.wmsa.edge.crawling; import com.github.luben.zstd.ZstdOutputStream; import com.google.gson.Gson; +import lombok.SneakyThrows; import nu.marginalia.wmsa.client.GsonFactory; import nu.marginalia.wmsa.edge.crawling.model.SerializableCrawlData; import org.slf4j.Logger; @@ -36,7 +37,8 @@ public class CrawledDomainWriter implements AutoCloseable { return outputFile; } - public void accept(SerializableCrawlData data) throws IOException { + @SneakyThrows + public void accept(SerializableCrawlData data) { writer.write(data.getSerialIdentifier()); writer.write('\n'); gson.toJson(data, writer); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java index cec26ca2..54283e98 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java @@ -73,7 +73,7 @@ public class CrawlerMain implements AutoCloseable { HttpFetcher fetcher = new HttpFetcher(userAgent.uaString(), dispatcher, connectionPool); try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id)) { - var retreiver = new CrawlerRetreiver(fetcher, specification, writer); + var retreiver = new CrawlerRetreiver(fetcher, specification, writer::accept); int size = retreiver.fetch(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawledDocument.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawledDocument.java index 497f323f..3250630f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawledDocument.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawledDocument.java @@ -1,10 +1,12 @@ package nu.marginalia.wmsa.edge.crawling.model; import lombok.Builder; +import lombok.ToString; import nu.marginalia.util.bigstring.BigString; import nu.marginalia.util.bigstring.CompressedBigString; @Builder +@ToString public class CrawledDocument implements SerializableCrawlData { public String crawlId; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java index f950e831..9e4242c5 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java @@ -9,6 +9,7 @@ import nu.marginalia.wmsa.edge.crawling.blocklist.GeoIpBlocklist; import nu.marginalia.wmsa.edge.crawling.blocklist.IpBlockList; import nu.marginalia.wmsa.edge.crawling.blocklist.UrlBlocklist; import nu.marginalia.wmsa.edge.crawling.model.*; +import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeUrl; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; @@ -23,6 +24,7 @@ import java.util.ArrayList; import java.util.HashSet; import java.util.LinkedList; import java.util.Optional; +import java.util.function.Consumer; import static java.lang.Math.max; import static java.lang.Math.min; @@ -43,7 +45,7 @@ public class CrawlerRetreiver { private final int depth; private final String id; private final String domain; - private final CrawledDomainWriter crawledDomainWriter; + private final Consumer<SerializableCrawlData> crawledDomainWriter; private static final LinkParser linkParser = new 
LinkParser(); private static final Logger logger = LoggerFactory.getLogger(CrawlerRetreiver.class); @@ -62,7 +64,7 @@ public class CrawlerRetreiver { } } - public CrawlerRetreiver(HttpFetcher fetcher, CrawlingSpecification specs, CrawledDomainWriter writer) { + public CrawlerRetreiver(HttpFetcher fetcher, CrawlingSpecification specs, Consumer<SerializableCrawlData> writer) { this.fetcher = fetcher; visited = new HashSet<>((int)(specs.urls.size() * 1.5)); known = new HashSet<>(specs.urls.size() * 10); @@ -79,10 +81,15 @@ public class CrawlerRetreiver { if (queue.peek() != null) { var fst = queue.peek(); + var root = fst.withPathAndParam("/", null); if (known.add(root.toString())) queue.addFirst(root); } + else { + addToQueue(new EdgeUrl("http", new EdgeDomain(domain), null, "/", null)); + addToQueue(new EdgeUrl("https", new EdgeDomain(domain), null, "/", null)); + } } public int fetch() throws IOException { @@ -255,7 +262,7 @@ public class CrawlerRetreiver { } public boolean isSameDomain(EdgeUrl url) { - return domain.equals(url.domain.toString().toLowerCase()); + return domain.equalsIgnoreCase(url.domain.toString()); } private void findLinks(EdgeUrl baseUrl, Document parsed) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java index 4532156f..b7074825 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java @@ -127,7 +127,11 @@ public class HttpFetcher { return new FetchResult(FetchResultState.OK, requestDomain); } catch (Exception ex) { - logger.debug("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage()); + if (url.proto.equalsIgnoreCase("http") && "/".equals(url.path)) { + return probeDomain(new EdgeUrl("https", url.domain, url.port, url.path, url.param)); + } + + logger.info("Error 
during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage()); return new FetchResult(FetchResultState.ERROR, url.domain); } } diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiverTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiverTest.java new file mode 100644 index 00000000..c3f558e8 --- /dev/null +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiverTest.java @@ -0,0 +1,39 @@ +package nu.marginalia.wmsa.edge.crawling.retreival; + +import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument; +import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification; +import nu.marginalia.wmsa.edge.crawling.model.SerializableCrawlData; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +@Tag("slow") +class CrawlerRetreiverTest { + + @Test + public void testEmptySet() throws IOException { + // Tests the case when there are no URLs provided in the crawl set and the + // crawler needs to guess the protocol + + var specs = new CrawlingSpecification("1", 5, "memex.marginalia.nu", new ArrayList<>()); + + HttpFetcher fetcher = new HttpFetcher("test.marginalia.nu"); + + + List<SerializableCrawlData> data = new ArrayList<>(); + + new CrawlerRetreiver(fetcher, specs, data::add).fetch(); + + Assertions.assertTrue( + data.stream().filter(CrawledDocument.class::isInstance) + .map(CrawledDocument.class::cast) + .filter(doc -> "OK".equals(doc.crawlerStatus)) + .count() > 1 + ); + } + +} \ No newline at end of file