From 2ea34767d88990b2cbbf2f34ee4b03c1b0efd5d7 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 31 Jan 2025 12:40:13 +0100 Subject: [PATCH] (crawler) Use the response URL when resolving relative links The crawler was incorrectly using the request URL as the base URL when resolving relative links. This caused problems when encountering redirects. For example, if we fetch /log, which redirects to /log/, and find links to foo/ and bar/, these would resolve to /foo and /bar, and not /log/foo and /log/bar. --- .../nu/marginalia/crawl/retreival/CrawlerRetreiver.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index 56aae5d0..39db4878 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -381,8 +381,10 @@ public class CrawlerRetreiver implements AutoCloseable { if (docOpt.isPresent()) { var doc = docOpt.get(); - crawlFrontier.enqueueLinksFromDocument(top, doc); - crawlFrontier.addVisited(new EdgeUrl(ok.uri())); + var responseUrl = new EdgeUrl(ok.uri()); + + crawlFrontier.enqueueLinksFromDocument(responseUrl, doc); + crawlFrontier.addVisited(responseUrl); } } else if (fetchedDoc instanceof HttpFetchResult.Result304Raw && reference.doc() != null) {