From ce9abc00dcb49d39812f125accbba7b8a8a0a8dc Mon Sep 17 00:00:00 2001 From: vlofgren Date: Wed, 17 Aug 2022 00:49:29 +0200 Subject: [PATCH 1/2] Fix bug in redirect handling that caused the crawler to not index some documents. --- .../edge/crawling/retreival/CrawlerRetreiver.java | 11 ++++++++++- .../wmsa/edge/crawling/retreival/HttpFetcher.java | 4 ++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java index 802211ce..0c6105e7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java @@ -17,6 +17,7 @@ import org.slf4j.LoggerFactory; import java.io.IOException; import java.net.InetAddress; +import java.net.URISyntaxException; import java.net.UnknownHostException; import java.time.LocalDateTime; import java.util.*; @@ -163,7 +164,15 @@ public class CrawlerRetreiver { var doc = fetchUrl(top); if (doc.isPresent()) { fetchedCount++; - crawledDomainWriter.accept(doc.get()); + + var d = doc.get(); + crawledDomainWriter.accept(d); + + try { + visited.add(new EdgeUrl(d.url)); + } + catch (URISyntaxException ex) {} + } long crawledTime = System.currentTimeMillis() - startTime; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java index 967e0203..76a2e247 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java @@ -198,7 +198,7 @@ public class HttpFetcher { private CrawledDocument extractBody(EdgeUrl url, Response rsp) throws IOException, URISyntaxException { var responseUrl = new EdgeUrl(rsp.request().url().toString()); - if (!responseUrl.equals(url)) { + if (!Objects.equals(responseUrl.domain, url.domain)) { return createRedirectResponse(url, rsp, responseUrl); } @@ -242,7 +242,7 @@ public class HttpFetcher { .timestamp(LocalDateTime.now().toString()) .canonicalUrl(canonical) .httpStatus(rsp.code()) - .url(url.toString()) + .url(responseUrl.toString()) .documentBody(strData) .build(); } From 0bac422091c586a67d453d4341bfc67ed22c7918 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Wed, 17 Aug 2022 00:51:07 +0200 Subject: [PATCH 2/2] Fix bug in redirect handling that caused the crawler to not index some documents. --- .../wmsa/edge/crawling/retreival/CrawlerRetreiver.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java index 0c6105e7..a8793a36 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java @@ -168,10 +168,11 @@ public class CrawlerRetreiver { var d = doc.get(); crawledDomainWriter.accept(d); - try { - visited.add(new EdgeUrl(d.url)); + if (d.url != null) { + try { + visited.add(new EdgeUrl(d.url)); + } catch (URISyntaxException ex) {} } - catch (URISyntaxException ex) {} }