Merge pull request 'Fix bug in redirect handling that caused the crawler to not index some documents.' (#88) from master into release

Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/88
This commit is contained in:
Viktor Lofgren 2022-08-17 00:52:34 +02:00
commit a8745d627b
2 changed files with 13 additions and 3 deletions

View File

@@ -17,6 +17,7 @@ import org.slf4j.LoggerFactory;
import java.io.IOException; import java.io.IOException;
import java.net.InetAddress; import java.net.InetAddress;
import java.net.URISyntaxException;
import java.net.UnknownHostException; import java.net.UnknownHostException;
import java.time.LocalDateTime; import java.time.LocalDateTime;
import java.util.*; import java.util.*;
@@ -163,7 +164,16 @@ public class CrawlerRetreiver {
var doc = fetchUrl(top); var doc = fetchUrl(top);
if (doc.isPresent()) { if (doc.isPresent()) {
fetchedCount++; fetchedCount++;
crawledDomainWriter.accept(doc.get());
var d = doc.get();
crawledDomainWriter.accept(d);
if (d.url != null) {
try {
visited.add(new EdgeUrl(d.url));
} catch (URISyntaxException ex) {}
}
} }
long crawledTime = System.currentTimeMillis() - startTime; long crawledTime = System.currentTimeMillis() - startTime;

View File

@@ -198,7 +198,7 @@ public class HttpFetcher {
private CrawledDocument extractBody(EdgeUrl url, Response rsp) throws IOException, URISyntaxException { private CrawledDocument extractBody(EdgeUrl url, Response rsp) throws IOException, URISyntaxException {
var responseUrl = new EdgeUrl(rsp.request().url().toString()); var responseUrl = new EdgeUrl(rsp.request().url().toString());
if (!responseUrl.equals(url)) { if (!Objects.equals(responseUrl.domain, url.domain)) {
return createRedirectResponse(url, rsp, responseUrl); return createRedirectResponse(url, rsp, responseUrl);
} }
@@ -242,7 +242,7 @@ public class HttpFetcher {
.timestamp(LocalDateTime.now().toString()) .timestamp(LocalDateTime.now().toString())
.canonicalUrl(canonical) .canonicalUrl(canonical)
.httpStatus(rsp.code()) .httpStatus(rsp.code())
.url(url.toString()) .url(responseUrl.toString())
.documentBody(strData) .documentBody(strData)
.build(); .build();
} }