Merge pull request 'Fix bug in redirect handling that caused the crawler to not index some documents.' (#88) from master into release

Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/88
This commit is contained in:
Viktor Lofgren 2022-08-17 00:52:34 +02:00
commit a8745d627b
2 changed files with 13 additions and 3 deletions

View File

@@ -17,6 +17,7 @@ import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.InetAddress;
import java.net.URISyntaxException;
import java.net.UnknownHostException;
import java.time.LocalDateTime;
import java.util.*;
@@ -163,7 +164,16 @@ public class CrawlerRetreiver {
var doc = fetchUrl(top);
if (doc.isPresent()) {
fetchedCount++;
-crawledDomainWriter.accept(doc.get());
+var d = doc.get();
+crawledDomainWriter.accept(d);
+if (d.url != null) {
+try {
+visited.add(new EdgeUrl(d.url));
+} catch (URISyntaxException ex) {}
+}
}
long crawledTime = System.currentTimeMillis() - startTime;

View File

@@ -198,7 +198,7 @@ public class HttpFetcher {
private CrawledDocument extractBody(EdgeUrl url, Response rsp) throws IOException, URISyntaxException {
var responseUrl = new EdgeUrl(rsp.request().url().toString());
-if (!responseUrl.equals(url)) {
+if (!Objects.equals(responseUrl.domain, url.domain)) {
return createRedirectResponse(url, rsp, responseUrl);
}
@@ -242,7 +242,7 @@ public class HttpFetcher {
.timestamp(LocalDateTime.now().toString())
.canonicalUrl(canonical)
.httpStatus(rsp.code())
-.url(url.toString())
+.url(responseUrl.toString())
.documentBody(strData)
.build();
}