mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
Merge pull request 'Fix bug in redirect handling that caused the crawler to not index some documents.' (#88) from master into release
Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/88
This commit is contained in:
commit
a8745d627b
@ -17,6 +17,7 @@ import org.slf4j.LoggerFactory;
|
|||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.net.InetAddress;
|
import java.net.InetAddress;
|
||||||
|
import java.net.URISyntaxException;
|
||||||
import java.net.UnknownHostException;
|
import java.net.UnknownHostException;
|
||||||
import java.time.LocalDateTime;
|
import java.time.LocalDateTime;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
@ -163,7 +164,16 @@ public class CrawlerRetreiver {
|
|||||||
var doc = fetchUrl(top);
|
var doc = fetchUrl(top);
|
||||||
if (doc.isPresent()) {
|
if (doc.isPresent()) {
|
||||||
fetchedCount++;
|
fetchedCount++;
|
||||||
crawledDomainWriter.accept(doc.get());
|
|
||||||
|
var d = doc.get();
|
||||||
|
crawledDomainWriter.accept(d);
|
||||||
|
|
||||||
|
if (d.url != null) {
|
||||||
|
try {
|
||||||
|
visited.add(new EdgeUrl(d.url));
|
||||||
|
} catch (URISyntaxException ex) {}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
long crawledTime = System.currentTimeMillis() - startTime;
|
long crawledTime = System.currentTimeMillis() - startTime;
|
||||||
|
@ -198,7 +198,7 @@ public class HttpFetcher {
|
|||||||
private CrawledDocument extractBody(EdgeUrl url, Response rsp) throws IOException, URISyntaxException {
|
private CrawledDocument extractBody(EdgeUrl url, Response rsp) throws IOException, URISyntaxException {
|
||||||
|
|
||||||
var responseUrl = new EdgeUrl(rsp.request().url().toString());
|
var responseUrl = new EdgeUrl(rsp.request().url().toString());
|
||||||
if (!responseUrl.equals(url)) {
|
if (!Objects.equals(responseUrl.domain, url.domain)) {
|
||||||
return createRedirectResponse(url, rsp, responseUrl);
|
return createRedirectResponse(url, rsp, responseUrl);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -242,7 +242,7 @@ public class HttpFetcher {
|
|||||||
.timestamp(LocalDateTime.now().toString())
|
.timestamp(LocalDateTime.now().toString())
|
||||||
.canonicalUrl(canonical)
|
.canonicalUrl(canonical)
|
||||||
.httpStatus(rsp.code())
|
.httpStatus(rsp.code())
|
||||||
.url(url.toString())
|
.url(responseUrl.toString())
|
||||||
.documentBody(strData)
|
.documentBody(strData)
|
||||||
.build();
|
.build();
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user