mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-22 20:48:59 +00:00
(crawler) Use the response URL when resolving relative links
The crawler was incorrectly using the request URL as the base URL when resolving relative links. This caused problems when encountering redirects. For example if we fetch /log, redirecting to /log/ and find links to foo/, and bar/; these would resolve to /foo and /bar, and not /log/foo and /log/bar.
This commit is contained in:
parent
e9af838231
commit
2ea34767d8
@ -381,8 +381,10 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
if (docOpt.isPresent()) {
|
||||
var doc = docOpt.get();
|
||||
|
||||
crawlFrontier.enqueueLinksFromDocument(top, doc);
|
||||
crawlFrontier.addVisited(new EdgeUrl(ok.uri()));
|
||||
var responseUrl = new EdgeUrl(ok.uri());
|
||||
|
||||
crawlFrontier.enqueueLinksFromDocument(responseUrl, doc);
|
||||
crawlFrontier.addVisited(responseUrl);
|
||||
}
|
||||
}
|
||||
else if (fetchedDoc instanceof HttpFetchResult.Result304Raw && reference.doc() != null) {
|
||||
|
Loading…
Reference in New Issue
Block a user