mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
(crawler) Correct feed URLs in domain state db
Discovered feed URLs were stored in the DB with a double slash after their domain name. The URL normalizer removes the extra slash, so the stored URLs are still viable, but this commit fixes the issue regardless.
This commit is contained in:
parent
895cee7004
commit
89db69d360
@ -297,16 +297,16 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private final List<String> likelyFeedEndpoints = List.of(
|
private final List<String> likelyFeedEndpoints = List.of(
|
||||||
"/rss.xml",
|
"rss.xml",
|
||||||
"/atom.xml",
|
"atom.xml",
|
||||||
"/feed.xml",
|
"feed.xml",
|
||||||
"/index.xml",
|
"index.xml",
|
||||||
"/feed",
|
"feed",
|
||||||
"/rss",
|
"rss",
|
||||||
"/atom",
|
"atom",
|
||||||
"/feeds",
|
"feeds",
|
||||||
"/blog/feed",
|
"blog/feed",
|
||||||
"/blog/rss"
|
"blog/rss"
|
||||||
);
|
);
|
||||||
|
|
||||||
private Optional<String> guessFeedUrl(CrawlDelayTimer timer) throws InterruptedException {
|
private Optional<String> guessFeedUrl(CrawlDelayTimer timer) throws InterruptedException {
|
||||||
|
Loading…
Reference in New Issue
Block a user