package nu.marginalia.crawling;

import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.model.EdgeUrl;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;

import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.LinkedHashSet;
import java.util.Objects;
import java.util.Set;

class RssCrawlerTest {

    final LinkParser lp = new LinkParser();

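    // Disabled by default: reads a local copy of the feed from a hard-coded developer path.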
    @Test @Disabled
    public void test() throws URISyntaxException, IOException {
        getLinks(new EdgeUrl("https://eli.li/feed.rss"), new String(Files.readAllBytes(Path.of("/home/vlofgren/Work/feed.rss"))));
    }

    private Set<EdgeUrl> getLinks(EdgeUrl base, String str) {
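        // Jsoup's HTML parser treats <link> as a void element and would drop the URL held in its
        // text content, so every occurrence of "link" is renamed to "lnk" before parsing the feed.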
        var doc = Jsoup.parse(str.replaceAll("link", "lnk"));

        Set<EdgeUrl> urls = new LinkedHashSet<>();

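        // Atom-style entries: the item URL is in the href attribute of <link rel="alternate">.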
doc.select("entry > lnk[rel=alternate]").forEach(element -> {
|
|
|
|
var href = element.attr("href");
|
|
|
|
if (href != null && !href.isBlank()) {
|
|
|
|
lp.parseLink(base, href)
|
2023-12-17 13:00:07 +00:00
|
|
|
.filter(u -> Objects.equals(u.domain.topDomain, base.domain.topDomain))
|
2022-05-19 15:45:26 +00:00
|
|
|
.ifPresent(urls::add);
|
|
|
|
}
|
|
|
|
});
|
|
|
|
|
|
|
|
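        // Any other <link> element (RSS items, channel links) carries its URL as text content.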
        doc.getElementsByTag("lnk").forEach(element -> {
            var href = element.text();
            if (href != null && !href.isBlank()) {
                lp.parseLink(base, href)
                        .filter(u -> Objects.equals(u.domain.topDomain, base.domain.topDomain))
                        .ifPresent(urls::add);
            }
        });

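        // RSS <guid> values flagged as permalinks are themselves item URLs.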
doc.select("item > guid[isPermalink=true]").forEach(element -> {
|
|
|
|
var href = element.text();
|
|
|
|
if (href != null && !href.isBlank()) {
|
|
|
|
lp.parseLink(base, href)
|
2023-12-17 13:00:07 +00:00
|
|
|
.filter(u -> Objects.equals(u.domain.topDomain, base.domain.topDomain))
|
2022-05-19 15:45:26 +00:00
|
|
|
.ifPresent(urls::add);
|
|
|
|
}
|
|
|
|
});
|
|
|
|
|
|
|
|
return urls;
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|