MarginaliaSearch/code/processes/crawling-process/test/nu/marginalia/crawling/RssCrawlerTest.java

62 lines
2.0 KiB
Java
Raw Normal View History

2023-03-04 12:19:01 +00:00
package nu.marginalia.crawling;
2022-05-19 15:45:26 +00:00
import nu.marginalia.link_parser.LinkParser;
2023-03-04 12:19:01 +00:00
import nu.marginalia.model.EdgeUrl;
2022-05-19 15:45:26 +00:00
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.LinkedHashSet;
import java.util.Objects;
import java.util.Set;
class RssCrawlerTest {
final LinkParser lp = new LinkParser();
2022-05-19 15:45:26 +00:00
@Test @Disabled
public void test() throws URISyntaxException, IOException {
getLinks(new EdgeUrl("https://eli.li/feed.rss"), new String(Files.readAllBytes(Path.of("/home/vlofgren/Work/feed.rss"))));
}
private Set<EdgeUrl> getLinks(EdgeUrl base, String str) {
var doc = Jsoup.parse(str.replaceAll("link", "lnk"));
Set<EdgeUrl> urls = new LinkedHashSet<>();
doc.select("entry > lnk[rel=alternate]").forEach(element -> {
var href = element.attr("href");
if (href != null && !href.isBlank()) {
lp.parseLink(base, href)
.filter(u -> Objects.equals(u.domain.topDomain, base.domain.topDomain))
2022-05-19 15:45:26 +00:00
.ifPresent(urls::add);
}
});
doc.getElementsByTag("lnk").forEach(element -> {
var href = element.text();
if (href != null && !href.isBlank()) {
lp.parseLink(base, href)
.filter(u -> Objects.equals(u.domain.topDomain, base.domain.topDomain))
2022-05-19 15:45:26 +00:00
.ifPresent(urls::add);
}
});
doc.select("item > guid[isPermalink=true]").forEach(element -> {
var href = element.text();
if (href != null && !href.isBlank()) {
lp.parseLink(base, href)
.filter(u -> Objects.equals(u.domain.topDomain, base.domain.topDomain))
2022-05-19 15:45:26 +00:00
.ifPresent(urls::add);
}
});
return urls;
}
}