MarginaliaSearch/code/processes/crawling-process/test/nu/marginalia/crawling/RssCrawlerTest.java
Viktor Lofgren 1d34224416 (refac) Remove src/main from all source code paths.
Look, this will make the git history look funny, but trimming unnecessary depth from the source tree is a very necessary sanity-preserving measure when dealing with a super-modularized codebase like this one.

While it makes the project configuration a bit less conventional, it will save you several clicks every time you jump between modules.  Which you'll do a lot, because it's *modul*ar.  The src/main/java convention makes a lot of sense for a non-modular project though.  This ain't that.
2024-02-23 16:13:40 +01:00

62 lines
2.0 KiB
Java

package nu.marginalia.crawling;
import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.model.EdgeUrl;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.LinkedHashSet;
import java.util.Objects;
import java.util.Set;
/**
 * Manual experiment for extracting same-domain links from an RSS/Atom feed.
 *
 * <p>The single test is disabled: it reads a feed from an absolute path on a developer
 * machine and makes no assertions, so it is only useful when run by hand.
 */
class RssCrawlerTest {

    final LinkParser lp = new LinkParser();

    /**
     * Reads a locally stored feed and runs link extraction on it.
     *
     * @throws URISyntaxException if the hard-coded base URL cannot be parsed
     * @throws IOException if the feed file cannot be read or is not valid UTF-8
     */
    @Test
    @Disabled("Reads a feed from a developer-local absolute path and makes no assertions")
    public void test() throws URISyntaxException, IOException {
        // Files.readString always decodes as UTF-8 rather than the platform default charset.
        getLinks(new EdgeUrl("https://eli.li/feed.rss"),
                Files.readString(Path.of("/home/vlofgren/Work/feed.rss")));
    }

    /**
     * Collects the links found in a feed document that share the base URL's top domain.
     *
     * <p>Three sources are consulted, in this order: the {@code href} of Atom
     * {@code entry > link[rel=alternate]} elements, the text content of every {@code link}
     * element, and the text of {@code item > guid} elements flagged as permalinks.
     *
     * @param base URL of the feed; used to resolve links and to filter by top domain
     * @param str  raw feed markup
     * @return the matching URLs in encounter order, without duplicates
     */
    private Set<EdgeUrl> getLinks(EdgeUrl base, String str) {
        // Rename only <link ...> / </link> tag tokens to "lnk". Renaming every occurrence of
        // "link" would also rewrite URLs and attribute names that merely contain the word.
        // NOTE(review): the rename presumably exists because the HTML parser treats <link> as
        // an empty element and would discard its text content -- confirm against Jsoup.
        var doc = Jsoup.parse(str.replaceAll("(</?)link(?=[\\s/>])", "$1lnk"));

        Set<EdgeUrl> urls = new LinkedHashSet<>();

        doc.select("entry > lnk[rel=alternate]")
                .forEach(element -> addIfSameTopDomain(base, element.attr("href"), urls));
        doc.getElementsByTag("lnk")
                .forEach(element -> addIfSameTopDomain(base, element.text(), urls));
        doc.select("item > guid[isPermalink=true]")
                .forEach(element -> addIfSameTopDomain(base, element.text(), urls));

        return urls;
    }

    /** Resolves {@code href} against {@code base} and keeps it only if the top domains match. */
    private void addIfSameTopDomain(EdgeUrl base, String href, Set<EdgeUrl> urls) {
        if (href == null || href.isBlank()) {
            return;
        }
        lp.parseLink(base, href)
                .filter(u -> Objects.equals(u.domain.topDomain, base.domain.topDomain))
                .ifPresent(urls::add);
    }
}