mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 21:29:00 +00:00

Look, this will make the git history look funny, but trimming unnecessary depth from the source tree is a very necessary sanity-preserving measure when dealing with a super-modularized codebase like this one. While it makes the project configuration a bit less conventional, it will save you several clicks every time you jump between modules. Which you'll do a lot, because it's *modul*ar. The src/main/java convention makes a lot of sense for a non-modular project though. This ain't that.
62 lines
2.0 KiB
Java
62 lines
2.0 KiB
Java
package nu.marginalia.crawling;
|
|
|
|
import nu.marginalia.link_parser.LinkParser;
|
|
import nu.marginalia.model.EdgeUrl;
|
|
import org.jsoup.Jsoup;
|
|
import org.junit.jupiter.api.Disabled;
|
|
import org.junit.jupiter.api.Test;
|
|
|
|
import java.io.IOException;
|
|
import java.net.URISyntaxException;
|
|
import java.nio.file.Files;
|
|
import java.nio.file.Path;
|
|
import java.util.LinkedHashSet;
|
|
import java.util.Objects;
|
|
import java.util.Set;
|
|
|
|
class RssCrawlerTest {
|
|
|
|
final LinkParser lp = new LinkParser();
|
|
|
|
@Test @Disabled
|
|
public void test() throws URISyntaxException, IOException {
|
|
getLinks(new EdgeUrl("https://eli.li/feed.rss"), new String(Files.readAllBytes(Path.of("/home/vlofgren/Work/feed.rss"))));
|
|
}
|
|
|
|
private Set<EdgeUrl> getLinks(EdgeUrl base, String str) {
|
|
|
|
var doc = Jsoup.parse(str.replaceAll("link", "lnk"));
|
|
|
|
Set<EdgeUrl> urls = new LinkedHashSet<>();
|
|
|
|
doc.select("entry > lnk[rel=alternate]").forEach(element -> {
|
|
var href = element.attr("href");
|
|
if (href != null && !href.isBlank()) {
|
|
lp.parseLink(base, href)
|
|
.filter(u -> Objects.equals(u.domain.topDomain, base.domain.topDomain))
|
|
.ifPresent(urls::add);
|
|
}
|
|
});
|
|
|
|
doc.getElementsByTag("lnk").forEach(element -> {
|
|
var href = element.text();
|
|
if (href != null && !href.isBlank()) {
|
|
lp.parseLink(base, href)
|
|
.filter(u -> Objects.equals(u.domain.topDomain, base.domain.topDomain))
|
|
.ifPresent(urls::add);
|
|
}
|
|
});
|
|
|
|
doc.select("item > guid[isPermalink=true]").forEach(element -> {
|
|
var href = element.text();
|
|
if (href != null && !href.isBlank()) {
|
|
lp.parseLink(base, href)
|
|
.filter(u -> Objects.equals(u.domain.topDomain, base.domain.topDomain))
|
|
.ifPresent(urls::add);
|
|
}
|
|
});
|
|
|
|
return urls;
|
|
}
|
|
|
|
} |