package nu.marginalia.crawling;

import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.model.EdgeUrl;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;

import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.LinkedHashSet;
import java.util.Objects;
import java.util.Set;

class RssCrawlerTest {

    final LinkParser lp = new LinkParser();

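    // Disabled by default: reads a local copy of the feed from a hard-coded developer path.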
    @Test @Disabled
    public void test() throws URISyntaxException, IOException {
        getLinks(new EdgeUrl("https://eli.li/feed.rss"), new String(Files.readAllBytes(Path.of("/home/vlofgren/Work/feed.rss"))));
    }

    private Set<EdgeUrl> getLinks(EdgeUrl base, String str) {
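        // Jsoup's HTML parser treats <link> as a void element and would drop the URL held in its
        // text content, so every occurrence of "link" is renamed to "lnk" before parsing the feed.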
        var doc = Jsoup.parse(str.replaceAll("link", "lnk"));

        Set<EdgeUrl> urls = new LinkedHashSet<>();

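        // Atom-style entries: the item URL is in the href attribute of <link rel="alternate">.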
doc.select("entry > lnk[rel=alternate]").forEach(element -> {
|
|
|
|
var href = element.attr("href");
|
|
|
|
if (href != null && !href.isBlank()) {
|
|
|
|
lp.parseLink(base, href)
|
2023-12-17 13:00:07 +00:00
|
|
|
.filter(u -> Objects.equals(u.domain.topDomain, base.domain.topDomain))
|
2022-05-19 15:45:26 +00:00
|
|
|
.ifPresent(urls::add);
|
|
|
|
}
|
|
|
|
});
|
|
|
|
|
|
|
|
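        // Any other <link> element (RSS items, channel links) carries its URL as text content.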
        doc.getElementsByTag("lnk").forEach(element -> {
            var href = element.text();
            if (href != null && !href.isBlank()) {
                lp.parseLink(base, href)
                        .filter(u -> Objects.equals(u.domain.topDomain, base.domain.topDomain))
                        .ifPresent(urls::add);
            }
        });

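        // RSS <guid> values flagged as permalinks are themselves item URLs.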
doc.select("item > guid[isPermalink=true]").forEach(element -> {
|
|
|
|
var href = element.text();
|
|
|
|
if (href != null && !href.isBlank()) {
|
|
|
|
lp.parseLink(base, href)
|
2023-12-17 13:00:07 +00:00
|
|
|
.filter(u -> Objects.equals(u.domain.topDomain, base.domain.topDomain))
|
2022-05-19 15:45:26 +00:00
|
|
|
.ifPresent(urls::add);
|
|
|
|
}
|
|
|
|
});
|
|
|
|
|
|
|
|
return urls;
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|