From 81bfd7e5fb88cd1804e859b565d1abd574a4e091 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 31 Oct 2023 16:10:21 +0100 Subject: [PATCH] (experiment) Utility for exporting atags --- code/tools/experiment-runner/build.gradle | 2 + .../tools/ExperimentRunnerMain.java | 1 + .../ExportExternalLinksExperiment.java | 67 +++++++++++++++++++ 3 files changed, 70 insertions(+) create mode 100644 code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/ExportExternalLinksExperiment.java diff --git a/code/tools/experiment-runner/build.gradle b/code/tools/experiment-runner/build.gradle index 800ee336..e308b8e5 100644 --- a/code/tools/experiment-runner/build.gradle +++ b/code/tools/experiment-runner/build.gradle @@ -33,6 +33,8 @@ dependencies { implementation project(':code:processes:converting-process') implementation project(':code:process-models:crawling-model') + implementation project(':third-party:commons-codec') + implementation project(':code:features-crawl:link-parser') implementation project(':code:features-convert:adblock') implementation project(':code:features-convert:topic-detection') implementation project(':code:features-convert:keyword-extraction') diff --git a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java index 0cc5c9be..9997ce71 100644 --- a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java +++ b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java @@ -24,6 +24,7 @@ public class ExperimentRunnerMain { "topic", TopicExperiment.class, "sentence-statistics", SentenceStatisticsExperiment.class, "site-statistics", SiteStatisticsExperiment.class, + "export-atags", ExportExternalLinksExperiment.class, "debug-converter", DebugConverterExperiment.class ); diff --git 
a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/ExportExternalLinksExperiment.java b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/ExportExternalLinksExperiment.java
new file mode 100644
index 00000000..f602a837
--- /dev/null
+++ b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/ExportExternalLinksExperiment.java
@@ -0,0 +1,99 @@
+package nu.marginalia.tools.experiments;
+
+import com.google.inject.Inject;
+import gnu.trove.set.hash.TLongHashSet;
+import lombok.SneakyThrows;
+import nu.marginalia.crawling.io.SerializableCrawlDataStream;
+import nu.marginalia.crawling.model.CrawledDocument;
+import nu.marginalia.hash.MurmurHash3_128;
+import nu.marginalia.link_parser.LinkParser;
+import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.tools.Experiment;
+import org.jsoup.Jsoup;
+
+import java.util.Objects;
+
+/**
+ * Experiment that prints links pointing to other domains as CSV rows on standard output.
+ *
+ * <p>Each row holds three quoted fields: the destination URL, the domain of the document
+ * containing the link, and the link text. Within one call to {@link #process}, a given
+ * (link text, destination URL) pair is printed at most once.
+ */
+public class ExportExternalLinksExperiment extends Experiment {
+
+    /** Odd multiplier that makes {@link #fingerprint} sensitive to the order of its inputs. */
+    private static final long HASH_MIX = 0x9E3779B97F4A7C15L;
+
+    private static final LinkParser linkParser = new LinkParser();
+
+    private final MurmurHash3_128 hash = new MurmurHash3_128();
+
+    @Inject
+    public ExportExternalLinksExperiment() {
+    }
+
+    /**
+     * Reads every document in {@code stream} and prints one CSV row for each distinct link
+     * whose destination domain differs from the domain of the document it appears in.
+     *
+     * <p>Non-document records, documents without a body and links with blank text are skipped.
+     *
+     * @param stream the crawl data to scan
+     * @return always {@code true}
+     */
+    @SneakyThrows
+    @Override
+    public boolean process(SerializableCrawlDataStream stream) {
+        // Fingerprints of the (link text, URL) pairs already printed for this stream.
+        TLongHashSet seen = new TLongHashSet();
+
+        while (stream.hasNext()) {
+            if (!(stream.next() instanceof CrawledDocument doc))
+                continue;
+            if (null == doc.documentBody)
+                continue;
+
+            var baseUrl = new EdgeUrl(doc.url);
+            var parsed = Jsoup.parse(doc.documentBody);
+
+            for (var atag : parsed.getElementsByTag("a")) {
+                String linkText = atag.text();
+                if (linkText.isBlank())
+                    continue;
+
+                linkParser.parseLinkPermissive(baseUrl, atag)
+                        .filter(url -> !Objects.equals(url.domain, baseUrl.domain))
+                        // TLongHashSet.add returns false for a value that is already present.
+                        .filter(url -> seen.add(fingerprint(linkText, url.toString())))
+                        .ifPresent(url ->
+                                System.out.printf("\"%s\",\"%s\",\"%s\"\n",
+                                        csvify(url),
+                                        csvify(baseUrl.domain),
+                                        csvify(linkText)));
+            }
+        }
+
+        return true;
+    }
+
+    /**
+     * Combines the hashes of the link text and the URL into one de-duplication key.
+     *
+     * <p>A plain XOR of the two hashes is avoided on purpose: it yields 0 whenever the link
+     * text equals the URL string, so every such link would collide with every other one, and
+     * it cannot tell (a, b) from (b, a).
+     */
+    private long fingerprint(String linkText, String url) {
+        return hash.hashNearlyASCII(linkText) * HASH_MIX + hash.hashNearlyASCII(url);
+    }
+
+    /** Renders {@code field} with {@code toString()} and doubles every double quote in it. */
+    private static String csvify(Object field) {
+        return field.toString().replace("\"", "\"\"");
+    }
+
+    @Override
+    public void onFinish() {
+    }
+}