(experiment) Utility for exporting atags

This commit is contained in:
Viktor Lofgren 2023-10-31 16:10:21 +01:00
parent fd8a5e695d
commit 81bfd7e5fb
3 changed files with 70 additions and 0 deletions

View File

@ -33,6 +33,8 @@ dependencies {
implementation project(':code:processes:converting-process')
implementation project(':code:process-models:crawling-model')
implementation project(':third-party:commons-codec')
implementation project(':code:features-crawl:link-parser')
implementation project(':code:features-convert:adblock')
implementation project(':code:features-convert:topic-detection')
implementation project(':code:features-convert:keyword-extraction')

View File

@ -24,6 +24,7 @@ public class ExperimentRunnerMain {
"topic", TopicExperiment.class,
"sentence-statistics", SentenceStatisticsExperiment.class,
"site-statistics", SiteStatisticsExperiment.class,
"export-atags", ExportExternalLinksExperiment.class,
"debug-converter", DebugConverterExperiment.class
);

View File

@ -0,0 +1,67 @@
package nu.marginalia.tools.experiments;
import com.google.inject.Inject;
import gnu.trove.set.hash.TLongHashSet;
import lombok.SneakyThrows;
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.tools.Experiment;
import org.jsoup.Jsoup;
import java.util.Objects;
/**
 * Experiment that prints the external anchor tags found in crawled documents
 * to standard output as CSV.
 *
 * <p>Each emitted row has three quoted columns: the link's target URL, the
 * domain of the document the link was found in, and the anchor text. Links
 * with blank anchor text and links pointing to the document's own domain are
 * skipped. Within a single call to {@link #process}, each distinct
 * (anchor text, URL) pair is printed at most once, subject to 64-bit hash
 * collisions.
 */
public class ExportExternalLinksExperiment extends Experiment {

    private static final LinkParser linkParser = new LinkParser();

    private final MurmurHash3_128 hash = new MurmurHash3_128();

    @Inject
    public ExportExternalLinksExperiment() {
    }

    /**
     * Reads every {@link CrawledDocument} from the stream and prints its
     * external links as CSV rows.
     *
     * <p>NOTE(review): any exception raised while handling a document
     * (for example while constructing the {@link EdgeUrl}) propagates via
     * {@code @SneakyThrows} and ends processing of the remaining stream.
     *
     * @param stream the crawl data to scan; entries that are not
     *               {@link CrawledDocument}s, or that have no body, are ignored
     * @return always {@code true}
     */
    @SneakyThrows
    @Override
    public boolean process(SerializableCrawlDataStream stream) {
        // Keys of the (anchor text, URL) pairs already printed for this stream.
        TLongHashSet hashes = new TLongHashSet();

        while (stream.hasNext()) {
            if (!(stream.next() instanceof CrawledDocument doc))
                continue;
            if (null == doc.documentBody)
                continue;

            var baseUrl = new EdgeUrl(doc.url);
            var parsed = Jsoup.parse(doc.documentBody);

            for (var atag : parsed.getElementsByTag("a")) {
                String linkText = atag.text();
                if (linkText.isBlank())
                    continue;

                var linkOpt = linkParser.parseLinkPermissive(baseUrl, atag);
                linkOpt
                        // Only links that leave the current domain are of interest.
                        .filter(url -> !Objects.equals(url.domain, baseUrl.domain))
                        .ifPresent(url -> {
                            String urlString = url.toString();

                            // add() returns false when the key is already present,
                            // i.e. this pair has been printed before.
                            if (hashes.add(dedupKey(linkText, urlString))) {
                                System.out.printf("\"%s\",\"%s\",\"%s\"\n",
                                        csvify(urlString),
                                        csvify(baseUrl.domain),
                                        csvify(linkText));
                            }
                        });
            }
        }

        return true;
    }

    /**
     * Builds the de-duplication key for an (anchor text, URL) pair.
     *
     * <p>A plain XOR of the two hashes is not usable here: when the anchor
     * text is the URL itself the two hashes are equal and cancel out to 0, so
     * every such link would share one key and only the first would ever be
     * exported. Rotating one operand before combining breaks that symmetry.
     */
    private long dedupKey(String linkText, String urlString) {
        return Long.rotateLeft(hash.hashNearlyASCII(linkText), 31)
                ^ hash.hashNearlyASCII(urlString);
    }

    /**
     * Escapes a value for use inside a double-quoted CSV field by doubling
     * any embedded double quotes. The surrounding quotes are added by the caller.
     */
    private static String csvify(Object field) {
        return field.toString().replace("\"", "\"\"");
    }

    @Override
    public void onFinish() {
        // Nothing to finalise; rows are written as they are found.
    }
}