Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-23 21:18:58 +00:00
(experiment) Utility for exporting atags

parent fd8a5e695d
commit 81bfd7e5fb
@@ -33,6 +33,8 @@ dependencies {
     implementation project(':code:processes:converting-process')
     implementation project(':code:process-models:crawling-model')

+    implementation project(':third-party:commons-codec')
+    implementation project(':code:features-crawl:link-parser')
     implementation project(':code:features-convert:adblock')
     implementation project(':code:features-convert:topic-detection')
     implementation project(':code:features-convert:keyword-extraction')
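These two additions line up with the new experiment's imports below: link-parser provides the LinkParser used to resolve anchor hrefs, and, by inference from the import of nu.marginalia.hash.MurmurHash3_128, the third-party commons-codec module provides the hash used for de-duplication.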
@@ -24,6 +24,7 @@ public class ExperimentRunnerMain {
             "topic", TopicExperiment.class,
             "sentence-statistics", SentenceStatisticsExperiment.class,
             "site-statistics", SiteStatisticsExperiment.class,
+            "export-atags", ExportExternalLinksExperiment.class,
             "debug-converter", DebugConverterExperiment.class
     );
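The new "export-atags" entry registers the experiment under a name selectable from the command line. Below is a minimal, self-contained sketch of how such a name-to-class registry can be resolved at runtime; the class and map names are stand-ins, not the real ExperimentRunnerMain, which presumably obtains the instance through Guice rather than plain reflection.

import java.util.Map;

// Hypothetical sketch: resolve an experiment by name from a registry map
// and instantiate it. 'Experiment' here is a stand-in for the real base class.
public class RegistryLookupSketch {
    interface Experiment { boolean run(); }

    static class ExportExternalLinks implements Experiment {
        public boolean run() { return true; }
    }

    static final Map<String, Class<? extends Experiment>> EXPERIMENTS =
            Map.of("export-atags", ExportExternalLinks.class);

    public static void main(String[] args) throws Exception {
        // Look the experiment up by name; unknown names would yield null here
        var clazz = EXPERIMENTS.get("export-atags");

        // Instantiate reflectively; the real runner injects dependencies instead
        Experiment experiment = clazz.getDeclaredConstructor().newInstance();
        System.out.println("ran: " + experiment.run());
    }
}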
@@ -0,0 +1,67 @@
package nu.marginalia.tools.experiments;

import com.google.inject.Inject;
import gnu.trove.set.hash.TLongHashSet;
import lombok.SneakyThrows;
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.tools.Experiment;
import org.jsoup.Jsoup;

import java.util.Objects;

public class ExportExternalLinksExperiment extends Experiment {

    @Inject
    public ExportExternalLinksExperiment() {
    }

    private static final LinkParser linkParser = new LinkParser();
    MurmurHash3_128 hash = new MurmurHash3_128();

    @SneakyThrows
    @Override
    public boolean process(SerializableCrawlDataStream stream) {
        // Fingerprints of (link text, url) pairs already printed, for de-duplication
        TLongHashSet hashes = new TLongHashSet();

        while (stream.hasNext()) {
            if (!(stream.next() instanceof CrawledDocument doc))
                continue;
            if (null == doc.documentBody)
                continue;

            var baseUrl = new EdgeUrl(doc.url);
            var parsed = Jsoup.parse(doc.documentBody);

            for (var atag : parsed.getElementsByTag("a")) {
                String linkText = atag.text();
                if (linkText.isBlank())
                    continue;

                var linkOpt = linkParser.parseLinkPermissive(baseUrl, atag);
                linkOpt
                        // Keep only links pointing off the current domain
                        .filter(url -> !Objects.equals(url.domain, baseUrl.domain))
                        // Skip (text, url) pairs that have already been emitted
                        .filter(url -> hashes.add(hash.hashNearlyASCII(linkText) ^ hash.hashNearlyASCII(url.toString())))
                        .ifPresent(url ->
                                System.out.printf("\"%s\",\"%s\",\"%s\"\n",
                                        csvify(url),
                                        csvify(baseUrl.domain),
                                        csvify(linkText)));
            }
        }

        return true;
    }

    private static String csvify(Object field) {
        return field.toString().replace("\"", "\"\"");
    }

    @Override
    public void onFinish() {
    }
}
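Two small details of the experiment are worth isolating: csvify escapes embedded double quotes RFC 4180-style by doubling them, and the TLongHashSet stores a 64-bit fingerprint combining the link text and URL hashes so each (text, url) pair is printed at most once. A self-contained sketch of both, where String.hashCode() stands in for the repo's MurmurHash3_128.hashNearlyASCII:

import java.util.HashSet;
import java.util.Set;

// Sketch of the CSV quoting and dedup-fingerprint tricks used above.
public class CsvDedupSketch {
    static String csvify(Object field) {
        // Double embedded quotes so the field survives inside "..." in CSV
        return field.toString().replace("\"", "\"\"");
    }

    static long fingerprint(String linkText, String url) {
        // Stand-in combiner: the original XORs two independent 64-bit hashes;
        // a collision merely suppresses one duplicate-looking row
        return ((long) linkText.hashCode() << 32) ^ url.hashCode();
    }

    public static void main(String[] args) {
        Set<Long> seen = new HashSet<>();
        String text = "a \"quoted\" link";
        String url = "https://example.com/";

        // Prints: "https://example.com/","a ""quoted"" link"
        System.out.printf("\"%s\",\"%s\"%n", csvify(url), csvify(text));

        System.out.println("first time:  " + seen.add(fingerprint(text, url)));  // true
        System.out.println("second time: " + seen.add(fingerprint(text, url)));  // false
    }
}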