mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
(experiment) Utility for exporting atags
This commit is contained in:
parent
fd8a5e695d
commit
81bfd7e5fb
@ -33,6 +33,8 @@ dependencies {
|
|||||||
implementation project(':code:processes:converting-process')
|
implementation project(':code:processes:converting-process')
|
||||||
implementation project(':code:process-models:crawling-model')
|
implementation project(':code:process-models:crawling-model')
|
||||||
|
|
||||||
|
implementation project(':third-party:commons-codec')
|
||||||
|
implementation project(':code:features-crawl:link-parser')
|
||||||
implementation project(':code:features-convert:adblock')
|
implementation project(':code:features-convert:adblock')
|
||||||
implementation project(':code:features-convert:topic-detection')
|
implementation project(':code:features-convert:topic-detection')
|
||||||
implementation project(':code:features-convert:keyword-extraction')
|
implementation project(':code:features-convert:keyword-extraction')
|
||||||
|
@ -24,6 +24,7 @@ public class ExperimentRunnerMain {
|
|||||||
"topic", TopicExperiment.class,
|
"topic", TopicExperiment.class,
|
||||||
"sentence-statistics", SentenceStatisticsExperiment.class,
|
"sentence-statistics", SentenceStatisticsExperiment.class,
|
||||||
"site-statistics", SiteStatisticsExperiment.class,
|
"site-statistics", SiteStatisticsExperiment.class,
|
||||||
|
"export-atags", ExportExternalLinksExperiment.class,
|
||||||
"debug-converter", DebugConverterExperiment.class
|
"debug-converter", DebugConverterExperiment.class
|
||||||
);
|
);
|
||||||
|
|
||||||
|
@ -0,0 +1,67 @@
|
|||||||
|
package nu.marginalia.tools.experiments;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import gnu.trove.set.hash.TLongHashSet;
|
||||||
|
import lombok.SneakyThrows;
|
||||||
|
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
|
||||||
|
import nu.marginalia.crawling.model.CrawledDocument;
|
||||||
|
import nu.marginalia.hash.MurmurHash3_128;
|
||||||
|
import nu.marginalia.link_parser.LinkParser;
|
||||||
|
import nu.marginalia.model.EdgeUrl;
|
||||||
|
import nu.marginalia.tools.Experiment;
|
||||||
|
import org.jsoup.Jsoup;
|
||||||
|
|
||||||
|
import java.util.Objects;
|
||||||
|
|
||||||
|
public class ExportExternalLinksExperiment extends Experiment {
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public ExportExternalLinksExperiment() {
|
||||||
|
|
||||||
|
}
|
||||||
|
private static final LinkParser linkParser = new LinkParser();
|
||||||
|
MurmurHash3_128 hash = new MurmurHash3_128();
|
||||||
|
@SneakyThrows
|
||||||
|
@Override
|
||||||
|
public boolean process(SerializableCrawlDataStream stream) {
|
||||||
|
TLongHashSet hashes = new TLongHashSet();
|
||||||
|
|
||||||
|
while (stream.hasNext()) {
|
||||||
|
if (!(stream.next() instanceof CrawledDocument doc))
|
||||||
|
continue;
|
||||||
|
if (null == doc.documentBody)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
var baseUrl = new EdgeUrl(doc.url);
|
||||||
|
var parsed = Jsoup.parse(doc.documentBody);
|
||||||
|
|
||||||
|
for (var atag : parsed.getElementsByTag("a")) {
|
||||||
|
String linkText = atag.text();
|
||||||
|
if (linkText.isBlank())
|
||||||
|
continue;
|
||||||
|
|
||||||
|
var linkOpt = linkParser.parseLinkPermissive(baseUrl, atag);
|
||||||
|
linkOpt
|
||||||
|
.filter(url -> !Objects.equals(url.domain, baseUrl.domain))
|
||||||
|
.filter(url -> hashes.add(hash.hashNearlyASCII(linkText) ^ hash.hashNearlyASCII(url.toString())))
|
||||||
|
.ifPresent(url ->
|
||||||
|
System.out.printf("\"%s\",\"%s\",\"%s\"\n",
|
||||||
|
csvify(url),
|
||||||
|
csvify(baseUrl.domain),
|
||||||
|
csvify(linkText)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String csvify(Object field) {
|
||||||
|
return field.toString().replace("\"", "\"\"");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void onFinish() {
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user