diff --git a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/Experiment.java b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/Experiment.java index 3fffb463..df2ad7ee 100644 --- a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/Experiment.java +++ b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/Experiment.java @@ -14,4 +14,8 @@ public interface Experiment { * */ void onFinish(); + + default boolean isInterested(String domainName) { + return true; + } } diff --git a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java index e6ff6db4..885eb071 100644 --- a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java +++ b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java @@ -2,12 +2,14 @@ package nu.marginalia.tools; import com.google.inject.Guice; import com.google.inject.Injector; +import nu.marginalia.converting.ConverterModule; import nu.marginalia.service.module.DatabaseModule; import nu.marginalia.tools.experiments.*; import plan.CrawlPlanLoader; import java.io.IOException; import java.nio.file.Path; +import java.util.HashMap; import java.util.Map; public class ExperimentRunnerMain { @@ -16,7 +18,8 @@ public class ExperimentRunnerMain { "test", TestExperiment.class, "adblock", AdblockExperiment.class, "topic", TopicExperiment.class, - "statistics", SentenceStatisticsExperiment.class + "sentence-statistics", SentenceStatisticsExperiment.class, + "site-statistics", SiteStatisticsExperiment.class ); public static void main(String... args) throws IOException { @@ -30,19 +33,26 @@ public class ExperimentRunnerMain { return; } + var plan = new CrawlPlanLoader().load(Path.of(args[0])); + Injector injector = Guice.createInjector( - new DatabaseModule() + new DatabaseModule(), + new ConverterModule(plan) ); Experiment experiment = injector.getInstance(experiments.get(args[1])); - var plan = new CrawlPlanLoader().load(Path.of(args[0])); - for (var domain : plan.domainsIterable()) { // leaks file descriptor, is fine - if (!experiment.process(domain)) { - break; - } - } + Map idToDomain = new HashMap<>(); + plan.forEachCrawlingSpecification(spec -> { + idToDomain.put(spec.id, spec.domain); + }); + + plan.forEachCrawledDomain( + id -> experiment.isInterested(idToDomain.get(id)), + experiment::process + ); + experiment.onFinish(); } diff --git a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java index 21eda145..f15599a4 100644 --- a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java +++ b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java @@ -58,7 +58,7 @@ public class SentenceStatisticsExperiment implements Experiment { } double avgLength = dld.totalNumWords() / (double) numSentences; - if (avgLength < 50) { + if (avgLength < 5 && dld.totalNumWords() > 250) { writer.printf("%s\t%d\t%f\n", doc.url, dld.totalNumWords(), avgLength); } } diff --git a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/SiteStatisticsExperiment.java b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/SiteStatisticsExperiment.java new file mode 100644 index 00000000..64a2bdf4 --- /dev/null +++ b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/SiteStatisticsExperiment.java @@ -0,0 +1,47 @@ +package nu.marginalia.tools.experiments; + +import com.google.inject.Inject; +import nu.marginalia.WmsaHome; +import nu.marginalia.adblock.GoogleAnwersSpamDetector; +import nu.marginalia.converting.model.ProcessedDocument; +import nu.marginalia.converting.processor.DocumentProcessor; +import nu.marginalia.converting.processor.DomainProcessor; +import nu.marginalia.converting.processor.logic.dom.DomPruningFilter; +import nu.marginalia.crawling.model.CrawledDomain; +import nu.marginalia.language.sentence.SentenceExtractor; +import nu.marginalia.model.idx.DocumentMetadata; +import nu.marginalia.tools.Experiment; +import nu.marginalia.topic.RecipeDetector; +import nu.marginalia.topic.TextileCraftDetector; +import nu.marginalia.topic.WoodworkingDetector; +import org.jsoup.Jsoup; + +import java.util.Comparator; + +public class SiteStatisticsExperiment implements Experiment { + + + private final DomainProcessor domainProcessor; + + @Inject + public SiteStatisticsExperiment(DomainProcessor domainProcessor) { + this.domainProcessor = domainProcessor; + + } + + @Override + public boolean process(CrawledDomain domain) { + var ret = domainProcessor.process(domain); + + ret.documents.stream() + .filter(ProcessedDocument::isProcessedFully) + .sorted(Comparator.comparing(doc -> doc.details.metadata.topology())) + .forEach(doc -> System.out.println(doc.url + ":" + doc.details.metadata)); + + return true; + } + + @Override + public void onFinish() { + } +} diff --git a/run/experiment.sh b/run/experiment.sh new file mode 100755 index 00000000..970702a9 --- /dev/null +++ b/run/experiment.sh @@ -0,0 +1,71 @@ +#!/bin/bash + +set -e + +EXPERIMENT=$1 +SAMPLE_NAME=crawl-${2:-m} +SAMPLE_DIR="samples/${SAMPLE_NAME}/" + +## Configuration + +JAVA_OPTS=" +-Dcrawl.rootDirRewrite=/crawl:${SAMPLE_DIR} +-Ddb.overrideJdbc=jdbc:mariadb://localhost:3306/WMSA_prod?rewriteBatchedStatements=true +" + +## Configuration ends + +if [ -z "$EXPERIMENT" ]; then + echo "Usage: $0 experiment-name" + exit 255; +fi + +function download_model { + model=$1 + url=$2 + + if [ ! -f $model ]; then + echo "** Downloading $url" + wget -O $model $url + fi +} + +pushd $(dirname $0) + +## Upgrade the tools + +rm -rf install/* +tar xf ../code/tools/experiment-runner/build/distributions/experiment-runner.tar -C install/ + +## Download the sample if necessary + +if [ ! -d ${SAMPLE_DIR} ]; then + mkdir -p samples/ + + SAMPLE_TARBALL=samples/${SAMPLE_NAME}.tar.gz + download_model ${SAMPLE_TARBALL} https://downloads.marginalia.nu/${SAMPLE_TARBALL} || rm ${SAMPLE_TARBALL} + + if [ ! -f ${SAMPLE_TARBALL} ]; then + echo "!! Failed" + exit 255 + fi + + mkdir -p samples/${SAMPLE_NAME} + if [ ! -f $SAMPLE_DIR/plan.yaml ]; then + echo "Uncompressing" + tar zxf ${SAMPLE_TARBALL} --strip-components=1 -C ${SAMPLE_DIR} + fi +fi + +## Wipe the old index data + +PATH+=":install/experiment-runner/bin" + +export WMSA_HOME=. +export PATH + +export JAVA_OPTS + +experiment-runner ${SAMPLE_DIR}/plan.yaml ${EXPERIMENT} + +popd