Improve the experiment runner and add a convenient start script.

This commit is contained in:
Viktor Lofgren 2023-03-30 15:40:31 +02:00
parent 0fcb2b534c
commit d0c72ceb7e
5 changed files with 141 additions and 9 deletions

View File

@ -14,4 +14,8 @@ public interface Experiment {
*
*/
void onFinish();
/**
 * Tells the caller whether this experiment wants to handle the named domain.
 * <p>
 * The default implementation accepts every domain; implementations may
 * override it to narrow the set of domains they are handed.
 *
 * @param domainName name of the domain being considered
 *                   (NOTE(review): the runner resolves this from a crawl id
 *                   via a map lookup, so it may be null when the id has no
 *                   matching specification - confirm before dereferencing)
 * @return {@code true} if the domain should be processed; always
 *         {@code true} in this default implementation
 */
default boolean isInterested(String domainName) {
return true;
}
}

View File

@ -2,12 +2,14 @@ package nu.marginalia.tools;
import com.google.inject.Guice;
import com.google.inject.Injector;
import nu.marginalia.converting.ConverterModule;
import nu.marginalia.service.module.DatabaseModule;
import nu.marginalia.tools.experiments.*;
import plan.CrawlPlanLoader;
import java.io.IOException;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.Map;
public class ExperimentRunnerMain {
@ -16,7 +18,8 @@ public class ExperimentRunnerMain {
"test", TestExperiment.class,
"adblock", AdblockExperiment.class,
"topic", TopicExperiment.class,
"statistics", SentenceStatisticsExperiment.class
"sentence-statistics", SentenceStatisticsExperiment.class,
"site-statistics", SiteStatisticsExperiment.class
);
public static void main(String... args) throws IOException {
@ -30,19 +33,26 @@ public class ExperimentRunnerMain {
return;
}
var plan = new CrawlPlanLoader().load(Path.of(args[0]));
Injector injector = Guice.createInjector(
new DatabaseModule()
new DatabaseModule(),
new ConverterModule(plan)
);
Experiment experiment = injector.getInstance(experiments.get(args[1]));
var plan = new CrawlPlanLoader().load(Path.of(args[0]));
for (var domain : plan.domainsIterable()) { // leaks file descriptor, is fine
if (!experiment.process(domain)) {
break;
}
}
Map<String, String> idToDomain = new HashMap<>();
plan.forEachCrawlingSpecification(spec -> {
idToDomain.put(spec.id, spec.domain);
});
plan.forEachCrawledDomain(
id -> experiment.isInterested(idToDomain.get(id)),
experiment::process
);
experiment.onFinish();
}

View File

@ -58,7 +58,7 @@ public class SentenceStatisticsExperiment implements Experiment {
}
double avgLength = dld.totalNumWords() / (double) numSentences;
if (avgLength < 50) {
if (avgLength < 5 && dld.totalNumWords() > 250) {
writer.printf("%s\t%d\t%f\n", doc.url, dld.totalNumWords(), avgLength);
}
}

View File

@ -0,0 +1,47 @@
package nu.marginalia.tools.experiments;
import com.google.inject.Inject;
import nu.marginalia.WmsaHome;
import nu.marginalia.adblock.GoogleAnwersSpamDetector;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.processor.DocumentProcessor;
import nu.marginalia.converting.processor.DomainProcessor;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.tools.Experiment;
import nu.marginalia.topic.RecipeDetector;
import nu.marginalia.topic.TextileCraftDetector;
import nu.marginalia.topic.WoodworkingDetector;
import org.jsoup.Jsoup;
import java.util.Comparator;
/**
 * Experiment that runs every crawled domain through the injected
 * {@link DomainProcessor} and prints one line per fully processed document
 * to standard output, in the form {@code url:metadata}, ordered by the
 * value of the document metadata's {@code topology()}.
 */
public class SiteStatisticsExperiment implements Experiment {

    /** Processor used to turn a crawled domain into processed documents. */
    private final DomainProcessor domainProcessor;

    /**
     * @param domainProcessor processor applied to each crawled domain
     */
    @Inject
    public SiteStatisticsExperiment(DomainProcessor domainProcessor) {
        this.domainProcessor = domainProcessor;
    }

    /**
     * Processes a single crawled domain and prints its fully processed
     * documents, sorted by topology.
     *
     * @param domain the crawled domain to process
     * @return always {@code true}
     */
    @Override
    public boolean process(CrawledDomain domain) {
        var processedDomain = domainProcessor.process(domain);

        processedDomain.documents.stream()
                // Only fully processed documents are reported.
                .filter(document -> document.isProcessedFully())
                .sorted(Comparator.comparing(document -> document.details.metadata.topology()))
                .map(document -> document.url + ":" + document.details.metadata)
                .forEachOrdered(System.out::println);

        return true;
    }

    /** Nothing to summarise; all output is emitted while processing. */
    @Override
    public void onFinish() {
        // Intentionally empty.
    }
}

71
run/experiment.sh Executable file
View File

@ -0,0 +1,71 @@
#!/bin/bash

# Runs an experiment from the experiment-runner tool against a sample crawl set.
#
#   $1 - name of the experiment to run (required)
#   $2 - sample selector; the sample used is "crawl-<selector>" (default: m)

# Abort on the first failing command
set -e

EXPERIMENT=$1
SAMPLE_NAME=crawl-${2:-m}
SAMPLE_DIR="samples/${SAMPLE_NAME}/"

## Configuration

# Passed to the JVM through the exported JAVA_OPTS variable
JAVA_OPTS="
-Dcrawl.rootDirRewrite=/crawl:${SAMPLE_DIR}
-Ddb.overrideJdbc=jdbc:mariadb://localhost:3306/WMSA_prod?rewriteBatchedStatements=true
"

## Configuration ends

# The experiment name is mandatory; the sample selector is optional
if [ -z "$EXPERIMENT" ]; then
  echo "Usage: $0 experiment-name [sample]" >&2
  echo "  sample selects samples/crawl-<sample> (default: m)" >&2
  exit 255;
fi
# Downloads a file unless it is already present locally.
#
#   $1 - local path the file is stored at
#   $2 - URL to fetch the file from
#
# Returns 0 when the file already exists, otherwise the exit status of wget.
function download_model {
  local model=$1
  local url=$2

  # Expansions are quoted so that a path containing whitespace or glob
  # characters is tested and written as a single word; unquoted, the test
  # errors out and the download is silently skipped.
  if [ ! -f "$model" ]; then
    echo "** Downloading $url"
    wget -O "$model" "$url"
  fi
}
## Work from the directory containing this script so relative paths resolve
pushd "$(dirname "$0")"

## Upgrade the tools

# Ensure the install directory exists; tar -C fails if it is missing
mkdir -p install/
rm -rf install/*
tar xf ../code/tools/experiment-runner/build/distributions/experiment-runner.tar -C install/

## Download the sample if necessary

# Key on plan.yaml rather than on the directory: an earlier run that failed
# during extraction leaves the directory behind without a plan, and checking
# only for the directory would then skip the sample forever.
if [ ! -f "${SAMPLE_DIR}/plan.yaml" ]; then
  mkdir -p samples/

  SAMPLE_TARBALL="samples/${SAMPLE_NAME}.tar.gz"
  # rm -f: a failed cleanup must not abort the script (set -e) before the
  # failure message below is printed
  download_model "${SAMPLE_TARBALL}" "https://downloads.marginalia.nu/${SAMPLE_TARBALL}" || rm -f "${SAMPLE_TARBALL}"

  if [ ! -f "${SAMPLE_TARBALL}" ]; then
    echo "!! Failed"
    exit 255
  fi

  mkdir -p "samples/${SAMPLE_NAME}"
  echo "Uncompressing"
  tar zxf "${SAMPLE_TARBALL}" --strip-components=1 -C "${SAMPLE_DIR}"
fi

## Put the freshly installed runner on the PATH and run the experiment
PATH+=":install/experiment-runner/bin"

export WMSA_HOME=.
export PATH
export JAVA_OPTS

experiment-runner "${SAMPLE_DIR}/plan.yaml" "${EXPERIMENT}"

popd