mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
Improve experiment runner, convenient start script.
This commit is contained in:
parent
0fcb2b534c
commit
d0c72ceb7e
@ -14,4 +14,8 @@ public interface Experiment {
|
||||
*
|
||||
*/
|
||||
void onFinish();
|
||||
|
||||
default boolean isInterested(String domainName) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
@ -2,12 +2,14 @@ package nu.marginalia.tools;
|
||||
|
||||
import com.google.inject.Guice;
|
||||
import com.google.inject.Injector;
|
||||
import nu.marginalia.converting.ConverterModule;
|
||||
import nu.marginalia.service.module.DatabaseModule;
|
||||
import nu.marginalia.tools.experiments.*;
|
||||
import plan.CrawlPlanLoader;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
public class ExperimentRunnerMain {
|
||||
@ -16,7 +18,8 @@ public class ExperimentRunnerMain {
|
||||
"test", TestExperiment.class,
|
||||
"adblock", AdblockExperiment.class,
|
||||
"topic", TopicExperiment.class,
|
||||
"statistics", SentenceStatisticsExperiment.class
|
||||
"sentence-statistics", SentenceStatisticsExperiment.class,
|
||||
"site-statistics", SiteStatisticsExperiment.class
|
||||
);
|
||||
|
||||
public static void main(String... args) throws IOException {
|
||||
@ -30,19 +33,26 @@ public class ExperimentRunnerMain {
|
||||
return;
|
||||
}
|
||||
|
||||
var plan = new CrawlPlanLoader().load(Path.of(args[0]));
|
||||
|
||||
Injector injector = Guice.createInjector(
|
||||
new DatabaseModule()
|
||||
new DatabaseModule(),
|
||||
new ConverterModule(plan)
|
||||
);
|
||||
|
||||
Experiment experiment = injector.getInstance(experiments.get(args[1]));
|
||||
|
||||
var plan = new CrawlPlanLoader().load(Path.of(args[0]));
|
||||
|
||||
for (var domain : plan.domainsIterable()) { // leaks file descriptor, is fine
|
||||
if (!experiment.process(domain)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
Map<String, String> idToDomain = new HashMap<>();
|
||||
plan.forEachCrawlingSpecification(spec -> {
|
||||
idToDomain.put(spec.id, spec.domain);
|
||||
});
|
||||
|
||||
plan.forEachCrawledDomain(
|
||||
id -> experiment.isInterested(idToDomain.get(id)),
|
||||
experiment::process
|
||||
);
|
||||
|
||||
experiment.onFinish();
|
||||
|
||||
}
|
||||
|
@ -58,7 +58,7 @@ public class SentenceStatisticsExperiment implements Experiment {
|
||||
}
|
||||
|
||||
double avgLength = dld.totalNumWords() / (double) numSentences;
|
||||
if (avgLength < 50) {
|
||||
if (avgLength < 5 && dld.totalNumWords() > 250) {
|
||||
writer.printf("%s\t%d\t%f\n", doc.url, dld.totalNumWords(), avgLength);
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,47 @@
|
||||
package nu.marginalia.tools.experiments;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.adblock.GoogleAnwersSpamDetector;
|
||||
import nu.marginalia.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.converting.processor.DocumentProcessor;
|
||||
import nu.marginalia.converting.processor.DomainProcessor;
|
||||
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
|
||||
import nu.marginalia.crawling.model.CrawledDomain;
|
||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||
import nu.marginalia.model.idx.DocumentMetadata;
|
||||
import nu.marginalia.tools.Experiment;
|
||||
import nu.marginalia.topic.RecipeDetector;
|
||||
import nu.marginalia.topic.TextileCraftDetector;
|
||||
import nu.marginalia.topic.WoodworkingDetector;
|
||||
import org.jsoup.Jsoup;
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
public class SiteStatisticsExperiment implements Experiment {
|
||||
|
||||
|
||||
private final DomainProcessor domainProcessor;
|
||||
|
||||
@Inject
|
||||
public SiteStatisticsExperiment(DomainProcessor domainProcessor) {
|
||||
this.domainProcessor = domainProcessor;
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean process(CrawledDomain domain) {
|
||||
var ret = domainProcessor.process(domain);
|
||||
|
||||
ret.documents.stream()
|
||||
.filter(ProcessedDocument::isProcessedFully)
|
||||
.sorted(Comparator.comparing(doc -> doc.details.metadata.topology()))
|
||||
.forEach(doc -> System.out.println(doc.url + ":" + doc.details.metadata));
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onFinish() {
|
||||
}
|
||||
}
|
71
run/experiment.sh
Executable file
71
run/experiment.sh
Executable file
@ -0,0 +1,71 @@
|
||||
#!/bin/bash

# Runs an experiment-runner experiment against a sample crawl set.
#
# Usage: experiment.sh experiment-name [sample-size]
#
#   experiment-name  name of the experiment, passed on to experiment-runner
#   sample-size      selects the sample "crawl-<sample-size>"; defaults to "m"
#
# The script works relative to its own directory. It unpacks the built
# experiment-runner distribution into install/, fetches and extracts the
# sample into samples/ when its plan.yaml is not already present, and then
# runs the experiment against that plan.

set -e

EXPERIMENT=$1
SAMPLE_NAME=crawl-${2:-m}
SAMPLE_DIR="samples/${SAMPLE_NAME}/"

## Configuration

JAVA_OPTS="
-Dcrawl.rootDirRewrite=/crawl:${SAMPLE_DIR}
-Ddb.overrideJdbc=jdbc:mariadb://localhost:3306/WMSA_prod?rewriteBatchedStatements=true
"

## Configuration ends

if [ -z "$EXPERIMENT" ]; then
    echo "Usage: $0 experiment-name [sample-size]"
    exit 255;
fi

# Downloads $2 to the file $1 unless that file already exists.
# Returns wget's exit status when a download is attempted.
function download_model {
    local model=$1
    local url=$2

    if [ ! -f "$model" ]; then
        echo "** Downloading $url"
        wget -O "$model" "$url"
    fi
}

pushd "$(dirname "$0")"

## Upgrade the tools

# install/ may not exist on a fresh checkout; tar -C requires it to exist.
mkdir -p install/
rm -rf install/*
tar xf ../code/tools/experiment-runner/build/distributions/experiment-runner.tar -C install/

## Download the sample if necessary

# Keyed on plan.yaml rather than on the directory, so that a sample directory
# left behind by an interrupted extraction is repaired on the next run.
if [ ! -f "${SAMPLE_DIR}plan.yaml" ]; then
    mkdir -p samples/

    SAMPLE_TARBALL=samples/${SAMPLE_NAME}.tar.gz
    # wget -O leaves a file behind even when the download fails; remove it so
    # the check below reports the failure instead of extracting a bad tarball.
    download_model "${SAMPLE_TARBALL}" "https://downloads.marginalia.nu/${SAMPLE_TARBALL}" || rm -f "${SAMPLE_TARBALL}"

    if [ ! -f "${SAMPLE_TARBALL}" ]; then
        echo "!! Failed"
        exit 255
    fi

    mkdir -p "${SAMPLE_DIR}"
    echo "Uncompressing"
    tar zxf "${SAMPLE_TARBALL}" --strip-components=1 -C "${SAMPLE_DIR}"
fi

## Run the experiment

PATH+=":install/experiment-runner/bin"

export WMSA_HOME=.
export PATH

export JAVA_OPTS

experiment-runner "${SAMPLE_DIR}plan.yaml" "${EXPERIMENT}"

popd
|
Loading…
Reference in New Issue
Block a user