mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
Improve experiment runner, convenient start script.
This commit is contained in:
parent
0fcb2b534c
commit
d0c72ceb7e
@ -14,4 +14,8 @@ public interface Experiment {
|
|||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
void onFinish();
|
void onFinish();
|
||||||
|
|
||||||
|
default boolean isInterested(String domainName) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -2,12 +2,14 @@ package nu.marginalia.tools;
|
|||||||
|
|
||||||
import com.google.inject.Guice;
|
import com.google.inject.Guice;
|
||||||
import com.google.inject.Injector;
|
import com.google.inject.Injector;
|
||||||
|
import nu.marginalia.converting.ConverterModule;
|
||||||
import nu.marginalia.service.module.DatabaseModule;
|
import nu.marginalia.service.module.DatabaseModule;
|
||||||
import nu.marginalia.tools.experiments.*;
|
import nu.marginalia.tools.experiments.*;
|
||||||
import plan.CrawlPlanLoader;
|
import plan.CrawlPlanLoader;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
|
import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
public class ExperimentRunnerMain {
|
public class ExperimentRunnerMain {
|
||||||
@ -16,7 +18,8 @@ public class ExperimentRunnerMain {
|
|||||||
"test", TestExperiment.class,
|
"test", TestExperiment.class,
|
||||||
"adblock", AdblockExperiment.class,
|
"adblock", AdblockExperiment.class,
|
||||||
"topic", TopicExperiment.class,
|
"topic", TopicExperiment.class,
|
||||||
"statistics", SentenceStatisticsExperiment.class
|
"sentence-statistics", SentenceStatisticsExperiment.class,
|
||||||
|
"site-statistics", SiteStatisticsExperiment.class
|
||||||
);
|
);
|
||||||
|
|
||||||
public static void main(String... args) throws IOException {
|
public static void main(String... args) throws IOException {
|
||||||
@ -30,19 +33,26 @@ public class ExperimentRunnerMain {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var plan = new CrawlPlanLoader().load(Path.of(args[0]));
|
||||||
|
|
||||||
Injector injector = Guice.createInjector(
|
Injector injector = Guice.createInjector(
|
||||||
new DatabaseModule()
|
new DatabaseModule(),
|
||||||
|
new ConverterModule(plan)
|
||||||
);
|
);
|
||||||
|
|
||||||
Experiment experiment = injector.getInstance(experiments.get(args[1]));
|
Experiment experiment = injector.getInstance(experiments.get(args[1]));
|
||||||
|
|
||||||
var plan = new CrawlPlanLoader().load(Path.of(args[0]));
|
|
||||||
|
|
||||||
for (var domain : plan.domainsIterable()) { // leaks file descriptor, is fine
|
Map<String, String> idToDomain = new HashMap<>();
|
||||||
if (!experiment.process(domain)) {
|
plan.forEachCrawlingSpecification(spec -> {
|
||||||
break;
|
idToDomain.put(spec.id, spec.domain);
|
||||||
}
|
});
|
||||||
}
|
|
||||||
|
plan.forEachCrawledDomain(
|
||||||
|
id -> experiment.isInterested(idToDomain.get(id)),
|
||||||
|
experiment::process
|
||||||
|
);
|
||||||
|
|
||||||
experiment.onFinish();
|
experiment.onFinish();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -58,7 +58,7 @@ public class SentenceStatisticsExperiment implements Experiment {
|
|||||||
}
|
}
|
||||||
|
|
||||||
double avgLength = dld.totalNumWords() / (double) numSentences;
|
double avgLength = dld.totalNumWords() / (double) numSentences;
|
||||||
if (avgLength < 50) {
|
if (avgLength < 5 && dld.totalNumWords() > 250) {
|
||||||
writer.printf("%s\t%d\t%f\n", doc.url, dld.totalNumWords(), avgLength);
|
writer.printf("%s\t%d\t%f\n", doc.url, dld.totalNumWords(), avgLength);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,47 @@
|
|||||||
|
package nu.marginalia.tools.experiments;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import nu.marginalia.WmsaHome;
|
||||||
|
import nu.marginalia.adblock.GoogleAnwersSpamDetector;
|
||||||
|
import nu.marginalia.converting.model.ProcessedDocument;
|
||||||
|
import nu.marginalia.converting.processor.DocumentProcessor;
|
||||||
|
import nu.marginalia.converting.processor.DomainProcessor;
|
||||||
|
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
|
||||||
|
import nu.marginalia.crawling.model.CrawledDomain;
|
||||||
|
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||||
|
import nu.marginalia.model.idx.DocumentMetadata;
|
||||||
|
import nu.marginalia.tools.Experiment;
|
||||||
|
import nu.marginalia.topic.RecipeDetector;
|
||||||
|
import nu.marginalia.topic.TextileCraftDetector;
|
||||||
|
import nu.marginalia.topic.WoodworkingDetector;
|
||||||
|
import org.jsoup.Jsoup;
|
||||||
|
|
||||||
|
import java.util.Comparator;
|
||||||
|
|
||||||
|
public class SiteStatisticsExperiment implements Experiment {
|
||||||
|
|
||||||
|
|
||||||
|
private final DomainProcessor domainProcessor;
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public SiteStatisticsExperiment(DomainProcessor domainProcessor) {
|
||||||
|
this.domainProcessor = domainProcessor;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean process(CrawledDomain domain) {
|
||||||
|
var ret = domainProcessor.process(domain);
|
||||||
|
|
||||||
|
ret.documents.stream()
|
||||||
|
.filter(ProcessedDocument::isProcessedFully)
|
||||||
|
.sorted(Comparator.comparing(doc -> doc.details.metadata.topology()))
|
||||||
|
.forEach(doc -> System.out.println(doc.url + ":" + doc.details.metadata));
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void onFinish() {
|
||||||
|
}
|
||||||
|
}
|
71
run/experiment.sh
Executable file
71
run/experiment.sh
Executable file
@ -0,0 +1,71 @@
|
|||||||
|
#!/bin/bash

# Runs a named experiment against a sample crawl data set.
#
#   $1  experiment name (required); handed to experiment-runner
#   $2  sample selector (optional, default "m"); the sample directory
#       used is samples/crawl-<selector>/

set -e

EXPERIMENT=$1
SAMPLE_NAME=crawl-${2:-m}
SAMPLE_DIR="samples/${SAMPLE_NAME}/"

## Configuration

JAVA_OPTS="
-Dcrawl.rootDirRewrite=/crawl:${SAMPLE_DIR}
-Ddb.overrideJdbc=jdbc:mariadb://localhost:3306/WMSA_prod?rewriteBatchedStatements=true
"

## Configuration ends

if [ -z "$EXPERIMENT" ]; then
  # Usage errors belong on stderr; also advertise the optional second argument.
  echo "Usage: $0 experiment-name [sample]" >&2
  exit 255
fi
|
||||||
|
|
||||||
|
# Downloads a file unless it is already present locally.
#   $1  local path to store the file at
#   $2  URL to fetch the file from
# Returns wget's exit status when a download is attempted, 0 otherwise.
function download_model {
  # Keep the working variables out of the caller's scope.
  local model=$1
  local url=$2

  # Quote the expansions so paths or URLs containing spaces or glob
  # characters are passed through as single arguments.
  if [ ! -f "$model" ]; then
    echo "** Downloading $url"
    wget -O "$model" "$url"
  fi
}
|
||||||
|
|
||||||
|
# Work from the directory containing this script so the relative paths
# below resolve; quoted so a path containing spaces does not split.
pushd "$(dirname "$0")"

## Upgrade the tools

# tar -C fails if the target directory is missing, so create it first.
mkdir -p install
rm -rf install/*
tar xf ../code/tools/experiment-runner/build/distributions/experiment-runner.tar -C install/

## Download the sample if necessary

# Gate on plan.yaml rather than on the directory itself: a directory left
# behind by an interrupted extraction would otherwise never be repaired.
if [ ! -f "${SAMPLE_DIR%/}/plan.yaml" ]; then
  mkdir -p samples/

  SAMPLE_TARBALL="samples/${SAMPLE_NAME}.tar.gz"
  # On a failed download remove the partial file; -f keeps `set -e` from
  # aborting here if nothing was written.
  download_model "${SAMPLE_TARBALL}" "https://downloads.marginalia.nu/${SAMPLE_TARBALL}" || rm -f "${SAMPLE_TARBALL}"

  if [ ! -f "${SAMPLE_TARBALL}" ]; then
    echo "!! Failed"
    exit 255
  fi

  mkdir -p "samples/${SAMPLE_NAME}"
  echo "Uncompressing"
  tar zxf "${SAMPLE_TARBALL}" --strip-components=1 -C "${SAMPLE_DIR}"
fi

## Put the unpacked tools on the PATH and run the experiment

PATH+=":install/experiment-runner/bin"

export WMSA_HOME=.
export PATH

export JAVA_OPTS

experiment-runner "${SAMPLE_DIR%/}/plan.yaml" "${EXPERIMENT}"

popd
|
Loading…
Reference in New Issue
Block a user