MarginaliaSearch/code/tools/experiment-runner/java/nu/marginalia/tools/ExperimentRunnerMain.java
Viktor Lofgren d84a2c183f (*) Remove the crawl spec abstraction
The crawl spec abstraction was used to upload lists of domains into the system for future crawling.  This was fairly clunky, and it was difficult to understand what was going to be crawled.

A while back, a new domains listing view was added to the control view that allows direct access to the domains table.  This is much preferred, as it means the operator can manage domains directly without specs.

This commit removes the crawl spec abstraction from the code, and changes the GUI to direct to the domains list instead.
2024-10-03 13:41:17 +02:00

package nu.marginalia.tools;

import com.google.inject.Guice;
import com.google.inject.Injector;
import nu.marginalia.converting.ConverterModule;
import nu.marginalia.io.CrawledDomainReader;
import nu.marginalia.process.log.WorkLog;
import nu.marginalia.service.module.DatabaseModule;
import nu.marginalia.tools.experiments.*;

import java.io.IOException;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.Map;

public class ExperimentRunnerMain {

    /** Registry mapping CLI experiment names to their implementations. */
    private static Map<String, Class<? extends Experiment>> experiments = Map.of(
            "test", TestExperiment.class,
            "adblock", AdblockExperiment.class,
            "topic", TopicExperiment.class,
            "sentence-statistics", SentenceStatisticsExperiment.class,
            "site-statistics", SiteStatisticsExperiment.class,
            "export-atags", ExportExternalLinksExperiment.class,
            "debug-converter", DebugConverterExperiment.class
    );

    public static void main(String... args) throws IOException {
        if (args.length < 2) {
            System.err.println("Expected arguments: crawl-data-path experiment-name [experiment-args]");
            return;
        }
        if (!experiments.containsKey(args[1])) {
            System.err.println("Valid experiment names: " + experiments.keySet());
            return;
        }

        Injector injector = Guice.createInjector(
                new DatabaseModule(false),
                new ConverterModule()
        );

        // Instantiate the selected experiment via Guice and hand it any
        // trailing arguments beyond the path and experiment name.
        Experiment experiment = injector.getInstance(experiments.get(args[1]));
        experiment.args(Arrays.copyOfRange(args, 2, args.length));

        // Iterate over the crawler's work log, feeding each completed
        // domain's crawl data stream to the experiment in turn.
        Path basePath = Path.of(args[0]);
        for (var item : WorkLog.iterable(basePath.resolve("crawler.log"))) {
            Path crawlDataPath = basePath.resolve(item.relPath());
            try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) {
                experiment.process(stream);
            }
            catch (Exception ex) {
                ex.printStackTrace();
            }
        }

        experiment.onFinish();
    }
}
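
The runner drives each experiment through a three-step lifecycle: args(...) receives any CLI arguments after the crawl data path and experiment name, process(...) is called once per entry in crawler.log with that domain's crawl data stream, and onFinish() runs after the work log is exhausted. The sketch below illustrates that lifecycle with a hypothetical experiment; the class name, the process return type, and the stream parameter type are assumptions inferred from how main invokes the experiment, not copied from the repository's Experiment base class.

package nu.marginalia.tools.experiments;

import nu.marginalia.io.SerializableCrawlDataStream;
import nu.marginalia.tools.Experiment;

/** Hypothetical example: counts how many domain streams the runner feeds it.
 *  The overridden signatures below are assumptions inferred from
 *  ExperimentRunnerMain, not taken from the actual Experiment base class. */
public class CountingExperiment extends Experiment {

    private int domainCount = 0;

    // Receives everything after "crawl-data-path experiment-name" on the CLI.
    @Override
    public void args(String... args) {
        // This sketch takes no extra arguments.
    }

    // Called once per work log entry with that domain's crawl data.
    @Override
    public boolean process(SerializableCrawlDataStream stream) {
        domainCount++;
        return true;
    }

    // Called once after all work log entries have been processed.
    @Override
    public void onFinish() {
        System.out.println("Saw " + domainCount + " crawled domains");
    }
}

To be usable, such a class would also need an entry in the experiments map above (say, "counting"), after which it could be run as ExperimentRunnerMain <crawl-data-path> counting.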