mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 13:19:02 +00:00

The crawl spec abstraction was used to upload lists of domains into the system for future crawling. It was fairly clunky, and it made it difficult to understand what was going to be crawled. Some time ago, a new domains listing view was added to the control view, giving direct access to the domains table. This is much preferred, as it lets the operator manage domains directly without specs. This commit removes the crawl spec abstraction from the code and changes the GUI to point to the domains list instead.
61 lines
2.1 KiB
Java
61 lines
2.1 KiB
Java
package nu.marginalia.tools;
|
|
|
|
import com.google.inject.Guice;
|
|
import com.google.inject.Injector;
|
|
import nu.marginalia.converting.ConverterModule;
|
|
import nu.marginalia.io.CrawledDomainReader;
|
|
import nu.marginalia.process.log.WorkLog;
|
|
import nu.marginalia.service.module.DatabaseModule;
|
|
import nu.marginalia.tools.experiments.*;
|
|
|
|
import java.io.IOException;
|
|
import java.nio.file.Path;
|
|
import java.util.Arrays;
|
|
import java.util.Map;
|
|
|
|
public class ExperimentRunnerMain {
|
|
|
|
private static Map<String, Class<? extends Experiment>> experiments = Map.of(
|
|
"test", TestExperiment.class,
|
|
"adblock", AdblockExperiment.class,
|
|
"topic", TopicExperiment.class,
|
|
"sentence-statistics", SentenceStatisticsExperiment.class,
|
|
"site-statistics", SiteStatisticsExperiment.class,
|
|
"export-atags", ExportExternalLinksExperiment.class,
|
|
"debug-converter", DebugConverterExperiment.class
|
|
);
|
|
|
|
public static void main(String... args) throws IOException {
|
|
if (args.length < 2) {
|
|
System.err.println("Expected arguments: crawl-data-path experiment-name [experiment-args]");
|
|
return;
|
|
}
|
|
|
|
if (!experiments.containsKey(args[1])) {
|
|
System.err.println("Valid experiment names: " + experiments.keySet());
|
|
return;
|
|
}
|
|
|
|
Injector injector = Guice.createInjector(
|
|
new DatabaseModule(false),
|
|
new ConverterModule()
|
|
);
|
|
|
|
Experiment experiment = injector.getInstance(experiments.get(args[1]));
|
|
|
|
experiment.args(Arrays.copyOfRange(args, 2, args.length));
|
|
|
|
Path basePath = Path.of(args[0]);
|
|
for (var item : WorkLog.iterable(basePath.resolve("crawler.log"))) {
|
|
Path crawlDataPath = basePath.resolve(item.relPath());
|
|
try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) {
|
|
experiment.process(stream);
|
|
}
|
|
catch (Exception ex) {
|
|
ex.printStackTrace();
|
|
}
|
|
}
|
|
experiment.onFinish();
|
|
}
|
|
}
|