mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git
Add experiment runner tool and remove the experiments module from processes.
This commit is contained in:
parent
03bd892b95
commit
8f51345a1d
@@ -1,3 +0,0 @@
# Experimental

Contains tools for running classification experiments on crawl data.
@@ -1,57 +0,0 @@
package nu.marginalia.experimental;

import nu.marginalia.adblock.AdblockSimulator;
import nu.marginalia.converting.processor.DocumentProcessor;
import plan.CrawlPlanLoader;
import plan.CrawlPlan;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.IOException;
import java.nio.file.Path;


public class AdblockTesterTool {

    static AdblockSimulator simulator;

    static {
        try {
            simulator = new AdblockSimulator();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }


    public static void main(String... args) throws IOException {
        CrawlPlan plan = new CrawlPlanLoader().load(Path.of(args[0]));

        try (var iterable = plan.domainsIterable()) {
            for (var domain : iterable) {
                processDomain(domain);
            }
        }
    }

    private static void processDomain(CrawledDomain domain) {
        if (domain.doc == null) return;
        for (var doc : domain.doc) {
            if (DocumentProcessor.isAcceptedContentType(doc) && "OK".equals(doc.crawlerStatus)) {
                processDocument(doc);
            }
        }
    }

    private static void processDocument(CrawledDocument doc) {
        Document parsedDocument = Jsoup.parse(doc.documentBody.decode());

        if (simulator.hasAds(parsedDocument)) {
            System.out.println(doc.url);
        }
    }
}
@@ -1,97 +0,0 @@
package nu.marginalia.experimental;

import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import nu.marginalia.converting.ConverterModule;
import plan.CrawlPlanLoader;
import plan.CrawlPlan;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.DomainProcessor;
import nu.marginalia.adblock.GoogleAnwersSpamDetector;
import nu.marginalia.topic.RecipeDetector;
import nu.marginalia.topic.TextileCraftDetector;
import nu.marginalia.topic.WoodworkingDetector;
import org.jsoup.Jsoup;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.nio.file.Path;
import java.util.concurrent.ForkJoinPool;

public class ConverterLogicTestTool {

    private final Logger logger = LoggerFactory.getLogger(getClass());

    RecipeDetector recipeDetector = new RecipeDetector();
    WoodworkingDetector woodworkingDetector = new WoodworkingDetector();
    TextileCraftDetector textileCraftDetector = new TextileCraftDetector();
    GoogleAnwersSpamDetector spamDetector = new GoogleAnwersSpamDetector();

    SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());

    public static void main(String... args) throws IOException {

        if (args.length != 1) {
            System.err.println("Arguments: crawl-plan.yaml");
            System.exit(0);
        }
        var plan = new CrawlPlanLoader().load(Path.of(args[0]));

        Injector injector = Guice.createInjector(
                new ConverterModule(plan)
        );

        injector.getInstance(ConverterLogicTestTool.class);
    }

    @Inject
    public ConverterLogicTestTool(
            CrawlPlan plan,
            DomainProcessor processor
    ) throws Exception {
        var cp = new ForkJoinPool(16);

        plan.forEachCrawledDomain(domain -> {
            if (domain.doc == null) return;

            for (var doc : domain.doc) {
                if (doc.documentBody == null) continue;

                Runnable task = () -> {
                    var parsed = Jsoup.parse(doc.documentBody.decode());

                    parsed.body().filter(new DomPruningFilter(0.5));
                    var dld = se.extractSentences(parsed);

                    if (dld.totalNumWords() < 250)
                        return;

                    if (textileCraftDetector.testP(dld) > 0.3) {
                        System.out.println("textilecraft\t" + doc.url);
                    }
                    if (woodworkingDetector.testP(dld) > 0.1) {
                        System.out.println("woodworking\t" + doc.url);
                    }
                    if (recipeDetector.testP(dld) > 0.5) {
                        System.out.println("recipe\t" + doc.url);
                    }
                    if (spamDetector.testP(parsed) > 0.5) {
                        System.out.println("GA spam\t" + doc.url);
                    }
                };

                if (cp.getQueuedSubmissionCount() > 32) {
                    task.run();
                } else {
                    cp.execute(task);
                }
            }
        });
    }

}
@@ -1,93 +0,0 @@
package nu.marginalia.experimental;

import lombok.SneakyThrows;
import nu.marginalia.adblock.AdblockSimulator;
import nu.marginalia.converting.processor.DocumentProcessor;
import plan.CrawlPlanLoader;
import plan.CrawlPlan;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.service.module.DatabaseModule;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.IOException;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.*;


public class CrawlDataExtractorTool {
    private static final AdblockSimulator abs;

    static {
        try {
            abs = new AdblockSimulator();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    private static final Set<String> urls = new HashSet<>(50_000_000);

    @SneakyThrows
    public static void main(String... args) throws IOException {
        CrawlPlan plan = new CrawlPlanLoader().load(Path.of(args[0]));
        DatabaseModule module = new DatabaseModule();

        try (var ds = module.provideConnection();
             var conn = ds.getConnection();
             var stmt = conn.createStatement()) {
            var rsp = stmt.executeQuery("SELECT URL FROM EC_URL_VIEW WHERE TITLE IS NOT NULL");
            while (rsp.next()) {
                urls.add(rsp.getString(1));
            }
        }
        catch (SQLException ex) {
            ex.printStackTrace();
        }

        LinkedBlockingQueue<Runnable> queue = new LinkedBlockingQueue<>(10);
        ExecutorService pool = new ThreadPoolExecutor(10, 20, 5, TimeUnit.MINUTES, queue);
        Semaphore sem = new Semaphore(20);

        try (var iterable = plan.domainsIterable()) {
            for (var domain : iterable) {
                sem.acquire();
                pool.execute(() -> {
                    try { processDomain(domain); }
                    finally { sem.release(); }
                });
            }
        } catch (InterruptedException e) {
            throw new RuntimeException(e);
        }

        pool.shutdown();

        while (!pool.awaitTermination(1, TimeUnit.MINUTES));
    }

    private static void processDomain(CrawledDomain domain) {
        if (domain.doc == null) return;
        for (var doc : domain.doc) {
            if (!urls.contains(doc.url))
                continue;

            if (DocumentProcessor.isAcceptedContentType(doc) && "OK".equals(doc.crawlerStatus)) {
                processDocument(doc);
            }
        }
    }


    private static void processDocument(CrawledDocument doc) {
        Document parsedDocument = Jsoup.parse(doc.documentBody.decode());

        if (abs.hasAds(parsedDocument)) {
            System.out.println(doc.url);
        }
    }
}
@@ -1,6 +1,7 @@
plugins {
    id 'java'
    id "io.freefair.lombok" version "5.3.3.3"
    id 'application'

    id 'jvm-test-suite'
}
@@ -11,37 +12,49 @@ java {
    }
}

dependencies {
    implementation project(':code:common:process')
application {
    mainClass = 'nu.marginalia.tools.ExperimentRunnerMain'
    applicationName = 'experiment-runner'
}

tasks.distZip.enabled = false

dependencies {
    implementation project(':third-party:rdrpostagger')
    implementation project(':third-party:porterstemmer')
    implementation project(':third-party:monkey-patch-opennlp')
    implementation project(':code:common:model')
    implementation project(':code:common:config')
    implementation project(':code:common:process')
    implementation project(':code:common:service')
    implementation project(':code:libraries:big-string')
    implementation project(':code:api:index-api')
    implementation project(':code:common:service-discovery')
    implementation project(':code:common:service-client')
    implementation project(':code:libraries:language-processing')

    implementation project(':code:libraries:term-frequency-dict')
    implementation project(':code:libraries:big-string')
    implementation project(':code:processes:converting-process')
    implementation project(':code:process-models:crawling-model')

    implementation project(':code:processes:converting-process')
    implementation project(':code:features-convert:adblock')
    implementation project(':code:features-convert:topic-detection')

    implementation libs.lombok
    annotationProcessor libs.lombok
    implementation libs.bundles.slf4j
    implementation libs.notnull

    implementation libs.guice
    implementation libs.jsoup
    implementation libs.bundles.mariadb
    implementation libs.trove
    implementation libs.fastutil

    implementation libs.bundles.nlp
    implementation libs.commons.lang3

    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito
}


test {
    useJUnitPlatform()
}
code/tools/experiment-runner/readme.md (new file)
@@ -0,0 +1,7 @@
# Experiment Runner

This tool is a means of launching crawl data processing experiments
and interacting with crawl data.

It's launched with `run/experiment.sh`. New experiments need to be added to
`ExperimentRunnerMain` for the script to be able to run them.
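As a sketch of what adding an experiment involves, here is a hypothetical `CountingExperiment` (not part of this commit) that implements the `Experiment` interface; to be runnable via the script it would also need an entry in the `experiments` map in `ExperimentRunnerMain`:

```java
// Hypothetical sketch, not part of this commit: a minimal Experiment that
// counts domains and documents. Running it via experiment.sh would also
// require a map entry in ExperimentRunnerMain, e.g.
// "counting", CountingExperiment.class
package nu.marginalia.tools.experiments;

import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.tools.Experiment;

public class CountingExperiment implements Experiment {
    private int domains = 0;
    private int documents = 0;

    @Override
    public boolean process(CrawledDomain domain) {
        domains++;
        if (domain.doc != null) {
            documents += domain.doc.size();
        }
        return true; // true = keep processing the next domain
    }

    @Override
    public void onFinish() {
        System.out.printf("Saw %d documents across %d domains%n", documents, domains);
    }
}
```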
@@ -0,0 +1,17 @@
package nu.marginalia.tools;

import nu.marginalia.crawling.model.CrawledDomain;

public interface Experiment {

    /** The experiment processes the domain here.
     *
     * @return true to continue, false to terminate.
     */
    boolean process(CrawledDomain domain);

    /** Invoked after all domains are processed.
     */
    void onFinish();
}
@@ -0,0 +1,49 @@
package nu.marginalia.tools;

import com.google.inject.Guice;
import com.google.inject.Injector;
import nu.marginalia.service.module.DatabaseModule;
import nu.marginalia.tools.experiments.*;
import plan.CrawlPlanLoader;

import java.io.IOException;
import java.nio.file.Path;
import java.util.Map;

public class ExperimentRunnerMain {

    private static Map<String, Class<? extends Experiment>> experiments = Map.of(
            "test", TestExperiment.class,
            "adblock", AdblockExperiment.class,
            "topic", TopicExperiment.class,
            "statistics", SentenceStatisticsExperiment.class
    );

    public static void main(String... args) throws IOException {
        if (args.length != 2) {
            System.err.println("Expected arguments: plan.yaml experiment-name");
            return;
        }

        if (!experiments.containsKey(args[1])) {
            System.err.println("Valid experiment names: " + experiments.keySet());
            return;
        }

        Injector injector = Guice.createInjector(
                new DatabaseModule()
        );

        Experiment experiment = injector.getInstance(experiments.get(args[1]));

        var plan = new CrawlPlanLoader().load(Path.of(args[0]));

        for (var domain : plan.domainsIterable()) { // leaks file descriptor, is fine
            if (!experiment.process(domain)) {
                break;
            }
        }
        experiment.onFinish();
    }
}
@@ -0,0 +1,45 @@
package nu.marginalia.tools.experiments;

import com.google.inject.Inject;
import nu.marginalia.adblock.AdblockSimulator;
import nu.marginalia.converting.processor.DocumentProcessor;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.tools.Experiment;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class AdblockExperiment implements Experiment {

    private final AdblockSimulator simulator;

    @Inject
    public AdblockExperiment(AdblockSimulator simulator) {
        this.simulator = simulator;
    }

    @Override
    public boolean process(CrawledDomain domain) {
        if (domain.doc == null) return true;

        for (var doc : domain.doc) {
            if (DocumentProcessor.isAcceptedContentType(doc) && "OK".equals(doc.crawlerStatus)) {
                processDocument(doc);
            }
        }

        return true;
    }

    private void processDocument(CrawledDocument doc) {
        Document parsedDocument = Jsoup.parse(doc.documentBody.decode());

        if (simulator.hasAds(parsedDocument)) {
            System.out.println(doc.url);
        }
    }

    @Override
    public void onFinish() {
    }
}
@@ -0,0 +1,74 @@
package nu.marginalia.tools.experiments;

import com.google.inject.Inject;
import nu.marginalia.WmsaHome;
import nu.marginalia.adblock.GoogleAnwersSpamDetector;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.tools.Experiment;
import nu.marginalia.topic.RecipeDetector;
import nu.marginalia.topic.TextileCraftDetector;
import nu.marginalia.topic.WoodworkingDetector;
import org.jsoup.Jsoup;

import java.io.BufferedOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.nio.file.Files;
import java.nio.file.Path;

public class SentenceStatisticsExperiment implements Experiment {

    SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
    Path filename;
    PrintWriter writer;

    @Inject
    public SentenceStatisticsExperiment() throws IOException {
        filename = Files.createTempFile(getClass().getSimpleName(), ".csv");
        System.out.println("Writing to " + filename);

        writer = new PrintWriter(new BufferedOutputStream(new FileOutputStream(filename.toFile())));
    }

    private void logLine(String message) {
        System.out.printf("\u001b[2K\r%s", message);
    }

    @Override
    public boolean process(CrawledDomain domain) {
        if (domain.doc == null) return true;

        logLine("Processing: " + domain.domain);

        for (var doc : domain.doc) {
            if (doc.documentBody == null) continue;

            var parsed = Jsoup.parse(doc.documentBody.decode());

            parsed.body().filter(new DomPruningFilter(0.5));

            var dld = se.extractSentences(parsed);

            int numSentences = dld.sentences.length;
            if (numSentences == 0) {
                continue;
            }

            double avgLength = dld.totalNumWords() / (double) numSentences;
            if (avgLength < 50) {
                writer.printf("%s\t%d\t%f\n", doc.url, dld.totalNumWords(), avgLength);
            }
        }

        return true;
    }

    @Override
    public void onFinish() {
        logLine("Done!\n");
        writer.close();
    }
}
@@ -0,0 +1,16 @@
package nu.marginalia.tools.experiments;

import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.tools.Experiment;

public class TestExperiment implements Experiment {
    @Override
    public boolean process(CrawledDomain domain) {
        return true;
    }

    @Override
    public void onFinish() {
        System.out.println("Tada!");
    }
}
@@ -0,0 +1,69 @@
package nu.marginalia.tools.experiments;

import com.google.inject.Inject;
import nu.marginalia.WmsaHome;
import nu.marginalia.adblock.AdblockSimulator;
import nu.marginalia.adblock.GoogleAnwersSpamDetector;
import nu.marginalia.converting.processor.DocumentProcessor;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.tools.Experiment;
import nu.marginalia.topic.RecipeDetector;
import nu.marginalia.topic.TextileCraftDetector;
import nu.marginalia.topic.WoodworkingDetector;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class TopicExperiment implements Experiment {

    RecipeDetector recipeDetector = new RecipeDetector();
    WoodworkingDetector woodworkingDetector = new WoodworkingDetector();
    TextileCraftDetector textileCraftDetector = new TextileCraftDetector();
    GoogleAnwersSpamDetector spamDetector = new GoogleAnwersSpamDetector();

    SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());

    @Inject
    public TopicExperiment() {
    }

    @Override
    public boolean process(CrawledDomain domain) {
        if (domain.doc == null) return true;

        for (var doc : domain.doc) {
            if (doc.documentBody == null) continue;

            var parsed = Jsoup.parse(doc.documentBody.decode());

            parsed.body().filter(new DomPruningFilter(0.5));
            var dld = se.extractSentences(parsed);

            if (dld.totalNumWords() < 250)
                continue;

            if (textileCraftDetector.testP(dld) > 0.3) {
                System.out.println("textilecraft\t" + doc.url);
            }
            if (woodworkingDetector.testP(dld) > 0.1) {
                System.out.println("woodworking\t" + doc.url);
            }
            if (recipeDetector.testP(dld) > 0.5) {
                System.out.println("recipe\t" + doc.url);
            }
            if (spamDetector.testP(parsed) > 0.5) {
                System.out.println("GA spam\t" + doc.url);
            }
        }

        return true;
    }

    @Override
    public void onFinish() {
    }
}
@@ -50,4 +50,29 @@ $ docker-compose up
indexes. Wait for the line 'Auto-conversion finished!'

When all is done, it should be possible to visit
[http://localhost:8080](http://localhost:8080) and try a few searches!


## Other Crawl Data

By default, `reconvert.sh` will load the medium dataset. This is appropriate for a demo,
but other datasets also exist.

| Set | Description                                                                |
|-----|----------------------------------------------------------------------------|
| s   | 1000 domains, suitable for low-end machines                                |
| m   | 2000 domains                                                               |
| l   | 5000 domains                                                               |
| xl  | 50,000 domains, basically pre-prod.<br><b>Warning</b>: 5h+ processing time |

To switch datasets, run e.g.

```shell
$ docker-compose up -d mariadb
$ ./run/reconvert.sh l
```

## Experiment Runner

The script `experiment.sh` is a launcher for the experiment runner, which is useful
for evaluating new algorithms that process crawl data.
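As a hedged sketch of usage (the script's exact arguments aren't shown in this diff; `ExperimentRunnerMain` itself expects a crawl plan and an experiment name):

```shell
# Hypothetical invocation, assuming experiment.sh forwards a crawl plan path
# and an experiment name (one of: test, adblock, topic, statistics)
$ ./run/experiment.sh crawl-plan.yaml adblock
```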
@@ -56,13 +56,13 @@ include 'code:common:process'
include 'code:processes:converting-process'
include 'code:processes:crawling-process'
include 'code:processes:loading-process'
include 'code:processes:experimental'

include 'code:process-models:converting-model'
include 'code:process-models:crawling-model'

include 'code:tools:term-frequency-extractor'
include 'code:tools:crawl-job-extractor'
include 'code:tools:experiment-runner'

include 'third-party:porterstemmer'
include 'third-party:xz'