Add experiment runner tool and remove the experiments module from processes.

This commit is contained in:
Viktor Lofgren 2023-03-28 16:58:46 +02:00
parent 03bd892b95
commit 8f51345a1d
14 changed files with 326 additions and 261 deletions

View File

@ -1,3 +0,0 @@
# Experimental
Contains tools for running classification experiments on crawl data.

View File

@ -1,57 +0,0 @@
package nu.marginalia.experimental;
import nu.marginalia.adblock.AdblockSimulator;
import nu.marginalia.converting.processor.DocumentProcessor;
import plan.CrawlPlanLoader;
import plan.CrawlPlan;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.IOException;
import java.nio.file.Path;
/** Command-line tool that loads a crawl plan and prints the URL of every
 *  crawled document the adblock simulator flags as containing ads. */
public class AdblockTesterTool {

    // Shared simulator instance; rule loading may fail, hence the static block
    static AdblockSimulator simulator;

    static {
        try {
            simulator = new AdblockSimulator();
        }
        catch (IOException ex) {
            throw new RuntimeException(ex);
        }
    }

    /** Expects args[0] to be the path of a crawl plan yaml file. */
    public static void main(String... args) throws IOException {
        var plan = new CrawlPlanLoader().load(Path.of(args[0]));

        try (var domains = plan.domainsIterable()) {
            for (var crawledDomain : domains) {
                processDomain(crawledDomain);
            }
        }
    }

    private static void processDomain(CrawledDomain domain) {
        if (domain.doc == null) return;

        for (var document : domain.doc) {
            // Only consider documents that were fetched successfully and have a usable content type
            if (DocumentProcessor.isAcceptedContentType(document) && "OK".equals(document.crawlerStatus)) {
                processDocument(document);
            }
        }
    }

    private static void processDocument(CrawledDocument doc) {
        Document parsed = Jsoup.parse(doc.documentBody.decode());

        if (simulator.hasAds(parsed)) {
            System.out.println(doc.url);
        }
    }
}

View File

@ -1,97 +0,0 @@
package nu.marginalia.experimental;
import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import nu.marginalia.converting.ConverterModule;
import plan.CrawlPlanLoader;
import plan.CrawlPlan;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.DomainProcessor;
import nu.marginalia.adblock.GoogleAnwersSpamDetector;
import nu.marginalia.topic.RecipeDetector;
import nu.marginalia.topic.TextileCraftDetector;
import nu.marginalia.topic.WoodworkingDetector;
import org.jsoup.Jsoup;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Path;
import java.util.concurrent.ForkJoinPool;
/** Test tool that runs a set of heuristic topic/spam classifiers over the
 *  documents of a crawl plan and prints the URLs of matching documents. */
public class ConverterLogicTestTool {
    private final Logger logger = LoggerFactory.getLogger(getClass());

    // Heuristic classifiers applied to each sufficiently long document
    RecipeDetector recipeDetector = new RecipeDetector();
    WoodworkingDetector woodworkingDetector = new WoodworkingDetector();
    TextileCraftDetector textileCraftDetector = new TextileCraftDetector();
    GoogleAnwersSpamDetector spamDetector = new GoogleAnwersSpamDetector();

    SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());

    /** Expects args[0] to be the path of a crawl plan yaml file; the actual
     *  processing happens in the Guice-injected constructor. */
    public static void main(String... args) throws IOException {
        if (args.length != 1) {
            System.err.println("Arguments: crawl-plan.yaml");
            System.exit(1); // bug fix: was exit(0), which reported success on a usage error
        }
        var plan = new CrawlPlanLoader().load(Path.of(args[0]));

        Injector injector = Guice.createInjector(
                new ConverterModule(plan)
        );

        injector.getInstance(ConverterLogicTestTool.class);
    }

    @Inject
    public ConverterLogicTestTool(
            CrawlPlan plan,
            DomainProcessor processor
    ) throws Exception {
        var cp = new ForkJoinPool(16);

        plan.forEachCrawledDomain(domain -> {
            if (domain.doc == null) return;

            for (var doc : domain.doc) {
                if (doc.documentBody == null) continue;

                Runnable task = () -> {
                    var parsed = Jsoup.parse(doc.documentBody.decode());

                    // Prune boilerplate DOM nodes before sentence extraction
                    parsed.body().filter(new DomPruningFilter(0.5));

                    var dld = se.extractSentences(parsed);

                    // The classifiers are noisy on very short documents
                    if (dld.totalNumWords() < 250)
                        return;

                    if (textileCraftDetector.testP(dld) > 0.3) {
                        System.out.println("textilecraft\t" + doc.url);
                    }
                    if (woodworkingDetector.testP(dld) > 0.1) {
                        System.out.println("woodworking\t" + doc.url);
                    }
                    if (recipeDetector.testP(dld) > 0.5) {
                        System.out.println("recipe\t" + doc.url);
                    }
                    if (spamDetector.testP(parsed) > 0.5) {
                        System.out.println("GA spam\t" + doc.url);
                    }
                };

                // Crude backpressure: run inline when the pool's submission queue grows large
                if (cp.getQueuedSubmissionCount() > 32) {
                    task.run();
                } else {
                    cp.execute(task);
                }
            }
        });

        // Bug fix: ForkJoinPool worker threads are daemon threads, so without an
        // explicit shutdown/await the JVM could exit before queued tasks have run.
        cp.shutdown();
        while (!cp.awaitTermination(1, java.util.concurrent.TimeUnit.MINUTES));
    }
}

View File

@ -1,93 +0,0 @@
package nu.marginalia.experimental;
import lombok.SneakyThrows;
import nu.marginalia.adblock.AdblockSimulator;
import nu.marginalia.converting.processor.DocumentProcessor;
import plan.CrawlPlanLoader;
import plan.CrawlPlan;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.service.module.DatabaseModule;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.IOException;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.*;
/** Tool that scans crawl data for documents whose URLs are present in the
 *  EC_URL_VIEW database table, and prints the URLs of those that the adblock
 *  simulator flags as containing ads. */
public class CrawlDataExtractorTool {
// Shared adblock simulator; rule loading may throw, hence the static block
private static final AdblockSimulator abs;
static {
try {
abs = new AdblockSimulator();
} catch (IOException e) {
throw new RuntimeException(e);
}
}
// URLs with a known title, loaded from the database; pre-sized for a large crawl
private static final Set<String> urls = new HashSet<>(50_000_000);
// @SneakyThrows covers the InterruptedException from sem.acquire()/awaitTermination
@SneakyThrows
public static void main(String... args) throws IOException {
CrawlPlan plan = new CrawlPlanLoader().load(Path.of(args[0]));
DatabaseModule module = new DatabaseModule();
try (var ds = module.provideConnection();
var conn = ds.getConnection();
var stmt = conn.createStatement()) {
var rsp = stmt.executeQuery("SELECT URL FROM EC_URL_VIEW WHERE TITLE IS NOT NULL");
while (rsp.next()) {
urls.add(rsp.getString(1));
}
}
catch (SQLException ex) {
// best-effort: proceed with whatever URLs were loaded before the failure
ex.printStackTrace();
}
// Bounded queue + semaphore limit how many domains are in flight at once
LinkedBlockingQueue<Runnable> queue = new LinkedBlockingQueue<>(10);
ExecutorService pool = new ThreadPoolExecutor(10, 20, 5, TimeUnit.MINUTES, queue);
Semaphore sem = new Semaphore(20);
try (var iterable = plan.domainsIterable()) {
for (var domain : iterable) {
sem.acquire();
pool.execute(() -> {
try { processDomain(domain); }
finally { sem.release(); }
});
}
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
// Drain the pool; loop because awaitTermination may time out repeatedly
pool.shutdown();
while (!pool.awaitTermination(1, TimeUnit.MINUTES));
}
private static void processDomain(CrawledDomain domain) {
if (domain.doc == null) return;
for (var doc : domain.doc) {
// Only documents whose URL is in the database are of interest
if (!urls.contains(doc.url))
continue;
if (DocumentProcessor.isAcceptedContentType(doc) && "OK".equals(doc.crawlerStatus)) {
processDocument(doc);
}
}
}
private static void processDocument(CrawledDocument doc) {
Document parsedDocument = Jsoup.parse(doc.documentBody.decode());
if (abs.hasAds(parsedDocument)) {
System.out.println(doc.url);
}
}
}

View File

@ -1,6 +1,7 @@
plugins { plugins {
id 'java' id 'java'
id "io.freefair.lombok" version "5.3.3.3" id "io.freefair.lombok" version "5.3.3.3"
id 'application'
id 'jvm-test-suite' id 'jvm-test-suite'
} }
@ -11,37 +12,49 @@ java {
} }
} }
dependencies { application {
implementation project(':code:common:process') mainClass = 'nu.marginalia.tools.ExperimentRunnerMain'
applicationName = 'experiment-runner'
}
tasks.distZip.enabled = false
dependencies {
implementation project(':third-party:rdrpostagger')
implementation project(':third-party:porterstemmer')
implementation project(':third-party:monkey-patch-opennlp')
implementation project(':code:common:model') implementation project(':code:common:model')
implementation project(':code:common:config') implementation project(':code:common:config')
implementation project(':code:common:process')
implementation project(':code:common:service') implementation project(':code:common:service')
implementation project(':code:libraries:big-string')
implementation project(':code:api:index-api')
implementation project(':code:common:service-discovery')
implementation project(':code:common:service-client')
implementation project(':code:libraries:language-processing') implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:term-frequency-dict')
implementation project(':code:libraries:big-string')
implementation project(':code:processes:converting-process')
implementation project(':code:process-models:crawling-model') implementation project(':code:process-models:crawling-model')
implementation project(':code:processes:converting-process')
implementation project(':code:features-convert:adblock') implementation project(':code:features-convert:adblock')
implementation project(':code:features-convert:topic-detection') implementation project(':code:features-convert:topic-detection')
implementation libs.lombok implementation libs.lombok
annotationProcessor libs.lombok annotationProcessor libs.lombok
implementation libs.bundles.slf4j implementation libs.bundles.slf4j
implementation libs.notnull
implementation libs.guice implementation libs.guice
implementation libs.jsoup implementation libs.jsoup
implementation libs.bundles.mariadb implementation libs.trove
implementation libs.fastutil
implementation libs.bundles.nlp
implementation libs.commons.lang3
testImplementation libs.bundles.slf4j.test testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit testImplementation libs.bundles.junit
testImplementation libs.mockito testImplementation libs.mockito
} }
test { test {
useJUnitPlatform() useJUnitPlatform()
} }

View File

@ -0,0 +1,7 @@
# Experiment Runner
This tool launches crawl data processing experiments: small ad hoc programs
that interact with crawl data.
It's launched with `run/experiment.sh`. New experiments need to be added to
`ExperimentRunnerMain` in order for the script to be able to run them.

View File

@ -0,0 +1,17 @@
package nu.marginalia.tools;
import nu.marginalia.crawling.model.CrawledDomain;
/** Contract for a crawl-data experiment run by ExperimentRunnerMain.
 *  Implementations are fed crawled domains one at a time. */
public interface Experiment {
/** The experiment processes the domain here.
 *
 * @param domain one crawled domain and its documents
 * @return true to continue, false to terminate.
 */
boolean process(CrawledDomain domain);
/** Invoked after all domains are processed; a good place to flush
 *  or close any output the experiment has produced.
 */
void onFinish();
}

View File

@ -0,0 +1,49 @@
package nu.marginalia.tools;
import com.google.inject.Guice;
import com.google.inject.Injector;
import nu.marginalia.service.module.DatabaseModule;
import nu.marginalia.tools.experiments.*;
import plan.CrawlPlanLoader;
import java.io.IOException;
import java.nio.file.Path;
import java.util.Map;
/** Launcher for crawl-data experiments. Usage: plan.yaml experiment-name.
 *  New experiments must be registered in the {@code experiments} map below. */
public class ExperimentRunnerMain {

    // Registry of available experiments, keyed by command-line name
    private static Map<String, Class<? extends Experiment>> experiments = Map.of(
            "test", TestExperiment.class,
            "adblock", AdblockExperiment.class,
            "topic", TopicExperiment.class,
            "statistics", SentenceStatisticsExperiment.class
    );

    public static void main(String... args) throws IOException {
        if (args.length != 2) {
            System.err.println("Expected arguments: plan.yaml experiment-name");
            return;
        }

        if (!experiments.containsKey(args[1])) {
            System.err.println("Valid experiment names: " + experiments.keySet());
            return;
        }

        Injector injector = Guice.createInjector(
                new DatabaseModule()
        );

        Experiment experiment = injector.getInstance(experiments.get(args[1]));

        var plan = new CrawlPlanLoader().load(Path.of(args[0]));

        // Bug fix: close the iterable when done instead of leaking its file descriptor
        try (var domains = plan.domainsIterable()) {
            for (var domain : domains) {
                if (!experiment.process(domain)) {
                    break;
                }
            }
        }

        experiment.onFinish();
    }
}

View File

@ -0,0 +1,45 @@
package nu.marginalia.tools.experiments;
import com.google.inject.Inject;
import nu.marginalia.adblock.AdblockSimulator;
import nu.marginalia.converting.processor.DocumentProcessor;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.tools.Experiment;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
/** Experiment that prints the URL of every successfully crawled document
 *  the adblock simulator flags as containing ads. */
public class AdblockExperiment implements Experiment {

    private final AdblockSimulator simulator;

    @Inject
    public AdblockExperiment(AdblockSimulator simulator) {
        this.simulator = simulator;
    }

    @Override
    public boolean process(CrawledDomain domain) {
        if (domain.doc == null) return true;

        for (var document : domain.doc) {
            // Restrict to documents with an accepted content type that were fetched OK
            if (DocumentProcessor.isAcceptedContentType(document) && "OK".equals(document.crawlerStatus)) {
                processDocument(document);
            }
        }

        return true;
    }

    private void processDocument(CrawledDocument doc) {
        Document html = Jsoup.parse(doc.documentBody.decode());

        if (simulator.hasAds(html)) {
            System.out.println(doc.url);
        }
    }

    @Override
    public void onFinish() {
    }
}

View File

@ -0,0 +1,74 @@
package nu.marginalia.tools.experiments;
import com.google.inject.Inject;
import nu.marginalia.WmsaHome;
import nu.marginalia.adblock.GoogleAnwersSpamDetector;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.tools.Experiment;
import nu.marginalia.topic.RecipeDetector;
import nu.marginalia.topic.TextileCraftDetector;
import nu.marginalia.topic.WoodworkingDetector;
import org.jsoup.Jsoup;
import java.io.BufferedOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.nio.file.Files;
import java.nio.file.Path;
/** Experiment that computes per-document sentence statistics and writes a
 *  TSV line (url, word count, mean sentence length) for documents whose
 *  average sentence length is below 50 words. */
public class SentenceStatisticsExperiment implements Experiment {

    SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());

    // Output is written to a temp file whose path is printed at startup
    Path filename;
    PrintWriter writer;

    @Inject
    public SentenceStatisticsExperiment() throws IOException {
        filename = Files.createTempFile(getClass().getSimpleName(), ".csv");
        System.out.println("Writing to " + filename);

        writer = new PrintWriter(new BufferedOutputStream(new FileOutputStream(filename.toFile())));
    }

    /** Overwrites the current terminal line with a status message. */
    private void logLine(String message) {
        System.out.printf("\u001b[2K\r%s", message);
    }

    @Override
    public boolean process(CrawledDomain domain) {
        if (domain.doc == null) return true;

        logLine("Processing: " + domain.domain);

        for (var document : domain.doc) {
            if (document.documentBody == null) {
                continue;
            }

            var parsedBody = Jsoup.parse(document.documentBody.decode());
            parsedBody.body().filter(new DomPruningFilter(0.5));

            var languageData = se.extractSentences(parsedBody);

            int sentenceCount = languageData.sentences.length;
            if (sentenceCount == 0) {
                continue;
            }

            double meanSentenceLength = languageData.totalNumWords() / (double) sentenceCount;
            if (meanSentenceLength < 50) {
                writer.printf("%s\t%d\t%f\n", document.url, languageData.totalNumWords(), meanSentenceLength);
            }
        }

        return true;
    }

    @Override
    public void onFinish() {
        logLine("Done!\n");
        writer.close();
    }
}

View File

@ -0,0 +1,16 @@
package nu.marginalia.tools.experiments;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.tools.Experiment;
/** No-op experiment used to smoke-test the experiment runner plumbing. */
public class TestExperiment implements Experiment {

    @Override
    public boolean process(CrawledDomain domain) {
        // Accept every domain without doing any work
        return true;
    }

    @Override
    public void onFinish() {
        System.out.println("Tada!");
    }
}

View File

@ -0,0 +1,69 @@
package nu.marginalia.tools.experiments;
import com.google.inject.Inject;
import nu.marginalia.WmsaHome;
import nu.marginalia.adblock.AdblockSimulator;
import nu.marginalia.adblock.GoogleAnwersSpamDetector;
import nu.marginalia.converting.processor.DocumentProcessor;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.tools.Experiment;
import nu.marginalia.topic.RecipeDetector;
import nu.marginalia.topic.TextileCraftDetector;
import nu.marginalia.topic.WoodworkingDetector;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
/** Experiment that runs a set of heuristic topic/spam classifiers over each
 *  document and prints tagged URLs of documents that match. */
public class TopicExperiment implements Experiment {
// Heuristic classifiers; thresholds below are presumably hand-tuned -- TODO confirm
RecipeDetector recipeDetector = new RecipeDetector();
WoodworkingDetector woodworkingDetector = new WoodworkingDetector();
TextileCraftDetector textileCraftDetector = new TextileCraftDetector();
GoogleAnwersSpamDetector spamDetector = new GoogleAnwersSpamDetector();
SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
@Inject
public TopicExperiment() {
}
@Override
public boolean process(CrawledDomain domain) {
if (domain.doc == null) return true;
for (var doc : domain.doc) {
if (doc.documentBody == null) continue;
var parsed = Jsoup.parse(doc.documentBody.decode());
// Prune boilerplate DOM nodes before sentence extraction
parsed.body().filter(new DomPruningFilter(0.5));
var dld = se.extractSentences(parsed);
// Skip very short documents; the classifiers are noisy on them
if (dld.totalNumWords() < 250)
continue;
if (textileCraftDetector.testP(dld) > 0.3) {
System.out.println("textilecraft\t" + doc.url);
}
if (woodworkingDetector.testP(dld) > 0.1) {
System.out.println("woodworking\t" + doc.url);
}
if (recipeDetector.testP(dld) > 0.5) {
System.out.println("recipe\t" + doc.url);
}
if (spamDetector.testP(parsed) > 0.5) {
System.out.println("GA spam\t" + doc.url);
}
}
return true;
}
@Override
public void onFinish() {
}
}

View File

@ -51,3 +51,28 @@ indexes. Wait for the line 'Auto-conversion finished!'
When all is done, it should be possible to visit When all is done, it should be possible to visit
[http://localhost:8080](http://localhost:8080) and try a few searches! [http://localhost:8080](http://localhost:8080) and try a few searches!
## Other Crawl Data
By default, `reconvert.sh` will load the medium dataset. This is appropriate for a demo,
but other datasets also exist.
| Set | Description |
|-----|----------------------------------------------------------------------------|
| s | 1000 domains, suitable for low-end machines |
| m | 2000 domains |
| l | 5000 domains |
| xl | 50,000 domains, basically pre-prod.<br><b>Warning</b>: 5h+ processing time |
To switch datasets, run e.g.
```shell
$ docker-compose up -d mariadb
$ ./run/reconvert.sh l
```
## Experiment Runner
The script `experiment.sh` is a launcher for the experiment runner, which is useful when
evaluating new algorithms in processing crawl data.

View File

@ -56,13 +56,13 @@ include 'code:common:process'
include 'code:processes:converting-process' include 'code:processes:converting-process'
include 'code:processes:crawling-process' include 'code:processes:crawling-process'
include 'code:processes:loading-process' include 'code:processes:loading-process'
include 'code:processes:experimental'
include 'code:process-models:converting-model' include 'code:process-models:converting-model'
include 'code:process-models:crawling-model' include 'code:process-models:crawling-model'
include 'code:tools:term-frequency-extractor' include 'code:tools:term-frequency-extractor'
include 'code:tools:crawl-job-extractor' include 'code:tools:crawl-job-extractor'
include 'code:tools:experiment-runner'
include 'third-party:porterstemmer' include 'third-party:porterstemmer'
include 'third-party:xz' include 'third-party:xz'