Add experiment runner tool and remove experiments module from processes.

This commit is contained in:
Viktor Lofgren 2023-03-28 16:58:46 +02:00
parent 03bd892b95
commit 8f51345a1d
14 changed files with 326 additions and 261 deletions

View File

@ -1,3 +0,0 @@
# Experimental
Contains tools for running classification experiments on crawl data.

View File

@ -1,57 +0,0 @@
package nu.marginalia.experimental;

import nu.marginalia.adblock.AdblockSimulator;
import nu.marginalia.converting.processor.DocumentProcessor;
import plan.CrawlPlanLoader;
import plan.CrawlPlan;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.IOException;
import java.nio.file.Path;

public class AdblockTesterTool {

    static AdblockSimulator simulator;

    static {
        try {
            simulator = new AdblockSimulator();
        }
        catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    public static void main(String... args) throws IOException {
        CrawlPlan plan = new CrawlPlanLoader().load(Path.of(args[0]));

        try (var iterable = plan.domainsIterable()) {
            for (var domain : iterable) {
                processDomain(domain);
            }
        }
    }

    private static void processDomain(CrawledDomain domain) {
        if (domain.doc == null) return;

        for (var doc : domain.doc) {
            if (DocumentProcessor.isAcceptedContentType(doc) && "OK".equals(doc.crawlerStatus)) {
                processDocument(doc);
            }
        }
    }

    private static void processDocument(CrawledDocument doc) {
        Document parsedDocument = Jsoup.parse(doc.documentBody.decode());

        if (simulator.hasAds(parsedDocument)) {
            System.out.println(doc.url);
        }
    }
}

View File

@ -1,97 +0,0 @@
package nu.marginalia.experimental;

import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import nu.marginalia.converting.ConverterModule;
import plan.CrawlPlanLoader;
import plan.CrawlPlan;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.DomainProcessor;
import nu.marginalia.adblock.GoogleAnwersSpamDetector;
import nu.marginalia.topic.RecipeDetector;
import nu.marginalia.topic.TextileCraftDetector;
import nu.marginalia.topic.WoodworkingDetector;
import org.jsoup.Jsoup;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.nio.file.Path;
import java.util.concurrent.ForkJoinPool;

public class ConverterLogicTestTool {
    private final Logger logger = LoggerFactory.getLogger(getClass());

    RecipeDetector recipeDetector = new RecipeDetector();
    WoodworkingDetector woodworkingDetector = new WoodworkingDetector();
    TextileCraftDetector textileCraftDetector = new TextileCraftDetector();
    GoogleAnwersSpamDetector spamDetector = new GoogleAnwersSpamDetector();

    SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());

    public static void main(String... args) throws IOException {
        if (args.length != 1) {
            System.err.println("Arguments: crawl-plan.yaml");
            System.exit(0);
        }

        var plan = new CrawlPlanLoader().load(Path.of(args[0]));

        Injector injector = Guice.createInjector(
                new ConverterModule(plan)
        );

        injector.getInstance(ConverterLogicTestTool.class);
    }

    @Inject
    public ConverterLogicTestTool(
            CrawlPlan plan,
            DomainProcessor processor
    ) throws Exception {
        var cp = new ForkJoinPool(16);

        plan.forEachCrawledDomain(domain -> {
            if (domain.doc == null) return;

            for (var doc : domain.doc) {
                if (doc.documentBody == null) continue;

                Runnable task = () -> {
                    var parsed = Jsoup.parse(doc.documentBody.decode());

                    parsed.body().filter(new DomPruningFilter(0.5));
                    var dld = se.extractSentences(parsed);

                    if (dld.totalNumWords() < 250)
                        return;

                    if (textileCraftDetector.testP(dld) > 0.3) {
                        System.out.println("textilecraft\t" + doc.url);
                    }
                    if (woodworkingDetector.testP(dld) > 0.1) {
                        System.out.println("woodworking\t" + doc.url);
                    }
                    if (recipeDetector.testP(dld) > 0.5) {
                        System.out.println("recipe\t" + doc.url);
                    }
                    if (spamDetector.testP(parsed) > 0.5) {
                        System.out.println("GA spam\t" + doc.url);
                    }
                };

                if (cp.getQueuedSubmissionCount() > 32) {
                    task.run();
                } else {
                    cp.execute(task);
                }
            }
        });
    }
}

View File

@ -1,93 +0,0 @@
package nu.marginalia.experimental;

import lombok.SneakyThrows;
import nu.marginalia.adblock.AdblockSimulator;
import nu.marginalia.converting.processor.DocumentProcessor;
import plan.CrawlPlanLoader;
import plan.CrawlPlan;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.service.module.DatabaseModule;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.IOException;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.*;

public class CrawlDataExtractorTool {

    private static final AdblockSimulator abs;

    static {
        try {
            abs = new AdblockSimulator();
        }
        catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    private static final Set<String> urls = new HashSet<>(50_000_000);

    @SneakyThrows
    public static void main(String... args) throws IOException {
        CrawlPlan plan = new CrawlPlanLoader().load(Path.of(args[0]));

        DatabaseModule module = new DatabaseModule();

        try (var ds = module.provideConnection();
             var conn = ds.getConnection();
             var stmt = conn.createStatement())
        {
            var rsp = stmt.executeQuery("SELECT URL FROM EC_URL_VIEW WHERE TITLE IS NOT NULL");
            while (rsp.next()) {
                urls.add(rsp.getString(1));
            }
        }
        catch (SQLException ex) {
            ex.printStackTrace();
        }

        LinkedBlockingQueue<Runnable> queue = new LinkedBlockingQueue<>(10);
        ExecutorService pool = new ThreadPoolExecutor(10, 20, 5, TimeUnit.MINUTES, queue);
        Semaphore sem = new Semaphore(20);

        try (var iterable = plan.domainsIterable()) {
            for (var domain : iterable) {
                sem.acquire();
                pool.execute(() -> {
                    try { processDomain(domain); }
                    finally { sem.release(); }
                });
            }
        } catch (InterruptedException e) {
            throw new RuntimeException(e);
        }

        pool.shutdown();
        while (!pool.awaitTermination(1, TimeUnit.MINUTES));
    }

    private static void processDomain(CrawledDomain domain) {
        if (domain.doc == null) return;

        for (var doc : domain.doc) {
            if (!urls.contains(doc.url))
                continue;

            if (DocumentProcessor.isAcceptedContentType(doc) && "OK".equals(doc.crawlerStatus)) {
                processDocument(doc);
            }
        }
    }

    private static void processDocument(CrawledDocument doc) {
        Document parsedDocument = Jsoup.parse(doc.documentBody.decode());

        if (abs.hasAds(parsedDocument)) {
            System.out.println(doc.url);
        }
    }
}

View File

@ -1,6 +1,7 @@
plugins {
    id 'java'
    id "io.freefair.lombok" version "5.3.3.3"
    id 'application'
    id 'jvm-test-suite'
}
@ -11,37 +12,49 @@ java {
    }
}

dependencies {
    implementation project(':code:common:process')

application {
    mainClass = 'nu.marginalia.tools.ExperimentRunnerMain'
    applicationName = 'experiment-runner'
}

tasks.distZip.enabled = false

dependencies {
    implementation project(':third-party:rdrpostagger')
    implementation project(':third-party:porterstemmer')
    implementation project(':third-party:monkey-patch-opennlp')
    implementation project(':code:common:model')
    implementation project(':code:common:config')
    implementation project(':code:common:process')
    implementation project(':code:common:service')
    implementation project(':code:libraries:big-string')
    implementation project(':code:api:index-api')
    implementation project(':code:common:service-discovery')
    implementation project(':code:common:service-client')
    implementation project(':code:libraries:language-processing')
    implementation project(':code:libraries:term-frequency-dict')
    implementation project(':code:libraries:big-string')
    implementation project(':code:processes:converting-process')
    implementation project(':code:process-models:crawling-model')
    implementation project(':code:processes:converting-process')

    implementation project(':code:features-convert:adblock')
    implementation project(':code:features-convert:topic-detection')

    implementation libs.lombok
    annotationProcessor libs.lombok
    implementation libs.bundles.slf4j

    implementation libs.notnull
    implementation libs.guice
    implementation libs.jsoup
    implementation libs.bundles.mariadb
    implementation libs.trove
    implementation libs.fastutil
    implementation libs.bundles.nlp
    implementation libs.commons.lang3

    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito
}

test {
    useJUnitPlatform()
}

View File

@ -0,0 +1,7 @@
# Experiment Runner
This tool is a harness for launching crawl data processing experiments,
i.e. programs that interact with crawl data.

It's launched with `run/experiment.sh`. New experiments need to be registered in
`ExperimentRunnerMain` for the script to be able to run them.
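
For example, registering a hypothetical new `MyExperiment` class means adding an
entry to the experiment map in `ExperimentRunnerMain` (the `my-experiment` key and
class name here are illustrative):

```java
private static Map<String, Class<? extends Experiment>> experiments = Map.of(
        "test", TestExperiment.class,
        "adblock", AdblockExperiment.class,
        "topic", TopicExperiment.class,
        "statistics", SentenceStatisticsExperiment.class,
        "my-experiment", MyExperiment.class // hypothetical new experiment
);
```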

View File

@ -0,0 +1,17 @@
package nu.marginalia.tools;

import nu.marginalia.crawling.model.CrawledDomain;

public interface Experiment {

    /** The experiment processes the domain here.
     *
     * @return true to continue, false to terminate.
     */
    boolean process(CrawledDomain domain);

    /** Invoked after all domains are processed.
     */
    void onFinish();
}
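
As an illustrative sketch (not part of this commit), a minimal `Experiment`
implementation that prints the first 100 domain names and then terminates the run
early might look like this:

```java
package nu.marginalia.tools.experiments;

import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.tools.Experiment;

/** Hypothetical example experiment: samples the first 100 domains. */
public class SamplingExperiment implements Experiment {
    private int count = 0;

    @Override
    public boolean process(CrawledDomain domain) {
        System.out.println(domain.domain);

        // Returning false terminates processing early
        return ++count < 100;
    }

    @Override
    public void onFinish() {
        System.out.println("Sampled " + count + " domains");
    }
}
```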

View File

@ -0,0 +1,49 @@
package nu.marginalia.tools;

import com.google.inject.Guice;
import com.google.inject.Injector;
import nu.marginalia.service.module.DatabaseModule;
import nu.marginalia.tools.experiments.*;
import plan.CrawlPlanLoader;

import java.io.IOException;
import java.nio.file.Path;
import java.util.Map;

public class ExperimentRunnerMain {

    private static Map<String, Class<? extends Experiment>> experiments = Map.of(
            "test", TestExperiment.class,
            "adblock", AdblockExperiment.class,
            "topic", TopicExperiment.class,
            "statistics", SentenceStatisticsExperiment.class
    );

    public static void main(String... args) throws IOException {
        if (args.length != 2) {
            System.err.println("Expected arguments: plan.yaml experiment-name");
            return;
        }

        if (!experiments.containsKey(args[1])) {
            System.err.println("Valid experiment names: " + experiments.keySet());
            return;
        }

        Injector injector = Guice.createInjector(
                new DatabaseModule()
        );

        Experiment experiment = injector.getInstance(experiments.get(args[1]));

        var plan = new CrawlPlanLoader().load(Path.of(args[0]));

        for (var domain : plan.domainsIterable()) { // leaks file descriptor, is fine
            if (!experiment.process(domain)) {
                break;
            }
        }

        experiment.onFinish();
    }
}

View File

@ -0,0 +1,45 @@
package nu.marginalia.tools.experiments;

import com.google.inject.Inject;
import nu.marginalia.adblock.AdblockSimulator;
import nu.marginalia.converting.processor.DocumentProcessor;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.tools.Experiment;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class AdblockExperiment implements Experiment {

    private final AdblockSimulator simulator;

    @Inject
    public AdblockExperiment(AdblockSimulator simulator) {
        this.simulator = simulator;
    }

    @Override
    public boolean process(CrawledDomain domain) {
        if (domain.doc == null) return true;

        for (var doc : domain.doc) {
            if (DocumentProcessor.isAcceptedContentType(doc) && "OK".equals(doc.crawlerStatus)) {
                processDocument(doc);
            }
        }

        return true;
    }

    private void processDocument(CrawledDocument doc) {
        Document parsedDocument = Jsoup.parse(doc.documentBody.decode());

        if (simulator.hasAds(parsedDocument)) {
            System.out.println(doc.url);
        }
    }

    @Override
    public void onFinish() {
    }
}

View File

@ -0,0 +1,74 @@
package nu.marginalia.tools.experiments;

import com.google.inject.Inject;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.tools.Experiment;
import org.jsoup.Jsoup;

import java.io.BufferedOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.nio.file.Files;
import java.nio.file.Path;

public class SentenceStatisticsExperiment implements Experiment {

    SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());

    Path filename;
    PrintWriter writer;

    @Inject
    public SentenceStatisticsExperiment() throws IOException {
        filename = Files.createTempFile(getClass().getSimpleName(), ".csv");
        System.out.println("Writing to " + filename);

        writer = new PrintWriter(new BufferedOutputStream(new FileOutputStream(filename.toFile())));
    }

    private void logLine(String message) {
        System.out.printf("\u001b[2K\r%s", message);
    }

    @Override
    public boolean process(CrawledDomain domain) {
        if (domain.doc == null) return true;

        logLine("Processing: " + domain.domain);

        for (var doc : domain.doc) {
            if (doc.documentBody == null) continue;

            var parsed = Jsoup.parse(doc.documentBody.decode());

            parsed.body().filter(new DomPruningFilter(0.5));

            var dld = se.extractSentences(parsed);

            int numSentences = dld.sentences.length;
            if (numSentences == 0) {
                continue;
            }

            double avgLength = dld.totalNumWords() / (double) numSentences;
            if (avgLength < 50) {
                writer.printf("%s\t%d\t%f\n", doc.url, dld.totalNumWords(), avgLength);
            }
        }

        return true;
    }

    @Override
    public void onFinish() {
        logLine("Done!\n");
        writer.close();
    }
}

View File

@ -0,0 +1,16 @@
package nu.marginalia.tools.experiments;

import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.tools.Experiment;

public class TestExperiment implements Experiment {
    @Override
    public boolean process(CrawledDomain domain) {
        return true;
    }

    @Override
    public void onFinish() {
        System.out.println("Tada!");
    }
}

View File

@ -0,0 +1,69 @@
package nu.marginalia.tools.experiments;

import com.google.inject.Inject;
import nu.marginalia.WmsaHome;
import nu.marginalia.adblock.GoogleAnwersSpamDetector;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.tools.Experiment;
import nu.marginalia.topic.RecipeDetector;
import nu.marginalia.topic.TextileCraftDetector;
import nu.marginalia.topic.WoodworkingDetector;
import org.jsoup.Jsoup;

public class TopicExperiment implements Experiment {

    RecipeDetector recipeDetector = new RecipeDetector();
    WoodworkingDetector woodworkingDetector = new WoodworkingDetector();
    TextileCraftDetector textileCraftDetector = new TextileCraftDetector();
    GoogleAnwersSpamDetector spamDetector = new GoogleAnwersSpamDetector();

    SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());

    @Inject
    public TopicExperiment() {
    }

    @Override
    public boolean process(CrawledDomain domain) {
        if (domain.doc == null) return true;

        for (var doc : domain.doc) {
            if (doc.documentBody == null) continue;

            var parsed = Jsoup.parse(doc.documentBody.decode());

            parsed.body().filter(new DomPruningFilter(0.5));
            var dld = se.extractSentences(parsed);

            if (dld.totalNumWords() < 250)
                continue;

            if (textileCraftDetector.testP(dld) > 0.3) {
                System.out.println("textilecraft\t" + doc.url);
            }
            if (woodworkingDetector.testP(dld) > 0.1) {
                System.out.println("woodworking\t" + doc.url);
            }
            if (recipeDetector.testP(dld) > 0.5) {
                System.out.println("recipe\t" + doc.url);
            }
            if (spamDetector.testP(parsed) > 0.5) {
                System.out.println("GA spam\t" + doc.url);
            }
        }

        return true;
    }

    @Override
    public void onFinish() {
    }
}

View File

@ -51,3 +51,28 @@ indexes. Wait for the line 'Auto-conversion finished!'
When all is done, it should be possible to visit
[http://localhost:8080](http://localhost:8080) and try a few searches!
## Other Crawl Data
By default, `reconvert.sh` will load the medium dataset. This is appropriate for a demo,
but other datasets also exist.
| Set | Description |
|-----|----------------------------------------------------------------------------|
| s | 1000 domains, suitable for low-end machines |
| m | 2000 domains |
| l | 5000 domains |
| xl | 50,000 domains, basically pre-prod.<br><b>Warning</b>: 5h+ processing time |
To switch datasets, run e.g.
```shell
$ docker-compose up -d mariadb
$ ./run/reconvert.sh l
```
## Experiment Runner
The script `experiment.sh` is a launcher for the experiment runner, which is useful
for evaluating new crawl data processing algorithms.
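
For example, the `adblock` experiment could be run against a crawl plan like so
(the plan path is illustrative; this assumes the script forwards its arguments to
`ExperimentRunnerMain`):

```shell
$ ./run/experiment.sh crawl-plan.yaml adblock
```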

View File

@ -56,13 +56,13 @@ include 'code:common:process'
include 'code:processes:converting-process'
include 'code:processes:crawling-process'
include 'code:processes:loading-process'
include 'code:processes:experimental'
include 'code:process-models:converting-model'
include 'code:process-models:crawling-model'
include 'code:tools:term-frequency-extractor'
include 'code:tools:crawl-job-extractor'
include 'code:tools:experiment-runner'
include 'third-party:porterstemmer'
include 'third-party:xz'