mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
Add experiment runner tool and remove the experimental module from processes.
This commit is contained in:
parent
03bd892b95
commit
8f51345a1d
@ -1,3 +0,0 @@
|
|||||||
# Experimental
|
|
||||||
|
|
||||||
Contains tools for running classification experiments on crawl data.
|
|
@ -1,57 +0,0 @@
|
|||||||
package nu.marginalia.experimental;
|
|
||||||
|
|
||||||
import nu.marginalia.adblock.AdblockSimulator;
|
|
||||||
import nu.marginalia.converting.processor.DocumentProcessor;
|
|
||||||
import plan.CrawlPlanLoader;
|
|
||||||
import plan.CrawlPlan;
|
|
||||||
import nu.marginalia.crawling.model.CrawledDocument;
|
|
||||||
import nu.marginalia.crawling.model.CrawledDomain;
|
|
||||||
import org.jsoup.Jsoup;
|
|
||||||
import org.jsoup.nodes.Document;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.nio.file.Path;
|
|
||||||
|
|
||||||
|
|
||||||
public class AdblockTesterTool {
|
|
||||||
|
|
||||||
static AdblockSimulator simulator;
|
|
||||||
|
|
||||||
static {
|
|
||||||
try {
|
|
||||||
simulator = new AdblockSimulator();
|
|
||||||
} catch (IOException e) {
|
|
||||||
throw new RuntimeException(e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public static void main(String... args) throws IOException {
|
|
||||||
CrawlPlan plan = new CrawlPlanLoader().load(Path.of(args[0]));
|
|
||||||
|
|
||||||
try (var iterable = plan.domainsIterable()) {
|
|
||||||
for (var domain : iterable) {
|
|
||||||
processDomain(domain);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
private static void processDomain(CrawledDomain domain) {
|
|
||||||
if (domain.doc == null) return;
|
|
||||||
for (var doc : domain.doc) {
|
|
||||||
if (DocumentProcessor.isAcceptedContentType(doc) && "OK".equals(doc.crawlerStatus)) {
|
|
||||||
processDocument(doc);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private static void processDocument(CrawledDocument doc) {
|
|
||||||
Document parsedDocument = Jsoup.parse(doc.documentBody.decode());
|
|
||||||
|
|
||||||
if (simulator.hasAds(parsedDocument)) {
|
|
||||||
System.out.println(doc.url);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,97 +0,0 @@
|
|||||||
package nu.marginalia.experimental;
|
|
||||||
|
|
||||||
import com.google.inject.Guice;
|
|
||||||
import com.google.inject.Inject;
|
|
||||||
import com.google.inject.Injector;
|
|
||||||
import nu.marginalia.converting.ConverterModule;
|
|
||||||
import plan.CrawlPlanLoader;
|
|
||||||
import plan.CrawlPlan;
|
|
||||||
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
|
|
||||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
|
||||||
import nu.marginalia.WmsaHome;
|
|
||||||
import nu.marginalia.converting.processor.DomainProcessor;
|
|
||||||
import nu.marginalia.adblock.GoogleAnwersSpamDetector;
|
|
||||||
import nu.marginalia.topic.RecipeDetector;
|
|
||||||
import nu.marginalia.topic.TextileCraftDetector;
|
|
||||||
import nu.marginalia.topic.WoodworkingDetector;
|
|
||||||
import org.jsoup.Jsoup;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.nio.file.Path;
|
|
||||||
import java.util.concurrent.ForkJoinPool;
|
|
||||||
|
|
||||||
public class ConverterLogicTestTool {
|
|
||||||
|
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
|
||||||
|
|
||||||
RecipeDetector recipeDetector = new RecipeDetector();
|
|
||||||
WoodworkingDetector woodworkingDetector = new WoodworkingDetector();
|
|
||||||
TextileCraftDetector textileCraftDetector = new TextileCraftDetector();
|
|
||||||
GoogleAnwersSpamDetector spamDetector = new GoogleAnwersSpamDetector();
|
|
||||||
|
|
||||||
SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
|
|
||||||
|
|
||||||
public static void main(String... args) throws IOException {
|
|
||||||
|
|
||||||
if (args.length != 1) {
|
|
||||||
System.err.println("Arguments: crawl-plan.yaml");
|
|
||||||
System.exit(0);
|
|
||||||
}
|
|
||||||
var plan = new CrawlPlanLoader().load(Path.of(args[0]));
|
|
||||||
|
|
||||||
Injector injector = Guice.createInjector(
|
|
||||||
new ConverterModule(plan)
|
|
||||||
);
|
|
||||||
|
|
||||||
injector.getInstance(ConverterLogicTestTool.class);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Inject
|
|
||||||
public ConverterLogicTestTool(
|
|
||||||
CrawlPlan plan,
|
|
||||||
DomainProcessor processor
|
|
||||||
) throws Exception {
|
|
||||||
var cp = new ForkJoinPool(16);
|
|
||||||
|
|
||||||
plan.forEachCrawledDomain(domain -> {
|
|
||||||
if (domain.doc == null) return;
|
|
||||||
|
|
||||||
|
|
||||||
for (var doc : domain.doc) {
|
|
||||||
if (doc.documentBody == null) continue;
|
|
||||||
|
|
||||||
Runnable task = () -> {
|
|
||||||
var parsed = Jsoup.parse(doc.documentBody.decode());
|
|
||||||
|
|
||||||
parsed.body().filter(new DomPruningFilter(0.5));
|
|
||||||
var dld = se.extractSentences(parsed);
|
|
||||||
|
|
||||||
if (dld.totalNumWords() < 250)
|
|
||||||
return;
|
|
||||||
|
|
||||||
if (textileCraftDetector.testP(dld) > 0.3) {
|
|
||||||
System.out.println("textilecraft\t" + doc.url);
|
|
||||||
}
|
|
||||||
if (woodworkingDetector.testP(dld) > 0.1) {
|
|
||||||
System.out.println("woodworking\t" + doc.url);
|
|
||||||
}
|
|
||||||
if (recipeDetector.testP(dld) > 0.5) {
|
|
||||||
System.out.println("recipe\t" + doc.url);
|
|
||||||
}
|
|
||||||
if (spamDetector.testP(parsed) > 0.5) {
|
|
||||||
System.out.println("GA spam\t" + doc.url);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
if (cp.getQueuedSubmissionCount() > 32) {
|
|
||||||
task.run();
|
|
||||||
} else {
|
|
||||||
cp.execute(task);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,93 +0,0 @@
|
|||||||
package nu.marginalia.experimental;
|
|
||||||
|
|
||||||
import lombok.SneakyThrows;
|
|
||||||
import nu.marginalia.adblock.AdblockSimulator;
|
|
||||||
import nu.marginalia.converting.processor.DocumentProcessor;
|
|
||||||
import plan.CrawlPlanLoader;
|
|
||||||
import plan.CrawlPlan;
|
|
||||||
import nu.marginalia.crawling.model.CrawledDocument;
|
|
||||||
import nu.marginalia.crawling.model.CrawledDomain;
|
|
||||||
import nu.marginalia.service.module.DatabaseModule;
|
|
||||||
import org.jsoup.Jsoup;
|
|
||||||
import org.jsoup.nodes.Document;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.nio.file.Path;
|
|
||||||
import java.sql.SQLException;
|
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Set;
|
|
||||||
import java.util.concurrent.*;
|
|
||||||
|
|
||||||
|
|
||||||
public class CrawlDataExtractorTool {
|
|
||||||
private static final AdblockSimulator abs;
|
|
||||||
|
|
||||||
static {
|
|
||||||
try {
|
|
||||||
abs = new AdblockSimulator();
|
|
||||||
} catch (IOException e) {
|
|
||||||
throw new RuntimeException(e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private static final Set<String> urls = new HashSet<>(50_000_000);
|
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
public static void main(String... args) throws IOException {
|
|
||||||
CrawlPlan plan = new CrawlPlanLoader().load(Path.of(args[0]));
|
|
||||||
DatabaseModule module = new DatabaseModule();
|
|
||||||
|
|
||||||
try (var ds = module.provideConnection();
|
|
||||||
var conn = ds.getConnection();
|
|
||||||
var stmt = conn.createStatement()) {
|
|
||||||
var rsp = stmt.executeQuery("SELECT URL FROM EC_URL_VIEW WHERE TITLE IS NOT NULL");
|
|
||||||
while (rsp.next()) {
|
|
||||||
urls.add(rsp.getString(1));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
catch (SQLException ex) {
|
|
||||||
ex.printStackTrace();
|
|
||||||
}
|
|
||||||
|
|
||||||
LinkedBlockingQueue<Runnable> queue = new LinkedBlockingQueue<>(10);
|
|
||||||
ExecutorService pool = new ThreadPoolExecutor(10, 20, 5, TimeUnit.MINUTES, queue);
|
|
||||||
Semaphore sem = new Semaphore(20);
|
|
||||||
|
|
||||||
try (var iterable = plan.domainsIterable()) {
|
|
||||||
for (var domain : iterable) {
|
|
||||||
sem.acquire();
|
|
||||||
pool.execute(() -> {
|
|
||||||
try { processDomain(domain); }
|
|
||||||
finally { sem.release(); }
|
|
||||||
});
|
|
||||||
}
|
|
||||||
} catch (InterruptedException e) {
|
|
||||||
throw new RuntimeException(e);
|
|
||||||
}
|
|
||||||
|
|
||||||
pool.shutdown();
|
|
||||||
|
|
||||||
while (!pool.awaitTermination(1, TimeUnit.MINUTES));
|
|
||||||
}
|
|
||||||
|
|
||||||
private static void processDomain(CrawledDomain domain) {
|
|
||||||
if (domain.doc == null) return;
|
|
||||||
for (var doc : domain.doc) {
|
|
||||||
if (!urls.contains(doc.url))
|
|
||||||
continue;
|
|
||||||
|
|
||||||
if (DocumentProcessor.isAcceptedContentType(doc) && "OK".equals(doc.crawlerStatus)) {
|
|
||||||
processDocument(doc);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private static void processDocument(CrawledDocument doc) {
|
|
||||||
Document parsedDocument = Jsoup.parse(doc.documentBody.decode());
|
|
||||||
|
|
||||||
if (abs.hasAds(parsedDocument)) {
|
|
||||||
System.out.println(doc.url);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,6 +1,7 @@
|
|||||||
plugins {
|
plugins {
|
||||||
id 'java'
|
id 'java'
|
||||||
id "io.freefair.lombok" version "5.3.3.3"
|
id "io.freefair.lombok" version "5.3.3.3"
|
||||||
|
id 'application'
|
||||||
|
|
||||||
id 'jvm-test-suite'
|
id 'jvm-test-suite'
|
||||||
}
|
}
|
||||||
@ -11,37 +12,49 @@ java {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
dependencies {
|
application {
|
||||||
implementation project(':code:common:process')
|
mainClass = 'nu.marginalia.tools.ExperimentRunnerMain'
|
||||||
|
applicationName = 'experiment-runner'
|
||||||
|
}
|
||||||
|
|
||||||
|
tasks.distZip.enabled = false
|
||||||
|
|
||||||
|
dependencies {
|
||||||
|
implementation project(':third-party:rdrpostagger')
|
||||||
|
implementation project(':third-party:porterstemmer')
|
||||||
|
implementation project(':third-party:monkey-patch-opennlp')
|
||||||
implementation project(':code:common:model')
|
implementation project(':code:common:model')
|
||||||
implementation project(':code:common:config')
|
implementation project(':code:common:config')
|
||||||
|
implementation project(':code:common:process')
|
||||||
implementation project(':code:common:service')
|
implementation project(':code:common:service')
|
||||||
implementation project(':code:libraries:big-string')
|
|
||||||
implementation project(':code:api:index-api')
|
|
||||||
implementation project(':code:common:service-discovery')
|
|
||||||
implementation project(':code:common:service-client')
|
|
||||||
implementation project(':code:libraries:language-processing')
|
implementation project(':code:libraries:language-processing')
|
||||||
|
implementation project(':code:libraries:term-frequency-dict')
|
||||||
|
implementation project(':code:libraries:big-string')
|
||||||
|
implementation project(':code:processes:converting-process')
|
||||||
implementation project(':code:process-models:crawling-model')
|
implementation project(':code:process-models:crawling-model')
|
||||||
|
|
||||||
implementation project(':code:processes:converting-process')
|
|
||||||
implementation project(':code:features-convert:adblock')
|
implementation project(':code:features-convert:adblock')
|
||||||
implementation project(':code:features-convert:topic-detection')
|
implementation project(':code:features-convert:topic-detection')
|
||||||
|
|
||||||
implementation libs.lombok
|
implementation libs.lombok
|
||||||
annotationProcessor libs.lombok
|
annotationProcessor libs.lombok
|
||||||
implementation libs.bundles.slf4j
|
implementation libs.bundles.slf4j
|
||||||
|
implementation libs.notnull
|
||||||
|
|
||||||
implementation libs.guice
|
implementation libs.guice
|
||||||
implementation libs.jsoup
|
implementation libs.jsoup
|
||||||
implementation libs.bundles.mariadb
|
implementation libs.trove
|
||||||
|
implementation libs.fastutil
|
||||||
|
|
||||||
|
implementation libs.bundles.nlp
|
||||||
|
implementation libs.commons.lang3
|
||||||
|
|
||||||
testImplementation libs.bundles.slf4j.test
|
testImplementation libs.bundles.slf4j.test
|
||||||
testImplementation libs.bundles.junit
|
testImplementation libs.bundles.junit
|
||||||
testImplementation libs.mockito
|
testImplementation libs.mockito
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
test {
|
test {
|
||||||
useJUnitPlatform()
|
useJUnitPlatform()
|
||||||
}
|
}
|
7
code/tools/experiment-runner/readme.md
Normal file
7
code/tools/experiment-runner/readme.md
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
# Experiment Runner
|
||||||
|
|
||||||
|
This tool is a means of launching crawl data processing experiments,
|
||||||
|
for interacting with crawl data.
|
||||||
|
|
||||||
|
It's launched with `run/experiment.sh`. New experiments need to be added to
|
||||||
|
`ExperimentRunnerMain` in order for the script to be able to run them.
|
@ -0,0 +1,17 @@
|
|||||||
|
package nu.marginalia.tools;
|
||||||
|
|
||||||
|
import nu.marginalia.crawling.model.CrawledDomain;
|
||||||
|
|
||||||
|
public interface Experiment {
|
||||||
|
|
||||||
|
/** The experiment processes the domain here.
|
||||||
|
*
|
||||||
|
* @return true to continue, false to terminate.
|
||||||
|
*/
|
||||||
|
boolean process(CrawledDomain domain);
|
||||||
|
|
||||||
|
/** Invoked after all domains are processed
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
void onFinish();
|
||||||
|
}
|
@ -0,0 +1,49 @@
|
|||||||
|
package nu.marginalia.tools;
|
||||||
|
|
||||||
|
import com.google.inject.Guice;
|
||||||
|
import com.google.inject.Injector;
|
||||||
|
import nu.marginalia.service.module.DatabaseModule;
|
||||||
|
import nu.marginalia.tools.experiments.*;
|
||||||
|
import plan.CrawlPlanLoader;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
public class ExperimentRunnerMain {
|
||||||
|
|
||||||
|
private static Map<String, Class<? extends Experiment>> experiments = Map.of(
|
||||||
|
"test", TestExperiment.class,
|
||||||
|
"adblock", AdblockExperiment.class,
|
||||||
|
"topic", TopicExperiment.class,
|
||||||
|
"statistics", SentenceStatisticsExperiment.class
|
||||||
|
);
|
||||||
|
|
||||||
|
public static void main(String... args) throws IOException {
|
||||||
|
if (args.length != 2) {
|
||||||
|
System.err.println("Expected arguments: plan.yaml experiment-name");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!experiments.containsKey(args[1])) {
|
||||||
|
System.err.println("Valid experiment names: " + experiments.keySet());
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
Injector injector = Guice.createInjector(
|
||||||
|
new DatabaseModule()
|
||||||
|
);
|
||||||
|
|
||||||
|
Experiment experiment = injector.getInstance(experiments.get(args[1]));
|
||||||
|
|
||||||
|
var plan = new CrawlPlanLoader().load(Path.of(args[0]));
|
||||||
|
|
||||||
|
for (var domain : plan.domainsIterable()) { // leaks file descriptor, is fine
|
||||||
|
if (!experiment.process(domain)) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
experiment.onFinish();
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,45 @@
|
|||||||
|
package nu.marginalia.tools.experiments;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import nu.marginalia.adblock.AdblockSimulator;
|
||||||
|
import nu.marginalia.converting.processor.DocumentProcessor;
|
||||||
|
import nu.marginalia.crawling.model.CrawledDocument;
|
||||||
|
import nu.marginalia.crawling.model.CrawledDomain;
|
||||||
|
import nu.marginalia.tools.Experiment;
|
||||||
|
import org.jsoup.Jsoup;
|
||||||
|
import org.jsoup.nodes.Document;
|
||||||
|
|
||||||
|
public class AdblockExperiment implements Experiment {
|
||||||
|
|
||||||
|
private final AdblockSimulator simulator;
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public AdblockExperiment(AdblockSimulator simulator) {
|
||||||
|
this.simulator = simulator;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean process(CrawledDomain domain) {
|
||||||
|
if (domain.doc == null) return true;
|
||||||
|
|
||||||
|
for (var doc : domain.doc) {
|
||||||
|
if (DocumentProcessor.isAcceptedContentType(doc) && "OK".equals(doc.crawlerStatus)) {
|
||||||
|
processDocument(doc);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void processDocument(CrawledDocument doc) {
|
||||||
|
Document parsedDocument = Jsoup.parse(doc.documentBody.decode());
|
||||||
|
|
||||||
|
if (simulator.hasAds(parsedDocument)) {
|
||||||
|
System.out.println(doc.url);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void onFinish() {
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,74 @@
|
|||||||
|
package nu.marginalia.tools.experiments;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import nu.marginalia.WmsaHome;
|
||||||
|
import nu.marginalia.adblock.GoogleAnwersSpamDetector;
|
||||||
|
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
|
||||||
|
import nu.marginalia.crawling.model.CrawledDomain;
|
||||||
|
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||||
|
import nu.marginalia.tools.Experiment;
|
||||||
|
import nu.marginalia.topic.RecipeDetector;
|
||||||
|
import nu.marginalia.topic.TextileCraftDetector;
|
||||||
|
import nu.marginalia.topic.WoodworkingDetector;
|
||||||
|
import org.jsoup.Jsoup;
|
||||||
|
|
||||||
|
import java.io.BufferedOutputStream;
|
||||||
|
import java.io.FileOutputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.PrintWriter;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
|
||||||
|
public class SentenceStatisticsExperiment implements Experiment {
|
||||||
|
|
||||||
|
SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
|
||||||
|
Path filename;
|
||||||
|
PrintWriter writer;
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public SentenceStatisticsExperiment() throws IOException {
|
||||||
|
filename = Files.createTempFile(getClass().getSimpleName(), ".csv");
|
||||||
|
System.out.println("Writing to " + filename);
|
||||||
|
|
||||||
|
writer = new PrintWriter(new BufferedOutputStream(new FileOutputStream(filename.toFile())));
|
||||||
|
}
|
||||||
|
|
||||||
|
private void logLine(String message) {
|
||||||
|
System.out.printf("\u001b[2K\r%s", message);
|
||||||
|
}
|
||||||
|
@Override
|
||||||
|
public boolean process(CrawledDomain domain) {
|
||||||
|
if (domain.doc == null) return true;
|
||||||
|
|
||||||
|
logLine("Processing: " + domain.domain);
|
||||||
|
|
||||||
|
for (var doc : domain.doc) {
|
||||||
|
if (doc.documentBody == null) continue;
|
||||||
|
|
||||||
|
var parsed = Jsoup.parse(doc.documentBody.decode());
|
||||||
|
|
||||||
|
parsed.body().filter(new DomPruningFilter(0.5));
|
||||||
|
|
||||||
|
var dld = se.extractSentences(parsed);
|
||||||
|
|
||||||
|
|
||||||
|
int numSentences = dld.sentences.length;
|
||||||
|
if (numSentences == 0) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
double avgLength = dld.totalNumWords() / (double) numSentences;
|
||||||
|
if (avgLength < 50) {
|
||||||
|
writer.printf("%s\t%d\t%f\n", doc.url, dld.totalNumWords(), avgLength);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void onFinish() {
|
||||||
|
logLine("Done!\n");
|
||||||
|
writer.close();
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,16 @@
|
|||||||
|
package nu.marginalia.tools.experiments;
|
||||||
|
|
||||||
|
import nu.marginalia.crawling.model.CrawledDomain;
|
||||||
|
import nu.marginalia.tools.Experiment;
|
||||||
|
|
||||||
|
public class TestExperiment implements Experiment {
|
||||||
|
@Override
|
||||||
|
public boolean process(CrawledDomain domain) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void onFinish() {
|
||||||
|
System.out.println("Tada!");
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,69 @@
|
|||||||
|
package nu.marginalia.tools.experiments;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import nu.marginalia.WmsaHome;
|
||||||
|
import nu.marginalia.adblock.AdblockSimulator;
|
||||||
|
import nu.marginalia.adblock.GoogleAnwersSpamDetector;
|
||||||
|
import nu.marginalia.converting.processor.DocumentProcessor;
|
||||||
|
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
|
||||||
|
import nu.marginalia.crawling.model.CrawledDocument;
|
||||||
|
import nu.marginalia.crawling.model.CrawledDomain;
|
||||||
|
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||||
|
import nu.marginalia.tools.Experiment;
|
||||||
|
import nu.marginalia.topic.RecipeDetector;
|
||||||
|
import nu.marginalia.topic.TextileCraftDetector;
|
||||||
|
import nu.marginalia.topic.WoodworkingDetector;
|
||||||
|
import org.jsoup.Jsoup;
|
||||||
|
import org.jsoup.nodes.Document;
|
||||||
|
|
||||||
|
public class TopicExperiment implements Experiment {
|
||||||
|
|
||||||
|
RecipeDetector recipeDetector = new RecipeDetector();
|
||||||
|
WoodworkingDetector woodworkingDetector = new WoodworkingDetector();
|
||||||
|
TextileCraftDetector textileCraftDetector = new TextileCraftDetector();
|
||||||
|
GoogleAnwersSpamDetector spamDetector = new GoogleAnwersSpamDetector();
|
||||||
|
|
||||||
|
SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public TopicExperiment() {
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean process(CrawledDomain domain) {
|
||||||
|
if (domain.doc == null) return true;
|
||||||
|
|
||||||
|
|
||||||
|
for (var doc : domain.doc) {
|
||||||
|
if (doc.documentBody == null) continue;
|
||||||
|
|
||||||
|
var parsed = Jsoup.parse(doc.documentBody.decode());
|
||||||
|
|
||||||
|
parsed.body().filter(new DomPruningFilter(0.5));
|
||||||
|
var dld = se.extractSentences(parsed);
|
||||||
|
|
||||||
|
if (dld.totalNumWords() < 250)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if (textileCraftDetector.testP(dld) > 0.3) {
|
||||||
|
System.out.println("textilecraft\t" + doc.url);
|
||||||
|
}
|
||||||
|
if (woodworkingDetector.testP(dld) > 0.1) {
|
||||||
|
System.out.println("woodworking\t" + doc.url);
|
||||||
|
}
|
||||||
|
if (recipeDetector.testP(dld) > 0.5) {
|
||||||
|
System.out.println("recipe\t" + doc.url);
|
||||||
|
}
|
||||||
|
if (spamDetector.testP(parsed) > 0.5) {
|
||||||
|
System.out.println("GA spam\t" + doc.url);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void onFinish() {
|
||||||
|
}
|
||||||
|
}
|
@ -51,3 +51,28 @@ indexes. Wait for the line 'Auto-conversion finished!'
|
|||||||
|
|
||||||
When all is done, it should be possible to visit
|
When all is done, it should be possible to visit
|
||||||
[http://localhost:8080](http://localhost:8080) and try a few searches!
|
[http://localhost:8080](http://localhost:8080) and try a few searches!
|
||||||
|
|
||||||
|
|
||||||
|
## Other Crawl Data
|
||||||
|
|
||||||
|
By default, `reconvert.sh` will load the medium dataset. This is appropriate for a demo,
|
||||||
|
but other datasets also exist.
|
||||||
|
|
||||||
|
| Set | Description |
|
||||||
|
|-----|----------------------------------------------------------------------------|
|
||||||
|
| s | 1000 domains, suitable for low-end machines |
|
||||||
|
| m | 2000 domains |
|
||||||
|
| l | 5000 domains |
|
||||||
|
| xl | 50,000 domains, basically pre-prod.<br><b>Warning</b>: 5h+ processing time |
|
||||||
|
|
||||||
|
To switch datasets, run e.g.
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ docker-compose up -d mariadb
|
||||||
|
$ ./run/reconvert.sh l
|
||||||
|
```
|
||||||
|
|
||||||
|
## Experiment Runner
|
||||||
|
|
||||||
|
The script `experiment.sh` is a launcher for the experiment runner, which is useful when
|
||||||
|
evaluating new algorithms in processing crawl data.
|
@ -56,13 +56,13 @@ include 'code:common:process'
|
|||||||
include 'code:processes:converting-process'
|
include 'code:processes:converting-process'
|
||||||
include 'code:processes:crawling-process'
|
include 'code:processes:crawling-process'
|
||||||
include 'code:processes:loading-process'
|
include 'code:processes:loading-process'
|
||||||
include 'code:processes:experimental'
|
|
||||||
|
|
||||||
include 'code:process-models:converting-model'
|
include 'code:process-models:converting-model'
|
||||||
include 'code:process-models:crawling-model'
|
include 'code:process-models:crawling-model'
|
||||||
|
|
||||||
include 'code:tools:term-frequency-extractor'
|
include 'code:tools:term-frequency-extractor'
|
||||||
include 'code:tools:crawl-job-extractor'
|
include 'code:tools:crawl-job-extractor'
|
||||||
|
include 'code:tools:experiment-runner'
|
||||||
|
|
||||||
include 'third-party:porterstemmer'
|
include 'third-party:porterstemmer'
|
||||||
include 'third-party:xz'
|
include 'third-party:xz'
|
||||||
|
Loading…
Reference in New Issue
Block a user