(experiment) Add ad-hoc filter runner

This commit is contained in:
Viktor Lofgren 2024-08-03 13:24:03 +02:00
parent 8462e88b8f
commit e48f52faba
3 changed files with 70 additions and 21 deletions

View File

@ -0,0 +1,53 @@
package nu.marginalia.converting.processor.classifier.topic;
import ca.rmen.porterstemmer.PorterStemmer;
import nu.marginalia.language.model.DocumentLanguageData;
import org.apache.commons.lang3.StringUtils;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import static java.lang.Math.max;
import static java.lang.Math.sqrt;
/**
 * Scores a document against a caller-supplied list of weighted terms.
 *
 * <p>Each input line has the form {@code <term> <weight>}, separated by spaces.
 * Terms are Porter-stemmed before being stored, and are matched against the
 * stemmed words of the document under test.
 */
public class AdHocDetector {
    /** Word count up to which {@link #testP} applies no length penalty. */
    private static final int AVG_LENGTH = 1000;

    /** Maps a stemmed term to its weight; a later duplicate stem overwrites an earlier one. */
    private final Map<String, Double> termValues = new HashMap<>();

    /**
     * Builds a detector from {@code <term> <weight>} lines.
     *
     * <p>Blank lines are skipped, so a trailing empty line in a term file is harmless.
     * Any tokens after the weight are ignored.
     *
     * @param terms the term definitions, one per element
     * @throws IllegalArgumentException if a non-blank line lacks a weight
     * @throws NumberFormatException if a weight is not a parsable double
     */
    public AdHocDetector(List<String> terms) {
        PorterStemmer ps = new PorterStemmer();

        for (String term : terms) {
            if (StringUtils.isBlank(term)) {
                continue;
            }

            String[] parts = StringUtils.split(term, ' ');
            if (parts.length < 2) {
                // Fail with the offending line rather than a bare ArrayIndexOutOfBoundsException
                throw new IllegalArgumentException(
                        "Malformed term definition, expected '<term> <weight>': '" + term + "'");
            }

            termValues.put(ps.stemWord(parts[0]), Double.parseDouble(parts[1]));
        }
    }

    /**
     * Computes the weighted term score of a document.
     *
     * <p>The first occurrence of a known term contributes its weight. Every further
     * occurrence halves the score accumulated for that term before adding the weight
     * again, so the per-term contribution approaches but never exceeds twice the
     * weight in magnitude. The summed score is then scaled by
     * {@code sqrt(AVG_LENGTH) / sqrt(max(AVG_LENGTH, wordCount))}, which is 1 for
     * documents of up to {@code AVG_LENGTH} words and shrinks for longer ones.
     *
     * @param dld the document to score
     * @return the length-adjusted score, or 0 if the document has no stemmed words
     */
    public double testP(DocumentLanguageData dld) {
        Map<String, Double> values = new HashMap<>();
        int count = 0;

        for (var sentence : dld) {
            for (var stemmed : sentence.stemmedWords) {
                count++;

                final Double value = termValues.get(stemmed);
                if (value != null) {
                    // Diminishing returns for repeated occurrences of the same term
                    values.merge(stemmed, value, (a, b) -> 0.5 * a + b);
                }
            }
        }

        if (count == 0) {
            return 0.;
        }

        double lengthPenalty = Math.sqrt(AVG_LENGTH) / Math.sqrt(Math.max(AVG_LENGTH, count));

        return values.values().stream().mapToDouble(Double::doubleValue).sum() * lengthPenalty;
    }
}

View File

@ -28,7 +28,7 @@ public class ExperimentRunnerMain {
public static void main(String... args) throws IOException {
if (args.length < 2) {
System.err.println("Expected arguments: plan.yaml experiment-name [experiment-args]");
System.err.println("Expected arguments: crawl-data-path experiment-name [experiment-args]");
return;
}

View File

@ -1,25 +1,30 @@
package nu.marginalia.tools.experiments;
import com.google.inject.Inject;
import lombok.SneakyThrows;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.classifier.adblock.GoogleAnwersSpamDetector;
import nu.marginalia.converting.processor.classifier.topic.RecipeDetector;
import nu.marginalia.converting.processor.classifier.topic.TextileCraftDetector;
import nu.marginalia.converting.processor.classifier.topic.WoodworkingDetector;
import nu.marginalia.converting.processor.classifier.topic.AdHocDetector;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.model.crawldata.CrawledDomain;
import nu.marginalia.tools.LegacyExperiment;
import org.jsoup.Jsoup;
import java.nio.file.Files;
import java.nio.file.Path;
public class TopicExperiment extends LegacyExperiment {
RecipeDetector recipeDetector = new RecipeDetector();
WoodworkingDetector woodworkingDetector = new WoodworkingDetector();
TextileCraftDetector textileCraftDetector = new TextileCraftDetector();
GoogleAnwersSpamDetector spamDetector = new GoogleAnwersSpamDetector();
AdHocDetector detector;
SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
Path filename = null;
/**
 * Configures the experiment from its command line arguments.
 *
 * <p>The first argument is the path of a term file. Its lines are read and handed
 * to {@link AdHocDetector}, which parses each as {@code <term> <weight>}.
 * I/O errors while reading the file propagate unchanged via {@code @SneakyThrows}.
 *
 * @param args the experiment arguments; {@code args[0]} must be the term file path
 * @throws IllegalArgumentException if no term file path was supplied
 */
@SneakyThrows
public void args(String... args) {
    if (args == null || args.length == 0) {
        // Experiment arguments are optional on the command line, so give a usable hint
        // instead of failing with ArrayIndexOutOfBoundsException on args[0]
        throw new IllegalArgumentException(
                "Expected experiment argument: path to a term file with one '<term> <weight>' entry per line");
    }

    filename = Path.of(args[0]);
    detector = new AdHocDetector(Files.readAllLines(filename));
}
@Inject
public TopicExperiment() {
@ -38,20 +43,11 @@ public class TopicExperiment extends LegacyExperiment {
parsed.body().filter(new DomPruningFilter(0.5));
var dld = se.extractSentences(parsed);
if (dld.totalNumWords() < 250)
if (dld.totalNumWords() < 50)
continue;
if (textileCraftDetector.testP(dld) > 0.3) {
System.out.println("textilecraft\t" + doc.url);
}
if (woodworkingDetector.testP(dld) > 0.1) {
System.out.println("woodworking\t" + doc.url);
}
if (recipeDetector.testP(dld) > 0.5) {
System.out.println("recipe\t" + doc.url);
}
if (spamDetector.testP(parsed) > 0.5) {
System.out.println("GA spam\t" + doc.url);
if (detector.testP(dld) > 0.5) {
System.out.println("match\t" + doc.url);
}
}