package nu.marginalia.converting.processor.classifier.topic;

import ca.rmen.porterstemmer.PorterStemmer;
import nu.marginalia.language.model.DocumentLanguageData;
import org.apache.commons.lang3.StringUtils;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

import static java.lang.Math.max;
import static java.lang.Math.sqrt;

/**
 * Scores a document against a caller-supplied list of weighted terms.
 *
 * <p>Each term definition is a line of the form {@code "<word> <weight>"},
 * separated by spaces. The word is stemmed with a {@link PorterStemmer}
 * before it is stored, so it is compared against the stemmed words of the
 * document under test.
 */
public class AdHocDetector {
    /** Word count up to which no length penalty is applied. */
    private static final int AVG_LENGTH = 1000;

    /** Stemmed term mapped to its weight. */
    private final Map<String, Double> termValues = new HashMap<>();

    /**
     * Builds a detector from a list of term definitions.
     *
     * @param terms lines of the form {@code "<word> <weight>"}; blank lines are ignored
     * @throws IllegalArgumentException if a non-blank line has fewer than two
     *         space-separated fields, or if the weight is not a parseable double
     *         (surfaced as {@link NumberFormatException})
     */
    public AdHocDetector(List<String> terms) {
        PorterStemmer ps = new PorterStemmer();

        for (String term : terms) {
            // A blank line splits into an empty array; skip it rather than
            // failing on parts[0].
            if (StringUtils.isBlank(term)) {
                continue;
            }

            String[] parts = StringUtils.split(term, ' ');
            if (parts.length < 2) {
                throw new IllegalArgumentException(
                        "Expected '<term> <weight>' but got: '" + term + "'");
            }

            termValues.put(ps.stemWord(parts[0]), Double.parseDouble(parts[1]));
        }
    }

    /**
     * Computes the score of a document against the configured terms.
     *
     * <p>Every stemmed word of every sentence is counted. For a word that has
     * a configured weight, the first occurrence contributes the weight itself;
     * each further occurrence halves the amount accumulated so far for that
     * word and adds the weight again. The per-word amounts are summed and then
     * multiplied by {@code sqrt(AVG_LENGTH) / sqrt(max(AVG_LENGTH, count))},
     * which is 1 for documents of at most {@code AVG_LENGTH} words and shrinks
     * for longer ones.
     *
     * @param dld the document to score
     * @return the length-adjusted score, or {@code 0} if the document has no words
     */
    public double testP(DocumentLanguageData dld) {
        Map<String, Double> values = new HashMap<>();
        int count = 0;

        for (var sentence : dld) {
            for (var stemmed : sentence.stemmedWords) {
                count++;

                final Double value = termValues.get(stemmed);
                if (value != null) {
                    // a = amount accumulated so far, b = this term's weight
                    values.merge(stemmed, value, (a, b) -> 0.5 * a + b);
                }
            }
        }

        if (count == 0) {
            return 0.;
        }

        double lengthPenalty = sqrt(AVG_LENGTH) / sqrt(max(AVG_LENGTH, count));

        return values.values().stream().mapToDouble(Double::doubleValue).sum() * lengthPenalty;
    }

}
b/code/tools/experiment-runner/java/nu/marginalia/tools/ExperimentRunnerMain.java @@ -28,7 +28,7 @@ public class ExperimentRunnerMain { public static void main(String... args) throws IOException { if (args.length < 2) { - System.err.println("Expected arguments: plan.yaml experiment-name [experiment-args]"); + System.err.println("Expected arguments: crawl-data-path experiment-name [experiment-args]"); return; } diff --git a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/TopicExperiment.java b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/TopicExperiment.java index 5ea9551d..00ed63ac 100644 --- a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/TopicExperiment.java +++ b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/TopicExperiment.java @@ -1,25 +1,30 @@ package nu.marginalia.tools.experiments; import com.google.inject.Inject; +import lombok.SneakyThrows; import nu.marginalia.WmsaHome; -import nu.marginalia.converting.processor.classifier.adblock.GoogleAnwersSpamDetector; -import nu.marginalia.converting.processor.classifier.topic.RecipeDetector; -import nu.marginalia.converting.processor.classifier.topic.TextileCraftDetector; -import nu.marginalia.converting.processor.classifier.topic.WoodworkingDetector; +import nu.marginalia.converting.processor.classifier.topic.AdHocDetector; import nu.marginalia.converting.processor.logic.dom.DomPruningFilter; import nu.marginalia.language.sentence.SentenceExtractor; import nu.marginalia.model.crawldata.CrawledDomain; import nu.marginalia.tools.LegacyExperiment; import org.jsoup.Jsoup; +import java.nio.file.Files; +import java.nio.file.Path; + public class TopicExperiment extends LegacyExperiment { - RecipeDetector recipeDetector = new RecipeDetector(); - WoodworkingDetector woodworkingDetector = new WoodworkingDetector(); - TextileCraftDetector textileCraftDetector = new TextileCraftDetector(); - GoogleAnwersSpamDetector spamDetector = new 
GoogleAnwersSpamDetector(); + AdHocDetector detector; SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels()); + Path filename = null; + + @SneakyThrows + public void args(String... args) { + filename = Path.of(args[0]); + detector = new AdHocDetector(Files.readAllLines(filename)); + } @Inject public TopicExperiment() { @@ -38,20 +43,11 @@ public class TopicExperiment extends LegacyExperiment { parsed.body().filter(new DomPruningFilter(0.5)); var dld = se.extractSentences(parsed); - if (dld.totalNumWords() < 250) + if (dld.totalNumWords() < 50) continue; - if (textileCraftDetector.testP(dld) > 0.3) { - System.out.println("textilecraft\t" + doc.url); - } - if (woodworkingDetector.testP(dld) > 0.1) { - System.out.println("woodworking\t" + doc.url); - } - if (recipeDetector.testP(dld) > 0.5) { - System.out.println("recipe\t" + doc.url); - } - if (spamDetector.testP(parsed) > 0.5) { - System.out.println("GA spam\t" + doc.url); + if (detector.testP(dld) > 0.5) { + System.out.println("match\t" + doc.url); } }