Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-23 13:09:00 +00:00
(experiment) Add ad-hoc filter runner
parent 8462e88b8f
commit e48f52faba
@@ -0,0 +1,53 @@
package nu.marginalia.converting.processor.classifier.topic;

import ca.rmen.porterstemmer.PorterStemmer;
import nu.marginalia.language.model.DocumentLanguageData;
import org.apache.commons.lang3.StringUtils;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

import static java.lang.Math.max;
import static java.lang.Math.sqrt;

public class AdHocDetector {
    private static final int AVG_LENGTH = 1000;

    private final Map<String, Double> termValues = new HashMap<>();

    public AdHocDetector(List<String> terms) {
        PorterStemmer ps = new PorterStemmer();

        for (String term : terms) {
            String[] parts = StringUtils.split(term, ' ');
            termValues.put(ps.stemWord(parts[0]), Double.parseDouble(parts[1]));
        }
    }

    public double testP(DocumentLanguageData dld) {

        Map<String, Double> values = new HashMap<>();
        int count = 0;
        for (var sentence : dld) {

            for (var stemmed : sentence.stemmedWords) {
                count++;

                final Double value = termValues.get(stemmed);

                if (value != null) {
                    values.merge(stemmed, value, (a,b) -> 0.5*a + b);
                }
            }

        }

        if (count == 0) return 0.;

        double lengthPenalty = sqrt(AVG_LENGTH)/sqrt(max(AVG_LENGTH, count));

        return values.values().stream().mapToDouble(Double::valueOf).sum() * lengthPenalty;
    }

}
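In brief, the new detector maps each Porter-stemmed term to a weight; repeated hits of the same term are merged with 0.5*a + b, so they contribute with diminishing returns (converging to twice the term's weight), and the summed score is scaled by sqrt(AVG_LENGTH)/sqrt(max(AVG_LENGTH, count)): documents of up to 1000 words are not penalized, while, for example, a 4000-word document has its score halved (sqrt(1000/4000) = 0.5). A minimal sketch of direct use follows; the term list and weights are illustrative, not taken from the commit:

    import nu.marginalia.converting.processor.classifier.topic.AdHocDetector;
    import nu.marginalia.language.model.DocumentLanguageData;

    import java.util.List;

    class AdHocDetectorSketch {
        // Hypothetical term weights; each line is "<term> <weight>", and the term
        // is stemmed before it is stored, so inflected forms in the text still match.
        static double scoreDocument(DocumentLanguageData dld) {
            var detector = new AdHocDetector(List.of(
                    "bicycle 0.5",
                    "derailleur 0.8",
                    "handlebars 0.3"));
            return detector.testP(dld);
        }
    }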
@@ -28,7 +28,7 @@ public class ExperimentRunnerMain {

    public static void main(String... args) throws IOException {
        if (args.length < 2) {
            System.err.println("Expected arguments: plan.yaml experiment-name [experiment-args]");
            System.err.println("Expected arguments: crawl-data-path experiment-name [experiment-args]");
            return;
        }

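For orientation, a hedged sketch of how the runner might now be invoked, assuming the topic experiment is registered under the name "topic" (the registration is not part of this diff) and that the first experiment-arg is the path to the term-weight file consumed below by TopicExperiment.args(); both paths are placeholders:

    // Hypothetical invocation; the experiment name and the paths are assumptions.
    ExperimentRunnerMain.main("/path/to/crawl-data", "topic", "/path/to/terms.txt");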
@@ -1,25 +1,30 @@
package nu.marginalia.tools.experiments;

import com.google.inject.Inject;
import lombok.SneakyThrows;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.classifier.adblock.GoogleAnwersSpamDetector;
import nu.marginalia.converting.processor.classifier.topic.RecipeDetector;
import nu.marginalia.converting.processor.classifier.topic.TextileCraftDetector;
import nu.marginalia.converting.processor.classifier.topic.WoodworkingDetector;
import nu.marginalia.converting.processor.classifier.topic.AdHocDetector;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.model.crawldata.CrawledDomain;
import nu.marginalia.tools.LegacyExperiment;
import org.jsoup.Jsoup;

import java.nio.file.Files;
import java.nio.file.Path;

public class TopicExperiment extends LegacyExperiment {

    RecipeDetector recipeDetector = new RecipeDetector();
    WoodworkingDetector woodworkingDetector = new WoodworkingDetector();
    TextileCraftDetector textileCraftDetector = new TextileCraftDetector();
    GoogleAnwersSpamDetector spamDetector = new GoogleAnwersSpamDetector();
    AdHocDetector detector;

    SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
    Path filename = null;

    @SneakyThrows
    public void args(String... args) {
        filename = Path.of(args[0]);
        detector = new AdHocDetector(Files.readAllLines(filename));
    }

    @Inject
    public TopicExperiment() {
@@ -38,20 +43,11 @@ public class TopicExperiment extends LegacyExperiment {
        parsed.body().filter(new DomPruningFilter(0.5));
        var dld = se.extractSentences(parsed);

        if (dld.totalNumWords() < 250)
        if (dld.totalNumWords() < 50)
            continue;

        if (textileCraftDetector.testP(dld) > 0.3) {
            System.out.println("textilecraft\t" + doc.url);
        }
        if (woodworkingDetector.testP(dld) > 0.1) {
            System.out.println("woodworking\t" + doc.url);
        }
        if (recipeDetector.testP(dld) > 0.5) {
            System.out.println("recipe\t" + doc.url);
        }
        if (spamDetector.testP(parsed) > 0.5) {
            System.out.println("GA spam\t" + doc.url);
        if (detector.testP(dld) > 0.5) {
            System.out.println("match\t" + doc.url);
        }

    }
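The file read by args() is handed line-by-line to AdHocDetector, so a term-weight file for this experiment could look like the following (contents are purely illustrative); any document whose accumulated, length-penalized score exceeds 0.5 is printed as a match together with its URL:

    bicycle 0.5
    derailleur 0.8
    handlebars 0.3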