mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
(experiment) Add ad-hoc filter runner
This commit is contained in:
parent
8462e88b8f
commit
e48f52faba
@ -0,0 +1,53 @@
|
|||||||
|
package nu.marginalia.converting.processor.classifier.topic;
|
||||||
|
|
||||||
|
import ca.rmen.porterstemmer.PorterStemmer;
|
||||||
|
import nu.marginalia.language.model.DocumentLanguageData;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import static java.lang.Math.max;
|
||||||
|
import static java.lang.Math.sqrt;
|
||||||
|
|
||||||
|
public class AdHocDetector {
|
||||||
|
private static final int AVG_LENGTH = 1000;
|
||||||
|
|
||||||
|
private final Map<String, Double> termValues = new HashMap<>();
|
||||||
|
|
||||||
|
public AdHocDetector(List<String> terms) {
|
||||||
|
PorterStemmer ps = new PorterStemmer();
|
||||||
|
|
||||||
|
for (String term : terms) {
|
||||||
|
String[] parts = StringUtils.split(term, ' ');
|
||||||
|
termValues.put(ps.stemWord(parts[0]), Double.parseDouble(parts[1]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public double testP(DocumentLanguageData dld) {
|
||||||
|
|
||||||
|
Map<String, Double> values = new HashMap<>();
|
||||||
|
int count = 0;
|
||||||
|
for (var sentence : dld) {
|
||||||
|
|
||||||
|
for (var stemmed : sentence.stemmedWords) {
|
||||||
|
count++;
|
||||||
|
|
||||||
|
final Double value = termValues.get(stemmed);
|
||||||
|
|
||||||
|
if (value != null) {
|
||||||
|
values.merge(stemmed, value, (a,b) -> 0.5*a + b);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
if (count == 0) return 0.;
|
||||||
|
|
||||||
|
double lengthPenalty = sqrt(AVG_LENGTH)/sqrt(max(AVG_LENGTH, count));
|
||||||
|
|
||||||
|
return values.values().stream().mapToDouble(Double::valueOf).sum() * lengthPenalty;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -28,7 +28,7 @@ public class ExperimentRunnerMain {
|
|||||||
|
|
||||||
public static void main(String... args) throws IOException {
|
public static void main(String... args) throws IOException {
|
||||||
if (args.length < 2) {
|
if (args.length < 2) {
|
||||||
System.err.println("Expected arguments: plan.yaml experiment-name [experiment-args]");
|
System.err.println("Expected arguments: crawl-data-path experiment-name [experiment-args]");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,25 +1,30 @@
|
|||||||
package nu.marginalia.tools.experiments;
|
package nu.marginalia.tools.experiments;
|
||||||
|
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
|
import lombok.SneakyThrows;
|
||||||
import nu.marginalia.WmsaHome;
|
import nu.marginalia.WmsaHome;
|
||||||
import nu.marginalia.converting.processor.classifier.adblock.GoogleAnwersSpamDetector;
|
import nu.marginalia.converting.processor.classifier.topic.AdHocDetector;
|
||||||
import nu.marginalia.converting.processor.classifier.topic.RecipeDetector;
|
|
||||||
import nu.marginalia.converting.processor.classifier.topic.TextileCraftDetector;
|
|
||||||
import nu.marginalia.converting.processor.classifier.topic.WoodworkingDetector;
|
|
||||||
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
|
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
|
||||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||||
import nu.marginalia.model.crawldata.CrawledDomain;
|
import nu.marginalia.model.crawldata.CrawledDomain;
|
||||||
import nu.marginalia.tools.LegacyExperiment;
|
import nu.marginalia.tools.LegacyExperiment;
|
||||||
import org.jsoup.Jsoup;
|
import org.jsoup.Jsoup;
|
||||||
|
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
|
||||||
public class TopicExperiment extends LegacyExperiment {
|
public class TopicExperiment extends LegacyExperiment {
|
||||||
|
|
||||||
RecipeDetector recipeDetector = new RecipeDetector();
|
AdHocDetector detector;
|
||||||
WoodworkingDetector woodworkingDetector = new WoodworkingDetector();
|
|
||||||
TextileCraftDetector textileCraftDetector = new TextileCraftDetector();
|
|
||||||
GoogleAnwersSpamDetector spamDetector = new GoogleAnwersSpamDetector();
|
|
||||||
|
|
||||||
SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
|
SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
|
||||||
|
Path filename = null;
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
public void args(String... args) {
|
||||||
|
filename = Path.of(args[0]);
|
||||||
|
detector = new AdHocDetector(Files.readAllLines(filename));
|
||||||
|
}
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public TopicExperiment() {
|
public TopicExperiment() {
|
||||||
@ -38,20 +43,11 @@ public class TopicExperiment extends LegacyExperiment {
|
|||||||
parsed.body().filter(new DomPruningFilter(0.5));
|
parsed.body().filter(new DomPruningFilter(0.5));
|
||||||
var dld = se.extractSentences(parsed);
|
var dld = se.extractSentences(parsed);
|
||||||
|
|
||||||
if (dld.totalNumWords() < 250)
|
if (dld.totalNumWords() < 50)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
if (textileCraftDetector.testP(dld) > 0.3) {
|
if (detector.testP(dld) > 0.5) {
|
||||||
System.out.println("textilecraft\t" + doc.url);
|
System.out.println("match\t" + doc.url);
|
||||||
}
|
|
||||||
if (woodworkingDetector.testP(dld) > 0.1) {
|
|
||||||
System.out.println("woodworking\t" + doc.url);
|
|
||||||
}
|
|
||||||
if (recipeDetector.testP(dld) > 0.5) {
|
|
||||||
System.out.println("recipe\t" + doc.url);
|
|
||||||
}
|
|
||||||
if (spamDetector.testP(parsed) > 0.5) {
|
|
||||||
System.out.println("GA spam\t" + doc.url);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user