From 9c6e3b177299dc077aebe00e531d8f6e31875ab4 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Wed, 10 Aug 2022 15:04:25 +0200 Subject: [PATCH] Topical detection (experimental), Adblock simulation (experimental) --- .../logic/topic/AdblockSimulator.java | 133 +++++++++++++++ .../logic/{ => topic}/RecipeDetector.java | 4 +- .../logic/topic/TextileCraftDetector.java | 158 ++++++++++++++++++ .../logic/topic/WoodworkingDetector.java | 134 +++++++++++++++ .../edge/crawling/CrawledDomainReader.java | 9 +- .../wmsa/edge/crawling/WorkLog.java | 21 ++- .../wmsa/edge/model/EdgeCrawlPlan.java | 51 +++++- .../wmsa/edge/tools/AdblockTesterTool.java | 58 +++++++ .../wmsa/edge/tools/RecipeDetectorTool.java | 33 +++- 9 files changed, 578 insertions(+), 23 deletions(-) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/AdblockSimulator.java rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/{ => topic}/RecipeDetector.java (98%) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/TextileCraftDetector.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/WoodworkingDetector.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/AdblockTesterTool.java diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/AdblockSimulator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/AdblockSimulator.java new file mode 100644 index 00000000..2ed74810 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/AdblockSimulator.java @@ -0,0 +1,133 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic.topic; + +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; +import org.jsoup.select.NodeFilter; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; +import java.util.function.Predicate; +import java.util.regex.Pattern; + +public class AdblockSimulator { + + List idRules = new ArrayList(); + List classRules = new ArrayList(); + List> scriptRules = new ArrayList(); + + public AdblockSimulator(Path adsDefinition) throws IOException { + try (var lineStream = Files.lines(adsDefinition)) { + lineStream.skip(1).forEach(this::addRule); + } + } + + private void addRule(String s) { + if (s.startsWith("##") && !s.contains(":")) { + if (s.startsWith("###")) { + idRules.add(s.substring(3)); + } else if(s.startsWith("##.")) { + classRules.add(s.substring(3)); + } + } + else if (!s.startsWith("!") && !s.contains("#")){ + scriptRules.add(toRegexMatcher(s)); + } + } + + private Predicate toRegexMatcher(String s) { + + System.out.println("<-" + s); + + s = s.replaceAll("\\?", "\\\\?"); + s = s.replaceAll("\\.", "\\\\."); + s = s.replaceAll("\\$", "\\\\\\$"); + + if (s.startsWith("||")) { + s = s.replaceFirst("\\|\\|","^http(s)?://"); + } + + s = s.replaceAll("\\|", "\\\\|"); + s = s.replaceAll("\\*", ".*"); + s = s.replaceAll("\\^", "[?/]"); + + + System.out.println("->" + s); + return Pattern.compile(s).asPredicate(); + } + + class RuleVisitor implements NodeFilter { + public boolean sawAds; + Pattern spPattern = Pattern.compile("\\s"); + + @Override + public FilterResult head(Node node, int depth) { + + if (node.attributesSize() > 0 && node instanceof Element elem) { // instanceof is slow + + String id = elem.id(); + for (var rule : idRules) { + if (rule.equals(id)) { + sawAds = true; + return FilterResult.STOP; + } + } + + String classes = elem.className(); + if (classes.isBlank()) return FilterResult.CONTINUE; + + if (classes.indexOf(' ') > 0) { + String[] classNames = spPattern.split(classes); + for (var rule : classRules) { + + for (var className : classNames) { + if (className.equals(rule)) { + sawAds = true; + return FilterResult.STOP; + } + } + } + } + else { // tag only has one class + for (var rule : classRules) { + if (classes.equals(rule)) { + sawAds = true; + return FilterResult.STOP; + } + } + } + + if ("script".equals(elem.tagName())) { + String src = elem.attr("src"); + + for (var rule : scriptRules) { + if (rule.test(src)) { + sawAds = true; + return FilterResult.STOP; + } + } + } + + return FilterResult.CONTINUE; + } + return FilterResult.CONTINUE; + } + + @Override + public FilterResult tail(Node node, int depth) { + return FilterResult.CONTINUE; + } + } + + public boolean hasAds(Document document) { + + RuleVisitor ruleVisitor = new RuleVisitor(); + document.filter(ruleVisitor); + + return ruleVisitor.sawAds; + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/RecipeDetector.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/RecipeDetector.java similarity index 98% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/RecipeDetector.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/RecipeDetector.java index 4b77cba2..17f8d992 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/RecipeDetector.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/RecipeDetector.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.converting.processor.logic; +package nu.marginalia.wmsa.edge.converting.processor.logic.topic; import ca.rmen.porterstemmer.PorterStemmer; import nu.marginalia.util.language.processing.model.DocumentLanguageData; @@ -205,7 +205,7 @@ public class RecipeDetector { } - public double recipeP(DocumentLanguageData dld) { + public double testP(DocumentLanguageData dld) { Map values = new HashMap<>(); int count = 0; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/TextileCraftDetector.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/TextileCraftDetector.java new file mode 100644 index 00000000..1146c620 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/TextileCraftDetector.java @@ -0,0 +1,158 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic.topic; + +import ca.rmen.porterstemmer.PorterStemmer; +import nu.marginalia.util.language.processing.model.DocumentLanguageData; + +import java.util.HashMap; +import java.util.Map; + +import static java.lang.Math.max; +import static java.lang.Math.sqrt; + +public class TextileCraftDetector { + private static final int AVG_LENGTH = 1000; + + private final Map termValues = new HashMap<>(); + + public TextileCraftDetector() { + PorterStemmer ps = new PorterStemmer(); + + termValues.put(ps.stemWord("shop"), -0.1); + termValues.put(ps.stemWord("newsletter"), -0.1); + termValues.put(ps.stemWord("cart"), -0.1); + termValues.put(ps.stemWord("item"), -0.025); + termValues.put(ps.stemWord("price"), -0.1); + termValues.put(ps.stemWord("book"), -0.1); + termValues.put(ps.stemWord("order"), -0.1); + termValues.put(ps.stemWord("exhibition"), -0.1); + + termValues.put(ps.stemWord("knit"), 0.05); + termValues.put(ps.stemWord("stitch"), 0.05); + termValues.put(ps.stemWord("yarn"), 0.05); + termValues.put(ps.stemWord("crochet"), 0.05); + termValues.put(ps.stemWord("ravelry"), 0.15); + + termValues.put(ps.stemWord("stockinette"), 0.075); + termValues.put(ps.stemWord("purl"), 0.075); + termValues.put(ps.stemWord("ksp"), 0.075); + termValues.put(ps.stemWord("kwise"), 0.075); + termValues.put(ps.stemWord("k2tog"), 0.075); + termValues.put(ps.stemWord("k1b"), 0.075); + termValues.put(ps.stemWord("psso"), 0.075); + termValues.put(ps.stemWord("p2sso"), 0.075); + termValues.put(ps.stemWord("pwise"), 0.075); + termValues.put(ps.stemWord("yrn"), 0.075); + termValues.put(ps.stemWord("yon"), 0.075); + termValues.put(ps.stemWord("entrelac"), 0.075); + termValues.put(ps.stemWord("thrum"), 0.075); + termValues.put(ps.stemWord("bobbin"), 0.025); + + termValues.put(ps.stemWord("boucle"), 0.075); + termValues.put(ps.stemWord("lopi"), 0.075); + termValues.put(ps.stemWord("eyelash"), 0.01); + termValues.put(ps.stemWord("variegated"), 0.075); + + termValues.put(ps.stemWord("serge"), 0.04); + termValues.put(ps.stemWord("selvage"), 0.075); + termValues.put(ps.stemWord("topstitch"), 0.075); + + termValues.put(ps.stemWord("gauge"), 0.01); + termValues.put(ps.stemWord("design"), 0.01); + termValues.put(ps.stemWord("pattern"), 0.01); + termValues.put(ps.stemWord("layer"), 0.01); + termValues.put(ps.stemWord("color"), 0.01); + termValues.put(ps.stemWord("colour"), 0.01); + termValues.put(ps.stemWord("chart"), 0.01); + termValues.put(ps.stemWord("grid"), 0.01); + termValues.put(ps.stemWord("wool"), 0.01); + termValues.put(ps.stemWord("acrylic"), 0.01); + termValues.put(ps.stemWord("loose"), 0.01); + termValues.put(ps.stemWord("loop"), 0.01); + termValues.put(ps.stemWord("needle"), 0.01); + termValues.put(ps.stemWord("row"), 0.01); + termValues.put(ps.stemWord("circular"), 0.01); + termValues.put(ps.stemWord("sew"), 0.01); + termValues.put(ps.stemWord("size"), 0.01); + termValues.put(ps.stemWord("repeat"), 0.01); + termValues.put(ps.stemWord("repetition"), 0.01); + termValues.put(ps.stemWord("basketweave"), 0.01); + termValues.put(ps.stemWord("weave"), 0.01); + termValues.put(ps.stemWord("loom"), 0.01); + termValues.put(ps.stemWord("warp"), 0.01); + termValues.put(ps.stemWord("weft"), 0.01); + termValues.put(ps.stemWord("shuttle"), 0.01); + termValues.put(ps.stemWord("brioche"), 0.01); + termValues.put(ps.stemWord("spool"), 0.01); + termValues.put(ps.stemWord("hem"), 0.01); + termValues.put(ps.stemWord("bodice"), 0.01); + termValues.put(ps.stemWord("seam"), 0.01); + termValues.put(ps.stemWord("allowance"), 0.01); + termValues.put(ps.stemWord("crinoline"), 0.01); + termValues.put(ps.stemWord("petticoat"), 0.01); + termValues.put(ps.stemWord("armscye"), 0.01); + termValues.put(ps.stemWord("baste"), 0.01); + termValues.put(ps.stemWord("cord"), 0.01); + termValues.put(ps.stemWord("darning"), 0.01); + termValues.put(ps.stemWord("draping"), 0.01); + termValues.put(ps.stemWord("embroider"), 0.01); + termValues.put(ps.stemWord("eyelet"), 0.01); + termValues.put(ps.stemWord("godet"), 0.01); + termValues.put(ps.stemWord("gore"), 0.01); + termValues.put(ps.stemWord("grain"), 0.01); + termValues.put(ps.stemWord("jersey"), 0.01); + termValues.put(ps.stemWord("lining"), 0.01); + termValues.put(ps.stemWord("muslin"), 0.01); + termValues.put(ps.stemWord("needlework"), 0.01); + termValues.put(ps.stemWord("pleat"), 0.01); + termValues.put(ps.stemWord("quilt"), 0.01); + termValues.put(ps.stemWord("silk"), 0.01); + + termValues.put(ps.stemWord("sloper"), 0.01); + termValues.put(ps.stemWord("surplice"), 0.01); + termValues.put(ps.stemWord("thread"), 0.01); + termValues.put(ps.stemWord("twill"), 0.01); + + termValues.put(ps.stemWord("ch"), 0.01); + termValues.put(ps.stemWord("sp"), 0.01); + termValues.put(ps.stemWord("sl"), 0.01); + termValues.put(ps.stemWord("sc"), 0.01); + termValues.put(ps.stemWord("ss"), 0.01); + termValues.put(ps.stemWord("hdc"), 0.01); + termValues.put(ps.stemWord("turn"), 0.01); + termValues.put(ps.stemWord("skip"), 0.01); + termValues.put(ps.stemWord("round"), 0.01); + termValues.put(ps.stemWord("ring"), 0.01); + + termValues.put(ps.stemWord("sequin"), 0.01); + termValues.put(ps.stemWord("bobble"), 0.01); + termValues.put(ps.stemWord("puff"), 0.01); + termValues.put(ps.stemWord("v-stitch"), 0.01); + } + + public double testP(DocumentLanguageData dld) { + + Map values = new HashMap<>(); + int count = 0; + for (var sentence : dld.sentences) { + + for (var word : sentence) { + count++; + + final String stemmed = word.stemmed(); + final Double value = termValues.get(stemmed); + + if (value != null) { + values.merge(stemmed, value, (a,b) -> 0.5*a + b); + } + } + + } + + if (count == 0) return 0.; + + double lengthPenalty = sqrt(AVG_LENGTH)/sqrt(max(AVG_LENGTH, count)); + + return values.values().stream().mapToDouble(Double::valueOf).sum() * lengthPenalty; + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/WoodworkingDetector.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/WoodworkingDetector.java new file mode 100644 index 00000000..bb4a0cd0 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/WoodworkingDetector.java @@ -0,0 +1,134 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic.topic; + +import ca.rmen.porterstemmer.PorterStemmer; +import nu.marginalia.util.language.processing.model.DocumentLanguageData; + +import java.util.HashMap; +import java.util.Map; + +import static java.lang.Math.max; +import static java.lang.Math.sqrt; + +public class WoodworkingDetector { + private static final int AVG_LENGTH = 1000; + + private final Map termValues = new HashMap<>(); + + public WoodworkingDetector() { + PorterStemmer ps = new PorterStemmer(); + + termValues.put(ps.stemWord("shop"), -0.1); + termValues.put(ps.stemWord("newsletter"), -0.1); + termValues.put(ps.stemWord("cart"), -0.1); + termValues.put(ps.stemWord("item"), -0.025); + termValues.put(ps.stemWord("price"), -0.1); + termValues.put(ps.stemWord("book"), -0.1); + termValues.put(ps.stemWord("order"), -0.1); + termValues.put(ps.stemWord("exhibition"), -0.1); + + // woodworking and joinery + termValues.put(ps.stemWord("apse"), 0.01); + termValues.put(ps.stemWord("baluster"), 0.01); + termValues.put(ps.stemWord("beam"), 0.01); + termValues.put(ps.stemWord("cornice"), 0.01); + termValues.put(ps.stemWord("drill"), 0.01); + termValues.put(ps.stemWord("nail"), 0.01); + termValues.put(ps.stemWord("saw"), 0.01); + termValues.put(ps.stemWord("hacksaw"), 0.01); + termValues.put(ps.stemWord("bandsaw"), 0.01); + termValues.put(ps.stemWord("whipsaw"), 0.01); + termValues.put(ps.stemWord("gimlet"), 0.01); + termValues.put(ps.stemWord("clamp"), 0.01); + termValues.put(ps.stemWord("glue"), 0.01); + termValues.put(ps.stemWord("cut"), 0.01); + termValues.put(ps.stemWord("plane"), 0.01); + termValues.put(ps.stemWord("sand"), 0.01); + termValues.put(ps.stemWord("bevel"), 0.01); + termValues.put(ps.stemWord("chamfer"), 0.01); + termValues.put(ps.stemWord("dado"), 0.075); + termValues.put(ps.stemWord("dowel"), 0.05); + termValues.put(ps.stemWord("dovetail"), 0.05); + termValues.put(ps.stemWord("joint"), 0.01); + termValues.put(ps.stemWord("level"), 0.01); + termValues.put(ps.stemWord("edge"), 0.01); + termValues.put(ps.stemWord("face"), 0.01); + termValues.put(ps.stemWord("fibreboard"), 0.01); + termValues.put(ps.stemWord("fiberboard"), 0.01); + termValues.put(ps.stemWord("battens"), 0.01); + termValues.put(ps.stemWord("furring"), 0.01); + termValues.put(ps.stemWord("glulam"), 0.025); + termValues.put(ps.stemWord("hardboard"), 0.025); + termValues.put(ps.stemWord("hardwood"), 0.01); + termValues.put(ps.stemWord("jamb"), 0.015); + termValues.put(ps.stemWord("kerf"), 0.025); + termValues.put(ps.stemWord("lvl"), 0.025); + termValues.put(ps.stemWord("laminated"), 0.01); + termValues.put(ps.stemWord("lignin"), 0.01); + termValues.put(ps.stemWord("mitre"), 0.01); + termValues.put(ps.stemWord("mortise"), 0.015); + termValues.put(ps.stemWord("mullion"), 0.01); + termValues.put(ps.stemWord("newel"), 0.01); + termValues.put(ps.stemWord("nogging"), 0.01); + termValues.put(ps.stemWord("ogee"), 0.01); + termValues.put(ps.stemWord("ogive"), 0.01); + termValues.put(ps.stemWord("ovolo"), 0.01); + termValues.put(ps.stemWord("drawknife"), 0.01); + termValues.put(ps.stemWord("plywood"), 0.01); + termValues.put(ps.stemWord("purlin"), 0.01); + termValues.put(ps.stemWord("riser"), 0.01); + termValues.put(ps.stemWord("sapwood"), 0.01); + termValues.put(ps.stemWord("shingle"), 0.01); + termValues.put(ps.stemWord("softwood"), 0.01); + termValues.put(ps.stemWord("sapwood"), 0.01); + termValues.put(ps.stemWord("stave"), 0.01); + termValues.put(ps.stemWord("stopper"), 0.01); + termValues.put(ps.stemWord("stud"), 0.01); // beep beep beep, huh, the stud detector seems to work just well :D + termValues.put(ps.stemWord("transom"), 0.01); + termValues.put(ps.stemWord("v-joint"), 0.015); + termValues.put(ps.stemWord("veneer"), 0.01); + termValues.put(ps.stemWord("quartersaw"), 0.015); + termValues.put(ps.stemWord("screw"), 0.01); + termValues.put(ps.stemWord("woodturning"), 0.01); + + termValues.put(ps.stemWord("pine"), 0.005); + termValues.put(ps.stemWord("balsa"), 0.01); + termValues.put(ps.stemWord("poplar"), 0.005); + + termValues.put(ps.stemWord("nut"), 0.01); + termValues.put(ps.stemWord("bolt"), 0.01); + termValues.put(ps.stemWord("tack"), 0.01); + termValues.put(ps.stemWord("hinge"), 0.01); + termValues.put(ps.stemWord("brass"), 0.01); + termValues.put(ps.stemWord("fitting"), 0.01); + + termValues.put(ps.stemWord("diy"), 0.015); + termValues.put(ps.stemWord("dozuki"), 0.01); + } + + public double testP(DocumentLanguageData dld) { + + Map values = new HashMap<>(); + int count = 0; + for (var sentence : dld.sentences) { + + for (var word : sentence) { + count++; + + final String stemmed = word.stemmed(); + final Double value = termValues.get(stemmed); + + if (value != null) { + values.merge(stemmed, value, (a,b) -> 0.5*a + b); + } + } + + } + + if (count == 0) return 0.; + + double lengthPenalty = sqrt(AVG_LENGTH)/sqrt(max(AVG_LENGTH, count)); + + return values.values().stream().mapToDouble(Double::valueOf).sum() * lengthPenalty; + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawledDomainReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawledDomainReader.java index 0bd36d06..21b80993 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawledDomainReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawledDomainReader.java @@ -22,5 +22,12 @@ public class CrawledDomainReader { return gson.fromJson(br, CrawledDomain.class); } } - + public CrawledDomain readRuntimeExcept(Path path) { + try (var br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new BufferedInputStream(new FileInputStream(path.toFile())))))) { + return gson.fromJson(br, CrawledDomain.class); + } + catch (Exception ex) { + throw new RuntimeException(ex); + } + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/WorkLog.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/WorkLog.java index 276d3651..fb5bf5b2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/WorkLog.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/WorkLog.java @@ -1,9 +1,11 @@ package nu.marginalia.wmsa.edge.crawling; +import com.google.errorprone.annotations.MustBeClosed; import nu.marginalia.wmsa.edge.crawling.model.CrawlLogEntry; import org.apache.logging.log4j.util.Strings; -import java.io.*; +import java.io.FileOutputStream; +import java.io.IOException; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; @@ -12,6 +14,7 @@ import java.util.HashSet; import java.util.Set; import java.util.function.Consumer; import java.util.regex.Pattern; +import java.util.stream.Stream; public class WorkLog implements AutoCloseable { private final Set finishedJobs = new HashSet<>(); @@ -29,15 +32,21 @@ public class WorkLog implements AutoCloseable { return; } - try (var lines = Files.lines(logFile)) { - lines.filter(WorkLog::isJobId).map(line -> { - String[] parts = line.split("\\s+"); - return new CrawlLogEntry(parts[0], parts[1], parts[2], Integer.parseInt(parts[3])); - }).forEach(entryConsumer); + try (var entries = streamLog(logFile)) { + entries.forEach(entryConsumer); } catch (IOException e) { e.printStackTrace(); } } + + @MustBeClosed + public static Stream streamLog(Path logFile) throws IOException { + return Files.lines(logFile).filter(WorkLog::isJobId).map(line -> { + String[] parts = line.split("\\s+"); + return new CrawlLogEntry(parts[0], parts[1], parts[2], Integer.parseInt(parts[3])); + }); + } + private void loadLog(Path logFile) throws IOException { if (!Files.exists(logFile)) { return; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeCrawlPlan.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeCrawlPlan.java index 3515c48a..926b9d74 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeCrawlPlan.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeCrawlPlan.java @@ -1,15 +1,20 @@ package nu.marginalia.wmsa.edge.model; +import com.google.errorprone.annotations.MustBeClosed; import lombok.AllArgsConstructor; import lombok.NoArgsConstructor; import lombok.ToString; import nu.marginalia.wmsa.edge.crawling.CrawledDomainReader; import nu.marginalia.wmsa.edge.crawling.WorkLog; +import nu.marginalia.wmsa.edge.crawling.model.CrawlLogEntry; import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain; +import org.jetbrains.annotations.NotNull; import java.io.IOException; import java.nio.file.Path; +import java.util.Iterator; import java.util.function.Consumer; +import java.util.stream.Stream; @AllArgsConstructor @NoArgsConstructor @ToString public class EdgeCrawlPlan { @@ -49,12 +54,44 @@ public class EdgeCrawlPlan { public void forEachCrawledDomain(Consumer consumer) { final CrawledDomainReader reader = new CrawledDomainReader(); - WorkLog.readLog(crawl.getLogFile(), entry -> { - try { - consumer.accept(reader.read(getCrawledFilePath(entry.path()))); - } catch (IOException e) { - throw new RuntimeException(e); - } - }); + try (Stream entryStream = WorkLog.streamLog(crawl.getLogFile())) { + entryStream + .map(CrawlLogEntry::path) + .map(this::getCrawledFilePath) + .map(reader::readRuntimeExcept) + .forEach(consumer); + } + catch (IOException ex) { + throw new RuntimeException(ex); + } + } + + @MustBeClosed + public DomainsIterable domainsIterable() throws IOException { + return new DomainsIterable(); + } + + public class DomainsIterable implements Iterable, AutoCloseable { + private final Stream stream; + + DomainsIterable() throws IOException { + final CrawledDomainReader reader = new CrawledDomainReader(); + + stream = WorkLog.streamLog(crawl.getLogFile()) + .map(CrawlLogEntry::path) + .map(EdgeCrawlPlan.this::getCrawledFilePath) + .map(reader::readRuntimeExcept); + } + + @Override + public void close() { + stream.close(); + } + + @NotNull + @Override + public Iterator iterator() { + return stream.iterator(); + } } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/AdblockTesterTool.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/AdblockTesterTool.java new file mode 100644 index 00000000..fde620c3 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/AdblockTesterTool.java @@ -0,0 +1,58 @@ +package nu.marginalia.wmsa.edge.tools; + +import nu.marginalia.wmsa.edge.converting.processor.logic.topic.AdblockSimulator; +import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader; +import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument; +import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain; +import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; + +import java.io.IOException; +import java.nio.file.Path; + +import static nu.marginalia.wmsa.edge.converting.processor.DocumentProcessor.isAcceptedContentType; + +public class AdblockTesterTool { + + static AdblockSimulator simulator; + + static { + try { + simulator = new AdblockSimulator(Path.of("/home/vlofgren/easylist.txt")); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + + public static void main(String... args) throws IOException { + EdgeCrawlPlan plan = new CrawlPlanLoader().load(Path.of(args[0])); + + + try (var iterable = plan.domainsIterable()) { + for (var domain : iterable) { + processDomain(domain); + } + } + + } + + private static void processDomain(CrawledDomain domain) { + if (domain.doc == null) return; + for (var doc : domain.doc) { + if (isAcceptedContentType(doc) && "OK".equals(doc.crawlerStatus)) { + processDocument(doc); + } + } + } + + + private static void processDocument(CrawledDocument doc) { + Document parsedDocument = Jsoup.parse(doc.documentBody); + + if (simulator.hasAds(parsedDocument)) { + System.out.println(doc.url); + } + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/RecipeDetectorTool.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/RecipeDetectorTool.java index 480e85b8..e1b6c822 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/RecipeDetectorTool.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/RecipeDetectorTool.java @@ -5,9 +5,10 @@ import nu.marginalia.util.language.processing.SentenceExtractor; import nu.marginalia.util.language.processing.model.DocumentLanguageData; import nu.marginalia.wmsa.configuration.WmsaHome; import nu.marginalia.wmsa.configuration.module.DatabaseModule; -import nu.marginalia.wmsa.edge.converting.processor.logic.RecipeDetector; +import nu.marginalia.wmsa.edge.converting.processor.logic.topic.RecipeDetector; +import nu.marginalia.wmsa.edge.converting.processor.logic.topic.TextileCraftDetector; +import nu.marginalia.wmsa.edge.converting.processor.logic.topic.WoodworkingDetector; import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader; -import nu.marginalia.wmsa.edge.crawling.CrawledDomainReader; import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument; import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain; import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan; @@ -25,8 +26,10 @@ import java.util.concurrent.TimeUnit; import static nu.marginalia.wmsa.edge.converting.processor.DocumentProcessor.isAcceptedContentType; public class RecipeDetectorTool { - private static final CrawledDomainReader reader = new CrawledDomainReader(); - private static final RecipeDetector detector = new RecipeDetector(); + private static final TextileCraftDetector textileCraftDetector = new TextileCraftDetector(); + private static final WoodworkingDetector woodworkingDetector = new WoodworkingDetector(); + private static final RecipeDetector recipeDetector = new RecipeDetector(); + private static final LanguageModels lm = WmsaHome.getLanguageModels(); private static final SentenceExtractor sentenceExtractor = new SentenceExtractor(lm); @@ -49,7 +52,12 @@ public class RecipeDetectorTool { } ForkJoinPool pool = new ForkJoinPool(16); - plan.forEachCrawledDomain(data -> pool.execute(() -> processDomain(data))); + + try (var iterable = plan.domainsIterable()) { + for (var domain : iterable) { + pool.execute(() -> processDomain(domain)); + } + } while (!pool.awaitQuiescence(1, TimeUnit.HOURS)); } @@ -74,9 +82,20 @@ public class RecipeDetectorTool { parsedDocument.getElementsByTag("nav").remove(); DocumentLanguageData dld = sentenceExtractor.extractSentences(parsedDocument); - double prob = 100*detector.recipeP(dld); + + double prob = 100*recipeDetector.testP(dld); if (prob > 50) { - System.out.printf("%3.2f\t%s\n", prob, doc.url); + System.out.printf("#%3.2f recipe\t%s\n%s\n", prob, parsedDocument.title(), doc.url); + } + + prob = 100*woodworkingDetector.testP(dld); + if (prob > 20) { + System.out.printf("#%3.2f woodworking\t%s\n%s\n", prob, parsedDocument.title(), doc.url); + } + + prob = 100*textileCraftDetector.testP(dld); + if (prob > 20) { + System.out.printf("#%3.2f textilecraft\t%s\n%s\n", prob, parsedDocument.title(), doc.url); } } }