From 0dfcf2f7af7de8f22ee01dea474fa04705ff3abf Mon Sep 17 00:00:00 2001
From: vlofgren
Date: Mon, 8 Aug 2022 15:18:04 +0200
Subject: [PATCH] Recipe detection

Add RecipeDetector, a stem-weighted keyword heuristic for recipe pages,
together with two command line tools: RecipeDetectorTool scores the
documents of a crawl plan, RecipesLoaderTool tags a list of URLs with the
new CATEGORY_FOOD feature. EdgeCrawlPlan gains forEachCrawledDomain, and
DocumentProcessor.isAcceptedContentType becomes public static so that the
tools can call it.

---
Review notes (ignored by git-am): hunk sizes and the diffstat below match
the reviewed sources; the blob ids on the "index" lines are those of the
original submission and were not recomputed.

 .../processor/DocumentProcessor.java          |   2 +-
 .../processor/logic/HtmlFeature.java          |   9 +-
 .../processor/logic/RecipeDetector.java       | 253 ++++++++++++++++++
 .../wmsa/edge/model/EdgeCrawlPlan.java        |  27 +-
 .../wmsa/edge/tools/RecipeDetectorTool.java   |  97 +++++++
 .../wmsa/edge/tools/RecipesLoaderTool.java    | 104 +++++++
 6 files changed, 489 insertions(+), 3 deletions(-)
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/RecipeDetector.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/RecipeDetectorTool.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/RecipesLoaderTool.java

diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java
index f752ef80..9107f62c 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java
@@ -112,7 +112,7 @@ public class DocumentProcessor {
         return ret;
     }
 
-    private boolean isAcceptedContentType(CrawledDocument crawledDocument) {
+    public static boolean isAcceptedContentType(CrawledDocument crawledDocument) {
         if (crawledDocument.contentType == null) {
             return false;
         }
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java
index 032315dd..c7ef9fd8 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java
@@ -7,7 +7,9 @@ public enum HtmlFeature {
     JS("special:scripts"),
     AFFILIATE_LINK( "special:affiliate"),
     TRACKING("special:tracking"),
-    COOKIES("special:cookies")
+    COOKIES("special:cookies"),
+
+    CATEGORY_FOOD("category:food"),
     ;
 
     private final String keyword;
@@ -31,4 +33,9 @@ public enum HtmlFeature {
     public static boolean hasFeature(int value, HtmlFeature feature) {
         return (value & (1<< feature.ordinal())) != 0;
     }
+
+    /** Returns the bit mask of this feature, i.e. the bit that hasFeature tests for it. */
+    public int getFeatureBit() {
+        return (1<< ordinal());
+    }
 }
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/RecipeDetector.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/RecipeDetector.java
new file mode 100644
index 00000000..4b77cba2
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/RecipeDetector.java
@@ -0,0 +1,253 @@
+package nu.marginalia.wmsa.edge.converting.processor.logic;
+
+import ca.rmen.porterstemmer.PorterStemmer;
+import nu.marginalia.util.language.processing.model.DocumentLanguageData;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import static java.lang.Math.max;
+import static java.lang.Math.sqrt;
+
+/**
+ * Keyword heuristic that scores how much a document reads like a recipe.
+ * Terms are keyed by their Porter stem; cooking vocabulary carries a positive
+ * weight, while menu, shop and newsletter vocabulary carries a negative one.
+ */
+public class RecipeDetector {
+    /** Word count up to which a document receives no length penalty. */
+    private static final int AVG_RECIPE_LENGTH = 250;
+
+    /** Weight of each known term, keyed by the term's Porter stem. */
+    private final Map<String, Double> termValues = new HashMap<>();
+
+    /** Fills the term table; every term is stemmed before it is stored. */
+    public RecipeDetector() {
+        PorterStemmer ps = new PorterStemmer();
+
+        // these terms appear in most recipes
+        termValues.put(ps.stemWord("ingredients"), 0.3);
+        termValues.put(ps.stemWord("recipe"), 0.1);
+        termValues.put(ps.stemWord("preparations"), 0.1);
+        termValues.put(ps.stemWord("instructions"), 0.1);
+
+        // penalize restaurant menus
+        termValues.put(ps.stemWord("menu"), -0.5);
+
+        // penalize e-commerce and mailing-list vocabulary (orig. "error non habet ius")
+        termValues.put(ps.stemWord("email"), -0.15);
+        termValues.put(ps.stemWord("checkout"), -0.15);
+        termValues.put(ps.stemWord("reviews"), -0.15);
+        termValues.put(ps.stemWord("newsletter"), -0.15);
+
+        // measures
+        termValues.put(ps.stemWord("dl"), 0.05);
+        termValues.put(ps.stemWord("l"), 0.05);
+        termValues.put(ps.stemWord("g"), 0.05);
+        termValues.put(ps.stemWord("ml"), 0.05);
+        termValues.put(ps.stemWord("tsp"), 0.05);
+        termValues.put(ps.stemWord("teaspoons"), 0.05);
+        termValues.put(ps.stemWord("tbsp"), 0.05);
+        termValues.put(ps.stemWord("tablespoons"), 0.05);
+        termValues.put(ps.stemWord("cups"), 0.05);
+        termValues.put(ps.stemWord("quarts"), 0.05);
+        termValues.put(ps.stemWord("pints"), 0.05);
+
+        // techniques
+        termValues.put(ps.stemWord("grate"), 0.05);
+        termValues.put(ps.stemWord("cut"), 0.05);
+        termValues.put(ps.stemWord("peel"), 0.05);
+        termValues.put(ps.stemWord("chop"), 0.05);
+        termValues.put(ps.stemWord("slice"), 0.05);
+        termValues.put(ps.stemWord("debone"), 0.05);
+        termValues.put(ps.stemWord("julienne"), 0.05);
+        termValues.put(ps.stemWord("saute"), 0.05);
+        termValues.put(ps.stemWord("fry"), 0.05);
+        termValues.put(ps.stemWord("boil"), 0.05);
+        termValues.put(ps.stemWord("parboil"), 0.05);
+        termValues.put(ps.stemWord("roast"), 0.05);
+        termValues.put(ps.stemWord("grill"), 0.05);
+        termValues.put(ps.stemWord("sear"), 0.05);
+        termValues.put(ps.stemWord("heat"), 0.05);
+        termValues.put(ps.stemWord("dice"), 0.05);
+        termValues.put(ps.stemWord("bake"), 0.05);
+        termValues.put(ps.stemWord("strain"), 0.05);
+        termValues.put(ps.stemWord("melt"), 0.05);
+        termValues.put(ps.stemWord("garnish"), 0.05);
+        termValues.put(ps.stemWord("preheat"), 0.05);
+        termValues.put(ps.stemWord("sprinkle"), 0.05);
+        termValues.put(ps.stemWord("spritz"), 0.05);
+
+        // utensils
+        termValues.put(ps.stemWord("colander"), 0.05);
+        termValues.put(ps.stemWord("pot"), 0.05);
+        termValues.put(ps.stemWord("pan"), 0.05);
+        termValues.put(ps.stemWord("oven"), 0.05);
+        termValues.put(ps.stemWord("stove"), 0.05);
+        termValues.put(ps.stemWord("skillet"), 0.05);
+        termValues.put(ps.stemWord("wok"), 0.05);
+        termValues.put(ps.stemWord("knife"), 0.05);
+        termValues.put(ps.stemWord("grater"), 0.05);
+
+        // baking
+        termValues.put(ps.stemWord("yeast"), 0.025);
+        termValues.put(ps.stemWord("sourdough"), 0.025);
+        termValues.put(ps.stemWord("flour"), 0.025);
+        termValues.put(ps.stemWord("sugar"), 0.025);
+        termValues.put(ps.stemWord("rye"), 0.025);
+        termValues.put(ps.stemWord("wheat"), 0.025);
+        termValues.put(ps.stemWord("dough"), 0.025);
+        termValues.put(ps.stemWord("rise"), 0.025);
+
+        // vegetables
+        termValues.put(ps.stemWord("lettuce"), 0.025);
+        termValues.put(ps.stemWord("onions"), 0.025);
+        termValues.put(ps.stemWord("parsnips"), 0.025);
+        termValues.put(ps.stemWord("beets"), 0.025);
+        termValues.put(ps.stemWord("carrots"), 0.025);
+        termValues.put(ps.stemWord("chilies"), 0.025);
+        termValues.put(ps.stemWord("peppers"), 0.025);
+        termValues.put(ps.stemWord("chives"), 0.025);
+        termValues.put(ps.stemWord("tomatoes"), 0.025);
+        termValues.put(ps.stemWord("salad"), 0.025);
+        termValues.put(ps.stemWord("leeks"), 0.025);
+        termValues.put(ps.stemWord("shallots"), 0.025);
+        termValues.put(ps.stemWord("avocado"), 0.025);
+        termValues.put(ps.stemWord("asparagus"), 0.025);
+        termValues.put(ps.stemWord("cucumbers"), 0.025);
+        termValues.put(ps.stemWord("eggplants"), 0.025);
+        termValues.put(ps.stemWord("broccoli"), 0.025);
+        termValues.put(ps.stemWord("kale"), 0.05);
+
+        termValues.put(ps.stemWord("jalapeno"), 0.025);
+        termValues.put(ps.stemWord("habanero"), 0.025);
+
+        termValues.put(ps.stemWord("mushrooms"), 0.025);
+        termValues.put(ps.stemWord("shiitake"), 0.025);
+        termValues.put(ps.stemWord("chanterelles"), 0.025);
+
+        // protein
+        termValues.put(ps.stemWord("meat"), 0.025);
+        termValues.put(ps.stemWord("beef"), 0.025);
+        termValues.put(ps.stemWord("chicken"), 0.025);
+        termValues.put(ps.stemWord("turkey"), 0.025);
+        termValues.put(ps.stemWord("cheese"), 0.025);
+        termValues.put(ps.stemWord("pork"), 0.025);
+        termValues.put(ps.stemWord("tofu"), 0.025);
+        termValues.put(ps.stemWord("salmon"), 0.025);
+        termValues.put(ps.stemWord("cod"), 0.025);
+        termValues.put(ps.stemWord("veal"), 0.025);
+        termValues.put(ps.stemWord("eggs"), 0.025);
+        termValues.put(ps.stemWord("lentils"), 0.025);
+        termValues.put(ps.stemWord("chickpeas"), 0.025);
+
+        // carbs
+        termValues.put(ps.stemWord("rice"), 0.025);
+        termValues.put(ps.stemWord("noodles"), 0.025);
+        termValues.put(ps.stemWord("beans"), 0.025);
+        termValues.put(ps.stemWord("ramen"), 0.025);
+
+        // japan
+        termValues.put(ps.stemWord("miso"), 0.025);
+        termValues.put(ps.stemWord("natto"), 0.025);
+        termValues.put(ps.stemWord("udon"), 0.025);
+        termValues.put(ps.stemWord("soba"), 0.025);
+        termValues.put(ps.stemWord("shichimi"), 0.025);
+        termValues.put(ps.stemWord("nori"), 0.025);
+
+        // korea
+        termValues.put(ps.stemWord("kimchi"), 0.025);
+
+        // fat of the land
+        termValues.put(ps.stemWord("salt"), 0.025);
+        termValues.put(ps.stemWord("oil"), 0.025);
+        termValues.put(ps.stemWord("olive"), 0.025);
+        termValues.put(ps.stemWord("feta"), 0.025);
+        termValues.put(ps.stemWord("parmesan"), 0.025);
+        termValues.put(ps.stemWord("mozzarella"), 0.025);
+        termValues.put(ps.stemWord("gouda"), 0.025);
+        termValues.put(ps.stemWord("mayonnaise"), 0.025);
+        termValues.put(ps.stemWord("butter"), 0.025);
+
+        // spices and sauces
+        termValues.put(ps.stemWord("pepper"), 0.025);
+        termValues.put(ps.stemWord("garlic"), 0.025);
+        termValues.put(ps.stemWord("sriracha"), 0.025);
+        termValues.put(ps.stemWord("sambal"), 0.025);
+        termValues.put(ps.stemWord("soy"), 0.025);
+        termValues.put(ps.stemWord("cumin"), 0.025);
+        termValues.put(ps.stemWord("thyme"), 0.025);
+        termValues.put(ps.stemWord("basil"), 0.025);
+        termValues.put(ps.stemWord("oregano"), 0.025);
+        termValues.put(ps.stemWord("cilantro"), 0.025);
+        termValues.put(ps.stemWord("ginger"), 0.025);
+        termValues.put(ps.stemWord("curry"), 0.025);
+
+        termValues.put(ps.stemWord("water"), 0.025);
+
+        // dessert
+        termValues.put(ps.stemWord("lemons"), 0.025);
+        termValues.put(ps.stemWord("melons"), 0.025);
+        termValues.put(ps.stemWord("cherries"), 0.025);
+        termValues.put(ps.stemWord("apples"), 0.025);
+        termValues.put(ps.stemWord("pears"), 0.025);
+
+        termValues.put(ps.stemWord("chocolate"), 0.025);
+        termValues.put(ps.stemWord("vanilla"), 0.025);
+
+        // dairy
+        termValues.put(ps.stemWord("milk"), 0.025);
+        termValues.put(ps.stemWord("creamer"), 0.025);
+        termValues.put(ps.stemWord("quark"), 0.025);
+        termValues.put(ps.stemWord("cream"), 0.025);
+
+
+        // dishes
+        termValues.put(ps.stemWord("cake"), 0.025);
+        termValues.put(ps.stemWord("pie"), 0.025);
+        termValues.put(ps.stemWord("crust"), 0.025);
+        termValues.put(ps.stemWord("bread"), 0.025);
+        termValues.put(ps.stemWord("omelet"), 0.025);
+        termValues.put(ps.stemWord("soup"), 0.025);
+
+    }
+
+    /**
+     * Scores a document against the term table.
+     *
+     * Each distinct known stem contributes its weight once, no matter how often
+     * it occurs. For documents longer than AVG_RECIPE_LENGTH words the sum is
+     * multiplied by sqrt(AVG_RECIPE_LENGTH / wordCount), which lowers the score
+     * of long documents.
+     *
+     * @param dld sentences of the document; each word exposes its stemmed form
+     * @return the weighted sum, or 0 when the document has no words; the value
+     *         is not clamped, so it can be negative or greater than 1
+     */
+    public double recipeP(DocumentLanguageData dld) {
+
+        Map<String, Double> values = new HashMap<>();
+        int count = 0;
+        for (var sentence : dld.sentences) {
+
+            for (var word : sentence) {
+                count++;
+
+                final String stemmed = word.stemmed();
+                final Double value = termValues.get(stemmed);
+
+                if (value != null) {
+                    values.put(stemmed, value);
+                }
+            }
+
+        }
+
+        if (count == 0) return 0.;
+
+        double lengthPenalty = sqrt(AVG_RECIPE_LENGTH)/sqrt(max(AVG_RECIPE_LENGTH, count));
+
+        return values.values().stream().mapToDouble(Double::doubleValue).sum() * lengthPenalty;
+    }
+
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeCrawlPlan.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeCrawlPlan.java
index 264c1051..3515c48a 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeCrawlPlan.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeCrawlPlan.java
@@ -1,8 +1,15 @@
 package nu.marginalia.wmsa.edge.model;
 
-import lombok.*;
+import lombok.AllArgsConstructor;
+import lombok.NoArgsConstructor;
+import lombok.ToString;
+import nu.marginalia.wmsa.edge.crawling.CrawledDomainReader;
+import nu.marginalia.wmsa.edge.crawling.WorkLog;
+import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
 
+import java.io.IOException;
 import java.nio.file.Path;
+import java.util.function.Consumer;
 
 @AllArgsConstructor @NoArgsConstructor @ToString
 public class EdgeCrawlPlan {
@@ -38,4 +45,22 @@ public class EdgeCrawlPlan {
         String sp2 = fileName.substring(2, 4);
         return process.getDir().resolve(sp1).resolve(sp2).resolve(fileName);
     }
+
+    /**
+     * Passes every crawled domain referenced by the crawl work log to the consumer.
+     * An IOException raised while reading a domain is rethrown as a RuntimeException.
+     *
+     * @param consumer receives each domain after it has been read
+     */
+    public void forEachCrawledDomain(Consumer<CrawledDomain> consumer) {
+        final CrawledDomainReader reader = new CrawledDomainReader();
+
+        WorkLog.readLog(crawl.getLogFile(), entry -> {
+            try {
+                consumer.accept(reader.read(getCrawledFilePath(entry.path())));
+            } catch (IOException e) {
+                throw new RuntimeException(e);
+            }
+        });
+    }
 }
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/RecipeDetectorTool.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/RecipeDetectorTool.java
new file mode 100644
index 00000000..480e85b8
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/RecipeDetectorTool.java
@@ -0,0 +1,97 @@
+package nu.marginalia.wmsa.edge.tools;
+
+import nu.marginalia.util.language.conf.LanguageModels;
+import nu.marginalia.util.language.processing.SentenceExtractor;
+import nu.marginalia.util.language.processing.model.DocumentLanguageData;
+import nu.marginalia.wmsa.configuration.WmsaHome;
+import nu.marginalia.wmsa.configuration.module.DatabaseModule;
+import nu.marginalia.wmsa.edge.converting.processor.logic.RecipeDetector;
+import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
+import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
+import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
+import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.sql.SQLException;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.TimeUnit;
+
+import static nu.marginalia.wmsa.edge.converting.processor.DocumentProcessor.isAcceptedContentType;
+
+/**
+ * Command line tool that runs {@link RecipeDetector} over the documents of a
+ * crawl plan and prints one {@code score<TAB>url} line for every document
+ * whose score, multiplied by 100, exceeds 50.
+ */
+public class RecipeDetectorTool {
+    private static final RecipeDetector detector = new RecipeDetector();
+    private static final LanguageModels lm = WmsaHome.getLanguageModels();
+    private static final SentenceExtractor sentenceExtractor = new SentenceExtractor(lm);
+
+    /** URLs with a title in EC_URL_VIEW; documents with other URLs are skipped. */
+    private static final Set<String> urls = new HashSet<>(50_000_000);
+
+    /**
+     * Loads the URL set from the database, then scores every crawled domain of
+     * the plan on a pool of 16 worker threads and waits for the pool to drain.
+     *
+     * @param args {@code args[0]} is the path of the crawl plan to load
+     * @throws IOException declared by the original signature; presumably raised
+     *                     when the crawl plan cannot be loaded
+     */
+    public static void main(String... args) throws IOException {
+        EdgeCrawlPlan plan = new CrawlPlanLoader().load(Path.of(args[0]));
+        DatabaseModule module = new DatabaseModule();
+
+        try (var ds = module.provideConnection();
+             var conn = ds.getConnection();
+             var stmt = conn.createStatement()) {
+            var rsp = stmt.executeQuery("SELECT URL FROM EC_URL_VIEW WHERE TITLE IS NOT NULL");
+            while (rsp.next()) {
+                urls.add(rsp.getString(1));
+            }
+        }
+        catch (SQLException ex) {
+            // With an empty URL set every document would be skipped silently.
+            throw new RuntimeException(ex);
+        }
+
+        ForkJoinPool pool = new ForkJoinPool(16);
+        plan.forEachCrawledDomain(data -> pool.execute(() -> processDomain(data)));
+
+        while (!pool.awaitQuiescence(1, TimeUnit.HOURS));
+    }
+
+    /** Scores the documents of a domain that are known URLs with an accepted content type and status "OK". */
+    private static void processDomain(CrawledDomain domain) {
+        if (domain.doc == null) return;
+        for (var doc : domain.doc) {
+            if (!urls.contains(doc.url))
+                continue;
+
+            if (isAcceptedContentType(doc) && "OK".equals(doc.crawlerStatus)) {
+                processDocument(doc);
+            }
+        }
+    }
+
+
+    /** Scores one document with its {@code a} and {@code nav} elements removed, and prints it when it passes the threshold. */
+    private static void processDocument(CrawledDocument doc) {
+        Document parsedDocument = Jsoup.parse(doc.documentBody);
+
+        parsedDocument.getElementsByTag("a").remove();
+        parsedDocument.getElementsByTag("nav").remove();
+
+        DocumentLanguageData dld = sentenceExtractor.extractSentences(parsedDocument);
+        double prob = 100*detector.recipeP(dld);
+        if (prob > 50) {
+            System.out.printf("%3.2f\t%s\n", prob, doc.url);
+        }
+    }
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/RecipesLoaderTool.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/RecipesLoaderTool.java
new file mode 100644
index 00000000..96fc9e47
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/RecipesLoaderTool.java
@@ -0,0 +1,104 @@
+package nu.marginalia.wmsa.edge.tools;
+
+import com.zaxxer.hikari.HikariDataSource;
+import nu.marginalia.wmsa.configuration.module.DatabaseModule;
+import nu.marginalia.wmsa.configuration.server.Context;
+import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient;
+import nu.marginalia.wmsa.edge.index.model.IndexBlock;
+import nu.marginalia.wmsa.edge.model.EdgeId;
+import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
+import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+
+import static nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature.CATEGORY_FOOD;
+
+/**
+ * Command line tool that marks a list of URLs as food related: it sets the
+ * CATEGORY_FOOD feature bit in EC_PAGE_DATA and submits the feature's keyword
+ * to the index for each listed URL that is found in EC_URL_VIEW.
+ */
+public class RecipesLoaderTool {
+    /**
+     * Tags every URL listed in the input file that is known to the database.
+     *
+     * @param args {@code args[0]} is a file with one URL per line; a line may
+     *             also be {@code score<TAB>url} as printed by RecipeDetectorTool
+     */
+    public static void main(String... args) {
+
+        try (EdgeIndexClient client = new EdgeIndexClient();
+             HikariDataSource ds = new DatabaseModule().provideConnection();
+             Connection conn = ds.getConnection();
+             PreparedStatement ps = conn.prepareStatement("UPDATE EC_PAGE_DATA SET FEATURES = FEATURES | ? WHERE ID=?");
+             var linesStream = Files.lines(Path.of(args[0]))) {
+
+            var urls = getUrls(ds);
+            var wordSet = new EdgePageWordSet(new EdgePageWords(IndexBlock.Meta, List.of(CATEGORY_FOOD.getKeyword())));
+            linesStream
+                    // keep only the text after the last tab, i.e. the URL column
+                    .map(line -> line.substring(line.lastIndexOf('\t') + 1))
+                    .map(urls::get)
+                    .filter(Objects::nonNull)
+                    .forEach(id -> {
+                        // getUrls packs the domain id into the high and the url id into the low 32 bits
+                        int urlId = (int)(id & 0xFFFF_FFFFL);
+                        int domainId = (int)(id >>> 32L);
+
+                        try {
+                            ps.setInt(2, urlId);
+                            ps.setInt(1, CATEGORY_FOOD.getFeatureBit());
+                            ps.executeUpdate();
+                        }
+                        catch (SQLException ex) {
+                            throw new RuntimeException(ex);
+                        }
+
+                        client.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), -5, wordSet, 0)
+                                .blockingSubscribe();
+                    });
+
+        } catch (IOException | SQLException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /**
+     * Reads all URLs that have a title from EC_URL_VIEW.
+     *
+     * @return map from URL to {@code (domainId << 32) | urlId}
+     */
+    private static Map<String, Long> getUrls(HikariDataSource ds) {
+
+        Map<String, Long> urls = new HashMap<>(100_000);
+
+        try (var conn = ds.getConnection();
+             var stmt = conn.createStatement())
+        {
+            var rsp = stmt.executeQuery("SELECT URL, ID, DOMAIN_ID FROM EC_URL_VIEW WHERE TITLE IS NOT NULL");
+
+            while (rsp.next()) {
+                long val = rsp.getInt(3);
+                // zero-extend the url id so a negative int cannot overwrite the domain id bits
+                val = (val << 32L) | Integer.toUnsignedLong(rsp.getInt(2));
+
+                urls.put(rsp.getString(1), val);
+            }
+
+        }
+        catch (SQLException ex) {
+            throw new RuntimeException(ex);
+        }
+
+        return urls;
+    }
+}