diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentLanguageData.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentLanguageData.java
index 01d2cd58..7d829fd6 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentLanguageData.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentLanguageData.java
@@ -6,6 +6,9 @@ import lombok.AllArgsConstructor;
 import java.util.Arrays;
 import java.util.stream.Stream;
 
+/**
+ * @see nu.marginalia.util.language.processing.SentenceExtractor
+ */
 @AllArgsConstructor
 public class DocumentLanguageData {
     public final DocumentSentence[] sentences;
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java
index f752ef80..9107f62c 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java
@@ -112,7 +112,7 @@ public class DocumentProcessor {
         return ret;
     }
 
-    private boolean isAcceptedContentType(CrawledDocument crawledDocument) {
+    public static boolean isAcceptedContentType(CrawledDocument crawledDocument) {
         if (crawledDocument.contentType == null) {
             return false;
         }
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java
index 032315dd..c7ef9fd8 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java
@@ -7,7 +7,9 @@ public enum HtmlFeature {
     JS("special:scripts"),
     AFFILIATE_LINK( "special:affiliate"),
     TRACKING("special:tracking"),
-    COOKIES("special:cookies")
+    COOKIES("special:cookies"),
+
+    CATEGORY_FOOD("category:food"),
     ;
 
     private final String keyword;
@@ -31,4 +33,9 @@ public enum HtmlFeature {
     public static boolean hasFeature(int value, HtmlFeature feature) {
         return (value & (1<< feature.ordinal())) != 0;
     }
+
+    /** Returns this feature's bit mask, {@code 1 << ordinal()}, the bit tested by {@code hasFeature}. */
+    public int getFeatureBit() {
+        return (1<< ordinal());
+    }
 }
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/RecipeDetector.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/RecipeDetector.java
new file mode 100644
index 00000000..4b77cba2
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/RecipeDetector.java
@@ -0,0 +1,254 @@
+package nu.marginalia.wmsa.edge.converting.processor.logic;
+
+import ca.rmen.porterstemmer.PorterStemmer;
+import nu.marginalia.util.language.processing.model.DocumentLanguageData;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import static java.lang.Math.max;
+import static java.lang.Math.sqrt;
+
+/**
+ * Keyword-based heuristic for spotting recipe-like documents.
+ * <p>
+ * Every known term is stored under its Porter stem together with a weight:
+ * cooking vocabulary weighs positive, terms such as "menu" or "checkout" negative.
+ *
+ * @see #recipeP(DocumentLanguageData)
+ */
+public class RecipeDetector {
+    private static final int AVG_RECIPE_LENGTH = 250;
+
+    private final Map<String, Double> termValues = new HashMap<>();
+
+    public RecipeDetector() {
+        PorterStemmer ps = new PorterStemmer();
+
+        // these terms appear in most recipes
+        termValues.put(ps.stemWord("ingredients"), 0.3);
+        termValues.put(ps.stemWord("recipe"), 0.1);
+        termValues.put(ps.stemWord("preparations"), 0.1);
+        termValues.put(ps.stemWord("instructions"), 0.1);
+
+        // penalize restaurant menus
+        termValues.put(ps.stemWord("menu"), -0.5);
+
+        // "error has no rights" (Latin maxim): penalize terms that point away from recipes
+        termValues.put(ps.stemWord("email"), -0.15);
+        termValues.put(ps.stemWord("checkout"), -0.15);
+        termValues.put(ps.stemWord("reviews"), -0.15);
+        termValues.put(ps.stemWord("newsletter"), -0.15);
+
+        // measures
+        termValues.put(ps.stemWord("dl"), 0.05);
+        termValues.put(ps.stemWord("l"), 0.05);
+        termValues.put(ps.stemWord("g"), 0.05);
+        termValues.put(ps.stemWord("ml"), 0.05);
+        termValues.put(ps.stemWord("tsp"), 0.05);
+        termValues.put(ps.stemWord("teaspoons"), 0.05);
+        termValues.put(ps.stemWord("tbsp"), 0.05);
+        termValues.put(ps.stemWord("tablespoons"), 0.05);
+        termValues.put(ps.stemWord("cups"), 0.05);
+        termValues.put(ps.stemWord("quarts"), 0.05);
+        termValues.put(ps.stemWord("pints"), 0.05);
+
+        // techniques
+        termValues.put(ps.stemWord("grate"), 0.05);
+        termValues.put(ps.stemWord("cut"), 0.05);
+        termValues.put(ps.stemWord("peel"), 0.05);
+        termValues.put(ps.stemWord("chop"), 0.05);
+        termValues.put(ps.stemWord("slice"), 0.05);
+        termValues.put(ps.stemWord("debone"), 0.05);
+        termValues.put(ps.stemWord("julienne"), 0.05);
+        termValues.put(ps.stemWord("saute"), 0.05);
+        termValues.put(ps.stemWord("fry"), 0.05);
+        termValues.put(ps.stemWord("boil"), 0.05);
+        termValues.put(ps.stemWord("parboil"), 0.05);
+        termValues.put(ps.stemWord("roast"), 0.05);
+        termValues.put(ps.stemWord("grill"), 0.05);
+        termValues.put(ps.stemWord("sear"), 0.05);
+        termValues.put(ps.stemWord("heat"), 0.05);
+        termValues.put(ps.stemWord("dice"), 0.05);
+        termValues.put(ps.stemWord("bake"), 0.05);
+        termValues.put(ps.stemWord("strain"), 0.05);
+        termValues.put(ps.stemWord("melt"), 0.05);
+        termValues.put(ps.stemWord("garnish"), 0.05);
+        termValues.put(ps.stemWord("preheat"), 0.05);
+        termValues.put(ps.stemWord("sprinkle"), 0.05);
+        termValues.put(ps.stemWord("spritz"), 0.05);
+
+        // utensils
+        termValues.put(ps.stemWord("colander"), 0.05);
+        termValues.put(ps.stemWord("pot"), 0.05);
+        termValues.put(ps.stemWord("pan"), 0.05);
+        termValues.put(ps.stemWord("oven"), 0.05);
+        termValues.put(ps.stemWord("stove"), 0.05);
+        termValues.put(ps.stemWord("skillet"), 0.05);
+        termValues.put(ps.stemWord("wok"), 0.05);
+        termValues.put(ps.stemWord("knife"), 0.05);
+        termValues.put(ps.stemWord("grater"), 0.05);
+
+        // baking
+        termValues.put(ps.stemWord("yeast"), 0.025);
+        termValues.put(ps.stemWord("sourdough"), 0.025);
+        termValues.put(ps.stemWord("flour"), 0.025);
+        termValues.put(ps.stemWord("sugar"), 0.025);
+        termValues.put(ps.stemWord("rye"), 0.025);
+        termValues.put(ps.stemWord("wheat"), 0.025);
+        termValues.put(ps.stemWord("dough"), 0.025);
+        termValues.put(ps.stemWord("rise"), 0.025);
+
+        // vegetables
+        termValues.put(ps.stemWord("lettuce"), 0.025);
+        termValues.put(ps.stemWord("onions"), 0.025);
+        termValues.put(ps.stemWord("parsnips"), 0.025);
+        termValues.put(ps.stemWord("beets"), 0.025);
+        termValues.put(ps.stemWord("carrots"), 0.025);
+        termValues.put(ps.stemWord("chilies"), 0.025);
+        termValues.put(ps.stemWord("peppers"), 0.025);
+        termValues.put(ps.stemWord("chives"), 0.025);
+        termValues.put(ps.stemWord("tomatoes"), 0.025);
+        termValues.put(ps.stemWord("salad"), 0.025);
+        termValues.put(ps.stemWord("leeks"), 0.025);
+        termValues.put(ps.stemWord("shallots"), 0.025);
+        termValues.put(ps.stemWord("avocado"), 0.025);
+        termValues.put(ps.stemWord("asparagus"), 0.025);
+        termValues.put(ps.stemWord("cucumbers"), 0.025);
+        termValues.put(ps.stemWord("eggplants"), 0.025);
+        termValues.put(ps.stemWord("broccoli"), 0.025);
+        termValues.put(ps.stemWord("kale"), 0.05);
+
+        termValues.put(ps.stemWord("jalapeno"), 0.025);
+        termValues.put(ps.stemWord("habanero"), 0.025);
+
+        termValues.put(ps.stemWord("mushrooms"), 0.025);
+        termValues.put(ps.stemWord("shiitake"), 0.025);
+        termValues.put(ps.stemWord("chanterelles"), 0.025);
+
+        // protein
+        termValues.put(ps.stemWord("meat"), 0.025);
+        termValues.put(ps.stemWord("beef"), 0.025);
+        termValues.put(ps.stemWord("chicken"), 0.025);
+        termValues.put(ps.stemWord("turkey"), 0.025);
+        termValues.put(ps.stemWord("cheese"), 0.025);
+        termValues.put(ps.stemWord("pork"), 0.025);
+        termValues.put(ps.stemWord("tofu"), 0.025);
+        termValues.put(ps.stemWord("salmon"), 0.025);
+        termValues.put(ps.stemWord("cod"), 0.025);
+        termValues.put(ps.stemWord("veal"), 0.025);
+        termValues.put(ps.stemWord("eggs"), 0.025);
+        termValues.put(ps.stemWord("lentils"), 0.025);
+        termValues.put(ps.stemWord("chickpeas"), 0.025);
+
+        // carbs
+        termValues.put(ps.stemWord("rice"), 0.025);
+        termValues.put(ps.stemWord("noodles"), 0.025);
+        termValues.put(ps.stemWord("beans"), 0.025);
+        termValues.put(ps.stemWord("ramen"), 0.025);
+
+        // japan
+        termValues.put(ps.stemWord("miso"), 0.025);
+        termValues.put(ps.stemWord("natto"), 0.025);
+        termValues.put(ps.stemWord("udon"), 0.025);
+        termValues.put(ps.stemWord("soba"), 0.025);
+        termValues.put(ps.stemWord("shichimi"), 0.025);
+        termValues.put(ps.stemWord("nori"), 0.025);
+
+        // korea
+        termValues.put(ps.stemWord("kimchi"), 0.025);
+
+        // fat of the land
+        termValues.put(ps.stemWord("salt"), 0.025);
+        termValues.put(ps.stemWord("oil"), 0.025);
+        termValues.put(ps.stemWord("olive"), 0.025);
+        termValues.put(ps.stemWord("feta"), 0.025);
+        termValues.put(ps.stemWord("parmesan"), 0.025);
+        termValues.put(ps.stemWord("mozzarella"), 0.025);
+        termValues.put(ps.stemWord("gouda"), 0.025);
+        termValues.put(ps.stemWord("cheese"), 0.025);
+        termValues.put(ps.stemWord("mayonnaise"), 0.025);
+        termValues.put(ps.stemWord("butter"), 0.025);
+
+        // spices and sauces
+        termValues.put(ps.stemWord("pepper"), 0.025);
+        termValues.put(ps.stemWord("garlic"), 0.025);
+        termValues.put(ps.stemWord("sriracha"), 0.025);
+        termValues.put(ps.stemWord("sambal"), 0.025);
+        termValues.put(ps.stemWord("soy"), 0.025);
+        termValues.put(ps.stemWord("cumin"), 0.025);
+        termValues.put(ps.stemWord("thyme"), 0.025);
+        termValues.put(ps.stemWord("basil"), 0.025);
+        termValues.put(ps.stemWord("oregano"), 0.025);
+        termValues.put(ps.stemWord("cilantro"), 0.025);
+        termValues.put(ps.stemWord("ginger"), 0.025);
+        termValues.put(ps.stemWord("curry"), 0.025);
+
+        termValues.put(ps.stemWord("water"), 0.025);
+
+        // dessert
+        termValues.put(ps.stemWord("lemons"), 0.025);
+        termValues.put(ps.stemWord("melons"), 0.025);
+        termValues.put(ps.stemWord("cherries"), 0.025);
+        termValues.put(ps.stemWord("apples"), 0.025);
+        termValues.put(ps.stemWord("pears"), 0.025);
+
+        termValues.put(ps.stemWord("chocolate"), 0.025);
+        termValues.put(ps.stemWord("vanilla"), 0.025);
+
+        // dairy
+        termValues.put(ps.stemWord("milk"), 0.025);
+        termValues.put(ps.stemWord("creamer"), 0.025);
+        termValues.put(ps.stemWord("quark"), 0.025);
+        termValues.put(ps.stemWord("cream"), 0.025);
+
+
+        // dishes
+        termValues.put(ps.stemWord("cake"), 0.025);
+        termValues.put(ps.stemWord("pie"), 0.025);
+        termValues.put(ps.stemWord("crust"), 0.025);
+        termValues.put(ps.stemWord("bread"), 0.025);
+        termValues.put(ps.stemWord("omelet"), 0.025);
+        termValues.put(ps.stemWord("soup"), 0.025);
+
+    }
+
+    /**
+     * Scores how recipe-like a document is.
+     * <p>
+     * Every distinct stemmed word with a known weight contributes that weight once,
+     * regardless of how often it occurs. The sum is multiplied by
+     * {@code sqrt(AVG_RECIPE_LENGTH) / sqrt(max(AVG_RECIPE_LENGTH, wordCount))}, which
+     * damps the score of documents longer than {@code AVG_RECIPE_LENGTH} words.
+     *
+     * @param dld the extracted sentences of the document
+     * @return the weighted score, or 0 for a document without words; despite the
+     *         method name the value is not clamped to [0, 1] and may be negative
+     */
+    public double recipeP(DocumentLanguageData dld) {
+
+        Map<String, Double> values = new HashMap<>();
+        int count = 0;
+        for (var sentence : dld.sentences) {
+
+            for (var word : sentence) {
+                count++;
+
+                final String stemmed = word.stemmed();
+                final Double value = termValues.get(stemmed);
+
+                if (value != null) {
+                    values.put(stemmed, value);
+                }
+            }
+
+        }
+
+        if (count == 0) return 0.;
+
+        double lengthPenalty = sqrt(AVG_RECIPE_LENGTH)/sqrt(max(AVG_RECIPE_LENGTH, count));
+
+        return values.values().stream().mapToDouble(Double::doubleValue).sum() * lengthPenalty;
+    }
+
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java
index b3b4d45e..869c6f5b 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java
@@ -26,6 +26,8 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.StandardCopyOption;
 import java.util.EnumMap;
+import java.util.HashMap;
+import java.util.Map;
 import java.util.concurrent.Callable;
 
 import static nu.marginalia.wmsa.edge.index.EdgeIndexService.DYNAMIC_BUCKET_LENGTH;
@@ -39,7 +41,7 @@ public class IndexServicesFactory {
 
     private final PartitionedDataFile writerIndexFile;
     private final RootDataFile keywordLexiconFile;
-    private final PartitionedDataFile preconverterOutputFile;
+    private final DoublePartitionedDataFile preconverterOutputFile;
     private final DoublePartitionedDataFile indexReadWordsFile;
     private final DoublePartitionedDataFile indexReadUrlsFile;
     private final DoublePartitionedDataFile indexWriteWordsFile;
@@ -75,7 +77,7 @@ public class IndexServicesFactory {
         this.indexReadUrlsFile = new DoublePartitionedDataFile(partitionRootFast, indexReadUrlsFile);
         this.indexWriteWordsFile = new DoublePartitionedDataFile(partitionRootFast, indexWriteWordsFile);
         this.indexWriteUrlsFile = new DoublePartitionedDataFile(partitionRootFast, indexWriteUrlsFile);
-        this.preconverterOutputFile = new PartitionedDataFile(partitionRootSlowTmp, "preconverted.dat");
+        this.preconverterOutputFile = new DoublePartitionedDataFile(partitionRootSlowTmp, "preconverted.dat");
         this.partitioner = partitioner;
     }
 
@@ -101,7 +103,7 @@ public class IndexServicesFactory {
 
     public void convertIndex(int id, IndexBlock block) throws ConversionUnnecessaryException, IOException {
         var converter = new SearchIndexConverter(block, id, tmpFileDir,
-                preconverterOutputFile.get(id),
+                preconverterOutputFile.get(id, block.id),
                 indexWriteWordsFile.get(id, block.id),
                 indexWriteUrlsFile.get(id, block.id),
                 partitioner,
@@ -112,19 +114,24 @@ public class IndexServicesFactory {
 
     @SneakyThrows
     public SearchIndexPreconverter getIndexPreconverter() {
-        File[] outputFiles = new File[DYNAMIC_BUCKET_LENGTH+1];
-        for (int i = 0; i < outputFiles.length; i++) {
-            outputFiles[i] = getPreconverterOutputFile(i);
+        Map<SearchIndexPreconverter.Shard, File> shards = new HashMap<>();
+
+        for (int index = 0; index < (DYNAMIC_BUCKET_LENGTH + 1); index++) {
+            for (IndexBlock block : IndexBlock.values()) {
+                // Key by block.id, not ordinal(): the preconverter routes entries on block().id
+                shards.put(new SearchIndexPreconverter.Shard(index, block.id), getPreconverterOutputFile(index, block.id));
+            }
         }
+
         return new SearchIndexPreconverter(writerIndexFile.get(0),
-                outputFiles,
+                shards,
                 partitioner,
                 domainBlacklist
         );
     }
 
-    private File getPreconverterOutputFile(int i) {
-        return preconverterOutputFile.get(i);
+    private File getPreconverterOutputFile(int index, int block) {
+        return preconverterOutputFile.get(index, block);
     }
 
     @SneakyThrows
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPreconverter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPreconverter.java
index d096ff0e..37560b61 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPreconverter.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPreconverter.java
@@ -10,26 +10,43 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.io.File;
+import java.io.IOException;
 import java.io.RandomAccessFile;
 import java.nio.ByteBuffer;
 import java.nio.channels.FileChannel;
 import java.nio.file.Files;
+import java.util.Map;
 import java.util.Objects;
 
 public class SearchIndexPreconverter {
     private final Logger logger = LoggerFactory.getLogger(getClass());
 
+    /** Identifies one output file: a bucket paired with the id of an index block. */
+    public record Shard(int bucket, int block) {}
+
+    private record ShardOutput(Shard shard, RandomAccessFile raf, FileChannel fc) {
+        public static ShardOutput fromFile(Shard s, File f) {
+            try {
+                var v = new RandomAccessFile(f, "rw");
+                v.seek(SearchIndexJournalReader.FILE_HEADER_SIZE_BYTES);
+                return new ShardOutput(s, v, v.getChannel());
+            } catch (IOException e) {
+                throw new RuntimeException(e);
+            }
+        }
+    }
+
     @SneakyThrows
     @Inject
     public SearchIndexPreconverter(File inputFile,
-                                   File[] outputFiles,
+                                   Map<Shard, File> outputFiles,
                                    SearchIndexPartitioner partitioner,
                                    EdgeDomainBlacklist blacklist)
     {
         TIntHashSet spamDomains = blacklist.getSpamDomains();
         logger.info("Preconverting {}", inputFile);
 
-        for (File f : outputFiles) {
+        for (File f : outputFiles.values()) {
             if (f.exists()) {
                 Files.deleteIfExists(Objects.requireNonNull(f).toPath());
             }
@@ -41,15 +58,7 @@ public class SearchIndexPreconverter {
 
         logger.info("{}", indexJournalReader.fileHeader);
 
-        RandomAccessFile[] randomAccessFiles = new RandomAccessFile[outputFiles.length];
-        for (int i = 0; i < randomAccessFiles.length; i++) {
-            randomAccessFiles[i] = new RandomAccessFile(outputFiles[i], "rw");
-            randomAccessFiles[i].seek(SearchIndexJournalReader.FILE_HEADER_SIZE_BYTES);
-        }
-        FileChannel[] fileChannels = new FileChannel[outputFiles.length];
-        for (int i = 0; i < fileChannels.length; i++) {
-            fileChannels[i] = randomAccessFiles[i].getChannel();
-        }
+        ShardOutput[] outputs = outputFiles.entrySet().stream().map(entry -> ShardOutput.fromFile(entry.getKey(), entry.getValue())).toArray(ShardOutput[]::new);
 
         var lock = partitioner.getReadLock();
         try {
@@ -65,12 +74,14 @@ public class SearchIndexPreconverter {
                 buffer.clear();
                 entry.copyToBuffer(buffer);
 
-                for (int i = 0; i < randomAccessFiles.length; i++) {
-                    if (partitioner.filterUnsafe(domainId, i)) {
+                for (int i = 0; i < outputs.length; i++) {
+                    if (outputs[i].shard.block == entry.header.block().id
+                            && partitioner.filterUnsafe(domainId, outputs[i].shard.bucket))
+                    {
                         buffer.flip();
 
                         while (buffer.position() < buffer.limit())
-                            fileChannels[i].write(buffer);
+                            outputs[i].fc.write(buffer);
                     }
                 }
             }
@@ -80,14 +91,14 @@ public class SearchIndexPreconverter {
         }
 
         logger.info("Finalizing preconversion");
-        for (int i = 0; i < randomAccessFiles.length; i++) {
-            long pos = randomAccessFiles[i].getFilePointer();
-            randomAccessFiles[i].seek(0);
-            randomAccessFiles[i].writeLong(pos);
-            randomAccessFiles[i].writeLong(wordCountOriginal);
-            fileChannels[i].force(true);
-            fileChannels[i].close();
-            randomAccessFiles[i].close();
+        for (int i = 0; i < outputs.length; i++) {
+            long pos = outputs[i].raf.getFilePointer();
+            outputs[i].raf.seek(0);
+            outputs[i].raf.writeLong(pos);
+            outputs[i].raf.writeLong(wordCountOriginal);
+            outputs[i].fc.force(true);
+            outputs[i].fc.close();
+            outputs[i].raf.close();
         }
     }
 
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeCrawlPlan.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeCrawlPlan.java
index 264c1051..3515c48a 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeCrawlPlan.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeCrawlPlan.java
@@ -1,8 +1,15 @@
 package nu.marginalia.wmsa.edge.model;
 
-import lombok.*;
+import lombok.AllArgsConstructor;
+import lombok.NoArgsConstructor;
+import lombok.ToString;
+import nu.marginalia.wmsa.edge.crawling.CrawledDomainReader;
+import nu.marginalia.wmsa.edge.crawling.WorkLog;
+import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
 
+import java.io.IOException;
 import java.nio.file.Path;
+import java.util.function.Consumer;
 
 @AllArgsConstructor @NoArgsConstructor @ToString
 public class EdgeCrawlPlan {
@@ -38,4 +45,22 @@ public class EdgeCrawlPlan {
         String sp2 = fileName.substring(2, 4);
         return process.getDir().resolve(sp1).resolve(sp2).resolve(fileName);
     }
+
+    /**
+     * Passes every crawled domain referenced by the crawl log to {@code consumer}.
+     *
+     * @param consumer receives the domain read for each entry of the crawl log
+     * @throws RuntimeException wrapping the {@link IOException} of a failed read
+     */
+    public void forEachCrawledDomain(Consumer<CrawledDomain> consumer) {
+        final CrawledDomainReader reader = new CrawledDomainReader();
+
+        WorkLog.readLog(crawl.getLogFile(), entry -> {
+            try {
+                consumer.accept(reader.read(getCrawledFilePath(entry.path())));
+            } catch (IOException e) {
+                throw new RuntimeException(e);
+            }
+        });
+    }
 }
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/RecipeDetectorTool.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/RecipeDetectorTool.java
new file mode 100644
index 00000000..480e85b8
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/RecipeDetectorTool.java
@@ -0,0 +1,89 @@
+package nu.marginalia.wmsa.edge.tools;
+
+import nu.marginalia.util.language.conf.LanguageModels;
+import nu.marginalia.util.language.processing.SentenceExtractor;
+import nu.marginalia.util.language.processing.model.DocumentLanguageData;
+import nu.marginalia.wmsa.configuration.WmsaHome;
+import nu.marginalia.wmsa.configuration.module.DatabaseModule;
+import nu.marginalia.wmsa.edge.converting.processor.logic.RecipeDetector;
+import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
+import nu.marginalia.wmsa.edge.crawling.CrawledDomainReader;
+import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
+import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
+import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.sql.SQLException;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.TimeUnit;
+
+import static nu.marginalia.wmsa.edge.converting.processor.DocumentProcessor.isAcceptedContentType;
+
+/**
+ * Command line tool that runs {@link RecipeDetector} over the domains of a crawl plan
+ * and prints score and URL of every document whose score times 100 exceeds 50.
+ * <p>
+ * The first argument is the path of the crawl plan. Documents are skipped unless
+ * their URL is among those returned by the EC_URL_VIEW query in {@code main}.
+ */
+public class RecipeDetectorTool {
+    private static final CrawledDomainReader reader = new CrawledDomainReader();
+    private static final RecipeDetector detector = new RecipeDetector();
+    private static final LanguageModels lm = WmsaHome.getLanguageModels();
+    private static final SentenceExtractor sentenceExtractor = new SentenceExtractor(lm);
+
+    private static final Set<String> urls = new HashSet<>(50_000_000);
+
+    public static void main(String... args) throws IOException {
+        EdgeCrawlPlan plan = new CrawlPlanLoader().load(Path.of(args[0]));
+        DatabaseModule module = new DatabaseModule();
+
+        try (var ds = module.provideConnection();
+             var conn = ds.getConnection();
+             var stmt = conn.createStatement()) {
+            var rsp = stmt.executeQuery("SELECT URL FROM EC_URL_VIEW WHERE TITLE IS NOT NULL");
+            while (rsp.next()) {
+                urls.add(rsp.getString(1));
+            }
+        }
+        catch (SQLException ex) {
+            throw new RuntimeException(ex);
+        }
+
+        ForkJoinPool pool = new ForkJoinPool(16);
+        plan.forEachCrawledDomain(data -> pool.execute(() -> processDomain(data)));
+
+        while (!pool.awaitQuiescence(1, TimeUnit.HOURS));
+    }
+
+    private static void processDomain(CrawledDomain domain) {
+        if (domain.doc == null) return;
+        for (var doc : domain.doc) {
+            if (!urls.contains(doc.url))
+                continue;
+
+            if (isAcceptedContentType(doc) && "OK".equals(doc.crawlerStatus)) {
+                processDocument(doc);
+            }
+        }
+    }
+
+
+    private static void processDocument(CrawledDocument doc) {
+        Document parsedDocument = Jsoup.parse(doc.documentBody);
+
+        parsedDocument.getElementsByTag("a").remove();
+        parsedDocument.getElementsByTag("nav").remove();
+
+        DocumentLanguageData dld = sentenceExtractor.extractSentences(parsedDocument);
+        double prob = 100*detector.recipeP(dld);
+        if (prob > 50) {
+            System.out.printf("%3.2f\t%s\n", prob, doc.url);
+        }
+    }
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/RecipesLoaderTool.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/RecipesLoaderTool.java
new file mode 100644
index 00000000..96fc9e47
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/RecipesLoaderTool.java
@@ -0,0 +1,99 @@
+package nu.marginalia.wmsa.edge.tools;
+
+import com.zaxxer.hikari.HikariDataSource;
+import nu.marginalia.wmsa.configuration.module.DatabaseModule;
+import nu.marginalia.wmsa.configuration.server.Context;
+import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient;
+import nu.marginalia.wmsa.edge.index.model.IndexBlock;
+import nu.marginalia.wmsa.edge.model.EdgeId;
+import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
+import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+
+import static nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature.CATEGORY_FOOD;
+
+/**
+ * Command line tool that marks pages as food related.
+ * <p>
+ * The first argument is a file with one URL per line. For every URL known to
+ * EC_URL_VIEW, the {@code CATEGORY_FOOD} feature bit is set in EC_PAGE_DATA and the
+ * {@code CATEGORY_FOOD} keyword is sent to the index through {@link EdgeIndexClient}.
+ */
+public class RecipesLoaderTool {
+    public static void main(String... args) {
+
+        try (EdgeIndexClient client = new EdgeIndexClient();
+             HikariDataSource ds = new DatabaseModule().provideConnection();
+             Connection conn = ds.getConnection();
+             PreparedStatement ps = conn.prepareStatement("UPDATE EC_PAGE_DATA SET FEATURES = FEATURES | ? WHERE ID=?");
+             var linesStream = Files.lines(Path.of(args[0]))) {
+
+            var urls = getUrls(ds);
+            var wordSet = new EdgePageWordSet(new EdgePageWords(IndexBlock.Meta, List.of(CATEGORY_FOOD.getKeyword())));
+            linesStream
+                    .map(urls::get)
+                    .filter(Objects::nonNull)
+                    .forEach(id -> {
+                        int urlId = (int)(id & 0xFFFF_FFFFL);
+                        int domainId = (int)(id >>> 32L);
+
+                        try {
+                            ps.setInt(2, urlId);
+                            ps.setInt(1, CATEGORY_FOOD.getFeatureBit());
+                            ps.executeUpdate();
+                        }
+                        catch (SQLException ex) {
+                            throw new RuntimeException(ex);
+                        }
+
+                        client.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), -5, wordSet, 0)
+                                .blockingSubscribe();
+                    });
+
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        } catch (SQLException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /**
+     * Loads the lookup table used to resolve the URLs of the input file.
+     *
+     * @return map from URL to packed ids: DOMAIN_ID in the upper 32 bits,
+     *         ID in the lower 32 bits
+     */
+    private static Map<String, Long> getUrls(HikariDataSource ds) {
+
+        Map<String, Long> urls = new HashMap<>(100_000);
+
+        try (var conn = ds.getConnection();
+             var stmt = conn.createStatement())
+        {
+            var rsp = stmt.executeQuery("SELECT URL, ID, DOMAIN_ID FROM EC_URL_VIEW WHERE TITLE IS NOT NULL");
+
+            while (rsp.next()) {
+                long val = rsp.getInt(3);
+                val = (val << 32L) | (rsp.getInt(2) & 0xFFFF_FFFFL);
+
+                urls.put(rsp.getString(1), val);
+            }
+
+        }
+        catch (SQLException ex) {
+            throw new RuntimeException(ex);
+        }
+
+        return urls;
+    }
+}