From 43f3380cb9034077cf3d79a980a50b3407f96f81 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 6 Mar 2023 19:32:25 +0100 Subject: [PATCH] Refactoring converting-process --- .../converting/model/ProcessedDocument.java | 2 - .../converting/processor/DomainProcessor.java | 2 +- .../converting/processor/SiteWords.java | 4 +- .../processor/logic/DomPruningFilter.java | 5 ++ .../processor/logic/FeatureExtractor.java | 7 +- .../processor/logic/FeedExtractor.java | 3 - .../processor/logic/PlainTextLogic.java | 1 - .../processor/logic/SalientImageDetector.java | 74 ------------------- .../{ => links}/CommonKeywordExtractor.java | 2 +- .../logic/{ => links}/InternalLinkGraph.java | 2 +- .../logic/{ => links}/LinkProcessor.java | 2 +- .../SummaryExtractionFilter.java | 2 +- .../logic/{ => summary}/SummaryExtractor.java | 2 +- .../plugin/HtmlDocumentProcessorPlugin.java | 2 + .../logic/SummaryExtractorTest.java | 4 +- 15 files changed, 23 insertions(+), 91 deletions(-) delete mode 100644 crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/SalientImageDetector.java rename crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/{ => links}/CommonKeywordExtractor.java (97%) rename crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/{ => links}/InternalLinkGraph.java (97%) rename crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/{ => links}/LinkProcessor.java (97%) rename crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/{ => summary}/SummaryExtractionFilter.java (99%) rename crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/{ => summary}/SummaryExtractor.java (98%) diff --git a/crawl/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDocument.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDocument.java index b5fa8735..4b0fbb36 100644 --- a/crawl/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDocument.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDocument.java @@ -17,8 +17,6 @@ public class ProcessedDocument { public EdgeUrlState state; public String stateReason; - public long lshHash; - public boolean isOk() { return EdgeUrlState.OK == state; } diff --git a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index 005a1efc..c25d3440 100644 --- a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -11,7 +11,7 @@ import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import nu.marginalia.util.StringPool; -import nu.marginalia.converting.processor.logic.InternalLinkGraph; +import nu.marginalia.converting.processor.logic.links.InternalLinkGraph; import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator; import java.util.*; diff --git a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/SiteWords.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/SiteWords.java index 2fd75cba..6c16c068 100644 --- a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/SiteWords.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/SiteWords.java @@ -5,8 +5,8 @@ import nu.marginalia.model.crawl.EdgePageWordFlags; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.converting.processor.logic.CommonKeywordExtractor; -import nu.marginalia.converting.processor.logic.InternalLinkGraph; +import nu.marginalia.converting.processor.logic.links.CommonKeywordExtractor; +import nu.marginalia.converting.processor.logic.links.InternalLinkGraph; import javax.inject.Singleton; import java.util.HashMap; diff --git a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DomPruningFilter.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DomPruningFilter.java index f8c0b65d..b8c5b056 100644 --- a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DomPruningFilter.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DomPruningFilter.java @@ -8,6 +8,11 @@ import org.jsoup.select.NodeFilter; import java.util.HashMap; import java.util.Map; +/** Prune the DOM and remove noisy branches with a lot of tags and not a lot of text. + * This removes a lot of noise and keeps segments that are more or less just plain text. + *

+ * Used with JSoup's Document.filter() method + */ public class DomPruningFilter implements NodeFilter { private final double pruneThreshold; diff --git a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java index c0e1bc91..0ff35164 100644 --- a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java @@ -44,7 +44,12 @@ public class FeatureExtractor { private final GoogleAnwersSpamDetector googleAnwersSpamDetector; @Inject - public FeatureExtractor(AdblockSimulator adblockSimulator, RecipeDetector recipeDetector, TextileCraftDetector textileCraftDetector, WoodworkingDetector woodworkingDetector, GoogleAnwersSpamDetector googleAnwersSpamDetector) { + public FeatureExtractor(AdblockSimulator adblockSimulator, + RecipeDetector recipeDetector, + TextileCraftDetector textileCraftDetector, + WoodworkingDetector woodworkingDetector, + GoogleAnwersSpamDetector googleAnwersSpamDetector) + { this.adblockSimulator = adblockSimulator; this.recipeDetector = recipeDetector; this.textileCraftDetector = textileCraftDetector; diff --git a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeedExtractor.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeedExtractor.java index c20a9878..2a5bbada 100644 --- a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeedExtractor.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeedExtractor.java @@ -19,9 +19,6 @@ public class FeedExtractor { public Optional getFeedFromAlternateTag(EdgeUrl crawlUrl, Element alternateTag) { var type = alternateTag.attr("type"); - if (type == null) { - return Optional.empty(); - } try { var url = linkParser.parseLink(crawlUrl, alternateTag.attr("href")); diff --git a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/PlainTextLogic.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/PlainTextLogic.java index aefa5710..42bd47e9 100644 --- a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/PlainTextLogic.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/PlainTextLogic.java @@ -13,7 +13,6 @@ public class PlainTextLogic { public String getDescription(List firstFewLines) { return StringUtils.truncate(firstFewLines.stream().filter(this::looksLikeText) .collect(Collectors.joining(" ")).replaceAll("\\s+", " ") - , 255); } diff --git a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/SalientImageDetector.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/SalientImageDetector.java deleted file mode 100644 index 4a3baabd..00000000 --- a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/SalientImageDetector.java +++ /dev/null @@ -1,74 +0,0 @@ -package nu.marginalia.converting.processor.logic; - -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; - -import java.util.HashMap; -import java.util.Map; - -public class SalientImageDetector { - - public boolean hasSalientImage(Document document) { - document.getElementsByTag("a").removeIf(Element::hasText); - - Map counts = new HashMap<>(); - for (var elem : document.getElementsByTag("img")) { - counts.merge(elem.attr("src"), 1, Integer::sum); - } - for (var elem : document.select("p,div,section,article,font,center")) { - - String tagName = elem.tagName(); - if (("p".equals(tagName) || "center".equals(tagName) || "font".equals(tagName)) - && elem.text().length() < 16) - { - continue; - } - - if (aTagDensity(elem) < 0.1 && htmlTagDensity(elem) > 0.85) { - for (var imgTag : elem.getElementsByTag("img")) { - if (counts.getOrDefault(imgTag.attr("src"), 1) > 1) { - continue; - } - - if (isSmall(imgTag)) { - if (!imgTag.id().isBlank()) { - continue; - } - } - - return true; - } - } - } - - return false; - - } - - private boolean isSmall(Element imgTag) { - final String width = imgTag.attr("width"); - final String height = imgTag.attr("height"); - - if (width.isBlank() || height.isBlank()) - return true; - - try { - if (Integer.parseInt(width) < 400) - return true; - if (Integer.parseInt(height) < 400) - return true; - } - catch (NumberFormatException ex) { /* no-op */ } - - return false; - } - - private double htmlTagDensity(Element elem) { - return (double) elem.text().length() / elem.html().length(); - } - - private double aTagDensity(Element elem) { - return (double) elem.getElementsByTag("a").text().length() / elem.text().length(); - } - -} diff --git a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/CommonKeywordExtractor.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/links/CommonKeywordExtractor.java similarity index 97% rename from crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/CommonKeywordExtractor.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/links/CommonKeywordExtractor.java index eb7d39e7..3172252d 100644 --- a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/CommonKeywordExtractor.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/links/CommonKeywordExtractor.java @@ -1,4 +1,4 @@ -package nu.marginalia.converting.processor.logic; +package nu.marginalia.converting.processor.logic.links; import ca.rmen.porterstemmer.PorterStemmer; import nu.marginalia.model.crawl.EdgePageWordFlags; diff --git a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/InternalLinkGraph.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/links/InternalLinkGraph.java similarity index 97% rename from crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/InternalLinkGraph.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/links/InternalLinkGraph.java index 1f69cf31..4933e074 100644 --- a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/InternalLinkGraph.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/links/InternalLinkGraph.java @@ -1,4 +1,4 @@ -package nu.marginalia.converting.processor.logic; +package nu.marginalia.converting.processor.logic.links; import nu.marginalia.model.crawl.EdgePageWordFlags; import nu.marginalia.converting.model.ProcessedDocument; diff --git a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/LinkProcessor.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/links/LinkProcessor.java similarity index 97% rename from crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/LinkProcessor.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/links/LinkProcessor.java index 68b212de..65bdd4a4 100644 --- a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/LinkProcessor.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/links/LinkProcessor.java @@ -1,4 +1,4 @@ -package nu.marginalia.converting.processor.logic; +package nu.marginalia.converting.processor.logic.links; import nu.marginalia.converting.model.ProcessedDocumentDetails; import nu.marginalia.crawling.common.blocklist.UrlBlocklist; diff --git a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/SummaryExtractionFilter.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/summary/SummaryExtractionFilter.java similarity index 99% rename from crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/SummaryExtractionFilter.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/summary/SummaryExtractionFilter.java index 942aec70..312a22a6 100644 --- a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/SummaryExtractionFilter.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/summary/SummaryExtractionFilter.java @@ -1,4 +1,4 @@ -package nu.marginalia.converting.processor.logic; +package nu.marginalia.converting.processor.logic.summary; import com.google.common.base.Strings; import org.apache.commons.lang3.StringUtils; diff --git a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/SummaryExtractor.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/summary/SummaryExtractor.java similarity index 98% rename from crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/SummaryExtractor.java rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/summary/SummaryExtractor.java index ce37df0a..bf803aee 100644 --- a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/SummaryExtractor.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/summary/SummaryExtractor.java @@ -1,4 +1,4 @@ -package nu.marginalia.converting.processor.logic; +package nu.marginalia.converting.processor.logic.summary; import com.google.inject.Inject; import com.google.inject.name.Named; diff --git a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java index 66e865d6..15001148 100644 --- a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java @@ -2,6 +2,8 @@ package nu.marginalia.converting.processor.plugin; import com.google.inject.Inject; import com.google.inject.name.Named; +import nu.marginalia.converting.processor.logic.links.LinkProcessor; +import nu.marginalia.converting.processor.logic.summary.SummaryExtractor; import nu.marginalia.crawling.common.link.LinkParser; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDomain; diff --git a/crawl/converting-process/src/test/java/nu/marginalia/converting/logic/SummaryExtractorTest.java b/crawl/converting-process/src/test/java/nu/marginalia/converting/logic/SummaryExtractorTest.java index 15a2d377..024f9f83 100644 --- a/crawl/converting-process/src/test/java/nu/marginalia/converting/logic/SummaryExtractorTest.java +++ b/crawl/converting-process/src/test/java/nu/marginalia/converting/logic/SummaryExtractorTest.java @@ -1,8 +1,8 @@ package nu.marginalia.converting.logic; import nu.marginalia.WmsaHome; -import nu.marginalia.converting.processor.logic.SummaryExtractionFilter; -import nu.marginalia.converting.processor.logic.SummaryExtractor; +import nu.marginalia.converting.processor.logic.summary.SummaryExtractionFilter; +import nu.marginalia.converting.processor.logic.summary.SummaryExtractor; import org.jsoup.Jsoup; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeEach;