diff --git a/crawl/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDocument.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDocument.java
index b5fa8735..4b0fbb36 100644
--- a/crawl/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDocument.java
+++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDocument.java
@@ -17,8 +17,6 @@ public class ProcessedDocument {
public EdgeUrlState state;
public String stateReason;
- public long lshHash;
-
public boolean isOk() {
return EdgeUrlState.OK == state;
}
diff --git a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java
index 005a1efc..c25d3440 100644
--- a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java
+++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java
@@ -11,7 +11,7 @@ import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.util.StringPool;
-import nu.marginalia.converting.processor.logic.InternalLinkGraph;
+import nu.marginalia.converting.processor.logic.links.InternalLinkGraph;
import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator;
import java.util.*;
diff --git a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/SiteWords.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/SiteWords.java
index 2fd75cba..6c16c068 100644
--- a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/SiteWords.java
+++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/SiteWords.java
@@ -5,8 +5,8 @@ import nu.marginalia.model.crawl.EdgePageWordFlags;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.converting.processor.logic.CommonKeywordExtractor;
-import nu.marginalia.converting.processor.logic.InternalLinkGraph;
+import nu.marginalia.converting.processor.logic.links.CommonKeywordExtractor;
+import nu.marginalia.converting.processor.logic.links.InternalLinkGraph;
import javax.inject.Singleton;
import java.util.HashMap;
diff --git a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DomPruningFilter.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DomPruningFilter.java
index f8c0b65d..b8c5b056 100644
--- a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DomPruningFilter.java
+++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DomPruningFilter.java
@@ -8,6 +8,11 @@ import org.jsoup.select.NodeFilter;
import java.util.HashMap;
import java.util.Map;
+/** Prunes the DOM by removing noisy branches that contain many tags but little text.
+ * This discards much of the noise and keeps the segments that are more or less plain text.
+ *
+ * Intended for use with JSoup's Document.filter() method.
+ */
public class DomPruningFilter implements NodeFilter {
private final double pruneThreshold;
diff --git a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java
index c0e1bc91..0ff35164 100644
--- a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java
+++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java
@@ -44,7 +44,12 @@ public class FeatureExtractor {
private final GoogleAnwersSpamDetector googleAnwersSpamDetector;
@Inject
- public FeatureExtractor(AdblockSimulator adblockSimulator, RecipeDetector recipeDetector, TextileCraftDetector textileCraftDetector, WoodworkingDetector woodworkingDetector, GoogleAnwersSpamDetector googleAnwersSpamDetector) {
+ public FeatureExtractor(AdblockSimulator adblockSimulator,
+ RecipeDetector recipeDetector,
+ TextileCraftDetector textileCraftDetector,
+ WoodworkingDetector woodworkingDetector,
+ GoogleAnwersSpamDetector googleAnwersSpamDetector)
+ {
this.adblockSimulator = adblockSimulator;
this.recipeDetector = recipeDetector;
this.textileCraftDetector = textileCraftDetector;
diff --git a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeedExtractor.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeedExtractor.java
index c20a9878..2a5bbada 100644
--- a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeedExtractor.java
+++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeedExtractor.java
@@ -19,9 +19,6 @@ public class FeedExtractor {
public Optional getFeedFromAlternateTag(EdgeUrl crawlUrl, Element alternateTag) {
var type = alternateTag.attr("type");
- if (type == null) {
- return Optional.empty();
- }
try {
var url = linkParser.parseLink(crawlUrl, alternateTag.attr("href"));
diff --git a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/PlainTextLogic.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/PlainTextLogic.java
index aefa5710..42bd47e9 100644
--- a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/PlainTextLogic.java
+++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/PlainTextLogic.java
@@ -13,7 +13,6 @@ public class PlainTextLogic {
public String getDescription(List firstFewLines) {
return StringUtils.truncate(firstFewLines.stream().filter(this::looksLikeText)
.collect(Collectors.joining(" ")).replaceAll("\\s+", " ")
-
, 255);
}
diff --git a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/SalientImageDetector.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/SalientImageDetector.java
deleted file mode 100644
index 4a3baabd..00000000
--- a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/SalientImageDetector.java
+++ /dev/null
@@ -1,74 +0,0 @@
-package nu.marginalia.converting.processor.logic;
-
-import org.jsoup.nodes.Document;
-import org.jsoup.nodes.Element;
-
-import java.util.HashMap;
-import java.util.Map;
-
-public class SalientImageDetector {
-
- public boolean hasSalientImage(Document document) {
- document.getElementsByTag("a").removeIf(Element::hasText);
-
- Map counts = new HashMap<>();
- for (var elem : document.getElementsByTag("img")) {
- counts.merge(elem.attr("src"), 1, Integer::sum);
- }
- for (var elem : document.select("p,div,section,article,font,center")) {
-
- String tagName = elem.tagName();
- if (("p".equals(tagName) || "center".equals(tagName) || "font".equals(tagName))
- && elem.text().length() < 16)
- {
- continue;
- }
-
- if (aTagDensity(elem) < 0.1 && htmlTagDensity(elem) > 0.85) {
- for (var imgTag : elem.getElementsByTag("img")) {
- if (counts.getOrDefault(imgTag.attr("src"), 1) > 1) {
- continue;
- }
-
- if (isSmall(imgTag)) {
- if (!imgTag.id().isBlank()) {
- continue;
- }
- }
-
- return true;
- }
- }
- }
-
- return false;
-
- }
-
- private boolean isSmall(Element imgTag) {
- final String width = imgTag.attr("width");
- final String height = imgTag.attr("height");
-
- if (width.isBlank() || height.isBlank())
- return true;
-
- try {
- if (Integer.parseInt(width) < 400)
- return true;
- if (Integer.parseInt(height) < 400)
- return true;
- }
- catch (NumberFormatException ex) { /* no-op */ }
-
- return false;
- }
-
- private double htmlTagDensity(Element elem) {
- return (double) elem.text().length() / elem.html().length();
- }
-
- private double aTagDensity(Element elem) {
- return (double) elem.getElementsByTag("a").text().length() / elem.text().length();
- }
-
-}
diff --git a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/CommonKeywordExtractor.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/links/CommonKeywordExtractor.java
similarity index 97%
rename from crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/CommonKeywordExtractor.java
rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/links/CommonKeywordExtractor.java
index eb7d39e7..3172252d 100644
--- a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/CommonKeywordExtractor.java
+++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/links/CommonKeywordExtractor.java
@@ -1,4 +1,4 @@
-package nu.marginalia.converting.processor.logic;
+package nu.marginalia.converting.processor.logic.links;
import ca.rmen.porterstemmer.PorterStemmer;
import nu.marginalia.model.crawl.EdgePageWordFlags;
diff --git a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/InternalLinkGraph.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/links/InternalLinkGraph.java
similarity index 97%
rename from crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/InternalLinkGraph.java
rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/links/InternalLinkGraph.java
index 1f69cf31..4933e074 100644
--- a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/InternalLinkGraph.java
+++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/links/InternalLinkGraph.java
@@ -1,4 +1,4 @@
-package nu.marginalia.converting.processor.logic;
+package nu.marginalia.converting.processor.logic.links;
import nu.marginalia.model.crawl.EdgePageWordFlags;
import nu.marginalia.converting.model.ProcessedDocument;
diff --git a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/LinkProcessor.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/links/LinkProcessor.java
similarity index 97%
rename from crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/LinkProcessor.java
rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/links/LinkProcessor.java
index 68b212de..65bdd4a4 100644
--- a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/LinkProcessor.java
+++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/links/LinkProcessor.java
@@ -1,4 +1,4 @@
-package nu.marginalia.converting.processor.logic;
+package nu.marginalia.converting.processor.logic.links;
import nu.marginalia.converting.model.ProcessedDocumentDetails;
import nu.marginalia.crawling.common.blocklist.UrlBlocklist;
diff --git a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/SummaryExtractionFilter.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/summary/SummaryExtractionFilter.java
similarity index 99%
rename from crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/SummaryExtractionFilter.java
rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/summary/SummaryExtractionFilter.java
index 942aec70..312a22a6 100644
--- a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/SummaryExtractionFilter.java
+++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/summary/SummaryExtractionFilter.java
@@ -1,4 +1,4 @@
-package nu.marginalia.converting.processor.logic;
+package nu.marginalia.converting.processor.logic.summary;
import com.google.common.base.Strings;
import org.apache.commons.lang3.StringUtils;
diff --git a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/SummaryExtractor.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/summary/SummaryExtractor.java
similarity index 98%
rename from crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/SummaryExtractor.java
rename to crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/summary/SummaryExtractor.java
index ce37df0a..bf803aee 100644
--- a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/SummaryExtractor.java
+++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/summary/SummaryExtractor.java
@@ -1,4 +1,4 @@
-package nu.marginalia.converting.processor.logic;
+package nu.marginalia.converting.processor.logic.summary;
import com.google.inject.Inject;
import com.google.inject.name.Named;
diff --git a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java
index 66e865d6..15001148 100644
--- a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java
+++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java
@@ -2,6 +2,8 @@ package nu.marginalia.converting.processor.plugin;
import com.google.inject.Inject;
import com.google.inject.name.Named;
+import nu.marginalia.converting.processor.logic.links.LinkProcessor;
+import nu.marginalia.converting.processor.logic.summary.SummaryExtractor;
import nu.marginalia.crawling.common.link.LinkParser;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
diff --git a/crawl/converting-process/src/test/java/nu/marginalia/converting/logic/SummaryExtractorTest.java b/crawl/converting-process/src/test/java/nu/marginalia/converting/logic/SummaryExtractorTest.java
index 15a2d377..024f9f83 100644
--- a/crawl/converting-process/src/test/java/nu/marginalia/converting/logic/SummaryExtractorTest.java
+++ b/crawl/converting-process/src/test/java/nu/marginalia/converting/logic/SummaryExtractorTest.java
@@ -1,8 +1,8 @@
package nu.marginalia.converting.logic;
import nu.marginalia.WmsaHome;
-import nu.marginalia.converting.processor.logic.SummaryExtractionFilter;
-import nu.marginalia.converting.processor.logic.SummaryExtractor;
+import nu.marginalia.converting.processor.logic.summary.SummaryExtractionFilter;
+import nu.marginalia.converting.processor.logic.summary.SummaryExtractor;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;