Refactoring converting-process

This commit is contained in:
Viktor Lofgren 2023-03-06 19:32:25 +01:00
parent bce452fb4f
commit 43f3380cb9
15 changed files with 23 additions and 91 deletions

View File

@ -17,8 +17,6 @@ public class ProcessedDocument {
public EdgeUrlState state;
public String stateReason;
public long lshHash;
/**
 * Tells whether this document's {@code state} field equals {@link EdgeUrlState#OK}.
 *
 * @return {@code true} when {@code state} is {@code EdgeUrlState.OK}, {@code false} otherwise
 *         (including when {@code state} is {@code null})
 */
public boolean isOk() {
    return state == EdgeUrlState.OK;
}

View File

@ -11,7 +11,7 @@ import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.util.StringPool;
import nu.marginalia.converting.processor.logic.InternalLinkGraph;
import nu.marginalia.converting.processor.logic.links.InternalLinkGraph;
import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator;
import java.util.*;

View File

@ -5,8 +5,8 @@ import nu.marginalia.model.crawl.EdgePageWordFlags;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.converting.processor.logic.CommonKeywordExtractor;
import nu.marginalia.converting.processor.logic.InternalLinkGraph;
import nu.marginalia.converting.processor.logic.links.CommonKeywordExtractor;
import nu.marginalia.converting.processor.logic.links.InternalLinkGraph;
import javax.inject.Singleton;
import java.util.HashMap;

View File

@ -8,6 +8,11 @@ import org.jsoup.select.NodeFilter;
import java.util.HashMap;
import java.util.Map;
/** Prune the DOM and remove noisy branches with a lot of tags and not a lot of text.
* This removes a lot of noise and keeps segments that are more or less just plain text.
* <p>
* Used with JSoup's Document.filter() method
*/
public class DomPruningFilter implements NodeFilter {
private final double pruneThreshold;

View File

@ -44,7 +44,12 @@ public class FeatureExtractor {
private final GoogleAnwersSpamDetector googleAnwersSpamDetector;
@Inject
public FeatureExtractor(AdblockSimulator adblockSimulator, RecipeDetector recipeDetector, TextileCraftDetector textileCraftDetector, WoodworkingDetector woodworkingDetector, GoogleAnwersSpamDetector googleAnwersSpamDetector) {
public FeatureExtractor(AdblockSimulator adblockSimulator,
RecipeDetector recipeDetector,
TextileCraftDetector textileCraftDetector,
WoodworkingDetector woodworkingDetector,
GoogleAnwersSpamDetector googleAnwersSpamDetector)
{
this.adblockSimulator = adblockSimulator;
this.recipeDetector = recipeDetector;
this.textileCraftDetector = textileCraftDetector;

View File

@ -19,9 +19,6 @@ public class FeedExtractor {
public Optional<EdgeUrl> getFeedFromAlternateTag(EdgeUrl crawlUrl, Element alternateTag) {
var type = alternateTag.attr("type");
if (type == null) {
return Optional.empty();
}
try {
var url = linkParser.parseLink(crawlUrl, alternateTag.attr("href"));

View File

@ -13,7 +13,6 @@ public class PlainTextLogic {
/**
 * Builds a short description from the leading lines of a plain-text document.
 * <p>
 * Lines rejected by {@code looksLikeText} are dropped, the remaining lines are joined
 * with a single space, every run of whitespace is collapsed to one space, and the
 * result is truncated to at most 255 characters.
 *
 * @param firstFewLines the leading lines of the document
 * @return the joined, whitespace-normalized and truncated description
 */
public String getDescription(List<String> firstFewLines) {
    String joinedLines = firstFewLines.stream()
            .filter(line -> looksLikeText(line))
            .collect(Collectors.joining(" "));

    String singleSpaced = joinedLines.replaceAll("\\s+", " ");

    return StringUtils.truncate(singleSpaced, 255);
}

View File

@ -1,74 +0,0 @@
package nu.marginalia.converting.processor.logic;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import java.util.HashMap;
import java.util.Map;
/**
 * Heuristic that looks for an image which appears to be part of a document's
 * main content rather than decoration, based on text/markup ratios of the
 * surrounding element and on the image's attributes.
 */
public class SalientImageDetector {

    /**
     * Scans {@code p}, {@code div}, {@code section}, {@code article}, {@code font} and
     * {@code center} elements for an {@code img} that passes all of the filters below.
     *
     * @param document the parsed document to inspect
     * @return {@code true} as soon as one qualifying image is found, {@code false} otherwise
     */
    public boolean hasSalientImage(Document document) {
        // NOTE(review): removeIf operates on the Elements list returned by this query;
        // whether it also detaches the matching anchors from the document depends on
        // the jsoup version in use -- confirm the intended effect.
        document.getElementsByTag("a").removeIf(Element::hasText);

        // How many times each image source occurs in the document.
        Map<String, Integer> srcOccurrences = new HashMap<>();
        for (var img : document.getElementsByTag("img")) {
            srcOccurrences.merge(img.attr("src"), 1, Integer::sum);
        }

        for (var container : document.select("p,div,section,article,font,center")) {
            if (isShortTextContainer(container)) {
                continue;
            }
            if (!hasLowLinkAndMarkupShare(container)) {
                continue;
            }

            for (var img : container.getElementsByTag("img")) {
                // An image source used more than once in the document is skipped.
                boolean repeatedSource = srcOccurrences.getOrDefault(img.attr("src"), 1) > 1;
                if (repeatedSource) {
                    continue;
                }

                // A small image is only skipped when it also carries a non-blank id.
                boolean smallWithId = isSmall(img) && !img.id().isBlank();
                if (smallWithId) {
                    continue;
                }

                return true;
            }
        }

        return false;
    }

    /** True for p/center/font elements whose text is shorter than 16 characters. */
    private boolean isShortTextContainer(Element elem) {
        String tag = elem.tagName();
        boolean checkedTag = "p".equals(tag) || "center".equals(tag) || "font".equals(tag);
        return checkedTag && elem.text().length() < 16;
    }

    /**
     * True when less than 10% of the element's text is anchor text and the text
     * makes up more than 85% of the element's HTML.
     */
    private boolean hasLowLinkAndMarkupShare(Element elem) {
        // Kept as a plain conjunction: the ratios can be NaN for empty elements,
        // and NaN must fail both comparisons.
        return aTagDensity(elem) < 0.1 && htmlTagDensity(elem) > 0.85;
    }

    /**
     * An image counts as small when either dimension attribute is blank, or when
     * either parses to a value below 400. Unparseable dimensions count as not small.
     */
    private boolean isSmall(Element imgTag) {
        final String widthAttr = imgTag.attr("width");
        final String heightAttr = imgTag.attr("height");

        if (widthAttr.isBlank() || heightAttr.isBlank()) {
            return true;
        }

        try {
            return Integer.parseInt(widthAttr) < 400
                    || Integer.parseInt(heightAttr) < 400;
        }
        catch (NumberFormatException ex) {
            // Non-numeric dimension (e.g. a percentage): treated as not small.
            return false;
        }
    }

    /** Ratio of the element's text length to the length of its inner HTML. */
    private double htmlTagDensity(Element elem) {
        double textLength = elem.text().length();
        return textLength / elem.html().length();
    }

    /** Ratio of the text length inside descendant anchors to the element's text length. */
    private double aTagDensity(Element elem) {
        double anchorTextLength = elem.getElementsByTag("a").text().length();
        return anchorTextLength / elem.text().length();
    }
}

View File

@ -1,4 +1,4 @@
package nu.marginalia.converting.processor.logic;
package nu.marginalia.converting.processor.logic.links;
import ca.rmen.porterstemmer.PorterStemmer;
import nu.marginalia.model.crawl.EdgePageWordFlags;

View File

@ -1,4 +1,4 @@
package nu.marginalia.converting.processor.logic;
package nu.marginalia.converting.processor.logic.links;
import nu.marginalia.model.crawl.EdgePageWordFlags;
import nu.marginalia.converting.model.ProcessedDocument;

View File

@ -1,4 +1,4 @@
package nu.marginalia.converting.processor.logic;
package nu.marginalia.converting.processor.logic.links;
import nu.marginalia.converting.model.ProcessedDocumentDetails;
import nu.marginalia.crawling.common.blocklist.UrlBlocklist;

View File

@ -1,4 +1,4 @@
package nu.marginalia.converting.processor.logic;
package nu.marginalia.converting.processor.logic.summary;
import com.google.common.base.Strings;
import org.apache.commons.lang3.StringUtils;

View File

@ -1,4 +1,4 @@
package nu.marginalia.converting.processor.logic;
package nu.marginalia.converting.processor.logic.summary;
import com.google.inject.Inject;
import com.google.inject.name.Named;

View File

@ -2,6 +2,8 @@ package nu.marginalia.converting.processor.plugin;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import nu.marginalia.converting.processor.logic.links.LinkProcessor;
import nu.marginalia.converting.processor.logic.summary.SummaryExtractor;
import nu.marginalia.crawling.common.link.LinkParser;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;

View File

@ -1,8 +1,8 @@
package nu.marginalia.converting.logic;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.logic.SummaryExtractionFilter;
import nu.marginalia.converting.processor.logic.SummaryExtractor;
import nu.marginalia.converting.processor.logic.summary.SummaryExtractionFilter;
import nu.marginalia.converting.processor.logic.summary.SummaryExtractor;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;