mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
Refactoring converting-process
This commit is contained in:
parent
bce452fb4f
commit
43f3380cb9
@ -17,8 +17,6 @@ public class ProcessedDocument {
|
||||
public EdgeUrlState state;
|
||||
public String stateReason;
|
||||
|
||||
public long lshHash;
|
||||
|
||||
public boolean isOk() {
|
||||
return EdgeUrlState.OK == state;
|
||||
}
|
||||
|
@ -11,7 +11,7 @@ import nu.marginalia.converting.model.ProcessedDomain;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.util.StringPool;
|
||||
import nu.marginalia.converting.processor.logic.InternalLinkGraph;
|
||||
import nu.marginalia.converting.processor.logic.links.InternalLinkGraph;
|
||||
import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator;
|
||||
|
||||
import java.util.*;
|
||||
|
@ -5,8 +5,8 @@ import nu.marginalia.model.crawl.EdgePageWordFlags;
|
||||
import nu.marginalia.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.converting.model.ProcessedDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.converting.processor.logic.CommonKeywordExtractor;
|
||||
import nu.marginalia.converting.processor.logic.InternalLinkGraph;
|
||||
import nu.marginalia.converting.processor.logic.links.CommonKeywordExtractor;
|
||||
import nu.marginalia.converting.processor.logic.links.InternalLinkGraph;
|
||||
|
||||
import javax.inject.Singleton;
|
||||
import java.util.HashMap;
|
||||
|
@ -8,6 +8,11 @@ import org.jsoup.select.NodeFilter;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/** Prune the DOM and remove noisy branches with a lot of tags and not a lot of text.
|
||||
* This removes a lot of noise and keeps segments that are more or less just plain text.
|
||||
* <p>
|
||||
* Used with JSoup's Document.filter() method
|
||||
*/
|
||||
public class DomPruningFilter implements NodeFilter {
|
||||
|
||||
private final double pruneThreshold;
|
||||
|
@ -44,7 +44,12 @@ public class FeatureExtractor {
|
||||
private final GoogleAnwersSpamDetector googleAnwersSpamDetector;
|
||||
|
||||
@Inject
|
||||
public FeatureExtractor(AdblockSimulator adblockSimulator, RecipeDetector recipeDetector, TextileCraftDetector textileCraftDetector, WoodworkingDetector woodworkingDetector, GoogleAnwersSpamDetector googleAnwersSpamDetector) {
|
||||
public FeatureExtractor(AdblockSimulator adblockSimulator,
|
||||
RecipeDetector recipeDetector,
|
||||
TextileCraftDetector textileCraftDetector,
|
||||
WoodworkingDetector woodworkingDetector,
|
||||
GoogleAnwersSpamDetector googleAnwersSpamDetector)
|
||||
{
|
||||
this.adblockSimulator = adblockSimulator;
|
||||
this.recipeDetector = recipeDetector;
|
||||
this.textileCraftDetector = textileCraftDetector;
|
||||
|
@ -19,9 +19,6 @@ public class FeedExtractor {
|
||||
|
||||
public Optional<EdgeUrl> getFeedFromAlternateTag(EdgeUrl crawlUrl, Element alternateTag) {
|
||||
var type = alternateTag.attr("type");
|
||||
if (type == null) {
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
try {
|
||||
var url = linkParser.parseLink(crawlUrl, alternateTag.attr("href"));
|
||||
|
@ -13,7 +13,6 @@ public class PlainTextLogic {
|
||||
public String getDescription(List<String> firstFewLines) {
|
||||
return StringUtils.truncate(firstFewLines.stream().filter(this::looksLikeText)
|
||||
.collect(Collectors.joining(" ")).replaceAll("\\s+", " ")
|
||||
|
||||
, 255);
|
||||
}
|
||||
|
||||
|
@ -1,74 +0,0 @@
|
||||
package nu.marginalia.converting.processor.logic;
|
||||
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
public class SalientImageDetector {
|
||||
|
||||
public boolean hasSalientImage(Document document) {
|
||||
document.getElementsByTag("a").removeIf(Element::hasText);
|
||||
|
||||
Map<String, Integer> counts = new HashMap<>();
|
||||
for (var elem : document.getElementsByTag("img")) {
|
||||
counts.merge(elem.attr("src"), 1, Integer::sum);
|
||||
}
|
||||
for (var elem : document.select("p,div,section,article,font,center")) {
|
||||
|
||||
String tagName = elem.tagName();
|
||||
if (("p".equals(tagName) || "center".equals(tagName) || "font".equals(tagName))
|
||||
&& elem.text().length() < 16)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
if (aTagDensity(elem) < 0.1 && htmlTagDensity(elem) > 0.85) {
|
||||
for (var imgTag : elem.getElementsByTag("img")) {
|
||||
if (counts.getOrDefault(imgTag.attr("src"), 1) > 1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (isSmall(imgTag)) {
|
||||
if (!imgTag.id().isBlank()) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
|
||||
}
|
||||
|
||||
private boolean isSmall(Element imgTag) {
|
||||
final String width = imgTag.attr("width");
|
||||
final String height = imgTag.attr("height");
|
||||
|
||||
if (width.isBlank() || height.isBlank())
|
||||
return true;
|
||||
|
||||
try {
|
||||
if (Integer.parseInt(width) < 400)
|
||||
return true;
|
||||
if (Integer.parseInt(height) < 400)
|
||||
return true;
|
||||
}
|
||||
catch (NumberFormatException ex) { /* no-op */ }
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
private double htmlTagDensity(Element elem) {
|
||||
return (double) elem.text().length() / elem.html().length();
|
||||
}
|
||||
|
||||
private double aTagDensity(Element elem) {
|
||||
return (double) elem.getElementsByTag("a").text().length() / elem.text().length();
|
||||
}
|
||||
|
||||
}
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.converting.processor.logic;
|
||||
package nu.marginalia.converting.processor.logic.links;
|
||||
|
||||
import ca.rmen.porterstemmer.PorterStemmer;
|
||||
import nu.marginalia.model.crawl.EdgePageWordFlags;
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.converting.processor.logic;
|
||||
package nu.marginalia.converting.processor.logic.links;
|
||||
|
||||
import nu.marginalia.model.crawl.EdgePageWordFlags;
|
||||
import nu.marginalia.converting.model.ProcessedDocument;
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.converting.processor.logic;
|
||||
package nu.marginalia.converting.processor.logic.links;
|
||||
|
||||
import nu.marginalia.converting.model.ProcessedDocumentDetails;
|
||||
import nu.marginalia.crawling.common.blocklist.UrlBlocklist;
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.converting.processor.logic;
|
||||
package nu.marginalia.converting.processor.logic.summary;
|
||||
|
||||
import com.google.common.base.Strings;
|
||||
import org.apache.commons.lang3.StringUtils;
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.converting.processor.logic;
|
||||
package nu.marginalia.converting.processor.logic.summary;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.name.Named;
|
@ -2,6 +2,8 @@ package nu.marginalia.converting.processor.plugin;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.name.Named;
|
||||
import nu.marginalia.converting.processor.logic.links.LinkProcessor;
|
||||
import nu.marginalia.converting.processor.logic.summary.SummaryExtractor;
|
||||
import nu.marginalia.crawling.common.link.LinkParser;
|
||||
import nu.marginalia.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.crawling.model.CrawledDomain;
|
||||
|
@ -1,8 +1,8 @@
|
||||
package nu.marginalia.converting.logic;
|
||||
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.converting.processor.logic.SummaryExtractionFilter;
|
||||
import nu.marginalia.converting.processor.logic.SummaryExtractor;
|
||||
import nu.marginalia.converting.processor.logic.summary.SummaryExtractionFilter;
|
||||
import nu.marginalia.converting.processor.logic.summary.SummaryExtractor;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
|
Loading…
Reference in New Issue
Block a user