mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
Refactoring converting-process
This commit is contained in:
parent
bce452fb4f
commit
43f3380cb9
@ -17,8 +17,6 @@ public class ProcessedDocument {
|
|||||||
public EdgeUrlState state;
|
public EdgeUrlState state;
|
||||||
public String stateReason;
|
public String stateReason;
|
||||||
|
|
||||||
public long lshHash;
|
|
||||||
|
|
||||||
public boolean isOk() {
|
public boolean isOk() {
|
||||||
return EdgeUrlState.OK == state;
|
return EdgeUrlState.OK == state;
|
||||||
}
|
}
|
||||||
|
@ -11,7 +11,7 @@ import nu.marginalia.converting.model.ProcessedDomain;
|
|||||||
import nu.marginalia.model.EdgeDomain;
|
import nu.marginalia.model.EdgeDomain;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.util.StringPool;
|
import nu.marginalia.util.StringPool;
|
||||||
import nu.marginalia.converting.processor.logic.InternalLinkGraph;
|
import nu.marginalia.converting.processor.logic.links.InternalLinkGraph;
|
||||||
import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator;
|
import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
@ -5,8 +5,8 @@ import nu.marginalia.model.crawl.EdgePageWordFlags;
|
|||||||
import nu.marginalia.converting.model.ProcessedDocument;
|
import nu.marginalia.converting.model.ProcessedDocument;
|
||||||
import nu.marginalia.converting.model.ProcessedDomain;
|
import nu.marginalia.converting.model.ProcessedDomain;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.converting.processor.logic.CommonKeywordExtractor;
|
import nu.marginalia.converting.processor.logic.links.CommonKeywordExtractor;
|
||||||
import nu.marginalia.converting.processor.logic.InternalLinkGraph;
|
import nu.marginalia.converting.processor.logic.links.InternalLinkGraph;
|
||||||
|
|
||||||
import javax.inject.Singleton;
|
import javax.inject.Singleton;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
@ -8,6 +8,11 @@ import org.jsoup.select.NodeFilter;
|
|||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
|
/** Prune the DOM and remove noisy branches with a lot of tags and not a lot of text.
|
||||||
|
* This removes a lot of noise and keeps segments that are more or less just plain text.
|
||||||
|
* <p>
|
||||||
|
* Used with JSoup's Document.filter() method
|
||||||
|
*/
|
||||||
public class DomPruningFilter implements NodeFilter {
|
public class DomPruningFilter implements NodeFilter {
|
||||||
|
|
||||||
private final double pruneThreshold;
|
private final double pruneThreshold;
|
||||||
|
@ -44,7 +44,12 @@ public class FeatureExtractor {
|
|||||||
private final GoogleAnwersSpamDetector googleAnwersSpamDetector;
|
private final GoogleAnwersSpamDetector googleAnwersSpamDetector;
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public FeatureExtractor(AdblockSimulator adblockSimulator, RecipeDetector recipeDetector, TextileCraftDetector textileCraftDetector, WoodworkingDetector woodworkingDetector, GoogleAnwersSpamDetector googleAnwersSpamDetector) {
|
public FeatureExtractor(AdblockSimulator adblockSimulator,
|
||||||
|
RecipeDetector recipeDetector,
|
||||||
|
TextileCraftDetector textileCraftDetector,
|
||||||
|
WoodworkingDetector woodworkingDetector,
|
||||||
|
GoogleAnwersSpamDetector googleAnwersSpamDetector)
|
||||||
|
{
|
||||||
this.adblockSimulator = adblockSimulator;
|
this.adblockSimulator = adblockSimulator;
|
||||||
this.recipeDetector = recipeDetector;
|
this.recipeDetector = recipeDetector;
|
||||||
this.textileCraftDetector = textileCraftDetector;
|
this.textileCraftDetector = textileCraftDetector;
|
||||||
|
@ -19,9 +19,6 @@ public class FeedExtractor {
|
|||||||
|
|
||||||
public Optional<EdgeUrl> getFeedFromAlternateTag(EdgeUrl crawlUrl, Element alternateTag) {
|
public Optional<EdgeUrl> getFeedFromAlternateTag(EdgeUrl crawlUrl, Element alternateTag) {
|
||||||
var type = alternateTag.attr("type");
|
var type = alternateTag.attr("type");
|
||||||
if (type == null) {
|
|
||||||
return Optional.empty();
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
var url = linkParser.parseLink(crawlUrl, alternateTag.attr("href"));
|
var url = linkParser.parseLink(crawlUrl, alternateTag.attr("href"));
|
||||||
|
@ -13,7 +13,6 @@ public class PlainTextLogic {
|
|||||||
public String getDescription(List<String> firstFewLines) {
|
public String getDescription(List<String> firstFewLines) {
|
||||||
return StringUtils.truncate(firstFewLines.stream().filter(this::looksLikeText)
|
return StringUtils.truncate(firstFewLines.stream().filter(this::looksLikeText)
|
||||||
.collect(Collectors.joining(" ")).replaceAll("\\s+", " ")
|
.collect(Collectors.joining(" ")).replaceAll("\\s+", " ")
|
||||||
|
|
||||||
, 255);
|
, 255);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,74 +0,0 @@
|
|||||||
package nu.marginalia.converting.processor.logic;
|
|
||||||
|
|
||||||
import org.jsoup.nodes.Document;
|
|
||||||
import org.jsoup.nodes.Element;
|
|
||||||
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
public class SalientImageDetector {
|
|
||||||
|
|
||||||
public boolean hasSalientImage(Document document) {
|
|
||||||
document.getElementsByTag("a").removeIf(Element::hasText);
|
|
||||||
|
|
||||||
Map<String, Integer> counts = new HashMap<>();
|
|
||||||
for (var elem : document.getElementsByTag("img")) {
|
|
||||||
counts.merge(elem.attr("src"), 1, Integer::sum);
|
|
||||||
}
|
|
||||||
for (var elem : document.select("p,div,section,article,font,center")) {
|
|
||||||
|
|
||||||
String tagName = elem.tagName();
|
|
||||||
if (("p".equals(tagName) || "center".equals(tagName) || "font".equals(tagName))
|
|
||||||
&& elem.text().length() < 16)
|
|
||||||
{
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (aTagDensity(elem) < 0.1 && htmlTagDensity(elem) > 0.85) {
|
|
||||||
for (var imgTag : elem.getElementsByTag("img")) {
|
|
||||||
if (counts.getOrDefault(imgTag.attr("src"), 1) > 1) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (isSmall(imgTag)) {
|
|
||||||
if (!imgTag.id().isBlank()) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return false;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
private boolean isSmall(Element imgTag) {
|
|
||||||
final String width = imgTag.attr("width");
|
|
||||||
final String height = imgTag.attr("height");
|
|
||||||
|
|
||||||
if (width.isBlank() || height.isBlank())
|
|
||||||
return true;
|
|
||||||
|
|
||||||
try {
|
|
||||||
if (Integer.parseInt(width) < 400)
|
|
||||||
return true;
|
|
||||||
if (Integer.parseInt(height) < 400)
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
catch (NumberFormatException ex) { /* no-op */ }
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
private double htmlTagDensity(Element elem) {
|
|
||||||
return (double) elem.text().length() / elem.html().length();
|
|
||||||
}
|
|
||||||
|
|
||||||
private double aTagDensity(Element elem) {
|
|
||||||
return (double) elem.getElementsByTag("a").text().length() / elem.text().length();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,4 +1,4 @@
|
|||||||
package nu.marginalia.converting.processor.logic;
|
package nu.marginalia.converting.processor.logic.links;
|
||||||
|
|
||||||
import ca.rmen.porterstemmer.PorterStemmer;
|
import ca.rmen.porterstemmer.PorterStemmer;
|
||||||
import nu.marginalia.model.crawl.EdgePageWordFlags;
|
import nu.marginalia.model.crawl.EdgePageWordFlags;
|
@ -1,4 +1,4 @@
|
|||||||
package nu.marginalia.converting.processor.logic;
|
package nu.marginalia.converting.processor.logic.links;
|
||||||
|
|
||||||
import nu.marginalia.model.crawl.EdgePageWordFlags;
|
import nu.marginalia.model.crawl.EdgePageWordFlags;
|
||||||
import nu.marginalia.converting.model.ProcessedDocument;
|
import nu.marginalia.converting.model.ProcessedDocument;
|
@ -1,4 +1,4 @@
|
|||||||
package nu.marginalia.converting.processor.logic;
|
package nu.marginalia.converting.processor.logic.links;
|
||||||
|
|
||||||
import nu.marginalia.converting.model.ProcessedDocumentDetails;
|
import nu.marginalia.converting.model.ProcessedDocumentDetails;
|
||||||
import nu.marginalia.crawling.common.blocklist.UrlBlocklist;
|
import nu.marginalia.crawling.common.blocklist.UrlBlocklist;
|
@ -1,4 +1,4 @@
|
|||||||
package nu.marginalia.converting.processor.logic;
|
package nu.marginalia.converting.processor.logic.summary;
|
||||||
|
|
||||||
import com.google.common.base.Strings;
|
import com.google.common.base.Strings;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
@ -1,4 +1,4 @@
|
|||||||
package nu.marginalia.converting.processor.logic;
|
package nu.marginalia.converting.processor.logic.summary;
|
||||||
|
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.name.Named;
|
import com.google.inject.name.Named;
|
@ -2,6 +2,8 @@ package nu.marginalia.converting.processor.plugin;
|
|||||||
|
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.name.Named;
|
import com.google.inject.name.Named;
|
||||||
|
import nu.marginalia.converting.processor.logic.links.LinkProcessor;
|
||||||
|
import nu.marginalia.converting.processor.logic.summary.SummaryExtractor;
|
||||||
import nu.marginalia.crawling.common.link.LinkParser;
|
import nu.marginalia.crawling.common.link.LinkParser;
|
||||||
import nu.marginalia.crawling.model.CrawledDocument;
|
import nu.marginalia.crawling.model.CrawledDocument;
|
||||||
import nu.marginalia.crawling.model.CrawledDomain;
|
import nu.marginalia.crawling.model.CrawledDomain;
|
||||||
|
@ -1,8 +1,8 @@
|
|||||||
package nu.marginalia.converting.logic;
|
package nu.marginalia.converting.logic;
|
||||||
|
|
||||||
import nu.marginalia.WmsaHome;
|
import nu.marginalia.WmsaHome;
|
||||||
import nu.marginalia.converting.processor.logic.SummaryExtractionFilter;
|
import nu.marginalia.converting.processor.logic.summary.SummaryExtractionFilter;
|
||||||
import nu.marginalia.converting.processor.logic.SummaryExtractor;
|
import nu.marginalia.converting.processor.logic.summary.SummaryExtractor;
|
||||||
import org.jsoup.Jsoup;
|
import org.jsoup.Jsoup;
|
||||||
import org.junit.jupiter.api.Assertions;
|
import org.junit.jupiter.api.Assertions;
|
||||||
import org.junit.jupiter.api.BeforeEach;
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
Loading…
Reference in New Issue
Block a user