mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
Clean up summary extractor module.
This commit is contained in:
parent
6a20b2b678
commit
43430728aa
@ -2,10 +2,12 @@ package nu.marginalia.summary;
|
|||||||
|
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.name.Named;
|
import com.google.inject.name.Named;
|
||||||
|
import nu.marginalia.summary.heuristic.*;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.jsoup.nodes.Document;
|
import org.jsoup.nodes.Document;
|
||||||
import org.jsoup.nodes.Element;
|
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
public class SummaryExtractor {
|
public class SummaryExtractor {
|
||||||
@ -13,100 +15,36 @@ public class SummaryExtractor {
|
|||||||
|
|
||||||
private final Pattern truncatedCharacters = Pattern.compile("[\\-.,!?' ]{3,}");
|
private final Pattern truncatedCharacters = Pattern.compile("[\\-.,!?' ]{3,}");
|
||||||
|
|
||||||
|
private final List<SummaryHeuristic> heuristics = new ArrayList<>();
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public SummaryExtractor(@Named("max-summary-length") Integer maxSummaryLength) {
|
public SummaryExtractor(@Named("max-summary-length") Integer maxSummaryLength,
|
||||||
|
DomFilterHeuristic domFilterHeuristic,
|
||||||
|
TagDensityHeuristic tagDensityHeuristic,
|
||||||
|
OpenGraphDescriptionHeuristic ogTagHeuristic,
|
||||||
|
MetaDescriptionHeuristic metaDescriptionHeuristic,
|
||||||
|
FallbackHeuristic fallbackHeuristic)
|
||||||
|
{
|
||||||
this.maxSummaryLength = maxSummaryLength;
|
this.maxSummaryLength = maxSummaryLength;
|
||||||
|
|
||||||
|
heuristics.add(domFilterHeuristic);
|
||||||
|
heuristics.add(tagDensityHeuristic);
|
||||||
|
heuristics.add(ogTagHeuristic);
|
||||||
|
heuristics.add(metaDescriptionHeuristic);
|
||||||
|
heuristics.add(fallbackHeuristic);
|
||||||
}
|
}
|
||||||
|
|
||||||
public String extractSummary(Document parsed) {
|
public String extractSummary(Document parsed) {
|
||||||
String summaryString = extractSummaryRaw(parsed);
|
|
||||||
|
|
||||||
summaryString = truncatedCharacters.matcher(summaryString).replaceAll(" ");
|
|
||||||
summaryString = StringUtils.abbreviate(summaryString, "", maxSummaryLength);
|
|
||||||
|
|
||||||
return summaryString;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private String extractSummaryRaw(Document parsed) {
|
|
||||||
|
|
||||||
String maybe;
|
|
||||||
|
|
||||||
parsed.select("header,nav,#header,#nav,#navigation,.header,.nav,.navigation,ul,li").remove();
|
parsed.select("header,nav,#header,#nav,#navigation,.header,.nav,.navigation,ul,li").remove();
|
||||||
|
|
||||||
// Plan A
|
for (var heuristic : heuristics) {
|
||||||
maybe = getSummaryNew(parsed.clone());
|
String maybe = heuristic.summarize(parsed);
|
||||||
if (!maybe.isBlank()) return maybe;
|
if (!maybe.isBlank()) {
|
||||||
|
String cleaned = truncatedCharacters.matcher(maybe).replaceAll(" ");
|
||||||
maybe = getSummaryByTagDensity(parsed.clone());
|
return StringUtils.abbreviate(cleaned, "", maxSummaryLength);
|
||||||
if (!maybe.isBlank()) return maybe;
|
|
||||||
|
|
||||||
// Plan B: Open Graph Description
|
|
||||||
maybe = parsed.select("meta[name=og:description]").attr("content");
|
|
||||||
if (!maybe.isBlank()) return maybe;
|
|
||||||
|
|
||||||
// Plan C: Ye Olde meta-description
|
|
||||||
maybe = parsed.select("meta[name=description]").attr("content");
|
|
||||||
if (!maybe.isBlank()) return maybe;
|
|
||||||
|
|
||||||
// Plan D: The kitchen sink?
|
|
||||||
return lastDitchSummaryEffort(parsed);
|
|
||||||
}
|
|
||||||
|
|
||||||
private String getSummaryNew(Document parsed) {
|
|
||||||
var filter = new SummaryExtractionFilter();
|
|
||||||
|
|
||||||
parsed.filter(filter);
|
|
||||||
|
|
||||||
return filter.getSummary(maxSummaryLength+32);
|
|
||||||
}
|
|
||||||
|
|
||||||
private String getSummaryByTagDensity(Document parsed) {
|
|
||||||
StringBuilder content = new StringBuilder();
|
|
||||||
|
|
||||||
for (var elem : parsed.select("p,div,section,article,font,center")) {
|
|
||||||
if (content.length() >= maxSummaryLength) break;
|
|
||||||
|
|
||||||
String tagName = elem.tagName();
|
|
||||||
if (("p".equals(tagName) || "center".equals(tagName) || "font".equals(tagName))
|
|
||||||
&& elem.text().length() < 16)
|
|
||||||
{
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (aTagDensity(elem) < 0.1 && htmlTagDensity(elem) > 0.85) {
|
|
||||||
content.append(elem.text()).append(' ');
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (content.length() > 32) {
|
|
||||||
// AAAABBBBCCCCDDDDEEEEFFFFGGGGHHHH
|
|
||||||
return content.toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
|
|
||||||
private String lastDitchSummaryEffort(Document parsed) {
|
|
||||||
int bodyTextLength = parsed.body().text().length();
|
|
||||||
|
|
||||||
parsed.getElementsByTag("a").remove();
|
|
||||||
|
|
||||||
for (var elem : parsed.select("p,div,section,article,font,center,td,h1,h2,h3,h4,h5,h6,tr,th")) {
|
|
||||||
if (elem.text().length() < bodyTextLength / 2 && aTagDensity(elem) > 0.25) {
|
|
||||||
elem.remove();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return parsed.body().text();
|
|
||||||
}
|
|
||||||
|
|
||||||
private double htmlTagDensity(Element elem) {
|
|
||||||
return (double) elem.text().length() / elem.html().length();
|
|
||||||
}
|
|
||||||
|
|
||||||
private double aTagDensity(Element elem) {
|
|
||||||
return (double) elem.getElementsByTag("a").text().length() / elem.text().length();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,26 @@
|
|||||||
|
package nu.marginalia.summary.heuristic;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.name.Named;
|
||||||
|
import nu.marginalia.summary.SummaryExtractionFilter;
|
||||||
|
import org.jsoup.nodes.Document;
|
||||||
|
|
||||||
|
public class DomFilterHeuristic implements SummaryHeuristic {
|
||||||
|
private final int maxSummaryLength;
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public DomFilterHeuristic(@Named("max-summary-length") Integer maxSummaryLength) {
|
||||||
|
this.maxSummaryLength = maxSummaryLength;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String summarize(Document doc) {
|
||||||
|
doc = doc.clone();
|
||||||
|
|
||||||
|
var filter = new SummaryExtractionFilter();
|
||||||
|
|
||||||
|
doc.filter(filter);
|
||||||
|
|
||||||
|
return filter.getSummary(maxSummaryLength+32);
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,28 @@
|
|||||||
|
package nu.marginalia.summary.heuristic;
|
||||||
|
|
||||||
|
import org.jsoup.nodes.Document;
|
||||||
|
import org.jsoup.nodes.Element;
|
||||||
|
|
||||||
|
public class FallbackHeuristic implements SummaryHeuristic {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String summarize(Document doc) {
|
||||||
|
doc = doc.clone();
|
||||||
|
|
||||||
|
int bodyTextLength = doc.body().text().length();
|
||||||
|
|
||||||
|
doc.getElementsByTag("a").remove();
|
||||||
|
|
||||||
|
for (var elem : doc.select("p,div,section,article,font,center,td,h1,h2,h3,h4,h5,h6,tr,th")) {
|
||||||
|
if (elem.text().length() < bodyTextLength / 2 && aTagDensity(elem) > 0.25) {
|
||||||
|
elem.remove();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return doc.body().text();
|
||||||
|
}
|
||||||
|
|
||||||
|
private double aTagDensity(Element elem) {
|
||||||
|
return (double) elem.getElementsByTag("a").text().length() / elem.text().length();
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,10 @@
|
|||||||
|
package nu.marginalia.summary.heuristic;
|
||||||
|
|
||||||
|
import org.jsoup.nodes.Document;
|
||||||
|
|
||||||
|
public class MetaDescriptionHeuristic implements SummaryHeuristic {
|
||||||
|
@Override
|
||||||
|
public String summarize(Document doc) {
|
||||||
|
return doc.select("meta[name=description]").attr("content");
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,10 @@
|
|||||||
|
package nu.marginalia.summary.heuristic;
|
||||||
|
|
||||||
|
import org.jsoup.nodes.Document;
|
||||||
|
|
||||||
|
public class OpenGraphDescriptionHeuristic implements SummaryHeuristic {
|
||||||
|
@Override
|
||||||
|
public String summarize(Document doc) {
|
||||||
|
return doc.select("meta[name=og:description]").attr("content");
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,7 @@
|
|||||||
|
package nu.marginalia.summary.heuristic;
|
||||||
|
|
||||||
|
import org.jsoup.nodes.Document;
|
||||||
|
|
||||||
|
public interface SummaryHeuristic {
|
||||||
|
String summarize(Document doc);
|
||||||
|
}
|
@ -0,0 +1,53 @@
|
|||||||
|
package nu.marginalia.summary.heuristic;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.name.Named;
|
||||||
|
import org.jsoup.nodes.Document;
|
||||||
|
import org.jsoup.nodes.Element;
|
||||||
|
|
||||||
|
public class TagDensityHeuristic implements SummaryHeuristic {
|
||||||
|
private final int maxSummaryLength;
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public TagDensityHeuristic(@Named("max-summary-length") Integer maxSummaryLength) {
|
||||||
|
this.maxSummaryLength = maxSummaryLength;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String summarize(Document doc) {
|
||||||
|
doc = doc.clone();
|
||||||
|
|
||||||
|
StringBuilder content = new StringBuilder();
|
||||||
|
|
||||||
|
for (var elem : doc.select("p,div,section,article,font,center")) {
|
||||||
|
if (content.length() >= maxSummaryLength) break;
|
||||||
|
|
||||||
|
String tagName = elem.tagName();
|
||||||
|
if (("p".equals(tagName) || "center".equals(tagName) || "font".equals(tagName))
|
||||||
|
&& elem.text().length() < 16)
|
||||||
|
{
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (aTagDensity(elem) < 0.1 && htmlTagDensity(elem) > 0.85) {
|
||||||
|
content.append(elem.text()).append(' ');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (content.length() > 32) {
|
||||||
|
// AAAABBBBCCCCDDDDEEEEFFFFGGGGHHHH
|
||||||
|
return content.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
|
||||||
|
private double htmlTagDensity(Element elem) {
|
||||||
|
return (double) elem.text().length() / elem.html().length();
|
||||||
|
}
|
||||||
|
|
||||||
|
private double aTagDensity(Element elem) {
|
||||||
|
return (double) elem.getElementsByTag("a").text().length() / elem.text().length();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -1,7 +1,6 @@
|
|||||||
package nu.marginalia.summary;
|
package nu.marginalia.summary;
|
||||||
|
|
||||||
import nu.marginalia.summary.SummaryExtractionFilter;
|
import nu.marginalia.summary.heuristic.*;
|
||||||
import nu.marginalia.summary.SummaryExtractor;
|
|
||||||
import org.jsoup.Jsoup;
|
import org.jsoup.Jsoup;
|
||||||
import org.junit.jupiter.api.Assertions;
|
import org.junit.jupiter.api.Assertions;
|
||||||
import org.junit.jupiter.api.BeforeEach;
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
@ -18,7 +17,12 @@ class SummaryExtractorTest {
|
|||||||
SummaryExtractor summaryExtractor;
|
SummaryExtractor summaryExtractor;
|
||||||
@BeforeEach
|
@BeforeEach
|
||||||
public void setUp() {
|
public void setUp() {
|
||||||
summaryExtractor = new SummaryExtractor(255);
|
summaryExtractor = new SummaryExtractor(255,
|
||||||
|
new DomFilterHeuristic(255),
|
||||||
|
new TagDensityHeuristic(255),
|
||||||
|
new OpenGraphDescriptionHeuristic(),
|
||||||
|
new MetaDescriptionHeuristic(),
|
||||||
|
new FallbackHeuristic());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
Loading…
Reference in New Issue
Block a user