Refactor SummaryExtractor into pluggable SummaryHeuristic implementations.

The extraction strategies that used to be private methods of SummaryExtractor
(DOM filter, tag density, og:description, meta description, kitchen-sink
fallback) become one class each behind the SummaryHeuristic interface.
SummaryExtractor tries them in that order and returns the first non-blank
result after collapsing punctuation runs and abbreviating it to the maximum
summary length.

Review fixes folded into this revision of the patch:
  * SummaryExtractor: the heuristics list is declared List<SummaryHeuristic>.
    As a raw List the for-each variable is an Object and summarize() does not
    resolve.
  * SummaryExtractor: header/nav/list elements are stripped again before any
    heuristic runs, as extractSummaryRaw() did before the refactoring.
  * OpenGraphDescriptionHeuristic: also matches <meta property="og:description">,
    the attribute the Open Graph protocol specifies.
  * FallbackHeuristic: a review note marks the link-density loop, which cannot
    remove anything because the anchors are deleted before it runs. Behaviour
    is left unchanged.
  * Javadoc added to the new classes and to extractSummary().

NOTE(review): this patch was recovered from a copy that had been collapsed onto
a handful of lines. Line breaks and indentation are reconstructed. The second
SummaryExtractor.java hunk header was recomputed from the lines that survive
(the collapsed copy declared -13,100 +15,36, i.e. two more context lines,
presumably blank, than are recoverable). The blob ids on the "index" lines are
carried over unchanged and do not reflect the edits above. Regenerate the patch
against the tree before applying it.

diff --git a/code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/SummaryExtractor.java b/code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/SummaryExtractor.java
index 98e81fcf..73d16b3b 100644
--- a/code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/SummaryExtractor.java
+++ b/code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/SummaryExtractor.java
@@ -2,10 +2,12 @@ package nu.marginalia.summary;
 
 import com.google.inject.Inject;
 import com.google.inject.name.Named;
+import nu.marginalia.summary.heuristic.*;
 import org.apache.commons.lang3.StringUtils;
 import org.jsoup.nodes.Document;
-import org.jsoup.nodes.Element;
 
+import java.util.ArrayList;
+import java.util.List;
 import java.util.regex.Pattern;
 
 public class SummaryExtractor {
@@ -13,98 +15,49 @@ public class SummaryExtractor {
 
     private final Pattern truncatedCharacters = Pattern.compile("[\\-.,!?' ]{3,}");
 
+    /** Heuristics in the order they are tried; the first non-blank summary wins. */
+    private final List<SummaryHeuristic> heuristics = new ArrayList<>();
+
     @Inject
-    public SummaryExtractor(@Named("max-summary-length") Integer maxSummaryLength) {
+    public SummaryExtractor(@Named("max-summary-length") Integer maxSummaryLength,
+                            DomFilterHeuristic domFilterHeuristic,
+                            TagDensityHeuristic tagDensityHeuristic,
+                            OpenGraphDescriptionHeuristic ogTagHeuristic,
+                            MetaDescriptionHeuristic metaDescriptionHeuristic,
+                            FallbackHeuristic fallbackHeuristic)
+    {
         this.maxSummaryLength = maxSummaryLength;
+
+        heuristics.add(domFilterHeuristic);
+        heuristics.add(tagDensityHeuristic);
+        heuristics.add(ogTagHeuristic);
+        heuristics.add(metaDescriptionHeuristic);
+        heuristics.add(fallbackHeuristic);
     }
 
+    /**
+     * Produces a short text summary of the document.
+     * <p>
+     * Header, navigation and list elements are first removed from {@code parsed}
+     * itself. The heuristics are then tried in order; the first non-blank result has
+     * each run of three or more punctuation or space characters replaced by a single
+     * space and is abbreviated to {@code maxSummaryLength} characters.
+     *
+     * @param parsed the document to summarize; modified in place
+     * @return the summary, or an empty string if no heuristic produced one
+     */
     public String extractSummary(Document parsed) {
-        String summaryString = extractSummaryRaw(parsed);
-
-        summaryString = truncatedCharacters.matcher(summaryString).replaceAll(" ");
-        summaryString = StringUtils.abbreviate(summaryString, "", maxSummaryLength);
-
-        return summaryString;
-    }
-
-
-    private String extractSummaryRaw(Document parsed) {
-
-        String maybe;
-        parsed.select("header,nav,#header,#nav,#navigation,.header,.nav,.navigation,ul,li").remove();
-        // Plan A
-        maybe = getSummaryNew(parsed.clone());
-        if (!maybe.isBlank()) return maybe;
-
-        maybe = getSummaryByTagDensity(parsed.clone());
-        if (!maybe.isBlank()) return maybe;
-
-        // Plan B: Open Graph Description
-        maybe = parsed.select("meta[name=og:description]").attr("content");
-        if (!maybe.isBlank()) return maybe;
-
-        // Plan C: Ye Olde meta-description
-        maybe = parsed.select("meta[name=description]").attr("content");
-        if (!maybe.isBlank()) return maybe;
-
-        // Plan D: The kitchen sink?
-        return lastDitchSummaryEffort(parsed);
-    }
-
-    private String getSummaryNew(Document parsed) {
-        var filter = new SummaryExtractionFilter();
-
-        parsed.filter(filter);
-
-        return filter.getSummary(maxSummaryLength+32);
-    }
-
-    private String getSummaryByTagDensity(Document parsed) {
-        StringBuilder content = new StringBuilder();
-
-        for (var elem : parsed.select("p,div,section,article,font,center")) {
-            if (content.length() >= maxSummaryLength) break;
-
-            String tagName = elem.tagName();
-            if (("p".equals(tagName) || "center".equals(tagName) || "font".equals(tagName))
-                    && elem.text().length() < 16)
-            {
-                continue;
-            }
-
-            if (aTagDensity(elem) < 0.1 && htmlTagDensity(elem) > 0.85) {
-                content.append(elem.text()).append(' ');
+        // Strip header/navigation/list boilerplate before any heuristic sees the document.
+        parsed.select("header,nav,#header,#nav,#navigation,.header,.nav,.navigation,ul,li").remove();
+
+        for (var heuristic : heuristics) {
+            String maybe = heuristic.summarize(parsed);
+            if (!maybe.isBlank()) {
+                String cleaned = truncatedCharacters.matcher(maybe).replaceAll(" ");
+                return StringUtils.abbreviate(cleaned, "", maxSummaryLength);
             }
         }
-
-        if (content.length() > 32) {
-            // AAAABBBBCCCCDDDDEEEEFFFFGGGGHHHH
-            return content.toString();
-        }
-
         return "";
     }
 
-    private String lastDitchSummaryEffort(Document parsed) {
-        int bodyTextLength = parsed.body().text().length();
-
-        parsed.getElementsByTag("a").remove();
-
-        for (var elem : parsed.select("p,div,section,article,font,center,td,h1,h2,h3,h4,h5,h6,tr,th")) {
-            if (elem.text().length() < bodyTextLength / 2 && aTagDensity(elem) > 0.25) {
-                elem.remove();
-            }
-        }
-
-        return parsed.body().text();
-    }
-
-    private double htmlTagDensity(Element elem) {
-        return (double) elem.text().length() / elem.html().length();
-    }
-
-    private double aTagDensity(Element elem) {
-        return (double) elem.getElementsByTag("a").text().length() / elem.text().length();
-    }
-
 }
diff --git a/code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/heuristic/DomFilterHeuristic.java b/code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/heuristic/DomFilterHeuristic.java
new file mode 100644
index 00000000..cb24dd2d
--- /dev/null
+++ b/code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/heuristic/DomFilterHeuristic.java
@@ -0,0 +1,30 @@
+package nu.marginalia.summary.heuristic;
+
+import com.google.inject.Inject;
+import com.google.inject.name.Named;
+import nu.marginalia.summary.SummaryExtractionFilter;
+import org.jsoup.nodes.Document;
+
+/**
+ * Summarizes a document by running a {@link SummaryExtractionFilter} over a clone of
+ * it, leaving the caller's document untouched.
+ */
+public class DomFilterHeuristic implements SummaryHeuristic {
+    private final int maxSummaryLength;
+
+    @Inject
+    public DomFilterHeuristic(@Named("max-summary-length") Integer maxSummaryLength) {
+        this.maxSummaryLength = maxSummaryLength;
+    }
+
+    @Override
+    public String summarize(Document doc) {
+        doc = doc.clone();
+
+        var filter = new SummaryExtractionFilter();
+
+        doc.filter(filter);
+
+        return filter.getSummary(maxSummaryLength+32);
+    }
+}
diff --git a/code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/heuristic/FallbackHeuristic.java b/code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/heuristic/FallbackHeuristic.java
new file mode 100644
index 00000000..d57383b9
--- /dev/null
+++ b/code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/heuristic/FallbackHeuristic.java
@@ -0,0 +1,36 @@
+package nu.marginalia.summary.heuristic;
+
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+
+/**
+ * Last-resort heuristic: returns the body text of a clone of the document after
+ * removing its anchor elements.
+ */
+public class FallbackHeuristic implements SummaryHeuristic {
+
+    @Override
+    public String summarize(Document doc) {
+        doc = doc.clone();
+
+        int bodyTextLength = doc.body().text().length();
+
+        doc.getElementsByTag("a").remove();
+
+        // NOTE(review): the anchors were removed just above, so aTagDensity() can no
+        // longer exceed 0.25 here and this loop never removes anything. If pruning
+        // link-dense blocks is intended, it has to run before the anchors are removed.
+        for (var elem : doc.select("p,div,section,article,font,center,td,h1,h2,h3,h4,h5,h6,tr,th")) {
+            if (elem.text().length() < bodyTextLength / 2 && aTagDensity(elem) > 0.25) {
+                elem.remove();
+            }
+        }
+
+        return doc.body().text();
+    }
+
+    /** Ratio of the text inside the element's anchor tags to the element's whole text. */
+    private double aTagDensity(Element elem) {
+        return (double) elem.getElementsByTag("a").text().length() / elem.text().length();
+    }
+}
diff --git a/code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/heuristic/MetaDescriptionHeuristic.java b/code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/heuristic/MetaDescriptionHeuristic.java
new file mode 100644
index 00000000..fa45897d
--- /dev/null
+++ b/code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/heuristic/MetaDescriptionHeuristic.java
@@ -0,0 +1,11 @@
+package nu.marginalia.summary.heuristic;
+
+import org.jsoup.nodes.Document;
+
+/** Uses the {@code content} of the document's {@code <meta name="description">} tag. */
+public class MetaDescriptionHeuristic implements SummaryHeuristic {
+    @Override
+    public String summarize(Document doc) {
+        return doc.select("meta[name=description]").attr("content");
+    }
+}
diff --git a/code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/heuristic/OpenGraphDescriptionHeuristic.java b/code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/heuristic/OpenGraphDescriptionHeuristic.java
new file mode 100644
index 00000000..28198be8
--- /dev/null
+++ b/code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/heuristic/OpenGraphDescriptionHeuristic.java
@@ -0,0 +1,15 @@
+package nu.marginalia.summary.heuristic;
+
+import org.jsoup.nodes.Document;
+
+/**
+ * Uses the document's Open Graph description. The protocol carries it in
+ * {@code <meta property="og:description">}; the {@code name} form is matched as
+ * well because that is the only form this code looked for before.
+ */
+public class OpenGraphDescriptionHeuristic implements SummaryHeuristic {
+    @Override
+    public String summarize(Document doc) {
+        return doc.select("meta[property=og:description],meta[name=og:description]").attr("content");
+    }
+}
diff --git a/code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/heuristic/SummaryHeuristic.java b/code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/heuristic/SummaryHeuristic.java
new file mode 100644
index 00000000..520b5b49
--- /dev/null
+++ b/code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/heuristic/SummaryHeuristic.java
@@ -0,0 +1,12 @@
+package nu.marginalia.summary.heuristic;
+
+import org.jsoup.nodes.Document;
+
+/** A single strategy for deriving a summary text from a parsed document. */
+public interface SummaryHeuristic {
+    /**
+     * @param doc the parsed document
+     * @return the summary; SummaryExtractor treats a blank string as "no result"
+     */
+    String summarize(Document doc);
+}
diff --git a/code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/heuristic/TagDensityHeuristic.java b/code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/heuristic/TagDensityHeuristic.java
new file mode 100644
index 00000000..75531826
--- /dev/null
+++ b/code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/heuristic/TagDensityHeuristic.java
@@ -0,0 +1,57 @@
+package nu.marginalia.summary.heuristic;
+
+import com.google.inject.Inject;
+import com.google.inject.name.Named;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+
+/**
+ * Builds a summary from the text of elements that contain little link text and
+ * little markup relative to their text, working on a clone of the document.
+ */
+public class TagDensityHeuristic implements SummaryHeuristic {
+    private final int maxSummaryLength;
+
+    @Inject
+    public TagDensityHeuristic(@Named("max-summary-length") Integer maxSummaryLength) {
+        this.maxSummaryLength = maxSummaryLength;
+    }
+
+    @Override
+    public String summarize(Document doc) {
+        doc = doc.clone();
+
+        StringBuilder content = new StringBuilder();
+
+        for (var elem : doc.select("p,div,section,article,font,center")) {
+            if (content.length() >= maxSummaryLength) break;
+
+            String tagName = elem.tagName();
+            if (("p".equals(tagName) || "center".equals(tagName) || "font".equals(tagName))
+                    && elem.text().length() < 16)
+            {
+                continue;
+            }
+
+            if (aTagDensity(elem) < 0.1 && htmlTagDensity(elem) > 0.85) {
+                content.append(elem.text()).append(' ');
+            }
+        }
+
+        if (content.length() > 32) {
+            // AAAABBBBCCCCDDDDEEEEFFFFGGGGHHHH
+            return content.toString();
+        }
+
+        return "";
+    }
+
+    private double htmlTagDensity(Element elem) {
+        return (double) elem.text().length() / elem.html().length();
+    }
+
+    private double aTagDensity(Element elem) {
+        return (double) elem.getElementsByTag("a").text().length() / elem.text().length();
+    }
+
+}
diff --git a/code/features-convert/summary-extraction/src/test/java/nu/marginalia/summary/SummaryExtractorTest.java b/code/features-convert/summary-extraction/src/test/java/nu/marginalia/summary/SummaryExtractorTest.java
index 65021e0c..fafc0747 100644
--- a/code/features-convert/summary-extraction/src/test/java/nu/marginalia/summary/SummaryExtractorTest.java
+++ b/code/features-convert/summary-extraction/src/test/java/nu/marginalia/summary/SummaryExtractorTest.java
@@ -1,7 +1,6 @@
 package nu.marginalia.summary;
 
-import nu.marginalia.summary.SummaryExtractionFilter;
-import nu.marginalia.summary.SummaryExtractor;
+import nu.marginalia.summary.heuristic.*;
 import org.jsoup.Jsoup;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.BeforeEach;
@@ -18,7 +17,12 @@ class SummaryExtractorTest {
     SummaryExtractor summaryExtractor;
     @BeforeEach
     public void setUp() {
-        summaryExtractor = new SummaryExtractor(255);
+        summaryExtractor = new SummaryExtractor(255,
+                new DomFilterHeuristic(255),
+                new TagDensityHeuristic(255),
+                new OpenGraphDescriptionHeuristic(),
+                new MetaDescriptionHeuristic(),
+                new FallbackHeuristic());
     }
 
     @Test