package nu.marginalia.wmsa.edge.converting.processor.logic;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import java.util.HashMap;
import java.util.Map;

/**
 * Work-in-progress heuristic for deciding whether a document's body contains
 * a "salient" image: one that sits inside a text-heavy, link-poor container,
 * is not repeated elsewhere on the page, and is not a small image carrying
 * an {@code id}.
 *
 * <p>Provenance: introduced in commit db056be06a6bbb855717037f56bd90e3fc070481
 * by vlofgren (2022-08-24), "WIP logic for detecting significant images in the
 * body of a website."
 */
public class SalientImageDetector {

    /** {@code p}, {@code center} and {@code font} elements with less text than this are skipped. */
    private static final int MIN_INLINE_TEXT_LENGTH = 16;

    /** A container qualifies only if its link-text to total-text ratio is below this. */
    private static final double MAX_LINK_TEXT_RATIO = 0.1;

    /** A container qualifies only if its text to inner-HTML length ratio is above this. */
    private static final double MIN_TEXT_TO_HTML_RATIO = 0.85;

    /** An image whose declared width or height is below this value counts as small. */
    private static final int MIN_LARGE_DIMENSION = 400;

    /**
     * Scans the document for an image that looks like meaningful content.
     *
     * @param document the parsed document to inspect
     * @return {@code true} as soon as one qualifying image is found,
     *         {@code false} if none is found
     */
    public boolean hasSalientImage(Document document) {
        // NOTE(review): depending on the jsoup version, removeIf on the returned
        // Elements may only filter this temporary list rather than detach the
        // links from the document -- confirm which effect is intended here.
        document.getElementsByTag("a").removeIf(Element::hasText);

        // Number of <img> tags per src value; an image used more than once is
        // not considered salient.
        Map<String, Integer> counts = new HashMap<>();
        for (var elem : document.getElementsByTag("img")) {
            counts.merge(elem.attr("src"), 1, Integer::sum);
        }

        for (var elem : document.select("p,div,section,article,font,center")) {
            if (isShortInlineContainer(elem)) {
                continue;
            }

            if (aTagDensity(elem) < MAX_LINK_TEXT_RATIO
                    && htmlTagDensity(elem) > MIN_TEXT_TO_HTML_RATIO) {
                for (var imgTag : elem.getElementsByTag("img")) {
                    if (counts.getOrDefault(imgTag.attr("src"), 1) > 1) {
                        continue;
                    }

                    // Small images that carry an id are skipped; small images
                    // without an id are still accepted.
                    if (isSmall(imgTag) && !imgTag.id().isBlank()) {
                        continue;
                    }

                    return true;
                }
            }
        }

        return false;
    }

    /** Returns true for {@code p}/{@code center}/{@code font} elements with very little text. */
    private boolean isShortInlineContainer(Element elem) {
        String tagName = elem.tagName();
        boolean inlineTag = "p".equals(tagName)
                || "center".equals(tagName)
                || "font".equals(tagName);
        return inlineTag && elem.text().length() < MIN_INLINE_TEXT_LENGTH;
    }

    /**
     * Decides whether an image should be treated as small.
     *
     * <p>An image is small when either dimension attribute is missing or blank,
     * or when either attribute parses as an integer below
     * {@link #MIN_LARGE_DIMENSION}. The two attributes are evaluated
     * independently, so an unparseable width (for example {@code "100%"}) no
     * longer prevents the height from being checked.
     *
     * @param imgTag the {@code img} element to inspect
     * @return {@code true} if the image is considered small
     */
    private boolean isSmall(Element imgTag) {
        final String width = imgTag.attr("width");
        final String height = imgTag.attr("height");

        if (width.isBlank() || height.isBlank()) {
            return true;
        }

        return isBelowLargeThreshold(width) || isBelowLargeThreshold(height);
    }

    /** Returns true if {@code dimension} parses as an integer below {@link #MIN_LARGE_DIMENSION}. */
    private boolean isBelowLargeThreshold(String dimension) {
        try {
            return Integer.parseInt(dimension) < MIN_LARGE_DIMENSION;
        }
        catch (NumberFormatException ignored) {
            // Non-integer values (percentages, units, garbage) give no evidence
            // that the image is small, so they are deliberately not counted.
            return false;
        }
    }

    /**
     * Ratio of the element's text length to its inner-HTML length. Despite the
     * name, a higher value means less markup relative to text.
     *
     * @return the ratio, or {@code 0.0} when the element has no inner HTML
     */
    private double htmlTagDensity(Element elem) {
        int htmlLength = elem.html().length();
        if (htmlLength == 0) {
            return 0.0;
        }
        return (double) elem.text().length() / htmlLength;
    }

    /**
     * Ratio of the text found in {@code a} descendants to the element's total text.
     *
     * @return the ratio, or {@code 0.0} when the element has no text
     */
    private double aTagDensity(Element elem) {
        int textLength = elem.text().length();
        if (textLength == 0) {
            return 0.0;
        }
        return (double) elem.getElementsByTag("a").text().length() / textLength;
    }

}