From 619fb8ba80e29ba2e55b0764c7842daaac2e5d45 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 19 Apr 2023 15:28:50 +0200 Subject: [PATCH] (converter) Adjust the pub-date sniffing heuristics' order. Running the HTML5 tag heuristics too early dates some sites too early. Also expanded support for JSON-LD. --- .../nu/marginalia/pubdate/PubDateSniffer.java | 5 +- .../heuristic/PubDateHeuristicJSONLD.java | 56 ++++++++++++++++--- .../pubdate/PubDateSnifferTest.java | 14 +++++ 3 files changed, 66 insertions(+), 9 deletions(-) diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateSniffer.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateSniffer.java index 2877f53d..b8b9b704 100644 --- a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateSniffer.java +++ b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateSniffer.java @@ -14,13 +14,14 @@ public class PubDateSniffer { private final List heuristics = new ArrayList<>(); public PubDateSniffer() { - heuristics.add(new PubDateHeuristicHtml5ItempropDateTag()); - heuristics.add(new PubDateHeuristicHtml5ArticleDateTag()); heuristics.add(new PubDateHeuristicJSONLD()); heuristics.add(new PubDateHeuristicMicrodata()); heuristics.add(new PubDateHeuristicOpenGraph()); heuristics.add(new PubDateHeuristicRDFaTag()); + heuristics.add(new PubDateHeuristicHtml5ItempropDateTag()); + heuristics.add(new PubDateHeuristicHtml5ArticleDateTag()); + // The more questionable heuristics should be kept below this line heuristics.add(new PubDateHeuristicUrlPatternPass1()); diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicJSONLD.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicJSONLD.java index ea5ec17c..f41db4d8 100644 --- a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicJSONLD.java +++ 
b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicJSONLD.java @@ -3,6 +3,8 @@ package nu.marginalia.pubdate.heuristic; import com.google.gson.Gson; import com.google.gson.GsonBuilder; import com.google.gson.JsonSyntaxException; +import com.google.gson.annotations.SerializedName; +import lombok.ToString; import nu.marginalia.converting.model.HtmlStandard; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.pubdate.PubDateHeuristic; @@ -11,6 +13,9 @@ import nu.marginalia.model.EdgeUrl; import nu.marginalia.pubdate.PubDateEffortLevel; import org.jsoup.nodes.Document; +import java.util.Collections; +import java.util.List; +import java.util.Objects; import java.util.Optional; public class PubDateHeuristicJSONLD implements PubDateHeuristic { @@ -29,17 +34,16 @@ public class PubDateHeuristicJSONLD implements PubDateHeuristic { return Optional.empty(); } - - private static class JsonModel { - String datePublished; - } - private static Gson gson = new GsonBuilder().create(); + private static final Gson gson = new GsonBuilder().create(); public Optional parseLdJson(String content) { try { var model = gson.fromJson(content, JsonModel.class); - return Optional.ofNullable(model) - .map(m -> m.datePublished); + if (model == null) + return Optional.empty(); + + return Optional.ofNullable(model.getDatePublished()); + } catch (JsonSyntaxException ex) { return Optional.empty(); @@ -47,3 +51,41 @@ public class PubDateHeuristicJSONLD implements PubDateHeuristic { } } + +class JsonModel { + public String getDatePublished() { + if (datePublished != null) + return datePublished; + + for (var item : Objects.requireNonNullElse(graph, + Collections.emptyList())) + { + if (!item.isRelevant()) + continue; + + if (item.datePublished != null) + return item.datePublished; + } + + return datePublished; + } + + String datePublished; + + @SerializedName("@graph") + List graph; +} + +@ToString +class JsonModelGraphItem { + 
@SerializedName("@type") + public String type; + + public String datePublished; + + public boolean isRelevant() { + return "NewsArticle".equalsIgnoreCase(type) + || "Article".equalsIgnoreCase(type); + } +} + diff --git a/code/features-convert/pubdate/src/test/java/nu/marginalia/pubdate/PubDateSnifferTest.java b/code/features-convert/pubdate/src/test/java/nu/marginalia/pubdate/PubDateSnifferTest.java index f4a64e53..732ef923 100644 --- a/code/features-convert/pubdate/src/test/java/nu/marginalia/pubdate/PubDateSnifferTest.java +++ b/code/features-convert/pubdate/src/test/java/nu/marginalia/pubdate/PubDateSnifferTest.java @@ -179,6 +179,20 @@ class PubDateSnifferTest { assertEquals("2004-08-24", ret.dateIso8601()); } + @Test + public void testLDWithGraph() throws URISyntaxException { + var ret = dateSniffer.getPubDate("", + new EdgeUrl("https://www.example.com/"), + Jsoup.parse(""" + + + + """), HtmlStandard.UNKNOWN, true); + + assertFalse(ret.isEmpty()); + assertEquals("2016-12-27", ret.dateIso8601()); + } + @Test public void testPath() throws URISyntaxException { var ret = dateSniffer.getPubDate("",