(converter) Adjust the order of the pub-date sniffing heuristics. Running the HTML5 tag heuristics too early caused some sites to be dated too early. Also expanded support for JSON-LD.

This commit is contained in:
Viktor Lofgren 2023-04-19 15:28:50 +02:00
parent 5a5cdaf70e
commit 619fb8ba80
3 changed files with 66 additions and 9 deletions

View File

@ -14,13 +14,14 @@ public class PubDateSniffer {
private final List<PubDateHeuristic> heuristics = new ArrayList<>();
public PubDateSniffer() {
heuristics.add(new PubDateHeuristicHtml5ItempropDateTag());
heuristics.add(new PubDateHeuristicHtml5ArticleDateTag());
heuristics.add(new PubDateHeuristicJSONLD());
heuristics.add(new PubDateHeuristicMicrodata());
heuristics.add(new PubDateHeuristicOpenGraph());
heuristics.add(new PubDateHeuristicRDFaTag());
heuristics.add(new PubDateHeuristicHtml5ItempropDateTag());
heuristics.add(new PubDateHeuristicHtml5ArticleDateTag());
// The more questionable heuristics should be kept below this line
heuristics.add(new PubDateHeuristicUrlPatternPass1());

View File

@ -3,6 +3,8 @@ package nu.marginalia.pubdate.heuristic;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.JsonSyntaxException;
import com.google.gson.annotations.SerializedName;
import lombok.ToString;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic;
@ -11,6 +13,9 @@ import nu.marginalia.model.EdgeUrl;
import nu.marginalia.pubdate.PubDateEffortLevel;
import org.jsoup.nodes.Document;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
public class PubDateHeuristicJSONLD implements PubDateHeuristic {
@ -29,17 +34,16 @@ public class PubDateHeuristicJSONLD implements PubDateHeuristic {
return Optional.empty();
}
private static class JsonModel {
String datePublished;
}
private static Gson gson = new GsonBuilder().create();
private static final Gson gson = new GsonBuilder().create();
public Optional<String> parseLdJson(String content) {
try {
var model = gson.fromJson(content, JsonModel.class);
return Optional.ofNullable(model)
.map(m -> m.datePublished);
if (model == null)
return Optional.empty();
return Optional.ofNullable(model.getDatePublished());
}
catch (JsonSyntaxException ex) {
return Optional.empty();
@ -47,3 +51,41 @@ public class PubDateHeuristicJSONLD implements PubDateHeuristic {
}
}
/**
 * Gson binding for the parts of an ld+json payload that carry a publication date:
 * a top-level {@code datePublished} property and the entries of an {@code @graph} array.
 */
class JsonModel {
    /** Top-level {@code datePublished} value; null when the property is absent. */
    String datePublished;

    /** Entries of the {@code @graph} property; null when the property is absent. */
    @SerializedName("@graph")
    List<JsonModelGraphItem> graph;

    /**
     * Resolves the publication date of the document.
     *
     * <p>The top-level {@code datePublished} wins when present. Otherwise the
     * {@code @graph} entries are scanned in order, and the first relevant entry
     * (see {@link JsonModelGraphItem#isRelevant()}) that has a date supplies it.
     *
     * @return the date string exactly as it appears in the JSON, or null when none was found
     */
    public String getDatePublished() {
        if (datePublished != null) {
            return datePublished;
        }

        if (graph == null) {
            return null;
        }

        for (var item : graph) {
            // A JSON null inside the @graph array is deserialized as a null list entry;
            // skip it instead of failing with a NullPointerException on crawled input.
            if (item == null || !item.isRelevant()) {
                continue;
            }

            if (item.datePublished != null) {
                return item.datePublished;
            }
        }

        return null;
    }
}
/**
 * Gson binding for a single entry of a JSON-LD {@code @graph} array; only the
 * {@code @type} and {@code datePublished} properties are captured.
 */
@ToString
class JsonModelGraphItem {
    /** Value of the entry's {@code @type} property; null when absent. */
    @SerializedName("@type")
    public String type;

    /** Value of the entry's {@code datePublished} property; null when absent. */
    public String datePublished;

    /**
     * Tells whether this entry is of a type whose date should be used.
     *
     * @return true when {@code @type} is "Article" or "NewsArticle", compared
     *         case-insensitively; false otherwise, including when the type is missing
     */
    public boolean isRelevant() {
        if (type == null) {
            return false;
        }

        return type.equalsIgnoreCase("Article")
            || type.equalsIgnoreCase("NewsArticle");
    }
}

View File

@ -179,6 +179,20 @@ class PubDateSnifferTest {
assertEquals("2004-08-24", ret.dateIso8601());
}
// Checks that a publication date is found when the ld+json payload has no
// top-level datePublished and instead nests its nodes under "@graph".
// The fixture's "Article" node carries datePublished 2016-12-27T21:01:36-06:00,
// and the URL passed to the sniffer contains no date.
// NOTE(review): presumably the result comes from the JSON-LD heuristic; this test
// only asserts on the final date, not on which heuristic produced it.
@Test
public void testLDWithGraph() throws URISyntaxException {
var ret = dateSniffer.getPubDate("",
new EdgeUrl("https://www.example.com/"),
Jsoup.parse("""
<!doctype html>
<html>
<script type="application/ld+json" class="aioseop-schema">{"@context":"https://schema.org","@graph":[{"@type":"Organization","@id":"https://socialnomics.net/#organization","url":"https://socialnomics.net/","name":"Socialnomics","sameAs":[]},{"@type":"WebSite","@id":"https://socialnomics.net/#website","url":"https://socialnomics.net/","name":"Socialnomics","publisher":{"@id":"https://socialnomics.net/#organization"}},{"@type":"WebPage","@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#webpage","url":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/","inLanguage":"en-US","name":"3 Reasons Why You Should Adopt Java-based Technology For Your Business","isPartOf":{"@id":"https://socialnomics.net/#website"},"breadcrumb":{"@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#breadcrumblist"},"datePublished":"2016-12-27T21:01:36-06:00","dateModified":"2016-12-22T21:02:32-06:00"},{"@type":"Article","@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#article","isPartOf":{"@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#webpage"},"author":{"@id":"https://socialnomics.net/author/rahis-saifi/#author"},"headline":"3 Reasons Why You Should Adopt Java-based Technology For Your Business","datePublished":"2016-12-27T21:01:36-06:00","dateModified":"2016-12-22T21:02:32-06:00","commentCount":0,"mainEntityOfPage":{"@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#webpage"},"publisher":{"@id":"https://socialnomics.net/#organization"},"articleSection":"Business, business, java, Java Developers, programming languages"},{"@type":"Person","@id":"https://socialnomics.net/author/rahis-saifi/#author","name":"Rahis 
Saifi","sameAs":["https://www.facebook.com/RahisSaifiOfficial","https://www.twitter.com/57rahis"],"image":{"@type":"ImageObject","@id":"https://socialnomics.net/#personlogo","url":"https://secure.gravatar.com/avatar/e67f630f0b8bc87e59e111d5e955961d?s=96&d=mm&r=g","width":96,"height":96,"caption":"Rahis Saifi"}},{"@type":"BreadcrumbList","@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#breadcrumblist","itemListElement":[{"@type":"ListItem","position":1,"item":{"@type":"WebPage","@id":"https://socialnomics.net/","url":"https://socialnomics.net/","name":"Socialnomics Blog"}},{"@type":"ListItem","position":2,"item":{"@type":"WebPage","@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/","url":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/","name":"3 Reasons Why You Should Adopt Java-based Technology For Your Business"}}]}]}</script>
"""), HtmlStandard.UNKNOWN, true);
// A date must have been detected at all...
assertFalse(ret.isEmpty());
// ...and its ISO-8601 date part must match the fixture's datePublished day.
assertEquals("2016-12-27", ret.dateIso8601());
}
@Test
public void testPath() throws URISyntaxException {
var ret = dateSniffer.getPubDate("",