mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(converter) Adjust the order of the pub-date sniffing heuristics. Running the HTML5 tag heuristics too early causes some sites to be dated too early. Also expanded support for JSON-LD.
This commit is contained in:
parent
5a5cdaf70e
commit
619fb8ba80
@ -14,13 +14,14 @@ public class PubDateSniffer {
|
||||
private final List<PubDateHeuristic> heuristics = new ArrayList<>();
|
||||
|
||||
public PubDateSniffer() {
|
||||
heuristics.add(new PubDateHeuristicHtml5ItempropDateTag());
|
||||
heuristics.add(new PubDateHeuristicHtml5ArticleDateTag());
|
||||
heuristics.add(new PubDateHeuristicJSONLD());
|
||||
heuristics.add(new PubDateHeuristicMicrodata());
|
||||
heuristics.add(new PubDateHeuristicOpenGraph());
|
||||
heuristics.add(new PubDateHeuristicRDFaTag());
|
||||
|
||||
heuristics.add(new PubDateHeuristicHtml5ItempropDateTag());
|
||||
heuristics.add(new PubDateHeuristicHtml5ArticleDateTag());
|
||||
|
||||
// The more questionable heuristics should be kept below this line
|
||||
heuristics.add(new PubDateHeuristicUrlPatternPass1());
|
||||
|
||||
|
@ -3,6 +3,8 @@ package nu.marginalia.pubdate.heuristic;
|
||||
import com.google.gson.Gson;
|
||||
import com.google.gson.GsonBuilder;
|
||||
import com.google.gson.JsonSyntaxException;
|
||||
import com.google.gson.annotations.SerializedName;
|
||||
import lombok.ToString;
|
||||
import nu.marginalia.converting.model.HtmlStandard;
|
||||
import nu.marginalia.model.crawl.PubDate;
|
||||
import nu.marginalia.pubdate.PubDateHeuristic;
|
||||
@ -11,6 +13,9 @@ import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.pubdate.PubDateEffortLevel;
|
||||
import org.jsoup.nodes.Document;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
|
||||
public class PubDateHeuristicJSONLD implements PubDateHeuristic {
|
||||
@ -29,17 +34,16 @@ public class PubDateHeuristicJSONLD implements PubDateHeuristic {
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
|
||||
private static class JsonModel {
|
||||
String datePublished;
|
||||
}
|
||||
private static Gson gson = new GsonBuilder().create();
|
||||
private static final Gson gson = new GsonBuilder().create();
|
||||
|
||||
public Optional<String> parseLdJson(String content) {
|
||||
try {
|
||||
var model = gson.fromJson(content, JsonModel.class);
|
||||
return Optional.ofNullable(model)
|
||||
.map(m -> m.datePublished);
|
||||
if (model == null)
|
||||
return Optional.empty();
|
||||
|
||||
return Optional.ofNullable(model.getDatePublished());
|
||||
|
||||
}
|
||||
catch (JsonSyntaxException ex) {
|
||||
return Optional.empty();
|
||||
@ -47,3 +51,41 @@ public class PubDateHeuristicJSONLD implements PubDateHeuristic {
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
class JsonModel {
|
||||
public String getDatePublished() {
|
||||
if (datePublished != null)
|
||||
return datePublished;
|
||||
|
||||
for (var item : Objects.requireNonNullElse(graph,
|
||||
Collections.<JsonModelGraphItem>emptyList()))
|
||||
{
|
||||
if (!item.isRelevant())
|
||||
continue;
|
||||
|
||||
if (item.datePublished != null)
|
||||
return item.datePublished;
|
||||
}
|
||||
|
||||
return datePublished;
|
||||
}
|
||||
|
||||
String datePublished;
|
||||
|
||||
@SerializedName("@graph")
|
||||
List<JsonModelGraphItem> graph;
|
||||
}
|
||||
|
||||
@ToString
|
||||
class JsonModelGraphItem {
|
||||
@SerializedName("@type")
|
||||
public String type;
|
||||
|
||||
public String datePublished;
|
||||
|
||||
public boolean isRelevant() {
|
||||
return "NewsArticle".equalsIgnoreCase(type)
|
||||
|| "Article".equalsIgnoreCase(type);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -179,6 +179,20 @@ class PubDateSnifferTest {
|
||||
assertEquals("2004-08-24", ret.dateIso8601());
|
||||
}
|
||||
|
||||
/*
 * JSON-LD whose publication date is not on the document root but on entries of
 * a root-level "@graph" array. The embedded script lists Organization, WebSite,
 * WebPage, Article, Person and BreadcrumbList entries; the WebPage and Article
 * entries both declare datePublished 2016-12-27T21:01:36-06:00. The URL passed
 * in is https://www.example.com/.
 */
@Test
|
||||
public void testLDWithGraph() throws URISyntaxException {
|
||||
var ret = dateSniffer.getPubDate("",
|
||||
new EdgeUrl("https://www.example.com/"),
|
||||
Jsoup.parse("""
|
||||
<!doctype html>
|
||||
<html>
|
||||
<script type="application/ld+json" class="aioseop-schema">{"@context":"https://schema.org","@graph":[{"@type":"Organization","@id":"https://socialnomics.net/#organization","url":"https://socialnomics.net/","name":"Socialnomics","sameAs":[]},{"@type":"WebSite","@id":"https://socialnomics.net/#website","url":"https://socialnomics.net/","name":"Socialnomics","publisher":{"@id":"https://socialnomics.net/#organization"}},{"@type":"WebPage","@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#webpage","url":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/","inLanguage":"en-US","name":"3 Reasons Why You Should Adopt Java-based Technology For Your Business","isPartOf":{"@id":"https://socialnomics.net/#website"},"breadcrumb":{"@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#breadcrumblist"},"datePublished":"2016-12-27T21:01:36-06:00","dateModified":"2016-12-22T21:02:32-06:00"},{"@type":"Article","@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#article","isPartOf":{"@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#webpage"},"author":{"@id":"https://socialnomics.net/author/rahis-saifi/#author"},"headline":"3 Reasons Why You Should Adopt Java-based Technology For Your Business","datePublished":"2016-12-27T21:01:36-06:00","dateModified":"2016-12-22T21:02:32-06:00","commentCount":0,"mainEntityOfPage":{"@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#webpage"},"publisher":{"@id":"https://socialnomics.net/#organization"},"articleSection":"Business, business, java, Java Developers, programming languages"},{"@type":"Person","@id":"https://socialnomics.net/author/rahis-saifi/#author","name":"Rahis 
Saifi","sameAs":["https://www.facebook.com/RahisSaifiOfficial","https://www.twitter.com/57rahis"],"image":{"@type":"ImageObject","@id":"https://socialnomics.net/#personlogo","url":"https://secure.gravatar.com/avatar/e67f630f0b8bc87e59e111d5e955961d?s=96&d=mm&r=g","width":96,"height":96,"caption":"Rahis Saifi"}},{"@type":"BreadcrumbList","@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#breadcrumblist","itemListElement":[{"@type":"ListItem","position":1,"item":{"@type":"WebPage","@id":"https://socialnomics.net/","url":"https://socialnomics.net/","name":"Socialnomics Blog"}},{"@type":"ListItem","position":2,"item":{"@type":"WebPage","@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/","url":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/","name":"3 Reasons Why You Should Adopt Java-based Technology For Your Business"}}]}]}</script>
|
||||
"""), HtmlStandard.UNKNOWN, true);
|
||||
|
||||
// A date must be found, and it must be the calendar day of the datePublished value above.
assertFalse(ret.isEmpty());
|
||||
assertEquals("2016-12-27", ret.dateIso8601());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPath() throws URISyntaxException {
|
||||
var ret = dateSniffer.getPubDate("",
|
||||
|
Loading…
Reference in New Issue
Block a user