Publish-date guesser

vlofgren 2022-10-27 19:11:38 +02:00
parent 7a9490dbc4
commit 94c157c5c3
25 changed files with 1013 additions and 16 deletions

View File

@ -30,7 +30,7 @@ public class DocumentsCompiler {
var details = doc.details;
if (details != null) {
ret.add(new LoadProcessedDocument(doc.url, doc.state, details.title, details.description, HtmlFeature.encode(details.features), details.standard, details.length, details.hashCode, details.quality));
ret.add(new LoadProcessedDocument(doc.url, doc.state, details.title, details.description, HtmlFeature.encode(details.features), details.standard, details.length, details.hashCode, details.quality, details.pubYear));
}
else {
ret.add(new LoadProcessedDocumentWithError(doc.url, doc.state, doc.stateReason));

View File

@ -7,6 +7,8 @@ import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState;
import javax.annotation.Nullable;
public record LoadProcessedDocument(EdgeUrl url,
EdgeUrlState state,
@ -16,7 +18,8 @@ public record LoadProcessedDocument(EdgeUrl url,
EdgeHtmlStandard standard,
int length,
long hash,
double quality) implements Instruction
double quality,
@Nullable Integer pubYear) implements Instruction
{
@Override
public void apply(Interpreter interpreter) {

View File

@ -8,6 +8,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.sql.SQLException;
import java.sql.Types;
import java.util.List;
import static java.sql.Statement.SUCCESS_NO_INFO;
@ -34,10 +35,11 @@ public class SqlLoadProcessedDocument {
IN FEATURES INT,
IN STANDARD VARCHAR(32),
IN QUALITY DOUBLE,
IN HASH INT)
IN HASH INT,
IN PUB_YEAR SMALLINT)
BEGIN
SET FOREIGN_KEY_CHECKS=0;
REPLACE INTO EC_PAGE_DATA(ID, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY) VALUES (URL_ID, TITLE, DESCRIPTION, LENGTH, STANDARD, FEATURES, HASH, QUALITY);
REPLACE INTO EC_PAGE_DATA(ID, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR) VALUES (URL_ID, TITLE, DESCRIPTION, LENGTH, STANDARD, FEATURES, HASH, QUALITY, PUB_YEAR);
UPDATE EC_URL SET VISITED=1, STATE=STATE WHERE ID=URL_ID;
SET FOREIGN_KEY_CHECKS=1;
END
@ -62,7 +64,7 @@ public class SqlLoadProcessedDocument {
public void load(LoaderData data, List<LoadProcessedDocument> documents) {
try (var conn = dataSource.getConnection();
var stmt = conn.prepareCall("CALL INSERT_PAGE_VISIT(?, ?, ?, ?, ?, ?, ?, ?, ?)")) {
var stmt = conn.prepareCall("CALL INSERT_PAGE_VISIT(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")) {
conn.setAutoCommit(false);
int cnt = 0; int batchOffset = 0;
@ -82,6 +84,12 @@ public class SqlLoadProcessedDocument {
stmt.setString(7, doc.standard().name());
stmt.setDouble(8, doc.quality());
stmt.setInt(9, (int) doc.hash());
if (doc.pubYear() != null) {
stmt.setShort(10, (short) doc.pubYear().intValue());
}
else {
stmt.setNull(10, Types.SMALLINT);
}
stmt.addBatch();
if (++cnt == 100) {

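A note on the binding above: when pubYear is absent, the tenth parameter is set to SQL NULL with setNull(10, Types.SMALLINT) rather than writing a value. A minimal sketch of the mapping, with a hypothetical helper name not taken from the loader:

import java.sql.CallableStatement;
import java.sql.SQLException;
import java.sql.Types;

// Illustration only: bind an optional publication year to parameter 10 of INSERT_PAGE_VISIT.
static void bindPubYear(CallableStatement stmt, Integer pubYear) throws SQLException {
    if (pubYear != null) {
        stmt.setShort(10, pubYear.shortValue());  // PUB_YEAR SMALLINT
    }
    else {
        stmt.setNull(10, Types.SMALLINT);         // no year sniffed: store NULL
    }
}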
View File

@ -5,6 +5,7 @@ import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import javax.annotation.Nullable;
import java.util.List;
import java.util.Set;
@ -13,6 +14,9 @@ public class ProcessedDocumentDetails {
public String title;
public String description;
@Nullable
public Integer pubYear;
public int length;
public double quality;
public long hashCode;

View File

@ -13,6 +13,8 @@ import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException.Disqualifi
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocumentDetails;
import nu.marginalia.wmsa.edge.converting.processor.logic.*;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateSniffer;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
@ -47,6 +49,7 @@ public class DocumentProcessor {
private final TitleExtractor titleExtractor;
private final DocumentKeywordExtractor keywordExtractor;
private final SummaryExtractor summaryExtractor;
private final PubDateSniffer pubDateSniffer;
private static final DocumentValuator documentValuator = new DocumentValuator();
private static final LanguageFilter languageFilter = new LanguageFilter();
@ -60,7 +63,8 @@ public class DocumentProcessor {
FeatureExtractor featureExtractor,
TitleExtractor titleExtractor,
DocumentKeywordExtractor keywordExtractor,
SummaryExtractor summaryExtractor)
SummaryExtractor summaryExtractor,
PubDateSniffer pubDateSniffer)
{
this.minDocumentLength = minDocumentLength;
this.minDocumentQuality = minDocumentQuality;
@ -69,6 +73,7 @@ public class DocumentProcessor {
this.titleExtractor = titleExtractor;
this.keywordExtractor = keywordExtractor;
this.summaryExtractor = summaryExtractor;
this.pubDateSniffer = pubDateSniffer;
}
public ProcessedDocument makeDisqualifiedStub(CrawledDocument crawledDocument) {
@ -177,6 +182,9 @@ public class DocumentProcessor {
Document doc = Jsoup.parse(crawledDocument.documentBody);
if (AcceptableAds.hasAcceptableAdsTag(doc)) {
// I've never encountered a website where this hasn't been a severe indicator
// of spam
throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS);
}
@ -204,8 +212,10 @@ public class DocumentProcessor {
ret.quality = documentValuator.getQuality(crawledDocument, ret.standard, doc, dld);
ret.hashCode = HashCode.fromString(crawledDocument.documentBodyHash).asLong();
KeywordMetadata keywordMetadata = new KeywordMetadata(ret.quality);
PubDate pubDate;
EdgePageWordSet words;
if (shouldDoSimpleProcessing(url, ret)) {
/* Some documents we'll index, but only superficially. This is a compromise
@ -215,17 +225,25 @@ public class DocumentProcessor {
ret.features = Set.of(HtmlFeature.UNKNOWN);
words = keywordExtractor.extractKeywordsMinimal(dld, keywordMetadata);
ret.description = "";
pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, ret.standard, false);
}
else {
ret.features = featureExtractor.getFeatures(crawledDomain, doc, dld);
words = keywordExtractor.extractKeywords(dld, keywordMetadata);
ret.description = getDescription(doc);
pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, ret.standard, true);
}
addMetaWords(ret, url, crawledDomain, words);
addMetaWords(ret, url, pubDate, crawledDomain, words);
getLinks(url, ret, doc, words);
if (pubDate.hasYear()) {
ret.pubYear = pubDate.year();
}
return new DetailsWithWords(ret, words);
}
@ -256,7 +274,7 @@ public class DocumentProcessor {
return false;
}
private void addMetaWords(ProcessedDocumentDetails ret, EdgeUrl url, CrawledDomain domain, EdgePageWordSet words) {
private void addMetaWords(ProcessedDocumentDetails ret, EdgeUrl url, PubDate pubDate, CrawledDomain domain, EdgePageWordSet words) {
List<String> tagWords = new ArrayList<>();
var edgeDomain = url.domain;
@ -276,6 +294,13 @@ public class DocumentProcessor {
ret.features.stream().map(HtmlFeature::getKeyword).forEach(tagWords::add);
if (pubDate.year() > 1900) {
tagWords.add("year:" + pubDate.year());
}
if (pubDate.dateIso8601() != null) {
tagWords.add("pub:" + pubDate.dateIso8601());
}
words.appendWithNoMeta(IndexBlock.Meta, tagWords);
}
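The practical effect of the addMetaWords change is that publication dates become searchable meta keywords: a document whose only recoverable signal is a year (say a copyright range) gets a tag word like year:2010, while a document with a full ISO-8601 date (for example from a <time> element) also gets one like pub:2022-08-24.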

View File

@ -0,0 +1,44 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
public record PubDate(String dateIso8601, int year) {
// First year we'll believe something can have been published on the web
// ... Tim Berners-Lee's recipe collection or something
public static final int MIN_YEAR = 1989;
// Last year we'll believe something can be published in
public static final int MAX_YEAR = LocalDate.now().getYear() + 1;
public PubDate() {
this(null, Integer.MIN_VALUE);
}
public PubDate(LocalDate date) {
this(date.format(DateTimeFormatter.ISO_DATE), date.getYear());
}
public boolean isEmpty() {
return year == Integer.MIN_VALUE;
}
public String describe() {
if (dateIso8601 != null)
return dateIso8601;
if (hasYear())
return Integer.toString(year);
return "";
}
public static boolean isValidYear(int year) {
return year >= MIN_YEAR && year <= MAX_YEAR;
}
public boolean hasYear() {
return isValidYear(this.year);
}
}
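For orientation, an illustrative sketch (not from the commit) of the record's three states, assuming java.time.LocalDate is imported as in the file above:

PubDate none = new PubDate();                              // no signal: isEmpty() is true, hasYear() is false, describe() is ""
PubDate yearOnly = new PubDate(null, 2010);                // year-only guess from a heuristic: describe() is "2010"
PubDate full = new PubDate(LocalDate.parse("2022-08-24")); // full date: dateIso8601() is "2022-08-24", year() is 2022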

View File

@ -0,0 +1,6 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate;
public enum PubDateEffortLevel {
LOW,
HIGH
}

View File

@ -0,0 +1,12 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import org.jsoup.nodes.Document;
import java.util.Optional;
public interface PubDateHeuristic {
Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard);
}

View File

@ -0,0 +1,182 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import java.time.DateTimeException;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.ZonedDateTime;
import java.time.format.DateTimeFormatter;
import java.util.Optional;
import java.util.OptionalInt;
import java.util.Random;
import java.util.concurrent.ThreadLocalRandom;
import java.util.regex.Pattern;
public class PubDateParser {
// ThreadLocalRandom lacks a few methods we need out of Random
private static ThreadLocal<Random> localRandom = ThreadLocal.withInitial(Random::new);
public static Optional<PubDate> attemptParseDate(String date) {
return Optional.ofNullable(date)
.filter(str -> str.length() >= 4 && str.length() < 32)
.flatMap(str ->
parse8601(str)
.or(() -> parse1123(str))
.or(() -> dateFromHighestYearLookingSubstring(str))
)
.filter(PubDateParser::validateDate);
}
public static OptionalInt parseYearString(String yearString) {
try {
return OptionalInt.of(Integer.parseInt(yearString));
}
catch (NumberFormatException ex) {
return OptionalInt.empty();
}
}
private static final Pattern yearPattern = Pattern.compile("\\d{4}");
public static Optional<PubDate> dateFromHighestYearLookingSubstring(String maybe) {
var matcher = yearPattern.matcher(maybe);
int min = PubDate.MAX_YEAR + 1;
int max = PubDate.MIN_YEAR - 1;
for (int i = 0; i < maybe.length() && matcher.find(i); i = matcher.end()) {
String segment = maybe.substring(matcher.start(), matcher.end());
OptionalInt year = parseYearString(segment);
if (year.isEmpty())
continue;
int y = year.getAsInt();
if (PubDate.isValidYear(y)) {
if (max < y) max = y;
if (min > y) min = y;
}
}
if (max != min && PubDate.isValidYear(min) && PubDate.isValidYear(max)) {
return Optional.of(new PubDate(null, guessYear(min, max)));
}
if (max > PubDate.MIN_YEAR)
return Optional.of(new PubDate(null, max));
else
return Optional.empty();
}
public static Optional<PubDate> dateFromHighestYearLookingSubstringWithGuess(String maybe, int guess) {
var matcher = yearPattern.matcher(maybe);
int min = PubDate.MAX_YEAR + 1;
int max = PubDate.MIN_YEAR - 1;
for (int i = 0; i < maybe.length() && matcher.find(i); i = matcher.end()) {
String segment = maybe.substring(matcher.start(), matcher.end());
OptionalInt year = parseYearString(segment);
if (year.isEmpty())
continue;
int y = year.getAsInt();
if (PubDate.isValidYear(y)) {
if (max < y) max = y;
if (min > y) min = y;
}
}
if (max != min && PubDate.isValidYear(min) && PubDate.isValidYear(max)) {
return Optional.of(new PubDate(null, guessYear(min, max, guess)));
}
if (max > PubDate.MIN_YEAR)
return Optional.of(new PubDate(null, max));
else
return Optional.empty();
}
public static int guessYear(int min, int max, int educatedGuess) {
int var = max - min;
if (var < 3)
return min;
int avg = (max + min) / 2;
int guess = (avg + educatedGuess) / 2;
if (guess < min)
return min;
if (guess > max)
return max;
return guess;
}
public static int guessYear(int min, int max) {
return (max + min) / 2;
}
public static int guessYear(EdgeHtmlStandard standard) {
// Create some jitter to avoid documents piling up in the same few years
// as this would make searching in those years disproportionately useless
double guess = standard.yearGuess + ThreadLocalRandom.current().nextGaussian();
if (guess < PubDate.MIN_YEAR) {
return PubDate.MIN_YEAR;
}
if (guess > PubDate.MAX_YEAR) {
return PubDate.MAX_YEAR;
}
return (int) guess;
}
public static Optional<PubDate> parse8601(String maybe) {
return parseOptionally(maybe, DateTimeFormatter.ISO_DATE)
.or(() -> parseOptionallyWithTime(maybe, DateTimeFormatter.ISO_DATE_TIME))
.or(() -> parseOptionallyWithZonedTime(maybe, DateTimeFormatter.ISO_DATE_TIME))
.map(PubDate::new);
}
public static Optional<PubDate> parse1123(String maybe) {
return parseOptionally(maybe, DateTimeFormatter.RFC_1123_DATE_TIME)
.map(PubDate::new);
}
public static Optional<LocalDate> parseOptionally(String str, DateTimeFormatter formatter) {
try {
return Optional.of(LocalDate.parse(str, formatter));
}
catch (DateTimeException ex) {
return Optional.empty();
}
}
public static Optional<LocalDate> parseOptionallyWithTime(String str, DateTimeFormatter formatter) {
try {
return Optional.of(LocalDateTime.parse(str, formatter).toLocalDate());
}
catch (DateTimeException ex) {
return Optional.empty();
}
}
public static Optional<LocalDate> parseOptionallyWithZonedTime(String str, DateTimeFormatter formatter) {
try {
return Optional.of(ZonedDateTime.parse(str, formatter).toLocalDate());
}
catch (DateTimeException ex) {
return Optional.empty();
}
}
public static boolean validateDate(PubDate date) {
return (date.year() >= PubDate.MIN_YEAR && date.year() <= PubDate.MAX_YEAR);
}
}
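A worked example of the educated-guess path, which the DOM-parsing test further down exercises: for the text "Published 2003, updated 2022" under HTML5 (yearGuess 2018), min = 2003 and max = 2022, so the span is at least three years, avg = (2003 + 2022) / 2 = 2012, and the guess is (2012 + 2018) / 2 = 2015, which already lies inside [min, max]. Ranges narrower than three years simply return min.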

View File

@ -0,0 +1,47 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic.*;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import org.jsoup.nodes.Document;
import java.util.ArrayList;
import java.util.List;
public class PubDateSniffer {
private final List<PubDateHeuristic> heuristics = new ArrayList<>();
public PubDateSniffer() {
heuristics.add(new PubDateHeuristicHtml5ItempropDateTag());
heuristics.add(new PubDateHeuristicHtml5ArticleDateTag());
heuristics.add(new PubDateHeuristicJSONLD());
heuristics.add(new PubDateHeuristicMicrodata());
heuristics.add(new PubDateHeuristicOpenGraph());
heuristics.add(new PubDateHeuristicRDFaTag());
// The more questionable heuristics should be kept below this line
heuristics.add(new PubDateHeuristicUrlPattern());
heuristics.add(new PubDateHeuristicHtml5AnyTimeTag());
heuristics.add(new PubDateHeuristicDOMParsing());
heuristics.add(new PubDateHeuristicLastModified());
// This is complete guesswork
heuristics.add(new PubDateHeuristicGuessFromHtmlStandard());
}
public PubDate getPubDate(String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard, boolean runExpensive) {
final PubDateEffortLevel effortLevel = runExpensive ? PubDateEffortLevel.HIGH : PubDateEffortLevel.LOW;
for (var heuristic : heuristics) {
var maybe = heuristic.apply(effortLevel, headers, url, document, htmlStandard);
if (maybe.isPresent())
return maybe.get();
}
return new PubDate();
}
}
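A minimal usage sketch, mirroring what the unit tests below do; the URL and markup are invented for illustration:

import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import org.jsoup.Jsoup;

// Illustration only: a hypothetical caller outside DocumentProcessor.
static void demo() throws java.net.URISyntaxException {
    var sniffer = new PubDateSniffer();
    var url = new EdgeUrl("https://www.example.com/");
    var doc = Jsoup.parse("<time itemprop=\"datePublished\" content=\"2022-08-24\">today</time>");

    // headers: the raw HTTP response headers; runExpensive = true also enables the DOM-walking heuristics
    var pubDate = sniffer.getPubDate("", url, doc, EdgeHtmlStandard.HTML5, true);

    System.out.println(pubDate.describe()); // "2022-08-24"
}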

View File

@ -0,0 +1,148 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import org.jetbrains.annotations.NotNull;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeFilter;
import java.util.Optional;
public class PubDateHeuristicDOMParsing implements PubDateHeuristic {
@Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
if (effortLevel == PubDateEffortLevel.LOW)
return Optional.empty();
DateExtractingNodeVisitor filter = new DateExtractingNodeVisitor(htmlStandard);
document.filter(filter);
return Optional.ofNullable(filter.pubDate);
}
private static class DateExtractingNodeVisitor implements NodeFilter {
public PubDate pubDate;
private final EdgeHtmlStandard htmlStandard;
private DateExtractingNodeVisitor(EdgeHtmlStandard htmlStandard) {
this.htmlStandard = htmlStandard;
}
@NotNull
@Override
public FilterResult head(@NotNull Node node, int depth) {
if (node instanceof TextNode tn) onTextNode(tn);
if (node instanceof Element el) onElementNode(el);
if (hasPubDate()) {
return FilterResult.STOP;
}
return FilterResult.CONTINUE;
}
public void onTextNode(TextNode tn) {
String text = tn.getWholeText();
if (isCandidatForCopyrightNotice(text)) {
parse(text);
}
}
public void onElementNode(Element el) {
if (hasCommonClass(el)) {
parse(el.text());
}
if (!hasPubDate())
tryParsePhpBBDate(el);
}
public boolean isCandidatForCopyrightNotice(String text) {
if (text.contains("ublished"))
return true;
if (text.contains("opyright"))
return true;
if (text.contains("&copy;"))
return true;
if (text.contains("(c)"))
return true;
return false;
}
public boolean hasCommonClass(Element el) {
var classes = el.classNames();
return classes.contains("entry-meta") // wordpress
|| classes.contains("byline")
|| classes.contains("author")
|| classes.contains("submitted")
|| classes.contains("footer-info-lastmod"); // mediawiki
}
public void tryParsePhpBBDate(Element el) {
/* Match HTML of the form <div>[...] <b>Posted:</b> Sun Oct 03, 2010 5:37 pm&nbsp;</div>
* this is used on old phpBB message boards
*
* Schematically the DOM looks like this
*
* b - TextNode[ Sun Oct 03, 2010 5:37 pm&nbsp;]
* |
* TextNode[Posted:]
*/
if ("b".equals(el.tagName())
&& el.childNodeSize() == 1
&& el.childNode(0) instanceof TextNode ctn
&& "Posted:".equals(ctn.getWholeText())
&& el.nextSibling() instanceof TextNode ntn
)
{
parse(ntn.getWholeText());
}
}
public boolean hasPubDate() {
return pubDate != null;
}
public void setPubDate(PubDate pubDate) {
this.pubDate = pubDate;
}
@NotNull
@Override
public FilterResult tail(@NotNull Node node, int depth) {
return FilterResult.CONTINUE;
}
private void parse(String text) {
if (htmlStandard == EdgeHtmlStandard.UNKNOWN) {
PubDateParser
.dateFromHighestYearLookingSubstring(text)
.ifPresent(this::setPubDate);
}
else {
PubDateParser
.dateFromHighestYearLookingSubstringWithGuess(text, htmlStandard.yearGuess)
.ifPresent(this::setPubDate);
}
}
}
}

View File

@ -0,0 +1,23 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import org.jsoup.nodes.Document;
import java.util.Optional;
public class PubDateHeuristicGuessFromHtmlStandard implements PubDateHeuristic {
@Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
if (htmlStandard == EdgeHtmlStandard.UNKNOWN)
return Optional.empty();
return Optional.of(new PubDate(null, PubDateParser.guessYear(htmlStandard)));
}
}

View File

@ -0,0 +1,33 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import org.jsoup.nodes.Document;
import java.util.Optional;
public class PubDateHeuristicHtml5AnyTimeTag implements PubDateHeuristic {
@Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
// HTML5, alternative approach
for (var tag : document.select("time")) {
var maybeDate = PubDateParser.attemptParseDate(tag.attr("datetime"));
if (maybeDate.isPresent()) {
return maybeDate;
}
maybeDate = PubDateParser.attemptParseDate(tag.text());
if (maybeDate.isPresent()) {
return maybeDate;
}
}
return Optional.empty();
}
}

View File

@ -0,0 +1,28 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import org.jsoup.nodes.Document;
import java.util.Optional;
public class PubDateHeuristicHtml5ArticleDateTag implements PubDateHeuristic {
@Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
// HTML5
for (var tag : document.select("time[pubdate=\"pubdate\"]")) {
var maybeDate = PubDateParser.attemptParseDate(tag.attr("datetime"));
if (maybeDate.isPresent()) {
return maybeDate;
}
}
return Optional.empty();
}
}

View File

@ -0,0 +1,27 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import org.jsoup.nodes.Document;
import java.util.Optional;
public class PubDateHeuristicHtml5ItempropDateTag implements PubDateHeuristic {
@Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
for (var tag : document.select("time[itemprop=\"datePublished\"]")) {
var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
if (maybeDate.isPresent()) {
return maybeDate;
}
}
return Optional.empty();
}
}

View File

@ -0,0 +1,49 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.JsonSyntaxException;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import org.jsoup.nodes.Document;
import java.util.Optional;
public class PubDateHeuristicJSONLD implements PubDateHeuristic {
@Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
for (var tag : document.select("script[type=\"application/ld+json\"]")) {
var maybeDate = parseLdJson(tag.data())
.flatMap(PubDateParser::attemptParseDate);
if (maybeDate.isPresent()) {
return maybeDate;
}
}
return Optional.empty();
}
private static class JsonModel {
String datePublished;
}
private static Gson gson = new GsonBuilder().create();
public Optional<String> parseLdJson(String content) {
try {
var model = gson.fromJson(content, JsonModel.class);
return Optional.ofNullable(model)
.map(m -> m.datePublished);
}
catch (JsonSyntaxException ex) {
return Optional.empty();
}
}
}

View File

@ -0,0 +1,29 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import org.jsoup.nodes.Document;
import java.util.Optional;
public class PubDateHeuristicLastModified implements PubDateHeuristic {
@Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
String lmString = "last-modified: ";
int offset = headers.toLowerCase().indexOf(lmString);
if (offset < 0)
return Optional.empty();
int end = headers.indexOf('\n', offset);
if (end < 0) end = headers.length();
String lmDate = headers.substring(offset + lmString.length(), end);
return PubDateParser.attemptParseDate(lmDate);
}
}

View File

@ -0,0 +1,28 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import org.jsoup.nodes.Document;
import java.util.Optional;
public class PubDateHeuristicMicrodata implements PubDateHeuristic {
@Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
for (var tag : document.select("meta[itemprop=\"datePublished\"]")) {
var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
if (maybeDate.isPresent()) {
return maybeDate;
}
}
return Optional.empty();
}
}

View File

@ -0,0 +1,27 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import org.jsoup.nodes.Document;
import java.util.Optional;
public class PubDateHeuristicOpenGraph implements PubDateHeuristic {
@Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
// OG
for (var tag : document.select("meta[property=\"article:published_time\"]")) {
var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
if (maybeDate.isPresent()) {
return maybeDate;
}
}
return Optional.empty();
}
}

View File

@ -0,0 +1,27 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import org.jsoup.nodes.Document;
import java.util.Optional;
public class PubDateHeuristicRDFaTag implements PubDateHeuristic {
@Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
for (var tag : document.select("meta[property=\"datePublished\"]")) {
var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
if (maybeDate.isPresent()) {
return maybeDate;
}
}
return Optional.empty();
}
}

View File

@ -0,0 +1,42 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import org.jsoup.nodes.Document;
import java.util.Optional;
import java.util.OptionalInt;
import java.util.regex.Pattern;
public class PubDateHeuristicUrlPattern implements PubDateHeuristic {
private static final Pattern yearUrlPattern = Pattern.compile("/\\d{4}/");
@Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
final String urlString = url.path;
var matcher = yearUrlPattern.matcher(urlString);
for (int i = 0; i < urlString.length() && matcher.find(i); i = matcher.end()) {
String segment = urlString.substring(matcher.start() + 1, matcher.end() - 1);
OptionalInt year = PubDateParser.parseYearString(segment);
if (year.isEmpty())
continue;
int y = year.getAsInt();
if (y >= PubDate.MIN_YEAR && y <= PubDate.MAX_YEAR) {
return Optional.of(new PubDate(null, y));
}
}
return Optional.empty();
}
}

View File

@ -1,19 +1,21 @@
package nu.marginalia.wmsa.edge.model.crawl;
public enum EdgeHtmlStandard {
PLAIN(0, 1),
UNKNOWN(0, 1),
HTML123(0, 1),
HTML4(-0.1, 1.05),
XHTML(-0.1, 1.05),
HTML5(0.5, 1.1);
PLAIN(0, 1, 1993),
UNKNOWN(0, 1, 2000),
HTML123(0, 1, 1997),
HTML4(-0.1, 1.05, 2008),
XHTML(-0.1, 1.05, 2005),
HTML5(0.5, 1.1, 2018);
public final double offset;
public final double scale;
public int yearGuess;
EdgeHtmlStandard(double offset, double scale) {
EdgeHtmlStandard(double offset, double scale, int yearGuess) {
this.offset = offset;
this.scale = scale;
this.yearGuess = yearGuess;
}
}
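The yearGuess field functions as a prior rather than a hard value: the DOM-parsing heuristic uses it as the educated guess when a page contains a range of plausible years, and the final fallback runs it through PubDateParser.guessYear(standard), which adds Gaussian jitter and clamps to [1989, current year + 1], so an HTML4 page with no other date signal ends up near 2008 rather than exactly on it.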

View File

@ -77,6 +77,8 @@ CREATE TABLE IF NOT EXISTS EC_PAGE_DATA (
DATA_HASH INTEGER NOT NULL,
QUALITY DOUBLE NOT NULL,
PUB_YEAR SMALLINT,
FOREIGN KEY (ID) REFERENCES EC_URL(ID) ON DELETE CASCADE
)
CHARACTER SET utf8mb4

View File

@ -75,7 +75,8 @@ class SqlLoadProcessedDocumentTest {
EdgeHtmlStandard.HTML5,
100,
12345,
-3.14
-3.14,
null
)));
var details = dataStoreDao.getUrlDetailsMulti(new EdgeIdArray<>(loaderData.getUrlId(new EdgeUrl("https://www.marginalia.nu/"))));

View File

@ -0,0 +1,200 @@
package nu.marginalia.wmsa.edge.converting.processor.logic;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateSniffer;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Test;
import java.net.URISyntaxException;
import static org.junit.jupiter.api.Assertions.*;
class PubDateSnifferTest {
PubDateSniffer dateSniffer = new PubDateSniffer();
@Test
public void testGetYearFromText() {
var ret = PubDateParser.dateFromHighestYearLookingSubstring("&copy; 2005-2010 Bob Dobbs");
assertTrue(ret.isPresent());
assertEquals(2010, ret.get().year());
ret = PubDateParser.dateFromHighestYearLookingSubstring("&copy; 99 Bob Dobbs");
assertFalse(ret.isPresent());
ret = PubDateParser.dateFromHighestYearLookingSubstring("&copy; 1939 Bob Dobbs");
assertFalse(ret.isPresent());
ret = PubDateParser.dateFromHighestYearLookingSubstring("In the year 2525, if man is still alive");
assertFalse(ret.isPresent());
}
@Test
public void testParse() {
var ret = PubDateParser.attemptParseDate("2022-01-01");
assertTrue(ret.isPresent());
assertEquals("2022-01-01", ret.get().dateIso8601());
assertEquals(2022, ret.get().year());
ret = PubDateParser.attemptParseDate("2022-08-24T14:39:14Z");
assertTrue(ret.isPresent());
assertEquals("2022-08-24", ret.get().dateIso8601());
assertEquals(2022, ret.get().year());
ret = PubDateParser.attemptParseDate("2022-08-24T14:39:14");
assertTrue(ret.isPresent());
assertEquals("2022-08-24", ret.get().dateIso8601());
assertEquals(2022, ret.get().year());
ret = PubDateParser.attemptParseDate("Sun, 21 Oct 2018 12:16:24 GMT");
assertTrue(ret.isPresent());
assertEquals("2018-10-21", ret.get().dateIso8601());
assertEquals(2018, ret.get().year());
}
@Test
public void testHtml5A() throws URISyntaxException {
var ret = dateSniffer.getPubDate("",
new EdgeUrl("https://www.example.com/"),
Jsoup.parse("""
<!doctype html>
<html>
<article>
<time pubdate="pubdate" datetime="2022-08-24">time</time>
Wow, sure lor 'em boss
</article>
"""), EdgeHtmlStandard.UNKNOWN, true);
assertFalse(ret.isEmpty());
assertEquals("2022-08-24", ret.dateIso8601());
}
@Test
public void testHtml5B() throws URISyntaxException {
var ret = dateSniffer.getPubDate("",
new EdgeUrl("https://www.example.com/"),
Jsoup.parse("""
<!doctype html>
<html>
<time>2022-08-24</time>
Wow, sure lor 'em boss
</article>
"""), EdgeHtmlStandard.UNKNOWN, true);
assertFalse(ret.isEmpty());
assertEquals("2022-08-24", ret.dateIso8601());
}
@Test
public void testGuessYear() {
System.out.println(PubDateParser.guessYear(2010, 2020));
System.out.println(PubDateParser.guessYear(2010, 2020));
System.out.println(PubDateParser.guessYear(2010, 2020));
System.out.println(PubDateParser.guessYear(2010, 2020));
System.out.println(PubDateParser.guessYear(2010, 2020));
}
@Test
public void testMicrodata() throws URISyntaxException {
var ret = dateSniffer.getPubDate("",
new EdgeUrl("https://www.example.com/"),
Jsoup.parse("""
<!doctype html>
<html>
<meta itemprop="datePublished" content="2022-08-24" />
"""), EdgeHtmlStandard.UNKNOWN, true);
assertFalse(ret.isEmpty());
assertEquals("2022-08-24", ret.dateIso8601());
}
@Test
public void testRDFa() throws URISyntaxException {
var ret = dateSniffer.getPubDate("",
new EdgeUrl("https://www.example.com/"),
Jsoup.parse("""
<!doctype html>
<html>
<meta property="datePublished" content="2022-08-24" />
"""),EdgeHtmlStandard.UNKNOWN, true);
assertFalse(ret.isEmpty());
assertEquals("2022-08-24", ret.dateIso8601());
}
@Test
public void testLD() throws URISyntaxException {
var ret = dateSniffer.getPubDate("",
new EdgeUrl("https://www.example.com/"),
Jsoup.parse("""
<!doctype html>
<html>
<script type="application/ld+json">{"@context":"https:\\/\\/schema.org","@type":"Article","name":"In the Year 2525","url":"https:\\/\\/en.wikipedia.org\\/wiki\\/In_the_Year_2525","sameAs":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","mainEntity":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","author":{"@type":"Organization","name":"Contributors to Wikimedia projects"},"publisher":{"@type":"Organization","name":"Wikimedia Foundation, Inc.","logo":{"@type":"ImageObject","url":"https:\\/\\/www.wikimedia.org\\/static\\/images\\/wmf-hor-googpub.png"}},"datePublished":"2004-08-24T14:39:14Z","dateModified":"2022-10-20T11:54:37Z","image":"https:\\/\\/upload.wikimedia.org\\/wikipedia\\/commons\\/4\\/4a\\/In_the_Year_2525_by_Zager_and_Evans_US_vinyl_Side-A_RCA_release.png","headline":"song written and compsoed by Rick Evans, originally recorded by Zager and Evans and released in 1969"}</script><script type="application/ld+json">{"@context":"https:\\/\\/schema.org","@type":"Article","name":"In the Year 2525","url":"https:\\/\\/en.wikipedia.org\\/wiki\\/In_the_Year_2525","sameAs":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","mainEntity":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","author":{"@type":"Organization","name":"Contributors to Wikimedia projects"},"publisher":{"@type":"Organization","name":"Wikimedia Foundation, Inc.","logo":{"@type":"ImageObject","url":"https:\\/\\/www.wikimedia.org\\/static\\/images\\/wmf-hor-googpub.png"}},"datePublished":"2004-08-24T14:39:14Z","dateModified":"2022-10-20T11:54:37Z","image":"https:\\/\\/upload.wikimedia.org\\/wikipedia\\/commons\\/4\\/4a\\/In_the_Year_2525_by_Zager_and_Evans_US_vinyl_Side-A_RCA_release.png","headline":"song written and compsoed by Rick Evans, originally recorded by Zager and Evans and released in 1969"}</script>
"""), EdgeHtmlStandard.UNKNOWN, true);
assertFalse(ret.isEmpty());
assertEquals("2004-08-24", ret.dateIso8601());
}
@Test
public void testPath() throws URISyntaxException {
var ret = dateSniffer.getPubDate("",
new EdgeUrl("https://www.example.com/articles/2022/04/how-to-detect-dates"),
Jsoup.parse("""
<!doctype html>
<html>
<title>No date in the HTML</title>
"""), EdgeHtmlStandard.UNKNOWN, true);
assertFalse(ret.isEmpty());
assertNull(ret.dateIso8601());
assertEquals(2022, ret.year());
}
@Test
public void testHeader() throws URISyntaxException {
var ret = dateSniffer.getPubDate("content-type: application/pdf\netag: \"4fc0ba8a7f5090b6fa6be385dca206ec\"\nlast-modified: Thu, 03 Feb 2022 19:22:58 GMT\ncontent-length: 298819\ndate: Wed, 24 Aug 2022 19:48:52 GMT\ncache-control: public, no-transform, immutable, max-age\u003d31536000\naccess-control-expose-headers: Content-Length,Content-Disposition,Content-Range,Etag,Server-Timing,Vary,X-Cld-Error,X-Content-Type-Options\naccess-control-allow-origin: *\naccept-ranges: bytes\ntiming-allow-origin: *\nserver: Cloudinary\nstrict-transport-security: max-age\u003d604800\nx-content-type-options: nosniff\nserver-timing: akam;dur\u003d25;start\u003d2022-08-24T19:48:52.519Z;desc\u003dmiss,rtt;dur\u003d19,cloudinary;dur\u003d129;start\u003d2022-08-23T06:35:17.331Z\n",
new EdgeUrl("https://www.example.com/"),
Jsoup.parse("""
<!doctype html>
<html>
<title>No date in the HTML</title>
"""), EdgeHtmlStandard.UNKNOWN, true);
assertFalse(ret.isEmpty());
assertEquals("2022-02-03", ret.dateIso8601());
}
@Test
public void testDOM() throws URISyntaxException {
var ret = dateSniffer.getPubDate("",
new EdgeUrl("https://www.example.com/"),
Jsoup.parse("""
<!doctype html>
<html>
<p>Published 2003, updated 2022</p>
"""), EdgeHtmlStandard.HTML5, true);
assertFalse(ret.isEmpty());
assertNull(ret.dateIso8601());
assertEquals(2015, ret.year());
}
@Test
public void testOldInvision() throws URISyntaxException {
var ret = dateSniffer.getPubDate("",
new EdgeUrl("https://www.example.com/"),
Jsoup.parse("""
<!doctype html>
<html>
<div style="float: left;">&nbsp;<b>Post subject:</b> Keyboards.</div><div style="float: right;"><span class="postdetails"><b><img src="./styles/subsilver2/imageset/icon_post_target.gif" width="12" height="9" alt="Post" title="Post" /> <a href="./viewtopic.php?p=34580&amp;sid=cf0c13dedebb4fea1f03fa73e510cd9f#p34580">#1</a></b></span>&nbsp;<b>Posted:</b> Sun Oct 03, 2010 5:37 pm&nbsp;</div>
"""), EdgeHtmlStandard.UNKNOWN, true);
assertFalse(ret.isEmpty());
assertNull(ret.dateIso8601());
assertEquals(2010, ret.year());
}
}