From 94c157c5c32b761728731634cdf4c9da857354e7 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Thu, 27 Oct 2022 19:11:38 +0200 Subject: [PATCH] Publish-date guesser --- .../compiler/DocumentsCompiler.java | 2 +- .../instruction/LoadProcessedDocument.java | 5 +- .../loader/SqlLoadProcessedDocument.java | 14 +- .../model/ProcessedDocumentDetails.java | 4 + .../processor/DocumentProcessor.java | 31 ++- .../processor/logic/pubdate/PubDate.java | 44 ++++ .../logic/pubdate/PubDateEffortLevel.java | 6 + .../logic/pubdate/PubDateHeuristic.java | 12 ++ .../logic/pubdate/PubDateParser.java | 182 ++++++++++++++++ .../logic/pubdate/PubDateSniffer.java | 47 ++++ .../heuristic/PubDateHeuristicDOMParsing.java | 148 +++++++++++++ ...PubDateHeuristicGuessFromHtmlStandard.java | 23 ++ .../PubDateHeuristicHtml5AnyTimeTag.java | 33 +++ .../PubDateHeuristicHtml5ArticleDateTag.java | 28 +++ .../PubDateHeuristicHtml5ItempropDateTag.java | 27 +++ .../heuristic/PubDateHeuristicJSONLD.java | 49 +++++ .../PubDateHeuristicLastModified.java | 29 +++ .../heuristic/PubDateHeuristicMicrodata.java | 28 +++ .../heuristic/PubDateHeuristicOpenGraph.java | 27 +++ .../heuristic/PubDateHeuristicRDFaTag.java | 27 +++ .../heuristic/PubDateHeuristicUrlPattern.java | 42 ++++ .../edge/model/crawl/EdgeHtmlStandard.java | 16 +- .../main/resources/sql/edge-crawler-cache.sql | 2 + .../loader/SqlLoadProcessedDocumentTest.java | 3 +- .../processor/logic/PubDateSnifferTest.java | 200 ++++++++++++++++++ 25 files changed, 1013 insertions(+), 16 deletions(-) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDate.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDateEffortLevel.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDateHeuristic.java create mode 100644 
marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDateParser.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDateSniffer.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicDOMParsing.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicJSONLD.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicLastModified.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicMicrodata.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicOpenGraph.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicRDFaTag.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicUrlPattern.java create mode 100644 
marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/PubDateSnifferTest.java diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/DocumentsCompiler.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/DocumentsCompiler.java index 1aebe182..e5b18c6a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/DocumentsCompiler.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/DocumentsCompiler.java @@ -30,7 +30,7 @@ public class DocumentsCompiler { var details = doc.details; if (details != null) { - ret.add(new LoadProcessedDocument(doc.url, doc.state, details.title, details.description, HtmlFeature.encode(details.features), details.standard, details.length, details.hashCode, details.quality)); + ret.add(new LoadProcessedDocument(doc.url, doc.state, details.title, details.description, HtmlFeature.encode(details.features), details.standard, details.length, details.hashCode, details.quality, details.pubYear)); } else { ret.add(new LoadProcessedDocumentWithError(doc.url, doc.state, doc.stateReason)); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadProcessedDocument.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadProcessedDocument.java index 9a35c58b..3f65f7af 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadProcessedDocument.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadProcessedDocument.java @@ -7,6 +7,8 @@ import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState; +import javax.annotation.Nullable; + public record LoadProcessedDocument(EdgeUrl url, EdgeUrlState state, @@ -16,7 +18,8 @@ public record 
LoadProcessedDocument(EdgeUrl url, EdgeHtmlStandard standard, int length, long hash, - double quality) implements Instruction + double quality, + @Nullable Integer pubYear) implements Instruction { @Override public void apply(Interpreter interpreter) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java index 6a2dd7cd..fac60a74 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java @@ -8,6 +8,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.sql.SQLException; +import java.sql.Types; import java.util.List; import static java.sql.Statement.SUCCESS_NO_INFO; @@ -34,10 +35,11 @@ public class SqlLoadProcessedDocument { IN FEATURES INT, IN STANDARD VARCHAR(32), IN QUALITY DOUBLE, - IN HASH INT) + IN HASH INT, + IN PUB_YEAR SMALLINT) BEGIN SET FOREIGN_KEY_CHECKS=0; - REPLACE INTO EC_PAGE_DATA(ID, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY) VALUES (URL_ID, TITLE, DESCRIPTION, LENGTH, STANDARD, FEATURES, HASH, QUALITY); + REPLACE INTO EC_PAGE_DATA(ID, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR) VALUES (URL_ID, TITLE, DESCRIPTION, LENGTH, STANDARD, FEATURES, HASH, QUALITY, PUB_YEAR); UPDATE EC_URL SET VISITED=1, STATE=STATE WHERE ID=URL_ID; SET FOREIGN_KEY_CHECKS=1; END @@ -62,7 +64,7 @@ public class SqlLoadProcessedDocument { public void load(LoaderData data, List documents) { try (var conn = dataSource.getConnection(); - var stmt = conn.prepareCall("CALL INSERT_PAGE_VISIT(?, ?, ?, ?, ?, ?, ?, ?, ?)")) { + var stmt = conn.prepareCall("CALL INSERT_PAGE_VISIT(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")) { conn.setAutoCommit(false); int cnt = 0; int batchOffset = 0; @@ -82,6 +84,12 
@@ public class SqlLoadProcessedDocument { stmt.setString(7, doc.standard().name()); stmt.setDouble(8, doc.quality()); stmt.setInt(9, (int) doc.hash()); + if (doc.pubYear() != null) { + stmt.setShort(10, (short) doc.pubYear().intValue()); + } + else { + stmt.setInt(10, Types.SMALLINT); + } stmt.addBatch(); if (++cnt == 100) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/ProcessedDocumentDetails.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/ProcessedDocumentDetails.java index 69f5a57a..25afe126 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/ProcessedDocumentDetails.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/ProcessedDocumentDetails.java @@ -5,6 +5,7 @@ import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature; import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import javax.annotation.Nullable; import java.util.List; import java.util.Set; @@ -13,6 +14,9 @@ public class ProcessedDocumentDetails { public String title; public String description; + @Nullable + public Integer pubYear; + public int length; public double quality; public long hashCode; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java index e7ebd4e0..7c953074 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java @@ -13,6 +13,8 @@ import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException.Disqualifi import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument; import nu.marginalia.wmsa.edge.converting.model.ProcessedDocumentDetails; import 
nu.marginalia.wmsa.edge.converting.processor.logic.*; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateSniffer; import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument; import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain; import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus; @@ -47,6 +49,7 @@ public class DocumentProcessor { private final TitleExtractor titleExtractor; private final DocumentKeywordExtractor keywordExtractor; private final SummaryExtractor summaryExtractor; + private final PubDateSniffer pubDateSniffer; private static final DocumentValuator documentValuator = new DocumentValuator(); private static final LanguageFilter languageFilter = new LanguageFilter(); @@ -60,7 +63,8 @@ public class DocumentProcessor { FeatureExtractor featureExtractor, TitleExtractor titleExtractor, DocumentKeywordExtractor keywordExtractor, - SummaryExtractor summaryExtractor) + SummaryExtractor summaryExtractor, + PubDateSniffer pubDateSniffer) { this.minDocumentLength = minDocumentLength; this.minDocumentQuality = minDocumentQuality; @@ -69,6 +73,7 @@ public class DocumentProcessor { this.titleExtractor = titleExtractor; this.keywordExtractor = keywordExtractor; this.summaryExtractor = summaryExtractor; + this.pubDateSniffer = pubDateSniffer; } public ProcessedDocument makeDisqualifiedStub(CrawledDocument crawledDocument) { @@ -177,6 +182,9 @@ public class DocumentProcessor { Document doc = Jsoup.parse(crawledDocument.documentBody); if (AcceptableAds.hasAcceptableAdsTag(doc)) { + // I've never encountered a website where this hasn't been a severe indicator + // of spam + throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS); } @@ -204,8 +212,10 @@ public class DocumentProcessor { ret.quality = documentValuator.getQuality(crawledDocument, ret.standard, doc, dld); ret.hashCode = 
HashCode.fromString(crawledDocument.documentBodyHash).asLong(); + KeywordMetadata keywordMetadata = new KeywordMetadata(ret.quality); + PubDate pubDate; EdgePageWordSet words; if (shouldDoSimpleProcessing(url, ret)) { /* Some documents we'll index, but only superficially. This is a compromise @@ -215,17 +225,25 @@ public class DocumentProcessor { ret.features = Set.of(HtmlFeature.UNKNOWN); words = keywordExtractor.extractKeywordsMinimal(dld, keywordMetadata); ret.description = ""; + + pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, ret.standard, false); } else { ret.features = featureExtractor.getFeatures(crawledDomain, doc, dld); words = keywordExtractor.extractKeywords(dld, keywordMetadata); ret.description = getDescription(doc); + + pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, ret.standard, true); } - addMetaWords(ret, url, crawledDomain, words); + addMetaWords(ret, url, pubDate, crawledDomain, words); getLinks(url, ret, doc, words); + if (pubDate.hasYear()) { + ret.pubYear = pubDate.year(); + } + return new DetailsWithWords(ret, words); } @@ -256,7 +274,7 @@ public class DocumentProcessor { return false; } - private void addMetaWords(ProcessedDocumentDetails ret, EdgeUrl url, CrawledDomain domain, EdgePageWordSet words) { + private void addMetaWords(ProcessedDocumentDetails ret, EdgeUrl url, PubDate pubDate, CrawledDomain domain, EdgePageWordSet words) { List tagWords = new ArrayList<>(); var edgeDomain = url.domain; @@ -276,6 +294,13 @@ public class DocumentProcessor { ret.features.stream().map(HtmlFeature::getKeyword).forEach(tagWords::add); + if (pubDate.year() > 1900) { + tagWords.add("year:" + pubDate.year()); + } + if (pubDate.dateIso8601() != null) { + tagWords.add("pub:" + pubDate.dateIso8601()); + } + words.appendWithNoMeta(IndexBlock.Meta, tagWords); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDate.java 
/**
 * A possibly partial publication date: an optional ISO-8601 date string plus a year.
 *
 * <p>An "empty" value carries a {@code null} date string and the sentinel year
 * {@link Integer#MIN_VALUE}. A value may also carry a year without a full date.
 *
 * @param dateIso8601 the full date formatted with {@link DateTimeFormatter#ISO_DATE}, or {@code null}
 * @param year        the publication year, or {@link Integer#MIN_VALUE} when unknown
 */
public record PubDate(String dateIso8601, int year) {

    /**
     * First year we'll believe something can have been published on the web
     * ... Tim Berners Lee's recipe collection or something.
     */
    public static final int MIN_YEAR = 1989;

    /**
     * Last year we'll believe something can be published in: next year, as seen
     * at the moment this class was initialised.
     */
    public static final int MAX_YEAR = LocalDate.now().getYear() + 1;

    /** Creates the empty value (no date string, sentinel year). */
    public PubDate() {
        this(null, Integer.MIN_VALUE);
    }

    /**
     * Creates a fully specified value from a calendar date.
     *
     * @param date the publication date; must not be {@code null}
     */
    public PubDate(LocalDate date) {
        this(DateTimeFormatter.ISO_DATE.format(date), date.getYear());
    }

    /** @return {@code true} when the year is the "unknown" sentinel */
    public boolean isEmpty() {
        return Integer.MIN_VALUE == year;
    }

    /**
     * @return the ISO date if there is one, otherwise the year if it is plausible,
     *         otherwise the empty string
     */
    public String describe() {
        if (dateIso8601 == null) {
            return hasYear() ? String.valueOf(year) : "";
        }
        return dateIso8601;
    }

    /**
     * @param year the year to check
     * @return {@code true} when {@code year} lies in {@code [MIN_YEAR, MAX_YEAR]}
     */
    public static boolean isValidYear(int year) {
        return !(year < MIN_YEAR || year > MAX_YEAR);
    }

    /** @return {@code true} when this value's year passes {@link #isValidYear(int)} */
    public boolean hasYear() {
        return isValidYear(year);
    }
}
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDateHeuristic.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDateHeuristic.java new file mode 100644 index 00000000..0bac7705 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDateHeuristic.java @@ -0,0 +1,12 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate; + +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import org.jsoup.nodes.Document; + +import java.util.Optional; + +public interface PubDateHeuristic { + + Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard); +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDateParser.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDateParser.java new file mode 100644 index 00000000..7ed8dafb --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDateParser.java @@ -0,0 +1,182 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate; + +import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; + +import java.time.DateTimeException; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.ZonedDateTime; +import java.time.format.DateTimeFormatter; +import java.util.Optional; +import java.util.OptionalInt; +import java.util.Random; +import java.util.concurrent.ThreadLocalRandom; +import java.util.regex.Pattern; + +public class PubDateParser { + + // ThreadLocalRandom lacks a few methods we need out of Random + private static ThreadLocal localRandom = ThreadLocal.withInitial(Random::new); + + public static Optional attemptParseDate(String date) { + return Optional.ofNullable(date) 
+ .filter(str -> str.length() >= 4 && str.length() < 32) + .flatMap(str -> + parse8601(str) + .or(() -> parse1123(str)) + .or(() -> dateFromHighestYearLookingSubstring(str)) + ) + .filter(PubDateParser::validateDate); + } + + public static OptionalInt parseYearString(String yearString) { + try { + return OptionalInt.of(Integer.parseInt(yearString)); + } + catch (NumberFormatException ex) { + return OptionalInt.empty(); + } + } + + + private static final Pattern yearPattern = Pattern.compile("\\d{4}"); + + public static Optional dateFromHighestYearLookingSubstring(String maybe) { + var matcher = yearPattern.matcher(maybe); + + int min = PubDate.MAX_YEAR + 1; + int max = PubDate.MIN_YEAR - 1; + + for (int i = 0; i < maybe.length() && matcher.find(i); i = matcher.end()) { + + String segment = maybe.substring(matcher.start(), matcher.end()); + OptionalInt year = parseYearString(segment); + + if (year.isEmpty()) + continue; + + int y = year.getAsInt(); + if (PubDate.isValidYear(y)) { + if (max < y) max = y; + if (min > y) min = y; + } + } + + if (max != min && PubDate.isValidYear(min) && PubDate.isValidYear(max)) { + return Optional.of(new PubDate(null, guessYear(min, max))); + } + + if (max > PubDate.MIN_YEAR) + return Optional.of(new PubDate(null, max)); + else + return Optional.empty(); + } + + + public static Optional dateFromHighestYearLookingSubstringWithGuess(String maybe, int guess) { + var matcher = yearPattern.matcher(maybe); + + int min = PubDate.MAX_YEAR + 1; + int max = PubDate.MIN_YEAR - 1; + + for (int i = 0; i < maybe.length() && matcher.find(i); i = matcher.end()) { + + String segment = maybe.substring(matcher.start(), matcher.end()); + OptionalInt year = parseYearString(segment); + + if (year.isEmpty()) + continue; + + int y = year.getAsInt(); + if (PubDate.isValidYear(y)) { + if (max < y) max = y; + if (min > y) min = y; + } + } + + if (max != min && PubDate.isValidYear(min) && PubDate.isValidYear(max)) { + return Optional.of(new PubDate(null, 
guessYear(min, max, guess))); + } + + if (max > PubDate.MIN_YEAR) + return Optional.of(new PubDate(null, max)); + else + return Optional.empty(); + } + + public static int guessYear(int min, int max, int educatedGuess) { + int var = max - min; + + if (var < 3) + return min; + + int avg = (max + min) / 2; + int guess = (avg + educatedGuess) / 2; + + if (guess < min) + return min; + if (guess > max) + return max; + + return guess; + } + + public static int guessYear(int min, int max) { + return (max + min) / 2; + } + + public static int guessYear(EdgeHtmlStandard standard) { + // Create some jitter to avoid having documents piling up in the same four years + // as this would make searching in those years disproportionately useless + + double guess = standard.yearGuess + ThreadLocalRandom.current().nextGaussian(); + + if (guess < PubDate.MIN_YEAR) { + return PubDate.MIN_YEAR; + } + if (guess > PubDate.MAX_YEAR) { + return PubDate.MAX_YEAR; + } + return (int) guess; + } + + public static Optional parse8601(String maybe) { + return parseOptionally(maybe, DateTimeFormatter.ISO_DATE) + .or(() -> parseOptionallyWithTime(maybe, DateTimeFormatter.ISO_DATE_TIME)) + .or(() -> parseOptionallyWithZonedTime(maybe, DateTimeFormatter.ISO_DATE_TIME)) + .map(PubDate::new); + } + + public static Optional parse1123(String maybe) { + return parseOptionally(maybe, DateTimeFormatter.RFC_1123_DATE_TIME) + .map(PubDate::new); + } + + public static Optional parseOptionally(String str, DateTimeFormatter formatter) { + try { + return Optional.of(LocalDate.parse(str, formatter)); + } + catch (DateTimeException ex) { + return Optional.empty(); + } + } + public static Optional parseOptionallyWithTime(String str, DateTimeFormatter formatter) { + try { + return Optional.of(LocalDateTime.parse(str, formatter).toLocalDate()); + } + catch (DateTimeException ex) { + return Optional.empty(); + } + } + public static Optional parseOptionallyWithZonedTime(String str, DateTimeFormatter formatter) { + try { 
+ return Optional.of(ZonedDateTime.parse(str, formatter).toLocalDate()); + } + catch (DateTimeException ex) { + return Optional.empty(); + } + } + public static boolean validateDate(PubDate date) { + return (date.year() >= PubDate.MIN_YEAR && date.year() <= PubDate.MAX_YEAR); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDateSniffer.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDateSniffer.java new file mode 100644 index 00000000..2b1d2158 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDateSniffer.java @@ -0,0 +1,47 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate; + +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic.*; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import org.jsoup.nodes.Document; + +import java.util.ArrayList; +import java.util.List; + +public class PubDateSniffer { + + private final List heuristics = new ArrayList<>(); + + public PubDateSniffer() { + heuristics.add(new PubDateHeuristicHtml5ItempropDateTag()); + heuristics.add(new PubDateHeuristicHtml5ArticleDateTag()); + heuristics.add(new PubDateHeuristicJSONLD()); + heuristics.add(new PubDateHeuristicMicrodata()); + heuristics.add(new PubDateHeuristicOpenGraph()); + heuristics.add(new PubDateHeuristicRDFaTag()); + + // The more questionable heuristics should be kept below this line + + heuristics.add(new PubDateHeuristicUrlPattern()); + heuristics.add(new PubDateHeuristicHtml5AnyTimeTag()); + heuristics.add(new PubDateHeuristicDOMParsing()); + heuristics.add(new PubDateHeuristicLastModified()); + + // This is complete guesswork + + heuristics.add(new PubDateHeuristicGuessFromHtmlStandard()); + } + + public PubDate getPubDate(String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard, boolean 
runExpensive) { + final PubDateEffortLevel effortLevel = runExpensive ? PubDateEffortLevel.HIGH : PubDateEffortLevel.LOW; + + for (var heuristic : heuristics) { + var maybe = heuristic.apply(effortLevel, headers, url, document, htmlStandard); + if (maybe.isPresent()) + return maybe.get(); + } + + return new PubDate(); + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicDOMParsing.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicDOMParsing.java new file mode 100644 index 00000000..7621f4cc --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicDOMParsing.java @@ -0,0 +1,148 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic; + +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import org.jetbrains.annotations.NotNull; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; +import org.jsoup.nodes.TextNode; +import org.jsoup.select.NodeFilter; + +import java.util.Optional; + +public class PubDateHeuristicDOMParsing implements PubDateHeuristic { + + @Override + public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { + if (effortLevel == PubDateEffortLevel.LOW) + return Optional.empty(); + + DateExtractingNodeVisitor filter = new DateExtractingNodeVisitor(htmlStandard); + + document.filter(filter); + + return 
Optional.ofNullable(filter.pubDate); + } + + + private static class DateExtractingNodeVisitor implements NodeFilter { + public PubDate pubDate; + private final EdgeHtmlStandard htmlStandard; + + private DateExtractingNodeVisitor(EdgeHtmlStandard htmlStandard) { + this.htmlStandard = htmlStandard; + } + + @NotNull + @Override + public FilterResult head(@NotNull Node node, int depth) { + if (node instanceof TextNode tn) onTextNode(tn); + if (node instanceof Element el) onElementNode(el); + + if (hasPubDate()) { + return FilterResult.STOP; + } + return FilterResult.CONTINUE; + } + + public void onTextNode(TextNode tn) { + String text = tn.getWholeText(); + + if (isCandidatForCopyrightNotice(text)) { + parse(text); + } + } + + public void onElementNode(Element el) { + if (hasCommonClass(el)) { + parse(el.text()); + } + + if (!hasPubDate()) + tryParsePhpBBDate(el); + } + + + public boolean isCandidatForCopyrightNotice(String text) { + if (text.contains("ublished")) + return true; + if (text.contains("opyright")) + return true; + if (text.contains("©")) + return true; + if (text.contains("(c)")) + return true; + + return false; + } + + + public boolean hasCommonClass(Element el) { + var classes = el.classNames(); + + return classes.contains("entry-meta") // wordpress + || classes.contains("byline") + || classes.contains("author") + || classes.contains("submitted") + || classes.contains("footer-info-lastmod"); // mediawiki + } + + public void tryParsePhpBBDate(Element el) { + + /* Match HTML on the form
[...] Posted: Sun Oct 03, 2010 5:37 pm 
+ * this is used on old phpBB message boards + * + * Schematically the DOM looks like this + * + * b - TextNode[ Sun Oct 03, 2010 5:37 pm ] + * | + * TextNode[Posted:] + */ + if ("b".equals(el.tagName()) + && el.childNodeSize() == 1 + && el.childNode(0) instanceof TextNode ctn + && "Posted:".equals(ctn.getWholeText()) + && el.nextSibling() instanceof TextNode ntn + ) + { + parse(ntn.getWholeText()); + } + } + + + public boolean hasPubDate() { + return pubDate != null; + } + public void setPubDate(PubDate pubDate) { + this.pubDate = pubDate; + } + + @NotNull + @Override + public FilterResult tail(@NotNull Node node, int depth) { + return FilterResult.CONTINUE; + } + + private void parse(String text) { + if (htmlStandard == EdgeHtmlStandard.UNKNOWN) { + PubDateParser + .dateFromHighestYearLookingSubstring(text) + .ifPresent(this::setPubDate); + } + else { + PubDateParser + .dateFromHighestYearLookingSubstringWithGuess(text, htmlStandard.yearGuess) + .ifPresent(this::setPubDate); + } + } + + + } + + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java new file mode 100644 index 00000000..e3d0e556 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java @@ -0,0 +1,23 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic; + +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import 
nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import org.jsoup.nodes.Document; + +import java.util.Optional; + +public class PubDateHeuristicGuessFromHtmlStandard implements PubDateHeuristic { + + @Override + public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { + if (htmlStandard == EdgeHtmlStandard.UNKNOWN) + return Optional.empty(); + + return Optional.of(new PubDate(null, PubDateParser.guessYear(htmlStandard))); + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java new file mode 100644 index 00000000..5919a608 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java @@ -0,0 +1,33 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic; + +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import org.jsoup.nodes.Document; + +import java.util.Optional; + +public class PubDateHeuristicHtml5AnyTimeTag implements PubDateHeuristic { + + @Override + public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { + // HTML5, alternative approach + for (var tag : document.select("time")) { + var maybeDate = PubDateParser.attemptParseDate(tag.attr("datetime")); + if 
(maybeDate.isPresent()) { + return maybeDate; + } + + maybeDate = PubDateParser.attemptParseDate(tag.text()); + if (maybeDate.isPresent()) { + return maybeDate; + } + } + + return Optional.empty(); + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java new file mode 100644 index 00000000..78c54b9a --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java @@ -0,0 +1,28 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic; + +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import org.jsoup.nodes.Document; + +import java.util.Optional; + +public class PubDateHeuristicHtml5ArticleDateTag implements PubDateHeuristic { + + @Override + public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { + // HTML5 + for (var tag : document.select("time[pubdate=\"pubdate\"]")) { + var maybeDate = PubDateParser.attemptParseDate(tag.attr("datetime")); + if (maybeDate.isPresent()) { + return maybeDate; + } + } + + return Optional.empty(); + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java new file mode 100644 index 00000000..8dec0f6a --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java @@ -0,0 +1,27 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic; + +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import org.jsoup.nodes.Document; + +import java.util.Optional; + +/** Heuristic for time elements marked with itemprop datePublished. */ public class PubDateHeuristicHtml5ItempropDateTag implements PubDateHeuristic { + + /** For each selected time element, tries the content attribute first (the original behaviour) and then falls back to the datetime attribute, which is where the HTML microdata rules place the value of a time element. Returns the first date PubDateParser.attemptParseDate accepts, or empty. Only the document parameter is read. */ @Override + public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { + for (var tag : document.select("time[itemprop=\"datePublished\"]")) { + var maybeDate = PubDateParser.attemptParseDate(tag.attr("content")).or(() -> PubDateParser.attemptParseDate(tag.attr("datetime"))); + if (maybeDate.isPresent()) { + return maybeDate; + } + } + + return Optional.empty(); + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicJSONLD.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicJSONLD.java new file mode 100644 index 00000000..2187a744 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicJSONLD.java @@ -0,0 +1,49 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic; + +import
com.google.gson.Gson; +import com.google.gson.GsonBuilder; +import com.google.gson.JsonSyntaxException; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import org.jsoup.nodes.Document; + +import java.util.Optional; + +/** Heuristic that reads datePublished from application/ld+json script blocks. */ public class PubDateHeuristicJSONLD implements PubDateHeuristic { + + /** Parses the data of each matching script element in selection order and returns the first datePublished value that PubDateParser.attemptParseDate accepts; empty when none qualifies. Only the document parameter is read. */ @Override + public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { + for (var tag : document.select("script[type=\"application/ld+json\"]")) { + var maybeDate = parseLdJson(tag.data()) + .flatMap(PubDateParser::attemptParseDate); + + if (maybeDate.isPresent()) { + return maybeDate; + } + } + + return Optional.empty(); + } + + + /** Gson binding target; only a top-level datePublished member is bound. NOTE(review): values nested deeper, e.g. under an @graph member, are presumably not reached - confirm. */ private static class JsonModel { + String datePublished; + } + /** Shared parser instance; declared final because it is assigned once and never replaced. */ private static final Gson gson = new GsonBuilder().create(); + + /** Returns the datePublished member of the given JSON text, or empty when the text deserializes to null, the member is absent, or Gson throws JsonSyntaxException. */ public Optional parseLdJson(String content) { + try { + var model = gson.fromJson(content, JsonModel.class); + return Optional.ofNullable(model) + .map(m -> m.datePublished); + } + catch (JsonSyntaxException ex) { + return Optional.empty(); + } + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicLastModified.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicLastModified.java new file mode 100644 index 00000000..5a47c9df --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicLastModified.java @@ -0,0 +1,29 @@ +package
nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic; + +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import org.jsoup.nodes.Document; + +import java.util.Optional; + +/** Heuristic that reads the date from a last-modified line of the raw header text. */ public class PubDateHeuristicLastModified implements PubDateHeuristic { + + /** Finds the first case-insensitive occurrence of the last-modified marker (name, colon, space) in headers and parses the text from there up to the next newline, or to the end of the string when there is none. Returns empty when the marker is absent. Only the headers parameter is read. */ @Override + public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { + String lmString = "last-modified: "; + /* Locale.ROOT keeps the lower-casing independent of the JVM default locale, so an upper-case header name still matches under e.g. a Turkish locale. */ int offset = headers.toLowerCase(java.util.Locale.ROOT).indexOf(lmString); + + if (offset < 0) + return Optional.empty(); + int end = headers.indexOf('\n', offset); + if (end < 0) end = headers.length(); + + String lmDate = headers.substring(offset + lmString.length(), end); + return PubDateParser.attemptParseDate(lmDate); + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicMicrodata.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicMicrodata.java new file mode 100644 index 00000000..a257bba2 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicMicrodata.java @@ -0,0 +1,28 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic; + +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic; +import
nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import org.jsoup.nodes.Document; + +import java.util.Optional; + +/** Heuristic for meta elements marked with itemprop datePublished. */ public class PubDateHeuristicMicrodata implements PubDateHeuristic { + + /** Returns the first content attribute among the selected meta elements that PubDateParser.attemptParseDate accepts, or empty. Only the document parameter is read. */ @Override + public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { + + for (var tag : document.select("meta[itemprop=\"datePublished\"]")) { + var maybeDate = PubDateParser.attemptParseDate(tag.attr("content")); + if (maybeDate.isPresent()) { + return maybeDate; + } + } + + return Optional.empty(); + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicOpenGraph.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicOpenGraph.java new file mode 100644 index 00000000..bd9b66a9 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicOpenGraph.java @@ -0,0 +1,27 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic; + +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import org.jsoup.nodes.Document; + +import java.util.Optional; + +/** Heuristic for meta elements whose property is article:published_time. */ public class PubDateHeuristicOpenGraph implements PubDateHeuristic { + + /** Returns the first content attribute among the selected meta elements that PubDateParser.attemptParseDate accepts, or empty. Only the document parameter is read. */ @Override + public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard)
{ + // OG + for (var tag : document.select("meta[property=\"article:published_time\"]")) { + var maybeDate = PubDateParser.attemptParseDate(tag.attr("content")); + if (maybeDate.isPresent()) { + return maybeDate; + } + } + + return Optional.empty(); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicRDFaTag.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicRDFaTag.java new file mode 100644 index 00000000..2618cdef --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicRDFaTag.java @@ -0,0 +1,27 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic; + +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import org.jsoup.nodes.Document; + +import java.util.Optional; + +/** Heuristic for meta elements whose property is datePublished. */ public class PubDateHeuristicRDFaTag implements PubDateHeuristic { + + /** Returns the first content attribute among the selected meta elements that PubDateParser.attemptParseDate accepts, or empty. Only the document parameter is read. */ @Override + public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { + for (var tag : document.select("meta[property=\"datePublished\"]")) { + var maybeDate = PubDateParser.attemptParseDate(tag.attr("content")); + if (maybeDate.isPresent()) { + return maybeDate; + } + } + + return Optional.empty(); + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicUrlPattern.java
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicUrlPattern.java new file mode 100644 index 00000000..075a5a1a --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicUrlPattern.java @@ -0,0 +1,42 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic; + +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import org.jsoup.nodes.Document; + +import java.util.Optional; +import java.util.OptionalInt; +import java.util.regex.Pattern; + +/** Heuristic that looks for a slash-delimited four-digit segment in the URL path and treats it as a year. */ public class PubDateHeuristicUrlPattern implements PubDateHeuristic { + + /** Matches a slash, exactly four digits, and a closing slash. */ private static final Pattern yearUrlPattern = Pattern.compile("/\\d{4}/"); + + /** Scans url.path for four-digit segments and returns a year-only PubDate (null first argument) for the first one that PubDateParser.parseYearString accepts and that lies within PubDate.MIN_YEAR and PubDate.MAX_YEAR inclusive; empty when no segment qualifies. Only the url parameter is read. */ @Override + public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { + final String urlString = url.path; + + var matcher = yearUrlPattern.matcher(urlString); + + /* The closing slash of one match is also the opening slash of the next candidate, so the scan resumes on it (end minus one); resuming at end would skip a segment that directly follows a rejected one. */ for (int i = 0; i < urlString.length() && matcher.find(i); i = matcher.end() - 1) { + + String segment = urlString.substring(matcher.start() + 1, matcher.end() - 1); + + OptionalInt year = PubDateParser.parseYearString(segment); + + if (year.isEmpty()) + continue; + + int y = year.getAsInt(); + if (y >= PubDate.MIN_YEAR && y <= PubDate.MAX_YEAR) { + return Optional.of(new PubDate(null, y)); + } + } + + return Optional.empty(); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeHtmlStandard.java
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeHtmlStandard.java index 18142da2..51eb7d4a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeHtmlStandard.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeHtmlStandard.java @@ -1,19 +1,21 @@ package nu.marginalia.wmsa.edge.model.crawl; public enum EdgeHtmlStandard { - PLAIN(0, 1), - UNKNOWN(0, 1), - HTML123(0, 1), - HTML4(-0.1, 1.05), - XHTML(-0.1, 1.05), - HTML5(0.5, 1.1); + PLAIN(0, 1, 1993), + UNKNOWN(0, 1, 2000), + HTML123(0, 1, 1997), + HTML4(-0.1, 1.05, 2008), + XHTML(-0.1, 1.05, 2005), + HTML5(0.5, 1.1, 2018); public final double offset; public final double scale; + /** Year supplied as the third argument of each constant; final like offset and scale so the shared enum constants cannot be mutated by callers. */ public final int yearGuess; - EdgeHtmlStandard(double offset, double scale) { + EdgeHtmlStandard(double offset, double scale, int yearGuess) { this.offset = offset; this.scale = scale; + this.yearGuess = yearGuess; } } diff --git a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql index 994c6473..f0dc851c 100644 --- a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql +++ b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql @@ -77,6 +77,8 @@ CREATE TABLE IF NOT EXISTS EC_PAGE_DATA ( DATA_HASH INTEGER NOT NULL, QUALITY DOUBLE NOT NULL, + PUB_YEAR SMALLINT, + FOREIGN KEY (ID) REFERENCES EC_URL(ID) ON DELETE CASCADE ) CHARACTER SET utf8mb4 diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocumentTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocumentTest.java index 220078f3..0dde33c9 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocumentTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocumentTest.java @@ -75,7 +75,8 @@ class SqlLoadProcessedDocumentTest { EdgeHtmlStandard.HTML5, 100,
12345, - -3.14 + -3.14, + null ))); var details = dataStoreDao.getUrlDetailsMulti(new EdgeIdArray<>(loaderData.getUrlId(new EdgeUrl("https://www.marginalia.nu/")))); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/PubDateSnifferTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/PubDateSnifferTest.java new file mode 100644 index 00000000..df0affb7 --- /dev/null +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/PubDateSnifferTest.java @@ -0,0 +1,200 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic; + +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateSniffer; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import org.jsoup.Jsoup; +import org.junit.jupiter.api.Test; + +import java.net.URISyntaxException; + +import static org.junit.jupiter.api.Assertions.*; + +/** Tests for PubDateParser helpers and for PubDateSniffer.getPubDate across the individual heuristics. NOTE(review): the markup inside the text-block fixtures appears to have been stripped in this copy of the patch - confirm against the repository before relying on it. */ class PubDateSnifferTest { + + PubDateSniffer dateSniffer = new PubDateSniffer(); + + /** Expects the highest year in the text to be picked (2010 from a 2005-2010 range) and two-digit, 1939 and 2525 candidates to be rejected. */ @Test + public void testGetYearFromText() { + var ret = PubDateParser.dateFromHighestYearLookingSubstring("© 2005-2010 Bob Dobbs"); + assertTrue(ret.isPresent()); + assertEquals(2010, ret.get().year()); + + ret = PubDateParser.dateFromHighestYearLookingSubstring("© 99 Bob Dobbs"); + assertFalse(ret.isPresent()); + + ret = PubDateParser.dateFromHighestYearLookingSubstring("© 1939 Bob Dobbs"); + assertFalse(ret.isPresent()); + + ret = PubDateParser.dateFromHighestYearLookingSubstring("In the year 2525, if man is still alive"); + assertFalse(ret.isPresent()); + } + + /** Checks attemptParseDate on a plain date, two timestamp forms and an HTTP-style date string; each must yield the date part and the year. */ @Test + public void testParse() { + var ret = PubDateParser.attemptParseDate("2022-01-01"); + assertTrue(ret.isPresent()); + assertEquals("2022-01-01", ret.get().dateIso8601()); + assertEquals(2022, ret.get().year()); + + ret =
PubDateParser.attemptParseDate("2022-08-24T14:39:14Z"); + assertTrue(ret.isPresent()); + assertEquals("2022-08-24", ret.get().dateIso8601()); + assertEquals(2022, ret.get().year()); + + ret = PubDateParser.attemptParseDate("2022-08-24T14:39:14"); + assertTrue(ret.isPresent()); + assertEquals("2022-08-24", ret.get().dateIso8601()); + assertEquals(2022, ret.get().year()); + + ret = PubDateParser.attemptParseDate("Sun, 21 Oct 2018 12:16:24 GMT"); + assertTrue(ret.isPresent()); + assertEquals("2018-10-21", ret.get().dateIso8601()); + assertEquals(2018, ret.get().year()); + + } + + + @Test + public void testHtml5A() throws URISyntaxException { + var ret = dateSniffer.getPubDate("", + new EdgeUrl("https://www.example.com/"), + Jsoup.parse(""" + + +
+ + Wow, sure lor 'em boss +
+ """), EdgeHtmlStandard.UNKNOWN, true); + + assertFalse(ret.isEmpty()); + assertEquals("2022-08-24", ret.dateIso8601()); + } + + @Test + public void testHtml5B() throws URISyntaxException { + var ret = dateSniffer.getPubDate("", + new EdgeUrl("https://www.example.com/"), + Jsoup.parse(""" + + + + Wow, sure lor 'em boss + + """), EdgeHtmlStandard.UNKNOWN, true); + + assertFalse(ret.isEmpty()); + assertEquals("2022-08-24", ret.dateIso8601()); + } + + /** NOTE(review): this test only prints five guessYear(2010, 2020) results and asserts nothing, so it can only fail by throwing. */ @Test + public void testGuessYear() { + System.out.println(PubDateParser.guessYear(2010, 2020)); + System.out.println(PubDateParser.guessYear(2010, 2020)); + System.out.println(PubDateParser.guessYear(2010, 2020)); + System.out.println(PubDateParser.guessYear(2010, 2020)); + System.out.println(PubDateParser.guessYear(2010, 2020)); + } + + @Test + public void testMicrodata() throws URISyntaxException { + var ret = dateSniffer.getPubDate("", + new EdgeUrl("https://www.example.com/"), + Jsoup.parse(""" + + + + """), EdgeHtmlStandard.UNKNOWN, true); + + assertFalse(ret.isEmpty()); + assertEquals("2022-08-24", ret.dateIso8601()); + } + + @Test + public void testRDFa() throws URISyntaxException { + var ret = dateSniffer.getPubDate("", + new EdgeUrl("https://www.example.com/"), + Jsoup.parse(""" + + + + """),EdgeHtmlStandard.UNKNOWN, true); + + assertFalse(ret.isEmpty()); + assertEquals("2022-08-24", ret.dateIso8601()); + } + + @Test + public void testLD() throws URISyntaxException { + var ret = dateSniffer.getPubDate("", + new EdgeUrl("https://www.example.com/"), + Jsoup.parse(""" + + + + """), EdgeHtmlStandard.UNKNOWN, true); + + assertFalse(ret.isEmpty()); + assertEquals("2004-08-24", ret.dateIso8601()); + } + + /** With no date in the document, the year is expected to come from the 2022 path segment, leaving the ISO date null. */ @Test + public void testPath() throws URISyntaxException { + var ret = dateSniffer.getPubDate("", + new EdgeUrl("https://www.example.com/articles/2022/04/how-to-detect-dates"), + Jsoup.parse(""" + + + No date in the HTML + """), EdgeHtmlStandard.UNKNOWN, true); + + assertFalse(ret.isEmpty()); +
assertNull(ret.dateIso8601()); + assertEquals(2022, ret.year()); + } + + /** With no date in the document, the date is expected to come from the last-modified line of the header text rather than from the date line. */ @Test + public void testHeader() throws URISyntaxException { + var ret = dateSniffer.getPubDate("content-type: application/pdf\netag: \"4fc0ba8a7f5090b6fa6be385dca206ec\"\nlast-modified: Thu, 03 Feb 2022 19:22:58 GMT\ncontent-length: 298819\ndate: Wed, 24 Aug 2022 19:48:52 GMT\ncache-control: public, no-transform, immutable, max-age\u003d31536000\naccess-control-expose-headers: Content-Length,Content-Disposition,Content-Range,Etag,Server-Timing,Vary,X-Cld-Error,X-Content-Type-Options\naccess-control-allow-origin: *\naccept-ranges: bytes\ntiming-allow-origin: *\nserver: Cloudinary\nstrict-transport-security: max-age\u003d604800\nx-content-type-options: nosniff\nserver-timing: akam;dur\u003d25;start\u003d2022-08-24T19:48:52.519Z;desc\u003dmiss,rtt;dur\u003d19,cloudinary;dur\u003d129;start\u003d2022-08-23T06:35:17.331Z\n", + new EdgeUrl("https://www.example.com/"), + Jsoup.parse(""" + + + No date in the HTML + """), EdgeHtmlStandard.UNKNOWN, true); + + assertFalse(ret.isEmpty()); + assertEquals("2022-02-03", ret.dateIso8601()); + } + @Test + public void testDOM() throws URISyntaxException { + var ret = dateSniffer.getPubDate("", + new EdgeUrl("https://www.example.com/"), + Jsoup.parse(""" + + +

Published 2003, updated 2022

+ """), EdgeHtmlStandard.HTML5, true); + + assertFalse(ret.isEmpty()); + assertNull(ret.dateIso8601()); + assertEquals(2015, ret.year()); + } + + @Test + public void testOldInvision() throws URISyntaxException { + var ret = dateSniffer.getPubDate("", + new EdgeUrl("https://www.example.com/"), + Jsoup.parse(""" + + +
 Post subject: Keyboards.
Post #1 Posted: Sun Oct 03, 2010 5:37 pm 
+ """), EdgeHtmlStandard.UNKNOWN, true); + + assertFalse(ret.isEmpty()); + assertNull(ret.dateIso8601()); + assertEquals(2010, ret.year()); + } +} \ No newline at end of file