diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateFromHtmlStandard.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateFromHtmlStandard.java new file mode 100644 index 00000000..d7777e0e --- /dev/null +++ b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateFromHtmlStandard.java @@ -0,0 +1,38 @@ +package nu.marginalia.pubdate; + +import nu.marginalia.converting.model.HtmlStandard; + +public class PubDateFromHtmlStandard { + /** Returns a fixed rough publication-year guess for the given HTML standard; used to bias pub date heuristics */ + public static int blindGuess(HtmlStandard standard) { + return switch (standard) { + case PLAIN -> 1993; + case HTML123 -> 1997; + case HTML4, XHTML -> 2006; + case HTML5 -> 2018; + case UNKNOWN -> 2000; + }; + } + + /** Sanity check a publication year based on the HTML standard. + * It is for example unlikely for a HTML5 document to be published + * in 1998, since that is 16 years before 2014, the earliest year accepted for HTML5 below. + * <p>

+ * Discovering publication year involves a lot of guesswork, this helps + * keep the guesses relatively sane. + */ + public static boolean isGuessPlausible(HtmlStandard standard, int year) { + switch (standard) { + case HTML123: + return year <= 2000; + case XHTML: + case HTML4: + return year >= 2000; + case HTML5: + return year >= 2014; + default: + return true; + } + } + +} diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateParser.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateParser.java index 0a52fb39..1abd84dd 100644 --- a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateParser.java +++ b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateParser.java @@ -26,6 +26,17 @@ public class PubDateParser { .filter(PubDateParser::validateDate); } + public static Optional attemptParseDate(String date, HtmlStandard standard) { + return Optional.ofNullable(date) + .filter(str -> str.length() >= 4 && str.length() < 32) + .flatMap(str -> + parse8601(str) + .or(() -> parse1123(str)) + .or(() -> dateFromHighestYearLookingSubstringWithGuess(str, standard)) + ) + .filter(PubDateParser::validateDate); + } + public static OptionalInt parseYearString(String yearString) { try { return OptionalInt.of(Integer.parseInt(yearString)); @@ -70,7 +81,9 @@ public class PubDateParser { } - public static Optional dateFromHighestYearLookingSubstringWithGuess(String maybe, int guess) { + public static Optional dateFromHighestYearLookingSubstringWithGuess(String maybe, HtmlStandard standard) { + int guess = PubDateFromHtmlStandard.blindGuess(standard); + var matcher = yearPattern.matcher(maybe); int min = PubDate.MAX_YEAR + 1; @@ -126,7 +139,7 @@ public class PubDateParser { // Create some jitter to avoid having documents piling up in the same four years // as this would make searching in those years disproportionately useless - double guess = standard.yearGuess + 
ThreadLocalRandom.current().nextGaussian(); + double guess = PubDateFromHtmlStandard.blindGuess(standard) + ThreadLocalRandom.current().nextGaussian(); if (guess < PubDate.MIN_YEAR) { return PubDate.MIN_YEAR; diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java index 31ab4859..5f8c7ffc 100644 --- a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java +++ b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java @@ -74,6 +74,8 @@ public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic { return true; if (text.contains("opyright")) return true; + if (text.contains("Posted on")) + return true; if (text.contains("©")) return true; if (text.contains("(c)")) @@ -90,6 +92,8 @@ public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic { || classes.contains("byline") || classes.contains("author") || classes.contains("submitted") + || classes.contains("date") + || classes.contains("datey") || el.id().contains("footer-info-lastmod"); // mediawiki } @@ -137,7 +141,7 @@ public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic { } else { PubDateParser - .dateFromHighestYearLookingSubstringWithGuess(text, htmlStandard.yearGuess) + .attemptParseDate(text) /* NOTE(review): this is the one-argument overload, so htmlStandard is not passed in this branch; confirm whether attemptParseDate(text, htmlStandard) was intended */ .ifPresent(this::setPubDate); } } diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java index f9fd6489..2bcf5dab 100644 --- a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java +++ 
b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java @@ -3,6 +3,7 @@ package nu.marginalia.pubdate.heuristic; import nu.marginalia.converting.model.HtmlStandard; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.pubdate.PubDateEffortLevel; +import nu.marginalia.pubdate.PubDateFromHtmlStandard; import nu.marginalia.pubdate.PubDateHeuristic; import nu.marginalia.pubdate.PubDateParser; import nu.marginalia.model.EdgeUrl; @@ -42,7 +43,7 @@ public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic { public FilterResult head(@NotNull Node node, int depth) { if (node instanceof TextNode tn) onTextNode(tn); - if (hasPubDate()) { + if (hasPubDate() && PubDateFromHtmlStandard.isGuessPlausible(htmlStandard, pubDate.year())) { return FilterResult.STOP; } return FilterResult.CONTINUE; @@ -78,7 +79,7 @@ public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic { } else { PubDateParser - .dateFromHighestYearLookingSubstringWithGuess(text, htmlStandard.yearGuess) + .dateFromHighestYearLookingSubstringWithGuess(text, htmlStandard) .ifPresent(this::setPubDate); } } diff --git a/code/features-convert/pubdate/src/test/java/nu/marginalia/pubdate/PubDateSnifferTest.java b/code/features-convert/pubdate/src/test/java/nu/marginalia/pubdate/PubDateSnifferTest.java index 732ef923..1794c196 100644 --- a/code/features-convert/pubdate/src/test/java/nu/marginalia/pubdate/PubDateSnifferTest.java +++ b/code/features-convert/pubdate/src/test/java/nu/marginalia/pubdate/PubDateSnifferTest.java @@ -235,7 +235,7 @@ class PubDateSnifferTest { assertFalse(ret.isEmpty()); assertNull(ret.dateIso8601()); - assertEquals(2015, ret.year()); + assertEquals(2012, ret.year()); } @Test diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/model/HtmlStandard.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/model/HtmlStandard.java index 
01d2c905..ecb3d630 100644 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/model/HtmlStandard.java +++ b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/model/HtmlStandard.java @@ -2,26 +2,21 @@ package nu.marginalia.converting.model; public enum HtmlStandard { - PLAIN(0, 1, 1993), - UNKNOWN(0, 1, 2000), - HTML123(0, 1, 1997), - HTML4(-0.1, 1.05, 2006), - XHTML(-0.1, 1.05, 2006), - HTML5(0.5, 1.1, 2018); + PLAIN(0, 1), + UNKNOWN(0, 1), + HTML123(0, 1), + HTML4(-0.1, 1.05), + XHTML(-0.1, 1.05), + HTML5(0.5, 1.1); /** Used to tune quality score */ public final double offset; /** Used to tune quality score */ public final double scale; - /** This parameter is used to bias publish date heuristics - * */ - public final int yearGuess; - - HtmlStandard(double offset, double scale, int yearGuess) { + HtmlStandard(double offset, double scale) { this.offset = offset; this.scale = scale; - this.yearGuess = yearGuess; } } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/HtmlStandardExtractor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/HtmlStandardExtractor.java index a7e7047f..52537f68 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/HtmlStandardExtractor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/HtmlStandardExtractor.java @@ -16,6 +16,7 @@ public class HtmlStandardExtractor { if (null == docType) { return HtmlStandard.UNKNOWN; } + String publicId = docType.publicId(); if (Strings.isNullOrEmpty(publicId)) return HtmlStandard.HTML5; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java index 
5c43ca66..d6aca321 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java @@ -266,7 +266,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin private HtmlStandard getHtmlStandard(Document doc) { HtmlStandard htmlStandard = HtmlStandardExtractor.parseDocType(doc.documentType()); - if (HtmlStandard.UNKNOWN.equals(htmlStandard)) { return HtmlStandardExtractor.sniffHtmlStandard(doc); }