diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateFromHtmlStandard.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateFromHtmlStandard.java
new file mode 100644
index 00000000..d7777e0e
--- /dev/null
+++ b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateFromHtmlStandard.java
@@ -0,0 +1,38 @@
+package nu.marginalia.pubdate;
+
+import nu.marginalia.converting.model.HtmlStandard;
+
+/** Publication-year heuristics keyed on a document's HTML standard; static helpers only. */
+public final class PubDateFromHtmlStandard {
+    private PubDateFromHtmlStandard() {}
+
+    /** Fixed per-standard default year, used to bias pub date heuristics. */
+    public static int blindGuess(HtmlStandard standard) {
+        return switch (standard) {
+            case PLAIN -> 1993;
+            case HTML123 -> 1997;
+            case HTML4, XHTML -> 2006;
+            case HTML5 -> 2018;
+            case UNKNOWN -> 2000;
+        };
+    }
+
+    /** Sanity check a publication year based on the HTML standard.
+     * It is for example unlikely for a HTML5 document to be published
+     * in 1998, since that is 16 years before HTML5 became a W3C Recommendation (2014).
+     * <p>
+     * Discovering publication year involves a lot of guesswork, this helps
+     * keep the guesses relatively sane.
+     * @param standard the HTML standard associated with the document
+     * @param year the candidate publication year
+     * @return false when the year falls outside the range accepted for the standard
+     */
+    public static boolean isGuessPlausible(HtmlStandard standard, int year) {
+        return switch (standard) {
+            case HTML123 -> year <= 2000;
+            case XHTML, HTML4 -> year >= 2000;
+            case HTML5 -> year >= 2014;
+            default -> true; // remaining standards are not constrained
+        };
+    }
+}
diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateParser.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateParser.java
index 0a52fb39..1abd84dd 100644
--- a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateParser.java
+++ b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/PubDateParser.java
@@ -26,6 +26,17 @@ public class PubDateParser {
.filter(PubDateParser::validateDate);
}
+ /** Tries parse8601, then parse1123, then a year guess biased by standard; null or a length outside 4..31 gives empty. */
+ public static Optional attemptParseDate(String date, HtmlStandard standard) {
+     if (date == null || date.length() < 4 || date.length() >= 32) {
+         return Optional.empty();
+     }
+     return parse8601(date)
+             .or(() -> parse1123(date))
+             .or(() -> dateFromHighestYearLookingSubstringWithGuess(date, standard))
+             .filter(PubDateParser::validateDate);
+ }
+
public static OptionalInt parseYearString(String yearString) {
try {
return OptionalInt.of(Integer.parseInt(yearString));
@@ -70,7 +81,9 @@ public class PubDateParser {
}
- public static Optional dateFromHighestYearLookingSubstringWithGuess(String maybe, int guess) {
+ public static Optional dateFromHighestYearLookingSubstringWithGuess(String maybe, HtmlStandard standard) {
+ int guess = PubDateFromHtmlStandard.blindGuess(standard);
+
var matcher = yearPattern.matcher(maybe);
int min = PubDate.MAX_YEAR + 1;
@@ -126,7 +139,7 @@ public class PubDateParser {
// Create some jitter to avoid having documents piling up in the same four years
// as this would make searching in those years disproportionately useless
- double guess = standard.yearGuess + ThreadLocalRandom.current().nextGaussian();
+ double guess = PubDateFromHtmlStandard.blindGuess(standard) + ThreadLocalRandom.current().nextGaussian();
if (guess < PubDate.MIN_YEAR) {
return PubDate.MIN_YEAR;
diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java
index 31ab4859..5f8c7ffc 100644
--- a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java
+++ b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java
@@ -74,6 +74,8 @@ public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {
return true;
if (text.contains("opyright"))
return true;
+ if (text.contains("Posted on"))
+ return true;
if (text.contains("©"))
return true;
if (text.contains("(c)"))
@@ -90,6 +92,8 @@ public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {
|| classes.contains("byline")
|| classes.contains("author")
|| classes.contains("submitted")
+ || classes.contains("date")
+ || classes.contains("datey")
|| el.id().contains("footer-info-lastmod"); // mediawiki
}
@@ -137,7 +141,7 @@ public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {
}
else {
PubDateParser
- .dateFromHighestYearLookingSubstringWithGuess(text, htmlStandard.yearGuess)
+ .attemptParseDate(text)
.ifPresent(this::setPubDate);
}
}
diff --git a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java
index f9fd6489..2bcf5dab 100644
--- a/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java
+++ b/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java
@@ -3,6 +3,7 @@ package nu.marginalia.pubdate.heuristic;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateEffortLevel;
+import nu.marginalia.pubdate.PubDateFromHtmlStandard;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;
import nu.marginalia.model.EdgeUrl;
@@ -42,7 +43,7 @@ public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {
public FilterResult head(@NotNull Node node, int depth) {
if (node instanceof TextNode tn) onTextNode(tn);
- if (hasPubDate()) {
+ if (hasPubDate() && PubDateFromHtmlStandard.isGuessPlausible(htmlStandard, pubDate.year())) {
return FilterResult.STOP;
}
return FilterResult.CONTINUE;
@@ -78,7 +79,7 @@ public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {
}
else {
PubDateParser
- .dateFromHighestYearLookingSubstringWithGuess(text, htmlStandard.yearGuess)
+ .dateFromHighestYearLookingSubstringWithGuess(text, htmlStandard)
.ifPresent(this::setPubDate);
}
}
diff --git a/code/features-convert/pubdate/src/test/java/nu/marginalia/pubdate/PubDateSnifferTest.java b/code/features-convert/pubdate/src/test/java/nu/marginalia/pubdate/PubDateSnifferTest.java
index 732ef923..1794c196 100644
--- a/code/features-convert/pubdate/src/test/java/nu/marginalia/pubdate/PubDateSnifferTest.java
+++ b/code/features-convert/pubdate/src/test/java/nu/marginalia/pubdate/PubDateSnifferTest.java
@@ -235,7 +235,7 @@ class PubDateSnifferTest {
assertFalse(ret.isEmpty());
assertNull(ret.dateIso8601());
- assertEquals(2015, ret.year());
+ assertEquals(2012, ret.year());
}
@Test
diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/model/HtmlStandard.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/model/HtmlStandard.java
index 01d2c905..ecb3d630 100644
--- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/model/HtmlStandard.java
+++ b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/model/HtmlStandard.java
@@ -2,26 +2,21 @@ package nu.marginalia.converting.model;
public enum HtmlStandard {
- PLAIN(0, 1, 1993),
- UNKNOWN(0, 1, 2000),
- HTML123(0, 1, 1997),
- HTML4(-0.1, 1.05, 2006),
- XHTML(-0.1, 1.05, 2006),
- HTML5(0.5, 1.1, 2018);
+ PLAIN(0, 1),
+ UNKNOWN(0, 1),
+ HTML123(0, 1),
+ HTML4(-0.1, 1.05),
+ XHTML(-0.1, 1.05),
+ HTML5(0.5, 1.1);
/** Used to tune quality score */
public final double offset;
/** Used to tune quality score */
public final double scale;
- /** This parameter is used to bias publish date heuristics
- * */
- public final int yearGuess;
-
- HtmlStandard(double offset, double scale, int yearGuess) {
+ HtmlStandard(double offset, double scale) {
this.offset = offset;
this.scale = scale;
- this.yearGuess = yearGuess;
}
}
diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/HtmlStandardExtractor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/HtmlStandardExtractor.java
index a7e7047f..52537f68 100644
--- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/HtmlStandardExtractor.java
+++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/HtmlStandardExtractor.java
@@ -16,6 +16,7 @@ public class HtmlStandardExtractor {
if (null == docType) {
return HtmlStandard.UNKNOWN;
}
+
String publicId = docType.publicId();
if (Strings.isNullOrEmpty(publicId))
return HtmlStandard.HTML5;
diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java
index 5c43ca66..d6aca321 100644
--- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java
+++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java
@@ -266,7 +266,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
private HtmlStandard getHtmlStandard(Document doc) {
HtmlStandard htmlStandard = HtmlStandardExtractor.parseDocType(doc.documentType());
-
if (HtmlStandard.UNKNOWN.equals(htmlStandard)) {
return HtmlStandardExtractor.sniffHtmlStandard(doc);
}