Tweak the pub date heuristics so that they mostly get the 'historyofphilosophy.net' case right.

Use the HTML standard for plausibility checks in the more guesswork-like heuristics. Add more class names to inspect when looking for date strings.
This commit is contained in:
Viktor Lofgren 2023-06-20 14:15:05 +02:00
parent a9fabba407
commit 7326ba74fe
8 changed files with 70 additions and 19 deletions

View File

@ -0,0 +1,38 @@
package nu.marginalia.pubdate;
import nu.marginalia.converting.model.HtmlStandard;
public class PubDateFromHtmlStandard {

    /** Latest year still accepted as plausible for an HTML123 document. */
    private static final int HTML123_LATEST_YEAR = 2000;

    /** Earliest year accepted as plausible for an HTML4 or XHTML document. */
    private static final int HTML4_EARLIEST_YEAR = 2000;

    /** Earliest year accepted as plausible for an HTML5 document. */
    private static final int HTML5_EARLIEST_YEAR = 2014;

    /**
     * Returns a fixed default publication year for the given HTML standard.
     * The value is used to bias the publication date heuristics.
     *
     * @param standard the HTML standard detected for the document
     * @return the default year associated with {@code standard}
     */
    public static int blindGuess(HtmlStandard standard) {
        return switch (standard) {
            case UNKNOWN -> 2000;
            case PLAIN -> 1993;
            case HTML123 -> 1997;
            case HTML4 -> 2006;
            case XHTML -> 2006;
            case HTML5 -> 2018;
        };
    }

    /**
     * Sanity checks a guessed publication year against the HTML standard
     * of the document.
     * <p>
     * Finding the publication year involves a lot of guesswork, and this
     * check keeps the guesses relatively sane. For example, a year of 1998
     * is rejected for an HTML5 document, because any year before 2014 is
     * treated as implausible for HTML5.
     * <p>
     * Rules applied:
     * <ul>
     *   <li>HTML123: plausible up to and including 2000</li>
     *   <li>HTML4 and XHTML: plausible from 2000 onwards</li>
     *   <li>HTML5: plausible from 2014 onwards</li>
     *   <li>any other standard: always plausible</li>
     * </ul>
     *
     * @param standard the HTML standard detected for the document
     * @param year     the guessed publication year
     * @return {@code true} if {@code year} passes the check for {@code standard}
     */
    public static boolean isGuessPlausible(HtmlStandard standard, int year) {
        return switch (standard) {
            case HTML123 -> year <= HTML123_LATEST_YEAR;
            case HTML4, XHTML -> year >= HTML4_EARLIEST_YEAR;
            case HTML5 -> year >= HTML5_EARLIEST_YEAR;
            // Remaining standards carry no usable date information
            default -> true;
        };
    }
}

View File

@ -26,6 +26,17 @@ public class PubDateParser {
.filter(PubDateParser::validateDate);
}
/**
 * Attempts to turn a free-form date string into a {@link PubDate}.
 * <p>
 * Only strings of 4 to 31 characters are considered. The parsers are tried
 * in order: {@code parse8601}, then {@code parse1123}, and finally
 * {@code dateFromHighestYearLookingSubstringWithGuess}, which receives the
 * HTML standard. A result is returned only if it passes {@code validateDate}.
 *
 * @param date     the candidate date string, may be {@code null}
 * @param standard the HTML standard of the document, passed to the last-resort parser
 * @return the parsed and validated date, or an empty Optional
 */
public static Optional<PubDate> attemptParseDate(String date, HtmlStandard standard) {
    if (date == null) {
        return Optional.empty();
    }

    final int length = date.length();
    if (length < 4 || length >= 32) {
        return Optional.empty();
    }

    Optional<PubDate> parsed = parse8601(date)
            .or(() -> parse1123(date))
            .or(() -> dateFromHighestYearLookingSubstringWithGuess(date, standard));

    return parsed.filter(PubDateParser::validateDate);
}
public static OptionalInt parseYearString(String yearString) {
try {
return OptionalInt.of(Integer.parseInt(yearString));
@ -70,7 +81,9 @@ public class PubDateParser {
}
public static Optional<PubDate> dateFromHighestYearLookingSubstringWithGuess(String maybe, int guess) {
public static Optional<PubDate> dateFromHighestYearLookingSubstringWithGuess(String maybe, HtmlStandard standard) {
int guess = PubDateFromHtmlStandard.blindGuess(standard);
var matcher = yearPattern.matcher(maybe);
int min = PubDate.MAX_YEAR + 1;
@ -126,7 +139,7 @@ public class PubDateParser {
// Create some jitter to avoid having documents piling up in the same four years
// as this would make searching in those years disproportionately useless
double guess = standard.yearGuess + ThreadLocalRandom.current().nextGaussian();
double guess = PubDateFromHtmlStandard.blindGuess(standard) + ThreadLocalRandom.current().nextGaussian();
if (guess < PubDate.MIN_YEAR) {
return PubDate.MIN_YEAR;

View File

@ -74,6 +74,8 @@ public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {
return true;
if (text.contains("opyright"))
return true;
if (text.contains("Posted on"))
return true;
if (text.contains("&copy;"))
return true;
if (text.contains("(c)"))
@ -90,6 +92,8 @@ public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {
|| classes.contains("byline")
|| classes.contains("author")
|| classes.contains("submitted")
|| classes.contains("date")
|| classes.contains("datey")
|| el.id().contains("footer-info-lastmod"); // mediawiki
}
@ -137,7 +141,7 @@ public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {
}
else {
PubDateParser
.dateFromHighestYearLookingSubstringWithGuess(text, htmlStandard.yearGuess)
.attemptParseDate(text)
.ifPresent(this::setPubDate);
}
}

View File

@ -3,6 +3,7 @@ package nu.marginalia.pubdate.heuristic;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateEffortLevel;
import nu.marginalia.pubdate.PubDateFromHtmlStandard;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;
import nu.marginalia.model.EdgeUrl;
@ -42,7 +43,7 @@ public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {
public FilterResult head(@NotNull Node node, int depth) {
if (node instanceof TextNode tn) onTextNode(tn);
if (hasPubDate()) {
if (hasPubDate() && PubDateFromHtmlStandard.isGuessPlausible(htmlStandard, pubDate.year())) {
return FilterResult.STOP;
}
return FilterResult.CONTINUE;
@ -78,7 +79,7 @@ public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {
}
else {
PubDateParser
.dateFromHighestYearLookingSubstringWithGuess(text, htmlStandard.yearGuess)
.dateFromHighestYearLookingSubstringWithGuess(text, htmlStandard)
.ifPresent(this::setPubDate);
}
}

View File

@ -235,7 +235,7 @@ class PubDateSnifferTest {
assertFalse(ret.isEmpty());
assertNull(ret.dateIso8601());
assertEquals(2015, ret.year());
assertEquals(2012, ret.year());
}
@Test

View File

@ -2,26 +2,21 @@ package nu.marginalia.converting.model;
public enum HtmlStandard {
PLAIN(0, 1, 1993),
UNKNOWN(0, 1, 2000),
HTML123(0, 1, 1997),
HTML4(-0.1, 1.05, 2006),
XHTML(-0.1, 1.05, 2006),
HTML5(0.5, 1.1, 2018);
PLAIN(0, 1),
UNKNOWN(0, 1),
HTML123(0, 1),
HTML4(-0.1, 1.05),
XHTML(-0.1, 1.05),
HTML5(0.5, 1.1);
/** Used to tune quality score */
public final double offset;
/** Used to tune quality score */
public final double scale;
/** This parameter is used to bias publish date heuristics
* */
public final int yearGuess;
HtmlStandard(double offset, double scale, int yearGuess) {
HtmlStandard(double offset, double scale) {
this.offset = offset;
this.scale = scale;
this.yearGuess = yearGuess;
}
}

View File

@ -16,6 +16,7 @@ public class HtmlStandardExtractor {
if (null == docType) {
return HtmlStandard.UNKNOWN;
}
String publicId = docType.publicId();
if (Strings.isNullOrEmpty(publicId))
return HtmlStandard.HTML5;

View File

@ -266,7 +266,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
private HtmlStandard getHtmlStandard(Document doc) {
HtmlStandard htmlStandard = HtmlStandardExtractor.parseDocType(doc.documentType());
if (HtmlStandard.UNKNOWN.equals(htmlStandard)) {
return HtmlStandardExtractor.sniffHtmlStandard(doc);
}