mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
Tweak the pub date heuristics so that they mostly get the 'historyofphilosophy.net' case right.
Use the HTML standard for plausibility checks in the more guesswork-like heuristics. Add more class names to look for date strings.
This commit is contained in:
parent
a9fabba407
commit
7326ba74fe
@ -0,0 +1,38 @@
|
|||||||
|
package nu.marginalia.pubdate;
|
||||||
|
|
||||||
|
import nu.marginalia.converting.model.HtmlStandard;
|
||||||
|
|
||||||
|
public class PubDateFromHtmlStandard {
|
||||||
|
/** Used to bias pub date heuristics */
|
||||||
|
public static int blindGuess(HtmlStandard standard) {
|
||||||
|
return switch (standard) {
|
||||||
|
case PLAIN -> 1993;
|
||||||
|
case HTML123 -> 1997;
|
||||||
|
case HTML4, XHTML -> 2006;
|
||||||
|
case HTML5 -> 2018;
|
||||||
|
case UNKNOWN -> 2000;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Sanity check a publication year based on the HTML standard.
|
||||||
|
* It is for example unlikely for a HTML5 document to be published
|
||||||
|
* in 1998, since that is 6 years before the HTML5 standard was published.
|
||||||
|
* <p>
|
||||||
|
* Discovering publication year involves a lot of guesswork, this helps
|
||||||
|
* keep the guesses relatively sane.
|
||||||
|
*/
|
||||||
|
public static boolean isGuessPlausible(HtmlStandard standard, int year) {
|
||||||
|
switch (standard) {
|
||||||
|
case HTML123:
|
||||||
|
return year <= 2000;
|
||||||
|
case XHTML:
|
||||||
|
case HTML4:
|
||||||
|
return year >= 2000;
|
||||||
|
case HTML5:
|
||||||
|
return year >= 2014;
|
||||||
|
default:
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -26,6 +26,17 @@ public class PubDateParser {
|
|||||||
.filter(PubDateParser::validateDate);
|
.filter(PubDateParser::validateDate);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static Optional<PubDate> attemptParseDate(String date, HtmlStandard standard) {
|
||||||
|
return Optional.ofNullable(date)
|
||||||
|
.filter(str -> str.length() >= 4 && str.length() < 32)
|
||||||
|
.flatMap(str ->
|
||||||
|
parse8601(str)
|
||||||
|
.or(() -> parse1123(str))
|
||||||
|
.or(() -> dateFromHighestYearLookingSubstringWithGuess(str, standard))
|
||||||
|
)
|
||||||
|
.filter(PubDateParser::validateDate);
|
||||||
|
}
|
||||||
|
|
||||||
public static OptionalInt parseYearString(String yearString) {
|
public static OptionalInt parseYearString(String yearString) {
|
||||||
try {
|
try {
|
||||||
return OptionalInt.of(Integer.parseInt(yearString));
|
return OptionalInt.of(Integer.parseInt(yearString));
|
||||||
@ -70,7 +81,9 @@ public class PubDateParser {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static Optional<PubDate> dateFromHighestYearLookingSubstringWithGuess(String maybe, int guess) {
|
public static Optional<PubDate> dateFromHighestYearLookingSubstringWithGuess(String maybe, HtmlStandard standard) {
|
||||||
|
int guess = PubDateFromHtmlStandard.blindGuess(standard);
|
||||||
|
|
||||||
var matcher = yearPattern.matcher(maybe);
|
var matcher = yearPattern.matcher(maybe);
|
||||||
|
|
||||||
int min = PubDate.MAX_YEAR + 1;
|
int min = PubDate.MAX_YEAR + 1;
|
||||||
@ -126,7 +139,7 @@ public class PubDateParser {
|
|||||||
// Create some jitter to avoid having documents piling up in the same four years
|
// Create some jitter to avoid having documents piling up in the same four years
|
||||||
// as this would make searching in those years disproportionately useless
|
// as this would make searching in those years disproportionately useless
|
||||||
|
|
||||||
double guess = standard.yearGuess + ThreadLocalRandom.current().nextGaussian();
|
double guess = PubDateFromHtmlStandard.blindGuess(standard) + ThreadLocalRandom.current().nextGaussian();
|
||||||
|
|
||||||
if (guess < PubDate.MIN_YEAR) {
|
if (guess < PubDate.MIN_YEAR) {
|
||||||
return PubDate.MIN_YEAR;
|
return PubDate.MIN_YEAR;
|
||||||
|
@ -74,6 +74,8 @@ public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {
|
|||||||
return true;
|
return true;
|
||||||
if (text.contains("opyright"))
|
if (text.contains("opyright"))
|
||||||
return true;
|
return true;
|
||||||
|
if (text.contains("Posted on"))
|
||||||
|
return true;
|
||||||
if (text.contains("©"))
|
if (text.contains("©"))
|
||||||
return true;
|
return true;
|
||||||
if (text.contains("(c)"))
|
if (text.contains("(c)"))
|
||||||
@ -90,6 +92,8 @@ public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {
|
|||||||
|| classes.contains("byline")
|
|| classes.contains("byline")
|
||||||
|| classes.contains("author")
|
|| classes.contains("author")
|
||||||
|| classes.contains("submitted")
|
|| classes.contains("submitted")
|
||||||
|
|| classes.contains("date")
|
||||||
|
|| classes.contains("datey")
|
||||||
|| el.id().contains("footer-info-lastmod"); // mediawiki
|
|| el.id().contains("footer-info-lastmod"); // mediawiki
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -137,7 +141,7 @@ public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {
|
|||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
PubDateParser
|
PubDateParser
|
||||||
.dateFromHighestYearLookingSubstringWithGuess(text, htmlStandard.yearGuess)
|
.attemptParseDate(text)
|
||||||
.ifPresent(this::setPubDate);
|
.ifPresent(this::setPubDate);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -3,6 +3,7 @@ package nu.marginalia.pubdate.heuristic;
|
|||||||
import nu.marginalia.converting.model.HtmlStandard;
|
import nu.marginalia.converting.model.HtmlStandard;
|
||||||
import nu.marginalia.model.crawl.PubDate;
|
import nu.marginalia.model.crawl.PubDate;
|
||||||
import nu.marginalia.pubdate.PubDateEffortLevel;
|
import nu.marginalia.pubdate.PubDateEffortLevel;
|
||||||
|
import nu.marginalia.pubdate.PubDateFromHtmlStandard;
|
||||||
import nu.marginalia.pubdate.PubDateHeuristic;
|
import nu.marginalia.pubdate.PubDateHeuristic;
|
||||||
import nu.marginalia.pubdate.PubDateParser;
|
import nu.marginalia.pubdate.PubDateParser;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
@ -42,7 +43,7 @@ public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {
|
|||||||
public FilterResult head(@NotNull Node node, int depth) {
|
public FilterResult head(@NotNull Node node, int depth) {
|
||||||
if (node instanceof TextNode tn) onTextNode(tn);
|
if (node instanceof TextNode tn) onTextNode(tn);
|
||||||
|
|
||||||
if (hasPubDate()) {
|
if (hasPubDate() && PubDateFromHtmlStandard.isGuessPlausible(htmlStandard, pubDate.year())) {
|
||||||
return FilterResult.STOP;
|
return FilterResult.STOP;
|
||||||
}
|
}
|
||||||
return FilterResult.CONTINUE;
|
return FilterResult.CONTINUE;
|
||||||
@ -78,7 +79,7 @@ public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {
|
|||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
PubDateParser
|
PubDateParser
|
||||||
.dateFromHighestYearLookingSubstringWithGuess(text, htmlStandard.yearGuess)
|
.dateFromHighestYearLookingSubstringWithGuess(text, htmlStandard)
|
||||||
.ifPresent(this::setPubDate);
|
.ifPresent(this::setPubDate);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -235,7 +235,7 @@ class PubDateSnifferTest {
|
|||||||
|
|
||||||
assertFalse(ret.isEmpty());
|
assertFalse(ret.isEmpty());
|
||||||
assertNull(ret.dateIso8601());
|
assertNull(ret.dateIso8601());
|
||||||
assertEquals(2015, ret.year());
|
assertEquals(2012, ret.year());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -2,26 +2,21 @@ package nu.marginalia.converting.model;
|
|||||||
|
|
||||||
|
|
||||||
public enum HtmlStandard {
|
public enum HtmlStandard {
|
||||||
PLAIN(0, 1, 1993),
|
PLAIN(0, 1),
|
||||||
UNKNOWN(0, 1, 2000),
|
UNKNOWN(0, 1),
|
||||||
HTML123(0, 1, 1997),
|
HTML123(0, 1),
|
||||||
HTML4(-0.1, 1.05, 2006),
|
HTML4(-0.1, 1.05),
|
||||||
XHTML(-0.1, 1.05, 2006),
|
XHTML(-0.1, 1.05),
|
||||||
HTML5(0.5, 1.1, 2018);
|
HTML5(0.5, 1.1);
|
||||||
|
|
||||||
/** Used to tune quality score */
|
/** Used to tune quality score */
|
||||||
public final double offset;
|
public final double offset;
|
||||||
/** Used to tune quality score */
|
/** Used to tune quality score */
|
||||||
public final double scale;
|
public final double scale;
|
||||||
|
|
||||||
/** This parameter is used to bias publish date heuristics
|
HtmlStandard(double offset, double scale) {
|
||||||
* */
|
|
||||||
public final int yearGuess;
|
|
||||||
|
|
||||||
HtmlStandard(double offset, double scale, int yearGuess) {
|
|
||||||
this.offset = offset;
|
this.offset = offset;
|
||||||
this.scale = scale;
|
this.scale = scale;
|
||||||
this.yearGuess = yearGuess;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -16,6 +16,7 @@ public class HtmlStandardExtractor {
|
|||||||
if (null == docType) {
|
if (null == docType) {
|
||||||
return HtmlStandard.UNKNOWN;
|
return HtmlStandard.UNKNOWN;
|
||||||
}
|
}
|
||||||
|
|
||||||
String publicId = docType.publicId();
|
String publicId = docType.publicId();
|
||||||
if (Strings.isNullOrEmpty(publicId))
|
if (Strings.isNullOrEmpty(publicId))
|
||||||
return HtmlStandard.HTML5;
|
return HtmlStandard.HTML5;
|
||||||
|
@ -266,7 +266,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
|
|
||||||
private HtmlStandard getHtmlStandard(Document doc) {
|
private HtmlStandard getHtmlStandard(Document doc) {
|
||||||
HtmlStandard htmlStandard = HtmlStandardExtractor.parseDocType(doc.documentType());
|
HtmlStandard htmlStandard = HtmlStandardExtractor.parseDocType(doc.documentType());
|
||||||
|
|
||||||
if (HtmlStandard.UNKNOWN.equals(htmlStandard)) {
|
if (HtmlStandard.UNKNOWN.equals(htmlStandard)) {
|
||||||
return HtmlStandardExtractor.sniffHtmlStandard(doc);
|
return HtmlStandardExtractor.sniffHtmlStandard(doc);
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user