mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
Tweaks to pub date heuristics to make it mostly get the 'historyofphilosophy.net' case right.
Use HTML standard for plausibility checks in the more guesswork-like heuristics. Added more class names to look for date strings.
This commit is contained in:
parent
a9fabba407
commit
7326ba74fe
@ -0,0 +1,38 @@
|
||||
package nu.marginalia.pubdate;
|
||||
|
||||
import nu.marginalia.converting.model.HtmlStandard;
|
||||
|
||||
public class PubDateFromHtmlStandard {
|
||||
/** Used to bias pub date heuristics */
|
||||
public static int blindGuess(HtmlStandard standard) {
|
||||
return switch (standard) {
|
||||
case PLAIN -> 1993;
|
||||
case HTML123 -> 1997;
|
||||
case HTML4, XHTML -> 2006;
|
||||
case HTML5 -> 2018;
|
||||
case UNKNOWN -> 2000;
|
||||
};
|
||||
}
|
||||
|
||||
/** Sanity check a publication year based on the HTML standard.
|
||||
* It is for example unlikely for a HTML5 document to be published
|
||||
* in 1998, since that is 6 years before the HTML5 standard was published.
|
||||
* <p>
|
||||
* Discovering publication year involves a lot of guesswork, this helps
|
||||
* keep the guesses relatively sane.
|
||||
*/
|
||||
public static boolean isGuessPlausible(HtmlStandard standard, int year) {
|
||||
switch (standard) {
|
||||
case HTML123:
|
||||
return year <= 2000;
|
||||
case XHTML:
|
||||
case HTML4:
|
||||
return year >= 2000;
|
||||
case HTML5:
|
||||
return year >= 2014;
|
||||
default:
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -26,6 +26,17 @@ public class PubDateParser {
|
||||
.filter(PubDateParser::validateDate);
|
||||
}
|
||||
|
||||
public static Optional<PubDate> attemptParseDate(String date, HtmlStandard standard) {
|
||||
return Optional.ofNullable(date)
|
||||
.filter(str -> str.length() >= 4 && str.length() < 32)
|
||||
.flatMap(str ->
|
||||
parse8601(str)
|
||||
.or(() -> parse1123(str))
|
||||
.or(() -> dateFromHighestYearLookingSubstringWithGuess(str, standard))
|
||||
)
|
||||
.filter(PubDateParser::validateDate);
|
||||
}
|
||||
|
||||
public static OptionalInt parseYearString(String yearString) {
|
||||
try {
|
||||
return OptionalInt.of(Integer.parseInt(yearString));
|
||||
@ -70,7 +81,9 @@ public class PubDateParser {
|
||||
}
|
||||
|
||||
|
||||
public static Optional<PubDate> dateFromHighestYearLookingSubstringWithGuess(String maybe, int guess) {
|
||||
public static Optional<PubDate> dateFromHighestYearLookingSubstringWithGuess(String maybe, HtmlStandard standard) {
|
||||
int guess = PubDateFromHtmlStandard.blindGuess(standard);
|
||||
|
||||
var matcher = yearPattern.matcher(maybe);
|
||||
|
||||
int min = PubDate.MAX_YEAR + 1;
|
||||
@ -126,7 +139,7 @@ public class PubDateParser {
|
||||
// Create some jitter to avoid having documents piling up in the same four years
|
||||
// as this would make searching in those years disproportionately useless
|
||||
|
||||
double guess = standard.yearGuess + ThreadLocalRandom.current().nextGaussian();
|
||||
double guess = PubDateFromHtmlStandard.blindGuess(standard) + ThreadLocalRandom.current().nextGaussian();
|
||||
|
||||
if (guess < PubDate.MIN_YEAR) {
|
||||
return PubDate.MIN_YEAR;
|
||||
|
@ -74,6 +74,8 @@ public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {
|
||||
return true;
|
||||
if (text.contains("opyright"))
|
||||
return true;
|
||||
if (text.contains("Posted on"))
|
||||
return true;
|
||||
if (text.contains("©"))
|
||||
return true;
|
||||
if (text.contains("(c)"))
|
||||
@ -90,6 +92,8 @@ public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {
|
||||
|| classes.contains("byline")
|
||||
|| classes.contains("author")
|
||||
|| classes.contains("submitted")
|
||||
|| classes.contains("date")
|
||||
|| classes.contains("datey")
|
||||
|| el.id().contains("footer-info-lastmod"); // mediawiki
|
||||
}
|
||||
|
||||
@ -137,7 +141,7 @@ public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {
|
||||
}
|
||||
else {
|
||||
PubDateParser
|
||||
.dateFromHighestYearLookingSubstringWithGuess(text, htmlStandard.yearGuess)
|
||||
.attemptParseDate(text)
|
||||
.ifPresent(this::setPubDate);
|
||||
}
|
||||
}
|
||||
|
@ -3,6 +3,7 @@ package nu.marginalia.pubdate.heuristic;
|
||||
import nu.marginalia.converting.model.HtmlStandard;
|
||||
import nu.marginalia.model.crawl.PubDate;
|
||||
import nu.marginalia.pubdate.PubDateEffortLevel;
|
||||
import nu.marginalia.pubdate.PubDateFromHtmlStandard;
|
||||
import nu.marginalia.pubdate.PubDateHeuristic;
|
||||
import nu.marginalia.pubdate.PubDateParser;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
@ -42,7 +43,7 @@ public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {
|
||||
public FilterResult head(@NotNull Node node, int depth) {
|
||||
if (node instanceof TextNode tn) onTextNode(tn);
|
||||
|
||||
if (hasPubDate()) {
|
||||
if (hasPubDate() && PubDateFromHtmlStandard.isGuessPlausible(htmlStandard, pubDate.year())) {
|
||||
return FilterResult.STOP;
|
||||
}
|
||||
return FilterResult.CONTINUE;
|
||||
@ -78,7 +79,7 @@ public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {
|
||||
}
|
||||
else {
|
||||
PubDateParser
|
||||
.dateFromHighestYearLookingSubstringWithGuess(text, htmlStandard.yearGuess)
|
||||
.dateFromHighestYearLookingSubstringWithGuess(text, htmlStandard)
|
||||
.ifPresent(this::setPubDate);
|
||||
}
|
||||
}
|
||||
|
@ -235,7 +235,7 @@ class PubDateSnifferTest {
|
||||
|
||||
assertFalse(ret.isEmpty());
|
||||
assertNull(ret.dateIso8601());
|
||||
assertEquals(2015, ret.year());
|
||||
assertEquals(2012, ret.year());
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -2,26 +2,21 @@ package nu.marginalia.converting.model;
|
||||
|
||||
|
||||
public enum HtmlStandard {
|
||||
PLAIN(0, 1, 1993),
|
||||
UNKNOWN(0, 1, 2000),
|
||||
HTML123(0, 1, 1997),
|
||||
HTML4(-0.1, 1.05, 2006),
|
||||
XHTML(-0.1, 1.05, 2006),
|
||||
HTML5(0.5, 1.1, 2018);
|
||||
PLAIN(0, 1),
|
||||
UNKNOWN(0, 1),
|
||||
HTML123(0, 1),
|
||||
HTML4(-0.1, 1.05),
|
||||
XHTML(-0.1, 1.05),
|
||||
HTML5(0.5, 1.1);
|
||||
|
||||
/** Used to tune quality score */
|
||||
public final double offset;
|
||||
/** Used to tune quality score */
|
||||
public final double scale;
|
||||
|
||||
/** This parameter is used to bias publish date heuristics
|
||||
* */
|
||||
public final int yearGuess;
|
||||
|
||||
HtmlStandard(double offset, double scale, int yearGuess) {
|
||||
HtmlStandard(double offset, double scale) {
|
||||
this.offset = offset;
|
||||
this.scale = scale;
|
||||
this.yearGuess = yearGuess;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -16,6 +16,7 @@ public class HtmlStandardExtractor {
|
||||
if (null == docType) {
|
||||
return HtmlStandard.UNKNOWN;
|
||||
}
|
||||
|
||||
String publicId = docType.publicId();
|
||||
if (Strings.isNullOrEmpty(publicId))
|
||||
return HtmlStandard.HTML5;
|
||||
|
@ -266,7 +266,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
|
||||
private HtmlStandard getHtmlStandard(Document doc) {
|
||||
HtmlStandard htmlStandard = HtmlStandardExtractor.parseDocType(doc.documentType());
|
||||
|
||||
if (HtmlStandard.UNKNOWN.equals(htmlStandard)) {
|
||||
return HtmlStandardExtractor.sniffHtmlStandard(doc);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user