diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..7909b9c6 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,42 @@ +# Contributing + +At present this is mostly a solo project, but +external contributions are very welcome. + +This is a bit of a special project, +in part because a search engine isn't +like a text editor that you can just +download and tinker with; and in part +because it's as much a research project +as it is a search engine. + +If you have an idea for a cool change, +send an email to and +we can discuss its feasibility. + +Search is essentially a fractal of interesting +problems, so even if you don't have an idea, +just a skillset (really any), odds are there's +something interesting I could point you to. + +## Release and branches + +The search engine has a release cycle of +once per 6-8 weeks, coinciding with the crawling +cycle. Where model-breaking changes and changes to +the crawler can be introduced. + +## Quick Set Up + +There is a [Set Up Guide](https://git.marginalia.nu/marginalia/marginalia.nu/wiki/Setup/Search) +in the wiki. It has a small tendency to oxidize rather +rapidly since the project currently does not have a +lot of contributors to test it. If you find a problem +with the guide, email . 
+ +## Documentation + +What documentation exists resides here: + +https://git.marginalia.nu/marginalia/marginalia.nu/wiki + diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/DocumentsCompiler.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/DocumentsCompiler.java index 1aebe182..e5b18c6a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/DocumentsCompiler.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/DocumentsCompiler.java @@ -30,7 +30,7 @@ public class DocumentsCompiler { var details = doc.details; if (details != null) { - ret.add(new LoadProcessedDocument(doc.url, doc.state, details.title, details.description, HtmlFeature.encode(details.features), details.standard, details.length, details.hashCode, details.quality)); + ret.add(new LoadProcessedDocument(doc.url, doc.state, details.title, details.description, HtmlFeature.encode(details.features), details.standard, details.length, details.hashCode, details.quality, details.pubYear)); } else { ret.add(new LoadProcessedDocumentWithError(doc.url, doc.state, doc.stateReason)); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadProcessedDocument.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadProcessedDocument.java index 9a35c58b..3f65f7af 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadProcessedDocument.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadProcessedDocument.java @@ -7,6 +7,8 @@ import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState; +import javax.annotation.Nullable; + public record LoadProcessedDocument(EdgeUrl url, EdgeUrlState state, @@ -16,7 +18,8 @@ public 
record LoadProcessedDocument(EdgeUrl url, EdgeHtmlStandard standard, int length, long hash, - double quality) implements Instruction + double quality, + @Nullable Integer pubYear) implements Instruction { @Override public void apply(Interpreter interpreter) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java index 6a2dd7cd..fac60a74 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java @@ -8,6 +8,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.sql.SQLException; +import java.sql.Types; import java.util.List; import static java.sql.Statement.SUCCESS_NO_INFO; @@ -34,10 +35,11 @@ public class SqlLoadProcessedDocument { IN FEATURES INT, IN STANDARD VARCHAR(32), IN QUALITY DOUBLE, - IN HASH INT) + IN HASH INT, + IN PUB_YEAR SMALLINT) BEGIN SET FOREIGN_KEY_CHECKS=0; - REPLACE INTO EC_PAGE_DATA(ID, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY) VALUES (URL_ID, TITLE, DESCRIPTION, LENGTH, STANDARD, FEATURES, HASH, QUALITY); + REPLACE INTO EC_PAGE_DATA(ID, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR) VALUES (URL_ID, TITLE, DESCRIPTION, LENGTH, STANDARD, FEATURES, HASH, QUALITY, PUB_YEAR); UPDATE EC_URL SET VISITED=1, STATE=STATE WHERE ID=URL_ID; SET FOREIGN_KEY_CHECKS=1; END @@ -62,7 +64,7 @@ public class SqlLoadProcessedDocument { public void load(LoaderData data, List documents) { try (var conn = dataSource.getConnection(); - var stmt = conn.prepareCall("CALL INSERT_PAGE_VISIT(?, ?, ?, ?, ?, ?, ?, ?, ?)")) { + var stmt = conn.prepareCall("CALL INSERT_PAGE_VISIT(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")) { conn.setAutoCommit(false); int cnt = 0; int batchOffset = 0; @@ -82,6 
+84,12 @@ public class SqlLoadProcessedDocument { stmt.setString(7, doc.standard().name()); stmt.setDouble(8, doc.quality()); stmt.setInt(9, (int) doc.hash()); + if (doc.pubYear() != null) { + stmt.setShort(10, (short) doc.pubYear().intValue()); + } + else { + stmt.setNull(10, Types.SMALLINT); + } stmt.addBatch(); if (++cnt == 100) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/ProcessedDocumentDetails.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/ProcessedDocumentDetails.java index 69f5a57a..25afe126 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/ProcessedDocumentDetails.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/ProcessedDocumentDetails.java @@ -5,6 +5,7 @@ import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature; import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import javax.annotation.Nullable; import java.util.List; import java.util.Set; @@ -13,6 +14,9 @@ public class ProcessedDocumentDetails { public String title; public String description; + @Nullable + public Integer pubYear; + public int length; public double quality; public long hashCode; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java index e7ebd4e0..7c953074 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java @@ -13,6 +13,8 @@ import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException.Disqualifi import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument; import nu.marginalia.wmsa.edge.converting.model.ProcessedDocumentDetails; import 
nu.marginalia.wmsa.edge.converting.processor.logic.*; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateSniffer; import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument; import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain; import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus; @@ -47,6 +49,7 @@ public class DocumentProcessor { private final TitleExtractor titleExtractor; private final DocumentKeywordExtractor keywordExtractor; private final SummaryExtractor summaryExtractor; + private final PubDateSniffer pubDateSniffer; private static final DocumentValuator documentValuator = new DocumentValuator(); private static final LanguageFilter languageFilter = new LanguageFilter(); @@ -60,7 +63,8 @@ public class DocumentProcessor { FeatureExtractor featureExtractor, TitleExtractor titleExtractor, DocumentKeywordExtractor keywordExtractor, - SummaryExtractor summaryExtractor) + SummaryExtractor summaryExtractor, + PubDateSniffer pubDateSniffer) { this.minDocumentLength = minDocumentLength; this.minDocumentQuality = minDocumentQuality; @@ -69,6 +73,7 @@ public class DocumentProcessor { this.titleExtractor = titleExtractor; this.keywordExtractor = keywordExtractor; this.summaryExtractor = summaryExtractor; + this.pubDateSniffer = pubDateSniffer; } public ProcessedDocument makeDisqualifiedStub(CrawledDocument crawledDocument) { @@ -177,6 +182,9 @@ public class DocumentProcessor { Document doc = Jsoup.parse(crawledDocument.documentBody); if (AcceptableAds.hasAcceptableAdsTag(doc)) { + // I've never encountered a website where this hasn't been a severe indicator + // of spam + throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS); } @@ -204,8 +212,10 @@ public class DocumentProcessor { ret.quality = documentValuator.getQuality(crawledDocument, ret.standard, doc, dld); ret.hashCode = 
HashCode.fromString(crawledDocument.documentBodyHash).asLong(); + KeywordMetadata keywordMetadata = new KeywordMetadata(ret.quality); + PubDate pubDate; EdgePageWordSet words; if (shouldDoSimpleProcessing(url, ret)) { /* Some documents we'll index, but only superficially. This is a compromise @@ -215,17 +225,25 @@ public class DocumentProcessor { ret.features = Set.of(HtmlFeature.UNKNOWN); words = keywordExtractor.extractKeywordsMinimal(dld, keywordMetadata); ret.description = ""; + + pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, ret.standard, false); } else { ret.features = featureExtractor.getFeatures(crawledDomain, doc, dld); words = keywordExtractor.extractKeywords(dld, keywordMetadata); ret.description = getDescription(doc); + + pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, ret.standard, true); } - addMetaWords(ret, url, crawledDomain, words); + addMetaWords(ret, url, pubDate, crawledDomain, words); getLinks(url, ret, doc, words); + if (pubDate.hasYear()) { + ret.pubYear = pubDate.year(); + } + return new DetailsWithWords(ret, words); } @@ -256,7 +274,7 @@ public class DocumentProcessor { return false; } - private void addMetaWords(ProcessedDocumentDetails ret, EdgeUrl url, CrawledDomain domain, EdgePageWordSet words) { + private void addMetaWords(ProcessedDocumentDetails ret, EdgeUrl url, PubDate pubDate, CrawledDomain domain, EdgePageWordSet words) { List tagWords = new ArrayList<>(); var edgeDomain = url.domain; @@ -276,6 +294,13 @@ public class DocumentProcessor { ret.features.stream().map(HtmlFeature::getKeyword).forEach(tagWords::add); + if (pubDate.year() > 1900) { + tagWords.add("year:" + pubDate.year()); + } + if (pubDate.dateIso8601() != null) { + tagWords.add("pub:" + pubDate.dateIso8601()); + } + words.appendWithNoMeta(IndexBlock.Meta, tagWords); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/SummaryExtractionFilter.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/SummaryExtractionFilter.java new file mode 100644 index 00000000..adafa835 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/SummaryExtractionFilter.java @@ -0,0 +1,250 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic; + +import com.google.common.base.Strings; +import org.apache.commons.lang3.StringUtils; +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; +import org.jsoup.nodes.TextNode; +import org.jsoup.select.NodeFilter; + +import java.util.*; + +import static org.jsoup.internal.StringUtil.isActuallyWhitespace; +import static org.jsoup.internal.StringUtil.isInvisibleChar; + +public class SummaryExtractionFilter implements NodeFilter { + + public Map statistics = new HashMap<>(10000); + public Map pos = new HashMap<>(10000); + public int cnt = 0; + + @Override + public FilterResult head(Node node, int depth) { + pos.put(node, cnt++); + return FilterResult.CONTINUE; + } + + @Override + public FilterResult tail(Node node, int depth) { + if (node instanceof TextNode tn) { + statistics.put(node, new NodeStatistics(tn, 0, textLength(tn.getWholeText()), pos.getOrDefault(tn, cnt))); + } + else if (node instanceof Element e) { + statistics.put(node, aggregateStatistics(e)); + + if (shouldPruneTag(e)) { + return FilterResult.REMOVE; + } + } + + return FilterResult.CONTINUE; + } + + public boolean shouldPruneTag(Element tag) { + String tagName = tag.tagName(); + + if ("h1".equalsIgnoreCase(tagName)) return true; + if ("h2".equalsIgnoreCase(tagName)) return true; + if ("h3".equalsIgnoreCase(tagName)) return true; + + return false; + } + + public String getSummary(int maxLength) { + List ret = new ArrayList<>(statistics.size()); + for (var stats : statistics.values()) { + if (stats.textToTagRatio() < 0.85) continue; + if (!stats.isElement() || !stats.isAppropriateTagType()) continue; + if (stats.textLength() < 128) 
continue; + if (stats.isLink()) continue; + + ret.add(stats); + } + ret.sort(Comparator.comparing(e -> -e.textLength())); + if (ret.size() > 32) ret.subList(32, ret.size()).clear(); + ret.sort(Comparator.comparing(NodeStatistics::pos)); + if (ret.size() > 3) ret.subList(3, ret.size()).clear(); + ret.sort(Comparator.comparing(NodeStatistics::isBody)); + if (ret.size() >= 1) { + return StringUtils.abbreviate(ret.get(0).text(), "", maxLength); + } + return ""; + } + + private NodeStatistics aggregateStatistics(Element e) { + int text = 0; + int tag = 0; + + String tagName = e.tagName(); + if (!tagName.equalsIgnoreCase("br") && !tagName.equalsIgnoreCase("p")) { + tag += tagName.length(); + } + + int numAttributes = e.attributesSize(); + tag += Math.max(numAttributes - 1, 0); + + if (numAttributes > 0) { + var attrs = e.attributes(); + for (var attr : attrs) { + if (Strings.isNullOrEmpty(attr.getValue())) + tag += attr.getKey().length(); + else { + tag += 3 + attr.getKey().length() + attr.getValue().length(); + } + } + } + + for (var childNode : e.childNodes()) { + var cn = statistics.get(childNode); + + if (cn != null) { + boolean isLink = (tagName.equalsIgnoreCase("a") || cn.isLink()); + if (isLink) { + tag += cn.tagLength + cn.textLength; + } + else { + text += cn.textLength; + tag += cn.tagLength; + } + + if (!cn.isElement()) { + statistics.remove(cn.node); + } + } + } + + return new NodeStatistics(e, tag, text, pos.getOrDefault(e, cnt)); + } + + private int textLength(String str) { + int length = 0; + + // This is a modified version of JSoup's StringUtil.normaliseWhitespace() + // that doesn't do allocation + + int len = str.length(); + int c; + boolean lastWasWhite = false; + boolean reachedNonWhite = false; + + for (int i = 0; i < len; i+= Character.charCount(c)) { + c = str.codePointAt(i); + if (isActuallyWhitespace(c)) { + if ((!reachedNonWhite) || lastWasWhite) + continue; + + if (isAscii(c) && Character.isAlphabetic(c)) { + length++; + } + + lastWasWhite = 
true; + } + else if (!isInvisibleChar(c)) { + if (isAscii(c) && Character.isAlphabetic(c)) { + length++; + } + lastWasWhite = false; + reachedNonWhite = true; + } + } + + return length; + } + + public boolean isAscii(int cp) { + return (cp & ~0x7F) == 0; + } + + public record NodeStatistics(Node node, int tagLength, int textLength, int pos) { + public double textToTagRatio() { + if (textLength == 0) return 1; + + return textLength / (double)(tagLength + textLength); + } + + public String text() { + if (node instanceof Element e) { + return e.text(); + } + else if (node instanceof TextNode tn) { + return tn.text(); + } + return ""; + } + + public boolean isElement() { + return node instanceof Element; + } + + public boolean isLink() { + if (node instanceof Element el) { + return "a".equalsIgnoreCase(el.tagName()); + } + return false; + } + + public boolean isAppropriateTagType() { + + if (node instanceof Element el) { + String tagName = el.tagName(); + if ("blockquote".equalsIgnoreCase(tagName)) + return false; + if ("tt".equalsIgnoreCase(tagName)) + return false; + if ("ol".equalsIgnoreCase(tagName)) + return false; + if ("ul".equalsIgnoreCase(tagName)) + return false; + if ("li".equalsIgnoreCase(tagName)) + return false; + if ("h1".equalsIgnoreCase(tagName)) + return false; + if ("h2".equalsIgnoreCase(tagName)) + return false; + if ("h3".equalsIgnoreCase(tagName)) + return false; + if ("th".equalsIgnoreCase(tagName)) + return false; + if ("td".equalsIgnoreCase(tagName)) + return false; + if ("tbody".equalsIgnoreCase(tagName)) + return false; + if ("html".equalsIgnoreCase(tagName)) + return false; + if ("title".equalsIgnoreCase(tagName)) + return false; + if ("#root".equalsIgnoreCase(tagName)) + return false; + } + + if (node.parent() instanceof Element elp) { + if ("a".equals(elp.tagName())) + return false; + } + + return true; + } + + public boolean isBody() { + if (node instanceof Element el) { + return "body".equalsIgnoreCase(el.tagName()); + } + return false; 
+ } + + public String tagName() { + if (node instanceof Element el) { + return el.tagName(); + } + return '$'+node.getClass().getSimpleName(); + } + + public String toString() { + return String.format("NodeStatistics[%s %d p %d %d]", tagName(), pos, tagLength, textLength); + } + + public double sortValue() { + return -textToTagRatio() * Math.log(1 + textLength) / Math.log(1+pos); + } + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/SummaryExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/SummaryExtractor.java index 2169adf6..4984125f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/SummaryExtractor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/SummaryExtractor.java @@ -11,7 +11,7 @@ import java.util.regex.Pattern; public class SummaryExtractor { private final int maxSummaryLength; - private final Pattern truncatedCharacters = Pattern.compile("[^a-zA-Z0-9.,!?\\-'\"]+|[\\-.,!?' ]{3,}"); + private final Pattern truncatedCharacters = Pattern.compile("[\\-.,!?' 
]{3,}"); @Inject public SummaryExtractor(@Named("max-summary-length") Integer maxSummaryLength) { @@ -19,12 +19,9 @@ public class SummaryExtractor { } public String extractSummary(Document parsed) { - var cleanDoc = parsed.clone(); - cleanDoc.select("header,nav,#header,#nav,#navigation,.header,.nav,.navigation,ul,li").remove(); - String summaryString; - summaryString = extractSummaryRaw(cleanDoc); + summaryString = extractSummaryRaw(parsed); summaryString = truncatedCharacters.matcher(summaryString).replaceAll(" "); summaryString = StringUtils.abbreviate(summaryString, "", maxSummaryLength); @@ -36,9 +33,13 @@ public class SummaryExtractor { String maybe; - // Plan A + parsed.select("header,nav,#header,#nav,#navigation,.header,.nav,.navigation,ul,li").remove(); - maybe = getSummaryByTagDensity(parsed); + // Plan A + maybe = getSummaryNew(parsed.clone()); + if (!maybe.isBlank()) return maybe; + + maybe = getSummaryByTagDensity(parsed.clone()); if (!maybe.isBlank()) return maybe; // Plan B: Open Graph Description @@ -53,6 +54,14 @@ public class SummaryExtractor { return lastDitchSummaryEffort(parsed); } + private String getSummaryNew(Document parsed) { + var filter = new SummaryExtractionFilter(); + + parsed.filter(filter); + + return filter.getSummary(maxSummaryLength+32); + } + private String getSummaryByTagDensity(Document parsed) { StringBuilder content = new StringBuilder(); @@ -92,6 +101,7 @@ public class SummaryExtractor { return parsed.body().text(); } + private double htmlTagDensity(Element elem) { return (double) elem.text().length() / elem.html().length(); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDate.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDate.java new file mode 100644 index 00000000..ed4f0f63 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDate.java @@ -0,0 +1,46 @@ 
+package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate; + +import java.time.LocalDate; +import java.time.format.DateTimeFormatter; + +public record PubDate(String dateIso8601, int year) { + + // First year we'll believe something can have been published on the web + // cut off at 1995 to reduce false positive error rate; number of bona fide + // documents from these years are so few almost all hits are wrong + + public static final int MIN_YEAR = 1995; + + // Last year we'll believe something can be published in + public static final int MAX_YEAR = LocalDate.now().getYear() + 1; + + + public PubDate() { + this(null, Integer.MIN_VALUE); + } + + public PubDate(LocalDate date) { + this(date.format(DateTimeFormatter.ISO_DATE), date.getYear()); + } + + public boolean isEmpty() { + return year == Integer.MIN_VALUE; + } + + public String describe() { + if (dateIso8601 != null) + return dateIso8601; + + if (hasYear()) + return Integer.toString(year); + + return ""; + } + + public static boolean isValidYear(int year) { + return year >= MIN_YEAR && year <= MAX_YEAR; + } + public boolean hasYear() { + return isValidYear(this.year); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDateEffortLevel.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDateEffortLevel.java new file mode 100644 index 00000000..b146c0d0 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDateEffortLevel.java @@ -0,0 +1,6 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate; + +public enum PubDateEffortLevel { + LOW, + HIGH +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDateHeuristic.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDateHeuristic.java new file mode 100644 index 00000000..0bac7705 --- 
/dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDateHeuristic.java @@ -0,0 +1,12 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate; + +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import org.jsoup.nodes.Document; + +import java.util.Optional; + +public interface PubDateHeuristic { + + Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard); +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDateParser.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDateParser.java new file mode 100644 index 00000000..8e49fda8 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDateParser.java @@ -0,0 +1,178 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate; + +import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; + +import java.time.DateTimeException; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.ZonedDateTime; +import java.time.format.DateTimeFormatter; +import java.util.Optional; +import java.util.OptionalInt; +import java.util.concurrent.ThreadLocalRandom; +import java.util.regex.Pattern; + +public class PubDateParser { + + public static Optional attemptParseDate(String date) { + return Optional.ofNullable(date) + .filter(str -> str.length() >= 4 && str.length() < 32) + .flatMap(str -> + parse8601(str) + .or(() -> parse1123(str)) + .or(() -> dateFromHighestYearLookingSubstring(str)) + ) + .filter(PubDateParser::validateDate); + } + + public static OptionalInt parseYearString(String yearString) { + try { + return OptionalInt.of(Integer.parseInt(yearString)); + } + catch (NumberFormatException ex) { + return OptionalInt.empty(); + } + } + + + private 
static final Pattern yearPattern = Pattern.compile("\\d{4}"); + + public static Optional dateFromHighestYearLookingSubstring(String maybe) { + var matcher = yearPattern.matcher(maybe); + + int min = PubDate.MAX_YEAR + 1; + int max = PubDate.MIN_YEAR - 1; + + for (int i = 0; i < maybe.length() && matcher.find(i); i = matcher.end()) { + + String segment = maybe.substring(matcher.start(), matcher.end()); + OptionalInt year = parseYearString(segment); + + if (year.isEmpty()) + continue; + + int y = year.getAsInt(); + if (PubDate.isValidYear(y)) { + if (max < y) max = y; + if (min > y) min = y; + } + } + + if (max != min && PubDate.isValidYear(min) && PubDate.isValidYear(max)) { + return Optional.of(new PubDate(null, guessYear(min, max))); + } + + if (max >= PubDate.MIN_YEAR) + return Optional.of(new PubDate(null, max)); + else + return Optional.empty(); + } + + + public static Optional dateFromHighestYearLookingSubstringWithGuess(String maybe, int guess) { + var matcher = yearPattern.matcher(maybe); + + int min = PubDate.MAX_YEAR + 1; + int max = PubDate.MIN_YEAR - 1; + + for (int i = 0; i < maybe.length() && matcher.find(i); i = matcher.end()) { + + String segment = maybe.substring(matcher.start(), matcher.end()); + OptionalInt year = parseYearString(segment); + + if (year.isEmpty()) + continue; + + int y = year.getAsInt(); + if (PubDate.isValidYear(y)) { + if (max < y) max = y; + if (min > y) min = y; + } + } + + if (max != min && PubDate.isValidYear(min) && PubDate.isValidYear(max)) { + return Optional.of(new PubDate(null, guessYear(min, max, guess))); + } + + if (max >= PubDate.MIN_YEAR) + return Optional.of(new PubDate(null, max)); + else + return Optional.empty(); + } + + public static int guessYear(int min, int max, int educatedGuess) { + int var = max - min; + + if (var < 3) + return min; + + int avg = (max + min) / 2; + int guess = (avg + educatedGuess) / 2; + + if (guess < min) + return min; + if (guess > max) + return max; + + return guess; + } + + public 
static int guessYear(int min, int max) { + return (max + min) / 2; + } + + public static int guessYear(EdgeHtmlStandard standard) { + // Create some jitter to avoid having documents piling up in the same four years + // as this would make searching in those years disproportionately useless + + double guess = standard.yearGuess + ThreadLocalRandom.current().nextGaussian(); + + if (guess < PubDate.MIN_YEAR) { + return PubDate.MIN_YEAR; + } + if (guess > PubDate.MAX_YEAR) { + return PubDate.MAX_YEAR; + } + return (int) guess; + } + + public static Optional parse8601(String maybe) { + return parseOptionally(maybe, DateTimeFormatter.ISO_DATE) + .or(() -> parseOptionallyWithTime(maybe, DateTimeFormatter.ISO_DATE_TIME)) + .or(() -> parseOptionallyWithZonedTime(maybe, DateTimeFormatter.ISO_DATE_TIME)) + .map(PubDate::new); + } + + public static Optional parse1123(String maybe) { + return parseOptionally(maybe, DateTimeFormatter.RFC_1123_DATE_TIME) + .map(PubDate::new); + } + + public static Optional parseOptionally(String str, DateTimeFormatter formatter) { + try { + return Optional.of(LocalDate.parse(str, formatter)); + } + catch (DateTimeException ex) { + return Optional.empty(); + } + } + public static Optional parseOptionallyWithTime(String str, DateTimeFormatter formatter) { + try { + return Optional.of(LocalDateTime.parse(str, formatter).toLocalDate()); + } + catch (DateTimeException ex) { + return Optional.empty(); + } + } + public static Optional parseOptionallyWithZonedTime(String str, DateTimeFormatter formatter) { + try { + return Optional.of(ZonedDateTime.parse(str, formatter).toLocalDate()); + } + catch (DateTimeException ex) { + return Optional.empty(); + } + } + public static boolean validateDate(PubDate date) { + return (date.year() >= PubDate.MIN_YEAR && date.year() <= PubDate.MAX_YEAR); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDateSniffer.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDateSniffer.java new file mode 100644 index 00000000..25a5ece1 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDateSniffer.java @@ -0,0 +1,50 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate; + +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic.*; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import org.jsoup.nodes.Document; + +import java.util.ArrayList; +import java.util.List; + +public class PubDateSniffer { + + private final List heuristics = new ArrayList<>(); + + public PubDateSniffer() { + heuristics.add(new PubDateHeuristicHtml5ItempropDateTag()); + heuristics.add(new PubDateHeuristicHtml5ArticleDateTag()); + heuristics.add(new PubDateHeuristicJSONLD()); + heuristics.add(new PubDateHeuristicMicrodata()); + heuristics.add(new PubDateHeuristicOpenGraph()); + heuristics.add(new PubDateHeuristicRDFaTag()); + + // The more questionable heuristics should be kept below this line + heuristics.add(new PubDateHeuristicUrlPatternPass1()); + + heuristics.add(new PubDateHeuristicDOMParsingPass1()); + heuristics.add(new PubDateHeuristicHtml5AnyTimeTag()); + + heuristics.add(new PubDateHeuristicDOMParsingPass2()); + heuristics.add(new PubDateHeuristicUrlPatternPass2()); + + heuristics.add(new PubDateHeuristicLastModified()); + // This is complete guesswork + + heuristics.add(new PubDateHeuristicGuessFromHtmlStandard()); + } + + public PubDate getPubDate(String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard, boolean runExpensive) { + final PubDateEffortLevel effortLevel = runExpensive ? 
PubDateEffortLevel.HIGH : PubDateEffortLevel.LOW; + + for (var heuristic : heuristics) { + var maybe = heuristic.apply(effortLevel, headers, url, document, htmlStandard); + if (maybe.isPresent()) + return maybe.get(); + } + + return new PubDate(); + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java new file mode 100644 index 00000000..cc85ab2a --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java @@ -0,0 +1,148 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic; + +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import org.jetbrains.annotations.NotNull; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; +import org.jsoup.nodes.TextNode; +import org.jsoup.select.NodeFilter; + +import java.util.Optional; + +public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic { + + @Override + public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { + if (effortLevel == PubDateEffortLevel.LOW) + return Optional.empty(); + + DateExtractingNodeVisitorPass filter = new DateExtractingNodeVisitorPass(htmlStandard); + + document.filter(filter); + + return Optional.ofNullable(filter.pubDate); + } + + + 
private static class DateExtractingNodeVisitorPass implements NodeFilter { + public PubDate pubDate; + private final EdgeHtmlStandard htmlStandard; + + private DateExtractingNodeVisitorPass(EdgeHtmlStandard htmlStandard) { + this.htmlStandard = htmlStandard; + } + + @NotNull + @Override + public FilterResult head(@NotNull Node node, int depth) { + if (node instanceof TextNode tn) onTextNode(tn); + if (node instanceof Element el) onElementNode(el); + + if (hasPubDate()) { + return FilterResult.STOP; + } + return FilterResult.CONTINUE; + } + + public void onTextNode(TextNode tn) { + String text = tn.getWholeText(); + + if (text.length() < 32 && isCandidatForCopyrightNotice(text)) { + parse(text); + } + } + + + public void onElementNode(Element el) { + if (hasCommonClass(el)) { + parse(el.text()); + } + + if (!hasPubDate()) + tryParsePhpBBDate(el); + } + + + public boolean isCandidatForCopyrightNotice(String text) { + if (text.contains("ublished")) + return true; + if (text.contains("opyright")) + return true; + if (text.contains("©")) + return true; + if (text.contains("(c)")) + return true; + + return false; + } + + + public boolean hasCommonClass(Element el) { + var classes = el.classNames(); + + return classes.contains("entry-meta") // wordpress + || classes.contains("byline") + || classes.contains("author") + || classes.contains("submitted") + || el.id().contains("footer-info-lastmod"); // mediawiki + } + + public void tryParsePhpBBDate(Element el) { + + /* Match HTML on the form
[...] Posted: Sun Oct 03, 2010 5:37 pm 
+ * this is used on old phpBB message boards + * + * Schematically the DOM looks like this + * + * b - TextNode[ Sun Oct 03, 2010 5:37 pm ] + * | + * TextNode[Posted:] + */ + if ("b".equals(el.tagName()) + && el.childNodeSize() == 1 + && el.childNode(0) instanceof TextNode ctn + && "Posted:".equals(ctn.getWholeText()) + && el.nextSibling() instanceof TextNode ntn + ) + { + parse(ntn.getWholeText()); + } + } + + + public boolean hasPubDate() { + return pubDate != null; + } + public void setPubDate(PubDate pubDate) { + this.pubDate = pubDate; + } + + @NotNull + @Override + public FilterResult tail(@NotNull Node node, int depth) { + return FilterResult.CONTINUE; + } + + private void parse(String text) { + if (htmlStandard == EdgeHtmlStandard.UNKNOWN) { + PubDateParser + .dateFromHighestYearLookingSubstring(text) + .ifPresent(this::setPubDate); + } + else { + PubDateParser + .dateFromHighestYearLookingSubstringWithGuess(text, htmlStandard.yearGuess) + .ifPresent(this::setPubDate); + } + } + + + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java new file mode 100644 index 00000000..264f9eb1 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java @@ -0,0 +1,123 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic; + +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import 
nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import org.jetbrains.annotations.NotNull; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Node; +import org.jsoup.nodes.TextNode; +import org.jsoup.select.NodeFilter; + +import java.util.Optional; + +public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic { + + @Override + public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { + if (effortLevel == PubDateEffortLevel.LOW) + return Optional.empty(); + + DateExtractingNodeVisitor filter = new DateExtractingNodeVisitor(htmlStandard); + + document.filter(filter); + + return Optional.ofNullable(filter.pubDate); + } + + + private static class DateExtractingNodeVisitor implements NodeFilter { + public PubDate pubDate; + private final EdgeHtmlStandard htmlStandard; + + private DateExtractingNodeVisitor(EdgeHtmlStandard htmlStandard) { + this.htmlStandard = htmlStandard; + } + + @NotNull + @Override + public FilterResult head(@NotNull Node node, int depth) { + if (node instanceof TextNode tn) onTextNode(tn); + + if (hasPubDate()) { + return FilterResult.STOP; + } + return FilterResult.CONTINUE; + } + + public void onTextNode(TextNode tn) { + String text = tn.getWholeText(); + + if (isPossibleCandidate(text)) { + parse(text); + } + } + + + public boolean hasPubDate() { + return pubDate != null; + } + public void setPubDate(PubDate pubDate) { + this.pubDate = pubDate; + } + + @NotNull + @Override + public FilterResult tail(@NotNull Node node, int depth) { + return FilterResult.CONTINUE; + } + + private void parse(String text) { + if (htmlStandard == EdgeHtmlStandard.UNKNOWN) { + PubDateParser + .dateFromHighestYearLookingSubstring(text) + .ifPresent(this::setPubDate); + } + else { + PubDateParser + .dateFromHighestYearLookingSubstringWithGuess(text, htmlStandard.yearGuess) + .ifPresent(this::setPubDate); + } + } + + + } + + // This is basically the regex (^|[ 
./\-])(\d{4})([ ./\-]$), but + // unchecked regexes are too slow + + public static boolean isPossibleCandidate(String text) { + if (text.length() >= 4 && text.length() < 24) { + int ct = 0; + char prevC = ' '; + boolean goodStart = true; + for (int i = 0; i < text.length(); i++) { + char c = text.charAt(i); + if (Character.isDigit(c)) { + if (ct++ == 0) { + goodStart = isGoodBreak(prevC); + } + } + else { + if (ct == 4 && goodStart && isGoodBreak(c)) return true; + else { + ct = 0; + } + } + prevC = c; + } + + if (ct == 4 && goodStart) + return true; + } + return false; + } + + private static boolean isGoodBreak(char c) { + return "./-,".indexOf(c) >= 0 || Character.isSpaceChar(c); + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java new file mode 100644 index 00000000..e3d0e556 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java @@ -0,0 +1,23 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic; + +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import org.jsoup.nodes.Document; + +import java.util.Optional; + +public class PubDateHeuristicGuessFromHtmlStandard implements PubDateHeuristic { + + @Override + public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document 
document, EdgeHtmlStandard htmlStandard) { + if (htmlStandard == EdgeHtmlStandard.UNKNOWN) + return Optional.empty(); + + return Optional.of(new PubDate(null, PubDateParser.guessYear(htmlStandard))); + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java new file mode 100644 index 00000000..d20ed246 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java @@ -0,0 +1,33 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic; + +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import org.jsoup.nodes.Document; + +import java.util.Optional; + +public class PubDateHeuristicHtml5AnyTimeTag implements PubDateHeuristic { + + @Override + public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { + // HTML5, alternative approach + for (var tag : document.select("time")) { + var maybeDate = PubDateParser.attemptParseDate(tag.attr("datetime")); + if (maybeDate.isPresent()) { + return maybeDate; + } + + maybeDate = PubDateParser.attemptParseDate(tag.wholeText()); + if (maybeDate.isPresent()) { + return maybeDate; + } + } + + return Optional.empty(); + } + +} diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java new file mode 100644 index 00000000..78c54b9a --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java @@ -0,0 +1,28 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic; + +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import org.jsoup.nodes.Document; + +import java.util.Optional; + +public class PubDateHeuristicHtml5ArticleDateTag implements PubDateHeuristic { + + @Override + public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { + // HTML5 + for (var tag : document.select("time[pubdate=\"pubdate\"]")) { + var maybeDate = PubDateParser.attemptParseDate(tag.attr("datetime")); + if (maybeDate.isPresent()) { + return maybeDate; + } + } + + return Optional.empty(); + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java new file mode 100644 index 00000000..8dec0f6a --- /dev/null +++ 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java @@ -0,0 +1,27 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic; + +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import org.jsoup.nodes.Document; + +import java.util.Optional; + +public class PubDateHeuristicHtml5ItempropDateTag implements PubDateHeuristic { + + @Override + public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { + for (var tag : document.select("time[itemprop=\"datePublished\"]")) { + var maybeDate = PubDateParser.attemptParseDate(tag.attr("content")); + if (maybeDate.isPresent()) { + return maybeDate; + } + } + + return Optional.empty(); + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicJSONLD.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicJSONLD.java new file mode 100644 index 00000000..2187a744 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicJSONLD.java @@ -0,0 +1,49 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic; + +import com.google.gson.Gson; +import com.google.gson.GsonBuilder; +import com.google.gson.JsonSyntaxException; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; +import 
nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import org.jsoup.nodes.Document; + +import java.util.Optional; + +public class PubDateHeuristicJSONLD implements PubDateHeuristic { + + @Override + public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { + for (var tag : document.select("script[type=\"application/ld+json\"]")) { + var maybeDate = parseLdJson(tag.data()) + .flatMap(PubDateParser::attemptParseDate); + + if (maybeDate.isPresent()) { + return maybeDate; + } + } + + return Optional.empty(); + } + + + private static class JsonModel { + String datePublished; + } + private static Gson gson = new GsonBuilder().create(); + + public Optional parseLdJson(String content) { + try { + var model = gson.fromJson(content, JsonModel.class); + return Optional.ofNullable(model) + .map(m -> m.datePublished); + } + catch (JsonSyntaxException ex) { + return Optional.empty(); + } + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicLastModified.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicLastModified.java new file mode 100644 index 00000000..5a47c9df --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicLastModified.java @@ -0,0 +1,29 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic; + +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; +import 
nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import org.jsoup.nodes.Document; + +import java.util.Optional; + +public class PubDateHeuristicLastModified implements PubDateHeuristic { + + @Override + public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { + String lmString = "last-modified: "; + int offset = headers.toLowerCase().indexOf(lmString); + + if (offset < 0) + return Optional.empty(); + int end = headers.indexOf('\n', offset); + if (end < 0) end = headers.length(); + + String lmDate = headers.substring(offset + lmString.length(), end); + return PubDateParser.attemptParseDate(lmDate); + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicMicrodata.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicMicrodata.java new file mode 100644 index 00000000..a257bba2 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicMicrodata.java @@ -0,0 +1,28 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic; + +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import 
nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import org.jsoup.nodes.Document; + +import java.util.Optional; + +public class PubDateHeuristicMicrodata implements PubDateHeuristic { + + @Override + public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { + + for (var tag : document.select("meta[itemprop=\"datePublished\"]")) { + var maybeDate = PubDateParser.attemptParseDate(tag.attr("content")); + if (maybeDate.isPresent()) { + return maybeDate; + } + } + + return Optional.empty(); + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicOpenGraph.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicOpenGraph.java new file mode 100644 index 00000000..bd9b66a9 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicOpenGraph.java @@ -0,0 +1,27 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic; + +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import org.jsoup.nodes.Document; + +import java.util.Optional; + +public class PubDateHeuristicOpenGraph implements PubDateHeuristic { + + @Override + public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { + // OG + for (var tag : document.select("meta[property=\"article:published_time\"]")) { + var maybeDate = 
PubDateParser.attemptParseDate(tag.attr("content")); + if (maybeDate.isPresent()) { + return maybeDate; + } + } + + return Optional.empty(); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicRDFaTag.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicRDFaTag.java new file mode 100644 index 00000000..2618cdef --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicRDFaTag.java @@ -0,0 +1,27 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic; + +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import org.jsoup.nodes.Document; + +import java.util.Optional; + +public class PubDateHeuristicRDFaTag implements PubDateHeuristic { + + @Override + public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { + for (var tag : document.select("meta[property=\"datePublished\"]")) { + var maybeDate = PubDateParser.attemptParseDate(tag.attr("content")); + if (maybeDate.isPresent()) { + return maybeDate; + } + } + + return Optional.empty(); + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicUrlPatternPass1.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicUrlPatternPass1.java new file mode 100644 index 00000000..70b19ad0 
--- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicUrlPatternPass1.java @@ -0,0 +1,45 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic; + +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import org.jsoup.nodes.Document; + +import java.util.Optional; +import java.util.OptionalInt; +import java.util.regex.Pattern; + +public class PubDateHeuristicUrlPatternPass1 implements PubDateHeuristic { + + private static final Pattern yearUrlPattern = Pattern.compile("/\\d{4}/"); + + // False positive rate is much higher in the 1990s, only include 2000s+ in pass 1 + private static final int MIN_URL_PATTERN_YEAR = 2000; + + @Override + public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { + final String urlString = url.path; + + var matcher = yearUrlPattern.matcher(urlString); + + for (int i = 0; i < urlString.length() && matcher.find(i); i = matcher.end()) { + + String segment = urlString.substring(matcher.start() + 1, matcher.end() - 1); + + OptionalInt year = PubDateParser.parseYearString(segment); + + if (year.isEmpty()) + continue; + + int y = year.getAsInt(); + if (y >= MIN_URL_PATTERN_YEAR && y <= PubDate.MAX_YEAR) { + return Optional.of(new PubDate(null, y)); + } + } + + return Optional.empty(); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicUrlPatternPass2.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicUrlPatternPass2.java new file mode 100644 index 00000000..19aceecd --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/heuristic/PubDateHeuristicUrlPatternPass2.java @@ -0,0 +1,42 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic; + +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import org.jsoup.nodes.Document; + +import java.util.Optional; +import java.util.OptionalInt; +import java.util.regex.Pattern; + +public class PubDateHeuristicUrlPatternPass2 implements PubDateHeuristic { + + private static final Pattern yearUrlPattern = Pattern.compile("/\\d{4}/"); + + @Override + public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) { + final String urlString = url.path; + + var matcher = yearUrlPattern.matcher(urlString); + + for (int i = 0; i < urlString.length() && matcher.find(i); i = matcher.end()) { + + String segment = urlString.substring(matcher.start() + 1, matcher.end() - 1); + + OptionalInt year = PubDateParser.parseYearString(segment); + + if (year.isEmpty()) + continue; + + int y = year.getAsInt(); + if (y >= PubDate.MIN_YEAR && y <= PubDate.MAX_YEAR) { + return Optional.of(new PubDate(null, y)); + } + } + + return Optional.empty(); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/explorer/ExplorerService.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/explorer/ExplorerService.java index ef5d935c..38ec7b6b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/explorer/ExplorerService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/explorer/ExplorerService.java @@ -117,12 +117,12 @@ public class ExplorerService extends Service { NV.NEIGHBOR_NAME, NV.RELATEDNESS, (LV.DOMAIN_ID IS NOT NULL), - (STATE = 'ACTIVE' OR STATE='SOCIAL_MEDIA'), + (STATE = 'ACTIVE' OR STATE='SOCIAL_MEDIA' OR STATE='REDIR'), INDEXED > 0 FROM EC_NEIGHBORS_VIEW NV LEFT JOIN EC_NEIGHBORS_VIEW LV ON (NV.NEIGHBOR_ID=LV.DOMAIN_ID) INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=NV.NEIGHBOR_ID - WHERE NV.DOMAIN_ID=? + WHERE NV.DOMAIN_ID IN (?,?) GROUP BY NV.NEIGHBOR_ID ORDER BY NV.RELATEDNESS DESC """); @@ -131,12 +131,12 @@ public class ExplorerService extends Service { NV.DOMAIN_NAME, NV.RELATEDNESS, (LV.NEIGHBOR_ID IS NOT NULL), - (STATE = 'ACTIVE' OR STATE='SOCIAL_MEDIA'), + (STATE = 'ACTIVE' OR STATE='SOCIAL_MEDIA' OR STATE='REDIR'), INDEXED > 0 FROM EC_NEIGHBORS_VIEW NV LEFT JOIN EC_NEIGHBORS_VIEW LV ON (NV.DOMAIN_ID=LV.NEIGHBOR_ID) INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=NV.DOMAIN_ID - WHERE NV.NEIGHBOR_ID=? + WHERE NV.NEIGHBOR_ID IN (?,?) 
GROUP BY NV.DOMAIN_ID ORDER BY NV.RELATEDNESS DESC """ @@ -145,6 +145,8 @@ public class ExplorerService extends Service { ) { stmt.setInt(1, domainIdInformation.domainId); + stmt.setInt(2, domainIdInformation.aliasId); + var rsp = stmt.executeQuery(); while (rsp.next()) { @@ -172,6 +174,8 @@ public class ExplorerService extends Service { } stmtRev.setInt(1, domainIdInformation.domainId); + stmtRev.setInt(2, domainIdInformation.aliasId); + rsp = stmtRev.executeQuery(); while (rsp.next()) { @@ -211,22 +215,24 @@ public class ExplorerService extends Service { try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(""" - SELECT IFNULL(ALIAS.ID, DOMAIN.ID), DOMAIN.INDEXED>0 OR ALIAS.INDEXED>0, ALIAS.DOMAIN_NAME + SELECT DOMAIN.ID, IFNULL(ALIAS.ID, DOMAIN.ID), DOMAIN.INDEXED>0 OR ALIAS.INDEXED>0, ALIAS.DOMAIN_NAME FROM EC_DOMAIN DOMAIN - LEFT JOIN EC_DOMAIN ALIAS ON DOMAIN.DOMAIN_ALIAS=ALIAS.ID + LEFT JOIN EC_DOMAIN ALIAS ON DOMAIN.DOMAIN_ALIAS=ALIAS.ID WHERE DOMAIN.DOMAIN_NAME=? 
""")) { + stmt.setString(1, query); var rsp = stmt.executeQuery(); if (rsp.next()) { return new DomainIdInformation( rsp.getInt(1), - rsp.getBoolean(2), - rsp.getString(3) + rsp.getInt(2), + rsp.getBoolean(3), + rsp.getString(4) ); } } - return new DomainIdInformation(-1, false, null); + return new DomainIdInformation(-1, -1, false, null); } private String trimUrlJunk(String query) { @@ -245,7 +251,7 @@ public class ExplorerService extends Service { return query; } - record DomainIdInformation(int domainId, boolean indexed, String alias) { + record DomainIdInformation(int domainId, int aliasId, boolean indexed, String alias) { boolean isPresent() { return domainId >= 0; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeHtmlStandard.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeHtmlStandard.java index 18142da2..dd3d0cec 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeHtmlStandard.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgeHtmlStandard.java @@ -1,19 +1,22 @@ package nu.marginalia.wmsa.edge.model.crawl; public enum EdgeHtmlStandard { - PLAIN(0, 1), - UNKNOWN(0, 1), - HTML123(0, 1), - HTML4(-0.1, 1.05), - XHTML(-0.1, 1.05), - HTML5(0.5, 1.1); + PLAIN(0, 1, 1993), + UNKNOWN(0, 1, 2000), + HTML123(0, 1, 1997), + HTML4(-0.1, 1.05, 2006), + XHTML(-0.1, 1.05, 2006), + HTML5(0.5, 1.1, 2018); public final double offset; public final double scale; - EdgeHtmlStandard(double offset, double scale) { + public final int yearGuess; + + EdgeHtmlStandard(double offset, double scale, int yearGuess) { this.offset = offset; this.scale = scale; + this.yearGuess = yearGuess; } } diff --git a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql index 994c6473..f0dc851c 100644 --- a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql +++ 
b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql @@ -77,6 +77,8 @@ CREATE TABLE IF NOT EXISTS EC_PAGE_DATA ( DATA_HASH INTEGER NOT NULL, QUALITY DOUBLE NOT NULL, + PUB_YEAR SMALLINT, + FOREIGN KEY (ID) REFERENCES EC_URL(ID) ON DELETE CASCADE ) CHARACTER SET utf8mb4 diff --git a/marginalia_nu/src/main/resources/templates/edge/index.hdb b/marginalia_nu/src/main/resources/templates/edge/index.hdb index 1f71e6f6..fe30d9d6 100644 --- a/marginalia_nu/src/main/resources/templates/edge/index.hdb +++ b/marginalia_nu/src/main/resources/templates/edge/index.hdb @@ -28,38 +28,28 @@

Publicity, Discussion and Events

diff --git a/marginalia_nu/src/main/resources/templates/edge/search-results.hdb b/marginalia_nu/src/main/resources/templates/edge/search-results.hdb index 6a4b9cfc..b0a0848e 100644 --- a/marginalia_nu/src/main/resources/templates/edge/search-results.hdb +++ b/marginalia_nu/src/main/resources/templates/edge/search-results.hdb @@ -20,7 +20,9 @@ {{>edge/parts/search-form}} +
+ {{#if maintenanceMessage}}

Maintenance

{{maintenanceMessage}}

{{/if}} {{#if evalResult}}

Evaluation

{{query}} = {{evalResult}}


{{/if}} {{#each wiki.entries}}

Encyclopedia

{{.}} Encyclopedia Page


{{/each}} diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocumentTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocumentTest.java index 220078f3..0dde33c9 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocumentTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocumentTest.java @@ -75,7 +75,8 @@ class SqlLoadProcessedDocumentTest { EdgeHtmlStandard.HTML5, 100, 12345, - -3.14 + -3.14, + null ))); var details = dataStoreDao.getUrlDetailsMulti(new EdgeIdArray<>(loaderData.getUrlId(new EdgeUrl("https://www.marginalia.nu/")))); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/PubDateSnifferTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/PubDateSnifferTest.java new file mode 100644 index 00000000..f0c5f5fc --- /dev/null +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/PubDateSnifferTest.java @@ -0,0 +1,254 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic; + +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateSniffer; +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic.PubDateHeuristicDOMParsingPass2; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; +import org.jsoup.Jsoup; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.net.URISyntaxException; +import java.nio.file.Files; +import java.nio.file.Path; + +import static org.junit.jupiter.api.Assertions.*; + +class PubDateSnifferTest { + + PubDateSniffer dateSniffer = new PubDateSniffer(); + + @Test + public void testGetYearFromText() { + var ret = 
PubDateParser.dateFromHighestYearLookingSubstring("© 2005-2010 Bob Dobbs"); + assertTrue(ret.isPresent()); + assertEquals(2010, ret.get().year()); + + ret = PubDateParser.dateFromHighestYearLookingSubstring("© 99 Bob Dobbs"); + assertFalse(ret.isPresent()); + + ret = PubDateParser.dateFromHighestYearLookingSubstring("© 1939 Bob Dobbs"); + assertFalse(ret.isPresent()); + + ret = PubDateParser.dateFromHighestYearLookingSubstring("In the year 2525, if man is still alive"); + assertFalse(ret.isPresent()); + } + + @Test + public void testParse() { + var ret = PubDateParser.attemptParseDate("2022-01-01"); + assertTrue(ret.isPresent()); + assertEquals("2022-01-01", ret.get().dateIso8601()); + assertEquals(2022, ret.get().year()); + + ret = PubDateParser.attemptParseDate("2022-08-24T14:39:14Z"); + assertTrue(ret.isPresent()); + assertEquals("2022-08-24", ret.get().dateIso8601()); + assertEquals(2022, ret.get().year()); + + ret = PubDateParser.attemptParseDate("2022-08-24T14:39:14"); + assertTrue(ret.isPresent()); + assertEquals("2022-08-24", ret.get().dateIso8601()); + assertEquals(2022, ret.get().year()); + + ret = PubDateParser.attemptParseDate("Sun, 21 Oct 2018 12:16:24 GMT"); + assertTrue(ret.isPresent()); + assertEquals("2018-10-21", ret.get().dateIso8601()); + assertEquals(2018, ret.get().year()); + + ret = PubDateParser.attemptParseDate("July 13, 2006"); + assertTrue(ret.isPresent()); + assertEquals(2006, ret.get().year()); + + } + + + @Test + public void testHtml5A() throws URISyntaxException { + var ret = dateSniffer.getPubDate("", + new EdgeUrl("https://www.example.com/"), + Jsoup.parse(""" + + +
+ + Wow, sure lor 'em boss +
+ """), EdgeHtmlStandard.UNKNOWN, true); + + assertFalse(ret.isEmpty()); + assertEquals("2022-08-24", ret.dateIso8601()); + } + + @Test + public void testHtml5B() throws URISyntaxException { + var ret = dateSniffer.getPubDate("", + new EdgeUrl("https://www.example.com/"), + Jsoup.parse(""" + + + + Wow, sure lor 'em boss + + """), EdgeHtmlStandard.UNKNOWN, true); + + assertFalse(ret.isEmpty()); + assertEquals("2022-08-24", ret.dateIso8601()); + } + + @Test + public void testHtml5C() throws URISyntaxException { + var ret = dateSniffer.getPubDate("", + new EdgeUrl("https://www.example.com/"), + Jsoup.parse(""" + + + + Wow, sure lor 'em boss + + """), EdgeHtmlStandard.UNKNOWN, true); + + assertFalse(ret.isEmpty()); + assertEquals(2006, ret.year()); + } + + @Test + public void testProblemCases() throws IOException, URISyntaxException { + var ret = dateSniffer.getPubDate("", + new EdgeUrl("https://www.example.com/"), + Jsoup.parse(Files.readString(Path.of("/home/vlofgren/Code/tmp-data/The Switch to Linux Begins .html"))), EdgeHtmlStandard.HTML5, true); + + assertFalse(ret.isEmpty()); + assertEquals(2006, ret.year()); + + ret = dateSniffer.getPubDate("", + new EdgeUrl("https://www.example.com/"), + Jsoup.parse(Files.readString(Path.of("/home/vlofgren/Code/tmp-data/Black Hat USA 2010 Understanding and Deploying DNSSEC by Paul Wouters and Patrick Nauber.html"))), EdgeHtmlStandard.XHTML, true); + + assertFalse(ret.isEmpty()); + assertEquals(2010, ret.year()); + } + + @Test + public void testGuessYear() { + System.out.println(PubDateParser.guessYear(2010, 2020)); + System.out.println(PubDateParser.guessYear(2010, 2020)); + System.out.println(PubDateParser.guessYear(2010, 2020)); + System.out.println(PubDateParser.guessYear(2010, 2020)); + System.out.println(PubDateParser.guessYear(2010, 2020)); + } + + @Test + public void testMicrodata() throws URISyntaxException { + var ret = dateSniffer.getPubDate("", + new EdgeUrl("https://www.example.com/"), + Jsoup.parse(""" + + + + 
"""), EdgeHtmlStandard.UNKNOWN, true); + + assertFalse(ret.isEmpty()); + assertEquals("2022-08-24", ret.dateIso8601()); + } + + @Test + public void testRDFa() throws URISyntaxException { + var ret = dateSniffer.getPubDate("", + new EdgeUrl("https://www.example.com/"), + Jsoup.parse(""" + + + + """),EdgeHtmlStandard.UNKNOWN, true); + + assertFalse(ret.isEmpty()); + assertEquals("2022-08-24", ret.dateIso8601()); + } + + @Test + public void testLD() throws URISyntaxException { + var ret = dateSniffer.getPubDate("", + new EdgeUrl("https://www.example.com/"), + Jsoup.parse(""" + + + + """), EdgeHtmlStandard.UNKNOWN, true); + + assertFalse(ret.isEmpty()); + assertEquals("2004-08-24", ret.dateIso8601()); + } + + @Test + public void testPath() throws URISyntaxException { + var ret = dateSniffer.getPubDate("", + new EdgeUrl("https://www.example.com/articles/2022/04/how-to-detect-dates"), + Jsoup.parse(""" + + + No date in the HTML + """), EdgeHtmlStandard.UNKNOWN, true); + + assertFalse(ret.isEmpty()); + assertNull(ret.dateIso8601()); + assertEquals(2022, ret.year()); + } + + @Test + public void testHeader() throws URISyntaxException { + var ret = dateSniffer.getPubDate("content-type: application/pdf\netag: \"4fc0ba8a7f5090b6fa6be385dca206ec\"\nlast-modified: Thu, 03 Feb 2022 19:22:58 GMT\ncontent-length: 298819\ndate: Wed, 24 Aug 2022 19:48:52 GMT\ncache-control: public, no-transform, immutable, max-age\u003d31536000\naccess-control-expose-headers: Content-Length,Content-Disposition,Content-Range,Etag,Server-Timing,Vary,X-Cld-Error,X-Content-Type-Options\naccess-control-allow-origin: *\naccept-ranges: bytes\ntiming-allow-origin: *\nserver: Cloudinary\nstrict-transport-security: max-age\u003d604800\nx-content-type-options: nosniff\nserver-timing: akam;dur\u003d25;start\u003d2022-08-24T19:48:52.519Z;desc\u003dmiss,rtt;dur\u003d19,cloudinary;dur\u003d129;start\u003d2022-08-23T06:35:17.331Z\n", + new EdgeUrl("https://www.example.com/"), + Jsoup.parse(""" + + + No date in the 
HTML + """), EdgeHtmlStandard.UNKNOWN, true); + + assertFalse(ret.isEmpty()); + assertEquals("2022-02-03", ret.dateIso8601()); + } + + + @Test + public void testDOM() throws URISyntaxException { + var ret = dateSniffer.getPubDate("", + new EdgeUrl("https://www.example.com/"), + Jsoup.parse(""" + + +

Published 2003, updated 2022

+ """), EdgeHtmlStandard.HTML5, true); + + assertFalse(ret.isEmpty()); + assertNull(ret.dateIso8601()); + assertEquals(2015, ret.year()); + } + + @Test + public void testCandidate() { + System.out.println(PubDateHeuristicDOMParsingPass2.isPossibleCandidate("(C) 2007")); + System.out.println(PubDateHeuristicDOMParsingPass2.isPossibleCandidate("(C) 2007-01-01")); + System.out.println(PubDateHeuristicDOMParsingPass2.isPossibleCandidate("(C) 01-01.2007")); + System.out.println(PubDateHeuristicDOMParsingPass2.isPossibleCandidate("Only $1999")); + System.out.println(PubDateHeuristicDOMParsingPass2.isPossibleCandidate("1998B")); + System.out.println(PubDateHeuristicDOMParsingPass2.isPossibleCandidate("1998B")); + System.out.println(PubDateHeuristicDOMParsingPass2.isPossibleCandidate("2010 black hat ™")); + } + + @Test + public void testOldInvision() throws URISyntaxException { + var ret = dateSniffer.getPubDate("", + new EdgeUrl("https://www.example.com/"), + Jsoup.parse(""" + + +
 Post subject: Keyboards.
Post #1 Posted: Sun Oct 03, 2010 5:37 pm 
+ """), EdgeHtmlStandard.UNKNOWN, true); + + assertFalse(ret.isEmpty()); + assertNull(ret.dateIso8601()); + assertEquals(2010, ret.year()); + } +} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/SummaryExtractorTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/SummaryExtractorTest.java index f37ca367..47929f43 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/SummaryExtractorTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/SummaryExtractorTest.java @@ -2,22 +2,91 @@ package nu.marginalia.wmsa.edge.converting.processor.logic; import org.jsoup.Jsoup; import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import java.io.FileOutputStream; import java.io.IOException; +import java.io.PrintWriter; +import java.nio.file.Files; import java.nio.file.Path; +import java.util.Comparator; import java.util.HashMap; import java.util.Map; import java.util.Objects; class SummaryExtractorTest { + SummaryExtractor summaryExtractor; + @BeforeEach + public void setUp() { + summaryExtractor = new SummaryExtractor(255); + } + + @Test + public void testSummaryFilter() throws IOException { + String html = readClassPathFile("html/monadnock.html"); + var doc = Jsoup.parse(html); + var filter = new SummaryExtractionFilter(); + doc.filter(filter); + + filter.statistics.entrySet().stream().sorted(Comparator.comparing(e -> -e.getValue().textLength())) + .filter(e -> e.getValue().textToTagRatio() > 0.75) + .filter(e -> e.getValue().isElement()) + .filter(e -> e.getValue().textLength() > 32) + .filter(e -> e.getValue().pos() < filter.cnt / 2.) 
+ .limit(5) + .forEach(e -> { + System.out.println(e.getKey().nodeName() + ":" + e.getValue() + " / " + e.getValue().textToTagRatio()); + System.out.println(e.getValue().text()); + }); + } + @Test + public void testSummaryFilter3() throws IOException { + var data = Path.of("/home/vlofgren/Code/tmp-data/url-327999153"); + String html = Files.readString(data); + var doc = Jsoup.parse(html); + var filter = new SummaryExtractionFilter(); + doc.filter(filter); + + filter.getSummary(255); + } + @Test + public void testSummaryFilter2() throws IOException { + var data = Path.of("/home/vlofgren/Code/tmp-data/"); + + System.out.println("Running"); + + var fos = new PrintWriter(new FileOutputStream("/tmp/summaryDiff.html")); + fos.println(""); + + for (var file : Objects.requireNonNull(data.toFile().listFiles())) { + + var doc = Jsoup.parse(Files.readString(file.toPath())); + fos.println(""); + fos.println(""); + } + + fos.println("
" + file.getName() + "
"); + var filter = new SummaryExtractionFilter(); + + doc.select("header,nav,#header,#nav,#navigation,.header,.nav,.navigation,ul,li").remove(); + doc.filter(filter); + var ret = filter.getSummary(255); + + fos.println(ret); + fos.println(""); + String summary = summaryExtractor.extractSummary(Jsoup.parse(Files.readString(file.toPath()))); + fos.println(summary); + fos.println("
"); + fos.flush(); + } @Test void extractSurrey() throws IOException { String html = readClassPathFile("html/summarization/surrey.html"); - SummaryExtractor se = new SummaryExtractor(255); + var doc = Jsoup.parse(html); + String summary = summaryExtractor.extractSummary(doc); - String summary = se.extractSummary(Jsoup.parse(html)); Assertions.assertFalse(summary.isBlank()); @@ -27,9 +96,8 @@ class SummaryExtractorTest { @Test void extractSurrey1() throws IOException { String html = readClassPathFile("html/summarization/surrey.html.1"); - SummaryExtractor se = new SummaryExtractor(255); - - String summary = se.extractSummary(Jsoup.parse(html)); + var doc = Jsoup.parse(html); + String summary = summaryExtractor.extractSummary(doc); Assertions.assertFalse(summary.isBlank()); @@ -39,9 +107,8 @@ class SummaryExtractorTest { @Test void extract187() throws IOException { String html = readClassPathFile("html/summarization/187.shtml"); - SummaryExtractor se = new SummaryExtractor(255); - - String summary = se.extractSummary(Jsoup.parse(html)); + var doc = Jsoup.parse(html); + String summary = summaryExtractor.extractSummary(doc); Assertions.assertFalse(summary.isBlank()); @@ -51,9 +118,9 @@ class SummaryExtractorTest { @Test void extractMonadnock() throws IOException { String html = readClassPathFile("html/monadnock.html"); - SummaryExtractor se = new SummaryExtractor(255); - String summary = se.extractSummary(Jsoup.parse(html)); + var doc = Jsoup.parse(html); + String summary = summaryExtractor.extractSummary(doc); Assertions.assertFalse(summary.isBlank()); @@ -63,9 +130,9 @@ class SummaryExtractorTest { @Test public void testWorkSet() throws IOException { var workSet = readWorkSet(); - SummaryExtractor se = new SummaryExtractor(255); workSet.forEach((path, str) -> { - String summary = se.extractSummary(Jsoup.parse(str)); + var doc = Jsoup.parse(str); + String summary = summaryExtractor.extractSummary(doc); System.out.println(path + ": " + summary); }); } @@ -85,4 +152,5 
@@ class SummaryExtractorTest { } return result; } + } \ No newline at end of file