From 8b8bf0748f186f7de745a76f353b8d061fe23932 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 11 Nov 2024 13:24:58 +0100 Subject: [PATCH] (feature-extraction) Add new DocumentHeaders class encapsulating HTTP headers. Also adds a few new html features for CDNs and S3 hosting for use in ranking and query refinement. --- .../marginalia/model/crawl/HtmlFeature.java | 5 ++ .../converting/model/DocumentHeaders.java | 60 +++++++++++++++++++ .../logic/DocumentGeneratorExtractor.java | 10 ++-- .../processor/logic/FeatureExtractor.java | 27 ++++++++- .../plugin/HtmlDocumentProcessorPlugin.java | 16 ++--- .../processor/pubdate/PubDateHeuristic.java | 3 +- .../processor/pubdate/PubDateSniffer.java | 3 +- .../PubDateHeuristicDOMParsingPass1.java | 3 +- .../PubDateHeuristicDOMParsingPass2.java | 3 +- ...PubDateHeuristicGuessFromHtmlStandard.java | 3 +- .../PubDateHeuristicHtml5AnyTimeTag.java | 3 +- .../PubDateHeuristicHtml5ArticleDateTag.java | 3 +- .../PubDateHeuristicHtml5ItempropDateTag.java | 3 +- .../heuristic/PubDateHeuristicJSONLD.java | 3 +- .../PubDateHeuristicLastModified.java | 16 ++--- .../heuristic/PubDateHeuristicMicrodata.java | 3 +- .../heuristic/PubDateHeuristicOpenGraph.java | 3 +- .../heuristic/PubDateHeuristicRDFaTag.java | 3 +- .../PubDateHeuristicUrlPatternPass1.java | 3 +- .../PubDateHeuristicUrlPatternPass2.java | 3 +- .../converting/model/DocumentHeadersTest.java | 40 +++++++++++++ .../JavadocSpecializationTest.java | 3 +- .../LemmySpecializationTest.java | 5 +- .../XenForoSpecializationTest.java | 3 +- .../processor/pubdate/PubDateSnifferTest.java | 29 +++++---- 25 files changed, 196 insertions(+), 60 deletions(-) create mode 100644 code/processes/converting-process/java/nu/marginalia/converting/model/DocumentHeaders.java create mode 100644 code/processes/converting-process/test/nu/marginalia/converting/model/DocumentHeadersTest.java diff --git a/code/common/model/java/nu/marginalia/model/crawl/HtmlFeature.java 
b/code/common/model/java/nu/marginalia/model/crawl/HtmlFeature.java index 56232a4b..c92c6726 100644 --- a/code/common/model/java/nu/marginalia/model/crawl/HtmlFeature.java +++ b/code/common/model/java/nu/marginalia/model/crawl/HtmlFeature.java @@ -16,6 +16,9 @@ public enum HtmlFeature { KEBAB_CASE_URL("special:kcurl"), // https://www.example.com/urls-that-look-like-this/ LONG_URL("special:longurl"), + CLOUDFLARE_FEATURE("special:cloudflare"), + CDN_FEATURE("special:cdn"), + VIEWPORT("special:viewport"), COOKIES("special:cookies"), @@ -60,6 +63,8 @@ public enum HtmlFeature { DOFOLLOW_LINK("special:dofollow"), APPLE_TOUCH_ICON("special:appleicon"), + S3_FEATURE("special:s3"), + UNKNOWN("special:uncategorized"); diff --git a/code/processes/converting-process/java/nu/marginalia/converting/model/DocumentHeaders.java b/code/processes/converting-process/java/nu/marginalia/converting/model/DocumentHeaders.java new file mode 100644 index 00000000..c40de4e6 --- /dev/null +++ b/code/processes/converting-process/java/nu/marginalia/converting/model/DocumentHeaders.java @@ -0,0 +1,60 @@ +package nu.marginalia.converting.model; + +import java.util.*; +import java.util.regex.Pattern; + +/** Encapsulates the HTTP headers of a document. 
+ */ +public class DocumentHeaders { + public final String raw; + + private final Map> headers = new HashMap<>(); + + private static final Pattern NEWLINE_PATTERN = Pattern.compile("(\r?\n)+"); + + public DocumentHeaders(String raw) { + this.raw = Objects.requireNonNullElse(raw, ""); + + for (var line : eachLine()) { + int colonIndex = line.indexOf(':'); + + if (colonIndex == -1) continue; + + String key = line.substring(0, colonIndex).trim().toLowerCase(); + String value = line.substring(colonIndex + 1).trim(); + + headers.computeIfAbsent(key, k -> new ArrayList<>()).add(value); + } + } + + public List get(String key) { + return headers.getOrDefault(key.toLowerCase(), List.of()); + } + + public List eachLine() { + if (raw.isBlank()) + return List.of(); + + return List.of(NEWLINE_PATTERN.split(raw)); + } + + public List eachLineLowercase() { + if (raw.isBlank()) + return List.of(); + + return List.of(NEWLINE_PATTERN.split(raw.toLowerCase())); + } + + public boolean contains(String key) { + return headers.containsKey(key.toLowerCase()); + } + public boolean contains(String key, String value) { + return headers.getOrDefault(key.toLowerCase(), List.of()).contains(value); + } + public boolean containsIgnoreCase(String key, String value) { + return headers.getOrDefault(key.toLowerCase(), List.of()) + .stream() + .map(String::toLowerCase) + .anyMatch(s -> s.equals(value.toLowerCase())); + } +} diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java index dea7cefa..c3c9eac4 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java @@ -1,5 +1,6 @@ package nu.marginalia.converting.processor.logic; +import 
nu.marginalia.converting.model.DocumentHeaders; import nu.marginalia.converting.model.GeneratorType; import org.apache.commons.lang3.StringUtils; import org.jsoup.nodes.Document; @@ -12,7 +13,7 @@ import java.util.List; public class DocumentGeneratorExtractor { private static final String defaultValue = "unset"; - public DocumentGenerator detectGenerator(Document doc, String responseHeaders) { + public DocumentGenerator detectGenerator(Document doc, DocumentHeaders responseHeaders) { var tags = doc.select("meta[name=generator]"); @@ -76,7 +77,7 @@ public class DocumentGeneratorExtractor { } // Fallback logic when there is no meta tag - private DocumentGenerator fingerprintServerTech(Document doc, String responseHeaders) { + private DocumentGenerator fingerprintServerTech(Document doc, DocumentHeaders responseHeaders) { for (var comment : doc.getElementsByTag("head").comments()) { String data = comment.getData(); @@ -149,8 +150,7 @@ public class DocumentGeneratorExtractor { return DocumentGenerator.of("gatsby"); } - String[] headers = responseHeaders.toLowerCase().split("\n+"); - for (var header : headers) { + for (var header : responseHeaders.eachLineLowercase()) { if (header.contains("x-drupal-cache")) { return DocumentGenerator.of("drupal"); } @@ -169,7 +169,7 @@ public class DocumentGeneratorExtractor { } // These should be all the way down as they are the most generic - for (var header : headers) { + for (var header : responseHeaders.eachLineLowercase()) { if (header.contains("server: mastodon")) { return DocumentGenerator.of("mastodon"); } diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java index 3f08037f..f2bac097 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java +++ 
b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java @@ -2,6 +2,7 @@ package nu.marginalia.converting.processor.logic; import com.google.inject.Inject; import com.google.inject.Singleton; +import nu.marginalia.converting.model.DocumentHeaders; import nu.marginalia.converting.processor.classifier.adblock.AdblockSimulator; import nu.marginalia.converting.processor.classifier.adblock.GoogleAnwersSpamDetector; import nu.marginalia.converting.processor.classifier.topic.RecipeDetector; @@ -84,7 +85,7 @@ public class FeatureExtractor { this.googleAnwersSpamDetector = googleAnwersSpamDetector; } - public Set getFeatures(EdgeUrl url, Document doc, DocumentLanguageData dld) { + public Set getFeatures(EdgeUrl url, Document doc, DocumentHeaders headers, DocumentLanguageData dld) { final Set features = new HashSet<>(); final Elements scriptTags = doc.getElementsByTag("script"); @@ -313,6 +314,30 @@ public class FeatureExtractor { } } + // check for cloudflare headers + if (headers.contains("Cf-Ray") || headers.containsIgnoreCase("server", "Cloudflare")) + { + features.add(HtmlFeature.CLOUDFLARE_FEATURE); + features.add(HtmlFeature.CDN_FEATURE); + } + + // check for amazon cloudfront headers + if (headers.contains("X-Amz-Cf-Id")) + { + features.add(HtmlFeature.CDN_FEATURE); + } + + // check for fastly headers + if (headers.contains("x-fastly-request-id")) + { + features.add(HtmlFeature.CDN_FEATURE); + } + + // check for s3 hosting + if (headers.containsIgnoreCase("server", "AmazonS3")) { + features.add(HtmlFeature.S3_FEATURE); + } + if (recipeDetector.testP(dld) > 0.5) features.add(HtmlFeature.CATEGORY_FOOD); // these should be mutually exclusive diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java index 3e93e5cd..09b4a360 100644 --- 
a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java @@ -3,6 +3,7 @@ package nu.marginalia.converting.processor.plugin; import com.google.inject.Inject; import com.google.inject.name.Named; import nu.marginalia.converting.model.DisqualifiedException; +import nu.marginalia.converting.model.DocumentHeaders; import nu.marginalia.converting.model.GeneratorType; import nu.marginalia.converting.model.ProcessedDocumentDetails; import nu.marginalia.converting.processor.DocumentClass; @@ -39,7 +40,6 @@ import org.slf4j.LoggerFactory; import java.net.URISyntaxException; import java.util.EnumSet; import java.util.HashSet; -import java.util.Objects; import java.util.Set; import static nu.marginalia.converting.model.DisqualifiedException.DisqualificationReason; @@ -127,10 +127,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin } final EdgeUrl url = new EdgeUrl(crawledDocument.url); + final DocumentHeaders documentHeaders = new DocumentHeaders(crawledDocument.headers); - final var generatorParts = documentGeneratorExtractor.detectGenerator(doc, - Objects.requireNonNullElse(crawledDocument.headers, "") - ); + final var generatorParts = documentGeneratorExtractor.detectGenerator(doc, documentHeaders); final var specialization = htmlProcessorSpecializations.select(generatorParts, url); @@ -155,7 +154,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin documentLengthLogic.validateLength(dld, specialization.lengthModifier() * documentClass.lengthLimitModifier()); - final Set features = featureExtractor.getFeatures(url, doc, dld); + final Set features = featureExtractor.getFeatures(url, doc, documentHeaders, dld); ret.features = features; ret.quality = documentValuator.adjustQuality(quality, features); @@ -165,12 +164,7 @@ public class 
HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin throw new DisqualifiedException(DisqualificationReason.QUALITY); } - PubDate pubDate = pubDateSniffer.getPubDate( - Objects.requireNonNullElse(crawledDocument.headers, ""), - url, - doc, - standard, - true); + PubDate pubDate = pubDateSniffer.getPubDate(documentHeaders, url, doc, standard, true); EnumSet documentFlags = documentFlags(features, generatorParts.type()); diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/PubDateHeuristic.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/PubDateHeuristic.java index d348c75a..23e84b48 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/PubDateHeuristic.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/PubDateHeuristic.java @@ -1,5 +1,6 @@ package nu.marginalia.converting.processor.pubdate; +import nu.marginalia.converting.model.DocumentHeaders; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.html.HtmlStandard; @@ -9,5 +10,5 @@ import java.util.Optional; public interface PubDateHeuristic { - Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard); + Optional apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard); } diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/PubDateSniffer.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/PubDateSniffer.java index 4ec1c4f9..3bfa5b48 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/PubDateSniffer.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/PubDateSniffer.java @@ -1,5 +1,6 @@ package 
nu.marginalia.converting.processor.pubdate; +import nu.marginalia.converting.model.DocumentHeaders; import nu.marginalia.converting.processor.pubdate.heuristic.*; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.crawl.PubDate; @@ -37,7 +38,7 @@ public class PubDateSniffer { heuristics.add(new PubDateHeuristicGuessFromHtmlStandard()); } - public PubDate getPubDate(String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard, boolean runExpensive) { + public PubDate getPubDate(DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard, boolean runExpensive) { final PubDateEffortLevel effortLevel = runExpensive ? PubDateEffortLevel.HIGH : PubDateEffortLevel.LOW; for (var heuristic : heuristics) { diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java index 5ab86c17..fe16b9a2 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java @@ -1,5 +1,6 @@ package nu.marginalia.converting.processor.pubdate.heuristic; +import nu.marginalia.converting.model.DocumentHeaders; import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel; import nu.marginalia.converting.processor.pubdate.PubDateHeuristic; import nu.marginalia.converting.processor.pubdate.PubDateParser; @@ -18,7 +19,7 @@ import java.util.Optional; public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic { @Override - public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { + public Optional apply(PubDateEffortLevel effortLevel, 
DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { if (effortLevel == PubDateEffortLevel.LOW) return Optional.empty(); diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java index eb42a3c4..2d01f93b 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java @@ -1,5 +1,6 @@ package nu.marginalia.converting.processor.pubdate.heuristic; +import nu.marginalia.converting.model.DocumentHeaders; import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel; import nu.marginalia.converting.processor.pubdate.PubDateFromHtmlStandard; import nu.marginalia.converting.processor.pubdate.PubDateHeuristic; @@ -18,7 +19,7 @@ import java.util.Optional; public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic { @Override - public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { + public Optional apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { if (effortLevel == PubDateEffortLevel.LOW) return Optional.empty(); diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java index cffbe178..b7e35b2c 100644 --- 
a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java @@ -1,5 +1,6 @@ package nu.marginalia.converting.processor.pubdate.heuristic; +import nu.marginalia.converting.model.DocumentHeaders; import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel; import nu.marginalia.converting.processor.pubdate.PubDateHeuristic; import nu.marginalia.converting.processor.pubdate.PubDateParser; @@ -13,7 +14,7 @@ import java.util.Optional; public class PubDateHeuristicGuessFromHtmlStandard implements PubDateHeuristic { @Override - public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { + public Optional apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { if (htmlStandard == HtmlStandard.UNKNOWN) return Optional.empty(); diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java index 1d4d6a90..76da2a31 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java @@ -1,5 +1,6 @@ package nu.marginalia.converting.processor.pubdate.heuristic; +import nu.marginalia.converting.model.DocumentHeaders; import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel; import nu.marginalia.converting.processor.pubdate.PubDateHeuristic; import 
nu.marginalia.converting.processor.pubdate.PubDateParser; @@ -13,7 +14,7 @@ import java.util.Optional; public class PubDateHeuristicHtml5AnyTimeTag implements PubDateHeuristic { @Override - public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { + public Optional apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { // HTML5, alternative approach for (var tag : document.select("time")) { var maybeDate = PubDateParser.attemptParseDate(tag.attr("datetime")); diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java index e484e40b..13bd1c41 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java @@ -1,5 +1,6 @@ package nu.marginalia.converting.processor.pubdate.heuristic; +import nu.marginalia.converting.model.DocumentHeaders; import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel; import nu.marginalia.converting.processor.pubdate.PubDateHeuristic; import nu.marginalia.converting.processor.pubdate.PubDateParser; @@ -13,7 +14,7 @@ import java.util.Optional; public class PubDateHeuristicHtml5ArticleDateTag implements PubDateHeuristic { @Override - public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { + public Optional apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { // HTML5 for (var tag : 
document.select("time[pubdate=\"pubdate\"]")) { var maybeDate = PubDateParser.attemptParseDate(tag.attr("datetime")); diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java index 0cedf842..1bd1c4bb 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java @@ -1,5 +1,6 @@ package nu.marginalia.converting.processor.pubdate.heuristic; +import nu.marginalia.converting.model.DocumentHeaders; import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel; import nu.marginalia.converting.processor.pubdate.PubDateHeuristic; import nu.marginalia.converting.processor.pubdate.PubDateParser; @@ -13,7 +14,7 @@ import java.util.Optional; public class PubDateHeuristicHtml5ItempropDateTag implements PubDateHeuristic { @Override - public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { + public Optional apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { for (var tag : document.select("time[itemprop=\"datePublished\"]")) { var maybeDate = PubDateParser.attemptParseDate(tag.attr("content")); if (maybeDate.isPresent()) { diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicJSONLD.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicJSONLD.java index 27d25208..8ab16825 100644 --- 
a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicJSONLD.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicJSONLD.java @@ -5,6 +5,7 @@ import com.google.gson.GsonBuilder; import com.google.gson.JsonSyntaxException; import com.google.gson.annotations.SerializedName; import lombok.ToString; +import nu.marginalia.converting.model.DocumentHeaders; import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel; import nu.marginalia.converting.processor.pubdate.PubDateHeuristic; import nu.marginalia.converting.processor.pubdate.PubDateParser; @@ -21,7 +22,7 @@ import java.util.Optional; public class PubDateHeuristicJSONLD implements PubDateHeuristic { @Override - public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { + public Optional apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { for (var tag : document.select("script[type=\"application/ld+json\"]")) { var maybeDate = parseLdJson(tag.data()) .flatMap(PubDateParser::attemptParseDate); diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicLastModified.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicLastModified.java index 0bc1a4bc..09caaa39 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicLastModified.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicLastModified.java @@ -1,5 +1,6 @@ package nu.marginalia.converting.processor.pubdate.heuristic; +import nu.marginalia.converting.model.DocumentHeaders; import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel; 
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic; import nu.marginalia.converting.processor.pubdate.PubDateParser; @@ -8,22 +9,17 @@ import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.html.HtmlStandard; import org.jsoup.nodes.Document; +import java.util.List; import java.util.Optional; public class PubDateHeuristicLastModified implements PubDateHeuristic { @Override - public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { - String lmString = "last-modified: "; - int offset = headers.toLowerCase().indexOf(lmString); - - if (offset < 0) + public Optional apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { + List lastModified = headers.get("last-modified"); + if (lastModified.isEmpty()) return Optional.empty(); - int end = headers.indexOf('\n', offset); - if (end < 0) end = headers.length(); - - String lmDate = headers.substring(offset + lmString.length(), end); - return PubDateParser.attemptParseDate(lmDate); + return PubDateParser.attemptParseDate(lastModified.getFirst()); } } diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicMicrodata.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicMicrodata.java index 04858bbd..9d0def44 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicMicrodata.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicMicrodata.java @@ -1,5 +1,6 @@ package nu.marginalia.converting.processor.pubdate.heuristic; +import nu.marginalia.converting.model.DocumentHeaders; import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel; import nu.marginalia.converting.processor.pubdate.PubDateHeuristic; 
import nu.marginalia.converting.processor.pubdate.PubDateParser; @@ -13,7 +14,7 @@ import java.util.Optional; public class PubDateHeuristicMicrodata implements PubDateHeuristic { @Override - public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { + public Optional apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { for (var tag : document.select("meta[itemprop=\"datePublished\"]")) { var maybeDate = PubDateParser.attemptParseDate(tag.attr("content")); diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicOpenGraph.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicOpenGraph.java index 0c1bc6d3..84a1c13b 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicOpenGraph.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicOpenGraph.java @@ -1,5 +1,6 @@ package nu.marginalia.converting.processor.pubdate.heuristic; +import nu.marginalia.converting.model.DocumentHeaders; import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel; import nu.marginalia.converting.processor.pubdate.PubDateHeuristic; import nu.marginalia.converting.processor.pubdate.PubDateParser; @@ -13,7 +14,7 @@ import java.util.Optional; public class PubDateHeuristicOpenGraph implements PubDateHeuristic { @Override - public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { + public Optional apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { // OG for (var tag : document.select("meta[property=\"article:published_time\"]")) { var maybeDate = 
PubDateParser.attemptParseDate(tag.attr("content")); diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicRDFaTag.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicRDFaTag.java index a158bd9a..5a91e123 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicRDFaTag.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicRDFaTag.java @@ -1,5 +1,6 @@ package nu.marginalia.converting.processor.pubdate.heuristic; +import nu.marginalia.converting.model.DocumentHeaders; import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel; import nu.marginalia.converting.processor.pubdate.PubDateHeuristic; import nu.marginalia.converting.processor.pubdate.PubDateParser; @@ -13,7 +14,7 @@ import java.util.Optional; public class PubDateHeuristicRDFaTag implements PubDateHeuristic { @Override - public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { + public Optional apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { for (var tag : document.select("meta[property=\"datePublished\"]")) { var maybeDate = PubDateParser.attemptParseDate(tag.attr("content")); if (maybeDate.isPresent()) { diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicUrlPatternPass1.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicUrlPatternPass1.java index 16a55c5f..ed6aa129 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicUrlPatternPass1.java +++ 
b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicUrlPatternPass1.java @@ -1,5 +1,6 @@ package nu.marginalia.converting.processor.pubdate.heuristic; +import nu.marginalia.converting.model.DocumentHeaders; import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel; import nu.marginalia.converting.processor.pubdate.PubDateHeuristic; import nu.marginalia.converting.processor.pubdate.PubDateParser; @@ -20,7 +21,7 @@ public class PubDateHeuristicUrlPatternPass1 implements PubDateHeuristic { private static final int MIN_URL_PATTERN_YEAR = 2000; @Override - public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { + public Optional apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { final String urlString = url.path; var matcher = yearUrlPattern.matcher(urlString); diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicUrlPatternPass2.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicUrlPatternPass2.java index e5226266..ceb436fe 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicUrlPatternPass2.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicUrlPatternPass2.java @@ -1,5 +1,6 @@ package nu.marginalia.converting.processor.pubdate.heuristic; +import nu.marginalia.converting.model.DocumentHeaders; import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel; import nu.marginalia.converting.processor.pubdate.PubDateHeuristic; import nu.marginalia.converting.processor.pubdate.PubDateParser; @@ -17,7 +18,7 @@ public class PubDateHeuristicUrlPatternPass2 implements PubDateHeuristic { private 
static final Pattern yearUrlPattern = Pattern.compile("/\\d{4}/"); @Override - public Optional apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, + public Optional apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { final String urlString = url.path; diff --git a/code/processes/converting-process/test/nu/marginalia/converting/model/DocumentHeadersTest.java b/code/processes/converting-process/test/nu/marginalia/converting/model/DocumentHeadersTest.java new file mode 100644 index 00000000..dc481685 --- /dev/null +++ b/code/processes/converting-process/test/nu/marginalia/converting/model/DocumentHeadersTest.java @@ -0,0 +1,40 @@ +package nu.marginalia.converting.model; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.util.List; + +public class DocumentHeadersTest { + + @Test + void testNull() { + DocumentHeaders headers = new DocumentHeaders(null); + Assertions.assertEquals("", headers.raw); + Assertions.assertEquals(List.of(), headers.eachLine()); + } + + @Test + void testEmpty() { + DocumentHeaders headers = new DocumentHeaders(""); + Assertions.assertEquals("", headers.raw); + Assertions.assertEquals(List.of(), headers.eachLine()); + } + + @Test + void testDoubleNewlinesEmpty() { + DocumentHeaders headers = new DocumentHeaders("server: test\r\n\n\r\nfoo: bar"); + Assertions.assertEquals(List.of("server: test", "foo: bar"), headers.eachLine()); + } + + @Test + void containsIgnoreCaseGivenKeyAndValueInDifferentCasesReturnsTrue() { + String raw = "Key1: Value1\r\nkey2: value2\r\nKEY3: VALUE3"; + DocumentHeaders headers = new DocumentHeaders(raw); + + Assertions.assertTrue(headers.containsIgnoreCase("key1", "value1")); + Assertions.assertTrue(headers.containsIgnoreCase("key2", "value2")); + Assertions.assertTrue(headers.containsIgnoreCase("key3", "value3")); + } + +} \ No newline at end of file diff --git 
a/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/JavadocSpecializationTest.java b/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/JavadocSpecializationTest.java index a9b60211..253fc673 100644 --- a/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/JavadocSpecializationTest.java +++ b/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/JavadocSpecializationTest.java @@ -1,5 +1,6 @@ package nu.marginalia.converting.processor.plugin.specialization; +import nu.marginalia.converting.model.DocumentHeaders; import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor; import nu.marginalia.converting.processor.summary.SummaryExtractor; import nu.marginalia.test.CommonTestData; @@ -34,7 +35,7 @@ class JavadocSpecializationTest { @Test void generatorExtraction() { - var gen = generatorExtractor.detectGenerator(Jsoup.parse(thread), ""); + var gen = generatorExtractor.detectGenerator(Jsoup.parse(thread), new DocumentHeaders("")); System.out.println(gen); } diff --git a/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/LemmySpecializationTest.java b/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/LemmySpecializationTest.java index 6d72bb51..178796df 100644 --- a/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/LemmySpecializationTest.java +++ b/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/LemmySpecializationTest.java @@ -1,5 +1,6 @@ package nu.marginalia.converting.processor.plugin.specialization; +import nu.marginalia.converting.model.DocumentHeaders; import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor; import 
nu.marginalia.converting.processor.summary.SummaryExtractor; import nu.marginalia.test.CommonTestData; @@ -37,8 +38,8 @@ class LemmySpecializationTest { @Test void generatorExtraction() { - var generatorIndex = generatorExtractor.detectGenerator(Jsoup.parse(lemmyIndexHtml), ""); - var generatorPost = generatorExtractor.detectGenerator(Jsoup.parse(lemmyPost), ""); + var generatorIndex = generatorExtractor.detectGenerator(Jsoup.parse(lemmyIndexHtml), new DocumentHeaders("")); + var generatorPost = generatorExtractor.detectGenerator(Jsoup.parse(lemmyPost), new DocumentHeaders("")); System.out.println(generatorIndex); System.out.println(generatorPost); diff --git a/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/XenForoSpecializationTest.java b/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/XenForoSpecializationTest.java index 581dea3c..3efd2900 100644 --- a/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/XenForoSpecializationTest.java +++ b/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/XenForoSpecializationTest.java @@ -1,5 +1,6 @@ package nu.marginalia.converting.processor.plugin.specialization; +import nu.marginalia.converting.model.DocumentHeaders; import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor; import nu.marginalia.converting.processor.summary.SummaryExtractor; import nu.marginalia.test.CommonTestData; @@ -34,7 +35,7 @@ class XenForoSpecializationTest { @Test void generatorExtraction() { - var gen = generatorExtractor.detectGenerator(Jsoup.parse(thread), ""); + var gen = generatorExtractor.detectGenerator(Jsoup.parse(thread), new DocumentHeaders("")); System.out.println(gen); } diff --git a/code/processes/converting-process/test/nu/marginalia/converting/processor/pubdate/PubDateSnifferTest.java 
b/code/processes/converting-process/test/nu/marginalia/converting/processor/pubdate/PubDateSnifferTest.java index c0ef172c..89896af0 100644 --- a/code/processes/converting-process/test/nu/marginalia/converting/processor/pubdate/PubDateSnifferTest.java +++ b/code/processes/converting-process/test/nu/marginalia/converting/processor/pubdate/PubDateSnifferTest.java @@ -1,8 +1,7 @@ package nu.marginalia.converting.processor.pubdate; import nu.marginalia.WmsaHome; -import nu.marginalia.converting.processor.pubdate.PubDateParser; -import nu.marginalia.converting.processor.pubdate.PubDateSniffer; +import nu.marginalia.converting.model.DocumentHeaders; import nu.marginalia.converting.processor.pubdate.heuristic.PubDateHeuristicDOMParsingPass2; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.html.HtmlStandard; @@ -66,7 +65,7 @@ class PubDateSnifferTest { @Test public void testHtml5A() throws URISyntaxException { - var ret = dateSniffer.getPubDate("", + var ret = dateSniffer.getPubDate(new DocumentHeaders(""), new EdgeUrl("https://www.example.com/"), Jsoup.parse(""" @@ -83,7 +82,7 @@ class PubDateSnifferTest { @Test public void testHtml5B() throws URISyntaxException { - var ret = dateSniffer.getPubDate("", + var ret = dateSniffer.getPubDate(new DocumentHeaders(""), new EdgeUrl("https://www.example.com/"), Jsoup.parse(""" @@ -99,7 +98,7 @@ class PubDateSnifferTest { @Test public void testHtml5C() throws URISyntaxException { - var ret = dateSniffer.getPubDate("", + var ret = dateSniffer.getPubDate(new DocumentHeaders(""), new EdgeUrl("https://www.example.com/"), Jsoup.parse(""" @@ -115,14 +114,14 @@ class PubDateSnifferTest { @Test public void testProblemCases() throws IOException, URISyntaxException { - var ret = dateSniffer.getPubDate("", + var ret = dateSniffer.getPubDate(new DocumentHeaders(""), new EdgeUrl("https://www.example.com/"), Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/The Switch to Linux Begins .html"))), 
HtmlStandard.HTML5, true); assertFalse(ret.isEmpty()); assertEquals(2006, ret.year()); - ret = dateSniffer.getPubDate("", + ret = dateSniffer.getPubDate(new DocumentHeaders(""), new EdgeUrl("https://www.example.com/"), Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/Black Hat USA 2010 Understanding and Deploying DNSSEC by Paul Wouters and Patrick Nauber.html"))), HtmlStandard.XHTML, true); @@ -141,7 +140,7 @@ class PubDateSnifferTest { @Test public void testMicrodata() throws URISyntaxException { - var ret = dateSniffer.getPubDate("", + var ret = dateSniffer.getPubDate(new DocumentHeaders(""), new EdgeUrl("https://www.example.com/"), Jsoup.parse(""" @@ -155,7 +154,7 @@ class PubDateSnifferTest { @Test public void testRDFa() throws URISyntaxException { - var ret = dateSniffer.getPubDate("", + var ret = dateSniffer.getPubDate(new DocumentHeaders(""), new EdgeUrl("https://www.example.com/"), Jsoup.parse(""" @@ -169,7 +168,7 @@ class PubDateSnifferTest { @Test public void testLD() throws URISyntaxException { - var ret = dateSniffer.getPubDate("", + var ret = dateSniffer.getPubDate(new DocumentHeaders(""), new EdgeUrl("https://www.example.com/"), Jsoup.parse(""" @@ -183,7 +182,7 @@ class PubDateSnifferTest { @Test public void testLDWithGraph() throws URISyntaxException { - var ret = dateSniffer.getPubDate("", + var ret = dateSniffer.getPubDate(new DocumentHeaders(""), new EdgeUrl("https://www.example.com/"), Jsoup.parse(""" @@ -197,7 +196,7 @@ class PubDateSnifferTest { @Test public void testPath() throws URISyntaxException { - var ret = dateSniffer.getPubDate("", + var ret = dateSniffer.getPubDate(new DocumentHeaders(""), new EdgeUrl("https://www.example.com/articles/2022/04/how-to-detect-dates"), Jsoup.parse(""" @@ -212,7 +211,7 @@ class PubDateSnifferTest { @Test public void testHeader() throws URISyntaxException { - var ret = dateSniffer.getPubDate("content-type: application/pdf\netag: \"4fc0ba8a7f5090b6fa6be385dca206ec\"\nlast-modified: Thu, 
03 Feb 2022 19:22:58 GMT\ncontent-length: 298819\ndate: Wed, 24 Aug 2022 19:48:52 GMT\ncache-control: public, no-transform, immutable, max-age\u003d31536000\naccess-control-expose-headers: Content-Length,Content-Disposition,Content-Range,Etag,Server-Timing,Vary,X-Cld-Error,X-Content-Type-Options\naccess-control-allow-origin: *\naccept-ranges: bytes\ntiming-allow-origin: *\nserver: Cloudinary\nstrict-transport-security: max-age\u003d604800\nx-content-type-options: nosniff\nserver-timing: akam;dur\u003d25;start\u003d2022-08-24T19:48:52.519Z;desc\u003dmiss,rtt;dur\u003d19,cloudinary;dur\u003d129;start\u003d2022-08-23T06:35:17.331Z\n", + var ret = dateSniffer.getPubDate(new DocumentHeaders("content-type: application/pdf\netag: \"4fc0ba8a7f5090b6fa6be385dca206ec\"\nlast-modified: Thu, 03 Feb 2022 19:22:58 GMT\ncontent-length: 298819\ndate: Wed, 24 Aug 2022 19:48:52 GMT\ncache-control: public, no-transform, immutable, max-age\u003d31536000\naccess-control-expose-headers: Content-Length,Content-Disposition,Content-Range,Etag,Server-Timing,Vary,X-Cld-Error,X-Content-Type-Options\naccess-control-allow-origin: *\naccept-ranges: bytes\ntiming-allow-origin: *\nserver: Cloudinary\nstrict-transport-security: max-age\u003d604800\nx-content-type-options: nosniff\nserver-timing: akam;dur\u003d25;start\u003d2022-08-24T19:48:52.519Z;desc\u003dmiss,rtt;dur\u003d19,cloudinary;dur\u003d129;start\u003d2022-08-23T06:35:17.331Z\n"), new EdgeUrl("https://www.example.com/"), Jsoup.parse(""" @@ -227,7 +226,7 @@ class PubDateSnifferTest { @Test public void testDOM() throws URISyntaxException { - var ret = dateSniffer.getPubDate("", + var ret = dateSniffer.getPubDate(new DocumentHeaders(""), new EdgeUrl("https://www.example.com/"), Jsoup.parse(""" @@ -253,7 +252,7 @@ class PubDateSnifferTest { @Test public void testOldInvision() throws URISyntaxException { - var ret = dateSniffer.getPubDate("", + var ret = dateSniffer.getPubDate(new DocumentHeaders(""), new EdgeUrl("https://www.example.com/"), 
Jsoup.parse("""