(feature-extraction) Add new DocumentHeaders class encapsulating HTTP headers.

Also adds a few new HTML features for CDNs and S3 hosting, for use in ranking and query refinement.
This commit is contained in:
Viktor Lofgren 2024-11-11 13:24:58 +01:00
parent 5cc71ae586
commit 8b8bf0748f
25 changed files with 196 additions and 60 deletions

View File

@ -16,6 +16,9 @@ public enum HtmlFeature {
KEBAB_CASE_URL("special:kcurl"), // https://www.example.com/urls-that-look-like-this/ KEBAB_CASE_URL("special:kcurl"), // https://www.example.com/urls-that-look-like-this/
LONG_URL("special:longurl"), LONG_URL("special:longurl"),
CLOUDFLARE_FEATURE("special:cloudflare"),
CDN_FEATURE("special:cdn"),
VIEWPORT("special:viewport"), VIEWPORT("special:viewport"),
COOKIES("special:cookies"), COOKIES("special:cookies"),
@ -60,6 +63,8 @@ public enum HtmlFeature {
DOFOLLOW_LINK("special:dofollow"), DOFOLLOW_LINK("special:dofollow"),
APPLE_TOUCH_ICON("special:appleicon"), APPLE_TOUCH_ICON("special:appleicon"),
S3_FEATURE("special:s3"),
UNKNOWN("special:uncategorized"); UNKNOWN("special:uncategorized");

View File

@ -0,0 +1,60 @@
package nu.marginalia.converting.model;
import java.util.*;
import java.util.regex.Pattern;
/**
 * Encapsulates the HTTP headers of a document.
 *
 * <p>The raw header block is split into lines on runs of LF / CRLF, and every
 * line containing a colon is parsed into a (name, value) pair.  Header names
 * are matched case-insensitively; a name that occurs several times keeps all
 * of its values in the order they appeared.  Lines without a colon (or with
 * an empty name before the colon) are kept in {@link #eachLine()} but are not
 * available through the keyed lookups.
 */
public class DocumentHeaders {
    /** The header block as it was passed in; a {@code null} input is stored as {@code ""}. */
    public final String raw;

    /** Parsed header values keyed by lower-cased (locale-independent) header name. */
    private final Map<String, List<String>> headers = new HashMap<>();

    /** Immutable list of the non-empty lines of {@link #raw}, computed once. */
    private final List<String> lines;

    private static final Pattern NEWLINE_PATTERN = Pattern.compile("(\r?\n)+");

    /**
     * Parses a raw header block.
     *
     * @param raw the header lines separated by LF or CRLF; may be {@code null},
     *            which is treated like an empty string
     */
    public DocumentHeaders(String raw) {
        this.raw = Objects.requireNonNullElse(raw, "");
        // Use a private static helper rather than the overridable eachLine(),
        // so a subclass can never observe a half-constructed instance.
        this.lines = splitLines(this.raw);

        for (String line : lines) {
            int colonIndex = line.indexOf(':');
            if (colonIndex <= 0) {
                // No colon at all, or nothing in front of it: not a header field
                continue;
            }

            String key = normalize(line.substring(0, colonIndex).trim());
            if (key.isEmpty()) {
                continue;
            }

            String value = line.substring(colonIndex + 1).trim();
            headers.computeIfAbsent(key, k -> new ArrayList<>()).add(value);
        }
    }

    /**
     * Returns all values recorded for a header.
     *
     * @param key the header name, in any case
     * @return an unmodifiable list of the trimmed values in order of appearance,
     *         or an empty list if the header is absent
     */
    public List<String> get(String key) {
        List<String> values = headers.get(normalize(key));
        if (values == null) {
            return List.of();
        }
        // Never leak the internal mutable list to callers
        return Collections.unmodifiableList(values);
    }

    /**
     * Returns the non-empty lines of the raw header block, in their original case.
     *
     * @return an immutable list, empty if the raw block is blank
     */
    public List<String> eachLine() {
        return lines;
    }

    /**
     * Returns the non-empty lines of the raw header block, lower-cased.
     *
     * @return an immutable list, empty if the raw block is blank
     */
    public List<String> eachLineLowercase() {
        return splitLines(normalize(raw));
    }

    /**
     * Tests whether a header with the given name is present.
     *
     * @param key the header name, in any case
     */
    public boolean contains(String key) {
        return headers.containsKey(normalize(key));
    }

    /**
     * Tests whether a header has exactly the given value (case-sensitive on the value).
     *
     * @param key   the header name, in any case
     * @param value the trimmed value to look for
     */
    public boolean contains(String key, String value) {
        return headers.getOrDefault(normalize(key), List.of()).contains(value);
    }

    /**
     * Tests whether a header has the given value, ignoring case on both name and value.
     *
     * @param key   the header name, in any case
     * @param value the trimmed value to look for, in any case
     */
    public boolean containsIgnoreCase(String key, String value) {
        return headers.getOrDefault(normalize(key), List.of())
                .stream()
                .anyMatch(candidate -> candidate.equalsIgnoreCase(value));
    }

    /**
     * Lower-cases with {@link Locale#ROOT} so the result does not depend on the
     * JVM's default locale (e.g. Turkish, where 'I' would become a dotless 'ı').
     */
    private static String normalize(String text) {
        return text.toLowerCase(Locale.ROOT);
    }

    /** Splits text on runs of line breaks, dropping the empty fragment a leading line break produces. */
    private static List<String> splitLines(String text) {
        if (text.isBlank()) {
            return List.of();
        }

        List<String> result = new ArrayList<>();
        for (String line : NEWLINE_PATTERN.split(text)) {
            if (!line.isEmpty()) {
                result.add(line);
            }
        }
        return List.copyOf(result);
    }
}

View File

@ -1,5 +1,6 @@
package nu.marginalia.converting.processor.logic; package nu.marginalia.converting.processor.logic;
import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.model.GeneratorType; import nu.marginalia.converting.model.GeneratorType;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
@ -12,7 +13,7 @@ import java.util.List;
public class DocumentGeneratorExtractor { public class DocumentGeneratorExtractor {
private static final String defaultValue = "unset"; private static final String defaultValue = "unset";
public DocumentGenerator detectGenerator(Document doc, String responseHeaders) { public DocumentGenerator detectGenerator(Document doc, DocumentHeaders responseHeaders) {
var tags = doc.select("meta[name=generator]"); var tags = doc.select("meta[name=generator]");
@ -76,7 +77,7 @@ public class DocumentGeneratorExtractor {
} }
// Fallback logic when there is no meta tag // Fallback logic when there is no meta tag
private DocumentGenerator fingerprintServerTech(Document doc, String responseHeaders) { private DocumentGenerator fingerprintServerTech(Document doc, DocumentHeaders responseHeaders) {
for (var comment : doc.getElementsByTag("head").comments()) { for (var comment : doc.getElementsByTag("head").comments()) {
String data = comment.getData(); String data = comment.getData();
@ -149,8 +150,7 @@ public class DocumentGeneratorExtractor {
return DocumentGenerator.of("gatsby"); return DocumentGenerator.of("gatsby");
} }
String[] headers = responseHeaders.toLowerCase().split("\n+"); for (var header : responseHeaders.eachLineLowercase()) {
for (var header : headers) {
if (header.contains("x-drupal-cache")) { if (header.contains("x-drupal-cache")) {
return DocumentGenerator.of("drupal"); return DocumentGenerator.of("drupal");
} }
@ -169,7 +169,7 @@ public class DocumentGeneratorExtractor {
} }
// These should be all the way down as they are the most generic // These should be all the way down as they are the most generic
for (var header : headers) { for (var header : responseHeaders.eachLineLowercase()) {
if (header.contains("server: mastodon")) { if (header.contains("server: mastodon")) {
return DocumentGenerator.of("mastodon"); return DocumentGenerator.of("mastodon");
} }

View File

@ -2,6 +2,7 @@ package nu.marginalia.converting.processor.logic;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.Singleton; import com.google.inject.Singleton;
import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.classifier.adblock.AdblockSimulator; import nu.marginalia.converting.processor.classifier.adblock.AdblockSimulator;
import nu.marginalia.converting.processor.classifier.adblock.GoogleAnwersSpamDetector; import nu.marginalia.converting.processor.classifier.adblock.GoogleAnwersSpamDetector;
import nu.marginalia.converting.processor.classifier.topic.RecipeDetector; import nu.marginalia.converting.processor.classifier.topic.RecipeDetector;
@ -84,7 +85,7 @@ public class FeatureExtractor {
this.googleAnwersSpamDetector = googleAnwersSpamDetector; this.googleAnwersSpamDetector = googleAnwersSpamDetector;
} }
public Set<HtmlFeature> getFeatures(EdgeUrl url, Document doc, DocumentLanguageData dld) { public Set<HtmlFeature> getFeatures(EdgeUrl url, Document doc, DocumentHeaders headers, DocumentLanguageData dld) {
final Set<HtmlFeature> features = new HashSet<>(); final Set<HtmlFeature> features = new HashSet<>();
final Elements scriptTags = doc.getElementsByTag("script"); final Elements scriptTags = doc.getElementsByTag("script");
@ -313,6 +314,30 @@ public class FeatureExtractor {
} }
} }
// check for cloudflare headers
if (headers.contains("Cf-Ray") || headers.containsIgnoreCase("server", "Cloudflare"))
{
features.add(HtmlFeature.CLOUDFLARE_FEATURE);
features.add(HtmlFeature.CDN_FEATURE);
}
// check for amazon cloudfront headers
if (headers.contains("X-Amz-Cf-Id"))
{
features.add(HtmlFeature.CDN_FEATURE);
}
// check for fastly headers
if (headers.contains("x-fastly-request-id"))
{
features.add(HtmlFeature.CDN_FEATURE);
}
// check for s3 hosting
if (headers.containsIgnoreCase("server", "AmazonS3")) {
features.add(HtmlFeature.S3_FEATURE);
}
if (recipeDetector.testP(dld) > 0.5) if (recipeDetector.testP(dld) > 0.5)
features.add(HtmlFeature.CATEGORY_FOOD); features.add(HtmlFeature.CATEGORY_FOOD);
// these should be mutually exclusive // these should be mutually exclusive

View File

@ -3,6 +3,7 @@ package nu.marginalia.converting.processor.plugin;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.name.Named; import com.google.inject.name.Named;
import nu.marginalia.converting.model.DisqualifiedException; import nu.marginalia.converting.model.DisqualifiedException;
import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.model.GeneratorType; import nu.marginalia.converting.model.GeneratorType;
import nu.marginalia.converting.model.ProcessedDocumentDetails; import nu.marginalia.converting.model.ProcessedDocumentDetails;
import nu.marginalia.converting.processor.DocumentClass; import nu.marginalia.converting.processor.DocumentClass;
@ -39,7 +40,6 @@ import org.slf4j.LoggerFactory;
import java.net.URISyntaxException; import java.net.URISyntaxException;
import java.util.EnumSet; import java.util.EnumSet;
import java.util.HashSet; import java.util.HashSet;
import java.util.Objects;
import java.util.Set; import java.util.Set;
import static nu.marginalia.converting.model.DisqualifiedException.DisqualificationReason; import static nu.marginalia.converting.model.DisqualifiedException.DisqualificationReason;
@ -127,10 +127,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
} }
final EdgeUrl url = new EdgeUrl(crawledDocument.url); final EdgeUrl url = new EdgeUrl(crawledDocument.url);
final DocumentHeaders documentHeaders = new DocumentHeaders(crawledDocument.headers);
final var generatorParts = documentGeneratorExtractor.detectGenerator(doc, final var generatorParts = documentGeneratorExtractor.detectGenerator(doc, documentHeaders);
Objects.requireNonNullElse(crawledDocument.headers, "")
);
final var specialization = htmlProcessorSpecializations.select(generatorParts, url); final var specialization = htmlProcessorSpecializations.select(generatorParts, url);
@ -155,7 +154,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
documentLengthLogic.validateLength(dld, specialization.lengthModifier() * documentClass.lengthLimitModifier()); documentLengthLogic.validateLength(dld, specialization.lengthModifier() * documentClass.lengthLimitModifier());
final Set<HtmlFeature> features = featureExtractor.getFeatures(url, doc, dld); final Set<HtmlFeature> features = featureExtractor.getFeatures(url, doc, documentHeaders, dld);
ret.features = features; ret.features = features;
ret.quality = documentValuator.adjustQuality(quality, features); ret.quality = documentValuator.adjustQuality(quality, features);
@ -165,12 +164,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
throw new DisqualifiedException(DisqualificationReason.QUALITY); throw new DisqualifiedException(DisqualificationReason.QUALITY);
} }
PubDate pubDate = pubDateSniffer.getPubDate( PubDate pubDate = pubDateSniffer.getPubDate(documentHeaders, url, doc, standard, true);
Objects.requireNonNullElse(crawledDocument.headers, ""),
url,
doc,
standard,
true);
EnumSet<DocumentFlags> documentFlags = documentFlags(features, generatorParts.type()); EnumSet<DocumentFlags> documentFlags = documentFlags(features, generatorParts.type());

View File

@ -1,5 +1,6 @@
package nu.marginalia.converting.processor.pubdate; package nu.marginalia.converting.processor.pubdate;
import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.html.HtmlStandard;
@ -9,5 +10,5 @@ import java.util.Optional;
public interface PubDateHeuristic { public interface PubDateHeuristic {
Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard); Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard);
} }

View File

@ -1,5 +1,6 @@
package nu.marginalia.converting.processor.pubdate; package nu.marginalia.converting.processor.pubdate;
import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.heuristic.*; import nu.marginalia.converting.processor.pubdate.heuristic.*;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.PubDate;
@ -37,7 +38,7 @@ public class PubDateSniffer {
heuristics.add(new PubDateHeuristicGuessFromHtmlStandard()); heuristics.add(new PubDateHeuristicGuessFromHtmlStandard());
} }
public PubDate getPubDate(String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard, boolean runExpensive) { public PubDate getPubDate(DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard, boolean runExpensive) {
final PubDateEffortLevel effortLevel = runExpensive ? PubDateEffortLevel.HIGH : PubDateEffortLevel.LOW; final PubDateEffortLevel effortLevel = runExpensive ? PubDateEffortLevel.HIGH : PubDateEffortLevel.LOW;
for (var heuristic : heuristics) { for (var heuristic : heuristics) {

View File

@ -1,5 +1,6 @@
package nu.marginalia.converting.processor.pubdate.heuristic; package nu.marginalia.converting.processor.pubdate.heuristic;
import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel; import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic; import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser; import nu.marginalia.converting.processor.pubdate.PubDateParser;
@ -18,7 +19,7 @@ import java.util.Optional;
public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic { public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {
@Override @Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
if (effortLevel == PubDateEffortLevel.LOW) if (effortLevel == PubDateEffortLevel.LOW)
return Optional.empty(); return Optional.empty();

View File

@ -1,5 +1,6 @@
package nu.marginalia.converting.processor.pubdate.heuristic; package nu.marginalia.converting.processor.pubdate.heuristic;
import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel; import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateFromHtmlStandard; import nu.marginalia.converting.processor.pubdate.PubDateFromHtmlStandard;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic; import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
@ -18,7 +19,7 @@ import java.util.Optional;
public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic { public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {
@Override @Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
if (effortLevel == PubDateEffortLevel.LOW) if (effortLevel == PubDateEffortLevel.LOW)
return Optional.empty(); return Optional.empty();

View File

@ -1,5 +1,6 @@
package nu.marginalia.converting.processor.pubdate.heuristic; package nu.marginalia.converting.processor.pubdate.heuristic;
import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel; import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic; import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser; import nu.marginalia.converting.processor.pubdate.PubDateParser;
@ -13,7 +14,7 @@ import java.util.Optional;
public class PubDateHeuristicGuessFromHtmlStandard implements PubDateHeuristic { public class PubDateHeuristicGuessFromHtmlStandard implements PubDateHeuristic {
@Override @Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
if (htmlStandard == HtmlStandard.UNKNOWN) if (htmlStandard == HtmlStandard.UNKNOWN)
return Optional.empty(); return Optional.empty();

View File

@ -1,5 +1,6 @@
package nu.marginalia.converting.processor.pubdate.heuristic; package nu.marginalia.converting.processor.pubdate.heuristic;
import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel; import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic; import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser; import nu.marginalia.converting.processor.pubdate.PubDateParser;
@ -13,7 +14,7 @@ import java.util.Optional;
public class PubDateHeuristicHtml5AnyTimeTag implements PubDateHeuristic { public class PubDateHeuristicHtml5AnyTimeTag implements PubDateHeuristic {
@Override @Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
// HTML5, alternative approach // HTML5, alternative approach
for (var tag : document.select("time")) { for (var tag : document.select("time")) {
var maybeDate = PubDateParser.attemptParseDate(tag.attr("datetime")); var maybeDate = PubDateParser.attemptParseDate(tag.attr("datetime"));

View File

@ -1,5 +1,6 @@
package nu.marginalia.converting.processor.pubdate.heuristic; package nu.marginalia.converting.processor.pubdate.heuristic;
import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel; import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic; import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser; import nu.marginalia.converting.processor.pubdate.PubDateParser;
@ -13,7 +14,7 @@ import java.util.Optional;
public class PubDateHeuristicHtml5ArticleDateTag implements PubDateHeuristic { public class PubDateHeuristicHtml5ArticleDateTag implements PubDateHeuristic {
@Override @Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
// HTML5 // HTML5
for (var tag : document.select("time[pubdate=\"pubdate\"]")) { for (var tag : document.select("time[pubdate=\"pubdate\"]")) {
var maybeDate = PubDateParser.attemptParseDate(tag.attr("datetime")); var maybeDate = PubDateParser.attemptParseDate(tag.attr("datetime"));

View File

@ -1,5 +1,6 @@
package nu.marginalia.converting.processor.pubdate.heuristic; package nu.marginalia.converting.processor.pubdate.heuristic;
import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel; import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic; import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser; import nu.marginalia.converting.processor.pubdate.PubDateParser;
@ -13,7 +14,7 @@ import java.util.Optional;
public class PubDateHeuristicHtml5ItempropDateTag implements PubDateHeuristic { public class PubDateHeuristicHtml5ItempropDateTag implements PubDateHeuristic {
@Override @Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
for (var tag : document.select("time[itemprop=\"datePublished\"]")) { for (var tag : document.select("time[itemprop=\"datePublished\"]")) {
var maybeDate = PubDateParser.attemptParseDate(tag.attr("content")); var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
if (maybeDate.isPresent()) { if (maybeDate.isPresent()) {

View File

@ -5,6 +5,7 @@ import com.google.gson.GsonBuilder;
import com.google.gson.JsonSyntaxException; import com.google.gson.JsonSyntaxException;
import com.google.gson.annotations.SerializedName; import com.google.gson.annotations.SerializedName;
import lombok.ToString; import lombok.ToString;
import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel; import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic; import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser; import nu.marginalia.converting.processor.pubdate.PubDateParser;
@ -21,7 +22,7 @@ import java.util.Optional;
public class PubDateHeuristicJSONLD implements PubDateHeuristic { public class PubDateHeuristicJSONLD implements PubDateHeuristic {
@Override @Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
for (var tag : document.select("script[type=\"application/ld+json\"]")) { for (var tag : document.select("script[type=\"application/ld+json\"]")) {
var maybeDate = parseLdJson(tag.data()) var maybeDate = parseLdJson(tag.data())
.flatMap(PubDateParser::attemptParseDate); .flatMap(PubDateParser::attemptParseDate);

View File

@ -1,5 +1,6 @@
package nu.marginalia.converting.processor.pubdate.heuristic; package nu.marginalia.converting.processor.pubdate.heuristic;
import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel; import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic; import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser; import nu.marginalia.converting.processor.pubdate.PubDateParser;
@ -8,22 +9,17 @@ import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import java.util.List;
import java.util.Optional; import java.util.Optional;
public class PubDateHeuristicLastModified implements PubDateHeuristic { public class PubDateHeuristicLastModified implements PubDateHeuristic {
@Override @Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
String lmString = "last-modified: "; List<String> lastModified = headers.get("last-modified");
int offset = headers.toLowerCase().indexOf(lmString); if (lastModified.isEmpty())
if (offset < 0)
return Optional.empty(); return Optional.empty();
int end = headers.indexOf('\n', offset); return PubDateParser.attemptParseDate(lastModified.getFirst());
if (end < 0) end = headers.length();
String lmDate = headers.substring(offset + lmString.length(), end);
return PubDateParser.attemptParseDate(lmDate);
} }
} }

View File

@ -1,5 +1,6 @@
package nu.marginalia.converting.processor.pubdate.heuristic; package nu.marginalia.converting.processor.pubdate.heuristic;
import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel; import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic; import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser; import nu.marginalia.converting.processor.pubdate.PubDateParser;
@ -13,7 +14,7 @@ import java.util.Optional;
public class PubDateHeuristicMicrodata implements PubDateHeuristic { public class PubDateHeuristicMicrodata implements PubDateHeuristic {
@Override @Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
for (var tag : document.select("meta[itemprop=\"datePublished\"]")) { for (var tag : document.select("meta[itemprop=\"datePublished\"]")) {
var maybeDate = PubDateParser.attemptParseDate(tag.attr("content")); var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));

View File

@ -1,5 +1,6 @@
package nu.marginalia.converting.processor.pubdate.heuristic; package nu.marginalia.converting.processor.pubdate.heuristic;
import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel; import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic; import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser; import nu.marginalia.converting.processor.pubdate.PubDateParser;
@ -13,7 +14,7 @@ import java.util.Optional;
public class PubDateHeuristicOpenGraph implements PubDateHeuristic { public class PubDateHeuristicOpenGraph implements PubDateHeuristic {
@Override @Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
// OG // OG
for (var tag : document.select("meta[property=\"article:published_time\"]")) { for (var tag : document.select("meta[property=\"article:published_time\"]")) {
var maybeDate = PubDateParser.attemptParseDate(tag.attr("content")); var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));

View File

@ -1,5 +1,6 @@
package nu.marginalia.converting.processor.pubdate.heuristic; package nu.marginalia.converting.processor.pubdate.heuristic;
import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel; import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic; import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser; import nu.marginalia.converting.processor.pubdate.PubDateParser;
@ -13,7 +14,7 @@ import java.util.Optional;
public class PubDateHeuristicRDFaTag implements PubDateHeuristic { public class PubDateHeuristicRDFaTag implements PubDateHeuristic {
@Override @Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
for (var tag : document.select("meta[property=\"datePublished\"]")) { for (var tag : document.select("meta[property=\"datePublished\"]")) {
var maybeDate = PubDateParser.attemptParseDate(tag.attr("content")); var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
if (maybeDate.isPresent()) { if (maybeDate.isPresent()) {

View File

@ -1,5 +1,6 @@
package nu.marginalia.converting.processor.pubdate.heuristic; package nu.marginalia.converting.processor.pubdate.heuristic;
import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel; import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic; import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser; import nu.marginalia.converting.processor.pubdate.PubDateParser;
@ -20,7 +21,7 @@ public class PubDateHeuristicUrlPatternPass1 implements PubDateHeuristic {
private static final int MIN_URL_PATTERN_YEAR = 2000; private static final int MIN_URL_PATTERN_YEAR = 2000;
@Override @Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) { public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
final String urlString = url.path; final String urlString = url.path;
var matcher = yearUrlPattern.matcher(urlString); var matcher = yearUrlPattern.matcher(urlString);

View File

@ -1,5 +1,6 @@
package nu.marginalia.converting.processor.pubdate.heuristic; package nu.marginalia.converting.processor.pubdate.heuristic;
import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel; import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic; import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser; import nu.marginalia.converting.processor.pubdate.PubDateParser;
@ -17,7 +18,7 @@ public class PubDateHeuristicUrlPatternPass2 implements PubDateHeuristic {
private static final Pattern yearUrlPattern = Pattern.compile("/\\d{4}/"); private static final Pattern yearUrlPattern = Pattern.compile("/\\d{4}/");
@Override @Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url,
Document document, HtmlStandard htmlStandard) { Document document, HtmlStandard htmlStandard) {
final String urlString = url.path; final String urlString = url.path;

View File

@ -0,0 +1,40 @@
package nu.marginalia.converting.model;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import java.util.List;
/** Unit tests covering line splitting and case-insensitive lookup in {@link DocumentHeaders}. */
public class DocumentHeadersTest {

    /** A null header block is exposed as an empty raw string with no lines. */
    @Test
    void testNull() {
        var parsed = new DocumentHeaders(null);

        Assertions.assertEquals("", parsed.raw);
        Assertions.assertEquals(List.of(), parsed.eachLine());
    }

    /** An empty header block yields an empty raw string and no lines. */
    @Test
    void testEmpty() {
        var parsed = new DocumentHeaders("");

        Assertions.assertEquals("", parsed.raw);
        Assertions.assertEquals(List.of(), parsed.eachLine());
    }

    /** Consecutive line breaks, in mixed LF/CRLF form, do not produce empty lines. */
    @Test
    void testDoubleNewlinesEmpty() {
        var parsed = new DocumentHeaders("server: test\r\n\n\r\nfoo: bar");
        List<String> expectedLines = List.of("server: test", "foo: bar");

        Assertions.assertEquals(expectedLines, parsed.eachLine());
    }

    /** Name and value are both matched without regard to case. */
    @Test
    void containsIgnoreCaseGivenKeyAndValueInDifferentCasesReturnsTrue() {
        var parsed = new DocumentHeaders("Key1: Value1\r\nkey2: value2\r\nKEY3: VALUE3");

        Assertions.assertTrue(parsed.containsIgnoreCase("key1", "value1"));
        Assertions.assertTrue(parsed.containsIgnoreCase("key2", "value2"));
        Assertions.assertTrue(parsed.containsIgnoreCase("key3", "value3"));
    }
}

View File

@ -1,5 +1,6 @@
package nu.marginalia.converting.processor.plugin.specialization; package nu.marginalia.converting.processor.plugin.specialization;
import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor; import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
import nu.marginalia.converting.processor.summary.SummaryExtractor; import nu.marginalia.converting.processor.summary.SummaryExtractor;
import nu.marginalia.test.CommonTestData; import nu.marginalia.test.CommonTestData;
@ -34,7 +35,7 @@ class JavadocSpecializationTest {
@Test @Test
void generatorExtraction() { void generatorExtraction() {
var gen = generatorExtractor.detectGenerator(Jsoup.parse(thread), ""); var gen = generatorExtractor.detectGenerator(Jsoup.parse(thread), new DocumentHeaders(""));
System.out.println(gen); System.out.println(gen);
} }

View File

@ -1,5 +1,6 @@
package nu.marginalia.converting.processor.plugin.specialization; package nu.marginalia.converting.processor.plugin.specialization;
import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor; import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
import nu.marginalia.converting.processor.summary.SummaryExtractor; import nu.marginalia.converting.processor.summary.SummaryExtractor;
import nu.marginalia.test.CommonTestData; import nu.marginalia.test.CommonTestData;
@ -37,8 +38,8 @@ class LemmySpecializationTest {
@Test @Test
void generatorExtraction() { void generatorExtraction() {
var generatorIndex = generatorExtractor.detectGenerator(Jsoup.parse(lemmyIndexHtml), ""); var generatorIndex = generatorExtractor.detectGenerator(Jsoup.parse(lemmyIndexHtml), new DocumentHeaders(""));
var generatorPost = generatorExtractor.detectGenerator(Jsoup.parse(lemmyPost), ""); var generatorPost = generatorExtractor.detectGenerator(Jsoup.parse(lemmyPost), new DocumentHeaders(""));
System.out.println(generatorIndex); System.out.println(generatorIndex);
System.out.println(generatorPost); System.out.println(generatorPost);

View File

@ -1,5 +1,6 @@
package nu.marginalia.converting.processor.plugin.specialization; package nu.marginalia.converting.processor.plugin.specialization;
import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor; import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
import nu.marginalia.converting.processor.summary.SummaryExtractor; import nu.marginalia.converting.processor.summary.SummaryExtractor;
import nu.marginalia.test.CommonTestData; import nu.marginalia.test.CommonTestData;
@ -34,7 +35,7 @@ class XenForoSpecializationTest {
@Test @Test
void generatorExtraction() { void generatorExtraction() {
var gen = generatorExtractor.detectGenerator(Jsoup.parse(thread), ""); var gen = generatorExtractor.detectGenerator(Jsoup.parse(thread), new DocumentHeaders(""));
System.out.println(gen); System.out.println(gen);
} }

View File

@ -1,8 +1,7 @@
package nu.marginalia.converting.processor.pubdate; package nu.marginalia.converting.processor.pubdate;
import nu.marginalia.WmsaHome; import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.pubdate.PubDateParser; import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateSniffer;
import nu.marginalia.converting.processor.pubdate.heuristic.PubDateHeuristicDOMParsingPass2; import nu.marginalia.converting.processor.pubdate.heuristic.PubDateHeuristicDOMParsingPass2;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.html.HtmlStandard;
@ -66,7 +65,7 @@ class PubDateSnifferTest {
@Test @Test
public void testHtml5A() throws URISyntaxException { public void testHtml5A() throws URISyntaxException {
var ret = dateSniffer.getPubDate("", var ret = dateSniffer.getPubDate(new DocumentHeaders(""),
new EdgeUrl("https://www.example.com/"), new EdgeUrl("https://www.example.com/"),
Jsoup.parse(""" Jsoup.parse("""
<!doctype html> <!doctype html>
@ -83,7 +82,7 @@ class PubDateSnifferTest {
@Test @Test
public void testHtml5B() throws URISyntaxException { public void testHtml5B() throws URISyntaxException {
var ret = dateSniffer.getPubDate("", var ret = dateSniffer.getPubDate(new DocumentHeaders(""),
new EdgeUrl("https://www.example.com/"), new EdgeUrl("https://www.example.com/"),
Jsoup.parse(""" Jsoup.parse("""
<!doctype html> <!doctype html>
@ -99,7 +98,7 @@ class PubDateSnifferTest {
@Test @Test
public void testHtml5C() throws URISyntaxException { public void testHtml5C() throws URISyntaxException {
var ret = dateSniffer.getPubDate("", var ret = dateSniffer.getPubDate(new DocumentHeaders(""),
new EdgeUrl("https://www.example.com/"), new EdgeUrl("https://www.example.com/"),
Jsoup.parse(""" Jsoup.parse("""
<!doctype html> <!doctype html>
@ -115,14 +114,14 @@ class PubDateSnifferTest {
@Test @Test
public void testProblemCases() throws IOException, URISyntaxException { public void testProblemCases() throws IOException, URISyntaxException {
var ret = dateSniffer.getPubDate("", var ret = dateSniffer.getPubDate(new DocumentHeaders(""),
new EdgeUrl("https://www.example.com/"), new EdgeUrl("https://www.example.com/"),
Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/The Switch to Linux Begins .html"))), HtmlStandard.HTML5, true); Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/The Switch to Linux Begins .html"))), HtmlStandard.HTML5, true);
assertFalse(ret.isEmpty()); assertFalse(ret.isEmpty());
assertEquals(2006, ret.year()); assertEquals(2006, ret.year());
ret = dateSniffer.getPubDate("", ret = dateSniffer.getPubDate(new DocumentHeaders(""),
new EdgeUrl("https://www.example.com/"), new EdgeUrl("https://www.example.com/"),
Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/Black Hat USA 2010 Understanding and Deploying DNSSEC by Paul Wouters and Patrick Nauber.html"))), HtmlStandard.XHTML, true); Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/Black Hat USA 2010 Understanding and Deploying DNSSEC by Paul Wouters and Patrick Nauber.html"))), HtmlStandard.XHTML, true);
@ -141,7 +140,7 @@ class PubDateSnifferTest {
@Test @Test
public void testMicrodata() throws URISyntaxException { public void testMicrodata() throws URISyntaxException {
var ret = dateSniffer.getPubDate("", var ret = dateSniffer.getPubDate(new DocumentHeaders(""),
new EdgeUrl("https://www.example.com/"), new EdgeUrl("https://www.example.com/"),
Jsoup.parse(""" Jsoup.parse("""
<!doctype html> <!doctype html>
@ -155,7 +154,7 @@ class PubDateSnifferTest {
@Test @Test
public void testRDFa() throws URISyntaxException { public void testRDFa() throws URISyntaxException {
var ret = dateSniffer.getPubDate("", var ret = dateSniffer.getPubDate(new DocumentHeaders(""),
new EdgeUrl("https://www.example.com/"), new EdgeUrl("https://www.example.com/"),
Jsoup.parse(""" Jsoup.parse("""
<!doctype html> <!doctype html>
@ -169,7 +168,7 @@ class PubDateSnifferTest {
@Test @Test
public void testLD() throws URISyntaxException { public void testLD() throws URISyntaxException {
var ret = dateSniffer.getPubDate("", var ret = dateSniffer.getPubDate(new DocumentHeaders(""),
new EdgeUrl("https://www.example.com/"), new EdgeUrl("https://www.example.com/"),
Jsoup.parse(""" Jsoup.parse("""
<!doctype html> <!doctype html>
@ -183,7 +182,7 @@ class PubDateSnifferTest {
@Test @Test
public void testLDWithGraph() throws URISyntaxException { public void testLDWithGraph() throws URISyntaxException {
var ret = dateSniffer.getPubDate("", var ret = dateSniffer.getPubDate(new DocumentHeaders(""),
new EdgeUrl("https://www.example.com/"), new EdgeUrl("https://www.example.com/"),
Jsoup.parse(""" Jsoup.parse("""
<!doctype html> <!doctype html>
@ -197,7 +196,7 @@ class PubDateSnifferTest {
@Test @Test
public void testPath() throws URISyntaxException { public void testPath() throws URISyntaxException {
var ret = dateSniffer.getPubDate("", var ret = dateSniffer.getPubDate(new DocumentHeaders(""),
new EdgeUrl("https://www.example.com/articles/2022/04/how-to-detect-dates"), new EdgeUrl("https://www.example.com/articles/2022/04/how-to-detect-dates"),
Jsoup.parse(""" Jsoup.parse("""
<!doctype html> <!doctype html>
@ -212,7 +211,7 @@ class PubDateSnifferTest {
@Test @Test
public void testHeader() throws URISyntaxException { public void testHeader() throws URISyntaxException {
var ret = dateSniffer.getPubDate("content-type: application/pdf\netag: \"4fc0ba8a7f5090b6fa6be385dca206ec\"\nlast-modified: Thu, 03 Feb 2022 19:22:58 GMT\ncontent-length: 298819\ndate: Wed, 24 Aug 2022 19:48:52 GMT\ncache-control: public, no-transform, immutable, max-age\u003d31536000\naccess-control-expose-headers: Content-Length,Content-Disposition,Content-Range,Etag,Server-Timing,Vary,X-Cld-Error,X-Content-Type-Options\naccess-control-allow-origin: *\naccept-ranges: bytes\ntiming-allow-origin: *\nserver: Cloudinary\nstrict-transport-security: max-age\u003d604800\nx-content-type-options: nosniff\nserver-timing: akam;dur\u003d25;start\u003d2022-08-24T19:48:52.519Z;desc\u003dmiss,rtt;dur\u003d19,cloudinary;dur\u003d129;start\u003d2022-08-23T06:35:17.331Z\n", var ret = dateSniffer.getPubDate(new DocumentHeaders("content-type: application/pdf\netag: \"4fc0ba8a7f5090b6fa6be385dca206ec\"\nlast-modified: Thu, 03 Feb 2022 19:22:58 GMT\ncontent-length: 298819\ndate: Wed, 24 Aug 2022 19:48:52 GMT\ncache-control: public, no-transform, immutable, max-age\u003d31536000\naccess-control-expose-headers: Content-Length,Content-Disposition,Content-Range,Etag,Server-Timing,Vary,X-Cld-Error,X-Content-Type-Options\naccess-control-allow-origin: *\naccept-ranges: bytes\ntiming-allow-origin: *\nserver: Cloudinary\nstrict-transport-security: max-age\u003d604800\nx-content-type-options: nosniff\nserver-timing: akam;dur\u003d25;start\u003d2022-08-24T19:48:52.519Z;desc\u003dmiss,rtt;dur\u003d19,cloudinary;dur\u003d129;start\u003d2022-08-23T06:35:17.331Z\n"),
new EdgeUrl("https://www.example.com/"), new EdgeUrl("https://www.example.com/"),
Jsoup.parse(""" Jsoup.parse("""
<!doctype html> <!doctype html>
@ -227,7 +226,7 @@ class PubDateSnifferTest {
@Test @Test
public void testDOM() throws URISyntaxException { public void testDOM() throws URISyntaxException {
var ret = dateSniffer.getPubDate("", var ret = dateSniffer.getPubDate(new DocumentHeaders(""),
new EdgeUrl("https://www.example.com/"), new EdgeUrl("https://www.example.com/"),
Jsoup.parse(""" Jsoup.parse("""
<!doctype html> <!doctype html>
@ -253,7 +252,7 @@ class PubDateSnifferTest {
@Test @Test
public void testOldInvision() throws URISyntaxException { public void testOldInvision() throws URISyntaxException {
var ret = dateSniffer.getPubDate("", var ret = dateSniffer.getPubDate(new DocumentHeaders(""),
new EdgeUrl("https://www.example.com/"), new EdgeUrl("https://www.example.com/"),
Jsoup.parse(""" Jsoup.parse("""
<!doctype html> <!doctype html>