Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git
(feature-extraction) Add new DocumentHeaders class encapsulating HTTP headers.
Also adds a few new HTML features for CDNs and S3 hosting, for use in ranking and query refinement.
parent 5cc71ae586
commit 8b8bf0748f
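In outline, the change replaces raw header strings with a parsed DocumentHeaders object threaded through generator detection, feature extraction, and pub-date sniffing. A minimal sketch of the new call flow, under the names used in the diff below (construction of the extractors and the surrounding document-processing context is elided):

    // Sketch only: variable names follow the diff below.
    var documentHeaders = new DocumentHeaders(crawledDocument.headers); // null-tolerant

    var generatorParts = documentGeneratorExtractor.detectGenerator(doc, documentHeaders);
    var features = featureExtractor.getFeatures(url, doc, documentHeaders, dld);
    var pubDate = pubDateSniffer.getPubDate(documentHeaders, url, doc, standard, true);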
@@ -16,6 +16,9 @@ public enum HtmlFeature {
     KEBAB_CASE_URL("special:kcurl"), // https://www.example.com/urls-that-look-like-this/
     LONG_URL("special:longurl"),
 
+    CLOUDFLARE_FEATURE("special:cloudflare"),
+    CDN_FEATURE("special:cdn"),
+
     VIEWPORT("special:viewport"),
 
     COOKIES("special:cookies"),
@@ -60,6 +63,8 @@ public enum HtmlFeature {
     DOFOLLOW_LINK("special:dofollow"),
     APPLE_TOUCH_ICON("special:appleicon"),
 
+    S3_FEATURE("special:s3"),
+
     UNKNOWN("special:uncategorized");
@@ -0,0 +1,60 @@
+package nu.marginalia.converting.model;
+
+import java.util.*;
+import java.util.regex.Pattern;
+
+/** Encapsulates the HTTP headers of a document.
+ */
+public class DocumentHeaders {
+    public final String raw;
+
+    private final Map<String, List<String>> headers = new HashMap<>();
+
+    private static final Pattern NEWLINE_PATTERN = Pattern.compile("(\r?\n)+");
+
+    public DocumentHeaders(String raw) {
+        this.raw = Objects.requireNonNullElse(raw, "");
+
+        for (var line : eachLine()) {
+            int colonIndex = line.indexOf(':');
+
+            if (colonIndex == -1) continue;
+
+            String key = line.substring(0, colonIndex).trim().toLowerCase();
+            String value = line.substring(colonIndex + 1).trim();
+
+            headers.computeIfAbsent(key, k -> new ArrayList<>()).add(value);
+        }
+    }
+
+    public List<String> get(String key) {
+        return headers.getOrDefault(key.toLowerCase(), List.of());
+    }
+
+    public List<String> eachLine() {
+        if (raw.isBlank())
+            return List.of();
+
+        return List.of(NEWLINE_PATTERN.split(raw));
+    }
+
+    public List<String> eachLineLowercase() {
+        if (raw.isBlank())
+            return List.of();
+
+        return List.of(NEWLINE_PATTERN.split(raw.toLowerCase()));
+    }
+
+    public boolean contains(String key) {
+        return headers.containsKey(key.toLowerCase());
+    }
+
+    public boolean contains(String key, String value) {
+        return headers.getOrDefault(key.toLowerCase(), List.of()).contains(value);
+    }
+
+    public boolean containsIgnoreCase(String key, String value) {
+        return headers.getOrDefault(key.toLowerCase(), List.of())
+                .stream()
+                .map(String::toLowerCase)
+                .anyMatch(s -> s.equals(value.toLowerCase()));
+    }
+}
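In practice the class behaves as a case-insensitive multimap over the raw header blob. A minimal usage sketch (the header values here are illustrative, not taken from the commit):

    var headers = new DocumentHeaders("Server: AmazonS3\r\nCf-Ray: 12345");

    headers.get("server");                            // ["AmazonS3"]: keys are folded to lower case at parse time
    headers.contains("CF-RAY");                       // true: key lookup is case-insensitive
    headers.contains("server", "amazons3");           // false: two-argument contains() matches values exactly
    headers.containsIgnoreCase("server", "amazons3"); // true: value comparison folds case as well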
@@ -1,5 +1,6 @@
 package nu.marginalia.converting.processor.logic;
 
+import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.model.GeneratorType;
 import org.apache.commons.lang3.StringUtils;
 import org.jsoup.nodes.Document;
@@ -12,7 +13,7 @@ import java.util.List;
 public class DocumentGeneratorExtractor {
     private static final String defaultValue = "unset";
 
-    public DocumentGenerator detectGenerator(Document doc, String responseHeaders) {
+    public DocumentGenerator detectGenerator(Document doc, DocumentHeaders responseHeaders) {
 
         var tags = doc.select("meta[name=generator]");
 
@@ -76,7 +77,7 @@ public class DocumentGeneratorExtractor {
     }
 
     // Fallback logic when there is no meta tag
-    private DocumentGenerator fingerprintServerTech(Document doc, String responseHeaders) {
+    private DocumentGenerator fingerprintServerTech(Document doc, DocumentHeaders responseHeaders) {
 
         for (var comment : doc.getElementsByTag("head").comments()) {
             String data = comment.getData();
@@ -149,8 +150,7 @@ public class DocumentGeneratorExtractor {
             return DocumentGenerator.of("gatsby");
         }
 
-        String[] headers = responseHeaders.toLowerCase().split("\n+");
-        for (var header : headers) {
+        for (var header : responseHeaders.eachLineLowercase()) {
             if (header.contains("x-drupal-cache")) {
                 return DocumentGenerator.of("drupal");
             }
@@ -169,7 +169,7 @@ public class DocumentGeneratorExtractor {
         }
 
         // These should be all the way down as they are the most generic
-        for (var header : headers) {
+        for (var header : responseHeaders.eachLineLowercase()) {
             if (header.contains("server: mastodon")) {
                 return DocumentGenerator.of("mastodon");
             }
@@ -2,6 +2,7 @@ package nu.marginalia.converting.processor.logic;
 
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
+import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.classifier.adblock.AdblockSimulator;
 import nu.marginalia.converting.processor.classifier.adblock.GoogleAnwersSpamDetector;
 import nu.marginalia.converting.processor.classifier.topic.RecipeDetector;
@@ -84,7 +85,7 @@ public class FeatureExtractor {
         this.googleAnwersSpamDetector = googleAnwersSpamDetector;
     }
 
-    public Set<HtmlFeature> getFeatures(EdgeUrl url, Document doc, DocumentLanguageData dld) {
+    public Set<HtmlFeature> getFeatures(EdgeUrl url, Document doc, DocumentHeaders headers, DocumentLanguageData dld) {
         final Set<HtmlFeature> features = new HashSet<>();
 
         final Elements scriptTags = doc.getElementsByTag("script");
@@ -313,6 +314,30 @@ public class FeatureExtractor {
             }
         }
 
+        // check for cloudflare headers
+        if (headers.contains("Cf-Ray") || headers.containsIgnoreCase("server", "Cloudflare")) {
+            features.add(HtmlFeature.CLOUDFLARE_FEATURE);
+            features.add(HtmlFeature.CDN_FEATURE);
+        }
+
+        // check for amazon cloudfront headers
+        if (headers.contains("X-Amz-Cf-Id")) {
+            features.add(HtmlFeature.CDN_FEATURE);
+        }
+
+        // check for fastly headers
+        if (headers.contains("x-fastly-request-id")) {
+            features.add(HtmlFeature.CDN_FEATURE);
+        }
+
+        // check for s3 hosting
+        if (headers.containsIgnoreCase("server", "AmazonS3")) {
+            features.add(HtmlFeature.S3_FEATURE);
+        }
+
         if (recipeDetector.testP(dld) > 0.5)
             features.add(HtmlFeature.CATEGORY_FOOD);
         // these should be mutually exclusive
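These fingerprints can be exercised directly against DocumentHeaders; a small sketch with made-up header values:

    var cf = new DocumentHeaders("Cf-Ray: 8a1b2c3d\nServer: cloudflare");
    cf.contains("Cf-Ray");                         // true -> CLOUDFLARE_FEATURE and CDN_FEATURE
    cf.containsIgnoreCase("server", "Cloudflare"); // true despite the lower-case value in the header

    var s3 = new DocumentHeaders("Server: AmazonS3");
    s3.containsIgnoreCase("server", "AmazonS3");   // true -> S3_FEATURE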
@@ -3,6 +3,7 @@ package nu.marginalia.converting.processor.plugin;
 import com.google.inject.Inject;
 import com.google.inject.name.Named;
 import nu.marginalia.converting.model.DisqualifiedException;
+import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.model.GeneratorType;
 import nu.marginalia.converting.model.ProcessedDocumentDetails;
 import nu.marginalia.converting.processor.DocumentClass;
@@ -39,7 +40,6 @@ import org.slf4j.LoggerFactory;
 import java.net.URISyntaxException;
 import java.util.EnumSet;
 import java.util.HashSet;
-import java.util.Objects;
 import java.util.Set;
 
 import static nu.marginalia.converting.model.DisqualifiedException.DisqualificationReason;
@@ -127,10 +127,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         }
 
         final EdgeUrl url = new EdgeUrl(crawledDocument.url);
+        final DocumentHeaders documentHeaders = new DocumentHeaders(crawledDocument.headers);
 
-        final var generatorParts = documentGeneratorExtractor.detectGenerator(doc,
-                Objects.requireNonNullElse(crawledDocument.headers, "")
-        );
+        final var generatorParts = documentGeneratorExtractor.detectGenerator(doc, documentHeaders);
 
         final var specialization = htmlProcessorSpecializations.select(generatorParts, url);
 
@@ -155,7 +154,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
 
         documentLengthLogic.validateLength(dld, specialization.lengthModifier() * documentClass.lengthLimitModifier());
 
-        final Set<HtmlFeature> features = featureExtractor.getFeatures(url, doc, dld);
+        final Set<HtmlFeature> features = featureExtractor.getFeatures(url, doc, documentHeaders, dld);
 
         ret.features = features;
         ret.quality = documentValuator.adjustQuality(quality, features);
@@ -165,12 +164,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
             throw new DisqualifiedException(DisqualificationReason.QUALITY);
         }
 
-        PubDate pubDate = pubDateSniffer.getPubDate(
-                Objects.requireNonNullElse(crawledDocument.headers, ""),
-                url,
-                doc,
-                standard,
-                true);
+        PubDate pubDate = pubDateSniffer.getPubDate(documentHeaders, url, doc, standard, true);
 
         EnumSet<DocumentFlags> documentFlags = documentFlags(features, generatorParts.type());
@@ -1,5 +1,6 @@
 package nu.marginalia.converting.processor.pubdate;
 
+import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
 import nu.marginalia.model.html.HtmlStandard;
@@ -9,5 +10,5 @@ import java.util.Optional;
 
 public interface PubDateHeuristic {
 
-    Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard);
+    Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard);
 }
@@ -1,5 +1,6 @@
 package nu.marginalia.converting.processor.pubdate;
 
+import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.heuristic.*;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
@@ -37,7 +38,7 @@ public class PubDateSniffer {
         heuristics.add(new PubDateHeuristicGuessFromHtmlStandard());
     }
 
-    public PubDate getPubDate(String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard, boolean runExpensive) {
+    public PubDate getPubDate(DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard, boolean runExpensive) {
         final PubDateEffortLevel effortLevel = runExpensive ? PubDateEffortLevel.HIGH : PubDateEffortLevel.LOW;
 
         for (var heuristic : heuristics) {
@@ -1,5 +1,6 @@
 package nu.marginalia.converting.processor.pubdate.heuristic;
 
+import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
@@ -18,7 +19,7 @@ import java.util.Optional;
 public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {
 
     @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
         if (effortLevel == PubDateEffortLevel.LOW)
             return Optional.empty();
@@ -1,5 +1,6 @@
 package nu.marginalia.converting.processor.pubdate.heuristic;
 
+import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateFromHtmlStandard;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
@@ -18,7 +19,7 @@ import java.util.Optional;
 public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {
 
     @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
         if (effortLevel == PubDateEffortLevel.LOW)
             return Optional.empty();
@@ -1,5 +1,6 @@
 package nu.marginalia.converting.processor.pubdate.heuristic;
 
+import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
@@ -13,7 +14,7 @@ import java.util.Optional;
 public class PubDateHeuristicGuessFromHtmlStandard implements PubDateHeuristic {
 
     @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
         if (htmlStandard == HtmlStandard.UNKNOWN)
             return Optional.empty();
@@ -1,5 +1,6 @@
 package nu.marginalia.converting.processor.pubdate.heuristic;
 
+import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
@@ -13,7 +14,7 @@ import java.util.Optional;
 public class PubDateHeuristicHtml5AnyTimeTag implements PubDateHeuristic {
 
     @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
         // HTML5, alternative approach
         for (var tag : document.select("time")) {
             var maybeDate = PubDateParser.attemptParseDate(tag.attr("datetime"));
@@ -1,5 +1,6 @@
 package nu.marginalia.converting.processor.pubdate.heuristic;
 
+import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
@@ -13,7 +14,7 @@ import java.util.Optional;
 public class PubDateHeuristicHtml5ArticleDateTag implements PubDateHeuristic {
 
     @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
         // HTML5
         for (var tag : document.select("time[pubdate=\"pubdate\"]")) {
            var maybeDate = PubDateParser.attemptParseDate(tag.attr("datetime"));
@@ -1,5 +1,6 @@
 package nu.marginalia.converting.processor.pubdate.heuristic;
 
+import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
@@ -13,7 +14,7 @@ import java.util.Optional;
 public class PubDateHeuristicHtml5ItempropDateTag implements PubDateHeuristic {
 
     @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
         for (var tag : document.select("time[itemprop=\"datePublished\"]")) {
             var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
             if (maybeDate.isPresent()) {
@@ -5,6 +5,7 @@ import com.google.gson.GsonBuilder;
 import com.google.gson.JsonSyntaxException;
 import com.google.gson.annotations.SerializedName;
 import lombok.ToString;
+import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
@@ -21,7 +22,7 @@ import java.util.Optional;
 public class PubDateHeuristicJSONLD implements PubDateHeuristic {
 
     @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
         for (var tag : document.select("script[type=\"application/ld+json\"]")) {
             var maybeDate = parseLdJson(tag.data())
                     .flatMap(PubDateParser::attemptParseDate);
@@ -1,5 +1,6 @@
 package nu.marginalia.converting.processor.pubdate.heuristic;
 
+import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
@@ -8,22 +9,17 @@ import nu.marginalia.model.crawl.PubDate;
 import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;
 
+import java.util.List;
 import java.util.Optional;
 
 public class PubDateHeuristicLastModified implements PubDateHeuristic {
 
     @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
-        String lmString = "last-modified: ";
-        int offset = headers.toLowerCase().indexOf(lmString);
-
-        if (offset < 0)
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+        List<String> lastModified = headers.get("last-modified");
+        if (lastModified.isEmpty())
             return Optional.empty();
-        int end = headers.indexOf('\n', offset);
-        if (end < 0) end = headers.length();
 
-        String lmDate = headers.substring(offset + lmString.length(), end);
-        return PubDateParser.attemptParseDate(lmDate);
+        return PubDateParser.attemptParseDate(lastModified.getFirst());
     }
 
 }
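The rewrite trades hand-rolled offset arithmetic over the raw string for a direct multimap lookup; a short sketch (the date value mirrors the one in the test data further down):

    var headers = new DocumentHeaders("Last-Modified: Thu, 03 Feb 2022 19:22:58 GMT");
    List<String> lastModified = headers.get("last-modified"); // ["Thu, 03 Feb 2022 19:22:58 GMT"]
    // lastModified.getFirst() is then handed to PubDateParser.attemptParseDate(...)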
@@ -1,5 +1,6 @@
 package nu.marginalia.converting.processor.pubdate.heuristic;
 
+import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
@@ -13,7 +14,7 @@ import java.util.Optional;
 public class PubDateHeuristicMicrodata implements PubDateHeuristic {
 
     @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
 
         for (var tag : document.select("meta[itemprop=\"datePublished\"]")) {
             var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
@@ -1,5 +1,6 @@
 package nu.marginalia.converting.processor.pubdate.heuristic;
 
+import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
@@ -13,7 +14,7 @@ import java.util.Optional;
 public class PubDateHeuristicOpenGraph implements PubDateHeuristic {
 
     @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
         // OG
         for (var tag : document.select("meta[property=\"article:published_time\"]")) {
             var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
@@ -1,5 +1,6 @@
 package nu.marginalia.converting.processor.pubdate.heuristic;
 
+import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
@@ -13,7 +14,7 @@ import java.util.Optional;
 public class PubDateHeuristicRDFaTag implements PubDateHeuristic {
 
     @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
         for (var tag : document.select("meta[property=\"datePublished\"]")) {
             var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
             if (maybeDate.isPresent()) {
@@ -1,5 +1,6 @@
 package nu.marginalia.converting.processor.pubdate.heuristic;
 
+import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
@@ -20,7 +21,7 @@ public class PubDateHeuristicUrlPatternPass1 implements PubDateHeuristic {
     private static final int MIN_URL_PATTERN_YEAR = 2000;
 
     @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
         final String urlString = url.path;
 
         var matcher = yearUrlPattern.matcher(urlString);
@@ -1,5 +1,6 @@
 package nu.marginalia.converting.processor.pubdate.heuristic;
 
+import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
@@ -17,7 +18,7 @@ public class PubDateHeuristicUrlPatternPass2 implements PubDateHeuristic {
     private static final Pattern yearUrlPattern = Pattern.compile("/\\d{4}/");
 
     @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url,
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url,
                                    Document document, HtmlStandard htmlStandard) {
         final String urlString = url.path;
@@ -0,0 +1,40 @@
+package nu.marginalia.converting.model;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+import java.util.List;
+
+public class DocumentHeadersTest {
+
+    @Test
+    void testNull() {
+        DocumentHeaders headers = new DocumentHeaders(null);
+        Assertions.assertEquals("", headers.raw);
+        Assertions.assertEquals(List.of(), headers.eachLine());
+    }
+
+    @Test
+    void testEmpty() {
+        DocumentHeaders headers = new DocumentHeaders("");
+        Assertions.assertEquals("", headers.raw);
+        Assertions.assertEquals(List.of(), headers.eachLine());
+    }
+
+    @Test
+    void testDoubleNewlinesEmpty() {
+        DocumentHeaders headers = new DocumentHeaders("server: test\r\n\n\r\nfoo: bar");
+        Assertions.assertEquals(List.of("server: test", "foo: bar"), headers.eachLine());
+    }
+
+    @Test
+    void containsIgnoreCaseGivenKeyAndValueInDifferentCasesReturnsTrue() {
+        String raw = "Key1: Value1\r\nkey2: value2\r\nKEY3: VALUE3";
+        DocumentHeaders headers = new DocumentHeaders(raw);
+
+        Assertions.assertTrue(headers.containsIgnoreCase("key1", "value1"));
+        Assertions.assertTrue(headers.containsIgnoreCase("key2", "value2"));
+        Assertions.assertTrue(headers.containsIgnoreCase("key3", "value3"));
+    }
+}
@@ -1,5 +1,6 @@
 package nu.marginalia.converting.processor.plugin.specialization;
 
+import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
 import nu.marginalia.converting.processor.summary.SummaryExtractor;
 import nu.marginalia.test.CommonTestData;
@@ -34,7 +35,7 @@ class JavadocSpecializationTest {
 
     @Test
     void generatorExtraction() {
-        var gen = generatorExtractor.detectGenerator(Jsoup.parse(thread), "");
+        var gen = generatorExtractor.detectGenerator(Jsoup.parse(thread), new DocumentHeaders(""));
 
         System.out.println(gen);
     }
@@ -1,5 +1,6 @@
 package nu.marginalia.converting.processor.plugin.specialization;
 
+import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
 import nu.marginalia.converting.processor.summary.SummaryExtractor;
 import nu.marginalia.test.CommonTestData;
@@ -37,8 +38,8 @@ class LemmySpecializationTest {
 
     @Test
     void generatorExtraction() {
-        var generatorIndex = generatorExtractor.detectGenerator(Jsoup.parse(lemmyIndexHtml), "");
-        var generatorPost = generatorExtractor.detectGenerator(Jsoup.parse(lemmyPost), "");
+        var generatorIndex = generatorExtractor.detectGenerator(Jsoup.parse(lemmyIndexHtml), new DocumentHeaders(""));
+        var generatorPost = generatorExtractor.detectGenerator(Jsoup.parse(lemmyPost), new DocumentHeaders(""));
 
         System.out.println(generatorIndex);
         System.out.println(generatorPost);
@@ -1,5 +1,6 @@
 package nu.marginalia.converting.processor.plugin.specialization;
 
+import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
 import nu.marginalia.converting.processor.summary.SummaryExtractor;
 import nu.marginalia.test.CommonTestData;
@@ -34,7 +35,7 @@ class XenForoSpecializationTest {
 
     @Test
     void generatorExtraction() {
-        var gen = generatorExtractor.detectGenerator(Jsoup.parse(thread), "");
+        var gen = generatorExtractor.detectGenerator(Jsoup.parse(thread), new DocumentHeaders(""));
 
         System.out.println(gen);
     }
@@ -1,8 +1,7 @@
 package nu.marginalia.converting.processor.pubdate;
 
 import nu.marginalia.WmsaHome;
-import nu.marginalia.converting.processor.pubdate.PubDateParser;
-import nu.marginalia.converting.processor.pubdate.PubDateSniffer;
+import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.heuristic.PubDateHeuristicDOMParsingPass2;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.html.HtmlStandard;
@@ -66,7 +65,7 @@ class PubDateSnifferTest {
 
     @Test
     public void testHtml5A() throws URISyntaxException {
-        var ret = dateSniffer.getPubDate("",
+        var ret = dateSniffer.getPubDate(new DocumentHeaders(""),
                 new EdgeUrl("https://www.example.com/"),
                 Jsoup.parse("""
                         <!doctype html>
@@ -83,7 +82,7 @@ class PubDateSnifferTest {
 
     @Test
     public void testHtml5B() throws URISyntaxException {
-        var ret = dateSniffer.getPubDate("",
+        var ret = dateSniffer.getPubDate(new DocumentHeaders(""),
                 new EdgeUrl("https://www.example.com/"),
                 Jsoup.parse("""
                         <!doctype html>
@@ -99,7 +98,7 @@ class PubDateSnifferTest {
 
     @Test
    public void testHtml5C() throws URISyntaxException {
-        var ret = dateSniffer.getPubDate("",
+        var ret = dateSniffer.getPubDate(new DocumentHeaders(""),
                 new EdgeUrl("https://www.example.com/"),
                 Jsoup.parse("""
                         <!doctype html>
@@ -115,14 +114,14 @@ class PubDateSnifferTest {
 
     @Test
     public void testProblemCases() throws IOException, URISyntaxException {
-        var ret = dateSniffer.getPubDate("",
+        var ret = dateSniffer.getPubDate(new DocumentHeaders(""),
                 new EdgeUrl("https://www.example.com/"),
                 Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/The Switch to Linux Begins .html"))), HtmlStandard.HTML5, true);
 
         assertFalse(ret.isEmpty());
         assertEquals(2006, ret.year());
 
-        ret = dateSniffer.getPubDate("",
+        ret = dateSniffer.getPubDate(new DocumentHeaders(""),
                 new EdgeUrl("https://www.example.com/"),
                 Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/Black Hat USA 2010 Understanding and Deploying DNSSEC by Paul Wouters and Patrick Nauber.html"))), HtmlStandard.XHTML, true);
 
@@ -141,7 +140,7 @@ class PubDateSnifferTest {
 
     @Test
     public void testMicrodata() throws URISyntaxException {
-        var ret = dateSniffer.getPubDate("",
+        var ret = dateSniffer.getPubDate(new DocumentHeaders(""),
                 new EdgeUrl("https://www.example.com/"),
                 Jsoup.parse("""
                         <!doctype html>
@@ -155,7 +154,7 @@ class PubDateSnifferTest {
 
     @Test
     public void testRDFa() throws URISyntaxException {
-        var ret = dateSniffer.getPubDate("",
+        var ret = dateSniffer.getPubDate(new DocumentHeaders(""),
                 new EdgeUrl("https://www.example.com/"),
                 Jsoup.parse("""
                         <!doctype html>
@@ -169,7 +168,7 @@ class PubDateSnifferTest {
 
     @Test
     public void testLD() throws URISyntaxException {
-        var ret = dateSniffer.getPubDate("",
+        var ret = dateSniffer.getPubDate(new DocumentHeaders(""),
                 new EdgeUrl("https://www.example.com/"),
                 Jsoup.parse("""
                         <!doctype html>
@@ -183,7 +182,7 @@ class PubDateSnifferTest {
 
     @Test
     public void testLDWithGraph() throws URISyntaxException {
-        var ret = dateSniffer.getPubDate("",
+        var ret = dateSniffer.getPubDate(new DocumentHeaders(""),
                 new EdgeUrl("https://www.example.com/"),
                 Jsoup.parse("""
                         <!doctype html>
@@ -197,7 +196,7 @@ class PubDateSnifferTest {
 
     @Test
     public void testPath() throws URISyntaxException {
-        var ret = dateSniffer.getPubDate("",
+        var ret = dateSniffer.getPubDate(new DocumentHeaders(""),
                 new EdgeUrl("https://www.example.com/articles/2022/04/how-to-detect-dates"),
                 Jsoup.parse("""
                         <!doctype html>
@@ -212,7 +211,7 @@ class PubDateSnifferTest {
 
     @Test
     public void testHeader() throws URISyntaxException {
-        var ret = dateSniffer.getPubDate("content-type: application/pdf\netag: \"4fc0ba8a7f5090b6fa6be385dca206ec\"\nlast-modified: Thu, 03 Feb 2022 19:22:58 GMT\ncontent-length: 298819\ndate: Wed, 24 Aug 2022 19:48:52 GMT\ncache-control: public, no-transform, immutable, max-age\u003d31536000\naccess-control-expose-headers: Content-Length,Content-Disposition,Content-Range,Etag,Server-Timing,Vary,X-Cld-Error,X-Content-Type-Options\naccess-control-allow-origin: *\naccept-ranges: bytes\ntiming-allow-origin: *\nserver: Cloudinary\nstrict-transport-security: max-age\u003d604800\nx-content-type-options: nosniff\nserver-timing: akam;dur\u003d25;start\u003d2022-08-24T19:48:52.519Z;desc\u003dmiss,rtt;dur\u003d19,cloudinary;dur\u003d129;start\u003d2022-08-23T06:35:17.331Z\n",
+        var ret = dateSniffer.getPubDate(new DocumentHeaders("content-type: application/pdf\netag: \"4fc0ba8a7f5090b6fa6be385dca206ec\"\nlast-modified: Thu, 03 Feb 2022 19:22:58 GMT\ncontent-length: 298819\ndate: Wed, 24 Aug 2022 19:48:52 GMT\ncache-control: public, no-transform, immutable, max-age\u003d31536000\naccess-control-expose-headers: Content-Length,Content-Disposition,Content-Range,Etag,Server-Timing,Vary,X-Cld-Error,X-Content-Type-Options\naccess-control-allow-origin: *\naccept-ranges: bytes\ntiming-allow-origin: *\nserver: Cloudinary\nstrict-transport-security: max-age\u003d604800\nx-content-type-options: nosniff\nserver-timing: akam;dur\u003d25;start\u003d2022-08-24T19:48:52.519Z;desc\u003dmiss,rtt;dur\u003d19,cloudinary;dur\u003d129;start\u003d2022-08-23T06:35:17.331Z\n"),
                 new EdgeUrl("https://www.example.com/"),
                 Jsoup.parse("""
                         <!doctype html>
@@ -227,7 +226,7 @@ class PubDateSnifferTest {
 
     @Test
     public void testDOM() throws URISyntaxException {
-        var ret = dateSniffer.getPubDate("",
+        var ret = dateSniffer.getPubDate(new DocumentHeaders(""),
                 new EdgeUrl("https://www.example.com/"),
                 Jsoup.parse("""
                         <!doctype html>
@@ -253,7 +252,7 @@ class PubDateSnifferTest {
 
     @Test
     public void testOldInvision() throws URISyntaxException {
-        var ret = dateSniffer.getPubDate("",
+        var ret = dateSniffer.getPubDate(new DocumentHeaders(""),
                 new EdgeUrl("https://www.example.com/"),
                 Jsoup.parse("""
                         <!doctype html>