Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-23 21:18:58 +00:00

GUI fixes and cleanups (#122)

Co-authored-by: vlofgren <vlofgren@gmail.com>
Co-authored-by: vlofgren <vlofgren@marginalia.nu>
Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/122

parent c6abbc12f6 · commit e676d8729e
CONTRIBUTING.md (new file, 42 lines)
@@ -0,0 +1,42 @@
# Contributing

At present this is mostly a solo project, but external contributions are very welcome.

This is a bit of a special project, in part because a search engine isn't like a text editor that you can just download and tinker with; and in part because it's as much a research project as it is a search engine.

If you have an idea for a cool change, send an email to <kontakt@marginalia.nu> and we can discuss its feasibility.

Search is essentially a fractal of interesting problems, so even if you don't have an idea, just a skillset (really any), odds are there's something interesting I could point you to.

## Release and branches

The search engine has a release cycle of once per 6-8 weeks, coinciding with the crawling cycle, when model-breaking changes and changes to the crawler can be introduced.

## Quick Set Up

There is a [Set Up Guide](https://git.marginalia.nu/marginalia/marginalia.nu/wiki/Setup/Search) in the wiki. It has a small tendency to oxidize rather rapidly, since the project currently does not have a lot of contributors to test it. If you find a problem with the guide, email <kontakt@marginalia.nu>.

## Documentation

What documentation exists resides here:

https://git.marginalia.nu/marginalia/marginalia.nu/wiki
DocumentsCompiler.java
@@ -30,7 +30,7 @@ public class DocumentsCompiler {
         var details = doc.details;

         if (details != null) {
-            ret.add(new LoadProcessedDocument(doc.url, doc.state, details.title, details.description, HtmlFeature.encode(details.features), details.standard, details.length, details.hashCode, details.quality));
+            ret.add(new LoadProcessedDocument(doc.url, doc.state, details.title, details.description, HtmlFeature.encode(details.features), details.standard, details.length, details.hashCode, details.quality, details.pubYear));
         }
         else {
             ret.add(new LoadProcessedDocumentWithError(doc.url, doc.state, doc.stateReason));
LoadProcessedDocument.java
@@ -7,6 +7,8 @@ import nu.marginalia.wmsa.edge.model.EdgeUrl;
 import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
 import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState;

+import javax.annotation.Nullable;
+
 public record LoadProcessedDocument(EdgeUrl url,
                                     EdgeUrlState state,
@@ -16,7 +18,8 @@ public record LoadProcessedDocument(EdgeUrl url,
                                     EdgeHtmlStandard standard,
                                     int length,
                                     long hash,
-                                    double quality) implements Instruction
+                                    double quality,
+                                    @Nullable Integer pubYear) implements Instruction
 {
     @Override
     public void apply(Interpreter interpreter) {
SqlLoadProcessedDocument.java
@@ -8,6 +8,7 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import java.sql.SQLException;
+import java.sql.Types;
 import java.util.List;

 import static java.sql.Statement.SUCCESS_NO_INFO;
@@ -34,10 +35,11 @@ public class SqlLoadProcessedDocument {
                 IN FEATURES INT,
                 IN STANDARD VARCHAR(32),
                 IN QUALITY DOUBLE,
-                IN HASH INT)
+                IN HASH INT,
+                IN PUB_YEAR SMALLINT)
             BEGIN
                 SET FOREIGN_KEY_CHECKS=0;
-                REPLACE INTO EC_PAGE_DATA(ID, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY) VALUES (URL_ID, TITLE, DESCRIPTION, LENGTH, STANDARD, FEATURES, HASH, QUALITY);
+                REPLACE INTO EC_PAGE_DATA(ID, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR) VALUES (URL_ID, TITLE, DESCRIPTION, LENGTH, STANDARD, FEATURES, HASH, QUALITY, PUB_YEAR);
                 UPDATE EC_URL SET VISITED=1, STATE=STATE WHERE ID=URL_ID;
                 SET FOREIGN_KEY_CHECKS=1;
             END
@@ -62,7 +64,7 @@ public class SqlLoadProcessedDocument {
     public void load(LoaderData data, List<LoadProcessedDocument> documents) {

         try (var conn = dataSource.getConnection();
-             var stmt = conn.prepareCall("CALL INSERT_PAGE_VISIT(?, ?, ?, ?, ?, ?, ?, ?, ?)")) {
+             var stmt = conn.prepareCall("CALL INSERT_PAGE_VISIT(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")) {
             conn.setAutoCommit(false);

             int cnt = 0; int batchOffset = 0;
@@ -82,6 +84,12 @@ public class SqlLoadProcessedDocument {
                 stmt.setString(7, doc.standard().name());
                 stmt.setDouble(8, doc.quality());
                 stmt.setInt(9, (int) doc.hash());
+                if (doc.pubYear() != null) {
+                    stmt.setShort(10, (short) doc.pubYear().intValue());
+                }
+                else {
+                    stmt.setNull(10, Types.SMALLINT);
+                }
                 stmt.addBatch();

                 if (++cnt == 100) {
ProcessedDocumentDetails.java
@@ -5,6 +5,7 @@ import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
 import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;

+import javax.annotation.Nullable;
 import java.util.List;
 import java.util.Set;

@@ -13,6 +14,9 @@ public class ProcessedDocumentDetails {
     public String title;
     public String description;

+    @Nullable
+    public Integer pubYear;
+
     public int length;
     public double quality;
     public long hashCode;
DocumentProcessor.java
@@ -13,6 +13,8 @@ import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException.DisqualificationReason;
 import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
 import nu.marginalia.wmsa.edge.converting.model.ProcessedDocumentDetails;
 import nu.marginalia.wmsa.edge.converting.processor.logic.*;
+import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate;
+import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateSniffer;
 import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
 import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
 import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
@@ -47,6 +49,7 @@ public class DocumentProcessor {
     private final TitleExtractor titleExtractor;
     private final DocumentKeywordExtractor keywordExtractor;
     private final SummaryExtractor summaryExtractor;
+    private final PubDateSniffer pubDateSniffer;

     private static final DocumentValuator documentValuator = new DocumentValuator();
     private static final LanguageFilter languageFilter = new LanguageFilter();
@@ -60,7 +63,8 @@ public class DocumentProcessor {
                              FeatureExtractor featureExtractor,
                              TitleExtractor titleExtractor,
                              DocumentKeywordExtractor keywordExtractor,
-                             SummaryExtractor summaryExtractor)
+                             SummaryExtractor summaryExtractor,
+                             PubDateSniffer pubDateSniffer)
     {
         this.minDocumentLength = minDocumentLength;
         this.minDocumentQuality = minDocumentQuality;
@@ -69,6 +73,7 @@ public class DocumentProcessor {
         this.titleExtractor = titleExtractor;
         this.keywordExtractor = keywordExtractor;
         this.summaryExtractor = summaryExtractor;
+        this.pubDateSniffer = pubDateSniffer;
     }

     public ProcessedDocument makeDisqualifiedStub(CrawledDocument crawledDocument) {
@@ -177,6 +182,9 @@ public class DocumentProcessor {
         Document doc = Jsoup.parse(crawledDocument.documentBody);

         if (AcceptableAds.hasAcceptableAdsTag(doc)) {
+            // I've never encountered a website where this hasn't been a severe indicator
+            // of spam
+
             throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS);
         }

@@ -204,8 +212,10 @@ public class DocumentProcessor {
         ret.quality = documentValuator.getQuality(crawledDocument, ret.standard, doc, dld);
         ret.hashCode = HashCode.fromString(crawledDocument.documentBodyHash).asLong();

+
         KeywordMetadata keywordMetadata = new KeywordMetadata(ret.quality);

+        PubDate pubDate;
         EdgePageWordSet words;
         if (shouldDoSimpleProcessing(url, ret)) {
             /* Some documents we'll index, but only superficially. This is a compromise
@@ -215,17 +225,25 @@ public class DocumentProcessor {
             ret.features = Set.of(HtmlFeature.UNKNOWN);
             words = keywordExtractor.extractKeywordsMinimal(dld, keywordMetadata);
             ret.description = "";
+
+            pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, ret.standard, false);
         }
         else {
             ret.features = featureExtractor.getFeatures(crawledDomain, doc, dld);
             words = keywordExtractor.extractKeywords(dld, keywordMetadata);
             ret.description = getDescription(doc);
+
+            pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, ret.standard, true);
         }

-        addMetaWords(ret, url, crawledDomain, words);
+        addMetaWords(ret, url, pubDate, crawledDomain, words);

         getLinks(url, ret, doc, words);

+        if (pubDate.hasYear()) {
+            ret.pubYear = pubDate.year();
+        }
+
         return new DetailsWithWords(ret, words);
     }

@@ -256,7 +274,7 @@ public class DocumentProcessor {
         return false;
     }

-    private void addMetaWords(ProcessedDocumentDetails ret, EdgeUrl url, CrawledDomain domain, EdgePageWordSet words) {
+    private void addMetaWords(ProcessedDocumentDetails ret, EdgeUrl url, PubDate pubDate, CrawledDomain domain, EdgePageWordSet words) {
         List<String> tagWords = new ArrayList<>();

         var edgeDomain = url.domain;
@@ -276,6 +294,13 @@ public class DocumentProcessor {

         ret.features.stream().map(HtmlFeature::getKeyword).forEach(tagWords::add);

+        if (pubDate.year() > 1900) {
+            tagWords.add("year:" + pubDate.year());
+        }
+        if (pubDate.dateIso8601() != null) {
+            tagWords.add("pub:" + pubDate.dateIso8601());
+        }
+
         words.appendWithNoMeta(IndexBlock.Meta, tagWords);
     }

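To make the addMetaWords change concrete, here is a minimal sketch of what the new branch produces, with a hypothetical sniffed date (not taken from the commit):

    // Hypothetical example: a page sniffed as published 2010-10-03
    PubDate pubDate = new PubDate("2010-10-03", 2010);
    List<String> tagWords = new ArrayList<>();
    if (pubDate.year() > 1900)         tagWords.add("year:" + pubDate.year());        // "year:2010"
    if (pubDate.dateIso8601() != null) tagWords.add("pub:" + pubDate.dateIso8601());  // "pub:2010-10-03"

These keywords land in the Meta index block, which is what later allows queries to be restricted by publication year.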
SummaryExtractionFilter.java (new file)
@@ -0,0 +1,250 @@
package nu.marginalia.wmsa.edge.converting.processor.logic;

import com.google.common.base.Strings;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeFilter;

import java.util.*;

import static org.jsoup.internal.StringUtil.isActuallyWhitespace;
import static org.jsoup.internal.StringUtil.isInvisibleChar;

public class SummaryExtractionFilter implements NodeFilter {

    public Map<Node, NodeStatistics> statistics = new HashMap<>(10000);
    public Map<Node, Integer> pos = new HashMap<>(10000);
    public int cnt = 0;

    @Override
    public FilterResult head(Node node, int depth) {
        pos.put(node, cnt++);
        return FilterResult.CONTINUE;
    }

    @Override
    public FilterResult tail(Node node, int depth) {
        if (node instanceof TextNode tn) {
            statistics.put(node, new NodeStatistics(tn, 0, textLength(tn.getWholeText()), pos.getOrDefault(tn, cnt)));
        }
        else if (node instanceof Element e) {
            statistics.put(node, aggregateStatistics(e));

            if (shouldPruneTag(e)) {
                return FilterResult.REMOVE;
            }
        }

        return FilterResult.CONTINUE;
    }

    public boolean shouldPruneTag(Element tag) {
        String tagName = tag.tagName();

        if ("h1".equalsIgnoreCase(tagName)) return true;
        if ("h2".equalsIgnoreCase(tagName)) return true;
        if ("h3".equalsIgnoreCase(tagName)) return true;

        return false;
    }

    public String getSummary(int maxLength) {
        List<NodeStatistics> ret = new ArrayList<>(statistics.size());
        for (var stats : statistics.values()) {
            if (stats.textToTagRatio() < 0.85) continue;
            if (!stats.isElement() || !stats.isAppropriateTagType()) continue;
            if (stats.textLength() < 128) continue;
            if (stats.isLink()) continue;

            ret.add(stats);
        }
        ret.sort(Comparator.comparing(e -> -e.textLength()));
        if (ret.size() > 32) ret.subList(32, ret.size()).clear();
        ret.sort(Comparator.comparing(NodeStatistics::pos));
        if (ret.size() > 3) ret.subList(3, ret.size()).clear();
        ret.sort(Comparator.comparing(NodeStatistics::isBody));
        if (ret.size() >= 1) {
            return StringUtils.abbreviate(ret.get(0).text(), "", maxLength);
        }
        return "";
    }

    private NodeStatistics aggregateStatistics(Element e) {
        int text = 0;
        int tag = 0;

        String tagName = e.tagName();
        if (!tagName.equalsIgnoreCase("br") && !tagName.equalsIgnoreCase("p")) {
            tag += tagName.length();
        }

        int numAttributes = e.attributesSize();
        tag += Math.max(numAttributes - 1, 0);

        if (numAttributes > 0) {
            var attrs = e.attributes();
            for (var attr : attrs) {
                if (Strings.isNullOrEmpty(attr.getValue()))
                    tag += attr.getKey().length();
                else {
                    tag += 3 + attr.getKey().length() + attr.getValue().length();
                }
            }
        }

        for (var childNode : e.childNodes()) {
            var cn = statistics.get(childNode);

            if (cn != null) {
                boolean isLink = (tagName.equalsIgnoreCase("a") || cn.isLink());
                if (isLink) {
                    tag += cn.tagLength + cn.textLength;
                }
                else {
                    text += cn.textLength;
                    tag += cn.tagLength;
                }

                if (!cn.isElement()) {
                    statistics.remove(cn.node);
                }
            }
        }

        return new NodeStatistics(e, tag, text, pos.getOrDefault(e, cnt));
    }

    private int textLength(String str) {
        int length = 0;

        // This is a modified version of JSoup's StringUtil.normaliseWhitespace()
        // that doesn't do allocation

        int len = str.length();
        int c;
        boolean lastWasWhite = false;
        boolean reachedNonWhite = false;

        for (int i = 0; i < len; i += Character.charCount(c)) {
            c = str.codePointAt(i);
            if (isActuallyWhitespace(c)) {
                if ((!reachedNonWhite) || lastWasWhite)
                    continue;

                if (isAscii(c) && Character.isAlphabetic(c)) {
                    length++;
                }

                lastWasWhite = true;
            }
            else if (!isInvisibleChar(c)) {
                if (isAscii(c) && Character.isAlphabetic(c)) {
                    length++;
                }
                lastWasWhite = false;
                reachedNonWhite = true;
            }
        }

        return length;
    }

    public boolean isAscii(int cp) {
        return (cp & ~0x7F) == 0;
    }

    public record NodeStatistics(Node node, int tagLength, int textLength, int pos) {
        public double textToTagRatio() {
            if (textLength == 0) return 1;

            return textLength / (double)(tagLength + textLength);
        }

        public String text() {
            if (node instanceof Element e) {
                return e.text();
            }
            else if (node instanceof TextNode tn) {
                return tn.text();
            }
            return "";
        }

        public boolean isElement() {
            return node instanceof Element;
        }

        public boolean isLink() {
            if (node instanceof Element el) {
                return "a".equalsIgnoreCase(el.tagName());
            }
            return false;
        }

        public boolean isAppropriateTagType() {

            if (node instanceof Element el) {
                String tagName = el.tagName();
                if ("blockquote".equalsIgnoreCase(tagName)) return false;
                if ("tt".equalsIgnoreCase(tagName)) return false;
                if ("ol".equalsIgnoreCase(tagName)) return false;
                if ("ul".equalsIgnoreCase(tagName)) return false;
                if ("li".equalsIgnoreCase(tagName)) return false;
                if ("h1".equalsIgnoreCase(tagName)) return false;
                if ("h2".equalsIgnoreCase(tagName)) return false;
                if ("h3".equalsIgnoreCase(tagName)) return false;
                if ("th".equalsIgnoreCase(tagName)) return false;
                if ("td".equalsIgnoreCase(tagName)) return false;
                if ("tbody".equalsIgnoreCase(tagName)) return false;
                if ("html".equalsIgnoreCase(tagName)) return false;
                if ("title".equalsIgnoreCase(tagName)) return false;
                if ("#root".equalsIgnoreCase(tagName)) return false;
            }

            if (node.parent() instanceof Element elp) {
                if ("a".equals(elp.tagName()))
                    return false;
            }

            return true;
        }

        public boolean isBody() {
            if (node instanceof Element el) {
                return "body".equalsIgnoreCase(el.tagName());
            }
            return false;
        }

        public String tagName() {
            if (node instanceof Element el) {
                return el.tagName();
            }
            return '$'+node.getClass().getSimpleName();
        }

        public String toString() {
            return String.format("NodeStatistics[%s %d p %d %d]", tagName(), pos, tagLength, textLength);
        }

        public double sortValue() {
            return -textToTagRatio() * Math.log(1 + textLength) / Math.log(1+pos);
        }
    }
}
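A minimal usage sketch of the filter above; the HTML string is a hypothetical stand-in for a crawled page:

    // Hypothetical usage of SummaryExtractionFilter with jsoup
    Document doc = Jsoup.parse("<html><body><p>" + "long article text ".repeat(20) + "</p></body></html>");
    SummaryExtractionFilter filter = new SummaryExtractionFilter();
    doc.filter(filter);                       // one traversal populates filter.statistics
    String summary = filter.getSummary(255);  // densest qualifying text block, abbreviated; "" if none

The filter piggybacks on jsoup's NodeFilter traversal: head() records document order, tail() aggregates text-versus-markup weight bottom-up, and getSummary() picks an early, text-dense, non-link element.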
SummaryExtractor.java
@@ -11,7 +11,7 @@ import java.util.regex.Pattern;
 public class SummaryExtractor {
     private final int maxSummaryLength;

-    private final Pattern truncatedCharacters = Pattern.compile("[^a-zA-Z0-9.,!?\\-'\"]+|[\\-.,!?' ]{3,}");
+    private final Pattern truncatedCharacters = Pattern.compile("[\\-.,!?' ]{3,}");

     @Inject
     public SummaryExtractor(@Named("max-summary-length") Integer maxSummaryLength) {
@@ -19,12 +19,9 @@ public class SummaryExtractor {
     }

     public String extractSummary(Document parsed) {
-        var cleanDoc = parsed.clone();
-        cleanDoc.select("header,nav,#header,#nav,#navigation,.header,.nav,.navigation,ul,li").remove();
-
         String summaryString;

-        summaryString = extractSummaryRaw(cleanDoc);
+        summaryString = extractSummaryRaw(parsed);
         summaryString = truncatedCharacters.matcher(summaryString).replaceAll(" ");
         summaryString = StringUtils.abbreviate(summaryString, "", maxSummaryLength);

@@ -36,9 +33,13 @@ public class SummaryExtractor {

         String maybe;

-        // Plan A
-        maybe = getSummaryByTagDensity(parsed);
+        parsed.select("header,nav,#header,#nav,#navigation,.header,.nav,.navigation,ul,li").remove();
+
+        // Plan A
+        maybe = getSummaryNew(parsed.clone());
+        if (!maybe.isBlank()) return maybe;
+
+        maybe = getSummaryByTagDensity(parsed.clone());
         if (!maybe.isBlank()) return maybe;

         // Plan B: Open Graph Description
@@ -53,6 +54,14 @@ public class SummaryExtractor {
         return lastDitchSummaryEffort(parsed);
     }

+    private String getSummaryNew(Document parsed) {
+        var filter = new SummaryExtractionFilter();
+
+        parsed.filter(filter);
+
+        return filter.getSummary(maxSummaryLength+32);
+    }
+
     private String getSummaryByTagDensity(Document parsed) {
         StringBuilder content = new StringBuilder();

@@ -92,6 +101,7 @@ public class SummaryExtractor {

         return parsed.body().text();
     }
+
     private double htmlTagDensity(Element elem) {
         return (double) elem.text().length() / elem.html().length();
     }
PubDate.java (new file)
@@ -0,0 +1,46 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate;

import java.time.LocalDate;
import java.time.format.DateTimeFormatter;

public record PubDate(String dateIso8601, int year) {

    // First year we'll believe something can have been published on the web
    // cut off at 1995 to reduce false positive error rate; number of bona fide
    // documents from these years are so few almost all hits are wrong

    public static final int MIN_YEAR = 1995;

    // Last year we'll believe something can be published in
    public static final int MAX_YEAR = LocalDate.now().getYear() + 1;


    public PubDate() {
        this(null, Integer.MIN_VALUE);
    }

    public PubDate(LocalDate date) {
        this(date.format(DateTimeFormatter.ISO_DATE), date.getYear());
    }

    public boolean isEmpty() {
        return year == Integer.MIN_VALUE;
    }

    public String describe() {
        if (dateIso8601 != null)
            return dateIso8601;

        if (hasYear())
            return Integer.toString(year);

        return "";
    }

    public static boolean isValidYear(int year) {
        return year >= MIN_YEAR && year <= MAX_YEAR;
    }

    public boolean hasYear() {
        return isValidYear(this.year);
    }
}
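A quick illustration of the record's conventions; the values are hypothetical:

    new PubDate().isEmpty();                           // true: the no-arg form is the "unknown" sentinel
    new PubDate(LocalDate.of(2010, 10, 3)).describe(); // "2010-10-03"
    new PubDate(null, 2010).describe();                // "2010": a year with no full date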
PubDateEffortLevel.java (new file)
@@ -0,0 +1,6 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate;

public enum PubDateEffortLevel {
    LOW,
    HIGH
}
PubDateHeuristic.java (new file)
@@ -0,0 +1,12 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate;

import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;

public interface PubDateHeuristic {

    Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard);
}
PubDateParser.java (new file)
@@ -0,0 +1,178 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate;

import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;

import java.time.DateTimeException;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.ZonedDateTime;
import java.time.format.DateTimeFormatter;
import java.util.Optional;
import java.util.OptionalInt;
import java.util.concurrent.ThreadLocalRandom;
import java.util.regex.Pattern;

public class PubDateParser {

    public static Optional<PubDate> attemptParseDate(String date) {
        return Optional.ofNullable(date)
                .filter(str -> str.length() >= 4 && str.length() < 32)
                .flatMap(str ->
                        parse8601(str)
                                .or(() -> parse1123(str))
                                .or(() -> dateFromHighestYearLookingSubstring(str))
                )
                .filter(PubDateParser::validateDate);
    }

    public static OptionalInt parseYearString(String yearString) {
        try {
            return OptionalInt.of(Integer.parseInt(yearString));
        }
        catch (NumberFormatException ex) {
            return OptionalInt.empty();
        }
    }


    private static final Pattern yearPattern = Pattern.compile("\\d{4}");

    public static Optional<PubDate> dateFromHighestYearLookingSubstring(String maybe) {
        var matcher = yearPattern.matcher(maybe);

        int min = PubDate.MAX_YEAR + 1;
        int max = PubDate.MIN_YEAR - 1;

        for (int i = 0; i < maybe.length() && matcher.find(i); i = matcher.end()) {

            String segment = maybe.substring(matcher.start(), matcher.end());
            OptionalInt year = parseYearString(segment);

            if (year.isEmpty())
                continue;

            int y = year.getAsInt();
            if (PubDate.isValidYear(y)) {
                if (max < y) max = y;
                if (min > y) min = y;
            }
        }

        if (max != min && PubDate.isValidYear(min) && PubDate.isValidYear(max)) {
            return Optional.of(new PubDate(null, guessYear(min, max)));
        }

        if (max >= PubDate.MIN_YEAR)
            return Optional.of(new PubDate(null, max));
        else
            return Optional.empty();
    }


    public static Optional<PubDate> dateFromHighestYearLookingSubstringWithGuess(String maybe, int guess) {
        var matcher = yearPattern.matcher(maybe);

        int min = PubDate.MAX_YEAR + 1;
        int max = PubDate.MIN_YEAR - 1;

        for (int i = 0; i < maybe.length() && matcher.find(i); i = matcher.end()) {

            String segment = maybe.substring(matcher.start(), matcher.end());
            OptionalInt year = parseYearString(segment);

            if (year.isEmpty())
                continue;

            int y = year.getAsInt();
            if (PubDate.isValidYear(y)) {
                if (max < y) max = y;
                if (min > y) min = y;
            }
        }

        if (max != min && PubDate.isValidYear(min) && PubDate.isValidYear(max)) {
            return Optional.of(new PubDate(null, guessYear(min, max, guess)));
        }

        if (max >= PubDate.MIN_YEAR)
            return Optional.of(new PubDate(null, max));
        else
            return Optional.empty();
    }

    public static int guessYear(int min, int max, int educatedGuess) {
        int var = max - min;

        if (var < 3)
            return min;

        int avg = (max + min) / 2;
        int guess = (avg + educatedGuess) / 2;

        if (guess < min)
            return min;
        if (guess > max)
            return max;

        return guess;
    }

    public static int guessYear(int min, int max) {
        return (max + min) / 2;
    }

    public static int guessYear(EdgeHtmlStandard standard) {
        // Create some jitter to avoid having documents piling up in the same four years
        // as this would make searching in those years disproportionately useless

        double guess = standard.yearGuess + ThreadLocalRandom.current().nextGaussian();

        if (guess < PubDate.MIN_YEAR) {
            return PubDate.MIN_YEAR;
        }
        if (guess > PubDate.MAX_YEAR) {
            return PubDate.MAX_YEAR;
        }
        return (int) guess;
    }

    public static Optional<PubDate> parse8601(String maybe) {
        return parseOptionally(maybe, DateTimeFormatter.ISO_DATE)
                .or(() -> parseOptionallyWithTime(maybe, DateTimeFormatter.ISO_DATE_TIME))
                .or(() -> parseOptionallyWithZonedTime(maybe, DateTimeFormatter.ISO_DATE_TIME))
                .map(PubDate::new);
    }

    public static Optional<PubDate> parse1123(String maybe) {
        return parseOptionally(maybe, DateTimeFormatter.RFC_1123_DATE_TIME)
                .map(PubDate::new);
    }

    public static Optional<LocalDate> parseOptionally(String str, DateTimeFormatter formatter) {
        try {
            return Optional.of(LocalDate.parse(str, formatter));
        }
        catch (DateTimeException ex) {
            return Optional.empty();
        }
    }

    public static Optional<LocalDate> parseOptionallyWithTime(String str, DateTimeFormatter formatter) {
        try {
            return Optional.of(LocalDateTime.parse(str, formatter).toLocalDate());
        }
        catch (DateTimeException ex) {
            return Optional.empty();
        }
    }

    public static Optional<LocalDate> parseOptionallyWithZonedTime(String str, DateTimeFormatter formatter) {
        try {
            return Optional.of(ZonedDateTime.parse(str, formatter).toLocalDate());
        }
        catch (DateTimeException ex) {
            return Optional.empty();
        }
    }

    public static boolean validateDate(PubDate date) {
        return (date.year() >= PubDate.MIN_YEAR && date.year() <= PubDate.MAX_YEAR);
    }
}
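To make the fallback order in attemptParseDate concrete, a few hypothetical inputs and the branch that would claim them:

    PubDateParser.attemptParseDate("2010-10-03");                    // parse8601 (ISO date)
    PubDateParser.attemptParseDate("Sun, 3 Oct 2010 17:37:00 GMT");  // parse1123 (RFC 1123, e.g. a Last-Modified header)
    PubDateParser.attemptParseDate("copyright 2008-2011");           // year substrings; guessYear(2008, 2011) = 2009
    PubDateParser.attemptParseDate("1776");                          // Optional.empty(): outside [MIN_YEAR, MAX_YEAR]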
PubDateSniffer.java (new file)
@@ -0,0 +1,50 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate;

import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic.*;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import org.jsoup.nodes.Document;

import java.util.ArrayList;
import java.util.List;

public class PubDateSniffer {

    private final List<PubDateHeuristic> heuristics = new ArrayList<>();

    public PubDateSniffer() {
        heuristics.add(new PubDateHeuristicHtml5ItempropDateTag());
        heuristics.add(new PubDateHeuristicHtml5ArticleDateTag());
        heuristics.add(new PubDateHeuristicJSONLD());
        heuristics.add(new PubDateHeuristicMicrodata());
        heuristics.add(new PubDateHeuristicOpenGraph());
        heuristics.add(new PubDateHeuristicRDFaTag());

        // The more questionable heuristics should be kept below this line
        heuristics.add(new PubDateHeuristicUrlPatternPass1());

        heuristics.add(new PubDateHeuristicDOMParsingPass1());
        heuristics.add(new PubDateHeuristicHtml5AnyTimeTag());

        heuristics.add(new PubDateHeuristicDOMParsingPass2());
        heuristics.add(new PubDateHeuristicUrlPatternPass2());

        heuristics.add(new PubDateHeuristicLastModified());

        // This is complete guesswork
        heuristics.add(new PubDateHeuristicGuessFromHtmlStandard());
    }

    public PubDate getPubDate(String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard, boolean runExpensive) {
        final PubDateEffortLevel effortLevel = runExpensive ? PubDateEffortLevel.HIGH : PubDateEffortLevel.LOW;

        for (var heuristic : heuristics) {
            var maybe = heuristic.apply(effortLevel, headers, url, document, htmlStandard);
            if (maybe.isPresent())
                return maybe.get();
        }

        return new PubDate();
    }

}
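A minimal sketch of how the sniffer is driven; the variable names are hypothetical, mirroring what DocumentProcessor above passes in:

    PubDateSniffer sniffer = new PubDateSniffer();
    // runExpensive=true unlocks the HIGH-effort DOM-walking heuristics (passes 1 and 2)
    PubDate pubDate = sniffer.getPubDate(headers, url, document, htmlStandard, true);
    if (pubDate.hasYear()) {
        // the first heuristic in the list that returns a value wins;
        // otherwise getPubDate falls through to an empty PubDate
    }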
PubDateHeuristicDOMParsingPass1.java (new file)
@@ -0,0 +1,148 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic;

import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import org.jetbrains.annotations.NotNull;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeFilter;

import java.util.Optional;

public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {

    @Override
    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
        if (effortLevel == PubDateEffortLevel.LOW)
            return Optional.empty();

        DateExtractingNodeVisitorPass filter = new DateExtractingNodeVisitorPass(htmlStandard);

        document.filter(filter);

        return Optional.ofNullable(filter.pubDate);
    }


    private static class DateExtractingNodeVisitorPass implements NodeFilter {
        public PubDate pubDate;
        private final EdgeHtmlStandard htmlStandard;

        private DateExtractingNodeVisitorPass(EdgeHtmlStandard htmlStandard) {
            this.htmlStandard = htmlStandard;
        }

        @NotNull
        @Override
        public FilterResult head(@NotNull Node node, int depth) {
            if (node instanceof TextNode tn) onTextNode(tn);
            if (node instanceof Element el) onElementNode(el);

            if (hasPubDate()) {
                return FilterResult.STOP;
            }
            return FilterResult.CONTINUE;
        }

        public void onTextNode(TextNode tn) {
            String text = tn.getWholeText();

            if (text.length() < 32 && isCandidateForCopyrightNotice(text)) {
                parse(text);
            }
        }


        public void onElementNode(Element el) {
            if (hasCommonClass(el)) {
                parse(el.text());
            }

            if (!hasPubDate())
                tryParsePhpBBDate(el);
        }


        public boolean isCandidateForCopyrightNotice(String text) {
            if (text.contains("ublished"))
                return true;
            if (text.contains("opyright"))
                return true;
            if (text.contains("©"))
                return true;
            if (text.contains("(c)"))
                return true;

            return false;
        }


        public boolean hasCommonClass(Element el) {
            var classes = el.classNames();

            return classes.contains("entry-meta") // wordpress
                    || classes.contains("byline")
                    || classes.contains("author")
                    || classes.contains("submitted")
                    || el.id().contains("footer-info-lastmod"); // mediawiki
        }

        public void tryParsePhpBBDate(Element el) {

            /* Match HTML on the form <div>[...] <b>Posted:</b> Sun Oct 03, 2010 5:37 pm </div>
             * this is used on old phpBB message boards
             *
             * Schematically the DOM looks like this
             *
             * b - TextNode[ Sun Oct 03, 2010 5:37 pm ]
             * |
             * TextNode[Posted:]
             */
            if ("b".equals(el.tagName())
                    && el.childNodeSize() == 1
                    && el.childNode(0) instanceof TextNode ctn
                    && "Posted:".equals(ctn.getWholeText())
                    && el.nextSibling() instanceof TextNode ntn
                    )
            {
                parse(ntn.getWholeText());
            }
        }


        public boolean hasPubDate() {
            return pubDate != null;
        }

        public void setPubDate(PubDate pubDate) {
            this.pubDate = pubDate;
        }

        @NotNull
        @Override
        public FilterResult tail(@NotNull Node node, int depth) {
            return FilterResult.CONTINUE;
        }

        private void parse(String text) {
            if (htmlStandard == EdgeHtmlStandard.UNKNOWN) {
                PubDateParser
                        .dateFromHighestYearLookingSubstring(text)
                        .ifPresent(this::setPubDate);
            }
            else {
                PubDateParser
                        .dateFromHighestYearLookingSubstringWithGuess(text, htmlStandard.yearGuess)
                        .ifPresent(this::setPubDate);
            }
        }

    }

}
PubDateHeuristicDOMParsingPass2.java (new file)
@@ -0,0 +1,123 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic;

import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import org.jetbrains.annotations.NotNull;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeFilter;

import java.util.Optional;

public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {

    @Override
    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
        if (effortLevel == PubDateEffortLevel.LOW)
            return Optional.empty();

        DateExtractingNodeVisitor filter = new DateExtractingNodeVisitor(htmlStandard);

        document.filter(filter);

        return Optional.ofNullable(filter.pubDate);
    }


    private static class DateExtractingNodeVisitor implements NodeFilter {
        public PubDate pubDate;
        private final EdgeHtmlStandard htmlStandard;

        private DateExtractingNodeVisitor(EdgeHtmlStandard htmlStandard) {
            this.htmlStandard = htmlStandard;
        }

        @NotNull
        @Override
        public FilterResult head(@NotNull Node node, int depth) {
            if (node instanceof TextNode tn) onTextNode(tn);

            if (hasPubDate()) {
                return FilterResult.STOP;
            }
            return FilterResult.CONTINUE;
        }

        public void onTextNode(TextNode tn) {
            String text = tn.getWholeText();

            if (isPossibleCandidate(text)) {
                parse(text);
            }
        }


        public boolean hasPubDate() {
            return pubDate != null;
        }

        public void setPubDate(PubDate pubDate) {
            this.pubDate = pubDate;
        }

        @NotNull
        @Override
        public FilterResult tail(@NotNull Node node, int depth) {
            return FilterResult.CONTINUE;
        }

        private void parse(String text) {
            if (htmlStandard == EdgeHtmlStandard.UNKNOWN) {
                PubDateParser
                        .dateFromHighestYearLookingSubstring(text)
                        .ifPresent(this::setPubDate);
            }
            else {
                PubDateParser
                        .dateFromHighestYearLookingSubstringWithGuess(text, htmlStandard.yearGuess)
                        .ifPresent(this::setPubDate);
            }
        }

    }

    // This is basically the regex (^|[ ./\-])(\d{4})([ ./\-]|$), but
    // unchecked regexes are too slow

    public static boolean isPossibleCandidate(String text) {
        if (text.length() >= 4 && text.length() < 24) {
            int ct = 0;
            char prevC = ' ';
            boolean goodStart = true;
            for (int i = 0; i < text.length(); i++) {
                char c = text.charAt(i);
                if (Character.isDigit(c)) {
                    if (ct++ == 0) {
                        goodStart = isGoodBreak(prevC);
                    }
                }
                else {
                    if (ct == 4 && goodStart && isGoodBreak(c)) return true;
                    else {
                        ct = 0;
                    }
                }
                prevC = c;
            }

            if (ct == 4 && goodStart)
                return true;
        }
        return false;
    }

    private static boolean isGoodBreak(char c) {
        return "./-,".indexOf(c) >= 0 || Character.isSpaceChar(c);
    }

}
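A few hypothetical inputs illustrating what the hand-rolled scanner accepts, mirroring the commented regex:

    PubDateHeuristicDOMParsingPass2.isPossibleCandidate("posted 03.10.2010");  // true: "2010" bounded by '.' and end-of-string
    PubDateHeuristicDOMParsingPass2.isPossibleCandidate("2010");               // true: the whole string is a year-like run
    PubDateHeuristicDOMParsingPass2.isPossibleCandidate("id20105");            // false: bad left break and a five-digit run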
PubDateHeuristicGuessFromHtmlStandard.java (new file)
@@ -0,0 +1,23 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic;

import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;

public class PubDateHeuristicGuessFromHtmlStandard implements PubDateHeuristic {

    @Override
    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
        if (htmlStandard == EdgeHtmlStandard.UNKNOWN)
            return Optional.empty();

        return Optional.of(new PubDate(null, PubDateParser.guessYear(htmlStandard)));
    }

}
PubDateHeuristicHtml5AnyTimeTag.java (new file)
@@ -0,0 +1,33 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic;

import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;

public class PubDateHeuristicHtml5AnyTimeTag implements PubDateHeuristic {

    @Override
    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
        // HTML5, alternative approach
        for (var tag : document.select("time")) {
            var maybeDate = PubDateParser.attemptParseDate(tag.attr("datetime"));
            if (maybeDate.isPresent()) {
                return maybeDate;
            }

            maybeDate = PubDateParser.attemptParseDate(tag.wholeText());
            if (maybeDate.isPresent()) {
                return maybeDate;
            }
        }

        return Optional.empty();
    }

}
PubDateHeuristicHtml5ArticleDateTag.java (new file)
@@ -0,0 +1,28 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic;

import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;

public class PubDateHeuristicHtml5ArticleDateTag implements PubDateHeuristic {

    @Override
    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
        // HTML5
        for (var tag : document.select("time[pubdate=\"pubdate\"]")) {
            var maybeDate = PubDateParser.attemptParseDate(tag.attr("datetime"));
            if (maybeDate.isPresent()) {
                return maybeDate;
            }
        }

        return Optional.empty();
    }

}
PubDateHeuristicHtml5ItempropDateTag.java (new file)
@@ -0,0 +1,27 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic;

import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;

public class PubDateHeuristicHtml5ItempropDateTag implements PubDateHeuristic {

    @Override
    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
        for (var tag : document.select("time[itemprop=\"datePublished\"]")) {
            var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
            if (maybeDate.isPresent()) {
                return maybeDate;
            }
        }

        return Optional.empty();
    }

}
PubDateHeuristicJSONLD.java (new file)
@@ -0,0 +1,49 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic;

import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.JsonSyntaxException;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;

public class PubDateHeuristicJSONLD implements PubDateHeuristic {

    @Override
    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
        for (var tag : document.select("script[type=\"application/ld+json\"]")) {
            var maybeDate = parseLdJson(tag.data())
                    .flatMap(PubDateParser::attemptParseDate);

            if (maybeDate.isPresent()) {
                return maybeDate;
            }
        }

        return Optional.empty();
    }


    private static class JsonModel {
        String datePublished;
    }

    private static Gson gson = new GsonBuilder().create();

    public Optional<String> parseLdJson(String content) {
        try {
            var model = gson.fromJson(content, JsonModel.class);
            return Optional.ofNullable(model)
                    .map(m -> m.datePublished);
        }
        catch (JsonSyntaxException ex) {
            return Optional.empty();
        }
    }

}
PubDateHeuristicLastModified.java (new file)
@@ -0,0 +1,29 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic;

import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;

public class PubDateHeuristicLastModified implements PubDateHeuristic {

    @Override
    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
        String lmString = "last-modified: ";
        int offset = headers.toLowerCase().indexOf(lmString);

        if (offset < 0)
            return Optional.empty();
        int end = headers.indexOf('\n', offset);
        if (end < 0) end = headers.length();

        String lmDate = headers.substring(offset + lmString.length(), end);
        return PubDateParser.attemptParseDate(lmDate);
    }

}
@@ -0,0 +1,28 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic;

import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;

public class PubDateHeuristicMicrodata implements PubDateHeuristic {

    @Override
    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {

        for (var tag : document.select("meta[itemprop=\"datePublished\"]")) {
            var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
            if (maybeDate.isPresent()) {
                return maybeDate;
            }
        }

        return Optional.empty();
    }

}
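This one keys on schema.org microdata. A self-contained sketch of the markup shape the selector matches (hypothetical page, jsoup on the classpath):

    import org.jsoup.Jsoup;

    public class MicrodataSketch {
        public static void main(String[] args) {
            var doc = Jsoup.parse("<html><head><meta itemprop=\"datePublished\" content=\"2022-08-24\"></head></html>");
            // Same selector as the heuristic above
            System.out.println(doc.select("meta[itemprop=\"datePublished\"]").attr("content")); // 2022-08-24
        }
    }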
@@ -0,0 +1,27 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic;

import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;

public class PubDateHeuristicOpenGraph implements PubDateHeuristic {

    @Override
    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
        // OG
        for (var tag : document.select("meta[property=\"article:published_time\"]")) {
            var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
            if (maybeDate.isPresent()) {
                return maybeDate;
            }
        }

        return Optional.empty();
    }
}
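For context: article:published_time is the Open Graph article-namespace property, i.e. markup of the form <meta property="article:published_time" content="...">, which most blogging platforms emit; the scan is otherwise identical to the microdata heuristic above, keyed on the property attribute rather than itemprop.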
@@ -0,0 +1,27 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic;

import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;

public class PubDateHeuristicRDFaTag implements PubDateHeuristic {

    @Override
    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
        for (var tag : document.select("meta[property=\"datePublished\"]")) {
            var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
            if (maybeDate.isPresent()) {
                return maybeDate;
            }
        }

        return Optional.empty();
    }

}
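This variant picks up schema.org's datePublished when it is expressed as RDFa (<meta property="datePublished" content="...">), which some page generators emit instead of, or alongside, the itemprop form; hence a separate heuristic even though the scan itself is the same.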
@@ -0,0 +1,45 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic;

import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;
import java.util.OptionalInt;
import java.util.regex.Pattern;

public class PubDateHeuristicUrlPatternPass1 implements PubDateHeuristic {

    private static final Pattern yearUrlPattern = Pattern.compile("/\\d{4}/");

    // False positive rate is much higher in the 1990s, only include 2000s+ in pass 1
    private static final int MIN_URL_PATTERN_YEAR = 2000;

    @Override
    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
        final String urlString = url.path;

        var matcher = yearUrlPattern.matcher(urlString);

        for (int i = 0; i < urlString.length() && matcher.find(i); i = matcher.end()) {

            String segment = urlString.substring(matcher.start() + 1, matcher.end() - 1);

            OptionalInt year = PubDateParser.parseYearString(segment);

            if (year.isEmpty())
                continue;

            int y = year.getAsInt();
            if (y >= MIN_URL_PATTERN_YEAR && y <= PubDate.MAX_YEAR) {
                return Optional.of(new PubDate(null, y));
            }
        }

        return Optional.empty();
    }
}
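A standalone sketch of the year-segment scan on a hypothetical path, with PubDateParser.parseYearString replaced by a plain parseInt for self-containment; note how the cutoff makes the scan skip over an implausible segment and keep going:

    import java.util.regex.Pattern;

    public class UrlYearScanSketch {
        private static final Pattern yearUrlPattern = Pattern.compile("/\\d{4}/");

        public static void main(String[] args) {
            // Hypothetical path: the 1995 segment is rejected by pass 1's
            // MIN_URL_PATTERN_YEAR cutoff, so the scan moves on to 2015
            String urlString = "/blog/1995/entry/2015/index.html";

            var matcher = yearUrlPattern.matcher(urlString);
            for (int i = 0; i < urlString.length() && matcher.find(i); i = matcher.end()) {
                // Strip the surrounding slashes, as in the heuristic
                String segment = urlString.substring(matcher.start() + 1, matcher.end() - 1);
                int y = Integer.parseInt(segment);
                if (y >= 2000) {
                    System.out.println("year: " + y); // prints: year: 2015
                    return;
                }
            }
        }
    }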
@@ -0,0 +1,42 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic;

import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateEffortLevel;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateHeuristic;
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;
import java.util.OptionalInt;
import java.util.regex.Pattern;

public class PubDateHeuristicUrlPatternPass2 implements PubDateHeuristic {

    private static final Pattern yearUrlPattern = Pattern.compile("/\\d{4}/");

    @Override
    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, EdgeHtmlStandard htmlStandard) {
        final String urlString = url.path;

        var matcher = yearUrlPattern.matcher(urlString);

        for (int i = 0; i < urlString.length() && matcher.find(i); i = matcher.end()) {

            String segment = urlString.substring(matcher.start() + 1, matcher.end() - 1);

            OptionalInt year = PubDateParser.parseYearString(segment);

            if (year.isEmpty())
                continue;

            int y = year.getAsInt();
            if (y >= PubDate.MIN_YEAR && y <= PubDate.MAX_YEAR) {
                return Optional.of(new PubDate(null, y));
            }
        }

        return Optional.empty();
    }
}
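The two passes differ only in the accepted year range: pass 1 insists on 2000 through PubDate.MAX_YEAR to keep false positives down, while pass 2 widens the window to PubDate.MIN_YEAR, presumably running as a lower-confidence fallback once the cheaper signals have come up empty. The near-duplication between the classes reads as deliberate, keeping each heuristic a self-contained strategy object.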
@@ -117,12 +117,12 @@ public class ExplorerService extends Service {
                         NV.NEIGHBOR_NAME,
                         NV.RELATEDNESS,
                         (LV.DOMAIN_ID IS NOT NULL),
-                        (STATE = 'ACTIVE' OR STATE='SOCIAL_MEDIA'),
+                        (STATE = 'ACTIVE' OR STATE='SOCIAL_MEDIA' OR STATE='REDIR'),
                         INDEXED > 0
                         FROM EC_NEIGHBORS_VIEW NV
                         LEFT JOIN EC_NEIGHBORS_VIEW LV ON (NV.NEIGHBOR_ID=LV.DOMAIN_ID)
                         INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=NV.NEIGHBOR_ID
-                        WHERE NV.DOMAIN_ID=?
+                        WHERE NV.DOMAIN_ID IN (?,?)
                         GROUP BY NV.NEIGHBOR_ID
                         ORDER BY NV.RELATEDNESS DESC
                         """);
@@ -131,12 +131,12 @@ public class ExplorerService extends Service {
                         NV.DOMAIN_NAME,
                         NV.RELATEDNESS,
                         (LV.NEIGHBOR_ID IS NOT NULL),
-                        (STATE = 'ACTIVE' OR STATE='SOCIAL_MEDIA'),
+                        (STATE = 'ACTIVE' OR STATE='SOCIAL_MEDIA' OR STATE='REDIR'),
                         INDEXED > 0
                         FROM EC_NEIGHBORS_VIEW NV
                         LEFT JOIN EC_NEIGHBORS_VIEW LV ON (NV.DOMAIN_ID=LV.NEIGHBOR_ID)
                         INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=NV.DOMAIN_ID
-                        WHERE NV.NEIGHBOR_ID=?
+                        WHERE NV.NEIGHBOR_ID IN (?,?)
                         GROUP BY NV.DOMAIN_ID
                         ORDER BY NV.RELATEDNESS DESC
                         """
@@ -145,6 +145,8 @@ public class ExplorerService extends Service {
         ) {

             stmt.setInt(1, domainIdInformation.domainId);
+            stmt.setInt(2, domainIdInformation.aliasId);

             var rsp = stmt.executeQuery();
             while (rsp.next()) {
@@ -172,6 +174,8 @@ public class ExplorerService extends Service {
             }

             stmtRev.setInt(1, domainIdInformation.domainId);
+            stmtRev.setInt(2, domainIdInformation.aliasId);

             rsp = stmtRev.executeQuery();
             while (rsp.next()) {
@@ -211,22 +215,24 @@ public class ExplorerService extends Service {

         try (var conn = dataSource.getConnection();
              var stmt = conn.prepareStatement("""
-                     SELECT IFNULL(ALIAS.ID, DOMAIN.ID), DOMAIN.INDEXED>0 OR ALIAS.INDEXED>0, ALIAS.DOMAIN_NAME
+                     SELECT DOMAIN.ID, IFNULL(ALIAS.ID, DOMAIN.ID), DOMAIN.INDEXED>0 OR ALIAS.INDEXED>0, ALIAS.DOMAIN_NAME
                      FROM EC_DOMAIN DOMAIN
                      LEFT JOIN EC_DOMAIN ALIAS ON DOMAIN.DOMAIN_ALIAS=ALIAS.ID
                      WHERE DOMAIN.DOMAIN_NAME=?
                      """)) {

             stmt.setString(1, query);
             var rsp = stmt.executeQuery();
             if (rsp.next()) {
                 return new DomainIdInformation(
                         rsp.getInt(1),
-                        rsp.getBoolean(2),
-                        rsp.getString(3)
+                        rsp.getInt(2),
+                        rsp.getBoolean(3),
+                        rsp.getString(4)
                 );
             }
         }
-        return new DomainIdInformation(-1, false, null);
+        return new DomainIdInformation(-1, -1, false, null);
     }

     private String trimUrlJunk(String query) {
@@ -245,7 +251,7 @@ public class ExplorerService extends Service {
         return query;
     }

-    record DomainIdInformation(int domainId, boolean indexed, String alias) {
+    record DomainIdInformation(int domainId, int aliasId, boolean indexed, String alias) {
         boolean isPresent() {
             return domainId >= 0;
         }
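Taken together, these hunks thread alias awareness through the explorer: DomainIdInformation now carries both the domain's own id and its alias id, both neighbor queries match either one (WHERE ... IN (?,?)), and domains in the REDIR state are treated like ACTIVE and SOCIAL_MEDIA ones, so a site that has moved behind a redirect still surfaces together with its neighbors.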
@@ -1,19 +1,22 @@
 package nu.marginalia.wmsa.edge.model.crawl;

 public enum EdgeHtmlStandard {
-    PLAIN(0, 1),
-    UNKNOWN(0, 1),
-    HTML123(0, 1),
-    HTML4(-0.1, 1.05),
-    XHTML(-0.1, 1.05),
-    HTML5(0.5, 1.1);
+    PLAIN(0, 1, 1993),
+    UNKNOWN(0, 1, 2000),
+    HTML123(0, 1, 1997),
+    HTML4(-0.1, 1.05, 2006),
+    XHTML(-0.1, 1.05, 2006),
+    HTML5(0.5, 1.1, 2018);

     public final double offset;
     public final double scale;
+    public final int yearGuess;

-    EdgeHtmlStandard(double offset, double scale) {
+    EdgeHtmlStandard(double offset, double scale, int yearGuess) {
         this.offset = offset;
         this.scale = scale;
+        this.yearGuess = yearGuess;
     }

 }
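The new yearGuess constant reads as a rough prior for when documents in each markup standard were typically written (HTML 2/3 pages cluster in the late 90s, HTML5 in the late 2010s). A hedged sketch of how such a prior could serve as a last-resort estimate when every heuristic fails; the helper is illustrative, not the repository's actual fallback code:

    import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
    import java.util.OptionalInt;

    public class YearGuessSketch {
        // Hypothetical helper: fall back on the standard's typical year
        static int yearOrGuess(OptionalInt parsedYear, EdgeHtmlStandard standard) {
            return parsedYear.orElse(standard.yearGuess);
        }

        public static void main(String[] args) {
            System.out.println(yearOrGuess(OptionalInt.empty(), EdgeHtmlStandard.XHTML));   // 2006
            System.out.println(yearOrGuess(OptionalInt.of(1998), EdgeHtmlStandard.XHTML)); // 1998
        }
    }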
@@ -77,6 +77,8 @@ CREATE TABLE IF NOT EXISTS EC_PAGE_DATA (
     DATA_HASH INTEGER NOT NULL,
     QUALITY DOUBLE NOT NULL,

+    PUB_YEAR SMALLINT,
+
     FOREIGN KEY (ID) REFERENCES EC_URL(ID) ON DELETE CASCADE
 )
 CHARACTER SET utf8mb4
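Since PUB_YEAR is nullable, consumers have to distinguish "no estimate" from a real year. A minimal JDBC sketch of a null-safe read; the helper and its wiring are hypothetical, not code from the repository:

    import java.sql.Connection;
    import java.sql.SQLException;
    import java.util.OptionalInt;

    public class PubYearSketch {
        // Returns the publication year for a URL id, or empty when PUB_YEAR is NULL
        static OptionalInt getPubYear(Connection conn, int urlId) throws SQLException {
            try (var stmt = conn.prepareStatement("SELECT PUB_YEAR FROM EC_PAGE_DATA WHERE ID=?")) {
                stmt.setInt(1, urlId);
                var rs = stmt.executeQuery();
                if (rs.next()) {
                    int year = rs.getInt(1);
                    if (!rs.wasNull()) return OptionalInt.of(year);
                }
                return OptionalInt.empty();
            }
        }
    }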
@@ -28,38 +28,28 @@
 <h2>Publicity, Discussion and Events</h2>
 <div class="info">
 <dl>
-    <dt><a href="https://www.hs.fi/visio/art-2000009139237.html">Google ei enää tideä</a></dt>
+    <dt><a href="https://memex.marginalia.nu/log/64-hundred-million.gmi" rel="nofollow">Marginalia's Index Reaches 100,000,000 Documents</a> 🎊</dt>
+    <dd>2022-10-21</dd>
+    <dt><a href="https://www.hs.fi/visio/art-2000009139237.html" rel="nofollow">Google ei enää tideä</a></dt>
     <dd>Helsing Sanomat 🇫🇮 2022-10-19</dd>
-    <dt><a href="https://www.deutschlandfunkkultur.de/google-suche-100.html">Kritik an Googles Suche - Platzhirsch auf dem Nebenschauplatz</a></dt>
+    <dt><a href="https://www.deutschlandfunkkultur.de/google-suche-100.html" rel="nofollow">Kritik an Googles Suche - Platzhirsch auf dem Nebenschauplatz</a></dt>
     <dd>Deutschlandfunk Kultur 🇩🇪, 2022-08-18</dd>
     <dt><a href="https://news.ycombinator.com/item?id=31536626" rel="nofollow">Marginalia Goes Open Source</a></dt>
     <dd>Hacker News, 2022-05-28</dd>
     <dt><a href="https://www.youtube.com/watch?v=rTSEr0cRJY8" rel="nofollow">You Should Check Out the Indie Web</a> 🎞️</dt>
     <dd>YouTube, You've Got Kat, 2022-03-15 </dd>
-    <dt>
-        <a href="https://www.newyorker.com/culture/infinite-scroll/what-google-search-isnt-showing-you" rel="nofollow">What Google Search Isn't Showing You</a>
-    </dt>
+    <dt> <a href="https://www.newyorker.com/culture/infinite-scroll/what-google-search-isnt-showing-you" rel="nofollow">What Google Search Isn't Showing You</a> </dt>
     <dd>The New Yorker 🎩, 2022-03-10</dd>
-    <dt>
-        <a href="https://www.metafilter.com/194653/Marginalia-Search-Serendipity-Engineering" rel="nofollow">Marginalia Search - Serendipity Engineering</a>
-    </dt>
+    <dt> <a href="https://www.metafilter.com/194653/Marginalia-Search-Serendipity-Engineering" rel="nofollow">Marginalia Search - Serendipity Engineering</a> </dt>
     <dd>MetaFilter, 2022-03-09</dd>
-    <dt>
-        🎂 <a href="https://memex.marginalia.nu/log/49-marginalia-1-year.gmi">First anniversary</a>! 🎊
-    </dt>
-    <dd>
-        2022-02-26
-    </dd>
-    <dt>
-        <a href="https://onezero.medium.com/a-search-engine-designed-to-surprise-you-b81944ed5c06" rel="nofollow">A Search Engine Designed To Surprise You</a>
-    </dt>
+    <dt> 🎂 <a href="https://memex.marginalia.nu/log/49-marginalia-1-year.gmi">First anniversary</a>! 🎊 </dt>
+    <dd> 2022-02-26 </dd>
+    <dt> <a href="https://onezero.medium.com/a-search-engine-designed-to-surprise-you-b81944ed5c06" rel="nofollow">A Search Engine Designed To Surprise You</a> </dt>
     <dd>Clive Thompson OneZero, 2021-09-16</dd>
-    <dt>
-        <a href="https://news.ycombinator.com/item?id=28550764" rel="nofollow"> A search engine that favors text-heavy sites and punishes modern web design</a>
-    </dt>
-    <dd>
-        Hacker News, 2021-09-16
-    </dd>
+    <dt> <a href="https://news.ycombinator.com/item?id=28550764" rel="nofollow"> A search engine that favors text-heavy sites and punishes modern web design</a> </dt>
+    <dd> Hacker News, 2021-09-16 </dd>
+    <dt>Development begins </dt>
+    <dd>2021-02-26</dd>
 </dl>
 </div>
 </section>
@@ -20,7 +20,9 @@
 {{>edge/parts/search-form}}

+<hr class="w3m-helper" />
 <section class="cards">
+
 {{#if maintenanceMessage}}<section class="card problems onlyscreen"><h2>Maintenance</h2><p class="description">{{maintenanceMessage}}</p></section>{{/if}}
 {{#if evalResult}}<section class="card semantic onlyscreen"><h2>Evaluation</h2><p class="description">{{query}} = {{evalResult}}</p><hr class="w3m-helper" /></section>{{/if}}
 {{#each wiki.entries}}<section class="card semantic onlyscreen"><h2>Encyclopedia</h2><p class="description"><a href="https://encyclopedia.marginalia.nu/wiki/{{.}}"><em>{{.}}</em> Encyclopedia Page</a></p><hr class="w3m-helper" /></section>{{/each}}
|
@ -75,7 +75,8 @@ class SqlLoadProcessedDocumentTest {
|
|||||||
EdgeHtmlStandard.HTML5,
|
EdgeHtmlStandard.HTML5,
|
||||||
100,
|
100,
|
||||||
12345,
|
12345,
|
||||||
-3.14
|
-3.14,
|
||||||
|
null
|
||||||
)));
|
)));
|
||||||
|
|
||||||
var details = dataStoreDao.getUrlDetailsMulti(new EdgeIdArray<>(loaderData.getUrlId(new EdgeUrl("https://www.marginalia.nu/"))));
|
var details = dataStoreDao.getUrlDetailsMulti(new EdgeIdArray<>(loaderData.getUrlId(new EdgeUrl("https://www.marginalia.nu/"))));
|
||||||
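The extra trailing null here is the new nullable publication-year argument on LoadProcessedDocument, matching the PUB_YEAR column added above; this test exercises the no-estimate case, leaving the column unset.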
|
@ -0,0 +1,254 @@
|
|||||||
|
package nu.marginalia.wmsa.edge.converting.processor.logic;
|
||||||
|
|
||||||
|
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser;
|
||||||
|
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateSniffer;
|
||||||
|
import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic.PubDateHeuristicDOMParsingPass2;
|
||||||
|
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||||
|
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
|
||||||
|
import org.jsoup.Jsoup;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.net.URISyntaxException;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
|
|
||||||
|
class PubDateSnifferTest {
|
||||||
|
|
||||||
|
PubDateSniffer dateSniffer = new PubDateSniffer();
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGetYearFromText() {
|
||||||
|
var ret = PubDateParser.dateFromHighestYearLookingSubstring("© 2005-2010 Bob Dobbs");
|
||||||
|
assertTrue(ret.isPresent());
|
||||||
|
assertEquals(2010, ret.get().year());
|
||||||
|
|
||||||
|
ret = PubDateParser.dateFromHighestYearLookingSubstring("© 99 Bob Dobbs");
|
||||||
|
assertFalse(ret.isPresent());
|
||||||
|
|
||||||
|
ret = PubDateParser.dateFromHighestYearLookingSubstring("© 1939 Bob Dobbs");
|
||||||
|
assertFalse(ret.isPresent());
|
||||||
|
|
||||||
|
ret = PubDateParser.dateFromHighestYearLookingSubstring("In the year 2525, if man is still alive");
|
||||||
|
assertFalse(ret.isPresent());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testParse() {
|
||||||
|
var ret = PubDateParser.attemptParseDate("2022-01-01");
|
||||||
|
assertTrue(ret.isPresent());
|
||||||
|
assertEquals("2022-01-01", ret.get().dateIso8601());
|
||||||
|
assertEquals(2022, ret.get().year());
|
||||||
|
|
||||||
|
ret = PubDateParser.attemptParseDate("2022-08-24T14:39:14Z");
|
||||||
|
assertTrue(ret.isPresent());
|
||||||
|
assertEquals("2022-08-24", ret.get().dateIso8601());
|
||||||
|
assertEquals(2022, ret.get().year());
|
||||||
|
|
||||||
|
ret = PubDateParser.attemptParseDate("2022-08-24T14:39:14");
|
||||||
|
assertTrue(ret.isPresent());
|
||||||
|
assertEquals("2022-08-24", ret.get().dateIso8601());
|
||||||
|
assertEquals(2022, ret.get().year());
|
||||||
|
|
||||||
|
ret = PubDateParser.attemptParseDate("Sun, 21 Oct 2018 12:16:24 GMT");
|
||||||
|
assertTrue(ret.isPresent());
|
||||||
|
assertEquals("2018-10-21", ret.get().dateIso8601());
|
||||||
|
assertEquals(2018, ret.get().year());
|
||||||
|
|
||||||
|
ret = PubDateParser.attemptParseDate("July 13, 2006");
|
||||||
|
assertTrue(ret.isPresent());
|
||||||
|
assertEquals(2006, ret.get().year());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testHtml5A() throws URISyntaxException {
|
||||||
|
var ret = dateSniffer.getPubDate("",
|
||||||
|
new EdgeUrl("https://www.example.com/"),
|
||||||
|
Jsoup.parse("""
|
||||||
|
<!doctype html>
|
||||||
|
<html>
|
||||||
|
<article>
|
||||||
|
<time pubdate="pubdate" datetime="2022-08-24">time</time>
|
||||||
|
Wow, sure lor 'em boss
|
||||||
|
</article>
|
||||||
|
"""), EdgeHtmlStandard.UNKNOWN, true);
|
||||||
|
|
||||||
|
assertFalse(ret.isEmpty());
|
||||||
|
assertEquals("2022-08-24", ret.dateIso8601());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testHtml5B() throws URISyntaxException {
|
||||||
|
var ret = dateSniffer.getPubDate("",
|
||||||
|
new EdgeUrl("https://www.example.com/"),
|
||||||
|
Jsoup.parse("""
|
||||||
|
<!doctype html>
|
||||||
|
<html>
|
||||||
|
<time>2022-08-24</time>
|
||||||
|
Wow, sure lor 'em boss
|
||||||
|
</article>
|
||||||
|
"""), EdgeHtmlStandard.UNKNOWN, true);
|
||||||
|
|
||||||
|
assertFalse(ret.isEmpty());
|
||||||
|
assertEquals("2022-08-24", ret.dateIso8601());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testHtml5C() throws URISyntaxException {
|
||||||
|
var ret = dateSniffer.getPubDate("",
|
||||||
|
new EdgeUrl("https://www.example.com/"),
|
||||||
|
Jsoup.parse("""
|
||||||
|
<!doctype html>
|
||||||
|
<html>
|
||||||
|
<time class="published" datetime="July 13, 2006">July 13, 2006</time>
|
||||||
|
Wow, sure lor 'em boss
|
||||||
|
</article>
|
||||||
|
"""), EdgeHtmlStandard.UNKNOWN, true);
|
||||||
|
|
||||||
|
assertFalse(ret.isEmpty());
|
||||||
|
assertEquals(2006, ret.year());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testProblemCases() throws IOException, URISyntaxException {
|
||||||
|
var ret = dateSniffer.getPubDate("",
|
||||||
|
new EdgeUrl("https://www.example.com/"),
|
||||||
|
Jsoup.parse(Files.readString(Path.of("/home/vlofgren/Code/tmp-data/The Switch to Linux Begins .html"))), EdgeHtmlStandard.HTML5, true);
|
||||||
|
|
||||||
|
assertFalse(ret.isEmpty());
|
||||||
|
assertEquals(2006, ret.year());
|
||||||
|
|
||||||
|
ret = dateSniffer.getPubDate("",
|
||||||
|
new EdgeUrl("https://www.example.com/"),
|
||||||
|
Jsoup.parse(Files.readString(Path.of("/home/vlofgren/Code/tmp-data/Black Hat USA 2010 Understanding and Deploying DNSSEC by Paul Wouters and Patrick Nauber.html"))), EdgeHtmlStandard.XHTML, true);
|
||||||
|
|
||||||
|
assertFalse(ret.isEmpty());
|
||||||
|
assertEquals(2010, ret.year());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGuessYear() {
|
||||||
|
System.out.println(PubDateParser.guessYear(2010, 2020));
|
||||||
|
System.out.println(PubDateParser.guessYear(2010, 2020));
|
||||||
|
System.out.println(PubDateParser.guessYear(2010, 2020));
|
||||||
|
System.out.println(PubDateParser.guessYear(2010, 2020));
|
||||||
|
System.out.println(PubDateParser.guessYear(2010, 2020));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testMicrodata() throws URISyntaxException {
|
||||||
|
var ret = dateSniffer.getPubDate("",
|
||||||
|
new EdgeUrl("https://www.example.com/"),
|
||||||
|
Jsoup.parse("""
|
||||||
|
<!doctype html>
|
||||||
|
<html>
|
||||||
|
<meta itemprop="datePublished" content="2022-08-24" />
|
||||||
|
"""), EdgeHtmlStandard.UNKNOWN, true);
|
||||||
|
|
||||||
|
assertFalse(ret.isEmpty());
|
||||||
|
assertEquals("2022-08-24", ret.dateIso8601());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testRDFa() throws URISyntaxException {
|
||||||
|
var ret = dateSniffer.getPubDate("",
|
||||||
|
new EdgeUrl("https://www.example.com/"),
|
||||||
|
Jsoup.parse("""
|
||||||
|
<!doctype html>
|
||||||
|
<html>
|
||||||
|
<meta property="datePublished" content="2022-08-24" />
|
||||||
|
"""),EdgeHtmlStandard.UNKNOWN, true);
|
||||||
|
|
||||||
|
assertFalse(ret.isEmpty());
|
||||||
|
assertEquals("2022-08-24", ret.dateIso8601());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testLD() throws URISyntaxException {
|
||||||
|
var ret = dateSniffer.getPubDate("",
|
||||||
|
new EdgeUrl("https://www.example.com/"),
|
||||||
|
Jsoup.parse("""
|
||||||
|
<!doctype html>
|
||||||
|
<html>
|
||||||
|
<script type="application/ld+json">{"@context":"https:\\/\\/schema.org","@type":"Article","name":"In the Year 2525","url":"https:\\/\\/en.wikipedia.org\\/wiki\\/In_the_Year_2525","sameAs":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","mainEntity":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","author":{"@type":"Organization","name":"Contributors to Wikimedia projects"},"publisher":{"@type":"Organization","name":"Wikimedia Foundation, Inc.","logo":{"@type":"ImageObject","url":"https:\\/\\/www.wikimedia.org\\/static\\/images\\/wmf-hor-googpub.png"}},"datePublished":"2004-08-24T14:39:14Z","dateModified":"2022-10-20T11:54:37Z","image":"https:\\/\\/upload.wikimedia.org\\/wikipedia\\/commons\\/4\\/4a\\/In_the_Year_2525_by_Zager_and_Evans_US_vinyl_Side-A_RCA_release.png","headline":"song written and compsoed by Rick Evans, originally recorded by Zager and Evans and released in 1969"}</script><script type="application/ld+json">{"@context":"https:\\/\\/schema.org","@type":"Article","name":"In the Year 2525","url":"https:\\/\\/en.wikipedia.org\\/wiki\\/In_the_Year_2525","sameAs":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","mainEntity":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","author":{"@type":"Organization","name":"Contributors to Wikimedia projects"},"publisher":{"@type":"Organization","name":"Wikimedia Foundation, Inc.","logo":{"@type":"ImageObject","url":"https:\\/\\/www.wikimedia.org\\/static\\/images\\/wmf-hor-googpub.png"}},"datePublished":"2004-08-24T14:39:14Z","dateModified":"2022-10-20T11:54:37Z","image":"https:\\/\\/upload.wikimedia.org\\/wikipedia\\/commons\\/4\\/4a\\/In_the_Year_2525_by_Zager_and_Evans_US_vinyl_Side-A_RCA_release.png","headline":"song written and compsoed by Rick Evans, originally recorded by Zager and Evans and released in 1969"}</script>
|
||||||
|
"""), EdgeHtmlStandard.UNKNOWN, true);
|
||||||
|
|
||||||
|
assertFalse(ret.isEmpty());
|
||||||
|
assertEquals("2004-08-24", ret.dateIso8601());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testPath() throws URISyntaxException {
|
||||||
|
var ret = dateSniffer.getPubDate("",
|
||||||
|
new EdgeUrl("https://www.example.com/articles/2022/04/how-to-detect-dates"),
|
||||||
|
Jsoup.parse("""
|
||||||
|
<!doctype html>
|
||||||
|
<html>
|
||||||
|
<title>No date in the HTML</title>
|
||||||
|
"""), EdgeHtmlStandard.UNKNOWN, true);
|
||||||
|
|
||||||
|
assertFalse(ret.isEmpty());
|
||||||
|
assertNull(ret.dateIso8601());
|
||||||
|
assertEquals(2022, ret.year());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testHeader() throws URISyntaxException {
|
||||||
|
var ret = dateSniffer.getPubDate("content-type: application/pdf\netag: \"4fc0ba8a7f5090b6fa6be385dca206ec\"\nlast-modified: Thu, 03 Feb 2022 19:22:58 GMT\ncontent-length: 298819\ndate: Wed, 24 Aug 2022 19:48:52 GMT\ncache-control: public, no-transform, immutable, max-age\u003d31536000\naccess-control-expose-headers: Content-Length,Content-Disposition,Content-Range,Etag,Server-Timing,Vary,X-Cld-Error,X-Content-Type-Options\naccess-control-allow-origin: *\naccept-ranges: bytes\ntiming-allow-origin: *\nserver: Cloudinary\nstrict-transport-security: max-age\u003d604800\nx-content-type-options: nosniff\nserver-timing: akam;dur\u003d25;start\u003d2022-08-24T19:48:52.519Z;desc\u003dmiss,rtt;dur\u003d19,cloudinary;dur\u003d129;start\u003d2022-08-23T06:35:17.331Z\n",
|
||||||
|
new EdgeUrl("https://www.example.com/"),
|
||||||
|
Jsoup.parse("""
|
||||||
|
<!doctype html>
|
||||||
|
<html>
|
||||||
|
<title>No date in the HTML</title>
|
||||||
|
"""), EdgeHtmlStandard.UNKNOWN, true);
|
||||||
|
|
||||||
|
assertFalse(ret.isEmpty());
|
||||||
|
assertEquals("2022-02-03", ret.dateIso8601());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testDOM() throws URISyntaxException {
|
||||||
|
var ret = dateSniffer.getPubDate("",
|
||||||
|
new EdgeUrl("https://www.example.com/"),
|
||||||
|
Jsoup.parse("""
|
||||||
|
<!doctype html>
|
||||||
|
<html>
|
||||||
|
<p>Published 2003, updated 2022</p>
|
||||||
|
"""), EdgeHtmlStandard.HTML5, true);
|
||||||
|
|
||||||
|
assertFalse(ret.isEmpty());
|
||||||
|
assertNull(ret.dateIso8601());
|
||||||
|
assertEquals(2015, ret.year());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testCandidate() {
|
||||||
|
System.out.println(PubDateHeuristicDOMParsingPass2.isPossibleCandidate("(C) 2007"));
|
||||||
|
System.out.println(PubDateHeuristicDOMParsingPass2.isPossibleCandidate("(C) 2007-01-01"));
|
||||||
|
System.out.println(PubDateHeuristicDOMParsingPass2.isPossibleCandidate("(C) 01-01.2007"));
|
||||||
|
System.out.println(PubDateHeuristicDOMParsingPass2.isPossibleCandidate("Only $1999"));
|
||||||
|
System.out.println(PubDateHeuristicDOMParsingPass2.isPossibleCandidate("1998B"));
|
||||||
|
System.out.println(PubDateHeuristicDOMParsingPass2.isPossibleCandidate("1998B"));
|
||||||
|
System.out.println(PubDateHeuristicDOMParsingPass2.isPossibleCandidate("2010 black hat ™"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testOldInvision() throws URISyntaxException {
|
||||||
|
var ret = dateSniffer.getPubDate("",
|
||||||
|
new EdgeUrl("https://www.example.com/"),
|
||||||
|
Jsoup.parse("""
|
||||||
|
<!doctype html>
|
||||||
|
<html>
|
||||||
|
<div style="float: left;"> <b>Post subject:</b> Keyboards.</div><div style="float: right;"><span class="postdetails"><b><img src="./styles/subsilver2/imageset/icon_post_target.gif" width="12" height="9" alt="Post" title="Post" /> <a href="./viewtopic.php?p=34580&sid=cf0c13dedebb4fea1f03fa73e510cd9f#p34580">#1</a></b></span> <b>Posted:</b> Sun Oct 03, 2010 5:37 pm </div>
|
||||||
|
"""), EdgeHtmlStandard.UNKNOWN, true);
|
||||||
|
|
||||||
|
assertFalse(ret.isEmpty());
|
||||||
|
assertNull(ret.dateIso8601());
|
||||||
|
assertEquals(2010, ret.year());
|
||||||
|
}
|
||||||
|
}
|
@ -2,22 +2,91 @@ package nu.marginalia.wmsa.edge.converting.processor.logic;
|
|||||||
|
|
||||||
import org.jsoup.Jsoup;
|
import org.jsoup.Jsoup;
|
||||||
import org.junit.jupiter.api.Assertions;
|
import org.junit.jupiter.api.Assertions;
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import java.io.FileOutputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.PrintWriter;
|
||||||
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
|
import java.util.Comparator;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
|
|
||||||
class SummaryExtractorTest {
|
class SummaryExtractorTest {
|
||||||
|
SummaryExtractor summaryExtractor;
|
||||||
|
@BeforeEach
|
||||||
|
public void setUp() {
|
||||||
|
summaryExtractor = new SummaryExtractor(255);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testSummaryFilter() throws IOException {
|
||||||
|
String html = readClassPathFile("html/monadnock.html");
|
||||||
|
var doc = Jsoup.parse(html);
|
||||||
|
var filter = new SummaryExtractionFilter();
|
||||||
|
doc.filter(filter);
|
||||||
|
|
||||||
|
filter.statistics.entrySet().stream().sorted(Comparator.comparing(e -> -e.getValue().textLength()))
|
||||||
|
.filter(e -> e.getValue().textToTagRatio() > 0.75)
|
||||||
|
.filter(e -> e.getValue().isElement())
|
||||||
|
.filter(e -> e.getValue().textLength() > 32)
|
||||||
|
.filter(e -> e.getValue().pos() < filter.cnt / 2.)
|
||||||
|
.limit(5)
|
||||||
|
.forEach(e -> {
|
||||||
|
System.out.println(e.getKey().nodeName() + ":" + e.getValue() + " / " + e.getValue().textToTagRatio());
|
||||||
|
System.out.println(e.getValue().text());
|
||||||
|
});
|
||||||
|
}
|
||||||
|
@Test
|
||||||
|
public void testSummaryFilter3() throws IOException {
|
||||||
|
var data = Path.of("/home/vlofgren/Code/tmp-data/url-327999153");
|
||||||
|
String html = Files.readString(data);
|
||||||
|
var doc = Jsoup.parse(html);
|
||||||
|
var filter = new SummaryExtractionFilter();
|
||||||
|
doc.filter(filter);
|
||||||
|
|
||||||
|
filter.getSummary(255);
|
||||||
|
}
|
||||||
|
@Test
|
||||||
|
public void testSummaryFilter2() throws IOException {
|
||||||
|
var data = Path.of("/home/vlofgren/Code/tmp-data/");
|
||||||
|
|
||||||
|
System.out.println("Running");
|
||||||
|
|
||||||
|
var fos = new PrintWriter(new FileOutputStream("/tmp/summaryDiff.html"));
|
||||||
|
fos.println("<table>");
|
||||||
|
|
||||||
|
for (var file : Objects.requireNonNull(data.toFile().listFiles())) {
|
||||||
|
|
||||||
|
var doc = Jsoup.parse(Files.readString(file.toPath()));
|
||||||
|
fos.println("<tr><th colspan=2>" + file.getName() + "</th></tr>");
|
||||||
|
fos.println("<tr><td width=50%>");
|
||||||
|
var filter = new SummaryExtractionFilter();
|
||||||
|
|
||||||
|
doc.select("header,nav,#header,#nav,#navigation,.header,.nav,.navigation,ul,li").remove();
|
||||||
|
doc.filter(filter);
|
||||||
|
var ret = filter.getSummary(255);
|
||||||
|
|
||||||
|
fos.println(ret);
|
||||||
|
fos.println("</td><td width=50%>");
|
||||||
|
String summary = summaryExtractor.extractSummary(Jsoup.parse(Files.readString(file.toPath())));
|
||||||
|
fos.println(summary);
|
||||||
|
fos.println("</td></tr>");
|
||||||
|
}
|
||||||
|
|
||||||
|
fos.println("</table>");
|
||||||
|
fos.flush();
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void extractSurrey() throws IOException {
|
void extractSurrey() throws IOException {
|
||||||
String html = readClassPathFile("html/summarization/surrey.html");
|
String html = readClassPathFile("html/summarization/surrey.html");
|
||||||
SummaryExtractor se = new SummaryExtractor(255);
|
var doc = Jsoup.parse(html);
|
||||||
|
String summary = summaryExtractor.extractSummary(doc);
|
||||||
|
|
||||||
String summary = se.extractSummary(Jsoup.parse(html));
|
|
||||||
|
|
||||||
Assertions.assertFalse(summary.isBlank());
|
Assertions.assertFalse(summary.isBlank());
|
||||||
|
|
||||||
@ -27,9 +96,8 @@ class SummaryExtractorTest {
|
|||||||
@Test
|
@Test
|
||||||
void extractSurrey1() throws IOException {
|
void extractSurrey1() throws IOException {
|
||||||
String html = readClassPathFile("html/summarization/surrey.html.1");
|
String html = readClassPathFile("html/summarization/surrey.html.1");
|
||||||
SummaryExtractor se = new SummaryExtractor(255);
|
var doc = Jsoup.parse(html);
|
||||||
|
String summary = summaryExtractor.extractSummary(doc);
|
||||||
String summary = se.extractSummary(Jsoup.parse(html));
|
|
||||||
|
|
||||||
Assertions.assertFalse(summary.isBlank());
|
Assertions.assertFalse(summary.isBlank());
|
||||||
|
|
||||||
@ -39,9 +107,8 @@ class SummaryExtractorTest {
|
|||||||
@Test
|
@Test
|
||||||
void extract187() throws IOException {
|
void extract187() throws IOException {
|
||||||
String html = readClassPathFile("html/summarization/187.shtml");
|
String html = readClassPathFile("html/summarization/187.shtml");
|
||||||
SummaryExtractor se = new SummaryExtractor(255);
|
var doc = Jsoup.parse(html);
|
||||||
|
String summary = summaryExtractor.extractSummary(doc);
|
||||||
String summary = se.extractSummary(Jsoup.parse(html));
|
|
||||||
|
|
||||||
Assertions.assertFalse(summary.isBlank());
|
Assertions.assertFalse(summary.isBlank());
|
||||||
|
|
||||||
@ -51,9 +118,9 @@ class SummaryExtractorTest {
|
|||||||
@Test
|
@Test
|
||||||
void extractMonadnock() throws IOException {
|
void extractMonadnock() throws IOException {
|
||||||
String html = readClassPathFile("html/monadnock.html");
|
String html = readClassPathFile("html/monadnock.html");
|
||||||
SummaryExtractor se = new SummaryExtractor(255);
|
|
||||||
|
|
||||||
String summary = se.extractSummary(Jsoup.parse(html));
|
var doc = Jsoup.parse(html);
|
||||||
|
String summary = summaryExtractor.extractSummary(doc);
|
||||||
|
|
||||||
Assertions.assertFalse(summary.isBlank());
|
Assertions.assertFalse(summary.isBlank());
|
||||||
|
|
||||||
@ -63,9 +130,9 @@ class SummaryExtractorTest {
|
|||||||
@Test
|
@Test
|
||||||
public void testWorkSet() throws IOException {
|
public void testWorkSet() throws IOException {
|
||||||
var workSet = readWorkSet();
|
var workSet = readWorkSet();
|
||||||
SummaryExtractor se = new SummaryExtractor(255);
|
|
||||||
workSet.forEach((path, str) -> {
|
workSet.forEach((path, str) -> {
|
||||||
String summary = se.extractSummary(Jsoup.parse(str));
|
var doc = Jsoup.parse(str);
|
||||||
|
String summary = summaryExtractor.extractSummary(doc);
|
||||||
System.out.println(path + ": " + summary);
|
System.out.println(path + ": " + summary);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@ -85,4 +152,5 @@ class SummaryExtractorTest {
|
|||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|