Specialized logic for processing Lemmy-based websites.
commit f8f9f04158
parent b0c7480d06
@@ -42,10 +42,13 @@ public class SummaryExtractor {
             String maybe = heuristic.summarize(parsed, importantWords);
             if (!maybe.isBlank()) {
                 String cleaned = truncatedCharacters.matcher(maybe).replaceAll(" ");
-                return StringUtils.abbreviate(cleaned, "", maxSummaryLength);
+                return abbreivateSummary(cleaned);
             }
         }
         return "";
     }
 
+    public String abbreivateSummary(String summary) {
+        return StringUtils.abbreviate(summary, "", maxSummaryLength);
+    }
 }
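The change above routes all summary truncation through the new `abbreivateSummary` helper (the spelling follows the source), so that callers outside the heuristic loop, notably the Lemmy specialization further down, can apply the same length cap. A minimal sketch of the underlying Commons Lang call; the 128-character cap is an assumed value for illustration, the real limit is the injected `maxSummaryLength`:

```java
import org.apache.commons.lang3.StringUtils;

public class AbbreviateDemo {
    public static void main(String[] args) {
        int maxSummaryLength = 128; // assumed value; the real cap comes from configuration

        String summary = "Lemmy is a federated link aggregator and forum. ".repeat(10);

        // With an empty abbreviation marker the string is cut hard at the
        // maximum width, with no trailing "..." appended
        String abbreviated = StringUtils.abbreviate(summary, "", maxSummaryLength);

        System.out.println(abbreviated.length()); // 128
    }
}
```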
|
@@ -79,6 +79,8 @@ dependencies {
     testImplementation libs.bundles.slf4j.test
     testImplementation libs.bundles.junit
     testImplementation libs.mockito
+
+    testImplementation project(':code:processes:test-data')
 }
 
 test {
@@ -36,7 +36,8 @@ public class DisqualifiedException extends Exception {
         ROBOTS_TXT,
         ERROR,
         Timeout, // Don't you dare
-        BAD_CANONICAL
+        BAD_CANONICAL,
+        IRRELEVANT
         ;
 
         public static DisqualificationReason fromCrawlerStatus(CrawlerDocumentStatus crawlerStatus) {
@@ -76,6 +76,13 @@ public class DocumentGeneratorExtractor {
             }
         }
 
+        for (var scriptTags : doc.head().select("script")) {
+            if (scriptTags.html().contains("window.lemmyConfig")) {
+                return DocumentGenerator.of("lemmy");
+            }
+        }
+
+
         return DocumentGenerator.unset();
     }
 
@@ -152,7 +159,7 @@ public class DocumentGeneratorExtractor {
                 "notepad", "namo", "arachnophilia", "scite",
                 "alleycode", "htmlkit", "acehtml", "bluefish", "htmled", "cutehtml", "fileedit", "cocoa"
                     -> GeneratorType.MANUAL;
-            case "vbulletin", "phpbb", "mybb", "nodebb", "flarum", "discourse"
+            case "vbulletin", "phpbb", "mybb", "nodebb", "flarum", "discourse", "lemmy"
                     -> GeneratorType.FORUM;
             case "mediawiki", "dokuwiki", "sharepoint"
                     -> GeneratorType.WIKI;
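Lemmy pages don't announce themselves with a `<meta name="generator">` tag, so the extractor above falls back to sniffing the inline `window.lemmyConfig` bootstrap script. A self-contained sketch of the same check with plain jsoup; the markup is an invented stand-in for a real Lemmy page:

```java
import org.jsoup.Jsoup;

public class LemmyDetectDemo {
    public static void main(String[] args) {
        // Invented markup mimicking Lemmy's inline config bootstrap
        var doc = Jsoup.parse("""
                <html><head>
                <script>window.lemmyConfig = {"wsHost":"example.com"};</script>
                </head><body></body></html>
                """);

        boolean isLemmy = false;
        for (var script : doc.head().select("script")) {
            // Element.html() exposes the script's inline source text
            if (script.html().contains("window.lemmyConfig")) {
                isLemmy = true;
            }
        }
        System.out.println(isLemmy); // true
    }
}
```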
|
@@ -5,15 +5,11 @@ import com.google.inject.Singleton;
 import com.google.inject.name.Named;
 import nu.marginalia.converting.model.DisqualifiedException;
 import nu.marginalia.language.model.DocumentLanguageData;
-import nu.marginalia.model.idx.DocumentFlags;
 
-import java.util.EnumSet;
 
 @Singleton
 public class DocumentLengthLogic {
     private final int minDocumentLength;
-    private final int shortDocumentLength = 2500;
-    private final int longDocumentLength = 7500;
 
     @Inject
     public DocumentLengthLogic(@Named("min-document-length") Integer minDocumentLength) {

@@ -31,8 +27,10 @@ public class DocumentLengthLogic {
         return (int) Math.round((totalWords / (double) numSentences) / 4.);
     }
 
-    public void validateLength(DocumentLanguageData dld) throws DisqualifiedException {
-        if (dld.totalNumWords() < minDocumentLength) {
+    public void validateLength(DocumentLanguageData dld,
+                               double modifier) throws DisqualifiedException
+    {
+        if (modifier * dld.totalNumWords() < minDocumentLength) {
             throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LENGTH);
         }
     }
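The new `modifier` parameter scales the effective word count before the minimum-length check, letting a specialization that strips most of a page's markup compensate for the lost bulk. A worked example with assumed numbers (the actual `min-document-length` value comes from configuration):

```java
public class LengthModifierDemo {
    public static void main(String[] args) {
        int minDocumentLength = 100;  // assumed config value, for illustration only
        int totalNumWords = 70;       // words surviving an aggressive pruning
        double modifier = 1.5;        // what LemmySpecialization.lengthModifier() returns

        // 1.5 * 70 = 105 >= 100: the document is kept. With the default
        // modifier of 1.0, 70 < 100 and it would be disqualified for LENGTH.
        boolean disqualified = modifier * totalNumWords < minDocumentLength;
        System.out.println(disqualified); // false
    }
}
```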
|
@@ -4,13 +4,14 @@ import com.google.inject.Inject;
 import com.google.inject.name.Named;
 import nu.marginalia.converting.model.GeneratorType;
 import nu.marginalia.converting.processor.MetaRobotsTag;
-import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
 import nu.marginalia.converting.processor.logic.dom.MeasureLengthVisitor;
 import nu.marginalia.converting.processor.logic.links.FileLinks;
 import nu.marginalia.converting.processor.logic.links.LinkProcessor;
+import nu.marginalia.converting.processor.plugin.specialization.DefaultSpecialization;
+import nu.marginalia.converting.processor.plugin.specialization.HtmlProcessorSpecialization;
+import nu.marginalia.converting.processor.plugin.specialization.LemmySpecialization;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.model.crawl.HtmlFeature;
-import nu.marginalia.summary.SummaryExtractor;
 import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawledDomain;

@@ -49,7 +50,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
     private final FeatureExtractor featureExtractor;
     private final TitleExtractor titleExtractor;
     private final DocumentKeywordExtractor keywordExtractor;
-    private final SummaryExtractor summaryExtractor;
     private final PubDateSniffer pubDateSniffer;
 
     private final DocumentLengthLogic documentLengthLogic;

@@ -61,6 +61,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
     private static final LinkParser linkParser = new LinkParser();
     private static final FeedExtractor feedExtractor = new FeedExtractor(linkParser);
 
+    private final DefaultSpecialization defaultSpecialization;
+    private final LemmySpecialization lemmySpecialization;
+
     @Inject
     public HtmlDocumentProcessorPlugin(
             @Named("min-document-quality") Double minDocumentQuality,

@@ -68,11 +71,10 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
             FeatureExtractor featureExtractor,
             TitleExtractor titleExtractor,
             DocumentKeywordExtractor keywordExtractor,
-            SummaryExtractor summaryExtractor,
             PubDateSniffer pubDateSniffer,
             DocumentLengthLogic documentLengthLogic,
             MetaRobotsTag metaRobotsTag,
-            DocumentGeneratorExtractor documentGeneratorExtractor) {
+            DocumentGeneratorExtractor documentGeneratorExtractor, DefaultSpecialization defaultSpecialization, LemmySpecialization lemmySpecialization) {
         this.documentLengthLogic = documentLengthLogic;
         this.minDocumentQuality = minDocumentQuality;
         this.sentenceExtractor = sentenceExtractor;

@@ -80,11 +82,12 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
 
         this.titleExtractor = titleExtractor;
         this.keywordExtractor = keywordExtractor;
-        this.summaryExtractor = summaryExtractor;
         this.pubDateSniffer = pubDateSniffer;
         this.metaRobotsTag = metaRobotsTag;
 
         this.documentGeneratorExtractor = documentGeneratorExtractor;
+        this.defaultSpecialization = defaultSpecialization;
+        this.lemmySpecialization = lemmySpecialization;
     }
 
     @Override

@@ -110,7 +113,15 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
 
         final EdgeUrl url = new EdgeUrl(crawledDocument.url);
 
-        DocumentLanguageData dld = sentenceExtractor.extractSentences(prune(doc));
+        final var generatorParts = documentGeneratorExtractor.generatorCleaned(doc);
+
+        final var specialization = selectSpecialization(generatorParts);
+
+        if (!specialization.shouldIndex(url)) {
+            throw new DisqualifiedException(DisqualificationReason.IRRELEVANT);
+        }
+
+        DocumentLanguageData dld = sentenceExtractor.extractSentences(specialization.prune(doc));
 
         checkDocumentLanguage(dld);
 
@@ -127,7 +138,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
 
         // don't move this up! it uses title and quality
         // and is run before the heavy computations below
-        documentLengthLogic.validateLength(dld);
+        documentLengthLogic.validateLength(dld, specialization.lengthModifier());
         if (isDisqualified(url, ret)) {
             throw new DisqualifiedException(DisqualificationReason.QUALITY);
         }

@@ -138,8 +149,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
 
         PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, standard, true);
 
-        final var generatorParts = documentGeneratorExtractor.generatorCleaned(doc);
-
         EnumSet<DocumentFlags> documentFlags = documentFlags(features, generatorParts.type());
 
         ret.metadata = new DocumentMetadata(

@@ -148,10 +157,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
 
         DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url);
 
-        ret.description = getDescription(doc, words.importantWords);
-
-
-
+        ret.description = specialization.getSummary(doc, words.importantWords);
         ret.generator = generatorParts.type();
 
         var tagWords = new MetaTagsBuilder()

@@ -174,6 +180,16 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         return new DetailsWithWords(ret, words);
     }
 
+    /** Depending on the generator tag, we may want to use specialized logic for pruning and summarizing the document */
+    private HtmlProcessorSpecialization selectSpecialization(DocumentGeneratorExtractor.DocumentGenerator generatorParts) {
+
+        if (generatorParts.keywords().contains("lemmy")) {
+            return lemmySpecialization;
+        }
+
+        return defaultSpecialization;
+    }
+
     private EnumSet<DocumentFlags> documentFlags(Set<HtmlFeature> features, GeneratorType type) {
         EnumSet<DocumentFlags> flags = EnumSet.noneOf(DocumentFlags.class);
 
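`selectSpecialization` is a single-point strategy dispatch: the generator keywords pick the pruning and summarizing behavior, and everything downstream works against the `HtmlProcessorSpecialization` interface. A self-contained miniature of the same pattern, with string transforms standing in for the document specializations:

```java
import java.util.Map;
import java.util.Set;
import java.util.function.UnaryOperator;

public class DispatchDemo {
    // Keyword -> specialization; UnaryOperator<String> plays the role of
    // HtmlProcessorSpecialization in this toy model
    static final Map<String, UnaryOperator<String>> SPECIAL = Map.of(
            "lemmy", html -> "[lemmy-pruned] " + html);
    static final UnaryOperator<String> DEFAULT = html -> html;

    static UnaryOperator<String> select(Set<String> generatorKeywords) {
        return SPECIAL.entrySet().stream()
                .filter(e -> generatorKeywords.contains(e.getKey()))
                .map(Map.Entry::getValue)
                .findFirst()
                .orElse(DEFAULT);
    }

    public static void main(String[] args) {
        System.out.println(select(Set.of("lemmy")).apply("<p>hi</p>"));     // [lemmy-pruned] <p>hi</p>
        System.out.println(select(Set.of("wordpress")).apply("<p>hi</p>")); // <p>hi</p>
    }
}
```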
@@ -191,16 +207,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         return flags;
     }
 
-    private Document prune(Document doc) {
-        final var prunedDoc = doc.clone();
-
-        prunedDoc.getElementsByTag("svg").remove();
-        prunedDoc.body().filter(new DomPruningFilter(0.5));
-
-        return prunedDoc;
-    }
-
-
     private static final GuardedRegex mastodonFeedRegex = GuardedRegexFactory.startsWith("/@", "^/@[^/]+/?$");
 
     private boolean isDisqualified(EdgeUrl url, ProcessedDocumentDetails ret) {

@@ -285,23 +291,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         return htmlStandard;
     }
 
-    private String getDescription(Document doc,
-                                  Set<String> importantWords)
-    {
-        List<String> cleanedWords = new ArrayList<>(importantWords.size());
-
-        for (var word : importantWords) {
-            // summary extraction is not interested in n-grams
-            if (word.contains("_")) {
-                continue;
-            }
-
-            cleanedWords.add(word);
-        }
-
-        return summaryExtractor.extractSummary(doc, cleanedWords);
-    }
-
     private int getLength(Document doc) {
         var mlv = new MeasureLengthVisitor();
         doc.traverse(mlv);

@@ -70,7 +70,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
 
         checkDocumentLanguage(dld);
 
-        documentLengthLogic.validateLength(dld);
+        documentLengthLogic.validateLength(dld, 1.0);
 
         var ret = new ProcessedDocumentDetails();
 
new file: DefaultSpecialization.java
@@ -0,0 +1,49 @@
+package nu.marginalia.converting.processor.plugin.specialization;
+
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
+import nu.marginalia.summary.SummaryExtractor;
+import org.jsoup.nodes.Document;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+
+@Singleton
+public class DefaultSpecialization implements HtmlProcessorSpecialization {
+
+    private final SummaryExtractor summaryExtractor;
+
+    @Inject
+    public DefaultSpecialization(SummaryExtractor summaryExtractor) {
+        this.summaryExtractor = summaryExtractor;
+    }
+
+    @Override
+    public Document prune(Document doc) {
+        final var prunedDoc = doc.clone();
+
+        prunedDoc.getElementsByTag("svg").remove();
+        prunedDoc.body().filter(new DomPruningFilter(0.5));
+
+        return prunedDoc;
+    }
+
+    @Override
+    public String getSummary(Document doc,
+                             Set<String> importantWords) {
+        List<String> cleanedWords = new ArrayList<>(importantWords.size());
+
+        for (var word : importantWords) {
+            // summary extraction is not interested in n-grams
+            if (word.contains("_")) {
+                continue;
+            }
+
+            cleanedWords.add(word);
+        }
+
+        return summaryExtractor.extractSummary(doc, cleanedWords);
+    }
+}
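Note that `prune` works on a clone, so the caller's parsed document stays intact for the later extraction stages. A rough jsoup sketch of why that matters, using only the `<svg>` removal (`DomPruningFilter` lives in the converter module):

```java
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class CloneBeforePruneDemo {
    public static void main(String[] args) {
        Document doc = Jsoup.parse("<body><svg><circle/></svg><p>kept</p></body>");

        // Clone first, as DefaultSpecialization.prune does
        Document pruned = doc.clone();
        pruned.getElementsByTag("svg").remove();

        System.out.println(doc.getElementsByTag("svg").size());    // 1: original untouched
        System.out.println(pruned.getElementsByTag("svg").size()); // 0: clone pruned
    }
}
```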
new file: HtmlProcessorSpecialization.java
@@ -0,0 +1,19 @@
+package nu.marginalia.converting.processor.plugin.specialization;
+
+import nu.marginalia.model.EdgeUrl;
+import org.jsoup.nodes.Document;
+
+import java.util.Set;
+
+/** This interface is used to specify how to process a specific website.
+ * The implementations of this interface are used by the HtmlProcessor to
+ * process the HTML documents.
+ */
+public interface HtmlProcessorSpecialization {
+    Document prune(Document original);
+    String getSummary(Document original,
+                      Set<String> importantWords);
+
+    default boolean shouldIndex(EdgeUrl url) { return true; }
+    default double lengthModifier() { return 1.0; }
+}
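Thanks to the two `default` methods, a new specialization only has to supply `prune` and `getSummary`, opting into `shouldIndex` or `lengthModifier` when needed. A hypothetical minimal implementation (not part of this commit) sketching the extension point:

```java
package nu.marginalia.converting.processor.plugin.specialization;

import org.jsoup.nodes.Document;

import java.util.Set;

// Hypothetical example, for illustration only: shouldIndex() and
// lengthModifier() fall back to the interface defaults (true and 1.0)
public class FirstParagraphSpecialization implements HtmlProcessorSpecialization {
    @Override
    public Document prune(Document original) {
        return original.clone(); // no actual pruning in this toy example
    }

    @Override
    public String getSummary(Document original, Set<String> importantWords) {
        var firstParagraph = original.selectFirst("p");
        return firstParagraph == null ? "" : firstParagraph.text();
    }
}
```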
new file: LemmySpecialization.java
@@ -0,0 +1,67 @@
+package nu.marginalia.converting.processor.plugin.specialization;
+
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.summary.SummaryExtractor;
+import org.jsoup.nodes.Document;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Set;
+
+/** This class is used to specify how to process a website running Lemmy */
+@Singleton
+public class LemmySpecialization implements HtmlProcessorSpecialization {
+    private static final Logger logger = LoggerFactory.getLogger(LemmySpecialization.class);
+    private final SummaryExtractor summaryExtractor;
+
+    @Inject
+    public LemmySpecialization(SummaryExtractor summaryExtractor) {
+        this.summaryExtractor = summaryExtractor;
+    }
+
+    public Document prune(Document document) {
+
+        // Remove the sidebar
+
+        var newDoc = new Document(document.baseUri());
+        var bodyTag = newDoc.appendElement("body");
+
+        for (var pTag : document.getElementsByTag("p")) {
+            bodyTag.appendChild(newDoc.createElement("p").text(pTag.text()));
+        }
+
+        return newDoc;
+    }
+
+    public String getSummary(Document document, Set<String> importantWords) {
+        StringBuilder summary = new StringBuilder();
+
+        for (var pTag : document.getElementsByTag("p")) {
+            if (summary.length() > 512) {
+                break;
+            }
+            String text = pTag.text();
+
+            if (text.isBlank())
+                continue;
+
+            summary
+                    .append(text)
+                    .append(' ');
+        }
+
+        return summaryExtractor.abbreivateSummary(summary.toString());
+    }
+
+    /** Since we're stripping down the document to only contain the relevant comments,
+     * we need to add an artificial length modifier to the document to avoid filtering out
+     * documents that are of adequate length but fail to meet the minimum length requirement
+     * that assumes a certain amount of chaff.
+     */
+    @Override
+    public double lengthModifier() {
+        return 1.5;
+    }
+}
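The two overrides work as a pair: `prune` rebuilds the page as a bare list of `<p>` texts, discarding the sidebar and all other markup, and the 1.5 `lengthModifier` then offsets the words lost in that stripping. A self-contained sketch of the paragraph-harvesting rebuild; the markup is an invented stand-in for a Lemmy thread:

```java
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class LemmyPruneDemo {
    public static void main(String[] args) {
        // Invented stand-in for a Lemmy comment thread
        Document doc = Jsoup.parse("""
                <body>
                  <nav class="sidebar">communities | modlog</nav>
                  <p>First comment on the thread.</p>
                  <p>Second comment, a reply.</p>
                </body>
                """);

        // Rebuild a minimal document holding only the paragraph text,
        // mirroring LemmySpecialization.prune
        Document newDoc = new Document(doc.baseUri());
        var bodyTag = newDoc.appendElement("body");
        for (var pTag : doc.getElementsByTag("p")) {
            bodyTag.appendChild(newDoc.createElement("p").text(pTag.text()));
        }

        System.out.println(newDoc.body().html());
        // <p>First comment on the thread.</p>
        // <p>Second comment, a reply.</p>
    }
}
```

`getSummary` then walks the same `<p>` elements, skipping blank ones and stopping once roughly 512 characters have been collected, before `abbreivateSummary` applies the final cap.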
new file: LemmySpecializationTest.java
@@ -0,0 +1,56 @@
+package nu.marginalia.converting.processor.plugin.specialization;
+
+import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
+import nu.marginalia.summary.SummaryExtractor;
+import nu.marginalia.test.CommonTestData;
+import org.jsoup.Jsoup;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+
+import java.util.Set;
+
+class LemmySpecializationTest {
+
+    static LemmySpecialization specialization;
+    static DocumentGeneratorExtractor generatorExtractor = new DocumentGeneratorExtractor();
+
+    String lemmyIndexHtml = CommonTestData.loadTestData("mock-crawl-data/lemmy/index.html");
+    String lemmyPost = CommonTestData.loadTestData("mock-crawl-data/lemmy/108995.html");
+    String lemmyIndexC = CommonTestData.loadTestData("mock-crawl-data/lemmy/c_startrek.html");
+
+    @BeforeAll
+    public static void setUpAll() {
+        specialization = new LemmySpecialization(
+                new SummaryExtractor(255,
+                        null,
+                        null,
+                        null,
+                        null,
+                        null));
+    }
+
+    @Test
+    void prune() {
+        System.out.println(specialization.prune(Jsoup.parse(lemmyIndexHtml)));
+        System.out.println(specialization.prune(Jsoup.parse(lemmyPost)));
+    }
+
+    @Test
+    void generatorExtraction() {
+        var generatorIndex = generatorExtractor.generatorCleaned(Jsoup.parse(lemmyIndexHtml));
+        var generatorPost = generatorExtractor.generatorCleaned(Jsoup.parse(lemmyPost));
+
+        System.out.println(generatorIndex);
+        System.out.println(generatorPost);
+    }
+    @Test
+    void getSummary() {
+        String summaryPost = specialization.getSummary(Jsoup.parse(lemmyPost), Set.of(""));
+        String summaryIndex = specialization.getSummary(Jsoup.parse(lemmyIndexHtml), Set.of(""));
+        String summaryC = specialization.getSummary(Jsoup.parse(lemmyIndexC), Set.of(""));
+
+        System.out.println(summaryPost);
+        System.out.println(summaryIndex);
+        System.out.println(summaryC);
+    }
+}

@@ -14,13 +14,12 @@ import nu.marginalia.crawling.model.SerializableCrawlData;
 import nu.marginalia.crawling.model.spec.CrawlingSpecification;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
-import org.jsoup.Jsoup;
+import nu.marginalia.test.CommonTestData;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.IOException;
 import java.net.URISyntaxException;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;

@@ -54,20 +53,16 @@ public class CrawlerMockFetcherTest {
 
     @SneakyThrows
     private void registerUrlClasspathData(EdgeUrl url, String path) {
-        try (var resourceStream = getClass().getClassLoader().getResourceAsStream(path)) {
-            if (resourceStream == null) throw new IllegalArgumentException("No such resource: " + path);
-
-            var data = BigString.encode(new String(resourceStream.readAllBytes(), StandardCharsets.UTF_8));
-
-            mockData.put(url, CrawledDocument.builder()
-                    .crawlId("1")
-                    .url(url.toString())
-                    .contentType("text/html")
-                    .httpStatus(200)
-                    .crawlerStatus(CrawlerDocumentStatus.OK.name())
-                    .documentBody(data)
-                    .build());
-        }
+        var data = BigString.encode(CommonTestData.loadTestData(path));
+
+        mockData.put(url, CrawledDocument.builder()
+                .crawlId("1")
+                .url(url.toString())
+                .contentType("text/html")
+                .httpStatus(200)
+                .crawlerStatus(CrawlerDocumentStatus.OK.name())
+                .documentBody(data)
+                .build());
     }
 
new file: CommonTestData.java
@@ -0,0 +1,15 @@
+package nu.marginalia.test;
+
+import java.nio.charset.StandardCharsets;
+
+public class CommonTestData {
+    public static String loadTestData(String path) {
+        try (var resourceStream = CommonTestData.class.getClassLoader().getResourceAsStream(path)) {
+            if (resourceStream == null) throw new IllegalArgumentException("No such resource: " + path);
+
+            return new String(resourceStream.readAllBytes(), StandardCharsets.UTF_8);
+        } catch (Exception e) {
+            throw new RuntimeException(e);
+        }
+    }
+}
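`CommonTestData` hoists the classpath-resource loading that `CrawlerMockFetcherTest` previously did inline into the shared test-data module, wrapping checked exceptions in `RuntimeException` so it can be used in field initializers, as `LemmySpecializationTest` does above. Typical usage against one of the fixtures added in this commit:

```java
import nu.marginalia.test.CommonTestData;

public class LoadTestDataDemo {
    public static void main(String[] args) {
        // The path is resolved against the test classpath root
        String html = CommonTestData.loadTestData("mock-crawl-data/lemmy/index.html");
        System.out.println(html.substring(0, Math.min(80, html.length())));
    }
}
```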