Specialized logic for processing Lemmy-based websites.

This commit is contained in:
Viktor Lofgren 2023-06-26 16:54:03 +02:00 committed by Viktor
parent b0c7480d06
commit f8f9f04158
13 changed files with 268 additions and 67 deletions

View File

@ -42,10 +42,13 @@ public class SummaryExtractor {
String maybe = heuristic.summarize(parsed, importantWords);
if (!maybe.isBlank()) {
String cleaned = truncatedCharacters.matcher(maybe).replaceAll(" ");
return StringUtils.abbreviate(cleaned, "", maxSummaryLength);
return abbreviateSummary(cleaned);
}
}
return "";
}
public String abbreviateSummary(String summary) {
return StringUtils.abbreviate(summary, "", maxSummaryLength);
}
}
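Pulling the truncation step out into a public method lets specializations that assemble their own summaries reuse the same length cap. A rough usage sketch (the input string is invented; maxSummaryLength comes from the extractor's configuration):

// Hypothetical call site; LemmySpecialization below uses it the same way.
String raw = "First comment text. Second comment text. ...";
String capped = summaryExtractor.abbreviateSummary(raw); // truncated to maxSummaryLength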

View File

@ -79,6 +79,8 @@ dependencies {
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
testImplementation project(':code:processes:test-data')
}
test {

View File

@ -36,7 +36,8 @@ public class DisqualifiedException extends Exception {
ROBOTS_TXT,
ERROR,
Timeout, // Don't you dare
BAD_CANONICAL
BAD_CANONICAL,
IRRELEVANT
;
public static DisqualificationReason fromCrawlerStatus(CrawlerDocumentStatus crawlerStatus) {

View File

@ -76,6 +76,13 @@ public class DocumentGeneratorExtractor {
}
}
for (var scriptTags : doc.head().select("script")) {
if (scriptTags.html().contains("window.lemmyConfig")) {
return DocumentGenerator.of("lemmy");
}
}
return DocumentGenerator.unset();
}
@ -152,7 +159,7 @@ public class DocumentGeneratorExtractor {
"notepad", "namo", "arachnophilia", "scite",
"alleycode", "htmlkit", "acehtml", "bluefish", "htmled", "cutehtml", "fileedit", "cocoa"
-> GeneratorType.MANUAL;
case "vbulletin", "phpbb", "mybb", "nodebb", "flarum", "discourse"
case "vbulletin", "phpbb", "mybb", "nodebb", "flarum", "discourse", "lemmy"
-> GeneratorType.FORUM;
case "mediawiki", "dokuwiki", "sharepoint"
-> GeneratorType.WIKI;

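A minimal sketch of the new detection on a Lemmy-like page (the HTML fragment is invented for illustration, not crawl data from the commit):

// Fragment resembling a Lemmy front page; invented for illustration.
var html = """
        <html><head>
          <script>window.lemmyConfig = {"site": "example"};</script>
        </head><body></body></html>
        """;
var doc = org.jsoup.Jsoup.parse(html);

// Mirrors the new loop above: any <script> in <head> that mentions
// window.lemmyConfig marks the generator as "lemmy", which the switch
// below then classifies as GeneratorType.FORUM.
boolean isLemmy = doc.head().select("script").stream()
        .anyMatch(tag -> tag.html().contains("window.lemmyConfig"));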
View File

@ -5,15 +5,11 @@ import com.google.inject.Singleton;
import com.google.inject.name.Named;
import nu.marginalia.converting.model.DisqualifiedException;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.model.idx.DocumentFlags;
import java.util.EnumSet;
@Singleton
public class DocumentLengthLogic {
private final int minDocumentLength;
private final int shortDocumentLength = 2500;
private final int longDocumentLength = 7500;
@Inject
public DocumentLengthLogic(@Named("min-document-length") Integer minDocumentLength) {
@ -31,8 +27,10 @@ public class DocumentLengthLogic {
return (int) Math.round((totalWords / (double) numSentences) / 4.);
}
public void validateLength(DocumentLanguageData dld) throws DisqualifiedException {
if (dld.totalNumWords() < minDocumentLength) {
public void validateLength(DocumentLanguageData dld,
double modifier) throws DisqualifiedException
{
if (modifier * dld.totalNumWords() < minDocumentLength) {
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LENGTH);
}
}

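A worked example of the new check (the numbers are illustrative; the real minimum is injected via @Named("min-document-length")):

int minDocumentLength = 250;   // assumed minimum, for illustration only
int totalNumWords = 200;       // words remaining after aggressive pruning
double modifier = 1.5;         // LemmySpecialization.lengthModifier()

// With the modifier: 1.5 * 200 = 300 >= 250, so the document survives.
// With the default modifier of 1.0 it would be disqualified for LENGTH.
boolean disqualified = modifier * totalNumWords < minDocumentLength;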
View File

@ -4,13 +4,14 @@ import com.google.inject.Inject;
import com.google.inject.name.Named;
import nu.marginalia.converting.model.GeneratorType;
import nu.marginalia.converting.processor.MetaRobotsTag;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.converting.processor.logic.dom.MeasureLengthVisitor;
import nu.marginalia.converting.processor.logic.links.FileLinks;
import nu.marginalia.converting.processor.logic.links.LinkProcessor;
import nu.marginalia.converting.processor.plugin.specialization.DefaultSpecialization;
import nu.marginalia.converting.processor.plugin.specialization.HtmlProcessorSpecialization;
import nu.marginalia.converting.processor.plugin.specialization.LemmySpecialization;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.summary.SummaryExtractor;
import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
@ -49,7 +50,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
private final FeatureExtractor featureExtractor;
private final TitleExtractor titleExtractor;
private final DocumentKeywordExtractor keywordExtractor;
private final SummaryExtractor summaryExtractor;
private final PubDateSniffer pubDateSniffer;
private final DocumentLengthLogic documentLengthLogic;
@ -61,6 +61,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
private static final LinkParser linkParser = new LinkParser();
private static final FeedExtractor feedExtractor = new FeedExtractor(linkParser);
private final DefaultSpecialization defaultSpecialization;
private final LemmySpecialization lemmySpecialization;
@Inject
public HtmlDocumentProcessorPlugin(
@Named("min-document-quality") Double minDocumentQuality,
@ -68,11 +71,10 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
FeatureExtractor featureExtractor,
TitleExtractor titleExtractor,
DocumentKeywordExtractor keywordExtractor,
SummaryExtractor summaryExtractor,
PubDateSniffer pubDateSniffer,
DocumentLengthLogic documentLengthLogic,
MetaRobotsTag metaRobotsTag,
DocumentGeneratorExtractor documentGeneratorExtractor) {
DocumentGeneratorExtractor documentGeneratorExtractor,
DefaultSpecialization defaultSpecialization,
LemmySpecialization lemmySpecialization) {
this.documentLengthLogic = documentLengthLogic;
this.minDocumentQuality = minDocumentQuality;
this.sentenceExtractor = sentenceExtractor;
@ -80,11 +82,12 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
this.titleExtractor = titleExtractor;
this.keywordExtractor = keywordExtractor;
this.summaryExtractor = summaryExtractor;
this.pubDateSniffer = pubDateSniffer;
this.metaRobotsTag = metaRobotsTag;
this.documentGeneratorExtractor = documentGeneratorExtractor;
this.defaultSpecialization = defaultSpecialization;
this.lemmySpecialization = lemmySpecialization;
}
@Override
@ -110,7 +113,15 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
final EdgeUrl url = new EdgeUrl(crawledDocument.url);
DocumentLanguageData dld = sentenceExtractor.extractSentences(prune(doc));
final var generatorParts = documentGeneratorExtractor.generatorCleaned(doc);
final var specialization = selectSpecialization(generatorParts);
if (!specialization.shouldIndex(url)) {
throw new DisqualifiedException(DisqualificationReason.IRRELEVANT);
}
DocumentLanguageData dld = sentenceExtractor.extractSentences(specialization.prune(doc));
checkDocumentLanguage(dld);
@ -127,7 +138,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
// don't move this up! it uses title and quality
// and is run before the heavy computations below
documentLengthLogic.validateLength(dld);
documentLengthLogic.validateLength(dld, specialization.lengthModifier());
if (isDisqualified(url, ret)) {
throw new DisqualifiedException(DisqualificationReason.QUALITY);
}
@ -138,8 +149,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, standard, true);
final var generatorParts = documentGeneratorExtractor.generatorCleaned(doc);
EnumSet<DocumentFlags> documentFlags = documentFlags(features, generatorParts.type());
ret.metadata = new DocumentMetadata(
@ -148,10 +157,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url);
ret.description = getDescription(doc, words.importantWords);
ret.description = specialization.getSummary(doc, words.importantWords);
ret.generator = generatorParts.type();
var tagWords = new MetaTagsBuilder()
@ -174,6 +180,16 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
return new DetailsWithWords(ret, words);
}
/** Depending on the generator tag, we may want to use specialized logic for pruning and summarizing the document */
private HtmlProcessorSpecialization selectSpecialization(DocumentGeneratorExtractor.DocumentGenerator generatorParts) {
if (generatorParts.keywords().contains("lemmy")) {
return lemmySpecialization;
}
return defaultSpecialization;
}
private EnumSet<DocumentFlags> documentFlags(Set<HtmlFeature> features, GeneratorType type) {
EnumSet<DocumentFlags> flags = EnumSet.noneOf(DocumentFlags.class);
@ -191,16 +207,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
return flags;
}
private Document prune(Document doc) {
final var prunedDoc = doc.clone();
prunedDoc.getElementsByTag("svg").remove();
prunedDoc.body().filter(new DomPruningFilter(0.5));
return prunedDoc;
}
private static final GuardedRegex mastodonFeedRegex = GuardedRegexFactory.startsWith("/@", "^/@[^/]+/?$");
private boolean isDisqualified(EdgeUrl url, ProcessedDocumentDetails ret) {
@ -285,23 +291,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
return htmlStandard;
}
private String getDescription(Document doc,
Set<String> importantWords)
{
List<String> cleanedWords = new ArrayList<>(importantWords.size());
for (var word : importantWords) {
// summary extraction is not interested in n-grams
if (word.contains("_")) {
continue;
}
cleanedWords.add(word);
}
return summaryExtractor.extractSummary(doc, cleanedWords);
}
private int getLength(Document doc) {
var mlv = new MeasureLengthVisitor();
doc.traverse(mlv);

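Read across the hunks, the document-processing path now runs roughly as follows (a condensed sketch stitched together from the changes above, not the literal method body):

var generatorParts = documentGeneratorExtractor.generatorCleaned(doc);
var specialization = selectSpecialization(generatorParts);   // "lemmy" -> LemmySpecialization

if (!specialization.shouldIndex(url))
    throw new DisqualifiedException(DisqualificationReason.IRRELEVANT);

// Pruning, length validation and summarization are all delegated to the specialization.
DocumentLanguageData dld = sentenceExtractor.extractSentences(specialization.prune(doc));
documentLengthLogic.validateLength(dld, specialization.lengthModifier());
ret.description = specialization.getSummary(doc, words.importantWords);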
View File

@ -70,7 +70,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
checkDocumentLanguage(dld);
documentLengthLogic.validateLength(dld);
documentLengthLogic.validateLength(dld, 1.0);
var ret = new ProcessedDocumentDetails();

View File

@ -0,0 +1,49 @@
package nu.marginalia.converting.processor.plugin.specialization;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.summary.SummaryExtractor;
import org.jsoup.nodes.Document;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
@Singleton
public class DefaultSpecialization implements HtmlProcessorSpecialization {
private final SummaryExtractor summaryExtractor;
@Inject
public DefaultSpecialization(SummaryExtractor summaryExtractor) {
this.summaryExtractor = summaryExtractor;
}
@Override
public Document prune(Document doc) {
final var prunedDoc = doc.clone();
prunedDoc.getElementsByTag("svg").remove();
prunedDoc.body().filter(new DomPruningFilter(0.5));
return prunedDoc;
}
@Override
public String getSummary(Document doc,
Set<String> importantWords) {
List<String> cleanedWords = new ArrayList<>(importantWords.size());
for (var word : importantWords) {
// summary extraction is not interested in n-grams
if (word.contains("_")) {
continue;
}
cleanedWords.add(word);
}
return summaryExtractor.extractSummary(doc, cleanedWords);
}
}

View File

@ -0,0 +1,19 @@
package nu.marginalia.converting.processor.plugin.specialization;
import nu.marginalia.model.EdgeUrl;
import org.jsoup.nodes.Document;
import java.util.Set;
/** Specifies how documents from a particular kind of website should be processed.
* Implementations are selected by HtmlDocumentProcessorPlugin based on the
* document's generator tag.
*/
public interface HtmlProcessorSpecialization {
Document prune(Document original);
String getSummary(Document original,
Set<String> importantWords);
default boolean shouldIndex(EdgeUrl url) { return true; }
default double lengthModifier() { return 1.0; }
}
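The default methods keep new specializations small; a hypothetical additional implementation (the class name and the selectors it drops are invented for illustration) only needs prune and getSummary, following the same pattern as DefaultSpecialization:

package nu.marginalia.converting.processor.plugin.specialization;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.summary.SummaryExtractor;
import org.jsoup.nodes.Document;
import java.util.List;
import java.util.Set;
// Hypothetical example, not part of this commit.
@Singleton
public class ExampleBlogSpecialization implements HtmlProcessorSpecialization {
    private final SummaryExtractor summaryExtractor;
    @Inject
    public ExampleBlogSpecialization(SummaryExtractor summaryExtractor) {
        this.summaryExtractor = summaryExtractor;
    }
    @Override
    public Document prune(Document original) {
        var pruned = original.clone();
        pruned.select("nav, footer, aside").remove(); // drop site chrome, keep the article body
        return pruned;
    }
    @Override
    public String getSummary(Document original, Set<String> importantWords) {
        return summaryExtractor.extractSummary(original, List.copyOf(importantWords));
    }
    // shouldIndex() and lengthModifier() fall back to the interface defaults.
}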

View File

@ -0,0 +1,67 @@
package nu.marginalia.converting.processor.plugin.specialization;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.summary.SummaryExtractor;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Set;
/** This class is used to specify how to process a website running Lemmy */
@Singleton
public class LemmySpecialization implements HtmlProcessorSpecialization {
private static final Logger logger = LoggerFactory.getLogger(LemmySpecialization.class);
private final SummaryExtractor summaryExtractor;
@Inject
public LemmySpecialization(SummaryExtractor summaryExtractor) {
this.summaryExtractor = summaryExtractor;
}
public Document prune(Document document) {
// Rebuild the document from its <p> elements only, discarding the sidebar and other markup
var newDoc = new Document(document.baseUri());
var bodyTag = newDoc.appendElement("body");
for (var pTag : document.getElementsByTag("p")) {
bodyTag.appendChild(newDoc.createElement("p").text(pTag.text()));
}
return newDoc;
}
public String getSummary(Document document, Set<String> importantWords) {
StringBuilder summary = new StringBuilder();
for (var pTag : document.getElementsByTag("p")) {
if (summary.length() > 512) {
break;
}
String text = pTag.text();
if (text.isBlank())
continue;
summary
.append(text)
.append(' ');
}
return summaryExtractor.abbreviateSummary(summary.toString());
}
/** Since we strip the document down to only the relevant comments,
* we apply an artificial length modifier so that documents of adequate
* length aren't filtered out by a minimum-length requirement that
* assumes a certain amount of chaff.
*/
@Override
public double lengthModifier() {
return 1.5;
}
}

View File

@ -0,0 +1,56 @@
package nu.marginalia.converting.processor.plugin.specialization;
import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
import nu.marginalia.summary.SummaryExtractor;
import nu.marginalia.test.CommonTestData;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import java.util.Set;
class LemmySpecializationTest {
static LemmySpecialization specialization;
static DocumentGeneratorExtractor generatorExtractor = new DocumentGeneratorExtractor();
String lemmyIndexHtml = CommonTestData.loadTestData("mock-crawl-data/lemmy/index.html");
String lemmyPost = CommonTestData.loadTestData("mock-crawl-data/lemmy/108995.html");
String lemmyIndexC = CommonTestData.loadTestData("mock-crawl-data/lemmy/c_startrek.html");
@BeforeAll
public static void setUpAll() {
specialization = new LemmySpecialization(
new SummaryExtractor(255,
null,
null,
null,
null,
null));
}
@Test
void prune() {
System.out.println(specialization.prune(Jsoup.parse(lemmyIndexHtml)));
System.out.println(specialization.prune(Jsoup.parse(lemmyPost)));
}
@Test
void generatorExtraction() {
var generatorIndex = generatorExtractor.generatorCleaned(Jsoup.parse(lemmyIndexHtml));
var generatorPost = generatorExtractor.generatorCleaned(Jsoup.parse(lemmyPost));
System.out.println(generatorIndex);
System.out.println(generatorPost);
}
@Test
void getSummary() {
String summaryPost = specialization.getSummary(Jsoup.parse(lemmyPost), Set.of(""));
String summaryIndex = specialization.getSummary(Jsoup.parse(lemmyIndexHtml), Set.of(""));
String summaryC = specialization.getSummary(Jsoup.parse(lemmyIndexC), Set.of(""));
System.out.println(summaryPost);
System.out.println(summaryIndex);
System.out.println(summaryC);
}
}

View File

@ -14,13 +14,12 @@ import nu.marginalia.crawling.model.SerializableCrawlData;
import nu.marginalia.crawling.model.spec.CrawlingSpecification;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import org.jsoup.Jsoup;
import nu.marginalia.test.CommonTestData;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
@ -54,20 +53,16 @@ public class CrawlerMockFetcherTest {
@SneakyThrows
private void registerUrlClasspathData(EdgeUrl url, String path) {
try (var resourceStream = getClass().getClassLoader().getResourceAsStream(path)) {
if (resourceStream == null) throw new IllegalArgumentException("No such resource: " + path);
var data = BigString.encode(CommonTestData.loadTestData(path));
var data = BigString.encode(new String(resourceStream.readAllBytes(), StandardCharsets.UTF_8));
mockData.put(url, CrawledDocument.builder()
.crawlId("1")
.url(url.toString())
.contentType("text/html")
.httpStatus(200)
.crawlerStatus(CrawlerDocumentStatus.OK.name())
.documentBody(data)
.build());
}
mockData.put(url, CrawledDocument.builder()
.crawlId("1")
.url(url.toString())
.contentType("text/html")
.httpStatus(200)
.crawlerStatus(CrawlerDocumentStatus.OK.name())
.documentBody(data)
.build());
}

View File

@ -0,0 +1,15 @@
package nu.marginalia.test;
import java.nio.charset.StandardCharsets;
public class CommonTestData {
public static String loadTestData(String path) {
try (var resourceStream = CommonTestData.class.getClassLoader().getResourceAsStream(path)) {
if (resourceStream == null) throw new IllegalArgumentException("No such resource: " + path);
return new String(resourceStream.readAllBytes(), StandardCharsets.UTF_8);
} catch (Exception e) {
throw new RuntimeException(e);
}
}
}