Specialized logic for processing Lemmy-based websites.

Viktor Lofgren 2023-06-26 16:54:03 +02:00 committed by Viktor
parent b0c7480d06
commit f8f9f04158
13 changed files with 268 additions and 67 deletions

View File

@@ -42,10 +42,13 @@ public class SummaryExtractor {
             String maybe = heuristic.summarize(parsed, importantWords);
             if (!maybe.isBlank()) {
                 String cleaned = truncatedCharacters.matcher(maybe).replaceAll(" ");
-                return StringUtils.abbreviate(cleaned, "", maxSummaryLength);
+                return abbreivateSummary(cleaned);
             }
         }

         return "";
     }

+    public String abbreivateSummary(String summary) {
+        return StringUtils.abbreviate(summary, "", maxSummaryLength);
+    }
 }
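
This change extracts the truncation step into a public helper so specializations can build their own summary text and still share the length cap. A minimal usage sketch, assuming the constructor shape used in the test further down (255 is maxSummaryLength; the null arguments stand in for heuristics that the helper never touches):

    import nu.marginalia.summary.SummaryExtractor;

    public class AbbreviateDemo {
        public static void main(String[] args) {
            // 255 = maxSummaryLength; the nulled heuristics are unused by abbreivateSummary()
            var extractor = new SummaryExtractor(255, null, null, null, null, null);

            String longText = "word ".repeat(100);
            String capped = extractor.abbreivateSummary(longText);

            // StringUtils.abbreviate with an empty marker simply cuts at the cap
            assert capped.length() <= 255;
        }
    }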

View File

@@ -79,6 +79,8 @@ dependencies {
     testImplementation libs.bundles.slf4j.test
     testImplementation libs.bundles.junit
     testImplementation libs.mockito
+
+    testImplementation project(':code:processes:test-data')
 }

 test {

View File

@@ -36,7 +36,8 @@ public class DisqualifiedException extends Exception {
         ROBOTS_TXT,
         ERROR,
         Timeout, // Don't you dare
-        BAD_CANONICAL
+        BAD_CANONICAL,
+        IRRELEVANT
         ;

         public static DisqualificationReason fromCrawlerStatus(CrawlerDocumentStatus crawlerStatus) {

View File

@@ -76,6 +76,13 @@ public class DocumentGeneratorExtractor {
             }
         }

+        for (var scriptTags : doc.head().select("script")) {
+            if (scriptTags.html().contains("window.lemmyConfig")) {
+                return DocumentGenerator.of("lemmy");
+            }
+        }
+
         return DocumentGenerator.unset();
     }

@@ -152,7 +159,7 @@ public class DocumentGeneratorExtractor {
             case "notepad", "namo", "arachnophilia", "scite",
                  "alleycode", "htmlkit", "acehtml", "bluefish", "htmled", "cutehtml", "fileedit", "cocoa"
                     -> GeneratorType.MANUAL;
-            case "vbulletin", "phpbb", "mybb", "nodebb", "flarum", "discourse"
+            case "vbulletin", "phpbb", "mybb", "nodebb", "flarum", "discourse", "lemmy"
                     -> GeneratorType.FORUM;
             case "mediawiki", "dokuwiki", "sharepoint"
                     -> GeneratorType.WIKI;
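
Lemmy pages don't advertise themselves through a meta generator tag, which is presumably why the new branch falls back to sniffing the inline config object that Lemmy's server-side rendering embeds. A self-contained sketch of the same heuristic (the markup below is a hypothetical stand-in for a crawled Lemmy page):

    import org.jsoup.Jsoup;

    public class LemmyDetectDemo {
        public static void main(String[] args) {
            // Hypothetical markup: Lemmy exposes its configuration
            // as window.lemmyConfig in an inline head script.
            var doc = Jsoup.parse("""
                    <html><head>
                      <script>window.lemmyConfig = {"wsHost":"example.com"};</script>
                    </head><body></body></html>""");

            boolean isLemmy = false;
            for (var script : doc.head().select("script")) {
                if (script.html().contains("window.lemmyConfig")) {
                    isLemmy = true;
                    break;
                }
            }
            System.out.println(isLemmy ? "lemmy" : "unset"); // prints "lemmy"
        }
    }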

View File

@@ -5,15 +5,11 @@ import com.google.inject.Singleton;
 import com.google.inject.name.Named;
 import nu.marginalia.converting.model.DisqualifiedException;
 import nu.marginalia.language.model.DocumentLanguageData;
-import nu.marginalia.model.idx.DocumentFlags;
-
-import java.util.EnumSet;

 @Singleton
 public class DocumentLengthLogic {
     private final int minDocumentLength;
-    private final int shortDocumentLength = 2500;
-    private final int longDocumentLength = 7500;

     @Inject
     public DocumentLengthLogic(@Named("min-document-length") Integer minDocumentLength) {
@@ -31,8 +27,10 @@ public class DocumentLengthLogic {
         return (int) Math.round((totalWords / (double) numSentences) / 4.);
     }

-    public void validateLength(DocumentLanguageData dld) throws DisqualifiedException {
-        if (dld.totalNumWords() < minDocumentLength) {
+    public void validateLength(DocumentLanguageData dld,
+                               double modifier) throws DisqualifiedException
+    {
+        if (modifier * dld.totalNumWords() < minDocumentLength) {
             throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LENGTH);
         }
     }
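
The modifier scales the observed word count before the minimum-length comparison, so a specialization that strips a page down to bare comment text can compensate for the chaff it discards. A worked example with a hypothetical configured minimum of 250 words:

    public class LengthModifierDemo {
        public static void main(String[] args) {
            int minDocumentLength = 250; // hypothetical @Named("min-document-length") value
            int totalNumWords = 180;     // words surviving Lemmy's paragraph-only pruning

            // default path: 1.0 * 180 = 180 < 250  -> DisqualificationReason.LENGTH
            // lemmy path:   1.5 * 180 = 270 >= 250 -> document is kept
            System.out.println(1.0 * totalNumWords < minDocumentLength); // true
            System.out.println(1.5 * totalNumWords < minDocumentLength); // false
        }
    }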

View File

@@ -4,13 +4,14 @@ import com.google.inject.Inject;
 import com.google.inject.name.Named;
 import nu.marginalia.converting.model.GeneratorType;
 import nu.marginalia.converting.processor.MetaRobotsTag;
-import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
 import nu.marginalia.converting.processor.logic.dom.MeasureLengthVisitor;
 import nu.marginalia.converting.processor.logic.links.FileLinks;
 import nu.marginalia.converting.processor.logic.links.LinkProcessor;
+import nu.marginalia.converting.processor.plugin.specialization.DefaultSpecialization;
+import nu.marginalia.converting.processor.plugin.specialization.HtmlProcessorSpecialization;
+import nu.marginalia.converting.processor.plugin.specialization.LemmySpecialization;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.model.crawl.HtmlFeature;
-import nu.marginalia.summary.SummaryExtractor;
 import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawledDomain;
@@ -49,7 +50,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
     private final FeatureExtractor featureExtractor;
     private final TitleExtractor titleExtractor;
     private final DocumentKeywordExtractor keywordExtractor;
-    private final SummaryExtractor summaryExtractor;
     private final PubDateSniffer pubDateSniffer;

     private final DocumentLengthLogic documentLengthLogic;
@@ -61,6 +61,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
     private static final LinkParser linkParser = new LinkParser();
     private static final FeedExtractor feedExtractor = new FeedExtractor(linkParser);

+    private final DefaultSpecialization defaultSpecialization;
+    private final LemmySpecialization lemmySpecialization;
+
     @Inject
     public HtmlDocumentProcessorPlugin(
             @Named("min-document-quality") Double minDocumentQuality,
@@ -68,11 +71,10 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
             FeatureExtractor featureExtractor,
             TitleExtractor titleExtractor,
             DocumentKeywordExtractor keywordExtractor,
-            SummaryExtractor summaryExtractor,
             PubDateSniffer pubDateSniffer,
             DocumentLengthLogic documentLengthLogic,
             MetaRobotsTag metaRobotsTag,
-            DocumentGeneratorExtractor documentGeneratorExtractor) {
+            DocumentGeneratorExtractor documentGeneratorExtractor, DefaultSpecialization defaultSpecialization, LemmySpecialization lemmySpecialization) {
         this.documentLengthLogic = documentLengthLogic;
         this.minDocumentQuality = minDocumentQuality;
         this.sentenceExtractor = sentenceExtractor;
@@ -80,11 +82,12 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         this.titleExtractor = titleExtractor;
         this.keywordExtractor = keywordExtractor;
-        this.summaryExtractor = summaryExtractor;
         this.pubDateSniffer = pubDateSniffer;
         this.metaRobotsTag = metaRobotsTag;
         this.documentGeneratorExtractor = documentGeneratorExtractor;
+        this.defaultSpecialization = defaultSpecialization;
+        this.lemmySpecialization = lemmySpecialization;
     }

     @Override
@@ -110,7 +113,15 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         final EdgeUrl url = new EdgeUrl(crawledDocument.url);

-        DocumentLanguageData dld = sentenceExtractor.extractSentences(prune(doc));
+        final var generatorParts = documentGeneratorExtractor.generatorCleaned(doc);
+
+        final var specialization = selectSpecialization(generatorParts);
+
+        if (!specialization.shouldIndex(url)) {
+            throw new DisqualifiedException(DisqualificationReason.IRRELEVANT);
+        }
+
+        DocumentLanguageData dld = sentenceExtractor.extractSentences(specialization.prune(doc));

         checkDocumentLanguage(dld);
@@ -127,7 +138,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin

         // don't move this up! it uses title and quality
         // and is run before the heavy computations below
-        documentLengthLogic.validateLength(dld);
+        documentLengthLogic.validateLength(dld, specialization.lengthModifier());
         if (isDisqualified(url, ret)) {
             throw new DisqualifiedException(DisqualificationReason.QUALITY);
         }
@@ -138,8 +149,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, standard, true);

-        final var generatorParts = documentGeneratorExtractor.generatorCleaned(doc);
-
         EnumSet<DocumentFlags> documentFlags = documentFlags(features, generatorParts.type());

         ret.metadata = new DocumentMetadata(
@@ -148,10 +157,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin

         DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url);

-        ret.description = getDescription(doc, words.importantWords);
+        ret.description = specialization.getSummary(doc, words.importantWords);
         ret.generator = generatorParts.type();

         var tagWords = new MetaTagsBuilder()
@@ -174,6 +180,16 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         return new DetailsWithWords(ret, words);
     }

+    /** Depending on the generator tag, we may want to use specialized logic for pruning and summarizing the document */
+    private HtmlProcessorSpecialization selectSpecialization(DocumentGeneratorExtractor.DocumentGenerator generatorParts) {
+        if (generatorParts.keywords().contains("lemmy")) {
+            return lemmySpecialization;
+        }
+
+        return defaultSpecialization;
+    }
+
     private EnumSet<DocumentFlags> documentFlags(Set<HtmlFeature> features, GeneratorType type) {
         EnumSet<DocumentFlags> flags = EnumSet.noneOf(DocumentFlags.class);

@@ -191,16 +207,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         return flags;
     }

-    private Document prune(Document doc) {
-        final var prunedDoc = doc.clone();
-
-        prunedDoc.getElementsByTag("svg").remove();
-        prunedDoc.body().filter(new DomPruningFilter(0.5));
-
-        return prunedDoc;
-    }
-
     private static final GuardedRegex mastodonFeedRegex = GuardedRegexFactory.startsWith("/@", "^/@[^/]+/?$");

     private boolean isDisqualified(EdgeUrl url, ProcessedDocumentDetails ret) {
@@ -285,23 +291,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         return htmlStandard;
     }

-    private String getDescription(Document doc,
-                                  Set<String> importantWords)
-    {
-        List<String> cleanedWords = new ArrayList<>(importantWords.size());
-
-        for (var word : importantWords) {
-            // summary extraction is not interested in n-grams
-            if (word.contains("_")) {
-                continue;
-            }
-
-            cleanedWords.add(word);
-        }
-
-        return summaryExtractor.extractSummary(doc, cleanedWords);
-    }
-
     private int getLength(Document doc) {
         var mlv = new MeasureLengthVisitor();
         doc.traverse(mlv);

View File

@@ -70,7 +70,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         checkDocumentLanguage(dld);

-        documentLengthLogic.validateLength(dld);
+        documentLengthLogic.validateLength(dld, 1.0);

         var ret = new ProcessedDocumentDetails();

View File

@@ -0,0 +1,49 @@
+package nu.marginalia.converting.processor.plugin.specialization;
+
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
+import nu.marginalia.summary.SummaryExtractor;
+import org.jsoup.nodes.Document;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+
+@Singleton
+public class DefaultSpecialization implements HtmlProcessorSpecialization {
+    private final SummaryExtractor summaryExtractor;
+
+    @Inject
+    public DefaultSpecialization(SummaryExtractor summaryExtractor) {
+        this.summaryExtractor = summaryExtractor;
+    }
+
+    @Override
+    public Document prune(Document doc) {
+        final var prunedDoc = doc.clone();
+
+        prunedDoc.getElementsByTag("svg").remove();
+        prunedDoc.body().filter(new DomPruningFilter(0.5));
+
+        return prunedDoc;
+    }
+
+    @Override
+    public String getSummary(Document doc,
+                             Set<String> importantWords) {
+        List<String> cleanedWords = new ArrayList<>(importantWords.size());
+
+        for (var word : importantWords) {
+            // summary extraction is not interested in n-grams
+            if (word.contains("_")) {
+                continue;
+            }
+
+            cleanedWords.add(word);
+        }
+
+        return summaryExtractor.extractSummary(doc, cleanedWords);
+    }
+}

View File

@@ -0,0 +1,19 @@
+package nu.marginalia.converting.processor.plugin.specialization;
+
+import nu.marginalia.model.EdgeUrl;
+import org.jsoup.nodes.Document;
+
+import java.util.Set;
+
+/** This interface is used to specify how to process a specific website.
+ * The implementations of this interface are used by the HtmlProcessor to
+ * process the HTML documents.
+ */
+public interface HtmlProcessorSpecialization {
+    Document prune(Document original);
+    String getSummary(Document original,
+                      Set<String> importantWords);
+
+    default boolean shouldIndex(EdgeUrl url) { return true; }
+    default double lengthModifier() { return 1.0; }
+}
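
The two default methods keep existing behavior intact; an implementation only has to supply prune() and getSummary(). For illustration, a hypothetical minimal implementation that indexes everything and summarizes from the first paragraph (not part of this commit):

    import org.jsoup.nodes.Document;
    import java.util.Set;

    public class PassthroughSpecialization implements HtmlProcessorSpecialization {
        @Override
        public Document prune(Document original) {
            return original.clone(); // downstream code mutates the document, so hand back a copy
        }

        @Override
        public String getSummary(Document original, Set<String> importantWords) {
            var firstParagraph = original.body().getElementsByTag("p").first();
            return firstParagraph == null ? "" : firstParagraph.text();
        }

        // shouldIndex() and lengthModifier() fall back to the interface defaults
    }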

View File

@@ -0,0 +1,67 @@
+package nu.marginalia.converting.processor.plugin.specialization;
+
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.summary.SummaryExtractor;
+import org.jsoup.nodes.Document;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Set;
+
+/** This class is used to specify how to process a website running Lemmy */
+@Singleton
+public class LemmySpecialization implements HtmlProcessorSpecialization {
+    private static final Logger logger = LoggerFactory.getLogger(LemmySpecialization.class);
+    private final SummaryExtractor summaryExtractor;
+
+    @Inject
+    public LemmySpecialization(SummaryExtractor summaryExtractor) {
+        this.summaryExtractor = summaryExtractor;
+    }
+
+    public Document prune(Document document) {
+        // Remove the sidebar
+
+        var newDoc = new Document(document.baseUri());
+        var bodyTag = newDoc.appendElement("body");
+
+        for (var pTag : document.getElementsByTag("p")) {
+            bodyTag.appendChild(newDoc.createElement("p").text(pTag.text()));
+        }
+
+        return newDoc;
+    }
+
+    public String getSummary(Document document, Set<String> importantWords) {
+        StringBuilder summary = new StringBuilder();
+
+        for (var pTag : document.getElementsByTag("p")) {
+            if (summary.length() > 512) {
+                break;
+            }
+
+            String text = pTag.text();
+
+            if (text.isBlank())
+                continue;
+
+            summary
+                .append(text)
+                .append(' ');
+        }
+
+        return summaryExtractor.abbreivateSummary(summary.toString());
+    }
+
+    /** Since we're stripping down the document to only contain the relevant comments,
+     * we need to add an artificial length modifier to the document to avoid filtering out
+     * documents that are of adequate length but fail to meet the minimum length requirement
+     * that assumes a certain amount of chaff.
+     */
+    @Override
+    public double lengthModifier() {
+        return 1.5;
+    }
+}
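
Note that prune() here is more radical than the sidebar comment suggests: it rebuilds the document from nothing but the text of its <p> tags, so navigation, sidebar chrome, and all other markup fall away and only paragraph text (the comments) survives. A quick illustration with simplified, hypothetical Lemmy markup:

    import org.jsoup.Jsoup;
    import nu.marginalia.converting.processor.plugin.specialization.LemmySpecialization;

    public class LemmyPruneDemo {
        public static void main(String[] args) {
            var doc = Jsoup.parse("""
                    <body>
                      <nav>Communities | Create Post</nav>
                      <div class="comment"><p>First comment.</p></div>
                      <aside><h2>Trending</h2></aside>
                    </body>""", "https://example.com/");

            // prune() never touches the SummaryExtractor, so null is safe here.
            // Prints roughly: <p>First comment.</p>
            // The <nav> and <aside> content vanishes because it isn't inside a <p>.
            System.out.println(new LemmySpecialization(null).prune(doc).body().html());
        }
    }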

View File

@@ -0,0 +1,56 @@
+package nu.marginalia.converting.processor.plugin.specialization;
+
+import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
+import nu.marginalia.summary.SummaryExtractor;
+import nu.marginalia.test.CommonTestData;
+import org.jsoup.Jsoup;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+
+import java.util.Set;
+
+class LemmySpecializationTest {
+    static LemmySpecialization specialization;
+    static DocumentGeneratorExtractor generatorExtractor = new DocumentGeneratorExtractor();
+
+    String lemmyIndexHtml = CommonTestData.loadTestData("mock-crawl-data/lemmy/index.html");
+    String lemmyPost = CommonTestData.loadTestData("mock-crawl-data/lemmy/108995.html");
+    String lemmyIndexC = CommonTestData.loadTestData("mock-crawl-data/lemmy/c_startrek.html");
+
+    @BeforeAll
+    public static void setUpAll() {
+        specialization = new LemmySpecialization(
+                new SummaryExtractor(255,
+                        null,
+                        null,
+                        null,
+                        null,
+                        null));
+    }
+
+    @Test
+    void prune() {
+        System.out.println(specialization.prune(Jsoup.parse(lemmyIndexHtml)));
+        System.out.println(specialization.prune(Jsoup.parse(lemmyPost)));
+    }
+
+    @Test
+    void generatorExtraction() {
+        var generatorIndex = generatorExtractor.generatorCleaned(Jsoup.parse(lemmyIndexHtml));
+        var generatorPost = generatorExtractor.generatorCleaned(Jsoup.parse(lemmyPost));
+
+        System.out.println(generatorIndex);
+        System.out.println(generatorPost);
+    }
+
+    @Test
+    void getSummary() {
+        String summaryPost = specialization.getSummary(Jsoup.parse(lemmyPost), Set.of(""));
+        String summaryIndex = specialization.getSummary(Jsoup.parse(lemmyIndexHtml), Set.of(""));
+        String summaryC = specialization.getSummary(Jsoup.parse(lemmyIndexC), Set.of(""));
+
+        System.out.println(summaryPost);
+        System.out.println(summaryIndex);
+        System.out.println(summaryC);
+    }
+}
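
The nulled constructor arguments are safe because getSummary() only reaches SummaryExtractor through abbreivateSummary(), which reads nothing but maxSummaryLength. The tests above only print their results; a hedged sketch of an assertion that could pin the behavior down (hypothetical, not in this commit):

    @Test
    void summaryRespectsMaxLength() {
        String summary = specialization.getSummary(Jsoup.parse(lemmyPost), Set.of(""));

        // 255 is the maxSummaryLength passed to SummaryExtractor in setUpAll()
        org.junit.jupiter.api.Assertions.assertTrue(summary.length() <= 255);
        org.junit.jupiter.api.Assertions.assertFalse(summary.isBlank());
    }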

View File

@@ -14,13 +14,12 @@ import nu.marginalia.crawling.model.SerializableCrawlData;
 import nu.marginalia.crawling.model.spec.CrawlingSpecification;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
-import org.jsoup.Jsoup;
+import nu.marginalia.test.CommonTestData;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

-import java.io.IOException;
 import java.net.URISyntaxException;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
@@ -54,10 +53,7 @@ public class CrawlerMockFetcherTest {

     @SneakyThrows
     private void registerUrlClasspathData(EdgeUrl url, String path) {
-        try (var resourceStream = getClass().getClassLoader().getResourceAsStream(path)) {
-            if (resourceStream == null) throw new IllegalArgumentException("No such resource: " + path);
-            var data = BigString.encode(new String(resourceStream.readAllBytes(), StandardCharsets.UTF_8));
+        var data = BigString.encode(CommonTestData.loadTestData(path));

         mockData.put(url, CrawledDocument.builder()
                 .crawlId("1")
@@ -67,7 +63,6 @@ public class CrawlerMockFetcherTest {
                 .crawlerStatus(CrawlerDocumentStatus.OK.name())
                 .documentBody(data)
                 .build());
-        }
     }

View File

@@ -0,0 +1,15 @@
+package nu.marginalia.test;
+
+import java.nio.charset.StandardCharsets;
+
+public class CommonTestData {
+    public static String loadTestData(String path) {
+        try (var resourceStream = CommonTestData.class.getClassLoader().getResourceAsStream(path)) {
+            if (resourceStream == null) throw new IllegalArgumentException("No such resource: " + path);
+
+            return new String(resourceStream.readAllBytes(), StandardCharsets.UTF_8);
+        } catch (Exception e) {
+            throw new RuntimeException(e);
+        }
+    }
+}
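
Usage mirrors the call sites earlier in the commit; resources resolve against the test classpath of any module that depends on :code:processes:test-data, and a missing file fails fast instead of returning null:

    // e.g. from a test in the converting-process module
    String html = CommonTestData.loadTestData("mock-crawl-data/lemmy/index.html");

    // a bad path surfaces immediately as IllegalArgumentException;
    // I/O failures are rethrown wrapped in RuntimeException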