Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git
Specialized logic for processing Lemmy-based websites.
commit f8f9f04158 (parent b0c7480d06)
@@ -42,10 +42,13 @@ public class SummaryExtractor {
             String maybe = heuristic.summarize(parsed, importantWords);
             if (!maybe.isBlank()) {
                 String cleaned = truncatedCharacters.matcher(maybe).replaceAll(" ");
-                return StringUtils.abbreviate(cleaned, "", maxSummaryLength);
+                return abbreivateSummary(cleaned);
             }
         }
         return "";
     }
+
+    public String abbreivateSummary(String summary) {
+        return StringUtils.abbreviate(summary, "", maxSummaryLength);
+    }
 }
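The truncation logic moves into a reusable abbreivateSummary helper (the misspelling is in the source) so that callers outside extractSummary, such as the Lemmy specialization below, can apply the same length cap. A minimal sketch of the behaviour it wraps, assuming maxSummaryLength = 255 as in the test at the bottom of this commit; with an empty abbreviation marker, Commons Lang's StringUtils.abbreviate simply truncates:

import org.apache.commons.lang3.StringUtils;

public class AbbreviateSketch {
    public static void main(String[] args) {
        int maxSummaryLength = 255;                 // assumed; the real value is injected
        String summary = "lorem ipsum ".repeat(40); // 480 characters
        // an empty marker means a hard cut at maxWidth, with no trailing ellipsis
        String capped = StringUtils.abbreviate(summary, "", maxSummaryLength);
        System.out.println(capped.length());        // 255
    }
}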
@@ -79,6 +79,8 @@ dependencies {
     testImplementation libs.bundles.slf4j.test
     testImplementation libs.bundles.junit
     testImplementation libs.mockito
+
+    testImplementation project(':code:processes:test-data')
 }
 
 test {
@@ -36,7 +36,8 @@ public class DisqualifiedException extends Exception {
         ROBOTS_TXT,
         ERROR,
         Timeout, // Don't you dare
-        BAD_CANONICAL
+        BAD_CANONICAL,
+        IRRELEVANT
         ;
 
         public static DisqualificationReason fromCrawlerStatus(CrawlerDocumentStatus crawlerStatus) {
@@ -76,6 +76,13 @@ public class DocumentGeneratorExtractor {
             }
         }
 
+        for (var scriptTags : doc.head().select("script")) {
+            if (scriptTags.html().contains("window.lemmyConfig")) {
+                return DocumentGenerator.of("lemmy");
+            }
+        }
+
+
         return DocumentGenerator.unset();
     }
 
@@ -152,7 +159,7 @@ public class DocumentGeneratorExtractor {
             "notepad", "namo", "arachnophilia", "scite",
             "alleycode", "htmlkit", "acehtml", "bluefish", "htmled", "cutehtml", "fileedit", "cocoa"
                 -> GeneratorType.MANUAL;
-            case "vbulletin", "phpbb", "mybb", "nodebb", "flarum", "discourse"
+            case "vbulletin", "phpbb", "mybb", "nodebb", "flarum", "discourse", "lemmy"
                 -> GeneratorType.FORUM;
             case "mediawiki", "dokuwiki", "sharepoint"
                 -> GeneratorType.WIKI;
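Lemmy pages are recognized here not by a generator <meta> tag but by the window.lemmyConfig bootstrap script that Lemmy embeds in <head>. A self-contained sketch of the same heuristic against a synthetic page (the HTML below is invented, not crawl data):

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class LemmyDetectSketch {
    public static void main(String[] args) {
        Document doc = Jsoup.parse(
                "<html><head><script>window.lemmyConfig = {\"wsHost\":\"example.com\"};</script></head>"
                + "<body><p>hello</p></body></html>");

        boolean isLemmy = false;
        for (var scriptTag : doc.head().select("script")) {
            // jsoup exposes a <script> element's contents through html()
            if (scriptTag.html().contains("window.lemmyConfig")) {
                isLemmy = true;
            }
        }
        System.out.println(isLemmy); // true
    }
}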
@@ -5,15 +5,11 @@ import com.google.inject.Singleton;
 import com.google.inject.name.Named;
 import nu.marginalia.converting.model.DisqualifiedException;
 import nu.marginalia.language.model.DocumentLanguageData;
-import nu.marginalia.model.idx.DocumentFlags;
 
-import java.util.EnumSet;
 
 @Singleton
 public class DocumentLengthLogic {
     private final int minDocumentLength;
-    private final int shortDocumentLength = 2500;
-    private final int longDocumentLength = 7500;
 
     @Inject
     public DocumentLengthLogic(@Named("min-document-length") Integer minDocumentLength) {
@@ -31,8 +27,10 @@ public class DocumentLengthLogic {
         return (int) Math.round((totalWords / (double) numSentences) / 4.);
     }
 
-    public void validateLength(DocumentLanguageData dld) throws DisqualifiedException {
-        if (dld.totalNumWords() < minDocumentLength) {
+    public void validateLength(DocumentLanguageData dld,
+                               double modifier) throws DisqualifiedException
+    {
+        if (modifier * dld.totalNumWords() < minDocumentLength) {
             throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LENGTH);
         }
     }
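The new modifier parameter scales the effective word count before it is compared against the configured minimum, so that documents pruned down to their essentials are not unfairly disqualified. A worked example, assuming min-document-length = 250 (the real value is injected via @Named and does not appear in this diff):

public class LengthModifierSketch {
    public static void main(String[] args) {
        int minDocumentLength = 250; // assumed for illustration; injected in the real code
        int totalNumWords = 180;     // e.g. a Lemmy thread reduced to bare <p> text
        double modifier = 1.5;       // LemmySpecialization.lengthModifier()

        // the old check would disqualify the document: 180 < 250
        System.out.println(totalNumWords < minDocumentLength);            // true
        // with the modifier it survives: 1.5 * 180 = 270, and 270 >= 250
        System.out.println(modifier * totalNumWords < minDocumentLength); // false
    }
}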
@@ -4,13 +4,14 @@ import com.google.inject.Inject;
 import com.google.inject.name.Named;
 import nu.marginalia.converting.model.GeneratorType;
 import nu.marginalia.converting.processor.MetaRobotsTag;
-import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
 import nu.marginalia.converting.processor.logic.dom.MeasureLengthVisitor;
 import nu.marginalia.converting.processor.logic.links.FileLinks;
 import nu.marginalia.converting.processor.logic.links.LinkProcessor;
+import nu.marginalia.converting.processor.plugin.specialization.DefaultSpecialization;
+import nu.marginalia.converting.processor.plugin.specialization.HtmlProcessorSpecialization;
+import nu.marginalia.converting.processor.plugin.specialization.LemmySpecialization;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.model.crawl.HtmlFeature;
-import nu.marginalia.summary.SummaryExtractor;
 import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawledDomain;
@@ -49,7 +50,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
     private final FeatureExtractor featureExtractor;
     private final TitleExtractor titleExtractor;
     private final DocumentKeywordExtractor keywordExtractor;
-    private final SummaryExtractor summaryExtractor;
     private final PubDateSniffer pubDateSniffer;
 
     private final DocumentLengthLogic documentLengthLogic;
@@ -61,6 +61,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
     private static final LinkParser linkParser = new LinkParser();
     private static final FeedExtractor feedExtractor = new FeedExtractor(linkParser);
 
+    private final DefaultSpecialization defaultSpecialization;
+    private final LemmySpecialization lemmySpecialization;
+
     @Inject
     public HtmlDocumentProcessorPlugin(
             @Named("min-document-quality") Double minDocumentQuality,
@@ -68,11 +71,10 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
             FeatureExtractor featureExtractor,
             TitleExtractor titleExtractor,
             DocumentKeywordExtractor keywordExtractor,
-            SummaryExtractor summaryExtractor,
             PubDateSniffer pubDateSniffer,
             DocumentLengthLogic documentLengthLogic,
             MetaRobotsTag metaRobotsTag,
-            DocumentGeneratorExtractor documentGeneratorExtractor) {
+            DocumentGeneratorExtractor documentGeneratorExtractor, DefaultSpecialization defaultSpecialization, LemmySpecialization lemmySpecialization) {
         this.documentLengthLogic = documentLengthLogic;
         this.minDocumentQuality = minDocumentQuality;
         this.sentenceExtractor = sentenceExtractor;
@@ -80,11 +82,12 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
 
         this.titleExtractor = titleExtractor;
         this.keywordExtractor = keywordExtractor;
-        this.summaryExtractor = summaryExtractor;
         this.pubDateSniffer = pubDateSniffer;
         this.metaRobotsTag = metaRobotsTag;
 
         this.documentGeneratorExtractor = documentGeneratorExtractor;
+        this.defaultSpecialization = defaultSpecialization;
+        this.lemmySpecialization = lemmySpecialization;
     }
 
     @Override
@@ -110,7 +113,15 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
 
         final EdgeUrl url = new EdgeUrl(crawledDocument.url);
 
-        DocumentLanguageData dld = sentenceExtractor.extractSentences(prune(doc));
+        final var generatorParts = documentGeneratorExtractor.generatorCleaned(doc);
+
+        final var specialization = selectSpecialization(generatorParts);
+
+        if (!specialization.shouldIndex(url)) {
+            throw new DisqualifiedException(DisqualificationReason.IRRELEVANT);
+        }
+
+        DocumentLanguageData dld = sentenceExtractor.extractSentences(specialization.prune(doc));
 
         checkDocumentLanguage(dld);
 
@@ -127,7 +138,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
 
         // don't move this up! it uses title and quality
         // and is run before the heavy computations below
-        documentLengthLogic.validateLength(dld);
+        documentLengthLogic.validateLength(dld, specialization.lengthModifier());
         if (isDisqualified(url, ret)) {
             throw new DisqualifiedException(DisqualificationReason.QUALITY);
         }
@@ -138,8 +149,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
 
         PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, standard, true);
 
-        final var generatorParts = documentGeneratorExtractor.generatorCleaned(doc);
-
         EnumSet<DocumentFlags> documentFlags = documentFlags(features, generatorParts.type());
 
         ret.metadata = new DocumentMetadata(
@@ -148,10 +157,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
 
         DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url);
 
-        ret.description = getDescription(doc, words.importantWords);
-
-
-
+        ret.description = specialization.getSummary(doc, words.importantWords);
         ret.generator = generatorParts.type();
 
         var tagWords = new MetaTagsBuilder()
@@ -174,6 +180,16 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         return new DetailsWithWords(ret, words);
     }
 
+    /** Depending on the generator tag, we may want to use specialized logic for pruning and summarizing the document */
+    private HtmlProcessorSpecialization selectSpecialization(DocumentGeneratorExtractor.DocumentGenerator generatorParts) {
+
+        if (generatorParts.keywords().contains("lemmy")) {
+            return lemmySpecialization;
+        }
+
+        return defaultSpecialization;
+    }
+
     private EnumSet<DocumentFlags> documentFlags(Set<HtmlFeature> features, GeneratorType type) {
         EnumSet<DocumentFlags> flags = EnumSet.noneOf(DocumentFlags.class);
 
@@ -191,16 +207,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         return flags;
     }
 
-    private Document prune(Document doc) {
-        final var prunedDoc = doc.clone();
-
-        prunedDoc.getElementsByTag("svg").remove();
-        prunedDoc.body().filter(new DomPruningFilter(0.5));
-
-        return prunedDoc;
-    }
-
-
     private static final GuardedRegex mastodonFeedRegex = GuardedRegexFactory.startsWith("/@", "^/@[^/]+/?$");
 
     private boolean isDisqualified(EdgeUrl url, ProcessedDocumentDetails ret) {
@@ -285,23 +291,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         return htmlStandard;
     }
 
-    private String getDescription(Document doc,
-                                  Set<String> importantWords)
-    {
-        List<String> cleanedWords = new ArrayList<>(importantWords.size());
-
-        for (var word : importantWords) {
-            // summary extraction is not interested in n-grams
-            if (word.contains("_")) {
-                continue;
-            }
-
-            cleanedWords.add(word);
-        }
-
-        return summaryExtractor.extractSummary(doc, cleanedWords);
-    }
-
     private int getLength(Document doc) {
         var mlv = new MeasureLengthVisitor();
         doc.traverse(mlv);
@@ -70,7 +70,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
 
         checkDocumentLanguage(dld);
 
-        documentLengthLogic.validateLength(dld);
+        documentLengthLogic.validateLength(dld, 1.0);
 
         var ret = new ProcessedDocumentDetails();
 
@@ -0,0 +1,49 @@
+package nu.marginalia.converting.processor.plugin.specialization;
+
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
+import nu.marginalia.summary.SummaryExtractor;
+import org.jsoup.nodes.Document;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+
+@Singleton
+public class DefaultSpecialization implements HtmlProcessorSpecialization {
+
+    private final SummaryExtractor summaryExtractor;
+
+    @Inject
+    public DefaultSpecialization(SummaryExtractor summaryExtractor) {
+        this.summaryExtractor = summaryExtractor;
+    }
+
+    @Override
+    public Document prune(Document doc) {
+        final var prunedDoc = doc.clone();
+
+        prunedDoc.getElementsByTag("svg").remove();
+        prunedDoc.body().filter(new DomPruningFilter(0.5));
+
+        return prunedDoc;
+    }
+
+    @Override
+    public String getSummary(Document doc,
+                             Set<String> importantWords) {
+        List<String> cleanedWords = new ArrayList<>(importantWords.size());
+
+        for (var word : importantWords) {
+            // summary extraction is not interested in n-grams
+            if (word.contains("_")) {
+                continue;
+            }
+
+            cleanedWords.add(word);
+        }
+
+        return summaryExtractor.extractSummary(doc, cleanedWords);
+    }
+}
@@ -0,0 +1,19 @@
+package nu.marginalia.converting.processor.plugin.specialization;
+
+import nu.marginalia.model.EdgeUrl;
+import org.jsoup.nodes.Document;
+
+import java.util.Set;
+
+/** This interface is used to specify how to process a specific website.
+ *  The implementations of this interface are used by the HtmlProcessor to
+ *  process the HTML documents.
+ */
+public interface HtmlProcessorSpecialization {
+    Document prune(Document original);
+    String getSummary(Document original,
+                      Set<String> importantWords);
+
+    default boolean shouldIndex(EdgeUrl url) { return true; }
+    default double lengthModifier() { return 1.0; }
+}
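The contract is deliberately small, so further per-generator specializations can plug in the same way. A hypothetical sketch of a third implementation (the class and its rules below are invented, not part of this commit):

package nu.marginalia.converting.processor.plugin.specialization;

import nu.marginalia.model.EdgeUrl;
import org.jsoup.nodes.Document;

import java.util.Set;

// Hypothetical example: only prune() and getSummary() are mandatory;
// shouldIndex() and lengthModifier() keep their defaults unless overridden.
public class ExampleWikiSpecialization implements HtmlProcessorSpecialization {

    @Override
    public Document prune(Document original) {
        var pruned = original.clone();
        pruned.getElementsByTag("nav").remove(); // toy pruning rule
        return pruned;
    }

    @Override
    public String getSummary(Document original, Set<String> importantWords) {
        return original.title(); // toy summary
    }

    @Override
    public boolean shouldIndex(EdgeUrl url) {
        // e.g. skip auto-generated special pages; a plain string match for simplicity
        return !url.toString().contains("/Special:");
    }
}

To actually be selected, such a class would also need a branch in HtmlDocumentProcessorPlugin.selectSpecialization above.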
@@ -0,0 +1,67 @@
+package nu.marginalia.converting.processor.plugin.specialization;
+
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.summary.SummaryExtractor;
+import org.jsoup.nodes.Document;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Set;
+
+/** This class is used to specify how to process a website running Lemmy */
+@Singleton
+public class LemmySpecialization implements HtmlProcessorSpecialization {
+    private static final Logger logger = LoggerFactory.getLogger(LemmySpecialization.class);
+    private final SummaryExtractor summaryExtractor;
+
+    @Inject
+    public LemmySpecialization(SummaryExtractor summaryExtractor) {
+        this.summaryExtractor = summaryExtractor;
+    }
+
+    public Document prune(Document document) {
+
+        // Remove the sidebar
+
+        var newDoc = new Document(document.baseUri());
+        var bodyTag = newDoc.appendElement("body");
+
+        for (var pTag : document.getElementsByTag("p")) {
+            bodyTag.appendChild(newDoc.createElement("p").text(pTag.text()));
+        }
+
+        return newDoc;
+    }
+
+    public String getSummary(Document document, Set<String> importantWords) {
+        StringBuilder summary = new StringBuilder();
+
+        for (var pTag : document.getElementsByTag("p")) {
+            if (summary.length() > 512) {
+                break;
+            }
+            String text = pTag.text();
+
+            if (text.isBlank())
+                continue;
+
+            summary
+                .append(text)
+                .append(' ');
+        }
+
+        return summaryExtractor.abbreivateSummary(summary.toString());
+    }
+
+    /** Since we're stripping down the document to only contain the relevant comments,
+     *  we need to add an artificial length modifier to the document to avoid filtering out
+     *  documents that are of adequate length but fail to meet the minimum length requirement
+     *  that assumes a certain amount of chaff.
+     */
+    @Override
+    public double lengthModifier() {
+        return 1.5;
+    }
+}
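Note that prune() does not filter the crawled DOM in place the way DefaultSpecialization does; it rebuilds a fresh document from bare <p> text, dropping everything that is not paragraph content (the sidebar and the like) wholesale. A quick demonstration of the effect on a synthetic page (invented HTML, not crawl data):

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class LemmyPruneSketch {
    public static void main(String[] args) {
        Document doc = Jsoup.parse(
                "<body><div class=\"sidebar\">Trending communities</div>"
                + "<p>First comment</p><p>A reply</p></body>");

        // same approach as LemmySpecialization.prune(): copy only <p> text into a new document
        Document newDoc = new Document(doc.baseUri());
        var bodyTag = newDoc.appendElement("body");
        for (var pTag : doc.getElementsByTag("p")) {
            bodyTag.appendChild(newDoc.createElement("p").text(pTag.text()));
        }

        System.out.println(newDoc.body().html());
        // prints: <p>First comment</p>
        //         <p>A reply</p>
    }
}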
@@ -0,0 +1,56 @@
+package nu.marginalia.converting.processor.plugin.specialization;
+
+import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
+import nu.marginalia.summary.SummaryExtractor;
+import nu.marginalia.test.CommonTestData;
+import org.jsoup.Jsoup;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+
+import java.util.Set;
+
+class LemmySpecializationTest {
+
+    static LemmySpecialization specialization;
+    static DocumentGeneratorExtractor generatorExtractor = new DocumentGeneratorExtractor();
+
+    String lemmyIndexHtml = CommonTestData.loadTestData("mock-crawl-data/lemmy/index.html");
+    String lemmyPost = CommonTestData.loadTestData("mock-crawl-data/lemmy/108995.html");
+    String lemmyIndexC = CommonTestData.loadTestData("mock-crawl-data/lemmy/c_startrek.html");
+
+    @BeforeAll
+    public static void setUpAll() {
+        specialization = new LemmySpecialization(
+                new SummaryExtractor(255,
+                        null,
+                        null,
+                        null,
+                        null,
+                        null));
+    }
+
+    @Test
+    void prune() {
+        System.out.println(specialization.prune(Jsoup.parse(lemmyIndexHtml)));
+        System.out.println(specialization.prune(Jsoup.parse(lemmyPost)));
+    }
+
+    @Test
+    void generatorExtraction() {
+        var generatorIndex = generatorExtractor.generatorCleaned(Jsoup.parse(lemmyIndexHtml));
+        var generatorPost = generatorExtractor.generatorCleaned(Jsoup.parse(lemmyPost));
+
+        System.out.println(generatorIndex);
+        System.out.println(generatorPost);
+    }
+    @Test
+    void getSummary() {
+        String summaryPost = specialization.getSummary(Jsoup.parse(lemmyPost), Set.of(""));
+        String summaryIndex = specialization.getSummary(Jsoup.parse(lemmyIndexHtml), Set.of(""));
+        String summaryC = specialization.getSummary(Jsoup.parse(lemmyIndexC), Set.of(""));
+
+        System.out.println(summaryPost);
+        System.out.println(summaryIndex);
+        System.out.println(summaryC);
+    }
+}
@@ -14,13 +14,12 @@ import nu.marginalia.crawling.model.SerializableCrawlData;
 import nu.marginalia.crawling.model.spec.CrawlingSpecification;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
-import org.jsoup.Jsoup;
+import nu.marginalia.test.CommonTestData;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 import java.net.URISyntaxException;
-import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
@@ -54,20 +53,16 @@ public class CrawlerMockFetcherTest {
 
     @SneakyThrows
     private void registerUrlClasspathData(EdgeUrl url, String path) {
-        try (var resourceStream = getClass().getClassLoader().getResourceAsStream(path)) {
-            if (resourceStream == null) throw new IllegalArgumentException("No such resource: " + path);
+        var data = BigString.encode(CommonTestData.loadTestData(path));
 
-            var data = BigString.encode(new String(resourceStream.readAllBytes(), StandardCharsets.UTF_8));
-
-            mockData.put(url, CrawledDocument.builder()
-                    .crawlId("1")
-                    .url(url.toString())
-                    .contentType("text/html")
-                    .httpStatus(200)
-                    .crawlerStatus(CrawlerDocumentStatus.OK.name())
-                    .documentBody(data)
-                    .build());
-        }
+        mockData.put(url, CrawledDocument.builder()
+                .crawlId("1")
+                .url(url.toString())
+                .contentType("text/html")
+                .httpStatus(200)
+                .crawlerStatus(CrawlerDocumentStatus.OK.name())
+                .documentBody(data)
+                .build());
 
     }
 
@@ -0,0 +1,15 @@
+package nu.marginalia.test;
+
+import java.nio.charset.StandardCharsets;
+
+public class CommonTestData {
+    public static String loadTestData(String path) {
+        try (var resourceStream = CommonTestData.class.getClassLoader().getResourceAsStream(path)) {
+            if (resourceStream == null) throw new IllegalArgumentException("No such resource: " + path);
+
+            return new String(resourceStream.readAllBytes(), StandardCharsets.UTF_8);
+        } catch (Exception e) {
+            throw new RuntimeException(e);
+        }
+    }
+}