Add generator fingerprint for xenforo.

Also clean up the specializations logic a bit, and add a barebones specialization for phpBB that filters out URL paths we aren't interested in, but doesn't touch the pruning or summarizing logic for now.
Viktor Lofgren 2023-07-01 14:43:49 +02:00
parent 9bd0e3ce58
commit a000256223
9 changed files with 2940 additions and 41 deletions
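
As a rough orientation (the DocumentGeneratorExtractor change that adds the actual fingerprint is not among the hunks shown below), here is a minimal sketch of what generator fingerprinting of this kind can look like. The class name, the keyword splitting, and the id="XF" fallback marker are illustrative assumptions, not the extractor's real logic:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.util.List;

/** Illustrative only: derive generator keywords from a page, falling back to a
 *  DOM marker for engines (like XenForo) that don't emit a generator meta tag.
 *  The fallback marker used here is an assumption, not the actual extractor logic. */
class GeneratorFingerprintSketch {
    static List<String> keywords(Document doc) {
        var generatorMeta = doc.selectFirst("meta[name=generator]");
        if (generatorMeta != null) {
            // e.g. "WordPress 6.2" -> ["wordpress", "6.2"]
            return List.of(generatorMeta.attr("content").toLowerCase().split("[\\s/]+"));
        }

        // XenForo pages don't advertise a generator; fall back to a DOM marker
        var htmlTag = doc.selectFirst("html");
        if (htmlTag != null && "XF".equals(htmlTag.id())) {
            return List.of("xenforo");
        }

        return List.of();
    }

    public static void main(String[] args) {
        Document doc = Jsoup.parse("<html id=\"XF\"><body><article class=\"message-inner\"></article></body></html>");
        System.out.println(keywords(doc)); // [xenforo]
    }
}

Keywords produced this way are what HtmlProcessorSpecializations.select() (below) matches on.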

HtmlDocumentProcessorPlugin.java

@@ -7,9 +7,7 @@ import nu.marginalia.converting.processor.MetaRobotsTag;
 import nu.marginalia.converting.processor.logic.dom.MeasureLengthVisitor;
 import nu.marginalia.converting.processor.logic.links.FileLinks;
 import nu.marginalia.converting.processor.logic.links.LinkProcessor;
-import nu.marginalia.converting.processor.plugin.specialization.DefaultSpecialization;
-import nu.marginalia.converting.processor.plugin.specialization.HtmlProcessorSpecialization;
-import nu.marginalia.converting.processor.plugin.specialization.LemmySpecialization;
+import nu.marginalia.converting.processor.plugin.specialization.*;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.link_parser.LinkParser;
@@ -61,8 +59,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
     private static final LinkParser linkParser = new LinkParser();
     private static final FeedExtractor feedExtractor = new FeedExtractor(linkParser);
-    private final DefaultSpecialization defaultSpecialization;
-    private final LemmySpecialization lemmySpecialization;
+    private final HtmlProcessorSpecializations htmlProcessorSpecializations;
     @Inject
     public HtmlDocumentProcessorPlugin(
@@ -74,7 +71,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
             PubDateSniffer pubDateSniffer,
             DocumentLengthLogic documentLengthLogic,
             MetaRobotsTag metaRobotsTag,
-            DocumentGeneratorExtractor documentGeneratorExtractor, DefaultSpecialization defaultSpecialization, LemmySpecialization lemmySpecialization) {
+            DocumentGeneratorExtractor documentGeneratorExtractor,
+            HtmlProcessorSpecializations specializations)
+    {
         this.documentLengthLogic = documentLengthLogic;
         this.minDocumentQuality = minDocumentQuality;
         this.sentenceExtractor = sentenceExtractor;
@@ -86,8 +85,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         this.metaRobotsTag = metaRobotsTag;
         this.documentGeneratorExtractor = documentGeneratorExtractor;
-        this.defaultSpecialization = defaultSpecialization;
-        this.lemmySpecialization = lemmySpecialization;
+        this.htmlProcessorSpecializations = specializations;
     }
     @Override
@@ -115,7 +113,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         final var generatorParts = documentGeneratorExtractor.generatorCleaned(doc);
-        final var specialization = selectSpecialization(generatorParts);
+        final var specialization = htmlProcessorSpecializations.select(generatorParts);
         if (!specialization.shouldIndex(url)) {
             throw new DisqualifiedException(DisqualificationReason.IRRELEVANT);
@@ -180,16 +178,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         return new DetailsWithWords(ret, words);
     }
-    /** Depending on the generator tag, we may want to use specialized logic for pruning and summarizing the document */
-    private HtmlProcessorSpecialization selectSpecialization(DocumentGeneratorExtractor.DocumentGenerator generatorParts) {
-        if (generatorParts.keywords().contains("lemmy")) {
-            return lemmySpecialization;
-        }
-        return defaultSpecialization;
-    }
     private EnumSet<DocumentFlags> documentFlags(Set<HtmlFeature> features, GeneratorType type) {
         EnumSet<DocumentFlags> flags = EnumSet.noneOf(DocumentFlags.class);

DefaultSpecialization.java

@@ -11,7 +11,7 @@ import java.util.List;
 import java.util.Set;
 @Singleton
-public class DefaultSpecialization implements HtmlProcessorSpecialization {
+public class DefaultSpecialization implements HtmlProcessorSpecializations.HtmlProcessorSpecializationIf {
     private final SummaryExtractor summaryExtractor;

HtmlProcessorSpecialization.java (deleted)

@@ -1,19 +0,0 @@
package nu.marginalia.converting.processor.plugin.specialization;

import nu.marginalia.model.EdgeUrl;
import org.jsoup.nodes.Document;

import java.util.Set;

/** This interface is used to specify how to process a specific website.
 * The implementations of this interface are used by the HtmlProcessor to
 * process the HTML documents.
 */
public interface HtmlProcessorSpecialization {
    Document prune(Document original);
    String getSummary(Document original,
                      Set<String> importantWords);

    default boolean shouldIndex(EdgeUrl url) { return true; }
    default double lengthModifier() { return 1.0; }
}

HtmlProcessorSpecializations.java (new file)

@@ -0,0 +1,54 @@
package nu.marginalia.converting.processor.plugin.specialization;

import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
import nu.marginalia.model.EdgeUrl;
import org.jsoup.nodes.Document;

import java.util.Set;

@Singleton
public class HtmlProcessorSpecializations {
    private final LemmySpecialization lemmySpecialization;
    private final XenForoSpecialization xenforoSpecialization;
    private final PhpBBSpecialization phpBBSpecialization;
    private final DefaultSpecialization defaultSpecialization;

    @Inject
    public HtmlProcessorSpecializations(LemmySpecialization lemmySpecialization,
                                        XenForoSpecialization xenforoSpecialization,
                                        PhpBBSpecialization phpBBSpecialization,
                                        DefaultSpecialization defaultSpecialization) {
        this.lemmySpecialization = lemmySpecialization;
        this.xenforoSpecialization = xenforoSpecialization;
        this.phpBBSpecialization = phpBBSpecialization;
        this.defaultSpecialization = defaultSpecialization;
    }

    /** Depending on the generator tag, we may want to use specialized logic for pruning and summarizing the document */
    public HtmlProcessorSpecializationIf select(DocumentGeneratorExtractor.DocumentGenerator generator) {
        if (generator.keywords().contains("lemmy")) {
            return lemmySpecialization;
        }
        if (generator.keywords().contains("xenforo")) {
            return xenforoSpecialization;
        }
        if (generator.keywords().contains("phpbb")) {
            return phpBBSpecialization;
        }
        return defaultSpecialization;
    }

    /** This interface is used to specify how to process a specific website.
     * The implementations of this interface are used by the HtmlProcessor to
     * process the HTML documents.
     */
    public interface HtmlProcessorSpecializationIf {
        Document prune(Document original);
        String getSummary(Document original,
                          Set<String> importantWords);

        default boolean shouldIndex(EdgeUrl url) { return true; }
        default double lengthModifier() { return 1.0; }
    }
}
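
To make the flow of the new interface easier to follow across the hunks above, here is a small illustrative sketch of how a caller composes select(), shouldIndex(), prune() and getSummary(). The class, method, and variable names are placeholders, not the actual plugin code; the real call site is HtmlDocumentProcessorPlugin:

import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
import nu.marginalia.converting.processor.plugin.specialization.HtmlProcessorSpecializations;
import nu.marginalia.model.EdgeUrl;
import org.jsoup.nodes.Document;

import java.util.Optional;
import java.util.Set;

/** Hypothetical call-site sketch, not the real HtmlDocumentProcessorPlugin logic. */
class SpecializationUsageSketch {
    private final HtmlProcessorSpecializations specializations;
    private final DocumentGeneratorExtractor generatorExtractor;

    SpecializationUsageSketch(HtmlProcessorSpecializations specializations,
                              DocumentGeneratorExtractor generatorExtractor) {
        this.specializations = specializations;
        this.generatorExtractor = generatorExtractor;
    }

    /** Returns a summary for the page, or empty if the selected specialization says it shouldn't be indexed. */
    Optional<String> summarize(EdgeUrl url, Document doc, Set<String> importantWords) {
        var generator = generatorExtractor.generatorCleaned(doc);
        var specialization = specializations.select(generator);

        if (!specialization.shouldIndex(url)) {
            return Optional.empty();                      // e.g. a phpBB URL that isn't a viewtopic.php thread
        }

        Document pruned = specialization.prune(doc);      // e.g. XenForo: keep only the post bodies and publish date
        return Optional.of(specialization.getSummary(pruned, importantWords));
    }
}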

LemmySpecialization.java

@@ -2,7 +2,6 @@ package nu.marginalia.converting.processor.plugin.specialization;
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
-import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.summary.SummaryExtractor;
 import org.jsoup.nodes.Document;
 import org.slf4j.Logger;
@@ -12,7 +11,7 @@ import java.util.Set;
 /** This class is used to specify how to process a website running Lemmy */
 @Singleton
-public class LemmySpecialization implements HtmlProcessorSpecialization {
+public class LemmySpecialization implements HtmlProcessorSpecializations.HtmlProcessorSpecializationIf {
     private static final Logger logger = LoggerFactory.getLogger(LemmySpecialization.class);
     private final SummaryExtractor summaryExtractor;

PhpBBSpecialization.java (new file)

@@ -0,0 +1,26 @@
package nu.marginalia.converting.processor.plugin.specialization;

import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.summary.SummaryExtractor;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.Set;

@Singleton
public class PhpBBSpecialization extends DefaultSpecialization {
    private static final Logger logger = LoggerFactory.getLogger(PhpBBSpecialization.class);

    @Inject
    public PhpBBSpecialization(SummaryExtractor summaryExtractor) {
        super(summaryExtractor);
    }

    @Override
    public boolean shouldIndex(EdgeUrl url) {
        return url.path.contains("viewtopic.php");
    }
}

XenForoSpecialization.java (new file)

@@ -0,0 +1,75 @@
package nu.marginalia.converting.processor.plugin.specialization;

import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.summary.SummaryExtractor;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.Set;

@Singleton
public class XenForoSpecialization implements HtmlProcessorSpecializations.HtmlProcessorSpecializationIf {
    private static final Logger logger = LoggerFactory.getLogger(XenForoSpecialization.class);
    private final SummaryExtractor summaryExtractor;

    @Inject
    public XenForoSpecialization(SummaryExtractor summaryExtractor) {
        this.summaryExtractor = summaryExtractor;
    }

    public Document prune(Document document) {
        // Remove the sidebar
        var newDoc = new Document(document.baseUri());
        var bodyTag = newDoc.appendElement("body");
        var article = bodyTag.appendElement("article");

        var firstTime = document.getElementsByTag("time").first();
        if (firstTime != null) {
            // Ensure we get the publish date
            var timeTag = newDoc.createElement("time");
            timeTag.attr("datetime", firstTime.attr("datetime"));
            timeTag.attr("pubdate", "pubdate");
            timeTag.text(firstTime.attr("datetime"));
            article.appendChild(timeTag);
        }

        for (var post : document.getElementsByClass("message-inner")) {
            String user = post.getElementsByClass("message-name").text();
            String text = post.getElementsByClass("bbWrapper").text();

            article.appendChild(newDoc.createElement("p").text(user + ": " + text));
        }

        return newDoc;
    }

    public String getSummary(Document document, Set<String> importantWords) {
        StringBuilder summary = new StringBuilder();

        for (var pTag : document.getElementsByClass("bbWrapper")) {
            if (summary.length() > 512) {
                break;
            }

            String text = pTag.text();
            if (text.isBlank())
                continue;

            summary
                .append(text)
                .append(' ');
        }

        return summaryExtractor.abbreivateSummary(summary.toString());
    }

    @Override
    public double lengthModifier() {
        return 1.25;
    }
}

XenForoSpecializationTest.java (new file)

@@ -0,0 +1,48 @@
package nu.marginalia.converting.processor.plugin.specialization;

import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
import nu.marginalia.summary.SummaryExtractor;
import nu.marginalia.test.CommonTestData;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;

import java.util.Set;

class XenForoSpecializationTest {
    static XenForoSpecialization specialization;
    static DocumentGeneratorExtractor generatorExtractor = new DocumentGeneratorExtractor();
    String thread = CommonTestData.loadTestData("mock-crawl-data/xenforo/thread.html");

    @BeforeAll
    public static void setUpAll() {
        specialization = new XenForoSpecialization(
                new SummaryExtractor(255,
                        null,
                        null,
                        null,
                        null,
                        null));
    }

    @Test
    void prune() {
        System.out.println(specialization.prune(Jsoup.parse(thread)));
    }

    @Test
    void generatorExtraction() {
        var gen = generatorExtractor.generatorCleaned(Jsoup.parse(thread));

        System.out.println(gen);
    }

    @Test
    void getSummary() {
        String summary = specialization.getSummary(Jsoup.parse(thread), Set.of(""));

        System.out.println(summary);
    }
}

File diff suppressed because it is too large.