Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git
Add generator fingerprint for xenforo.
Also clean up the specializations logic a bit, and add a barebones specialization for phpBB that skips paths we aren't interested in (only viewtopic.php pages are indexed) but doesn't touch the pruning or summarizing logic for now.
Commit a000256223 (parent 9bd0e3ce58)
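
For context: a XenForo install typically announces itself through a generator meta tag, and that fingerprint is what this commit teaches DocumentGeneratorExtractor to recognize (the extractor change itself is not among the hunks below). The sketch below is illustrative only; the sample markup and the tokenization step are assumptions, not the project's actual parsing code.

    import org.jsoup.Jsoup;

    import java.util.Arrays;
    import java.util.List;

    class GeneratorFingerprintSketch {
        public static void main(String[] args) {
            // Hypothetical XenForo page head; real installs emit something similar.
            var doc = Jsoup.parse(
                    "<html><head><meta name=\"generator\" content=\"XenForo 2.2.12\"></head><body></body></html>");

            String generator = doc.selectFirst("meta[name=generator]").attr("content");

            // Presumably the cleaned generator string is broken into lower-case keywords,
            // e.g. ["xenforo", "2.2.12"]; HtmlProcessorSpecializations.select() (added in
            // this commit) dispatches on the "xenforo" keyword.
            List<String> keywords = Arrays.asList(generator.toLowerCase().split("[\\s/]+"));
            System.out.println(keywords.contains("xenforo")); // true
        }
    }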

HtmlDocumentProcessorPlugin.java

@@ -7,9 +7,7 @@ import nu.marginalia.converting.processor.MetaRobotsTag;
 import nu.marginalia.converting.processor.logic.dom.MeasureLengthVisitor;
 import nu.marginalia.converting.processor.logic.links.FileLinks;
 import nu.marginalia.converting.processor.logic.links.LinkProcessor;
-import nu.marginalia.converting.processor.plugin.specialization.DefaultSpecialization;
-import nu.marginalia.converting.processor.plugin.specialization.HtmlProcessorSpecialization;
-import nu.marginalia.converting.processor.plugin.specialization.LemmySpecialization;
+import nu.marginalia.converting.processor.plugin.specialization.*;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.link_parser.LinkParser;
@@ -61,8 +59,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
     private static final LinkParser linkParser = new LinkParser();
     private static final FeedExtractor feedExtractor = new FeedExtractor(linkParser);
 
-    private final DefaultSpecialization defaultSpecialization;
-    private final LemmySpecialization lemmySpecialization;
+    private final HtmlProcessorSpecializations htmlProcessorSpecializations;
 
     @Inject
     public HtmlDocumentProcessorPlugin(
@@ -74,7 +71,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
             PubDateSniffer pubDateSniffer,
             DocumentLengthLogic documentLengthLogic,
             MetaRobotsTag metaRobotsTag,
-            DocumentGeneratorExtractor documentGeneratorExtractor, DefaultSpecialization defaultSpecialization, LemmySpecialization lemmySpecialization) {
+            DocumentGeneratorExtractor documentGeneratorExtractor,
+            HtmlProcessorSpecializations specializations)
+    {
         this.documentLengthLogic = documentLengthLogic;
         this.minDocumentQuality = minDocumentQuality;
         this.sentenceExtractor = sentenceExtractor;
@@ -86,8 +85,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         this.metaRobotsTag = metaRobotsTag;
 
         this.documentGeneratorExtractor = documentGeneratorExtractor;
-        this.defaultSpecialization = defaultSpecialization;
-        this.lemmySpecialization = lemmySpecialization;
+        this.htmlProcessorSpecializations = specializations;
     }
 
     @Override
@@ -115,7 +113,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
 
         final var generatorParts = documentGeneratorExtractor.generatorCleaned(doc);
 
-        final var specialization = selectSpecialization(generatorParts);
+        final var specialization = htmlProcessorSpecializations.select(generatorParts);
 
         if (!specialization.shouldIndex(url)) {
             throw new DisqualifiedException(DisqualificationReason.IRRELEVANT);
@@ -180,16 +178,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         return new DetailsWithWords(ret, words);
     }
 
-    /** Depending on the generator tag, we may want to use specialized logic for pruning and summarizing the document */
-    private HtmlProcessorSpecialization selectSpecialization(DocumentGeneratorExtractor.DocumentGenerator generatorParts) {
-
-        if (generatorParts.keywords().contains("lemmy")) {
-            return lemmySpecialization;
-        }
-
-        return defaultSpecialization;
-    }
-
     private EnumSet<DocumentFlags> documentFlags(Set<HtmlFeature> features, GeneratorType type) {
         EnumSet<DocumentFlags> flags = EnumSet.noneOf(DocumentFlags.class);
 

DefaultSpecialization.java

@@ -11,7 +11,7 @@ import java.util.List;
 import java.util.Set;
 
 @Singleton
-public class DefaultSpecialization implements HtmlProcessorSpecialization {
+public class DefaultSpecialization implements HtmlProcessorSpecializations.HtmlProcessorSpecializationIf {
 
     private final SummaryExtractor summaryExtractor;
 

HtmlProcessorSpecialization.java (deleted)

@@ -1,19 +0,0 @@
-package nu.marginalia.converting.processor.plugin.specialization;
-
-import nu.marginalia.model.EdgeUrl;
-import org.jsoup.nodes.Document;
-
-import java.util.Set;
-
-/** This interface is used to specify how to process a specific website.
- * The implementations of this interface are used by the HtmlProcessor to
- * process the HTML documents.
- */
-public interface HtmlProcessorSpecialization {
-    Document prune(Document original);
-    String getSummary(Document original,
-                      Set<String> importantWords);
-
-    default boolean shouldIndex(EdgeUrl url) { return true; }
-    default double lengthModifier() { return 1.0; }
-}

HtmlProcessorSpecializations.java (new file)

@@ -0,0 +1,54 @@
+package nu.marginalia.converting.processor.plugin.specialization;
+
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
+import nu.marginalia.model.EdgeUrl;
+import org.jsoup.nodes.Document;
+
+import java.util.Set;
+
+@Singleton
+public class HtmlProcessorSpecializations {
+    private final LemmySpecialization lemmySpecialization;
+    private final XenForoSpecialization xenforoSpecialization;
+    private final PhpBBSpecialization phpBBSpecialization;
+    private final DefaultSpecialization defaultSpecialization;
+
+    @Inject
+    public HtmlProcessorSpecializations(LemmySpecialization lemmySpecialization,
+                                        XenForoSpecialization xenforoSpecialization,
+                                        PhpBBSpecialization phpBBSpecialization, DefaultSpecialization defaultSpecialization) {
+        this.lemmySpecialization = lemmySpecialization;
+        this.xenforoSpecialization = xenforoSpecialization;
+        this.phpBBSpecialization = phpBBSpecialization;
+        this.defaultSpecialization = defaultSpecialization;
+    }
+
+    /** Depending on the generator tag, we may want to use specialized logic for pruning and summarizing the document */
+    public HtmlProcessorSpecializationIf select(DocumentGeneratorExtractor.DocumentGenerator generator) {
+        if (generator.keywords().contains("lemmy")) {
+            return lemmySpecialization;
+        }
+        if (generator.keywords().contains("xenforo")) {
+            return xenforoSpecialization;
+        }
+        if (generator.keywords().contains("phpbb")) {
+            return phpBBSpecialization;
+        }
+        return defaultSpecialization;
+    }
+
+    /** This interface is used to specify how to process a specific website.
+     * The implementations of this interface are used by the HtmlProcessor to
+     * process the HTML documents.
+     */
+    public interface HtmlProcessorSpecializationIf {
+        Document prune(Document original);
+        String getSummary(Document original,
+                          Set<String> importantWords);
+
+        default boolean shouldIndex(EdgeUrl url) { return true; }
+        default double lengthModifier() { return 1.0; }
+    }
+}
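
As a rough orientation, the new registry could be exercised outside of Guice as sketched below. Two assumptions are baked in: LemmySpecialization's constructor is presumed to take only a SummaryExtractor (its signature is not shown in this diff), and DocumentGeneratorExtractor is presumed to surface a "xenforo" keyword for a page with a XenForo generator tag (that change is not shown here either). Treat this as an illustration of the dispatch, not the converter's actual wiring.

    import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
    import nu.marginalia.converting.processor.plugin.specialization.*;
    import nu.marginalia.summary.SummaryExtractor;
    import org.jsoup.Jsoup;

    class SpecializationDispatchSketch {
        public static void main(String[] args) {
            // Same bare-bones SummaryExtractor construction as in XenForoSpecializationTest below.
            var summaryExtractor = new SummaryExtractor(255, null, null, null, null, null);

            // Manual wiring; in the converter Guice provides all of this via @Singleton/@Inject.
            var specializations = new HtmlProcessorSpecializations(
                    new LemmySpecialization(summaryExtractor),    // assumed ctor, not shown in this diff
                    new XenForoSpecialization(summaryExtractor),
                    new PhpBBSpecialization(summaryExtractor),
                    new DefaultSpecialization(summaryExtractor)); // ctor implied by PhpBBSpecialization's super() call

            var doc = Jsoup.parse(
                    "<html><head><meta name=\"generator\" content=\"XenForo 2.2\"></head><body></body></html>");

            // Assuming generatorCleaned() surfaces a "xenforo" keyword for this page,
            // select() returns the XenForoSpecialization; anything unrecognized falls
            // back to DefaultSpecialization.
            var generator = new DocumentGeneratorExtractor().generatorCleaned(doc);
            var specialization = specializations.select(generator);

            System.out.println(specialization.getClass().getSimpleName());
        }
    }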

LemmySpecialization.java

@@ -2,7 +2,6 @@ package nu.marginalia.converting.processor.plugin.specialization;
 
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
-import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.summary.SummaryExtractor;
 import org.jsoup.nodes.Document;
 import org.slf4j.Logger;
@@ -12,7 +11,7 @@ import java.util.Set;
 
 /** This class is used to specify how to process a website running Lemmy */
 @Singleton
-public class LemmySpecialization implements HtmlProcessorSpecialization {
+public class LemmySpecialization implements HtmlProcessorSpecializations.HtmlProcessorSpecializationIf {
     private static final Logger logger = LoggerFactory.getLogger(LemmySpecialization.class);
     private final SummaryExtractor summaryExtractor;
 

PhpBBSpecialization.java (new file)

@@ -0,0 +1,26 @@
+package nu.marginalia.converting.processor.plugin.specialization;
+
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.summary.SummaryExtractor;
+import org.jsoup.nodes.Document;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Set;
+
+@Singleton
+public class PhpBBSpecialization extends DefaultSpecialization {
+    private static final Logger logger = LoggerFactory.getLogger(PhpBBSpecialization.class);
+
+    @Inject
+    public PhpBBSpecialization(SummaryExtractor summaryExtractor) {
+        super(summaryExtractor);
+    }
+
+    @Override
+    public boolean shouldIndex(EdgeUrl url) {
+        return url.path.contains("viewtopic.php");
+    }
+}
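
A quick sketch of what the path filter above accepts and rejects, assuming EdgeUrl can be constructed directly from a URL string (the forum hostname is made up). Passing null for the SummaryExtractor is harmless here because shouldIndex() never touches it.

    import nu.marginalia.converting.processor.plugin.specialization.PhpBBSpecialization;
    import nu.marginalia.model.EdgeUrl;

    class PhpBBShouldIndexSketch {
        public static void main(String[] args) throws Exception {
            var spec = new PhpBBSpecialization(null); // summary extraction is not exercised by shouldIndex()

            // Thread pages are kept for indexing...
            System.out.println(spec.shouldIndex(new EdgeUrl("https://forum.example.com/viewtopic.php?t=123"))); // true

            // ...while index, member and search pages are dropped at this stage.
            System.out.println(spec.shouldIndex(new EdgeUrl("https://forum.example.com/memberlist.php")));      // false
        }
    }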

XenForoSpecialization.java (new file)

@@ -0,0 +1,75 @@
+package nu.marginalia.converting.processor.plugin.specialization;
+
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import nu.marginalia.summary.SummaryExtractor;
+import org.jsoup.nodes.Document;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Set;
+
+@Singleton
+public class XenForoSpecialization implements HtmlProcessorSpecializations.HtmlProcessorSpecializationIf {
+    private static final Logger logger = LoggerFactory.getLogger(XenForoSpecialization.class);
+    private final SummaryExtractor summaryExtractor;
+
+    @Inject
+    public XenForoSpecialization(SummaryExtractor summaryExtractor) {
+        this.summaryExtractor = summaryExtractor;
+    }
+
+    public Document prune(Document document) {
+
+        // Remove the sidebar
+
+        var newDoc = new Document(document.baseUri());
+        var bodyTag = newDoc.appendElement("body");
+        var article = bodyTag.appendElement("article");
+        var firstTime = document.getElementsByTag("time").first();
+
+        if (firstTime != null) {
+            // Ensure we get the publish date
+            var timeTag = newDoc.createElement("time");
+
+            timeTag.attr("datetime", firstTime.attr("datetime"));
+            timeTag.attr("pubdate", "pubdate");
+            timeTag.text(firstTime.attr("datetime"));
+
+            article.appendChild(timeTag);
+        }
+
+        for (var post : document.getElementsByClass("message-inner")) {
+            String user = post.getElementsByClass("message-name").text();
+            String text = post.getElementsByClass("bbWrapper").text();
+            article.appendChild(newDoc.createElement("p").text(user + ": " + text));
+        }
+
+        return newDoc;
+    }
+
+    public String getSummary(Document document, Set<String> importantWords) {
+        StringBuilder summary = new StringBuilder();
+
+        for (var pTag : document.getElementsByClass("bbWrapper")) {
+            if (summary.length() > 512) {
+                break;
+            }
+            String text = pTag.text();
+
+            if (text.isBlank())
+                continue;
+
+            summary
+                    .append(text)
+                    .append(' ');
+        }
+
+        return summaryExtractor.abbreivateSummary(summary.toString());
+    }
+
+    @Override
+    public double lengthModifier() {
+        return 1.25;
+    }
+}
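
To make the pruning concrete, here is a sketch of prune() run against invented thread markup that mimics XenForo's message-inner / message-name / bbWrapper classes (the class names come from the code above; the sample HTML is made up). Passing null for the SummaryExtractor is fine since prune() does not use it.

    import nu.marginalia.converting.processor.plugin.specialization.XenForoSpecialization;
    import org.jsoup.Jsoup;

    class XenForoPruneSketch {
        public static void main(String[] args) {
            var doc = Jsoup.parse("""
                    <html><body>
                      <div class="sidebar">Members online: 42</div>
                      <time datetime="2023-04-01T10:00:00+0000">Apr 1, 2023</time>
                      <div class="message-inner">
                        <span class="message-name">alice</span>
                        <div class="bbWrapper">Has anyone tried the new release yet?</div>
                      </div>
                      <div class="message-inner">
                        <span class="message-name">bob</span>
                        <div class="bbWrapper">Yes, upgraded last week without issues.</div>
                      </div>
                    </body></html>
                    """);

            var spec = new XenForoSpecialization(null); // prune() never touches the summary extractor

            // The result is a synthetic document: a <time pubdate> tag carrying the first post's
            // datetime, followed by one <p> per post ("alice: Has anyone tried ..."), with the
            // sidebar and all other page chrome dropped.
            System.out.println(spec.prune(doc));
        }
    }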

XenForoSpecializationTest.java (new file)

@@ -0,0 +1,48 @@
+package nu.marginalia.converting.processor.plugin.specialization;
+
+import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
+import nu.marginalia.summary.SummaryExtractor;
+import nu.marginalia.test.CommonTestData;
+import org.jsoup.Jsoup;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+
+import java.util.Set;
+
+class XenForoSpecializationTest {
+
+    static XenForoSpecialization specialization;
+    static DocumentGeneratorExtractor generatorExtractor = new DocumentGeneratorExtractor();
+
+    String thread = CommonTestData.loadTestData("mock-crawl-data/xenforo/thread.html");
+
+    @BeforeAll
+    public static void setUpAll() {
+        specialization = new XenForoSpecialization(
+                new SummaryExtractor(255,
+                        null,
+                        null,
+                        null,
+                        null,
+                        null));
+    }
+
+    @Test
+    void prune() {
+        System.out.println(specialization.prune(Jsoup.parse(thread)));
+    }
+
+    @Test
+    void generatorExtraction() {
+        var gen = generatorExtractor.generatorCleaned(Jsoup.parse(thread));
+
+        System.out.println(gen);
+    }
+
+    @Test
+    void getSummary() {
+        String summary = specialization.getSummary(Jsoup.parse(thread), Set.of(""));
+
+        System.out.println(summary);
+    }
+}

One further file diff is suppressed because it is too large.