mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
Add generator fingerprint for xenforo.
Also clean up the specializations logic a bit, and add a barebones specialization for phpbb that cleans out paths we aren't interested in but doesn't touch pruning or summarizing logic for now.
This commit is contained in:
parent
9bd0e3ce58
commit
a000256223
@ -7,9 +7,7 @@ import nu.marginalia.converting.processor.MetaRobotsTag;
|
|||||||
import nu.marginalia.converting.processor.logic.dom.MeasureLengthVisitor;
|
import nu.marginalia.converting.processor.logic.dom.MeasureLengthVisitor;
|
||||||
import nu.marginalia.converting.processor.logic.links.FileLinks;
|
import nu.marginalia.converting.processor.logic.links.FileLinks;
|
||||||
import nu.marginalia.converting.processor.logic.links.LinkProcessor;
|
import nu.marginalia.converting.processor.logic.links.LinkProcessor;
|
||||||
import nu.marginalia.converting.processor.plugin.specialization.DefaultSpecialization;
|
import nu.marginalia.converting.processor.plugin.specialization.*;
|
||||||
import nu.marginalia.converting.processor.plugin.specialization.HtmlProcessorSpecialization;
|
|
||||||
import nu.marginalia.converting.processor.plugin.specialization.LemmySpecialization;
|
|
||||||
import nu.marginalia.language.model.DocumentLanguageData;
|
import nu.marginalia.language.model.DocumentLanguageData;
|
||||||
import nu.marginalia.model.crawl.HtmlFeature;
|
import nu.marginalia.model.crawl.HtmlFeature;
|
||||||
import nu.marginalia.link_parser.LinkParser;
|
import nu.marginalia.link_parser.LinkParser;
|
||||||
@ -61,8 +59,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
private static final LinkParser linkParser = new LinkParser();
|
private static final LinkParser linkParser = new LinkParser();
|
||||||
private static final FeedExtractor feedExtractor = new FeedExtractor(linkParser);
|
private static final FeedExtractor feedExtractor = new FeedExtractor(linkParser);
|
||||||
|
|
||||||
private final DefaultSpecialization defaultSpecialization;
|
private final HtmlProcessorSpecializations htmlProcessorSpecializations;
|
||||||
private final LemmySpecialization lemmySpecialization;
|
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public HtmlDocumentProcessorPlugin(
|
public HtmlDocumentProcessorPlugin(
|
||||||
@ -74,7 +71,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
PubDateSniffer pubDateSniffer,
|
PubDateSniffer pubDateSniffer,
|
||||||
DocumentLengthLogic documentLengthLogic,
|
DocumentLengthLogic documentLengthLogic,
|
||||||
MetaRobotsTag metaRobotsTag,
|
MetaRobotsTag metaRobotsTag,
|
||||||
DocumentGeneratorExtractor documentGeneratorExtractor, DefaultSpecialization defaultSpecialization, LemmySpecialization lemmySpecialization) {
|
DocumentGeneratorExtractor documentGeneratorExtractor,
|
||||||
|
HtmlProcessorSpecializations specializations)
|
||||||
|
{
|
||||||
this.documentLengthLogic = documentLengthLogic;
|
this.documentLengthLogic = documentLengthLogic;
|
||||||
this.minDocumentQuality = minDocumentQuality;
|
this.minDocumentQuality = minDocumentQuality;
|
||||||
this.sentenceExtractor = sentenceExtractor;
|
this.sentenceExtractor = sentenceExtractor;
|
||||||
@ -86,8 +85,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
this.metaRobotsTag = metaRobotsTag;
|
this.metaRobotsTag = metaRobotsTag;
|
||||||
|
|
||||||
this.documentGeneratorExtractor = documentGeneratorExtractor;
|
this.documentGeneratorExtractor = documentGeneratorExtractor;
|
||||||
this.defaultSpecialization = defaultSpecialization;
|
this.htmlProcessorSpecializations = specializations;
|
||||||
this.lemmySpecialization = lemmySpecialization;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@ -115,7 +113,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
|
|
||||||
final var generatorParts = documentGeneratorExtractor.generatorCleaned(doc);
|
final var generatorParts = documentGeneratorExtractor.generatorCleaned(doc);
|
||||||
|
|
||||||
final var specialization = selectSpecialization(generatorParts);
|
final var specialization = htmlProcessorSpecializations.select(generatorParts);
|
||||||
|
|
||||||
if (!specialization.shouldIndex(url)) {
|
if (!specialization.shouldIndex(url)) {
|
||||||
throw new DisqualifiedException(DisqualificationReason.IRRELEVANT);
|
throw new DisqualifiedException(DisqualificationReason.IRRELEVANT);
|
||||||
@ -180,16 +178,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
return new DetailsWithWords(ret, words);
|
return new DetailsWithWords(ret, words);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Depending on the generator tag, we may want to use specialized logic for pruning and summarizing the document */
|
|
||||||
private HtmlProcessorSpecialization selectSpecialization(DocumentGeneratorExtractor.DocumentGenerator generatorParts) {
|
|
||||||
|
|
||||||
if (generatorParts.keywords().contains("lemmy")) {
|
|
||||||
return lemmySpecialization;
|
|
||||||
}
|
|
||||||
|
|
||||||
return defaultSpecialization;
|
|
||||||
}
|
|
||||||
|
|
||||||
private EnumSet<DocumentFlags> documentFlags(Set<HtmlFeature> features, GeneratorType type) {
|
private EnumSet<DocumentFlags> documentFlags(Set<HtmlFeature> features, GeneratorType type) {
|
||||||
EnumSet<DocumentFlags> flags = EnumSet.noneOf(DocumentFlags.class);
|
EnumSet<DocumentFlags> flags = EnumSet.noneOf(DocumentFlags.class);
|
||||||
|
|
||||||
|
@ -11,7 +11,7 @@ import java.util.List;
|
|||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
@Singleton
|
@Singleton
|
||||||
public class DefaultSpecialization implements HtmlProcessorSpecialization {
|
public class DefaultSpecialization implements HtmlProcessorSpecializations.HtmlProcessorSpecializationIf {
|
||||||
|
|
||||||
private final SummaryExtractor summaryExtractor;
|
private final SummaryExtractor summaryExtractor;
|
||||||
|
|
||||||
|
@ -1,19 +0,0 @@
|
|||||||
package nu.marginalia.converting.processor.plugin.specialization;
|
|
||||||
|
|
||||||
import nu.marginalia.model.EdgeUrl;
|
|
||||||
import org.jsoup.nodes.Document;
|
|
||||||
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
/** This interface is used to specify how to process a specific website.
|
|
||||||
* The implementations of this interface are used by the HtmlProcessor to
|
|
||||||
* process the HTML documents.
|
|
||||||
*/
|
|
||||||
public interface HtmlProcessorSpecialization {
|
|
||||||
Document prune(Document original);
|
|
||||||
String getSummary(Document original,
|
|
||||||
Set<String> importantWords);
|
|
||||||
|
|
||||||
default boolean shouldIndex(EdgeUrl url) { return true; }
|
|
||||||
default double lengthModifier() { return 1.0; }
|
|
||||||
}
|
|
@ -0,0 +1,54 @@
|
|||||||
|
package nu.marginalia.converting.processor.plugin.specialization;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.Singleton;
|
||||||
|
import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
|
||||||
|
import nu.marginalia.model.EdgeUrl;
|
||||||
|
import org.jsoup.nodes.Document;
|
||||||
|
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
@Singleton
|
||||||
|
public class HtmlProcessorSpecializations {
|
||||||
|
private final LemmySpecialization lemmySpecialization;
|
||||||
|
private final XenForoSpecialization xenforoSpecialization;
|
||||||
|
private final PhpBBSpecialization phpBBSpecialization;
|
||||||
|
private final DefaultSpecialization defaultSpecialization;
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public HtmlProcessorSpecializations(LemmySpecialization lemmySpecialization,
|
||||||
|
XenForoSpecialization xenforoSpecialization,
|
||||||
|
PhpBBSpecialization phpBBSpecialization, DefaultSpecialization defaultSpecialization) {
|
||||||
|
this.lemmySpecialization = lemmySpecialization;
|
||||||
|
this.xenforoSpecialization = xenforoSpecialization;
|
||||||
|
this.phpBBSpecialization = phpBBSpecialization;
|
||||||
|
this.defaultSpecialization = defaultSpecialization;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Depending on the generator tag, we may want to use specialized logic for pruning and summarizing the document */
|
||||||
|
public HtmlProcessorSpecializationIf select(DocumentGeneratorExtractor.DocumentGenerator generator) {
|
||||||
|
if (generator.keywords().contains("lemmy")) {
|
||||||
|
return lemmySpecialization;
|
||||||
|
}
|
||||||
|
if (generator.keywords().contains("xenforo")) {
|
||||||
|
return xenforoSpecialization;
|
||||||
|
}
|
||||||
|
if (generator.keywords().contains("phpbb")) {
|
||||||
|
return xenforoSpecialization;
|
||||||
|
}
|
||||||
|
return defaultSpecialization;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** This interface is used to specify how to process a specific website.
|
||||||
|
* The implementations of this interface are used by the HtmlProcessor to
|
||||||
|
* process the HTML documents.
|
||||||
|
*/
|
||||||
|
public interface HtmlProcessorSpecializationIf {
|
||||||
|
Document prune(Document original);
|
||||||
|
String getSummary(Document original,
|
||||||
|
Set<String> importantWords);
|
||||||
|
|
||||||
|
default boolean shouldIndex(EdgeUrl url) { return true; }
|
||||||
|
default double lengthModifier() { return 1.0; }
|
||||||
|
}
|
||||||
|
}
|
@ -2,7 +2,6 @@ package nu.marginalia.converting.processor.plugin.specialization;
|
|||||||
|
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
|
||||||
import nu.marginalia.summary.SummaryExtractor;
|
import nu.marginalia.summary.SummaryExtractor;
|
||||||
import org.jsoup.nodes.Document;
|
import org.jsoup.nodes.Document;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
@ -12,7 +11,7 @@ import java.util.Set;
|
|||||||
|
|
||||||
/** This class is used to specify how to process a website running Lemmy */
|
/** This class is used to specify how to process a website running Lemmy */
|
||||||
@Singleton
|
@Singleton
|
||||||
public class LemmySpecialization implements HtmlProcessorSpecialization {
|
public class LemmySpecialization implements HtmlProcessorSpecializations.HtmlProcessorSpecializationIf {
|
||||||
private static final Logger logger = LoggerFactory.getLogger(LemmySpecialization.class);
|
private static final Logger logger = LoggerFactory.getLogger(LemmySpecialization.class);
|
||||||
private final SummaryExtractor summaryExtractor;
|
private final SummaryExtractor summaryExtractor;
|
||||||
|
|
||||||
|
@ -0,0 +1,26 @@
|
|||||||
|
package nu.marginalia.converting.processor.plugin.specialization;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.Singleton;
|
||||||
|
import nu.marginalia.model.EdgeUrl;
|
||||||
|
import nu.marginalia.summary.SummaryExtractor;
|
||||||
|
import org.jsoup.nodes.Document;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
@Singleton
|
||||||
|
public class PhpBBSpecialization extends DefaultSpecialization {
|
||||||
|
private static final Logger logger = LoggerFactory.getLogger(PhpBBSpecialization.class);
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public PhpBBSpecialization(SummaryExtractor summaryExtractor) {
|
||||||
|
super(summaryExtractor);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean shouldIndex(EdgeUrl url) {
|
||||||
|
return url.path.contains("viewtopic.php");
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,75 @@
|
|||||||
|
package nu.marginalia.converting.processor.plugin.specialization;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.Singleton;
|
||||||
|
import nu.marginalia.summary.SummaryExtractor;
|
||||||
|
import org.jsoup.nodes.Document;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
@Singleton
|
||||||
|
public class XenForoSpecialization implements HtmlProcessorSpecializations.HtmlProcessorSpecializationIf {
|
||||||
|
private static final Logger logger = LoggerFactory.getLogger(XenForoSpecialization.class);
|
||||||
|
private final SummaryExtractor summaryExtractor;
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public XenForoSpecialization(SummaryExtractor summaryExtractor) {
|
||||||
|
this.summaryExtractor = summaryExtractor;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Document prune(Document document) {
|
||||||
|
|
||||||
|
// Remove the sidebar
|
||||||
|
|
||||||
|
var newDoc = new Document(document.baseUri());
|
||||||
|
var bodyTag = newDoc.appendElement("body");
|
||||||
|
var article = bodyTag.appendElement("article");
|
||||||
|
var firstTime = document.getElementsByTag("time").first();
|
||||||
|
|
||||||
|
if (firstTime != null) {
|
||||||
|
// Ensure we get the publish date
|
||||||
|
var timeTag = newDoc.createElement("time");
|
||||||
|
|
||||||
|
timeTag.attr("datetime", firstTime.attr("datetime"));
|
||||||
|
timeTag.attr("pubdate", "pubdate");
|
||||||
|
timeTag.text(firstTime.attr("datetime"));
|
||||||
|
|
||||||
|
article.appendChild(timeTag);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (var post : document.getElementsByClass("message-inner")) {
|
||||||
|
String user = post.getElementsByClass("message-name").text();
|
||||||
|
String text = post.getElementsByClass("bbWrapper").text();
|
||||||
|
article.appendChild(newDoc.createElement("p").text(user + ": " + text));
|
||||||
|
}
|
||||||
|
|
||||||
|
return newDoc;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getSummary(Document document, Set<String> importantWords) {
|
||||||
|
StringBuilder summary = new StringBuilder();
|
||||||
|
|
||||||
|
for (var pTag : document.getElementsByClass("bbWrapper")) {
|
||||||
|
if (summary.length() > 512) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
String text = pTag.text();
|
||||||
|
|
||||||
|
if (text.isBlank())
|
||||||
|
continue;
|
||||||
|
|
||||||
|
summary
|
||||||
|
.append(text)
|
||||||
|
.append(' ');
|
||||||
|
}
|
||||||
|
|
||||||
|
return summaryExtractor.abbreivateSummary(summary.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public double lengthModifier() {
|
||||||
|
return 1.25;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,48 @@
|
|||||||
|
package nu.marginalia.converting.processor.plugin.specialization;
|
||||||
|
|
||||||
|
import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
|
||||||
|
import nu.marginalia.summary.SummaryExtractor;
|
||||||
|
import nu.marginalia.test.CommonTestData;
|
||||||
|
import org.jsoup.Jsoup;
|
||||||
|
import org.junit.jupiter.api.BeforeAll;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
class XenForoSpecializationTest {
|
||||||
|
|
||||||
|
static XenForoSpecialization specialization;
|
||||||
|
static DocumentGeneratorExtractor generatorExtractor = new DocumentGeneratorExtractor();
|
||||||
|
|
||||||
|
String thread = CommonTestData.loadTestData("mock-crawl-data/xenforo/thread.html");
|
||||||
|
|
||||||
|
@BeforeAll
|
||||||
|
public static void setUpAll() {
|
||||||
|
specialization = new XenForoSpecialization(
|
||||||
|
new SummaryExtractor(255,
|
||||||
|
null,
|
||||||
|
null,
|
||||||
|
null,
|
||||||
|
null,
|
||||||
|
null));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void prune() {
|
||||||
|
System.out.println(specialization.prune(Jsoup.parse(thread)));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void generatorExtraction() {
|
||||||
|
var gen = generatorExtractor.generatorCleaned(Jsoup.parse(thread));
|
||||||
|
|
||||||
|
System.out.println(gen);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void getSummary() {
|
||||||
|
String summary = specialization.getSummary(Jsoup.parse(thread), Set.of(""));
|
||||||
|
|
||||||
|
System.out.println(summary);
|
||||||
|
}
|
||||||
|
}
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user