(generator) Add special workaround to flag fextralife as a wiki

This commit is contained in:
Viktor Lofgren 2024-12-10 22:22:52 +01:00
parent cf7f84f033
commit 461bc3eb1a
5 changed files with 22 additions and 10 deletions

View File

@ -2,6 +2,7 @@ package nu.marginalia.converting.processor.logic;
import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.model.GeneratorType;
import nu.marginalia.model.EdgeUrl;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Document;
@ -13,7 +14,12 @@ import java.util.List;
public class DocumentGeneratorExtractor {
private static final String defaultValue = "unset";
public DocumentGenerator detectGenerator(Document doc, DocumentHeaders responseHeaders) {
public DocumentGenerator detectGenerator(EdgeUrl url, Document doc, DocumentHeaders responseHeaders) {
// Fextralife leaves no known tech fingerprint, but we know it runs wiki software of some sort
if (url.domain.toString().endsWith(".wiki.fextralife.com")) {
return DocumentGenerator.of("wiki");
}
var tags = doc.select("meta[name=generator]");
@ -69,6 +75,7 @@ public class DocumentGeneratorExtractor {
}
}
if (parts.length > 1) {
return DocumentGenerator.of(parts[0], parts[0] + "_" + truncVersion(parts[1]));
}
@ -282,7 +289,7 @@ public class DocumentGeneratorExtractor {
-> GeneratorType.FORUM;
case "mediawiki", "dokuwiki", "wikidot", "sharepoint"
-> GeneratorType.WIKI;
case "pandoc", "mkdocs", "doxygen", "javadoc", "asciidoc", "jsdoc", "FluxGarden"
case "pandoc", "mkdocs", "doxygen", "javadoc", "asciidoc", "jsdoc", "FluxGarden", "wiki"
-> GeneratorType.DOCS;
case "woocommerce", "shopfactory", "prestashop", "magento", "shopify", "sitedirect", "seomatic", "osclass"
-> GeneratorType.ECOMMERCE_AND_SPAM;

View File

@ -129,7 +129,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
final EdgeUrl url = new EdgeUrl(crawledDocument.url);
final DocumentHeaders documentHeaders = new DocumentHeaders(crawledDocument.headers);
final var generatorParts = documentGeneratorExtractor.detectGenerator(doc, documentHeaders);
final var generatorParts = documentGeneratorExtractor.detectGenerator(url, doc, documentHeaders);
final var specialization = htmlProcessorSpecializations.select(generatorParts, url);

View File

@ -3,6 +3,7 @@ package nu.marginalia.converting.processor.plugin.specialization;
import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.test.CommonTestData;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.BeforeAll;
@ -34,8 +35,8 @@ class JavadocSpecializationTest {
}
@Test
void generatorExtraction() {
var gen = generatorExtractor.detectGenerator(Jsoup.parse(thread), new DocumentHeaders(""));
void generatorExtraction() throws Exception {
var gen = generatorExtractor.detectGenerator(new EdgeUrl("https://www.example.com/"), Jsoup.parse(thread), new DocumentHeaders(""));
System.out.println(gen);
}

View File

@ -3,11 +3,13 @@ package nu.marginalia.converting.processor.plugin.specialization;
import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.test.CommonTestData;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import java.net.URISyntaxException;
import java.util.Set;
class LemmySpecializationTest {
@ -37,9 +39,9 @@ class LemmySpecializationTest {
}
@Test
void generatorExtraction() {
var generatorIndex = generatorExtractor.detectGenerator(Jsoup.parse(lemmyIndexHtml), new DocumentHeaders(""));
var generatorPost = generatorExtractor.detectGenerator(Jsoup.parse(lemmyPost), new DocumentHeaders(""));
void generatorExtraction() throws URISyntaxException {
var generatorIndex = generatorExtractor.detectGenerator(new EdgeUrl("https://www.example.com/"), Jsoup.parse(lemmyIndexHtml), new DocumentHeaders(""));
var generatorPost = generatorExtractor.detectGenerator(new EdgeUrl("https://www.example.com/"), Jsoup.parse(lemmyPost), new DocumentHeaders(""));
System.out.println(generatorIndex);
System.out.println(generatorPost);

View File

@ -3,11 +3,13 @@ package nu.marginalia.converting.processor.plugin.specialization;
import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.test.CommonTestData;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import java.net.URISyntaxException;
import java.util.Set;
class XenForoSpecializationTest {
@ -34,8 +36,8 @@ class XenForoSpecializationTest {
}
@Test
void generatorExtraction() {
var gen = generatorExtractor.detectGenerator(Jsoup.parse(thread), new DocumentHeaders(""));
void generatorExtraction() throws URISyntaxException {
var gen = generatorExtractor.detectGenerator(new EdgeUrl("https://www.example.com/"), Jsoup.parse(thread), new DocumentHeaders(""));
System.out.println(gen);
}