From 461bc3eb1ab419afe8a8a16516dc7b1a3b85af39 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 10 Dec 2024 22:22:52 +0100 Subject: [PATCH] (generator) Add special workaround to flag fextralife as a wiki --- .../processor/logic/DocumentGeneratorExtractor.java | 11 +++++++++-- .../processor/plugin/HtmlDocumentProcessorPlugin.java | 2 +- .../specialization/JavadocSpecializationTest.java | 5 +++-- .../specialization/LemmySpecializationTest.java | 8 +++++--- .../specialization/XenForoSpecializationTest.java | 6 ++++-- 5 files changed, 22 insertions(+), 10 deletions(-) diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java index cfc333c7..e6a87089 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java @@ -2,6 +2,7 @@ package nu.marginalia.converting.processor.logic; import nu.marginalia.converting.model.DocumentHeaders; import nu.marginalia.converting.model.GeneratorType; +import nu.marginalia.model.EdgeUrl; import org.apache.commons.lang3.StringUtils; import org.jsoup.nodes.Document; @@ -13,7 +14,12 @@ import java.util.List; public class DocumentGeneratorExtractor { private static final String defaultValue = "unset"; - public DocumentGenerator detectGenerator(Document doc, DocumentHeaders responseHeaders) { + public DocumentGenerator detectGenerator(EdgeUrl url, Document doc, DocumentHeaders responseHeaders) { + + // Fextralife leaves no known tech fingerprint, but we know it's a wiki software of some sort + if (url.domain.toString().endsWith(".wiki.fextralife.com")) { + return DocumentGenerator.of("wiki"); + } var tags = doc.select("meta[name=generator]"); @@ -69,6 +75,7 @@ public class DocumentGeneratorExtractor { } } + if (parts.length > 1) { return DocumentGenerator.of(parts[0], parts[0] + "_" + truncVersion(parts[1])); } @@ -282,7 +289,7 @@ public class DocumentGeneratorExtractor { -> GeneratorType.FORUM; case "mediawiki", "dokuwiki", "wikidot", "sharepoint" -> GeneratorType.WIKI; - case "pandoc", "mkdocs", "doxygen", "javadoc", "asciidoc", "jsdoc", "FluxGarden" + case "pandoc", "mkdocs", "doxygen", "javadoc", "asciidoc", "jsdoc", "FluxGarden", "wiki" -> GeneratorType.DOCS; case "woocommerce", "shopfactory", "prestashop", "magento", "shopify", "sitedirect", "seomatic", "osclass" -> GeneratorType.ECOMMERCE_AND_SPAM; diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java index 09b4a360..e27d0f68 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java @@ -129,7 +129,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin final EdgeUrl url = new EdgeUrl(crawledDocument.url); final DocumentHeaders documentHeaders = new DocumentHeaders(crawledDocument.headers); - final var generatorParts = documentGeneratorExtractor.detectGenerator(doc, documentHeaders); + final var generatorParts = documentGeneratorExtractor.detectGenerator(url, doc, documentHeaders); final var specialization = htmlProcessorSpecializations.select(generatorParts, url); diff --git a/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/JavadocSpecializationTest.java b/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/JavadocSpecializationTest.java index 253fc673..1b162790 100644 --- a/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/JavadocSpecializationTest.java +++ b/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/JavadocSpecializationTest.java @@ -3,6 +3,7 @@ package nu.marginalia.converting.processor.plugin.specialization; import nu.marginalia.converting.model.DocumentHeaders; import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor; import nu.marginalia.converting.processor.summary.SummaryExtractor; +import nu.marginalia.model.EdgeUrl; import nu.marginalia.test.CommonTestData; import org.jsoup.Jsoup; import org.junit.jupiter.api.BeforeAll; @@ -34,8 +35,8 @@ class JavadocSpecializationTest { } @Test - void generatorExtraction() { - var gen = generatorExtractor.detectGenerator(Jsoup.parse(thread), new DocumentHeaders("")); + void generatorExtraction() throws Exception { + var gen = generatorExtractor.detectGenerator(new EdgeUrl("https://www.example.com/"), Jsoup.parse(thread), new DocumentHeaders("")); System.out.println(gen); } diff --git a/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/LemmySpecializationTest.java b/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/LemmySpecializationTest.java index 178796df..77d3fc05 100644 --- a/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/LemmySpecializationTest.java +++ b/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/LemmySpecializationTest.java @@ -3,11 +3,13 @@ package nu.marginalia.converting.processor.plugin.specialization; import nu.marginalia.converting.model.DocumentHeaders; import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor; import nu.marginalia.converting.processor.summary.SummaryExtractor; +import nu.marginalia.model.EdgeUrl; import nu.marginalia.test.CommonTestData; import org.jsoup.Jsoup; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; +import java.net.URISyntaxException; import java.util.Set; class LemmySpecializationTest { @@ -37,9 +39,9 @@ class LemmySpecializationTest { } @Test - void generatorExtraction() { - var generatorIndex = generatorExtractor.detectGenerator(Jsoup.parse(lemmyIndexHtml), new DocumentHeaders("")); - var generatorPost = generatorExtractor.detectGenerator(Jsoup.parse(lemmyPost), new DocumentHeaders("")); + void generatorExtraction() throws URISyntaxException { + var generatorIndex = generatorExtractor.detectGenerator(new EdgeUrl("https://www.example.com/"), Jsoup.parse(lemmyIndexHtml), new DocumentHeaders("")); + var generatorPost = generatorExtractor.detectGenerator(new EdgeUrl("https://www.example.com/"), Jsoup.parse(lemmyPost), new DocumentHeaders("")); System.out.println(generatorIndex); System.out.println(generatorPost); diff --git a/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/XenForoSpecializationTest.java b/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/XenForoSpecializationTest.java index 3efd2900..c4005c06 100644 --- a/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/XenForoSpecializationTest.java +++ b/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/XenForoSpecializationTest.java @@ -3,11 +3,13 @@ package nu.marginalia.converting.processor.plugin.specialization; import nu.marginalia.converting.model.DocumentHeaders; import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor; import nu.marginalia.converting.processor.summary.SummaryExtractor; +import nu.marginalia.model.EdgeUrl; import nu.marginalia.test.CommonTestData; import org.jsoup.Jsoup; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; +import java.net.URISyntaxException; import java.util.Set; class XenForoSpecializationTest { @@ -34,8 +36,8 @@ class XenForoSpecializationTest { } @Test - void generatorExtraction() { - var gen = generatorExtractor.detectGenerator(Jsoup.parse(thread), new DocumentHeaders("")); + void generatorExtraction() throws URISyntaxException { + var gen = generatorExtractor.detectGenerator(new EdgeUrl("https://www.example.com/"), Jsoup.parse(thread), new DocumentHeaders("")); System.out.println(gen); }