mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
(generator) Add special workaround to flag fextralife as a wiki
This commit is contained in:
parent
cf7f84f033
commit
461bc3eb1a
@ -2,6 +2,7 @@ package nu.marginalia.converting.processor.logic;
|
||||
|
||||
import nu.marginalia.converting.model.DocumentHeaders;
|
||||
import nu.marginalia.converting.model.GeneratorType;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.jsoup.nodes.Document;
|
||||
|
||||
@ -13,7 +14,12 @@ import java.util.List;
|
||||
public class DocumentGeneratorExtractor {
|
||||
private static final String defaultValue = "unset";
|
||||
|
||||
public DocumentGenerator detectGenerator(Document doc, DocumentHeaders responseHeaders) {
|
||||
public DocumentGenerator detectGenerator(EdgeUrl url, Document doc, DocumentHeaders responseHeaders) {
|
||||
|
||||
// Fextralife leaves no known tech fingerprint, but we know it runs wiki software of some sort
|
||||
if (url.domain.toString().endsWith(".wiki.fextralife.com")) {
|
||||
return DocumentGenerator.of("wiki");
|
||||
}
|
||||
|
||||
var tags = doc.select("meta[name=generator]");
|
||||
|
||||
@ -69,6 +75,7 @@ public class DocumentGeneratorExtractor {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (parts.length > 1) {
|
||||
return DocumentGenerator.of(parts[0], parts[0] + "_" + truncVersion(parts[1]));
|
||||
}
|
||||
@ -282,7 +289,7 @@ public class DocumentGeneratorExtractor {
|
||||
-> GeneratorType.FORUM;
|
||||
case "mediawiki", "dokuwiki", "wikidot", "sharepoint"
|
||||
-> GeneratorType.WIKI;
|
||||
case "pandoc", "mkdocs", "doxygen", "javadoc", "asciidoc", "jsdoc", "FluxGarden"
|
||||
case "pandoc", "mkdocs", "doxygen", "javadoc", "asciidoc", "jsdoc", "FluxGarden", "wiki"
|
||||
-> GeneratorType.DOCS;
|
||||
case "woocommerce", "shopfactory", "prestashop", "magento", "shopify", "sitedirect", "seomatic", "osclass"
|
||||
-> GeneratorType.ECOMMERCE_AND_SPAM;
|
||||
|
@ -129,7 +129,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
final EdgeUrl url = new EdgeUrl(crawledDocument.url);
|
||||
final DocumentHeaders documentHeaders = new DocumentHeaders(crawledDocument.headers);
|
||||
|
||||
final var generatorParts = documentGeneratorExtractor.detectGenerator(doc, documentHeaders);
|
||||
final var generatorParts = documentGeneratorExtractor.detectGenerator(url, doc, documentHeaders);
|
||||
|
||||
final var specialization = htmlProcessorSpecializations.select(generatorParts, url);
|
||||
|
||||
|
@ -3,6 +3,7 @@ package nu.marginalia.converting.processor.plugin.specialization;
|
||||
import nu.marginalia.converting.model.DocumentHeaders;
|
||||
import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
|
||||
import nu.marginalia.converting.processor.summary.SummaryExtractor;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.test.CommonTestData;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
@ -34,8 +35,8 @@ class JavadocSpecializationTest {
|
||||
}
|
||||
|
||||
@Test
|
||||
void generatorExtraction() {
|
||||
var gen = generatorExtractor.detectGenerator(Jsoup.parse(thread), new DocumentHeaders(""));
|
||||
void generatorExtraction() throws Exception {
|
||||
var gen = generatorExtractor.detectGenerator(new EdgeUrl("https://www.example.com/"), Jsoup.parse(thread), new DocumentHeaders(""));
|
||||
|
||||
System.out.println(gen);
|
||||
}
|
||||
|
@ -3,11 +3,13 @@ package nu.marginalia.converting.processor.plugin.specialization;
|
||||
import nu.marginalia.converting.model.DocumentHeaders;
|
||||
import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
|
||||
import nu.marginalia.converting.processor.summary.SummaryExtractor;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.test.CommonTestData;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.net.URISyntaxException;
|
||||
import java.util.Set;
|
||||
|
||||
class LemmySpecializationTest {
|
||||
@ -37,9 +39,9 @@ class LemmySpecializationTest {
|
||||
}
|
||||
|
||||
@Test
|
||||
void generatorExtraction() {
|
||||
var generatorIndex = generatorExtractor.detectGenerator(Jsoup.parse(lemmyIndexHtml), new DocumentHeaders(""));
|
||||
var generatorPost = generatorExtractor.detectGenerator(Jsoup.parse(lemmyPost), new DocumentHeaders(""));
|
||||
void generatorExtraction() throws URISyntaxException {
|
||||
var generatorIndex = generatorExtractor.detectGenerator(new EdgeUrl("https://www.example.com/"), Jsoup.parse(lemmyIndexHtml), new DocumentHeaders(""));
|
||||
var generatorPost = generatorExtractor.detectGenerator(new EdgeUrl("https://www.example.com/"), Jsoup.parse(lemmyPost), new DocumentHeaders(""));
|
||||
|
||||
System.out.println(generatorIndex);
|
||||
System.out.println(generatorPost);
|
||||
|
@ -3,11 +3,13 @@ package nu.marginalia.converting.processor.plugin.specialization;
|
||||
import nu.marginalia.converting.model.DocumentHeaders;
|
||||
import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
|
||||
import nu.marginalia.converting.processor.summary.SummaryExtractor;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.test.CommonTestData;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.net.URISyntaxException;
|
||||
import java.util.Set;
|
||||
|
||||
class XenForoSpecializationTest {
|
||||
@ -34,8 +36,8 @@ class XenForoSpecializationTest {
|
||||
}
|
||||
|
||||
@Test
|
||||
void generatorExtraction() {
|
||||
var gen = generatorExtractor.detectGenerator(Jsoup.parse(thread), new DocumentHeaders(""));
|
||||
void generatorExtraction() throws URISyntaxException {
|
||||
var gen = generatorExtractor.detectGenerator(new EdgeUrl("https://www.example.com/"), Jsoup.parse(thread), new DocumentHeaders(""));
|
||||
|
||||
System.out.println(gen);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user