From bd2c3855ed9944974bc26f7dd8066fe6ca548475 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 23 Jun 2023 21:35:28 +0200 Subject: [PATCH] Add bits and keywords for generator classes (docs, forum, wiki). --- .../marginalia/model/idx/DocumentFlags.java | 8 +-- .../converting/model/GeneratorType.java | 7 ++- .../logic/DocumentGeneratorExtractor.java | 59 ++++++++++++++----- .../plugin/HtmlDocumentProcessorPlugin.java | 7 +-- .../index/svc/searchset/RankingSearchSet.java | 17 +----- 5 files changed, 58 insertions(+), 40 deletions(-) diff --git a/code/common/model/src/main/java/nu/marginalia/model/idx/DocumentFlags.java b/code/common/model/src/main/java/nu/marginalia/model/idx/DocumentFlags.java index b39dfe1b..7ed409c1 100644 --- a/code/common/model/src/main/java/nu/marginalia/model/idx/DocumentFlags.java +++ b/code/common/model/src/main/java/nu/marginalia/model/idx/DocumentFlags.java @@ -5,10 +5,10 @@ import java.util.EnumSet; public enum DocumentFlags { Javascript, PlainText, - GeneratorSpammy, - GeneratorVintage, - GeneratorBlog, - GeneratorForumWiki, + GeneratorDocs, + GeneratorForum, + GeneratorWiki, + Unused6, Unused7, Unused8, ; diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/model/GeneratorType.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/model/GeneratorType.java index 3f5a1439..42cae758 100644 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/model/GeneratorType.java +++ b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/model/GeneratorType.java @@ -6,8 +6,9 @@ public enum GeneratorType { ZOOMER_STATIC, CMS, SAAS, - MANUAL_RETRO, - MANUAL_NEW, - DOCS_FORUM_WIKI, + MANUAL, + FORUM, + WIKI, + DOCS, ECOMMERCE_AND_SPAM } \ No newline at end of file diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java 
b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java index b9d7bc91..4a7878c9 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java @@ -4,6 +4,8 @@ import nu.marginalia.converting.model.GeneratorType; import org.apache.commons.lang3.StringUtils; import org.jsoup.nodes.Document; +import java.util.ArrayList; +import java.util.Arrays; import java.util.List; /** Extract keywords for the document meta generator tag */ @@ -13,8 +15,10 @@ public class DocumentGeneratorExtractor { public DocumentGenerator generatorCleaned(Document doc) { var tags = doc.select("meta[name=generator]"); + if (tags.size() == 0) { - return DocumentGenerator.unset(); + // Some sites have a comment in the head instead of a meta tag + return fingerprintByComments(doc); } if (tags.size() > 1) { return DocumentGenerator.multiple(); @@ -22,7 +26,7 @@ public class DocumentGeneratorExtractor { String generator = tags.attr("content"); // Remove leading or trailing junk from the generator string, "powered by" etc. 
- generator = trim(generator); + generator = removePrefixOrSuffix(generator); if (generator.isBlank()) return DocumentGenerator.unset(); @@ -63,11 +67,29 @@ public class DocumentGeneratorExtractor { } } - private String trim(String generator) { + // Fallback logic when there is no meta tag + private DocumentGenerator fingerprintByComments(Document doc) { + + for (var comment : doc.getElementsByTag("head").comments()) { + if (comment.getData().contains("Generated by javadoc")) { + return DocumentGenerator.of("javadoc"); + } + } + + return DocumentGenerator.unset(); + } + + private String removePrefixOrSuffix(String generator) { generator = generator.toLowerCase().trim(); - if (generator.startsWith("powered by ")) { - generator = generator.substring("powered by ".length()); + + // strip common prefixes + for (String prefix : Arrays.asList("powered by ", "generated by ")) { + + if (generator.startsWith(prefix)) { + generator = generator.substring(prefix.length()); + break; + } } int dashIdx = generator.indexOf('-'); // Some strings have values like 'foobar 2.3 - the free online generator!' @@ -82,7 +104,8 @@ public class DocumentGeneratorExtractor { } // Censor exact version strings, being able to search by major version is enough - // for any non-blackhat purpose + // for any non-blackhat purpose; creating a directory with exact version string + // is a security risk for the site owner. private String truncVersion(String part) { int periodIdx = part.indexOf('.', part.startsWith("0.") ? 
2 : 0); @@ -101,6 +124,8 @@ public class DocumentGeneratorExtractor { if (parts.length == 0) return unset(); + List<String> keywords = new ArrayList<>(List.of(parts)); + final GeneratorType type = switch (parts[0]) { case "joomla", "wordpress", "drupal", "plone", "postnuke", "divi", "freeway", "unicity", "modx", "sitemagic", "agility", "edlio", "blogger", "slider", "slider_revolution", "gravcms", @@ -121,23 +146,29 @@ public class DocumentGeneratorExtractor { -> GeneratorType.BOOMER_STATIC; case "hugo", "jekyll", "hakyll", "gatsby", "react", "gridsome" -> GeneratorType.ZOOMER_STATIC; - case "vi", "vim", "emacs", "orgmode", "hand", "vscode", "atom", "bbedit", "nano" - -> GeneratorType.MANUAL_NEW; - case "notepad.exe", "gedit", "me", + case "vi", "vim", "emacs", "orgmode", "hand", "vscode", "atom", "bbedit", "nano", + "notepad.exe", "gedit", "me", "geany", "sublime", "notepad++", "author", "notepad", "namo", "arachnophilia", "scite", "alleycode", "htmlkit", "acehtml", "bluefish", "htmled", "cutehtml", "fileedit", "cocoa" - -> GeneratorType.MANUAL_RETRO; - case "vbulletin", "phpbb", "mybb", "nodebb", "flarum", - "discourse", "mediawiki", "dokuwiki", "pandoc", "mkdocs", "sharepoint", "doxygen" - -> GeneratorType.DOCS_FORUM_WIKI; + -> GeneratorType.MANUAL; + case "vbulletin", "phpbb", "mybb", "nodebb", "flarum", "discourse" + -> GeneratorType.FORUM; + case "mediawiki", "dokuwiki", "sharepoint" + -> GeneratorType.WIKI; + case "pandoc", "mkdocs", "doxygen", "javadoc" + -> GeneratorType.DOCS; case "woocommerce", "shopfactory", "prestashop", "magento", "shopify", "sitedirect", "seomatic" -> GeneratorType.ECOMMERCE_AND_SPAM; default -> GeneratorType.UNKNOWN; }; - return new DocumentGenerator(type, List.of(parts)); + if (type != GeneratorType.UNKNOWN) { + keywords.add(type.name().toLowerCase()); + } + + return new DocumentGenerator(type, keywords); } public static DocumentGenerator multiple() { diff --git
a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java index b9b1afd4..3feeb754 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java @@ -182,10 +182,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin } switch (type) { - case ECOMMERCE_AND_SPAM -> flags.add(DocumentFlags.GeneratorSpammy); - case DOCS_FORUM_WIKI -> flags.add(DocumentFlags.GeneratorForumWiki); - case ZOOMER_STATIC, MANUAL_NEW -> flags.add(DocumentFlags.GeneratorBlog); - case MANUAL_RETRO, BOOMER_STATIC -> flags.add(DocumentFlags.GeneratorVintage); + case DOCS -> flags.add(DocumentFlags.GeneratorDocs); + case FORUM -> flags.add(DocumentFlags.GeneratorForum); + case WIKI -> flags.add(DocumentFlags.GeneratorWiki); default -> {} // no flags } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/RankingSearchSet.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/RankingSearchSet.java index 1367afe4..64507955 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/RankingSearchSet.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/RankingSearchSet.java @@ -64,26 +64,13 @@ public class RankingSearchSet implements SearchSet { @Override public boolean contains(int urlId, long documentMetadata) { - // For ranked search sets, exclude excessively commercial sites - // TODO: Maybe this particular check should be moved up to the search service and be opt-in? 
- if (DocumentMetadata.hasFlags(documentMetadata, DocumentFlags.GeneratorSpammy.asBit())) { - return false; - } // This is the main check if (set.contains(urlId) || set.isEmpty()) { return true; } - - // For the rest, let through some domains that are not in the set based on the generator tag - if (identifier == SearchSetIdentifier.SMALLWEB) { - return DocumentMetadata.hasFlags(documentMetadata, DocumentFlags.GeneratorBlog.asBit()); - } - if (identifier == SearchSetIdentifier.RETRO) { - return DocumentMetadata.hasFlags(documentMetadata, DocumentFlags.GeneratorVintage.asBit()); - } - - return DocumentMetadata.hasFlags(documentMetadata, DocumentFlags.GeneratorForumWiki.asBit()); + // TODO + return false; } public void write() throws IOException {